Henryeahhh committed on
Commit
6d71685
·
verified ·
1 Parent(s): b00e6cc

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +9 -0
  2. all_flow_matching/glue_best/config.yaml +316 -0
  3. all_flow_matching/glue_best/step11000-action-head/metadata.pt +3 -0
  4. all_flow_matching/glue_best/step11500-action-head/metadata.pt +3 -0
  5. all_flow_matching/glue_best/step11500-unsharded/config.yaml +316 -0
  6. all_flow_matching/glue_best/step11500-unsharded/lora.pt +3 -0
  7. all_flow_matching/glue_best/step11500-unsharded/train.pt +3 -0
  8. all_flow_matching/glue_best/step11500/config.yaml +316 -0
  9. all_flow_matching/glue_best/wandb/wandb/debug-internal.log +13 -0
  10. all_flow_matching/glue_best/wandb/wandb/debug.log +0 -0
  11. all_flow_matching/glue_best/wandb/wandb/run-20250924_061930-dnrnwv30/run-dnrnwv30.wandb +0 -0
  12. all_flow_matching/glue_best/wandb/wandb/run-20250924_062357-hmmpns57/run-hmmpns57.wandb +0 -0
  13. all_flow_matching/glue_best/wandb/wandb/run-20250924_063128-wtatxotn/files/config.yaml +611 -0
  14. all_flow_matching/glue_best/wandb/wandb/run-20250924_063128-wtatxotn/files/output.log +33 -0
  15. all_flow_matching/glue_best/wandb/wandb/run-20250924_063128-wtatxotn/run-wtatxotn.wandb +0 -0
  16. all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/config.yaml +611 -0
  17. all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/output.log +33 -0
  18. all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/requirements.txt +283 -0
  19. all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/wandb-metadata.json +204 -0
  20. all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/wandb-summary.json +1 -0
  21. all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/logs/debug-internal.log +11 -0
  22. all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/run-6tj2c8pr.wandb +0 -0
  23. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/config.yaml +611 -0
  24. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/output.log +29 -0
  25. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/requirements.txt +283 -0
  26. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/wandb-metadata.json +204 -0
  27. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/wandb-summary.json +1 -0
  28. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/logs/debug-core.log +14 -0
  29. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/logs/debug-internal.log +11 -0
  30. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/logs/debug.log +1 -0
  31. all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/run-qsv5q1hc.wandb +0 -0
  32. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/config.yaml +611 -0
  33. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/output.log +47 -0
  34. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/requirements.txt +283 -0
  35. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/wandb-metadata.json +204 -0
  36. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/wandb-summary.json +1 -0
  37. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/logs/debug-core.log +14 -0
  38. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/logs/debug-internal.log +11 -0
  39. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/logs/debug.log +1 -0
  40. all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/run-lqn400wc.wandb +0 -0
  41. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/config.yaml +615 -0
  42. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/output.log +74 -0
  43. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/requirements.txt +285 -0
  44. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/wandb-metadata.json +204 -0
  45. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/wandb-summary.json +1 -0
  46. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/logs/debug-core.log +16 -0
  47. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/logs/debug-internal.log +12 -0
  48. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/logs/debug.log +0 -0
  49. all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/run-rwm1qqvr.wandb +0 -0
  50. all_flow_matching/glue_best/wandb/wandb/run-20250924_075956-zoletkkn/files/config.yaml +615 -0
.gitattributes CHANGED
@@ -42,3 +42,12 @@ wipe_l1_regression/wandb/wandb/run-20251005_163743-a1znetn8/run-a1znetn8.wandb f
42
  cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/run-quokv8gn.wandb filter=lfs diff=lfs merge=lfs -text
43
  cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/run-fqdwkc8m.wandb filter=lfs diff=lfs merge=lfs -text
44
  glue_flow_matching/wandb/wandb/run-20251002_163728-tmwli25x/run-tmwli25x.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
42
  cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/run-quokv8gn.wandb filter=lfs diff=lfs merge=lfs -text
43
  cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/run-fqdwkc8m.wandb filter=lfs diff=lfs merge=lfs -text
44
  glue_flow_matching/wandb/wandb/run-20251002_163728-tmwli25x/run-tmwli25x.wandb filter=lfs diff=lfs merge=lfs -text
45
+ glue_l1_regression/wandb/wandb/run-20251002_163729-7ovz4jzt/run-7ovz4jzt.wandb filter=lfs diff=lfs merge=lfs -text
46
+ eraser_l1_regression/wandb/wandb/run-20251011_163844-qzez8pv7/run-qzez8pv7.wandb filter=lfs diff=lfs merge=lfs -text
47
+ all_l1/wandb/wandb/run-20250930_131250-ea1k0g3y/run-ea1k0g3y.wandb filter=lfs diff=lfs merge=lfs -text
48
+ all_flow_matching/glue_best/wandb/wandb/run-20250924_081723-x94cyrsz/run-x94cyrsz.wandb filter=lfs diff=lfs merge=lfs -text
49
+ eraser_flow_matching/wandb/wandb/run-20251011_163832-yqnt28c8/run-yqnt28c8.wandb filter=lfs diff=lfs merge=lfs -text
50
+ all_flow_matching/wandb/wandb/run-20250928_104655-3b31u4we/run-3b31u4we.wandb filter=lfs diff=lfs merge=lfs -text
51
+ wipe_flow_matching/wandb/wandb/run-20251005_163812-0cfqmuqw/run-0cfqmuqw.wandb filter=lfs diff=lfs merge=lfs -text
52
+ pen_flow_matching/wandb/wandb/run-20251011_163844-a381qnn9/run-a381qnn9.wandb filter=lfs diff=lfs merge=lfs -text
53
+ cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/run-gqyapbwp.wandb filter=lfs diff=lfs merge=lfs -text
all_flow_matching/glue_best/config.yaml ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: glue_20250924_082336
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: openai
22
+ image_default_input_size:
23
+ - 336
24
+ - 336
25
+ image_patch_size: 14
26
+ image_pos_patch_size: 14
27
+ image_emb_dim: 1024
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 23
31
+ image_head_dim: 64
32
+ image_mlp_dim: 4096
33
+ image_mlp_activations: quick_gelu
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 577
36
+ image_norm_eps: 1.0e-05
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: default
42
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
43
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
44
+ low_cpu_fsdp: true
45
+ attention_type: sdpa
46
+ float32_attention: true
47
+ attention_dropout: 0.0
48
+ attention_layer_norm: false
49
+ residual_dropout: 0.1
50
+ response_residual_dropout: 0.0
51
+ embedding_dropout: 0.0
52
+ layer_norm_type: rms
53
+ layer_norm_with_affine: true
54
+ layer_norm_eps: 1.0e-06
55
+ attention_layer_norm_with_affine: true
56
+ max_sequence_length: 4096
57
+ max_position_embeddings: null
58
+ include_bias: false
59
+ bias_for_layer_norm: null
60
+ scale_logits: false
61
+ vocab_size: 152064
62
+ embedding_size: 152064
63
+ additional_vocab_size: 128
64
+ new_embedding_init_range: 0.02
65
+ weight_tying: false
66
+ init_device: null
67
+ init_fn: normal
68
+ init_std: 0.02
69
+ init_cutoff_factor: null
70
+ norm_after: false
71
+ precision: amp_bf16
72
+ max_crops: 12
73
+ crop_mode: overlap-and-resize-c2
74
+ use_col_tokens: true
75
+ prompt_type: uber_model
76
+ system_prompt_kind: demo_or_style
77
+ message_formatting: role
78
+ always_start_with_space: true
79
+ multi_annotation_weighting: root_subsegments
80
+ default_inference_len: 65
81
+ overlap_margins:
82
+ - 4
83
+ - 4
84
+ pad_value: 0.0
85
+ image_padding_embed: pad_and_partial_pad
86
+ fix_image_padding: true
87
+ vit_layers:
88
+ - -2
89
+ - -9
90
+ image_pooling_h: 2
91
+ image_pooling_w: 2
92
+ image_pooling_2d: attention_meanq
93
+ image_projector: mlp
94
+ image_feature_dropout: 0.0
95
+ initializer_range: 0.02
96
+ normalize_input_embeds: false
97
+ use_position_ids: true
98
+ head_dim: null
99
+ tokenizer:
100
+ identifier: Qwen/Qwen2-7B
101
+ tokenizer_dir: null
102
+ pad_tokenizer: true
103
+ moe_num_experts: 8
104
+ moe_top_k: 2
105
+ moe_mlp_impl: sparse
106
+ moe_log_expert_assignment: false
107
+ moe_shared_expert: false
108
+ moe_lbl_in_fp32: false
109
+ moe_interleave: false
110
+ moe_loss_weight: 0.1
111
+ moe_zloss_weight: null
112
+ moe_dropless: true
113
+ moe_capacity_factor: 1.25
114
+ action_head: flow_matching
115
+ num_diffusion_steps: 1000
116
+ num_diffusion_inference_steps: 30
117
+ use_proprio: true
118
+ action_head_dit_hidden_size: 1152
119
+ action_head_dit_depth: 28
120
+ action_head_dit_num_heads: 16
121
+ llm_causal_attention: false
122
+ action_use_left_eef: false
123
+ action_use_mobile_base: false
124
+ allow_resume: false
125
+ ft_llm: false
126
+ ft_vit: false
127
+ ft_connector: false
128
+ ft_embedding: lm_head
129
+ lora: false
130
+ use_lora: true
131
+ lora_rank: 32
132
+ lora_llm: true
133
+ lora_vit: false
134
+ lora_connector: false
135
+ early_exit: false
136
+ train_exit_random_layer: false
137
+ optimizer:
138
+ name: adamw
139
+ learning_rate: 0.0001
140
+ weight_decay: 0.01
141
+ betas:
142
+ - 0.9
143
+ - 0.95
144
+ eps: 1.0e-05
145
+ connector_learning_rate: 0.0002
146
+ vit_learning_rate: 6.0e-06
147
+ llm_learning_rate: 5.0e-05
148
+ connector_weight_decay: 0.0
149
+ vit_weight_decay: 0.0
150
+ llm_weight_decay: 0.0
151
+ connector_betas:
152
+ - 0.9
153
+ - 0.95
154
+ vit_betas:
155
+ - 0.9
156
+ - 0.95
157
+ llm_betas:
158
+ - 0.9
159
+ - 0.95
160
+ connector_eps: 1.0e-06
161
+ vit_eps: 1.0e-06
162
+ llm_eps: 1.0e-06
163
+ metrics_log_interval: 20
164
+ scheduler:
165
+ name: multimodal
166
+ units: steps
167
+ t_warmup: 100
168
+ t_max: null
169
+ alpha_f: 0.1
170
+ connector_t_warmup: 200
171
+ vit_t_warmup: 2000
172
+ llm_t_warmup: 2000
173
+ grad_clip_warmup_steps: null
174
+ grad_clip_warmup_factor: null
175
+ warmup_min_lr: 0.0
176
+ data:
177
+ dataset: vla_dataset_realworld
178
+ mixture: null
179
+ root_size_mixture: null
180
+ split: train
181
+ seed: 95818
182
+ shuffle_messages: false
183
+ pad: to_max
184
+ sequence_length: 1600
185
+ shuffle: true
186
+ for_inference: false
187
+ multi_modal: torch
188
+ num_workers: 0
189
+ drop_last: true
190
+ pin_memory: true
191
+ prefetch_factor: null
192
+ persistent_workers: false
193
+ timeout: 0
194
+ rlds_dataset_name: libero_4_task_suites_no_noops
195
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
196
+ use_wrist_image: true
197
+ use_proprio: true
198
+ rlds_shuffle_buffer_size: 100000
199
+ rlds_traj_threads: 8
200
+ rlds_read_threads: 8
201
+ lerobot_episode_index_start: null
202
+ lerobot_episode_index_end: null
203
+ restore_dataloader: true
204
+ fast_forward_batches: null
205
+ evaluators:
206
+ - label: val
207
+ data:
208
+ dataset: vla_dataset_realworld
209
+ mixture: null
210
+ root_size_mixture: null
211
+ split: validation
212
+ seed: null
213
+ shuffle_messages: false
214
+ pad: to_max
215
+ sequence_length: 1600
216
+ shuffle: false
217
+ for_inference: false
218
+ multi_modal: torch
219
+ num_workers: 0
220
+ drop_last: true
221
+ pin_memory: true
222
+ prefetch_factor: null
223
+ persistent_workers: true
224
+ timeout: 0
225
+ rlds_dataset_name: libero_4_task_suites_no_noops
226
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
227
+ use_wrist_image: true
228
+ use_proprio: true
229
+ rlds_shuffle_buffer_size: 256000
230
+ rlds_traj_threads: 8
231
+ rlds_read_threads: 8
232
+ lerobot_episode_index_start: 353
233
+ lerobot_episode_index_end: 765
234
+ device_eval_batch_size: null
235
+ subset_num_batches: 64
236
+ max_examples: null
237
+ max_new_tokens: 448
238
+ mm_evaluator: null
239
+ save_dir: null
240
+ save_to_checkpoint_dir: false
241
+ eval_name: null
242
+ skip_if_metrics_cached: true
243
+ eval_interval: 0
244
+ inf_eval_interval: -1
245
+ inf_evaluators: []
246
+ save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
247
+ remote_save_folder: null
248
+ canceled_check_interval: 50
249
+ save_interval: 500
250
+ save_interval_unsharded: 500
251
+ save_interval_ephemeral: null
252
+ save_interval_action_head: 500
253
+ save_num_checkpoints_to_keep: 1
254
+ save_num_unsharded_checkpoints_to_keep: 1
255
+ save_num_action_head_checkpoints_to_keep: 2
256
+ save_overwrite: true
257
+ force_save_unsharded: false
258
+ no_pre_train_checkpoint: true
259
+ initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
260
+ load_model_config: null
261
+ checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
262
+ load_path: null
263
+ load_path_sharded_checkpointer: null
264
+ reset_optimizer_state: false
265
+ reset_trainer_state: false
266
+ save_dataloader_state: false
267
+ reset_dataloader_state: false
268
+ keep_lr_on_load: true
269
+ sharded_checkpointer: torch_legacy
270
+ max_duration: 500000
271
+ global_train_batch_size: 126
272
+ device_train_batch_size: 15
273
+ device_train_microbatch_size: 16
274
+ device_eval_batch_size: 4
275
+ eval_subset_num_batches: -1
276
+ eval_on_load: false
277
+ device_inf_eval_batch_size: 16
278
+ inf_eval_subset_num_batches: -1
279
+ device_train_grad_accum: 0
280
+ max_grad_norm: 1.0
281
+ multi_component_grad_norm: true
282
+ batch_divisor: global_batch
283
+ max_grad_norm_ratio: null
284
+ precision: amp_bf16
285
+ wandb:
286
+ project: a1-realworld
287
+ entity: henryeap
288
+ group: null
289
+ name: glue_20250924_082336
290
+ tags:
291
+ - watching
292
+ log_artifacts: false
293
+ rank_zero_only: true
294
+ log_interval: 1
295
+ speed_monitor:
296
+ window_size: 20
297
+ gpu_flops_available: null
298
+ console_log_interval: 1
299
+ gen1_gc_interval: 1
300
+ compile: null
301
+ fsdp:
302
+ use_orig_params: true
303
+ sharding_strategy: FULL_SHARD
304
+ wrapping_strategy: by_block_and_size
305
+ precision: float
306
+ hybrid_sharding_num_model_replicas: null
307
+ softmax_auxiliary_loss: true
308
+ softmax_auxiliary_loss_scale: 0.0001
309
+ time_limit: null
310
+ extra_steps_after_cancel: 10
311
+ python_profiling: false
312
+ torch_profiling: false
313
+ stop_at: 500000
314
+ stop_after: null
315
+ activation_checkpointing: whole_layer
316
+ fused_loss: null
all_flow_matching/glue_best/step11000-action-head/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91bd03dced0b2509ade669c28c4f205463e4b4e83b54d6726754eeb8ea952bfe
3
+ size 1331
all_flow_matching/glue_best/step11500-action-head/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb72b6306ce04d1beb20bb289509f00c39a40845ff7c4b36bf4deb4e83fe82a
3
+ size 1331
all_flow_matching/glue_best/step11500-unsharded/config.yaml ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: glue_20250924_082336
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: openai
22
+ image_default_input_size:
23
+ - 336
24
+ - 336
25
+ image_patch_size: 14
26
+ image_pos_patch_size: 14
27
+ image_emb_dim: 1024
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 23
31
+ image_head_dim: 64
32
+ image_mlp_dim: 4096
33
+ image_mlp_activations: quick_gelu
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 577
36
+ image_norm_eps: 1.0e-05
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: default
42
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
43
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
44
+ low_cpu_fsdp: true
45
+ attention_type: sdpa
46
+ float32_attention: true
47
+ attention_dropout: 0.0
48
+ attention_layer_norm: false
49
+ residual_dropout: 0.1
50
+ response_residual_dropout: 0.0
51
+ embedding_dropout: 0.0
52
+ layer_norm_type: rms
53
+ layer_norm_with_affine: true
54
+ layer_norm_eps: 1.0e-06
55
+ attention_layer_norm_with_affine: true
56
+ max_sequence_length: 4096
57
+ max_position_embeddings: null
58
+ include_bias: false
59
+ bias_for_layer_norm: null
60
+ scale_logits: false
61
+ vocab_size: 152064
62
+ embedding_size: 152064
63
+ additional_vocab_size: 128
64
+ new_embedding_init_range: 0.02
65
+ weight_tying: false
66
+ init_device: null
67
+ init_fn: normal
68
+ init_std: 0.02
69
+ init_cutoff_factor: null
70
+ norm_after: false
71
+ precision: amp_bf16
72
+ max_crops: 12
73
+ crop_mode: overlap-and-resize-c2
74
+ use_col_tokens: true
75
+ prompt_type: uber_model
76
+ system_prompt_kind: demo_or_style
77
+ message_formatting: role
78
+ always_start_with_space: true
79
+ multi_annotation_weighting: root_subsegments
80
+ default_inference_len: 65
81
+ overlap_margins:
82
+ - 4
83
+ - 4
84
+ pad_value: 0.0
85
+ image_padding_embed: pad_and_partial_pad
86
+ fix_image_padding: true
87
+ vit_layers:
88
+ - -2
89
+ - -9
90
+ image_pooling_h: 2
91
+ image_pooling_w: 2
92
+ image_pooling_2d: attention_meanq
93
+ image_projector: mlp
94
+ image_feature_dropout: 0.0
95
+ initializer_range: 0.02
96
+ normalize_input_embeds: false
97
+ use_position_ids: true
98
+ head_dim: null
99
+ tokenizer:
100
+ identifier: Qwen/Qwen2-7B
101
+ tokenizer_dir: null
102
+ pad_tokenizer: true
103
+ moe_num_experts: 8
104
+ moe_top_k: 2
105
+ moe_mlp_impl: sparse
106
+ moe_log_expert_assignment: false
107
+ moe_shared_expert: false
108
+ moe_lbl_in_fp32: false
109
+ moe_interleave: false
110
+ moe_loss_weight: 0.1
111
+ moe_zloss_weight: null
112
+ moe_dropless: true
113
+ moe_capacity_factor: 1.25
114
+ action_head: flow_matching
115
+ num_diffusion_steps: 1000
116
+ num_diffusion_inference_steps: 30
117
+ use_proprio: true
118
+ action_head_dit_hidden_size: 1152
119
+ action_head_dit_depth: 28
120
+ action_head_dit_num_heads: 16
121
+ llm_causal_attention: false
122
+ action_use_left_eef: false
123
+ action_use_mobile_base: false
124
+ allow_resume: false
125
+ ft_llm: false
126
+ ft_vit: false
127
+ ft_connector: false
128
+ ft_embedding: lm_head
129
+ lora: false
130
+ use_lora: true
131
+ lora_rank: 32
132
+ lora_llm: true
133
+ lora_vit: false
134
+ lora_connector: false
135
+ early_exit: false
136
+ train_exit_random_layer: false
137
+ optimizer:
138
+ name: adamw
139
+ learning_rate: 0.0001
140
+ weight_decay: 0.01
141
+ betas:
142
+ - 0.9
143
+ - 0.95
144
+ eps: 1.0e-05
145
+ connector_learning_rate: 0.0002
146
+ vit_learning_rate: 6.0e-06
147
+ llm_learning_rate: 5.0e-05
148
+ connector_weight_decay: 0.0
149
+ vit_weight_decay: 0.0
150
+ llm_weight_decay: 0.0
151
+ connector_betas:
152
+ - 0.9
153
+ - 0.95
154
+ vit_betas:
155
+ - 0.9
156
+ - 0.95
157
+ llm_betas:
158
+ - 0.9
159
+ - 0.95
160
+ connector_eps: 1.0e-06
161
+ vit_eps: 1.0e-06
162
+ llm_eps: 1.0e-06
163
+ metrics_log_interval: 20
164
+ scheduler:
165
+ name: multimodal
166
+ units: steps
167
+ t_warmup: 100
168
+ t_max: null
169
+ alpha_f: 0.1
170
+ connector_t_warmup: 200
171
+ vit_t_warmup: 2000
172
+ llm_t_warmup: 2000
173
+ grad_clip_warmup_steps: null
174
+ grad_clip_warmup_factor: null
175
+ warmup_min_lr: 0.0
176
+ data:
177
+ dataset: vla_dataset_realworld
178
+ mixture: null
179
+ root_size_mixture: null
180
+ split: train
181
+ seed: 95818
182
+ shuffle_messages: false
183
+ pad: to_max
184
+ sequence_length: 1600
185
+ shuffle: true
186
+ for_inference: false
187
+ multi_modal: torch
188
+ num_workers: 0
189
+ drop_last: true
190
+ pin_memory: true
191
+ prefetch_factor: null
192
+ persistent_workers: false
193
+ timeout: 0
194
+ rlds_dataset_name: libero_4_task_suites_no_noops
195
+ rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best
196
+ use_wrist_image: true
197
+ use_proprio: true
198
+ rlds_shuffle_buffer_size: 100000
199
+ rlds_traj_threads: 8
200
+ rlds_read_threads: 8
201
+ lerobot_episode_index_start: null
202
+ lerobot_episode_index_end: null
203
+ restore_dataloader: true
204
+ fast_forward_batches: null
205
+ evaluators:
206
+ - label: val
207
+ data:
208
+ dataset: vla_dataset_realworld
209
+ mixture: null
210
+ root_size_mixture: null
211
+ split: validation
212
+ seed: null
213
+ shuffle_messages: false
214
+ pad: to_max
215
+ sequence_length: 1600
216
+ shuffle: false
217
+ for_inference: false
218
+ multi_modal: torch
219
+ num_workers: 0
220
+ drop_last: true
221
+ pin_memory: true
222
+ prefetch_factor: null
223
+ persistent_workers: true
224
+ timeout: 0
225
+ rlds_dataset_name: libero_4_task_suites_no_noops
226
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
227
+ use_wrist_image: true
228
+ use_proprio: true
229
+ rlds_shuffle_buffer_size: 256000
230
+ rlds_traj_threads: 8
231
+ rlds_read_threads: 8
232
+ lerobot_episode_index_start: 353
233
+ lerobot_episode_index_end: 765
234
+ device_eval_batch_size: null
235
+ subset_num_batches: 64
236
+ max_examples: null
237
+ max_new_tokens: 448
238
+ mm_evaluator: null
239
+ save_dir: null
240
+ save_to_checkpoint_dir: false
241
+ eval_name: null
242
+ skip_if_metrics_cached: true
243
+ eval_interval: 0
244
+ inf_eval_interval: -1
245
+ inf_evaluators: []
246
+ save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
247
+ remote_save_folder: null
248
+ canceled_check_interval: 50
249
+ save_interval: 500
250
+ save_interval_unsharded: 500
251
+ save_interval_ephemeral: null
252
+ save_interval_action_head: 500
253
+ save_num_checkpoints_to_keep: 1
254
+ save_num_unsharded_checkpoints_to_keep: 1
255
+ save_num_action_head_checkpoints_to_keep: 2
256
+ save_overwrite: true
257
+ force_save_unsharded: false
258
+ no_pre_train_checkpoint: true
259
+ initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
260
+ load_model_config: null
261
+ checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
262
+ load_path: null
263
+ load_path_sharded_checkpointer: null
264
+ reset_optimizer_state: false
265
+ reset_trainer_state: false
266
+ save_dataloader_state: false
267
+ reset_dataloader_state: false
268
+ keep_lr_on_load: true
269
+ sharded_checkpointer: torch_legacy
270
+ max_duration: 500000
271
+ global_train_batch_size: 126
272
+ device_train_batch_size: 15
273
+ device_train_microbatch_size: 16
274
+ device_eval_batch_size: 4
275
+ eval_subset_num_batches: -1
276
+ eval_on_load: false
277
+ device_inf_eval_batch_size: 16
278
+ inf_eval_subset_num_batches: -1
279
+ device_train_grad_accum: 0
280
+ max_grad_norm: 1.0
281
+ multi_component_grad_norm: true
282
+ batch_divisor: global_batch
283
+ max_grad_norm_ratio: null
284
+ precision: amp_bf16
285
+ wandb:
286
+ project: a1-realworld
287
+ entity: henryeap
288
+ group: null
289
+ name: glue_20250924_082336
290
+ tags:
291
+ - watching
292
+ log_artifacts: false
293
+ rank_zero_only: true
294
+ log_interval: 1
295
+ speed_monitor:
296
+ window_size: 20
297
+ gpu_flops_available: null
298
+ console_log_interval: 1
299
+ gen1_gc_interval: 1
300
+ compile: null
301
+ fsdp:
302
+ use_orig_params: true
303
+ sharding_strategy: FULL_SHARD
304
+ wrapping_strategy: by_block_and_size
305
+ precision: float
306
+ hybrid_sharding_num_model_replicas: null
307
+ softmax_auxiliary_loss: true
308
+ softmax_auxiliary_loss_scale: 0.0001
309
+ time_limit: null
310
+ extra_steps_after_cancel: 10
311
+ python_profiling: false
312
+ torch_profiling: false
313
+ stop_at: 500000
314
+ stop_after: null
315
+ activation_checkpointing: whole_layer
316
+ fused_loss: null
all_flow_matching/glue_best/step11500-unsharded/lora.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6269c7cb774e69a5c43b4109cffb347c2936232c5222e2b8a75056ee1188671d
3
+ size 304417027
all_flow_matching/glue_best/step11500-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b1734f30ac548c86aa66c22a5cfb32bd2320a41d56faab841b4fd53020a6d1b
3
+ size 15061
all_flow_matching/glue_best/step11500/config.yaml ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: glue_20250924_082336
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: openai
22
+ image_default_input_size:
23
+ - 336
24
+ - 336
25
+ image_patch_size: 14
26
+ image_pos_patch_size: 14
27
+ image_emb_dim: 1024
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 23
31
+ image_head_dim: 64
32
+ image_mlp_dim: 4096
33
+ image_mlp_activations: quick_gelu
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 577
36
+ image_norm_eps: 1.0e-05
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: default
42
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
43
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
44
+ low_cpu_fsdp: true
45
+ attention_type: sdpa
46
+ float32_attention: true
47
+ attention_dropout: 0.0
48
+ attention_layer_norm: false
49
+ residual_dropout: 0.1
50
+ response_residual_dropout: 0.0
51
+ embedding_dropout: 0.0
52
+ layer_norm_type: rms
53
+ layer_norm_with_affine: true
54
+ layer_norm_eps: 1.0e-06
55
+ attention_layer_norm_with_affine: true
56
+ max_sequence_length: 4096
57
+ max_position_embeddings: null
58
+ include_bias: false
59
+ bias_for_layer_norm: null
60
+ scale_logits: false
61
+ vocab_size: 152064
62
+ embedding_size: 152064
63
+ additional_vocab_size: 128
64
+ new_embedding_init_range: 0.02
65
+ weight_tying: false
66
+ init_device: null
67
+ init_fn: normal
68
+ init_std: 0.02
69
+ init_cutoff_factor: null
70
+ norm_after: false
71
+ precision: amp_bf16
72
+ max_crops: 12
73
+ crop_mode: overlap-and-resize-c2
74
+ use_col_tokens: true
75
+ prompt_type: uber_model
76
+ system_prompt_kind: demo_or_style
77
+ message_formatting: role
78
+ always_start_with_space: true
79
+ multi_annotation_weighting: root_subsegments
80
+ default_inference_len: 65
81
+ overlap_margins:
82
+ - 4
83
+ - 4
84
+ pad_value: 0.0
85
+ image_padding_embed: pad_and_partial_pad
86
+ fix_image_padding: true
87
+ vit_layers:
88
+ - -2
89
+ - -9
90
+ image_pooling_h: 2
91
+ image_pooling_w: 2
92
+ image_pooling_2d: attention_meanq
93
+ image_projector: mlp
94
+ image_feature_dropout: 0.0
95
+ initializer_range: 0.02
96
+ normalize_input_embeds: false
97
+ use_position_ids: true
98
+ head_dim: null
99
+ tokenizer:
100
+ identifier: Qwen/Qwen2-7B
101
+ tokenizer_dir: null
102
+ pad_tokenizer: true
103
+ moe_num_experts: 8
104
+ moe_top_k: 2
105
+ moe_mlp_impl: sparse
106
+ moe_log_expert_assignment: false
107
+ moe_shared_expert: false
108
+ moe_lbl_in_fp32: false
109
+ moe_interleave: false
110
+ moe_loss_weight: 0.1
111
+ moe_zloss_weight: null
112
+ moe_dropless: true
113
+ moe_capacity_factor: 1.25
114
+ action_head: flow_matching
115
+ num_diffusion_steps: 1000
116
+ num_diffusion_inference_steps: 30
117
+ use_proprio: true
118
+ action_head_dit_hidden_size: 1152
119
+ action_head_dit_depth: 28
120
+ action_head_dit_num_heads: 16
121
+ llm_causal_attention: false
122
+ action_use_left_eef: false
123
+ action_use_mobile_base: false
124
+ allow_resume: false
125
+ ft_llm: false
126
+ ft_vit: false
127
+ ft_connector: false
128
+ ft_embedding: lm_head
129
+ lora: false
130
+ use_lora: true
131
+ lora_rank: 32
132
+ lora_llm: true
133
+ lora_vit: false
134
+ lora_connector: false
135
+ early_exit: false
136
+ train_exit_random_layer: false
137
+ optimizer:
138
+ name: adamw
139
+ learning_rate: 0.0001
140
+ weight_decay: 0.01
141
+ betas:
142
+ - 0.9
143
+ - 0.95
144
+ eps: 1.0e-05
145
+ connector_learning_rate: 0.0002
146
+ vit_learning_rate: 6.0e-06
147
+ llm_learning_rate: 5.0e-05
148
+ connector_weight_decay: 0.0
149
+ vit_weight_decay: 0.0
150
+ llm_weight_decay: 0.0
151
+ connector_betas:
152
+ - 0.9
153
+ - 0.95
154
+ vit_betas:
155
+ - 0.9
156
+ - 0.95
157
+ llm_betas:
158
+ - 0.9
159
+ - 0.95
160
+ connector_eps: 1.0e-06
161
+ vit_eps: 1.0e-06
162
+ llm_eps: 1.0e-06
163
+ metrics_log_interval: 20
164
+ scheduler:
165
+ name: multimodal
166
+ units: steps
167
+ t_warmup: 100
168
+ t_max: null
169
+ alpha_f: 0.1
170
+ connector_t_warmup: 200
171
+ vit_t_warmup: 2000
172
+ llm_t_warmup: 2000
173
+ grad_clip_warmup_steps: null
174
+ grad_clip_warmup_factor: null
175
+ warmup_min_lr: 0.0
176
+ data:
177
+ dataset: vla_dataset_realworld
178
+ mixture: null
179
+ root_size_mixture: null
180
+ split: train
181
+ seed: 95818
182
+ shuffle_messages: false
183
+ pad: to_max
184
+ sequence_length: 1600
185
+ shuffle: true
186
+ for_inference: false
187
+ multi_modal: torch
188
+ num_workers: 0
189
+ drop_last: true
190
+ pin_memory: true
191
+ prefetch_factor: null
192
+ persistent_workers: false
193
+ timeout: 0
194
+ rlds_dataset_name: libero_4_task_suites_no_noops
195
+ rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best
196
+ use_wrist_image: true
197
+ use_proprio: true
198
+ rlds_shuffle_buffer_size: 100000
199
+ rlds_traj_threads: 8
200
+ rlds_read_threads: 8
201
+ lerobot_episode_index_start: null
202
+ lerobot_episode_index_end: null
203
+ restore_dataloader: true
204
+ fast_forward_batches: null
205
+ evaluators:
206
+ - label: val
207
+ data:
208
+ dataset: vla_dataset_realworld
209
+ mixture: null
210
+ root_size_mixture: null
211
+ split: validation
212
+ seed: null
213
+ shuffle_messages: false
214
+ pad: to_max
215
+ sequence_length: 1600
216
+ shuffle: false
217
+ for_inference: false
218
+ multi_modal: torch
219
+ num_workers: 0
220
+ drop_last: true
221
+ pin_memory: true
222
+ prefetch_factor: null
223
+ persistent_workers: true
224
+ timeout: 0
225
+ rlds_dataset_name: libero_4_task_suites_no_noops
226
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
227
+ use_wrist_image: true
228
+ use_proprio: true
229
+ rlds_shuffle_buffer_size: 256000
230
+ rlds_traj_threads: 8
231
+ rlds_read_threads: 8
232
+ lerobot_episode_index_start: 353
233
+ lerobot_episode_index_end: 765
234
+ device_eval_batch_size: null
235
+ subset_num_batches: 64
236
+ max_examples: null
237
+ max_new_tokens: 448
238
+ mm_evaluator: null
239
+ save_dir: null
240
+ save_to_checkpoint_dir: false
241
+ eval_name: null
242
+ skip_if_metrics_cached: true
243
+ eval_interval: 0
244
+ inf_eval_interval: -1
245
+ inf_evaluators: []
246
+ save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
247
+ remote_save_folder: null
248
+ canceled_check_interval: 50
249
+ save_interval: 500
250
+ save_interval_unsharded: 500
251
+ save_interval_ephemeral: null
252
+ save_interval_action_head: 500
253
+ save_num_checkpoints_to_keep: 1
254
+ save_num_unsharded_checkpoints_to_keep: 1
255
+ save_num_action_head_checkpoints_to_keep: 2
256
+ save_overwrite: true
257
+ force_save_unsharded: false
258
+ no_pre_train_checkpoint: true
259
+ initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
260
+ load_model_config: null
261
+ checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
262
+ load_path: null
263
+ load_path_sharded_checkpointer: null
264
+ reset_optimizer_state: false
265
+ reset_trainer_state: false
266
+ save_dataloader_state: false
267
+ reset_dataloader_state: false
268
+ keep_lr_on_load: true
269
+ sharded_checkpointer: torch_legacy
270
+ max_duration: 500000
271
+ global_train_batch_size: 126
272
+ device_train_batch_size: 15
273
+ device_train_microbatch_size: 16
274
+ device_eval_batch_size: 4
275
+ eval_subset_num_batches: -1
276
+ eval_on_load: false
277
+ device_inf_eval_batch_size: 16
278
+ inf_eval_subset_num_batches: -1
279
+ device_train_grad_accum: 0
280
+ max_grad_norm: 1.0
281
+ multi_component_grad_norm: true
282
+ batch_divisor: global_batch
283
+ max_grad_norm_ratio: null
284
+ precision: amp_bf16
285
+ wandb:
286
+ project: a1-realworld
287
+ entity: henryeap
288
+ group: null
289
+ name: glue_20250924_082336
290
+ tags:
291
+ - watching
292
+ log_artifacts: false
293
+ rank_zero_only: true
294
+ log_interval: 1
295
+ speed_monitor:
296
+ window_size: 20
297
+ gpu_flops_available: null
298
+ console_log_interval: 1
299
+ gen1_gc_interval: 1
300
+ compile: null
301
+ fsdp:
302
+ use_orig_params: true
303
+ sharding_strategy: FULL_SHARD
304
+ wrapping_strategy: by_block_and_size
305
+ precision: float
306
+ hybrid_sharding_num_model_replicas: null
307
+ softmax_auxiliary_loss: true
308
+ softmax_auxiliary_loss_scale: 0.0001
309
+ time_limit: null
310
+ extra_steps_after_cancel: 10
311
+ python_profiling: false
312
+ torch_profiling: false
313
+ stop_at: 500000
314
+ stop_after: null
315
+ activation_checkpointing: whole_layer
316
+ fused_loss: null
all_flow_matching/glue_best/wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T08:24:16.705237241Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-09-24T08:24:17.754431561Z","level":"INFO","msg":"stream: created new stream","id":"4dp69fok"}
3
+ {"time":"2025-09-24T08:24:17.754478082Z","level":"INFO","msg":"stream: started","id":"4dp69fok"}
4
+ {"time":"2025-09-24T08:24:17.754498402Z","level":"INFO","msg":"sender: started","stream_id":"4dp69fok"}
5
+ {"time":"2025-09-24T08:24:17.754506202Z","level":"INFO","msg":"writer: started","stream_id":"4dp69fok"}
6
+ {"time":"2025-09-24T08:24:17.754546793Z","level":"INFO","msg":"handler: started","stream_id":"4dp69fok"}
7
+ {"time":"2025-09-24T15:15:45.267501791Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
8
+ {"time":"2025-09-24T20:24:27.534186056Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
9
+ {"time":"2025-09-25T23:01:28.093149981Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-09-26T02:33:15.940926228Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/henryeap/a1-realworld/4dp69fok/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
11
+ {"time":"2025-09-26T19:36:48.428667728Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/henryeap/a1-realworld/4dp69fok/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
12
+ {"time":"2025-09-26T20:06:50.687851553Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
13
+ {"time":"2025-09-26T20:59:26.86775551Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
all_flow_matching/glue_best/wandb/wandb/debug.log ADDED
File without changes
all_flow_matching/glue_best/wandb/wandb/run-20250924_061930-dnrnwv30/run-dnrnwv30.wandb ADDED
Binary file (8.79 kB). View file
 
all_flow_matching/glue_best/wandb/wandb/run-20250924_062357-hmmpns57/run-hmmpns57.wandb ADDED
Binary file (8.79 kB). View file
 
all_flow_matching/glue_best/wandb/wandb/run-20250924_063128-wtatxotn/files/config.yaml ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ o421nvn5u6ub6ruog26gg83x0g2lmgbt:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - flow_matching
13
+ - --seq_len
14
+ - "768"
15
+ - --lora_rank
16
+ - "32"
17
+ - --lora_llm
18
+ - --checkpoint
19
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
20
+ - --device_train_microbatch_size
21
+ - "22"
22
+ - --global_batch_size
23
+ - "176"
24
+ - --dataset
25
+ - vla_dataset_realworld
26
+ - --llm_learning_rate
27
+ - "5e-5"
28
+ - --wandb_entity
29
+ - henryeap
30
+ - --wandb_project
31
+ - a1-realworld
32
+ - --wandb_run_name
33
+ - glue
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "51147874304"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: c13f2763af61e0d729a8b5ab4bdefc512205bcc5
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "5"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x137c9ede1bb1518e"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "7"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0x21a2e88d06c419dc"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "2"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0x399226d2b2bfa544"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "0"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0x3558c3014c813fdb"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "3"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0xf61ec17df11883bd"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "1"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0x9b5c1c302c8129f8"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "6"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0xfa8b85a4625b04f"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "4"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0xa515afd8ced1d39d"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-188
140
+ memory:
141
+ total: "2434606952448"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1758954648"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "1605"
158
+ job_name: realworld_mh
159
+ job_nodelist: auh7-1b-gpu-188
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1758695448"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "1605"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-188
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "2191329"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-188
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-09-24T06:31:28.005264Z"
184
+ writerId: o421nvn5u6ub6ruog26gg83x0g2lmgbt
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 3
191
+ - 5
192
+ - 11
193
+ - 41
194
+ - 49
195
+ - 51
196
+ - 53
197
+ - 63
198
+ - 71
199
+ - 83
200
+ - 95
201
+ - 105
202
+ "2":
203
+ - 1
204
+ - 3
205
+ - 5
206
+ - 11
207
+ - 41
208
+ - 49
209
+ - 51
210
+ - 53
211
+ - 63
212
+ - 71
213
+ - 83
214
+ - 95
215
+ - 105
216
+ "3":
217
+ - 13
218
+ - 15
219
+ - 16
220
+ "4": 3.10.18
221
+ "5": 0.21.4
222
+ "6": 4.56.1
223
+ "12": 0.21.4
224
+ "13": linux-x86_64
225
+ activation_checkpointing:
226
+ value: whole_layer
227
+ allow_resume:
228
+ value: false
229
+ batch_divisor:
230
+ value: global_batch
231
+ canceled_check_interval:
232
+ value: 50
233
+ checkpoint_dir:
234
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
235
+ compile:
236
+ value: null
237
+ console_log_interval:
238
+ value: 1
239
+ data:
240
+ value:
241
+ dataset: vla_dataset_realworld
242
+ drop_last: true
243
+ for_inference: false
244
+ lerobot_episode_index_end: null
245
+ lerobot_episode_index_start: null
246
+ mixture: null
247
+ multi_modal: torch
248
+ num_workers: 0
249
+ pad: to_max
250
+ persistent_workers: false
251
+ pin_memory: true
252
+ prefetch_factor: null
253
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
254
+ rlds_dataset_name: libero_4_task_suites_no_noops
255
+ rlds_read_threads: 8
256
+ rlds_shuffle_buffer_size: 100000
257
+ rlds_traj_threads: 8
258
+ root_size_mixture: null
259
+ seed: 95818
260
+ sequence_length: 768
261
+ shuffle: true
262
+ shuffle_messages: false
263
+ split: train
264
+ timeout: 0
265
+ use_proprio: true
266
+ use_wrist_image: true
267
+ device_eval_batch_size:
268
+ value: 4
269
+ device_inf_eval_batch_size:
270
+ value: 16
271
+ device_train_batch_size:
272
+ value: 22
273
+ device_train_grad_accum:
274
+ value: 1
275
+ device_train_microbatch_size:
276
+ value: 22
277
+ dry_run:
278
+ value: false
279
+ early_exit:
280
+ value: false
281
+ epoch:
282
+ value: null
283
+ eval_interval:
284
+ value: 0
285
+ eval_on_load:
286
+ value: false
287
+ eval_subset_num_batches:
288
+ value: -1
289
+ evaluators:
290
+ value:
291
+ - data:
292
+ dataset: vla_dataset_realworld
293
+ drop_last: true
294
+ for_inference: false
295
+ lerobot_episode_index_end: 765
296
+ lerobot_episode_index_start: 353
297
+ mixture: null
298
+ multi_modal: torch
299
+ num_workers: 0
300
+ pad: to_max
301
+ persistent_workers: true
302
+ pin_memory: true
303
+ prefetch_factor: null
304
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
305
+ rlds_dataset_name: libero_4_task_suites_no_noops
306
+ rlds_read_threads: 8
307
+ rlds_shuffle_buffer_size: 256000
308
+ rlds_traj_threads: 8
309
+ root_size_mixture: null
310
+ seed: null
311
+ sequence_length: 768
312
+ shuffle: false
313
+ shuffle_messages: false
314
+ split: validation
315
+ timeout: 0
316
+ use_proprio: true
317
+ use_wrist_image: true
318
+ device_eval_batch_size: null
319
+ eval_name: null
320
+ label: val
321
+ max_examples: null
322
+ max_new_tokens: 448
323
+ mm_evaluator: null
324
+ save_dir: null
325
+ save_to_checkpoint_dir: false
326
+ skip_if_metrics_cached: true
327
+ subset_num_batches: 64
328
+ extra_steps_after_cancel:
329
+ value: 10
330
+ fast_forward_batches:
331
+ value: null
332
+ force_save_unsharded:
333
+ value: false
334
+ fsdp:
335
+ value:
336
+ hybrid_sharding_num_model_replicas: null
337
+ precision: float
338
+ sharding_strategy: FULL_SHARD
339
+ use_orig_params: true
340
+ wrapping_strategy: by_block_and_size
341
+ ft_connector:
342
+ value: false
343
+ ft_embedding:
344
+ value: lm_head
345
+ ft_llm:
346
+ value: false
347
+ ft_vit:
348
+ value: false
349
+ fused_loss:
350
+ value: null
351
+ gen1_gc_interval:
352
+ value: 1
353
+ global_train_batch_size:
354
+ value: 176
355
+ inf_eval_interval:
356
+ value: -1
357
+ inf_eval_subset_num_batches:
358
+ value: -1
359
+ inf_evaluators:
360
+ value: []
361
+ initial_model_checkpoint:
362
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
363
+ keep_lr_on_load:
364
+ value: true
365
+ load_model_config:
366
+ value: null
367
+ load_path:
368
+ value: null
369
+ load_path_sharded_checkpointer:
370
+ value: null
371
+ lora:
372
+ value: false
373
+ lora_connector:
374
+ value: false
375
+ lora_llm:
376
+ value: true
377
+ lora_rank:
378
+ value: 32
379
+ lora_vit:
380
+ value: false
381
+ max_duration:
382
+ value: 500000
383
+ max_grad_norm:
384
+ value: 1
385
+ max_grad_norm_ratio:
386
+ value: null
387
+ model:
388
+ value:
389
+ action_head: flow_matching
390
+ action_head_dit_depth: 28
391
+ action_head_dit_hidden_size: 1152
392
+ action_head_dit_num_heads: 16
393
+ action_use_left_eef: false
394
+ action_use_mobile_base: false
395
+ activation_type: swiglu
396
+ additional_vocab_size: 128
397
+ always_start_with_space: true
398
+ attention_dropout: 0
399
+ attention_layer_norm: false
400
+ attention_layer_norm_with_affine: true
401
+ attention_type: sdpa
402
+ bias_for_layer_norm: null
403
+ block_group_size: 1
404
+ block_type: sequential
405
+ clip_qkv: null
406
+ crop_mode: overlap-and-resize-c2
407
+ d_model: 3584
408
+ default_inference_len: 65
409
+ embedding_dropout: 0
410
+ embedding_size: 152064
411
+ fix_image_padding: true
412
+ float32_attention: true
413
+ head_dim: null
414
+ image_feature_dropout: 0
415
+ image_padding_embed: pad_and_partial_pad
416
+ image_pooling_2d: attention_meanq
417
+ image_pooling_h: 2
418
+ image_pooling_w: 2
419
+ image_projector: mlp
420
+ include_bias: false
421
+ init_cutoff_factor: null
422
+ init_device: null
423
+ init_fn: normal
424
+ init_std: 0.02
425
+ initializer_range: 0.02
426
+ layer_norm_eps: 1e-06
427
+ layer_norm_type: rms
428
+ layer_norm_with_affine: true
429
+ llm_causal_attention: false
430
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
431
+ low_cpu_fsdp: true
432
+ max_crops: 12
433
+ max_position_embeddings: null
434
+ max_sequence_length: 4096
435
+ message_formatting: role
436
+ mlp_hidden_size: 37888
437
+ mlp_ratio: 4
438
+ moe_capacity_factor: 1.25
439
+ moe_dropless: true
440
+ moe_interleave: false
441
+ moe_lbl_in_fp32: false
442
+ moe_log_expert_assignment: false
443
+ moe_loss_weight: 0.1
444
+ moe_mlp_impl: sparse
445
+ moe_num_experts: 8
446
+ moe_shared_expert: false
447
+ moe_top_k: 2
448
+ moe_zloss_weight: null
449
+ multi_annotation_weighting: root_subsegments
450
+ n_heads: 28
451
+ n_kv_heads: 4
452
+ n_layers: 28
453
+ new_embedding_init_range: 0.02
454
+ norm_after: false
455
+ normalize_input_embeds: false
456
+ num_diffusion_inference_steps: 30
457
+ num_diffusion_steps: 1000
458
+ overlap_margins:
459
+ - 4
460
+ - 4
461
+ pad_tokenizer: true
462
+ pad_value: 0
463
+ precision: amp_bf16
464
+ prompt_type: uber_model
465
+ qkv_bias: true
466
+ residual_dropout: 0.1
467
+ response_residual_dropout: 0
468
+ rope: true
469
+ rope_full_precision: true
470
+ rope_theta: 1e+06
471
+ scale_logits: false
472
+ system_prompt_kind: demo_or_style
473
+ tokenizer:
474
+ identifier: Qwen/Qwen2-7B
475
+ tokenizer_dir: null
476
+ use_col_tokens: true
477
+ use_position_ids: true
478
+ use_proprio: true
479
+ vision_backbone:
480
+ attention_dropout: 0
481
+ fsdp_wrap: false
482
+ image_default_input_size:
483
+ - 336
484
+ - 336
485
+ image_dropout_rate: 0
486
+ image_emb_dim: 1024
487
+ image_head_dim: 64
488
+ image_mlp_activations: quick_gelu
489
+ image_mlp_dim: 4096
490
+ image_model_type: openai
491
+ image_norm_eps: 1e-05
492
+ image_num_heads: 16
493
+ image_num_key_value_heads: 16
494
+ image_num_layers: 23
495
+ image_num_pos: 577
496
+ image_patch_size: 14
497
+ image_pos_patch_size: 14
498
+ initializer_range: 0.02
499
+ residual_dropout: 0
500
+ resize_mode: default
501
+ vit_layers:
502
+ - -2
503
+ - -9
504
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
505
+ vocab_size: 152064
506
+ weight_tying: false
507
+ multi_component_grad_norm:
508
+ value: true
509
+ no_pre_train_checkpoint:
510
+ value: true
511
+ optimizer:
512
+ value:
513
+ betas:
514
+ - 0.9
515
+ - 0.95
516
+ connector_betas:
517
+ - 0.9
518
+ - 0.95
519
+ connector_eps: 1e-06
520
+ connector_learning_rate: 0.0002
521
+ connector_weight_decay: 0
522
+ eps: 1e-05
523
+ learning_rate: 0.0001
524
+ llm_betas:
525
+ - 0.9
526
+ - 0.95
527
+ llm_eps: 1e-06
528
+ llm_learning_rate: 5e-05
529
+ llm_weight_decay: 0
530
+ metrics_log_interval: 20
531
+ name: adamw
532
+ vit_betas:
533
+ - 0.9
534
+ - 0.95
535
+ vit_eps: 1e-06
536
+ vit_learning_rate: 6e-06
537
+ vit_weight_decay: 0
538
+ weight_decay: 0.01
539
+ precision:
540
+ value: amp_bf16
541
+ python_profiling:
542
+ value: false
543
+ remote_save_folder:
544
+ value: null
545
+ reset_dataloader_state:
546
+ value: false
547
+ reset_optimizer_state:
548
+ value: false
549
+ reset_trainer_state:
550
+ value: false
551
+ restore_dataloader:
552
+ value: true
553
+ run_name:
554
+ value: glue_20250924_063100
555
+ save_dataloader_state:
556
+ value: false
557
+ save_folder:
558
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
559
+ save_interval:
560
+ value: 500
561
+ save_interval_action_head:
562
+ value: 500
563
+ save_interval_ephemeral:
564
+ value: null
565
+ save_interval_unsharded:
566
+ value: 500
567
+ save_num_action_head_checkpoints_to_keep:
568
+ value: 2
569
+ save_num_checkpoints_to_keep:
570
+ value: 1
571
+ save_num_unsharded_checkpoints_to_keep:
572
+ value: 1
573
+ save_overwrite:
574
+ value: true
575
+ scheduler:
576
+ value:
577
+ alpha_f: 0.1
578
+ connector_t_warmup: 200
579
+ grad_clip_warmup_factor: null
580
+ grad_clip_warmup_steps: null
581
+ llm_t_warmup: 2000
582
+ name: multimodal
583
+ t_max: null
584
+ t_warmup: 100
585
+ units: steps
586
+ vit_t_warmup: 2000
587
+ warmup_min_lr: 0
588
+ seed:
589
+ value: 6198
590
+ sharded_checkpointer:
591
+ value: torch_legacy
592
+ softmax_auxiliary_loss:
593
+ value: true
594
+ softmax_auxiliary_loss_scale:
595
+ value: 0.0001
596
+ speed_monitor:
597
+ value:
598
+ gpu_flops_available: null
599
+ window_size: 20
600
+ stop_after:
601
+ value: null
602
+ stop_at:
603
+ value: 500000
604
+ time_limit:
605
+ value: null
606
+ torch_profiling:
607
+ value: false
608
+ train_exit_random_layer:
609
+ value: false
610
+ use_lora:
611
+ value: true
all_flow_matching/glue_best/wandb/wandb/run-20250924_063128-wtatxotn/files/output.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 09/24 [06:31:30] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': 'Lerobot_Glue_best', 'path': '/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best', 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/mnt/data2/guominghao/a1/warehouse/glue_lerobot', 0.6, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 0.4, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 0.1, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ build_tokenizer, cache_dir None tokenizer_dir None
10
+ 09/24 [06:31:31] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:128
11
+ 09/24 [06:31:32] INFO | >> build_rlds_train_dataset: Loading train dataset: vla_dataset_realworld/train __init__.py:517
12
+ ****** Import RLDSBatchTransform, RLDSDataset successfully.
13
+ ****** before RLDS dataset...
14
+ ****** data_config.rlds_dataset_name: Lerobot_Glue_best
15
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best
16
+ Traceback (most recent call last):
17
+ File "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py", line 397, in <module>
18
+ train(cfg)
19
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 160, in main
20
+ train_loader = build_train_dataloader(cfg, device)
21
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 196, in build_train_dataloader
22
+ return build_vla_train_dataloader(train_config, device)
23
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 314, in build_vla_train_dataloader
24
+ ds = build_rlds_train_dataset(train_config, _normalization_type, _image_augmentation, device)
25
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 527, in build_rlds_train_dataset
26
+ dataset = RLDSDataset(
27
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/rlds_datasets.py", line 355, in __init__
28
+ per_dataset_kwargs, weights = get_oxe_dataset_kwargs_and_weights(
29
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/rlds/oxe/materialize.py", line 119, in get_oxe_dataset_kwargs_and_weights
30
+ make_oxe_dataset_kwargs(
31
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/rlds/oxe/materialize.py", line 31, in make_oxe_dataset_kwargs
32
+ dataset_kwargs = deepcopy(OXE_DATASET_CONFIGS[dataset_name])
33
+ KeyError: 'Lerobot_Glue_best'
all_flow_matching/glue_best/wandb/wandb/run-20250924_063128-wtatxotn/run-wtatxotn.wandb ADDED
Binary file (18.9 kB). View file
 
all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/config.yaml ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ 71y4kqofohuhlolkoekjc4r6f1aprdzt:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - flow_matching
13
+ - --seq_len
14
+ - "768"
15
+ - --lora_rank
16
+ - "32"
17
+ - --lora_llm
18
+ - --checkpoint
19
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
20
+ - --device_train_microbatch_size
21
+ - "22"
22
+ - --global_batch_size
23
+ - "176"
24
+ - --dataset
25
+ - vla_dataset_realworld
26
+ - --llm_learning_rate
27
+ - "5e-5"
28
+ - --wandb_entity
29
+ - henryeap
30
+ - --wandb_project
31
+ - a1-realworld
32
+ - --wandb_run_name
33
+ - glue
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "51148013568"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: c13f2763af61e0d729a8b5ab4bdefc512205bcc5
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "1"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x9b5c1c302c8129f8"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "3"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0xf61ec17df11883bd"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "5"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0x137c9ede1bb1518e"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "6"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0xfa8b85a4625b04f"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "0"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0x3558c3014c813fdb"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "4"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0xa515afd8ced1d39d"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "2"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0x399226d2b2bfa544"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "7"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0x21a2e88d06c419dc"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-188
140
+ memory:
141
+ total: "2434606952448"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1758954964"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "1606"
158
+ job_name: realworld_mh
159
+ job_nodelist: auh7-1b-gpu-188
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1758695764"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "1606"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-188
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "2192665"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-188
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-09-24T06:36:42.806544Z"
184
+ writerId: 71y4kqofohuhlolkoekjc4r6f1aprdzt
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 3
191
+ - 5
192
+ - 11
193
+ - 41
194
+ - 49
195
+ - 51
196
+ - 53
197
+ - 63
198
+ - 71
199
+ - 83
200
+ - 95
201
+ - 105
202
+ "2":
203
+ - 1
204
+ - 3
205
+ - 5
206
+ - 11
207
+ - 41
208
+ - 49
209
+ - 51
210
+ - 53
211
+ - 63
212
+ - 71
213
+ - 83
214
+ - 95
215
+ - 105
216
+ "3":
217
+ - 13
218
+ - 15
219
+ - 16
220
+ "4": 3.10.18
221
+ "5": 0.21.4
222
+ "6": 4.56.1
223
+ "12": 0.21.4
224
+ "13": linux-x86_64
225
+ activation_checkpointing:
226
+ value: whole_layer
227
+ allow_resume:
228
+ value: false
229
+ batch_divisor:
230
+ value: global_batch
231
+ canceled_check_interval:
232
+ value: 50
233
+ checkpoint_dir:
234
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
235
+ compile:
236
+ value: null
237
+ console_log_interval:
238
+ value: 1
239
+ data:
240
+ value:
241
+ dataset: vla_dataset_realworld
242
+ drop_last: true
243
+ for_inference: false
244
+ lerobot_episode_index_end: null
245
+ lerobot_episode_index_start: null
246
+ mixture: null
247
+ multi_modal: torch
248
+ num_workers: 0
249
+ pad: to_max
250
+ persistent_workers: false
251
+ pin_memory: true
252
+ prefetch_factor: null
253
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
254
+ rlds_dataset_name: libero_4_task_suites_no_noops
255
+ rlds_read_threads: 8
256
+ rlds_shuffle_buffer_size: 100000
257
+ rlds_traj_threads: 8
258
+ root_size_mixture: null
259
+ seed: 95818
260
+ sequence_length: 768
261
+ shuffle: true
262
+ shuffle_messages: false
263
+ split: train
264
+ timeout: 0
265
+ use_proprio: true
266
+ use_wrist_image: true
267
+ device_eval_batch_size:
268
+ value: 4
269
+ device_inf_eval_batch_size:
270
+ value: 16
271
+ device_train_batch_size:
272
+ value: 22
273
+ device_train_grad_accum:
274
+ value: 1
275
+ device_train_microbatch_size:
276
+ value: 22
277
+ dry_run:
278
+ value: false
279
+ early_exit:
280
+ value: false
281
+ epoch:
282
+ value: null
283
+ eval_interval:
284
+ value: 0
285
+ eval_on_load:
286
+ value: false
287
+ eval_subset_num_batches:
288
+ value: -1
289
+ evaluators:
290
+ value:
291
+ - data:
292
+ dataset: vla_dataset_realworld
293
+ drop_last: true
294
+ for_inference: false
295
+ lerobot_episode_index_end: 765
296
+ lerobot_episode_index_start: 353
297
+ mixture: null
298
+ multi_modal: torch
299
+ num_workers: 0
300
+ pad: to_max
301
+ persistent_workers: true
302
+ pin_memory: true
303
+ prefetch_factor: null
304
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
305
+ rlds_dataset_name: libero_4_task_suites_no_noops
306
+ rlds_read_threads: 8
307
+ rlds_shuffle_buffer_size: 256000
308
+ rlds_traj_threads: 8
309
+ root_size_mixture: null
310
+ seed: null
311
+ sequence_length: 768
312
+ shuffle: false
313
+ shuffle_messages: false
314
+ split: validation
315
+ timeout: 0
316
+ use_proprio: true
317
+ use_wrist_image: true
318
+ device_eval_batch_size: null
319
+ eval_name: null
320
+ label: val
321
+ max_examples: null
322
+ max_new_tokens: 448
323
+ mm_evaluator: null
324
+ save_dir: null
325
+ save_to_checkpoint_dir: false
326
+ skip_if_metrics_cached: true
327
+ subset_num_batches: 64
328
+ extra_steps_after_cancel:
329
+ value: 10
330
+ fast_forward_batches:
331
+ value: null
332
+ force_save_unsharded:
333
+ value: false
334
+ fsdp:
335
+ value:
336
+ hybrid_sharding_num_model_replicas: null
337
+ precision: float
338
+ sharding_strategy: FULL_SHARD
339
+ use_orig_params: true
340
+ wrapping_strategy: by_block_and_size
341
+ ft_connector:
342
+ value: false
343
+ ft_embedding:
344
+ value: lm_head
345
+ ft_llm:
346
+ value: false
347
+ ft_vit:
348
+ value: false
349
+ fused_loss:
350
+ value: null
351
+ gen1_gc_interval:
352
+ value: 1
353
+ global_train_batch_size:
354
+ value: 176
355
+ inf_eval_interval:
356
+ value: -1
357
+ inf_eval_subset_num_batches:
358
+ value: -1
359
+ inf_evaluators:
360
+ value: []
361
+ initial_model_checkpoint:
362
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
363
+ keep_lr_on_load:
364
+ value: true
365
+ load_model_config:
366
+ value: null
367
+ load_path:
368
+ value: null
369
+ load_path_sharded_checkpointer:
370
+ value: null
371
+ lora:
372
+ value: false
373
+ lora_connector:
374
+ value: false
375
+ lora_llm:
376
+ value: true
377
+ lora_rank:
378
+ value: 32
379
+ lora_vit:
380
+ value: false
381
+ max_duration:
382
+ value: 500000
383
+ max_grad_norm:
384
+ value: 1
385
+ max_grad_norm_ratio:
386
+ value: null
387
+ model:
388
+ value:
389
+ action_head: flow_matching
390
+ action_head_dit_depth: 28
391
+ action_head_dit_hidden_size: 1152
392
+ action_head_dit_num_heads: 16
393
+ action_use_left_eef: false
394
+ action_use_mobile_base: false
395
+ activation_type: swiglu
396
+ additional_vocab_size: 128
397
+ always_start_with_space: true
398
+ attention_dropout: 0
399
+ attention_layer_norm: false
400
+ attention_layer_norm_with_affine: true
401
+ attention_type: sdpa
402
+ bias_for_layer_norm: null
403
+ block_group_size: 1
404
+ block_type: sequential
405
+ clip_qkv: null
406
+ crop_mode: overlap-and-resize-c2
407
+ d_model: 3584
408
+ default_inference_len: 65
409
+ embedding_dropout: 0
410
+ embedding_size: 152064
411
+ fix_image_padding: true
412
+ float32_attention: true
413
+ head_dim: null
414
+ image_feature_dropout: 0
415
+ image_padding_embed: pad_and_partial_pad
416
+ image_pooling_2d: attention_meanq
417
+ image_pooling_h: 2
418
+ image_pooling_w: 2
419
+ image_projector: mlp
420
+ include_bias: false
421
+ init_cutoff_factor: null
422
+ init_device: null
423
+ init_fn: normal
424
+ init_std: 0.02
425
+ initializer_range: 0.02
426
+ layer_norm_eps: 1e-06
427
+ layer_norm_type: rms
428
+ layer_norm_with_affine: true
429
+ llm_causal_attention: false
430
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
431
+ low_cpu_fsdp: true
432
+ max_crops: 12
433
+ max_position_embeddings: null
434
+ max_sequence_length: 4096
435
+ message_formatting: role
436
+ mlp_hidden_size: 37888
437
+ mlp_ratio: 4
438
+ moe_capacity_factor: 1.25
439
+ moe_dropless: true
440
+ moe_interleave: false
441
+ moe_lbl_in_fp32: false
442
+ moe_log_expert_assignment: false
443
+ moe_loss_weight: 0.1
444
+ moe_mlp_impl: sparse
445
+ moe_num_experts: 8
446
+ moe_shared_expert: false
447
+ moe_top_k: 2
448
+ moe_zloss_weight: null
449
+ multi_annotation_weighting: root_subsegments
450
+ n_heads: 28
451
+ n_kv_heads: 4
452
+ n_layers: 28
453
+ new_embedding_init_range: 0.02
454
+ norm_after: false
455
+ normalize_input_embeds: false
456
+ num_diffusion_inference_steps: 30
457
+ num_diffusion_steps: 1000
458
+ overlap_margins:
459
+ - 4
460
+ - 4
461
+ pad_tokenizer: true
462
+ pad_value: 0
463
+ precision: amp_bf16
464
+ prompt_type: uber_model
465
+ qkv_bias: true
466
+ residual_dropout: 0.1
467
+ response_residual_dropout: 0
468
+ rope: true
469
+ rope_full_precision: true
470
+ rope_theta: 1e+06
471
+ scale_logits: false
472
+ system_prompt_kind: demo_or_style
473
+ tokenizer:
474
+ identifier: Qwen/Qwen2-7B
475
+ tokenizer_dir: null
476
+ use_col_tokens: true
477
+ use_position_ids: true
478
+ use_proprio: true
479
+ vision_backbone:
480
+ attention_dropout: 0
481
+ fsdp_wrap: false
482
+ image_default_input_size:
483
+ - 336
484
+ - 336
485
+ image_dropout_rate: 0
486
+ image_emb_dim: 1024
487
+ image_head_dim: 64
488
+ image_mlp_activations: quick_gelu
489
+ image_mlp_dim: 4096
490
+ image_model_type: openai
491
+ image_norm_eps: 1e-05
492
+ image_num_heads: 16
493
+ image_num_key_value_heads: 16
494
+ image_num_layers: 23
495
+ image_num_pos: 577
496
+ image_patch_size: 14
497
+ image_pos_patch_size: 14
498
+ initializer_range: 0.02
499
+ residual_dropout: 0
500
+ resize_mode: default
501
+ vit_layers:
502
+ - -2
503
+ - -9
504
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
505
+ vocab_size: 152064
506
+ weight_tying: false
507
+ multi_component_grad_norm:
508
+ value: true
509
+ no_pre_train_checkpoint:
510
+ value: true
511
+ optimizer:
512
+ value:
513
+ betas:
514
+ - 0.9
515
+ - 0.95
516
+ connector_betas:
517
+ - 0.9
518
+ - 0.95
519
+ connector_eps: 1e-06
520
+ connector_learning_rate: 0.0002
521
+ connector_weight_decay: 0
522
+ eps: 1e-05
523
+ learning_rate: 0.0001
524
+ llm_betas:
525
+ - 0.9
526
+ - 0.95
527
+ llm_eps: 1e-06
528
+ llm_learning_rate: 5e-05
529
+ llm_weight_decay: 0
530
+ metrics_log_interval: 20
531
+ name: adamw
532
+ vit_betas:
533
+ - 0.9
534
+ - 0.95
535
+ vit_eps: 1e-06
536
+ vit_learning_rate: 6e-06
537
+ vit_weight_decay: 0
538
+ weight_decay: 0.01
539
+ precision:
540
+ value: amp_bf16
541
+ python_profiling:
542
+ value: false
543
+ remote_save_folder:
544
+ value: null
545
+ reset_dataloader_state:
546
+ value: false
547
+ reset_optimizer_state:
548
+ value: false
549
+ reset_trainer_state:
550
+ value: false
551
+ restore_dataloader:
552
+ value: true
553
+ run_name:
554
+ value: glue_20250924_063615
555
+ save_dataloader_state:
556
+ value: false
557
+ save_folder:
558
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
559
+ save_interval:
560
+ value: 500
561
+ save_interval_action_head:
562
+ value: 500
563
+ save_interval_ephemeral:
564
+ value: null
565
+ save_interval_unsharded:
566
+ value: 500
567
+ save_num_action_head_checkpoints_to_keep:
568
+ value: 2
569
+ save_num_checkpoints_to_keep:
570
+ value: 1
571
+ save_num_unsharded_checkpoints_to_keep:
572
+ value: 1
573
+ save_overwrite:
574
+ value: true
575
+ scheduler:
576
+ value:
577
+ alpha_f: 0.1
578
+ connector_t_warmup: 200
579
+ grad_clip_warmup_factor: null
580
+ grad_clip_warmup_steps: null
581
+ llm_t_warmup: 2000
582
+ name: multimodal
583
+ t_max: null
584
+ t_warmup: 100
585
+ units: steps
586
+ vit_t_warmup: 2000
587
+ warmup_min_lr: 0
588
+ seed:
589
+ value: 6198
590
+ sharded_checkpointer:
591
+ value: torch_legacy
592
+ softmax_auxiliary_loss:
593
+ value: true
594
+ softmax_auxiliary_loss_scale:
595
+ value: 0.0001
596
+ speed_monitor:
597
+ value:
598
+ gpu_flops_available: null
599
+ window_size: 20
600
+ stop_after:
601
+ value: null
602
+ stop_at:
603
+ value: 500000
604
+ time_limit:
605
+ value: null
606
+ torch_profiling:
607
+ value: false
608
+ train_exit_random_layer:
609
+ value: false
610
+ use_lora:
611
+ value: true
all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/output.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 09/24 [06:36:44] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': 'Lerobot_Glue_best', 'path': '/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1', 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/mnt/data2/guominghao/a1/warehouse/glue_lerobot', 0.6, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 0.4, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 0.1, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ build_tokenizer, cache_dir None tokenizer_dir None
10
+ 09/24 [06:36:46] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:128
11
+ 09/24 [06:36:47] INFO | >> build_rlds_train_dataset: Loading train dataset: vla_dataset_realworld/train __init__.py:517
12
+ ****** Import RLDSBatchTransform, RLDSDataset successfully.
13
+ ****** before RLDS dataset...
14
+ ****** data_config.rlds_dataset_name: Lerobot_Glue_best
15
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1
16
+ Traceback (most recent call last):
17
+ File "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py", line 397, in <module>
18
+ train(cfg)
19
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 160, in main
20
+ train_loader = build_train_dataloader(cfg, device)
21
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 196, in build_train_dataloader
22
+ return build_vla_train_dataloader(train_config, device)
23
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 314, in build_vla_train_dataloader
24
+ ds = build_rlds_train_dataset(train_config, _normalization_type, _image_augmentation, device)
25
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 527, in build_rlds_train_dataset
26
+ dataset = RLDSDataset(
27
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/rlds_datasets.py", line 355, in __init__
28
+ per_dataset_kwargs, weights = get_oxe_dataset_kwargs_and_weights(
29
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/rlds/oxe/materialize.py", line 119, in get_oxe_dataset_kwargs_and_weights
30
+ make_oxe_dataset_kwargs(
31
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/rlds/oxe/materialize.py", line 31, in make_oxe_dataset_kwargs
32
+ dataset_kwargs = deepcopy(OXE_DATASET_CONFIGS[dataset_name])
33
+ KeyError: 'Lerobot_Glue_best'
all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/requirements.txt ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ tensorflow-rocm==2.16.2
15
+ termcolor==3.1.0
16
+ Werkzeug==3.1.3
17
+ Brotli==1.1.0
18
+ Farama-Notifications==0.0.4
19
+ MarkupSafe==2.1.5
20
+ PyYAML==6.0.2
21
+ absl-py==2.3.1
22
+ accelerate==1.10.1
23
+ ai2-molmo==0.0.0
24
+ aiofiles==24.1.0
25
+ aiohappyeyeballs==2.6.1
26
+ aiohttp==3.12.15
27
+ aiosignal==1.4.0
28
+ annotated-types==0.7.0
29
+ antlr4-python3-runtime==4.9.3
30
+ anyio==4.10.0
31
+ array_record==0.8.1
32
+ async-timeout==5.0.1
33
+ attrs==25.3.0
34
+ av==15.1.0
35
+ backports.tarfile==1.2.0
36
+ beaker-gantry==3.2.0
37
+ beaker-py==2.5.0
38
+ black==23.12.1
39
+ boltons==25.0.0
40
+ boto3==1.40.33
41
+ botocore==1.40.33
42
+ build==1.3.0
43
+ cached_path==1.7.3
44
+ cached-property==2.0.1
45
+ cachetools==5.5.2
46
+ certifi==2025.8.3
47
+ cffi==2.0.0
48
+ charset-normalizer==3.4.3
49
+ click==8.2.1
50
+ click-help-colors==0.9.4
51
+ click-option-group==0.5.7
52
+ cloudpickle==3.1.1
53
+ cmake==4.1.0
54
+ contourpy==1.3.2
55
+ cryptography==46.0.1
56
+ cycler==0.12.1
57
+ dataclass-extensions==0.2.3
58
+ datasets==3.6.0
59
+ decorator==5.2.1
60
+ deepdiff==8.6.1
61
+ diffusers==0.35.1
62
+ dill==0.3.8
63
+ distro==1.9.0
64
+ dlimp==0.0.1
65
+ dm-tree==0.1.9
66
+ docutils==0.22.1
67
+ draccus==0.10.0
68
+ editdistance==0.8.1
69
+ einops==0.8.1
70
+ einops-exts==0.0.4
71
+ etils==1.13.0
72
+ evdev==1.9.2
73
+ exceptiongroup==1.3.0
74
+ face==24.0.0
75
+ fastapi==0.116.2
76
+ ffmpy==0.6.1
77
+ fiddle==0.3.0
78
+ filelock==3.13.1
79
+ fonttools==4.60.0
80
+ frozenlist==1.7.0
81
+ fsspec==2023.9.2
82
+ ftfy==6.3.1
83
+ gcsfs==2023.9.2
84
+ gitdb==4.0.12
85
+ GitPython==3.1.45
86
+ glom==24.11.0
87
+ google-api-core==2.25.1
88
+ google-auth==2.40.3
89
+ google-auth-oauthlib==1.2.2
90
+ google-cloud-core==2.4.3
91
+ google-cloud-storage==2.19.0
92
+ google-crc32c==1.7.1
93
+ google-resumable-media==2.7.2
94
+ googleapis-common-protos==1.70.0
95
+ gradio==5.46.0
96
+ gradio_client==1.13.0
97
+ graphviz==0.21
98
+ groovy==0.1.2
99
+ grpcio==1.75.0
100
+ gymnasium==0.29.1
101
+ h11==0.16.0
102
+ hf_transfer==0.1.9
103
+ hf-xet==1.1.10
104
+ httpcore==1.0.9
105
+ httpx==0.28.1
106
+ huggingface-hub==0.35.0
107
+ id==1.5.0
108
+ idna==3.10
109
+ imageio==2.37.0
110
+ imageio-ffmpeg==0.6.0
111
+ importlib_metadata==8.7.0
112
+ importlib_resources==6.5.2
113
+ iniconfig==2.1.0
114
+ inquirerpy==0.3.4
115
+ isort==5.12.0
116
+ jaraco.classes==3.4.0
117
+ jaraco.context==6.0.1
118
+ jaraco.functools==4.3.0
119
+ jeepney==0.9.0
120
+ Jinja2==3.1.4
121
+ jiter==0.11.0
122
+ jmespath==1.0.1
123
+ joblib==1.5.2
124
+ jsonlines==4.0.0
125
+ keras==2.15.0
126
+ keyring==25.6.0
127
+ kiwisolver==1.4.9
128
+ latex2sympy2_extended==1.10.2
129
+ lerobot==0.3.4
130
+ Levenshtein==0.27.1
131
+ libcst==1.8.4
132
+ lightning-utilities==0.15.2
133
+ markdown-it-py==4.0.0
134
+ math-verify==0.8.0
135
+ matplotlib==3.10.6
136
+ mdurl==0.1.2
137
+ mergedeep==1.3.4
138
+ ml-dtypes==0.2.0
139
+ ml_dtypes==0.5.3
140
+ more-itertools==10.8.0
141
+ mpmath==1.3.0
142
+ msgspec==0.19.0
143
+ multidict==6.6.4
144
+ multiprocess==0.70.16
145
+ mypy==1.3.0
146
+ mypy_extensions==1.1.0
147
+ necessary==0.4.3
148
+ networkx==3.3
149
+ nh3==0.3.0
150
+ nltk==3.9.1
151
+ numpy==1.26.4
152
+ numpy==2.2.6
153
+ oauthlib==3.3.1
154
+ omegaconf==2.3.0
155
+ openai==1.108.0
156
+ opencv-python-headless==4.12.0.88
157
+ OpenEXR==3.4.0
158
+ orderly-set==5.5.0
159
+ orjson==3.11.3
160
+ packaging==25.0
161
+ pandas==2.3.2
162
+ pathspec==0.12.1
163
+ petname==2.6
164
+ pfzy==0.3.4
165
+ pillow==11.0.0
166
+ pip==25.2
167
+ platformdirs==4.4.0
168
+ pluggy==1.6.0
169
+ promise==2.3
170
+ prompt_toolkit==3.0.52
171
+ propcache==0.3.2
172
+ proto-plus==1.26.1
173
+ protobuf==4.21.12
174
+ protobuf==6.32.1
175
+ psutil==7.1.0
176
+ pyarrow==21.0.0
177
+ pyasn1==0.6.1
178
+ pyasn1_modules==0.4.2
179
+ pycparser==2.23
180
+ pydantic==2.11.9
181
+ pydantic_core==2.33.2
182
+ pydub==0.25.1
183
+ Pygments==2.19.2
184
+ pynput==1.8.1
185
+ pyparsing==3.2.4
186
+ pyproject_hooks==1.2.0
187
+ pyserial==3.5
188
+ pytest==8.4.2
189
+ pytest-sphinx==0.6.3
190
+ python-dateutil==2.9.0.post0
191
+ python-Levenshtein==0.27.1
192
+ python-multipart==0.0.20
193
+ python-xlib==0.33
194
+ pytorch-triton-rocm==3.4.0
195
+ pytz==2025.2
196
+ pyyaml-include==1.4.1
197
+ RapidFuzz==3.14.1
198
+ readme_renderer==44.0
199
+ regex==2025.9.1
200
+ requests==2.32.5
201
+ requests-oauthlib==2.0.0
202
+ requests-toolbelt==1.0.0
203
+ requirements-parser==0.13.0
204
+ rerun-sdk==0.22.1
205
+ rfc3986==2.0.0
206
+ rich==13.9.4
207
+ rsa==4.9.1
208
+ ruff==0.13.0
209
+ s3transfer==0.14.0
210
+ safehttpx==0.1.6
211
+ safetensors==0.6.2
212
+ scikit-learn==1.7.2
213
+ scipy==1.15.3
214
+ SecretStorage==3.4.0
215
+ semantic-version==2.10.0
216
+ sentencepiece==0.2.1
217
+ sentry-sdk==2.38.0
218
+ setuptools==78.1.1
219
+ shellingham==1.5.4
220
+ six==1.17.0
221
+ smart_open==7.3.1
222
+ smashed==0.21.5
223
+ smmap==5.0.2
224
+ sniffio==1.3.1
225
+ starlette==0.48.0
226
+ sympy==1.13.3
227
+ tensorboard==2.15.2
228
+ tensorboard==2.19.0
229
+ tensorflow==2.15.0
230
+ tensorflow-addons==0.23.0
231
+ tensorflow-datasets==4.9.3
232
+ tensorflow-estimator==2.15.0
233
+ tensorflow-graphics==2021.12.3
234
+ tensorflow-metadata==1.17.2
235
+ threadpoolctl==3.6.0
236
+ timm==1.0.19
237
+ tokenizers==0.22.0
238
+ toml==0.10.2
239
+ tomli==2.2.1
240
+ tomlkit==0.13.3
241
+ torch==2.8.0+rocm6.4
242
+ torchcodec==0.5
243
+ torchmetrics==1.8.2
244
+ torchvision==0.23.0+rocm6.4
245
+ tqdm==4.67.1
246
+ transformers==4.56.1
247
+ trimesh==4.8.2
248
+ trouting==0.3.3
249
+ twine==6.2.0
250
+ typeguard==2.13.3
251
+ typer==0.17.4
252
+ typing_extensions==4.15.0
253
+ typing-inspect==0.9.0
254
+ typing-inspection==0.4.1
255
+ tzdata==2025.2
256
+ urllib3==2.5.0
257
+ uvicorn==0.35.0
258
+ wandb==0.21.4
259
+ wcwidth==0.2.13
260
+ websockets==15.0.1
261
+ wheel==0.45.1
262
+ wrapt==1.14.2
263
+ xxhash==3.5.0
264
+ yarl==1.20.1
265
+ zipp==3.23.0
266
+ lerobot==0.3.4
267
+ minLoRA==0.1.0
268
+ autocommand==2.2.2
269
+ backports.tarfile==1.2.0
270
+ importlib_metadata==8.0.0
271
+ inflect==7.3.1
272
+ jaraco.collections==5.1.0
273
+ jaraco.context==5.3.0
274
+ jaraco.functools==4.0.1
275
+ jaraco.text==3.12.1
276
+ more-itertools==10.3.0
277
+ packaging==24.2
278
+ platformdirs==4.2.2
279
+ tomli==2.0.1
280
+ typeguard==4.3.0
281
+ typing_extensions==4.12.2
282
+ wheel==0.45.1
283
+ zipp==3.19.2
all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-09-24T06:36:42.806544Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "flow_matching",
12
+ "--seq_len",
13
+ "768",
14
+ "--lora_rank",
15
+ "32",
16
+ "--lora_llm",
17
+ "--checkpoint",
18
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
19
+ "--device_train_microbatch_size",
20
+ "22",
21
+ "--global_batch_size",
22
+ "176",
23
+ "--dataset",
24
+ "vla_dataset_realworld",
25
+ "--llm_learning_rate",
26
+ "5e-5",
27
+ "--wandb_entity",
28
+ "henryeap",
29
+ "--wandb_project",
30
+ "a1-realworld",
31
+ "--wandb_run_name",
32
+ "glue",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "c13f2763af61e0d729a8b5ab4bdefc512205bcc5"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb",
44
+ "host": "auh7-1b-gpu-188",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "470343073792",
53
+ "used": "51148013568"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606952448"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "1",
62
+ "uniqueId": "0x9b5c1c302c8129f8",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "3",
75
+ "uniqueId": "0xf61ec17df11883bd",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "5",
88
+ "uniqueId": "0x137c9ede1bb1518e",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "6",
101
+ "uniqueId": "0xfa8b85a4625b04f",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "0",
114
+ "uniqueId": "0x3558c3014c813fdb",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "4",
127
+ "uniqueId": "0xa515afd8ced1d39d",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "2",
140
+ "uniqueId": "0x399226d2b2bfa544",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "7",
153
+ "uniqueId": "0x21a2e88d06c419dc",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1758954964",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "1606",
177
+ "job_name": "realworld_mh",
178
+ "job_nodelist": "auh7-1b-gpu-188",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1758695764",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "1606",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-188",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "2192665",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-188",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "71y4kqofohuhlolkoekjc4r6f1aprdzt"
204
+ }
all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime":2,"_wandb":{"runtime":2}}
all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T06:36:43.051707086Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-09-24T06:36:44.180319507Z","level":"INFO","msg":"stream: created new stream","id":"6tj2c8pr"}
3
+ {"time":"2025-09-24T06:36:44.180366367Z","level":"INFO","msg":"stream: started","id":"6tj2c8pr"}
4
+ {"time":"2025-09-24T06:36:44.180386688Z","level":"INFO","msg":"writer: started","stream_id":"6tj2c8pr"}
5
+ {"time":"2025-09-24T06:36:44.180391338Z","level":"INFO","msg":"handler: started","stream_id":"6tj2c8pr"}
6
+ {"time":"2025-09-24T06:36:44.180402908Z","level":"INFO","msg":"sender: started","stream_id":"6tj2c8pr"}
7
+ {"time":"2025-09-24T06:36:47.096300319Z","level":"INFO","msg":"stream: closing","id":"6tj2c8pr"}
8
+ {"time":"2025-09-24T06:36:48.366366183Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-09-24T06:36:48.706218134Z","level":"INFO","msg":"handler: closed","stream_id":"6tj2c8pr"}
10
+ {"time":"2025-09-24T06:36:48.707592142Z","level":"INFO","msg":"sender: closed","stream_id":"6tj2c8pr"}
11
+ {"time":"2025-09-24T06:36:48.707612462Z","level":"INFO","msg":"stream: closed","id":"6tj2c8pr"}
all_flow_matching/glue_best/wandb/wandb/run-20250924_063642-6tj2c8pr/run-6tj2c8pr.wandb ADDED
Binary file (18.9 kB). View file
 
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/config.yaml ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ ym1fr90agfv5lp1xadwns4zfs5lnvysu:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - flow_matching
13
+ - --seq_len
14
+ - "768"
15
+ - --lora_rank
16
+ - "32"
17
+ - --lora_llm
18
+ - --checkpoint
19
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
20
+ - --device_train_microbatch_size
21
+ - "22"
22
+ - --global_batch_size
23
+ - "176"
24
+ - --dataset
25
+ - vla_dataset_realworld
26
+ - --llm_learning_rate
27
+ - "5e-5"
28
+ - --wandb_entity
29
+ - henryeap
30
+ - --wandb_project
31
+ - a1-realworld
32
+ - --wandb_run_name
33
+ - glue
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "51148275712"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: c13f2763af61e0d729a8b5ab4bdefc512205bcc5
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "2"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x399226d2b2bfa544"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "0"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0x3558c3014c813fdb"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "4"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0xa515afd8ced1d39d"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "5"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0x137c9ede1bb1518e"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "6"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0xfa8b85a4625b04f"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "7"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0x21a2e88d06c419dc"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "1"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0x9b5c1c302c8129f8"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "3"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0xf61ec17df11883bd"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-188
140
+ memory:
141
+ total: "2434606952448"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1758955952"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "1607"
158
+ job_name: realworld_mh
159
+ job_nodelist: auh7-1b-gpu-188
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1758696752"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "1607"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-188
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "2194698"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-188
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-09-24T06:53:10.958875Z"
184
+ writerId: ym1fr90agfv5lp1xadwns4zfs5lnvysu
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 3
191
+ - 5
192
+ - 11
193
+ - 41
194
+ - 49
195
+ - 51
196
+ - 53
197
+ - 63
198
+ - 71
199
+ - 83
200
+ - 95
201
+ - 105
202
+ "2":
203
+ - 1
204
+ - 3
205
+ - 5
206
+ - 11
207
+ - 41
208
+ - 49
209
+ - 51
210
+ - 53
211
+ - 63
212
+ - 71
213
+ - 83
214
+ - 95
215
+ - 105
216
+ "3":
217
+ - 13
218
+ - 15
219
+ - 16
220
+ "4": 3.10.18
221
+ "5": 0.21.4
222
+ "6": 4.56.1
223
+ "12": 0.21.4
224
+ "13": linux-x86_64
225
+ activation_checkpointing:
226
+ value: whole_layer
227
+ allow_resume:
228
+ value: false
229
+ batch_divisor:
230
+ value: global_batch
231
+ canceled_check_interval:
232
+ value: 50
233
+ checkpoint_dir:
234
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
235
+ compile:
236
+ value: null
237
+ console_log_interval:
238
+ value: 1
239
+ data:
240
+ value:
241
+ dataset: vla_dataset_realworld
242
+ drop_last: true
243
+ for_inference: false
244
+ lerobot_episode_index_end: null
245
+ lerobot_episode_index_start: null
246
+ mixture: null
247
+ multi_modal: torch
248
+ num_workers: 0
249
+ pad: to_max
250
+ persistent_workers: false
251
+ pin_memory: true
252
+ prefetch_factor: null
253
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
254
+ rlds_dataset_name: libero_4_task_suites_no_noops
255
+ rlds_read_threads: 8
256
+ rlds_shuffle_buffer_size: 100000
257
+ rlds_traj_threads: 8
258
+ root_size_mixture: null
259
+ seed: 95818
260
+ sequence_length: 768
261
+ shuffle: true
262
+ shuffle_messages: false
263
+ split: train
264
+ timeout: 0
265
+ use_proprio: true
266
+ use_wrist_image: true
267
+ device_eval_batch_size:
268
+ value: 4
269
+ device_inf_eval_batch_size:
270
+ value: 16
271
+ device_train_batch_size:
272
+ value: 22
273
+ device_train_grad_accum:
274
+ value: 1
275
+ device_train_microbatch_size:
276
+ value: 22
277
+ dry_run:
278
+ value: false
279
+ early_exit:
280
+ value: false
281
+ epoch:
282
+ value: null
283
+ eval_interval:
284
+ value: 0
285
+ eval_on_load:
286
+ value: false
287
+ eval_subset_num_batches:
288
+ value: -1
289
+ evaluators:
290
+ value:
291
+ - data:
292
+ dataset: vla_dataset_realworld
293
+ drop_last: true
294
+ for_inference: false
295
+ lerobot_episode_index_end: 765
296
+ lerobot_episode_index_start: 353
297
+ mixture: null
298
+ multi_modal: torch
299
+ num_workers: 0
300
+ pad: to_max
301
+ persistent_workers: true
302
+ pin_memory: true
303
+ prefetch_factor: null
304
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
305
+ rlds_dataset_name: libero_4_task_suites_no_noops
306
+ rlds_read_threads: 8
307
+ rlds_shuffle_buffer_size: 256000
308
+ rlds_traj_threads: 8
309
+ root_size_mixture: null
310
+ seed: null
311
+ sequence_length: 768
312
+ shuffle: false
313
+ shuffle_messages: false
314
+ split: validation
315
+ timeout: 0
316
+ use_proprio: true
317
+ use_wrist_image: true
318
+ device_eval_batch_size: null
319
+ eval_name: null
320
+ label: val
321
+ max_examples: null
322
+ max_new_tokens: 448
323
+ mm_evaluator: null
324
+ save_dir: null
325
+ save_to_checkpoint_dir: false
326
+ skip_if_metrics_cached: true
327
+ subset_num_batches: 64
328
+ extra_steps_after_cancel:
329
+ value: 10
330
+ fast_forward_batches:
331
+ value: null
332
+ force_save_unsharded:
333
+ value: false
334
+ fsdp:
335
+ value:
336
+ hybrid_sharding_num_model_replicas: null
337
+ precision: float
338
+ sharding_strategy: FULL_SHARD
339
+ use_orig_params: true
340
+ wrapping_strategy: by_block_and_size
341
+ ft_connector:
342
+ value: false
343
+ ft_embedding:
344
+ value: lm_head
345
+ ft_llm:
346
+ value: false
347
+ ft_vit:
348
+ value: false
349
+ fused_loss:
350
+ value: null
351
+ gen1_gc_interval:
352
+ value: 1
353
+ global_train_batch_size:
354
+ value: 176
355
+ inf_eval_interval:
356
+ value: -1
357
+ inf_eval_subset_num_batches:
358
+ value: -1
359
+ inf_evaluators:
360
+ value: []
361
+ initial_model_checkpoint:
362
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
363
+ keep_lr_on_load:
364
+ value: true
365
+ load_model_config:
366
+ value: null
367
+ load_path:
368
+ value: null
369
+ load_path_sharded_checkpointer:
370
+ value: null
371
+ lora:
372
+ value: false
373
+ lora_connector:
374
+ value: false
375
+ lora_llm:
376
+ value: true
377
+ lora_rank:
378
+ value: 32
379
+ lora_vit:
380
+ value: false
381
+ max_duration:
382
+ value: 500000
383
+ max_grad_norm:
384
+ value: 1
385
+ max_grad_norm_ratio:
386
+ value: null
387
+ model:
388
+ value:
389
+ action_head: flow_matching
390
+ action_head_dit_depth: 28
391
+ action_head_dit_hidden_size: 1152
392
+ action_head_dit_num_heads: 16
393
+ action_use_left_eef: false
394
+ action_use_mobile_base: false
395
+ activation_type: swiglu
396
+ additional_vocab_size: 128
397
+ always_start_with_space: true
398
+ attention_dropout: 0
399
+ attention_layer_norm: false
400
+ attention_layer_norm_with_affine: true
401
+ attention_type: sdpa
402
+ bias_for_layer_norm: null
403
+ block_group_size: 1
404
+ block_type: sequential
405
+ clip_qkv: null
406
+ crop_mode: overlap-and-resize-c2
407
+ d_model: 3584
408
+ default_inference_len: 65
409
+ embedding_dropout: 0
410
+ embedding_size: 152064
411
+ fix_image_padding: true
412
+ float32_attention: true
413
+ head_dim: null
414
+ image_feature_dropout: 0
415
+ image_padding_embed: pad_and_partial_pad
416
+ image_pooling_2d: attention_meanq
417
+ image_pooling_h: 2
418
+ image_pooling_w: 2
419
+ image_projector: mlp
420
+ include_bias: false
421
+ init_cutoff_factor: null
422
+ init_device: null
423
+ init_fn: normal
424
+ init_std: 0.02
425
+ initializer_range: 0.02
426
+ layer_norm_eps: 1e-06
427
+ layer_norm_type: rms
428
+ layer_norm_with_affine: true
429
+ llm_causal_attention: false
430
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
431
+ low_cpu_fsdp: true
432
+ max_crops: 12
433
+ max_position_embeddings: null
434
+ max_sequence_length: 4096
435
+ message_formatting: role
436
+ mlp_hidden_size: 37888
437
+ mlp_ratio: 4
438
+ moe_capacity_factor: 1.25
439
+ moe_dropless: true
440
+ moe_interleave: false
441
+ moe_lbl_in_fp32: false
442
+ moe_log_expert_assignment: false
443
+ moe_loss_weight: 0.1
444
+ moe_mlp_impl: sparse
445
+ moe_num_experts: 8
446
+ moe_shared_expert: false
447
+ moe_top_k: 2
448
+ moe_zloss_weight: null
449
+ multi_annotation_weighting: root_subsegments
450
+ n_heads: 28
451
+ n_kv_heads: 4
452
+ n_layers: 28
453
+ new_embedding_init_range: 0.02
454
+ norm_after: false
455
+ normalize_input_embeds: false
456
+ num_diffusion_inference_steps: 30
457
+ num_diffusion_steps: 1000
458
+ overlap_margins:
459
+ - 4
460
+ - 4
461
+ pad_tokenizer: true
462
+ pad_value: 0
463
+ precision: amp_bf16
464
+ prompt_type: uber_model
465
+ qkv_bias: true
466
+ residual_dropout: 0.1
467
+ response_residual_dropout: 0
468
+ rope: true
469
+ rope_full_precision: true
470
+ rope_theta: 1e+06
471
+ scale_logits: false
472
+ system_prompt_kind: demo_or_style
473
+ tokenizer:
474
+ identifier: Qwen/Qwen2-7B
475
+ tokenizer_dir: null
476
+ use_col_tokens: true
477
+ use_position_ids: true
478
+ use_proprio: true
479
+ vision_backbone:
480
+ attention_dropout: 0
481
+ fsdp_wrap: false
482
+ image_default_input_size:
483
+ - 336
484
+ - 336
485
+ image_dropout_rate: 0
486
+ image_emb_dim: 1024
487
+ image_head_dim: 64
488
+ image_mlp_activations: quick_gelu
489
+ image_mlp_dim: 4096
490
+ image_model_type: openai
491
+ image_norm_eps: 1e-05
492
+ image_num_heads: 16
493
+ image_num_key_value_heads: 16
494
+ image_num_layers: 23
495
+ image_num_pos: 577
496
+ image_patch_size: 14
497
+ image_pos_patch_size: 14
498
+ initializer_range: 0.02
499
+ residual_dropout: 0
500
+ resize_mode: default
501
+ vit_layers:
502
+ - -2
503
+ - -9
504
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
505
+ vocab_size: 152064
506
+ weight_tying: false
507
+ multi_component_grad_norm:
508
+ value: true
509
+ no_pre_train_checkpoint:
510
+ value: true
511
+ optimizer:
512
+ value:
513
+ betas:
514
+ - 0.9
515
+ - 0.95
516
+ connector_betas:
517
+ - 0.9
518
+ - 0.95
519
+ connector_eps: 1e-06
520
+ connector_learning_rate: 0.0002
521
+ connector_weight_decay: 0
522
+ eps: 1e-05
523
+ learning_rate: 0.0001
524
+ llm_betas:
525
+ - 0.9
526
+ - 0.95
527
+ llm_eps: 1e-06
528
+ llm_learning_rate: 5e-05
529
+ llm_weight_decay: 0
530
+ metrics_log_interval: 20
531
+ name: adamw
532
+ vit_betas:
533
+ - 0.9
534
+ - 0.95
535
+ vit_eps: 1e-06
536
+ vit_learning_rate: 6e-06
537
+ vit_weight_decay: 0
538
+ weight_decay: 0.01
539
+ precision:
540
+ value: amp_bf16
541
+ python_profiling:
542
+ value: false
543
+ remote_save_folder:
544
+ value: null
545
+ reset_dataloader_state:
546
+ value: false
547
+ reset_optimizer_state:
548
+ value: false
549
+ reset_trainer_state:
550
+ value: false
551
+ restore_dataloader:
552
+ value: true
553
+ run_name:
554
+ value: glue_20250924_065243
555
+ save_dataloader_state:
556
+ value: false
557
+ save_folder:
558
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
559
+ save_interval:
560
+ value: 500
561
+ save_interval_action_head:
562
+ value: 500
563
+ save_interval_ephemeral:
564
+ value: null
565
+ save_interval_unsharded:
566
+ value: 500
567
+ save_num_action_head_checkpoints_to_keep:
568
+ value: 2
569
+ save_num_checkpoints_to_keep:
570
+ value: 1
571
+ save_num_unsharded_checkpoints_to_keep:
572
+ value: 1
573
+ save_overwrite:
574
+ value: true
575
+ scheduler:
576
+ value:
577
+ alpha_f: 0.1
578
+ connector_t_warmup: 200
579
+ grad_clip_warmup_factor: null
580
+ grad_clip_warmup_steps: null
581
+ llm_t_warmup: 2000
582
+ name: multimodal
583
+ t_max: null
584
+ t_warmup: 100
585
+ units: steps
586
+ vit_t_warmup: 2000
587
+ warmup_min_lr: 0
588
+ seed:
589
+ value: 6198
590
+ sharded_checkpointer:
591
+ value: torch_legacy
592
+ softmax_auxiliary_loss:
593
+ value: true
594
+ softmax_auxiliary_loss_scale:
595
+ value: 0.0001
596
+ speed_monitor:
597
+ value:
598
+ gpu_flops_available: null
599
+ window_size: 20
600
+ stop_after:
601
+ value: null
602
+ stop_at:
603
+ value: 500000
604
+ time_limit:
605
+ value: null
606
+ torch_profiling:
607
+ value: false
608
+ train_exit_random_layer:
609
+ value: false
610
+ use_lora:
611
+ value: true
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/output.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 09/24 [06:53:12] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': None, 'path': None, 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best', 0.6, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 0.4, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 0.1, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ ****** Skip RLDS main; path not found: None
10
+ ****** start build LeRobot main...
11
+ build_tokenizer, cache_dir None tokenizer_dir None
12
+ 09/24 [06:53:14] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:128
13
+ 09/24 [06:53:15] INFO | >> Loading train dataset: vla_dataset_realworld/train __init__.py:434
14
+ Traceback (most recent call last):
15
+ File "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py", line 397, in <module>
16
+ train(cfg)
17
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 160, in main
18
+ train_loader = build_train_dataloader(cfg, device)
19
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 196, in build_train_dataloader
20
+ return build_vla_train_dataloader(train_config, device)
21
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 327, in build_vla_train_dataloader
22
+ ds = build_lerobot_train_dataset(train_config, normalization_type,device)
23
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 435, in build_lerobot_train_dataset
24
+ from olmo.data.vla.lerobot_datasets import LeRobotDatasetWrapper
25
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/lerobot_datasets.py", line 71, in <module>
26
+ class LeRobotDatasetWrapper(Dataset):
27
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/lerobot_datasets.py", line 72, in LeRobotDatasetWrapper
28
+ def __init__(self, dataset_path, chunk_size=NUM_ACTIONS_CHUNK,
29
+ NameError: name 'NUM_ACTIONS_CHUNK' is not defined
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/requirements.txt ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ tensorflow-rocm==2.16.2
15
+ termcolor==3.1.0
16
+ Werkzeug==3.1.3
17
+ Brotli==1.1.0
18
+ Farama-Notifications==0.0.4
19
+ MarkupSafe==2.1.5
20
+ PyYAML==6.0.2
21
+ absl-py==2.3.1
22
+ accelerate==1.10.1
23
+ ai2-molmo==0.0.0
24
+ aiofiles==24.1.0
25
+ aiohappyeyeballs==2.6.1
26
+ aiohttp==3.12.15
27
+ aiosignal==1.4.0
28
+ annotated-types==0.7.0
29
+ antlr4-python3-runtime==4.9.3
30
+ anyio==4.10.0
31
+ array_record==0.8.1
32
+ async-timeout==5.0.1
33
+ attrs==25.3.0
34
+ av==15.1.0
35
+ backports.tarfile==1.2.0
36
+ beaker-gantry==3.2.0
37
+ beaker-py==2.5.0
38
+ black==23.12.1
39
+ boltons==25.0.0
40
+ boto3==1.40.33
41
+ botocore==1.40.33
42
+ build==1.3.0
43
+ cached_path==1.7.3
44
+ cached-property==2.0.1
45
+ cachetools==5.5.2
46
+ certifi==2025.8.3
47
+ cffi==2.0.0
48
+ charset-normalizer==3.4.3
49
+ click==8.2.1
50
+ click-help-colors==0.9.4
51
+ click-option-group==0.5.7
52
+ cloudpickle==3.1.1
53
+ cmake==4.1.0
54
+ contourpy==1.3.2
55
+ cryptography==46.0.1
56
+ cycler==0.12.1
57
+ dataclass-extensions==0.2.3
58
+ datasets==3.6.0
59
+ decorator==5.2.1
60
+ deepdiff==8.6.1
61
+ diffusers==0.35.1
62
+ dill==0.3.8
63
+ distro==1.9.0
64
+ dlimp==0.0.1
65
+ dm-tree==0.1.9
66
+ docutils==0.22.1
67
+ draccus==0.10.0
68
+ editdistance==0.8.1
69
+ einops==0.8.1
70
+ einops-exts==0.0.4
71
+ etils==1.13.0
72
+ evdev==1.9.2
73
+ exceptiongroup==1.3.0
74
+ face==24.0.0
75
+ fastapi==0.116.2
76
+ ffmpy==0.6.1
77
+ fiddle==0.3.0
78
+ filelock==3.13.1
79
+ fonttools==4.60.0
80
+ frozenlist==1.7.0
81
+ fsspec==2023.9.2
82
+ ftfy==6.3.1
83
+ gcsfs==2023.9.2
84
+ gitdb==4.0.12
85
+ GitPython==3.1.45
86
+ glom==24.11.0
87
+ google-api-core==2.25.1
88
+ google-auth==2.40.3
89
+ google-auth-oauthlib==1.2.2
90
+ google-cloud-core==2.4.3
91
+ google-cloud-storage==2.19.0
92
+ google-crc32c==1.7.1
93
+ google-resumable-media==2.7.2
94
+ googleapis-common-protos==1.70.0
95
+ gradio==5.46.0
96
+ gradio_client==1.13.0
97
+ graphviz==0.21
98
+ groovy==0.1.2
99
+ grpcio==1.75.0
100
+ gymnasium==0.29.1
101
+ h11==0.16.0
102
+ hf_transfer==0.1.9
103
+ hf-xet==1.1.10
104
+ httpcore==1.0.9
105
+ httpx==0.28.1
106
+ huggingface-hub==0.35.0
107
+ id==1.5.0
108
+ idna==3.10
109
+ imageio==2.37.0
110
+ imageio-ffmpeg==0.6.0
111
+ importlib_metadata==8.7.0
112
+ importlib_resources==6.5.2
113
+ iniconfig==2.1.0
114
+ inquirerpy==0.3.4
115
+ isort==5.12.0
116
+ jaraco.classes==3.4.0
117
+ jaraco.context==6.0.1
118
+ jaraco.functools==4.3.0
119
+ jeepney==0.9.0
120
+ Jinja2==3.1.4
121
+ jiter==0.11.0
122
+ jmespath==1.0.1
123
+ joblib==1.5.2
124
+ jsonlines==4.0.0
125
+ keras==2.15.0
126
+ keyring==25.6.0
127
+ kiwisolver==1.4.9
128
+ latex2sympy2_extended==1.10.2
129
+ lerobot==0.3.4
130
+ Levenshtein==0.27.1
131
+ libcst==1.8.4
132
+ lightning-utilities==0.15.2
133
+ markdown-it-py==4.0.0
134
+ math-verify==0.8.0
135
+ matplotlib==3.10.6
136
+ mdurl==0.1.2
137
+ mergedeep==1.3.4
138
+ ml-dtypes==0.2.0
139
+ ml_dtypes==0.5.3
140
+ more-itertools==10.8.0
141
+ mpmath==1.3.0
142
+ msgspec==0.19.0
143
+ multidict==6.6.4
144
+ multiprocess==0.70.16
145
+ mypy==1.3.0
146
+ mypy_extensions==1.1.0
147
+ necessary==0.4.3
148
+ networkx==3.3
149
+ nh3==0.3.0
150
+ nltk==3.9.1
151
+ numpy==1.26.4
152
+ numpy==2.2.6
153
+ oauthlib==3.3.1
154
+ omegaconf==2.3.0
155
+ openai==1.108.0
156
+ opencv-python-headless==4.12.0.88
157
+ OpenEXR==3.4.0
158
+ orderly-set==5.5.0
159
+ orjson==3.11.3
160
+ packaging==25.0
161
+ pandas==2.3.2
162
+ pathspec==0.12.1
163
+ petname==2.6
164
+ pfzy==0.3.4
165
+ pillow==11.0.0
166
+ pip==25.2
167
+ platformdirs==4.4.0
168
+ pluggy==1.6.0
169
+ promise==2.3
170
+ prompt_toolkit==3.0.52
171
+ propcache==0.3.2
172
+ proto-plus==1.26.1
173
+ protobuf==4.21.12
174
+ protobuf==6.32.1
175
+ psutil==7.1.0
176
+ pyarrow==21.0.0
177
+ pyasn1==0.6.1
178
+ pyasn1_modules==0.4.2
179
+ pycparser==2.23
180
+ pydantic==2.11.9
181
+ pydantic_core==2.33.2
182
+ pydub==0.25.1
183
+ Pygments==2.19.2
184
+ pynput==1.8.1
185
+ pyparsing==3.2.4
186
+ pyproject_hooks==1.2.0
187
+ pyserial==3.5
188
+ pytest==8.4.2
189
+ pytest-sphinx==0.6.3
190
+ python-dateutil==2.9.0.post0
191
+ python-Levenshtein==0.27.1
192
+ python-multipart==0.0.20
193
+ python-xlib==0.33
194
+ pytorch-triton-rocm==3.4.0
195
+ pytz==2025.2
196
+ pyyaml-include==1.4.1
197
+ RapidFuzz==3.14.1
198
+ readme_renderer==44.0
199
+ regex==2025.9.1
200
+ requests==2.32.5
201
+ requests-oauthlib==2.0.0
202
+ requests-toolbelt==1.0.0
203
+ requirements-parser==0.13.0
204
+ rerun-sdk==0.22.1
205
+ rfc3986==2.0.0
206
+ rich==13.9.4
207
+ rsa==4.9.1
208
+ ruff==0.13.0
209
+ s3transfer==0.14.0
210
+ safehttpx==0.1.6
211
+ safetensors==0.6.2
212
+ scikit-learn==1.7.2
213
+ scipy==1.15.3
214
+ SecretStorage==3.4.0
215
+ semantic-version==2.10.0
216
+ sentencepiece==0.2.1
217
+ sentry-sdk==2.38.0
218
+ setuptools==78.1.1
219
+ shellingham==1.5.4
220
+ six==1.17.0
221
+ smart_open==7.3.1
222
+ smashed==0.21.5
223
+ smmap==5.0.2
224
+ sniffio==1.3.1
225
+ starlette==0.48.0
226
+ sympy==1.13.3
227
+ tensorboard==2.15.2
228
+ tensorboard==2.19.0
229
+ tensorflow==2.15.0
230
+ tensorflow-addons==0.23.0
231
+ tensorflow-datasets==4.9.3
232
+ tensorflow-estimator==2.15.0
233
+ tensorflow-graphics==2021.12.3
234
+ tensorflow-metadata==1.17.2
235
+ threadpoolctl==3.6.0
236
+ timm==1.0.19
237
+ tokenizers==0.22.0
238
+ toml==0.10.2
239
+ tomli==2.2.1
240
+ tomlkit==0.13.3
241
+ torch==2.8.0+rocm6.4
242
+ torchcodec==0.5
243
+ torchmetrics==1.8.2
244
+ torchvision==0.23.0+rocm6.4
245
+ tqdm==4.67.1
246
+ transformers==4.56.1
247
+ trimesh==4.8.2
248
+ trouting==0.3.3
249
+ twine==6.2.0
250
+ typeguard==2.13.3
251
+ typer==0.17.4
252
+ typing_extensions==4.15.0
253
+ typing-inspect==0.9.0
254
+ typing-inspection==0.4.1
255
+ tzdata==2025.2
256
+ urllib3==2.5.0
257
+ uvicorn==0.35.0
258
+ wandb==0.21.4
259
+ wcwidth==0.2.13
260
+ websockets==15.0.1
261
+ wheel==0.45.1
262
+ wrapt==1.14.2
263
+ xxhash==3.5.0
264
+ yarl==1.20.1
265
+ zipp==3.23.0
266
+ lerobot==0.3.4
267
+ minLoRA==0.1.0
268
+ autocommand==2.2.2
269
+ backports.tarfile==1.2.0
270
+ importlib_metadata==8.0.0
271
+ inflect==7.3.1
272
+ jaraco.collections==5.1.0
273
+ jaraco.context==5.3.0
274
+ jaraco.functools==4.0.1
275
+ jaraco.text==3.12.1
276
+ more-itertools==10.3.0
277
+ packaging==24.2
278
+ platformdirs==4.2.2
279
+ tomli==2.0.1
280
+ typeguard==4.3.0
281
+ typing_extensions==4.12.2
282
+ wheel==0.45.1
283
+ zipp==3.19.2
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-09-24T06:53:10.958875Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "flow_matching",
12
+ "--seq_len",
13
+ "768",
14
+ "--lora_rank",
15
+ "32",
16
+ "--lora_llm",
17
+ "--checkpoint",
18
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
19
+ "--device_train_microbatch_size",
20
+ "22",
21
+ "--global_batch_size",
22
+ "176",
23
+ "--dataset",
24
+ "vla_dataset_realworld",
25
+ "--llm_learning_rate",
26
+ "5e-5",
27
+ "--wandb_entity",
28
+ "henryeap",
29
+ "--wandb_project",
30
+ "a1-realworld",
31
+ "--wandb_run_name",
32
+ "glue",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "c13f2763af61e0d729a8b5ab4bdefc512205bcc5"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb",
44
+ "host": "auh7-1b-gpu-188",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "470343073792",
53
+ "used": "51148275712"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606952448"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "2",
62
+ "uniqueId": "0x399226d2b2bfa544",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "0",
75
+ "uniqueId": "0x3558c3014c813fdb",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "4",
88
+ "uniqueId": "0xa515afd8ced1d39d",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "5",
101
+ "uniqueId": "0x137c9ede1bb1518e",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "6",
114
+ "uniqueId": "0xfa8b85a4625b04f",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "7",
127
+ "uniqueId": "0x21a2e88d06c419dc",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "1",
140
+ "uniqueId": "0x9b5c1c302c8129f8",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "3",
153
+ "uniqueId": "0xf61ec17df11883bd",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1758955952",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "1607",
177
+ "job_name": "realworld_mh",
178
+ "job_nodelist": "auh7-1b-gpu-188",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1758696752",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "1607",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-188",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "2194698",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-188",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "ym1fr90agfv5lp1xadwns4zfs5lnvysu"
204
+ }
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":2},"_runtime":2}
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T06:53:11.01260872Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpysryscrf/port-2194776.txt","pid":2194776,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-09-24T06:53:11.013097116Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2194776}
3
+ {"time":"2025-09-24T06:53:11.013088106Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2194776-2194946-2300508100/socket","Net":"unix"}}
4
+ {"time":"2025-09-24T06:53:11.195667052Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-09-24T06:53:11.202657842Z","level":"INFO","msg":"handleInformInit: received","streamId":"qsv5q1hc","id":"1(@)"}
6
+ {"time":"2025-09-24T06:53:12.34562108Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"qsv5q1hc","id":"1(@)"}
7
+ {"time":"2025-09-24T06:53:15.425080291Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-09-24T06:53:15.425127242Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-09-24T06:53:15.425161212Z","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2025-09-24T06:53:15.425190602Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2025-09-24T06:53:15.425244213Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-2194776-2194946-2300508100/socket","Net":"unix"}}
12
+ {"time":"2025-09-24T06:53:16.766852773Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-09-24T06:53:16.767200728Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-09-24T06:53:16.767210378Z","level":"INFO","msg":"server is closed"}
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T06:53:11.204449275Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-09-24T06:53:12.345567379Z","level":"INFO","msg":"stream: created new stream","id":"qsv5q1hc"}
3
+ {"time":"2025-09-24T06:53:12.345615649Z","level":"INFO","msg":"stream: started","id":"qsv5q1hc"}
4
+ {"time":"2025-09-24T06:53:12.34563651Z","level":"INFO","msg":"sender: started","stream_id":"qsv5q1hc"}
5
+ {"time":"2025-09-24T06:53:12.34563539Z","level":"INFO","msg":"writer: started","stream_id":"qsv5q1hc"}
6
+ {"time":"2025-09-24T06:53:12.34568667Z","level":"INFO","msg":"handler: started","stream_id":"qsv5q1hc"}
7
+ {"time":"2025-09-24T06:53:15.425126022Z","level":"INFO","msg":"stream: closing","id":"qsv5q1hc"}
8
+ {"time":"2025-09-24T06:53:16.457729801Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-09-24T06:53:16.764520193Z","level":"INFO","msg":"handler: closed","stream_id":"qsv5q1hc"}
10
+ {"time":"2025-09-24T06:53:16.765675218Z","level":"INFO","msg":"sender: closed","stream_id":"qsv5q1hc"}
11
+ {"time":"2025-09-24T06:53:16.765705399Z","level":"INFO","msg":"stream: closed","id":"qsv5q1hc"}
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/logs/debug.log ADDED
@@ -0,0 +1 @@
 
 
1
+ 2025-09-24 06:53:15,425 INFO wandb-AsyncioManager-main:2194776 [service_client.py:_forward_responses():84] Reached EOF.
all_flow_matching/glue_best/wandb/wandb/run-20250924_065310-qsv5q1hc/run-qsv5q1hc.wandb ADDED
Binary file (17.9 kB). View file
 
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/config.yaml ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ pv6kdvw48bx7dygl9qkpmbu5bsrvk9dc:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - flow_matching
13
+ - --seq_len
14
+ - "768"
15
+ - --lora_rank
16
+ - "32"
17
+ - --lora_llm
18
+ - --checkpoint
19
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
20
+ - --device_train_microbatch_size
21
+ - "22"
22
+ - --global_batch_size
23
+ - "176"
24
+ - --dataset
25
+ - vla_dataset_realworld
26
+ - --llm_learning_rate
27
+ - "5e-5"
28
+ - --wandb_entity
29
+ - henryeap
30
+ - --wandb_project
31
+ - a1-realworld
32
+ - --wandb_run_name
33
+ - glue
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "51148382208"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: c13f2763af61e0d729a8b5ab4bdefc512205bcc5
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "0"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x3558c3014c813fdb"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "6"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0xfa8b85a4625b04f"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "1"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0x9b5c1c302c8129f8"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "2"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0x399226d2b2bfa544"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "4"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0xa515afd8ced1d39d"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "5"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0x137c9ede1bb1518e"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "3"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0xf61ec17df11883bd"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "7"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0x21a2e88d06c419dc"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-188
140
+ memory:
141
+ total: "2434606952448"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1758956113"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "1608"
158
+ job_name: realworld_mh
159
+ job_nodelist: auh7-1b-gpu-188
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1758696913"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "1608"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-188
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "2195813"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-188
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-09-24T06:55:50.673091Z"
184
+ writerId: pv6kdvw48bx7dygl9qkpmbu5bsrvk9dc
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 3
191
+ - 5
192
+ - 11
193
+ - 41
194
+ - 49
195
+ - 51
196
+ - 53
197
+ - 63
198
+ - 71
199
+ - 83
200
+ - 95
201
+ - 105
202
+ "2":
203
+ - 1
204
+ - 3
205
+ - 5
206
+ - 11
207
+ - 41
208
+ - 49
209
+ - 51
210
+ - 53
211
+ - 63
212
+ - 71
213
+ - 83
214
+ - 95
215
+ - 105
216
+ "3":
217
+ - 13
218
+ - 15
219
+ - 16
220
+ "4": 3.10.18
221
+ "5": 0.21.4
222
+ "6": 4.56.1
223
+ "12": 0.21.4
224
+ "13": linux-x86_64
225
+ activation_checkpointing:
226
+ value: whole_layer
227
+ allow_resume:
228
+ value: false
229
+ batch_divisor:
230
+ value: global_batch
231
+ canceled_check_interval:
232
+ value: 50
233
+ checkpoint_dir:
234
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
235
+ compile:
236
+ value: null
237
+ console_log_interval:
238
+ value: 1
239
+ data:
240
+ value:
241
+ dataset: vla_dataset_realworld
242
+ drop_last: true
243
+ for_inference: false
244
+ lerobot_episode_index_end: null
245
+ lerobot_episode_index_start: null
246
+ mixture: null
247
+ multi_modal: torch
248
+ num_workers: 0
249
+ pad: to_max
250
+ persistent_workers: false
251
+ pin_memory: true
252
+ prefetch_factor: null
253
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
254
+ rlds_dataset_name: libero_4_task_suites_no_noops
255
+ rlds_read_threads: 8
256
+ rlds_shuffle_buffer_size: 100000
257
+ rlds_traj_threads: 8
258
+ root_size_mixture: null
259
+ seed: 95818
260
+ sequence_length: 768
261
+ shuffle: true
262
+ shuffle_messages: false
263
+ split: train
264
+ timeout: 0
265
+ use_proprio: true
266
+ use_wrist_image: true
267
+ device_eval_batch_size:
268
+ value: 4
269
+ device_inf_eval_batch_size:
270
+ value: 16
271
+ device_train_batch_size:
272
+ value: 22
273
+ device_train_grad_accum:
274
+ value: 1
275
+ device_train_microbatch_size:
276
+ value: 22
277
+ dry_run:
278
+ value: false
279
+ early_exit:
280
+ value: false
281
+ epoch:
282
+ value: null
283
+ eval_interval:
284
+ value: 0
285
+ eval_on_load:
286
+ value: false
287
+ eval_subset_num_batches:
288
+ value: -1
289
+ evaluators:
290
+ value:
291
+ - data:
292
+ dataset: vla_dataset_realworld
293
+ drop_last: true
294
+ for_inference: false
295
+ lerobot_episode_index_end: 765
296
+ lerobot_episode_index_start: 353
297
+ mixture: null
298
+ multi_modal: torch
299
+ num_workers: 0
300
+ pad: to_max
301
+ persistent_workers: true
302
+ pin_memory: true
303
+ prefetch_factor: null
304
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
305
+ rlds_dataset_name: libero_4_task_suites_no_noops
306
+ rlds_read_threads: 8
307
+ rlds_shuffle_buffer_size: 256000
308
+ rlds_traj_threads: 8
309
+ root_size_mixture: null
310
+ seed: null
311
+ sequence_length: 768
312
+ shuffle: false
313
+ shuffle_messages: false
314
+ split: validation
315
+ timeout: 0
316
+ use_proprio: true
317
+ use_wrist_image: true
318
+ device_eval_batch_size: null
319
+ eval_name: null
320
+ label: val
321
+ max_examples: null
322
+ max_new_tokens: 448
323
+ mm_evaluator: null
324
+ save_dir: null
325
+ save_to_checkpoint_dir: false
326
+ skip_if_metrics_cached: true
327
+ subset_num_batches: 64
328
+ extra_steps_after_cancel:
329
+ value: 10
330
+ fast_forward_batches:
331
+ value: null
332
+ force_save_unsharded:
333
+ value: false
334
+ fsdp:
335
+ value:
336
+ hybrid_sharding_num_model_replicas: null
337
+ precision: float
338
+ sharding_strategy: FULL_SHARD
339
+ use_orig_params: true
340
+ wrapping_strategy: by_block_and_size
341
+ ft_connector:
342
+ value: false
343
+ ft_embedding:
344
+ value: lm_head
345
+ ft_llm:
346
+ value: false
347
+ ft_vit:
348
+ value: false
349
+ fused_loss:
350
+ value: null
351
+ gen1_gc_interval:
352
+ value: 1
353
+ global_train_batch_size:
354
+ value: 176
355
+ inf_eval_interval:
356
+ value: -1
357
+ inf_eval_subset_num_batches:
358
+ value: -1
359
+ inf_evaluators:
360
+ value: []
361
+ initial_model_checkpoint:
362
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
363
+ keep_lr_on_load:
364
+ value: true
365
+ load_model_config:
366
+ value: null
367
+ load_path:
368
+ value: null
369
+ load_path_sharded_checkpointer:
370
+ value: null
371
+ lora:
372
+ value: false
373
+ lora_connector:
374
+ value: false
375
+ lora_llm:
376
+ value: true
377
+ lora_rank:
378
+ value: 32
379
+ lora_vit:
380
+ value: false
381
+ max_duration:
382
+ value: 500000
383
+ max_grad_norm:
384
+ value: 1
385
+ max_grad_norm_ratio:
386
+ value: null
387
+ model:
388
+ value:
389
+ action_head: flow_matching
390
+ action_head_dit_depth: 28
391
+ action_head_dit_hidden_size: 1152
392
+ action_head_dit_num_heads: 16
393
+ action_use_left_eef: false
394
+ action_use_mobile_base: false
395
+ activation_type: swiglu
396
+ additional_vocab_size: 128
397
+ always_start_with_space: true
398
+ attention_dropout: 0
399
+ attention_layer_norm: false
400
+ attention_layer_norm_with_affine: true
401
+ attention_type: sdpa
402
+ bias_for_layer_norm: null
403
+ block_group_size: 1
404
+ block_type: sequential
405
+ clip_qkv: null
406
+ crop_mode: overlap-and-resize-c2
407
+ d_model: 3584
408
+ default_inference_len: 65
409
+ embedding_dropout: 0
410
+ embedding_size: 152064
411
+ fix_image_padding: true
412
+ float32_attention: true
413
+ head_dim: null
414
+ image_feature_dropout: 0
415
+ image_padding_embed: pad_and_partial_pad
416
+ image_pooling_2d: attention_meanq
417
+ image_pooling_h: 2
418
+ image_pooling_w: 2
419
+ image_projector: mlp
420
+ include_bias: false
421
+ init_cutoff_factor: null
422
+ init_device: null
423
+ init_fn: normal
424
+ init_std: 0.02
425
+ initializer_range: 0.02
426
+ layer_norm_eps: 1e-06
427
+ layer_norm_type: rms
428
+ layer_norm_with_affine: true
429
+ llm_causal_attention: false
430
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
431
+ low_cpu_fsdp: true
432
+ max_crops: 12
433
+ max_position_embeddings: null
434
+ max_sequence_length: 4096
435
+ message_formatting: role
436
+ mlp_hidden_size: 37888
437
+ mlp_ratio: 4
438
+ moe_capacity_factor: 1.25
439
+ moe_dropless: true
440
+ moe_interleave: false
441
+ moe_lbl_in_fp32: false
442
+ moe_log_expert_assignment: false
443
+ moe_loss_weight: 0.1
444
+ moe_mlp_impl: sparse
445
+ moe_num_experts: 8
446
+ moe_shared_expert: false
447
+ moe_top_k: 2
448
+ moe_zloss_weight: null
449
+ multi_annotation_weighting: root_subsegments
450
+ n_heads: 28
451
+ n_kv_heads: 4
452
+ n_layers: 28
453
+ new_embedding_init_range: 0.02
454
+ norm_after: false
455
+ normalize_input_embeds: false
456
+ num_diffusion_inference_steps: 30
457
+ num_diffusion_steps: 1000
458
+ overlap_margins:
459
+ - 4
460
+ - 4
461
+ pad_tokenizer: true
462
+ pad_value: 0
463
+ precision: amp_bf16
464
+ prompt_type: uber_model
465
+ qkv_bias: true
466
+ residual_dropout: 0.1
467
+ response_residual_dropout: 0
468
+ rope: true
469
+ rope_full_precision: true
470
+ rope_theta: 1e+06
471
+ scale_logits: false
472
+ system_prompt_kind: demo_or_style
473
+ tokenizer:
474
+ identifier: Qwen/Qwen2-7B
475
+ tokenizer_dir: null
476
+ use_col_tokens: true
477
+ use_position_ids: true
478
+ use_proprio: true
479
+ vision_backbone:
480
+ attention_dropout: 0
481
+ fsdp_wrap: false
482
+ image_default_input_size:
483
+ - 336
484
+ - 336
485
+ image_dropout_rate: 0
486
+ image_emb_dim: 1024
487
+ image_head_dim: 64
488
+ image_mlp_activations: quick_gelu
489
+ image_mlp_dim: 4096
490
+ image_model_type: openai
491
+ image_norm_eps: 1e-05
492
+ image_num_heads: 16
493
+ image_num_key_value_heads: 16
494
+ image_num_layers: 23
495
+ image_num_pos: 577
496
+ image_patch_size: 14
497
+ image_pos_patch_size: 14
498
+ initializer_range: 0.02
499
+ residual_dropout: 0
500
+ resize_mode: default
501
+ vit_layers:
502
+ - -2
503
+ - -9
504
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
505
+ vocab_size: 152064
506
+ weight_tying: false
507
+ multi_component_grad_norm:
508
+ value: true
509
+ no_pre_train_checkpoint:
510
+ value: true
511
+ optimizer:
512
+ value:
513
+ betas:
514
+ - 0.9
515
+ - 0.95
516
+ connector_betas:
517
+ - 0.9
518
+ - 0.95
519
+ connector_eps: 1e-06
520
+ connector_learning_rate: 0.0002
521
+ connector_weight_decay: 0
522
+ eps: 1e-05
523
+ learning_rate: 0.0001
524
+ llm_betas:
525
+ - 0.9
526
+ - 0.95
527
+ llm_eps: 1e-06
528
+ llm_learning_rate: 5e-05
529
+ llm_weight_decay: 0
530
+ metrics_log_interval: 20
531
+ name: adamw
532
+ vit_betas:
533
+ - 0.9
534
+ - 0.95
535
+ vit_eps: 1e-06
536
+ vit_learning_rate: 6e-06
537
+ vit_weight_decay: 0
538
+ weight_decay: 0.01
539
+ precision:
540
+ value: amp_bf16
541
+ python_profiling:
542
+ value: false
543
+ remote_save_folder:
544
+ value: null
545
+ reset_dataloader_state:
546
+ value: false
547
+ reset_optimizer_state:
548
+ value: false
549
+ reset_trainer_state:
550
+ value: false
551
+ restore_dataloader:
552
+ value: true
553
+ run_name:
554
+ value: glue_20250924_065523
555
+ save_dataloader_state:
556
+ value: false
557
+ save_folder:
558
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
559
+ save_interval:
560
+ value: 500
561
+ save_interval_action_head:
562
+ value: 500
563
+ save_interval_ephemeral:
564
+ value: null
565
+ save_interval_unsharded:
566
+ value: 500
567
+ save_num_action_head_checkpoints_to_keep:
568
+ value: 2
569
+ save_num_checkpoints_to_keep:
570
+ value: 1
571
+ save_num_unsharded_checkpoints_to_keep:
572
+ value: 1
573
+ save_overwrite:
574
+ value: true
575
+ scheduler:
576
+ value:
577
+ alpha_f: 0.1
578
+ connector_t_warmup: 200
579
+ grad_clip_warmup_factor: null
580
+ grad_clip_warmup_steps: null
581
+ llm_t_warmup: 2000
582
+ name: multimodal
583
+ t_max: null
584
+ t_warmup: 100
585
+ units: steps
586
+ vit_t_warmup: 2000
587
+ warmup_min_lr: 0
588
+ seed:
589
+ value: 6198
590
+ sharded_checkpointer:
591
+ value: torch_legacy
592
+ softmax_auxiliary_loss:
593
+ value: true
594
+ softmax_auxiliary_loss_scale:
595
+ value: 0.0001
596
+ speed_monitor:
597
+ value:
598
+ gpu_flops_available: null
599
+ window_size: 20
600
+ stop_after:
601
+ value: null
602
+ stop_at:
603
+ value: 500000
604
+ time_limit:
605
+ value: null
606
+ torch_profiling:
607
+ value: false
608
+ train_exit_random_layer:
609
+ value: false
610
+ use_lora:
611
+ value: true
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/output.log ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 09/24 [06:55:52] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': None, 'path': None, 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best', 0.6, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 0.4, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 0.1, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ ****** Skip RLDS main; path not found: None
10
+ ****** start build LeRobot main...
11
+ build_tokenizer, cache_dir None tokenizer_dir None
12
+ 09/24 [06:55:54] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:128
13
+ INFO | >> Loading train dataset: vla_dataset_realworld/train __init__.py:434
14
+ ****** before LeRobot dataset...
15
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best
16
+ Traceback (most recent call last):
17
+ File "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py", line 397, in <module>
18
+ train(cfg)
19
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 160, in main
20
+ train_loader = build_train_dataloader(cfg, device)
21
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 196, in build_train_dataloader
22
+ return build_vla_train_dataloader(train_config, device)
23
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 327, in build_vla_train_dataloader
24
+ ds = build_lerobot_train_dataset(train_config, normalization_type,device)
25
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/__init__.py", line 439, in build_lerobot_train_dataset
26
+ dataset = LeRobotDatasetWrapper(
27
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/vla/lerobot_datasets.py", line 84, in __init__
28
+ dataset_demo = LeRobotDataset(repo_id=os.path.basename(dataset_path),root=dataset_path)
29
+ File "/vast/users/xiaodan/zhangjian/lerobot/src/lerobot/datasets/lerobot_dataset.py", line 610, in __init__
30
+ self.meta = LeRobotDatasetMetadata(
31
+ File "/vast/users/xiaodan/zhangjian/lerobot/src/lerobot/datasets/lerobot_dataset.py", line 101, in __init__
32
+ self.load_metadata()
33
+ File "/vast/users/xiaodan/zhangjian/lerobot/src/lerobot/datasets/lerobot_dataset.py", line 112, in load_metadata
34
+ check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
35
+ File "/vast/users/xiaodan/zhangjian/lerobot/src/lerobot/datasets/utils.py", line 487, in check_version_compatibility
36
+ raise BackwardCompatibilityError(repo_id, v_check)
37
+ lerobot.datasets.backward_compatibility.BackwardCompatibilityError:
38
+ The dataset you requested (Lerobot_Glue_best) is in 2.1 format.
39
+
40
+ We introduced a new format since v3.0 which is not backward compatible with v2.1.
41
+ Please, update your dataset to the new format using this command:
42
+ ```
43
+ python -m lerobot.datasets.v30.convert_dataset_v21_to_v30 --repo-id=Lerobot_Glue_best
44
+ ```
45
+
46
+ If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
47
+ or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/requirements.txt ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ tensorflow-rocm==2.16.2
15
+ termcolor==3.1.0
16
+ Werkzeug==3.1.3
17
+ Brotli==1.1.0
18
+ Farama-Notifications==0.0.4
19
+ MarkupSafe==2.1.5
20
+ PyYAML==6.0.2
21
+ absl-py==2.3.1
22
+ accelerate==1.10.1
23
+ ai2-molmo==0.0.0
24
+ aiofiles==24.1.0
25
+ aiohappyeyeballs==2.6.1
26
+ aiohttp==3.12.15
27
+ aiosignal==1.4.0
28
+ annotated-types==0.7.0
29
+ antlr4-python3-runtime==4.9.3
30
+ anyio==4.10.0
31
+ array_record==0.8.1
32
+ async-timeout==5.0.1
33
+ attrs==25.3.0
34
+ av==15.1.0
35
+ backports.tarfile==1.2.0
36
+ beaker-gantry==3.2.0
37
+ beaker-py==2.5.0
38
+ black==23.12.1
39
+ boltons==25.0.0
40
+ boto3==1.40.33
41
+ botocore==1.40.33
42
+ build==1.3.0
43
+ cached_path==1.7.3
44
+ cached-property==2.0.1
45
+ cachetools==5.5.2
46
+ certifi==2025.8.3
47
+ cffi==2.0.0
48
+ charset-normalizer==3.4.3
49
+ click==8.2.1
50
+ click-help-colors==0.9.4
51
+ click-option-group==0.5.7
52
+ cloudpickle==3.1.1
53
+ cmake==4.1.0
54
+ contourpy==1.3.2
55
+ cryptography==46.0.1
56
+ cycler==0.12.1
57
+ dataclass-extensions==0.2.3
58
+ datasets==3.6.0
59
+ decorator==5.2.1
60
+ deepdiff==8.6.1
61
+ diffusers==0.35.1
62
+ dill==0.3.8
63
+ distro==1.9.0
64
+ dlimp==0.0.1
65
+ dm-tree==0.1.9
66
+ docutils==0.22.1
67
+ draccus==0.10.0
68
+ editdistance==0.8.1
69
+ einops==0.8.1
70
+ einops-exts==0.0.4
71
+ etils==1.13.0
72
+ evdev==1.9.2
73
+ exceptiongroup==1.3.0
74
+ face==24.0.0
75
+ fastapi==0.116.2
76
+ ffmpy==0.6.1
77
+ fiddle==0.3.0
78
+ filelock==3.13.1
79
+ fonttools==4.60.0
80
+ frozenlist==1.7.0
81
+ fsspec==2023.9.2
82
+ ftfy==6.3.1
83
+ gcsfs==2023.9.2
84
+ gitdb==4.0.12
85
+ GitPython==3.1.45
86
+ glom==24.11.0
87
+ google-api-core==2.25.1
88
+ google-auth==2.40.3
89
+ google-auth-oauthlib==1.2.2
90
+ google-cloud-core==2.4.3
91
+ google-cloud-storage==2.19.0
92
+ google-crc32c==1.7.1
93
+ google-resumable-media==2.7.2
94
+ googleapis-common-protos==1.70.0
95
+ gradio==5.46.0
96
+ gradio_client==1.13.0
97
+ graphviz==0.21
98
+ groovy==0.1.2
99
+ grpcio==1.75.0
100
+ gymnasium==0.29.1
101
+ h11==0.16.0
102
+ hf_transfer==0.1.9
103
+ hf-xet==1.1.10
104
+ httpcore==1.0.9
105
+ httpx==0.28.1
106
+ huggingface-hub==0.35.0
107
+ id==1.5.0
108
+ idna==3.10
109
+ imageio==2.37.0
110
+ imageio-ffmpeg==0.6.0
111
+ importlib_metadata==8.7.0
112
+ importlib_resources==6.5.2
113
+ iniconfig==2.1.0
114
+ inquirerpy==0.3.4
115
+ isort==5.12.0
116
+ jaraco.classes==3.4.0
117
+ jaraco.context==6.0.1
118
+ jaraco.functools==4.3.0
119
+ jeepney==0.9.0
120
+ Jinja2==3.1.4
121
+ jiter==0.11.0
122
+ jmespath==1.0.1
123
+ joblib==1.5.2
124
+ jsonlines==4.0.0
125
+ keras==2.15.0
126
+ keyring==25.6.0
127
+ kiwisolver==1.4.9
128
+ latex2sympy2_extended==1.10.2
129
+ lerobot==0.3.4
130
+ Levenshtein==0.27.1
131
+ libcst==1.8.4
132
+ lightning-utilities==0.15.2
133
+ markdown-it-py==4.0.0
134
+ math-verify==0.8.0
135
+ matplotlib==3.10.6
136
+ mdurl==0.1.2
137
+ mergedeep==1.3.4
138
+ ml-dtypes==0.2.0
139
+ ml_dtypes==0.5.3
140
+ more-itertools==10.8.0
141
+ mpmath==1.3.0
142
+ msgspec==0.19.0
143
+ multidict==6.6.4
144
+ multiprocess==0.70.16
145
+ mypy==1.3.0
146
+ mypy_extensions==1.1.0
147
+ necessary==0.4.3
148
+ networkx==3.3
149
+ nh3==0.3.0
150
+ nltk==3.9.1
151
+ numpy==1.26.4
152
+ numpy==2.2.6
153
+ oauthlib==3.3.1
154
+ omegaconf==2.3.0
155
+ openai==1.108.0
156
+ opencv-python-headless==4.12.0.88
157
+ OpenEXR==3.4.0
158
+ orderly-set==5.5.0
159
+ orjson==3.11.3
160
+ packaging==25.0
161
+ pandas==2.3.2
162
+ pathspec==0.12.1
163
+ petname==2.6
164
+ pfzy==0.3.4
165
+ pillow==11.0.0
166
+ pip==25.2
167
+ platformdirs==4.4.0
168
+ pluggy==1.6.0
169
+ promise==2.3
170
+ prompt_toolkit==3.0.52
171
+ propcache==0.3.2
172
+ proto-plus==1.26.1
173
+ protobuf==4.21.12
174
+ protobuf==6.32.1
175
+ psutil==7.1.0
176
+ pyarrow==21.0.0
177
+ pyasn1==0.6.1
178
+ pyasn1_modules==0.4.2
179
+ pycparser==2.23
180
+ pydantic==2.11.9
181
+ pydantic_core==2.33.2
182
+ pydub==0.25.1
183
+ Pygments==2.19.2
184
+ pynput==1.8.1
185
+ pyparsing==3.2.4
186
+ pyproject_hooks==1.2.0
187
+ pyserial==3.5
188
+ pytest==8.4.2
189
+ pytest-sphinx==0.6.3
190
+ python-dateutil==2.9.0.post0
191
+ python-Levenshtein==0.27.1
192
+ python-multipart==0.0.20
193
+ python-xlib==0.33
194
+ pytorch-triton-rocm==3.4.0
195
+ pytz==2025.2
196
+ pyyaml-include==1.4.1
197
+ RapidFuzz==3.14.1
198
+ readme_renderer==44.0
199
+ regex==2025.9.1
200
+ requests==2.32.5
201
+ requests-oauthlib==2.0.0
202
+ requests-toolbelt==1.0.0
203
+ requirements-parser==0.13.0
204
+ rerun-sdk==0.22.1
205
+ rfc3986==2.0.0
206
+ rich==13.9.4
207
+ rsa==4.9.1
208
+ ruff==0.13.0
209
+ s3transfer==0.14.0
210
+ safehttpx==0.1.6
211
+ safetensors==0.6.2
212
+ scikit-learn==1.7.2
213
+ scipy==1.15.3
214
+ SecretStorage==3.4.0
215
+ semantic-version==2.10.0
216
+ sentencepiece==0.2.1
217
+ sentry-sdk==2.38.0
218
+ setuptools==78.1.1
219
+ shellingham==1.5.4
220
+ six==1.17.0
221
+ smart_open==7.3.1
222
+ smashed==0.21.5
223
+ smmap==5.0.2
224
+ sniffio==1.3.1
225
+ starlette==0.48.0
226
+ sympy==1.13.3
227
+ tensorboard==2.15.2
228
+ tensorboard==2.19.0
229
+ tensorflow==2.15.0
230
+ tensorflow-addons==0.23.0
231
+ tensorflow-datasets==4.9.3
232
+ tensorflow-estimator==2.15.0
233
+ tensorflow-graphics==2021.12.3
234
+ tensorflow-metadata==1.17.2
235
+ threadpoolctl==3.6.0
236
+ timm==1.0.19
237
+ tokenizers==0.22.0
238
+ toml==0.10.2
239
+ tomli==2.2.1
240
+ tomlkit==0.13.3
241
+ torch==2.8.0+rocm6.4
242
+ torchcodec==0.5
243
+ torchmetrics==1.8.2
244
+ torchvision==0.23.0+rocm6.4
245
+ tqdm==4.67.1
246
+ transformers==4.56.1
247
+ trimesh==4.8.2
248
+ trouting==0.3.3
249
+ twine==6.2.0
250
+ typeguard==2.13.3
251
+ typer==0.17.4
252
+ typing_extensions==4.15.0
253
+ typing-inspect==0.9.0
254
+ typing-inspection==0.4.1
255
+ tzdata==2025.2
256
+ urllib3==2.5.0
257
+ uvicorn==0.35.0
258
+ wandb==0.21.4
259
+ wcwidth==0.2.13
260
+ websockets==15.0.1
261
+ wheel==0.45.1
262
+ wrapt==1.14.2
263
+ xxhash==3.5.0
264
+ yarl==1.20.1
265
+ zipp==3.23.0
266
+ lerobot==0.3.4
267
+ minLoRA==0.1.0
268
+ autocommand==2.2.2
269
+ backports.tarfile==1.2.0
270
+ importlib_metadata==8.0.0
271
+ inflect==7.3.1
272
+ jaraco.collections==5.1.0
273
+ jaraco.context==5.3.0
274
+ jaraco.functools==4.0.1
275
+ jaraco.text==3.12.1
276
+ more-itertools==10.3.0
277
+ packaging==24.2
278
+ platformdirs==4.2.2
279
+ tomli==2.0.1
280
+ typeguard==4.3.0
281
+ typing_extensions==4.12.2
282
+ wheel==0.45.1
283
+ zipp==3.19.2
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-09-24T06:55:50.673091Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "flow_matching",
12
+ "--seq_len",
13
+ "768",
14
+ "--lora_rank",
15
+ "32",
16
+ "--lora_llm",
17
+ "--checkpoint",
18
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
19
+ "--device_train_microbatch_size",
20
+ "22",
21
+ "--global_batch_size",
22
+ "176",
23
+ "--dataset",
24
+ "vla_dataset_realworld",
25
+ "--llm_learning_rate",
26
+ "5e-5",
27
+ "--wandb_entity",
28
+ "henryeap",
29
+ "--wandb_project",
30
+ "a1-realworld",
31
+ "--wandb_run_name",
32
+ "glue",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "c13f2763af61e0d729a8b5ab4bdefc512205bcc5"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb",
44
+ "host": "auh7-1b-gpu-188",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "470343073792",
53
+ "used": "51148382208"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606952448"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "0",
62
+ "uniqueId": "0x3558c3014c813fdb",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "6",
75
+ "uniqueId": "0xfa8b85a4625b04f",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "1",
88
+ "uniqueId": "0x9b5c1c302c8129f8",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "2",
101
+ "uniqueId": "0x399226d2b2bfa544",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "4",
114
+ "uniqueId": "0xa515afd8ced1d39d",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "5",
127
+ "uniqueId": "0x137c9ede1bb1518e",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "3",
140
+ "uniqueId": "0xf61ec17df11883bd",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "7",
153
+ "uniqueId": "0x21a2e88d06c419dc",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1758956113",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "1608",
177
+ "job_name": "realworld_mh",
178
+ "job_nodelist": "auh7-1b-gpu-188",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1758696913",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "1608",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-188",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "2195813",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-188",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "pv6kdvw48bx7dygl9qkpmbu5bsrvk9dc"
204
+ }
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":2},"_runtime":2}
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T06:55:50.723563798Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpjs8cyprs/port-2195891.txt","pid":2195891,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-09-24T06:55:50.72450232Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2195891}
3
+ {"time":"2025-09-24T06:55:50.724421019Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2195891-2196055-1725862429/socket","Net":"unix"}}
4
+ {"time":"2025-09-24T06:55:50.908962781Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-09-24T06:55:50.915479784Z","level":"INFO","msg":"handleInformInit: received","streamId":"lqn400wc","id":"1(@)"}
6
+ {"time":"2025-09-24T06:55:52.044031974Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"lqn400wc","id":"1(@)"}
7
+ {"time":"2025-09-24T06:55:54.891857034Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-09-24T06:55:54.89233836Z","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2025-09-24T06:55:54.89233283Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2025-09-24T06:55:54.89236797Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2025-09-24T06:55:54.892429561Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-2195891-2196055-1725862429/socket","Net":"unix"}}
12
+ {"time":"2025-09-24T06:55:56.94476761Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-09-24T06:55:56.944779601Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-09-24T06:55:56.944793901Z","level":"INFO","msg":"server is closed"}
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T06:55:50.917562781Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-09-24T06:55:52.043989044Z","level":"INFO","msg":"stream: created new stream","id":"lqn400wc"}
3
+ {"time":"2025-09-24T06:55:52.044025974Z","level":"INFO","msg":"stream: started","id":"lqn400wc"}
4
+ {"time":"2025-09-24T06:55:52.044043335Z","level":"INFO","msg":"handler: started","stream_id":"lqn400wc"}
5
+ {"time":"2025-09-24T06:55:52.044047115Z","level":"INFO","msg":"writer: started","stream_id":"lqn400wc"}
6
+ {"time":"2025-09-24T06:55:52.044082945Z","level":"INFO","msg":"sender: started","stream_id":"lqn400wc"}
7
+ {"time":"2025-09-24T06:55:54.8923419Z","level":"INFO","msg":"stream: closing","id":"lqn400wc"}
8
+ {"time":"2025-09-24T06:55:56.556098297Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-09-24T06:55:56.941936164Z","level":"INFO","msg":"handler: closed","stream_id":"lqn400wc"}
10
+ {"time":"2025-09-24T06:55:56.943528235Z","level":"INFO","msg":"sender: closed","stream_id":"lqn400wc"}
11
+ {"time":"2025-09-24T06:55:56.943536815Z","level":"INFO","msg":"stream: closed","id":"lqn400wc"}
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/logs/debug.log ADDED
@@ -0,0 +1 @@
 
 
1
+ 2025-09-24 06:55:54,891 INFO wandb-AsyncioManager-main:2195891 [service_client.py:_forward_responses():84] Reached EOF.
all_flow_matching/glue_best/wandb/wandb/run-20250924_065550-lqn400wc/run-lqn400wc.wandb ADDED
Binary file (19.8 kB). View file
 
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/config.yaml ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ 1pt1kzn3156onku1dbmbvrh2eyknhblo:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - flow_matching
13
+ - --seq_len
14
+ - "768"
15
+ - --lora_rank
16
+ - "32"
17
+ - --lora_llm
18
+ - --checkpoint
19
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
20
+ - --device_train_microbatch_size
21
+ - "22"
22
+ - --global_batch_size
23
+ - "176"
24
+ - --dataset
25
+ - vla_dataset_realworld
26
+ - --llm_learning_rate
27
+ - "5e-5"
28
+ - --wandb_entity
29
+ - henryeap
30
+ - --wandb_project
31
+ - a1-realworld
32
+ - --wandb_run_name
33
+ - glue
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "50552754176"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: c13f2763af61e0d729a8b5ab4bdefc512205bcc5
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "2"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0xab01f34fc0edbb6e"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "4"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0x37e5d0f3d8682cca"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "1"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0xf07610cbfae55ec0"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "3"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0x9a76422d710d96e6"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "7"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0xe1b4f7497ad1d2db"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "5"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0x2e0c5f8d27fbe8f1"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "0"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0xca50e2816c5058ba"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "6"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0xfaa84ccf6c76f5e3"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-295
140
+ memory:
141
+ total: "2434606931968"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1758959301"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "1610"
158
+ job_name: realworld_mh
159
+ job_nodelist: auh7-1b-gpu-295
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1758700101"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "1610"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-295
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "3944397"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-295
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-09-24T07:49:27.634943Z"
184
+ writerId: 1pt1kzn3156onku1dbmbvrh2eyknhblo
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 3
191
+ - 5
192
+ - 11
193
+ - 41
194
+ - 49
195
+ - 51
196
+ - 53
197
+ - 63
198
+ - 71
199
+ - 83
200
+ - 95
201
+ - 105
202
+ "2":
203
+ - 1
204
+ - 3
205
+ - 5
206
+ - 11
207
+ - 41
208
+ - 49
209
+ - 51
210
+ - 53
211
+ - 63
212
+ - 71
213
+ - 83
214
+ - 95
215
+ - 105
216
+ "3":
217
+ - 2
218
+ - 13
219
+ - 15
220
+ - 16
221
+ - 61
222
+ "4": 3.10.18
223
+ "5": 0.21.4
224
+ "6": 4.56.1
225
+ "10":
226
+ - 19
227
+ "12": 0.21.4
228
+ "13": linux-x86_64
229
+ activation_checkpointing:
230
+ value: whole_layer
231
+ allow_resume:
232
+ value: false
233
+ batch_divisor:
234
+ value: global_batch
235
+ canceled_check_interval:
236
+ value: 50
237
+ checkpoint_dir:
238
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
239
+ compile:
240
+ value: null
241
+ console_log_interval:
242
+ value: 1
243
+ data:
244
+ value:
245
+ dataset: vla_dataset_realworld
246
+ drop_last: true
247
+ for_inference: false
248
+ lerobot_episode_index_end: null
249
+ lerobot_episode_index_start: null
250
+ mixture: null
251
+ multi_modal: torch
252
+ num_workers: 0
253
+ pad: to_max
254
+ persistent_workers: false
255
+ pin_memory: true
256
+ prefetch_factor: null
257
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
258
+ rlds_dataset_name: libero_4_task_suites_no_noops
259
+ rlds_read_threads: 8
260
+ rlds_shuffle_buffer_size: 100000
261
+ rlds_traj_threads: 8
262
+ root_size_mixture: null
263
+ seed: 95818
264
+ sequence_length: 768
265
+ shuffle: true
266
+ shuffle_messages: false
267
+ split: train
268
+ timeout: 0
269
+ use_proprio: true
270
+ use_wrist_image: true
271
+ device_eval_batch_size:
272
+ value: 4
273
+ device_inf_eval_batch_size:
274
+ value: 16
275
+ device_train_batch_size:
276
+ value: 22
277
+ device_train_grad_accum:
278
+ value: 1
279
+ device_train_microbatch_size:
280
+ value: 22
281
+ dry_run:
282
+ value: false
283
+ early_exit:
284
+ value: false
285
+ epoch:
286
+ value: null
287
+ eval_interval:
288
+ value: 0
289
+ eval_on_load:
290
+ value: false
291
+ eval_subset_num_batches:
292
+ value: -1
293
+ evaluators:
294
+ value:
295
+ - data:
296
+ dataset: vla_dataset_realworld
297
+ drop_last: true
298
+ for_inference: false
299
+ lerobot_episode_index_end: 765
300
+ lerobot_episode_index_start: 353
301
+ mixture: null
302
+ multi_modal: torch
303
+ num_workers: 0
304
+ pad: to_max
305
+ persistent_workers: true
306
+ pin_memory: true
307
+ prefetch_factor: null
308
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
309
+ rlds_dataset_name: libero_4_task_suites_no_noops
310
+ rlds_read_threads: 8
311
+ rlds_shuffle_buffer_size: 256000
312
+ rlds_traj_threads: 8
313
+ root_size_mixture: null
314
+ seed: null
315
+ sequence_length: 768
316
+ shuffle: false
317
+ shuffle_messages: false
318
+ split: validation
319
+ timeout: 0
320
+ use_proprio: true
321
+ use_wrist_image: true
322
+ device_eval_batch_size: null
323
+ eval_name: null
324
+ label: val
325
+ max_examples: null
326
+ max_new_tokens: 448
327
+ mm_evaluator: null
328
+ save_dir: null
329
+ save_to_checkpoint_dir: false
330
+ skip_if_metrics_cached: true
331
+ subset_num_batches: 64
332
+ extra_steps_after_cancel:
333
+ value: 10
334
+ fast_forward_batches:
335
+ value: null
336
+ force_save_unsharded:
337
+ value: false
338
+ fsdp:
339
+ value:
340
+ hybrid_sharding_num_model_replicas: null
341
+ precision: float
342
+ sharding_strategy: FULL_SHARD
343
+ use_orig_params: true
344
+ wrapping_strategy: by_block_and_size
345
+ ft_connector:
346
+ value: false
347
+ ft_embedding:
348
+ value: lm_head
349
+ ft_llm:
350
+ value: false
351
+ ft_vit:
352
+ value: false
353
+ fused_loss:
354
+ value: null
355
+ gen1_gc_interval:
356
+ value: 1
357
+ global_train_batch_size:
358
+ value: 176
359
+ inf_eval_interval:
360
+ value: -1
361
+ inf_eval_subset_num_batches:
362
+ value: -1
363
+ inf_evaluators:
364
+ value: []
365
+ initial_model_checkpoint:
366
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
367
+ keep_lr_on_load:
368
+ value: true
369
+ load_model_config:
370
+ value: null
371
+ load_path:
372
+ value: null
373
+ load_path_sharded_checkpointer:
374
+ value: null
375
+ lora:
376
+ value: false
377
+ lora_connector:
378
+ value: false
379
+ lora_llm:
380
+ value: true
381
+ lora_rank:
382
+ value: 32
383
+ lora_vit:
384
+ value: false
385
+ max_duration:
386
+ value: 500000
387
+ max_grad_norm:
388
+ value: 1
389
+ max_grad_norm_ratio:
390
+ value: null
391
+ model:
392
+ value:
393
+ action_head: flow_matching
394
+ action_head_dit_depth: 28
395
+ action_head_dit_hidden_size: 1152
396
+ action_head_dit_num_heads: 16
397
+ action_use_left_eef: false
398
+ action_use_mobile_base: false
399
+ activation_type: swiglu
400
+ additional_vocab_size: 128
401
+ always_start_with_space: true
402
+ attention_dropout: 0
403
+ attention_layer_norm: false
404
+ attention_layer_norm_with_affine: true
405
+ attention_type: sdpa
406
+ bias_for_layer_norm: null
407
+ block_group_size: 1
408
+ block_type: sequential
409
+ clip_qkv: null
410
+ crop_mode: overlap-and-resize-c2
411
+ d_model: 3584
412
+ default_inference_len: 65
413
+ embedding_dropout: 0
414
+ embedding_size: 152064
415
+ fix_image_padding: true
416
+ float32_attention: true
417
+ head_dim: null
418
+ image_feature_dropout: 0
419
+ image_padding_embed: pad_and_partial_pad
420
+ image_pooling_2d: attention_meanq
421
+ image_pooling_h: 2
422
+ image_pooling_w: 2
423
+ image_projector: mlp
424
+ include_bias: false
425
+ init_cutoff_factor: null
426
+ init_device: null
427
+ init_fn: normal
428
+ init_std: 0.02
429
+ initializer_range: 0.02
430
+ layer_norm_eps: 1e-06
431
+ layer_norm_type: rms
432
+ layer_norm_with_affine: true
433
+ llm_causal_attention: false
434
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
435
+ low_cpu_fsdp: true
436
+ max_crops: 12
437
+ max_position_embeddings: null
438
+ max_sequence_length: 4096
439
+ message_formatting: role
440
+ mlp_hidden_size: 37888
441
+ mlp_ratio: 4
442
+ moe_capacity_factor: 1.25
443
+ moe_dropless: true
444
+ moe_interleave: false
445
+ moe_lbl_in_fp32: false
446
+ moe_log_expert_assignment: false
447
+ moe_loss_weight: 0.1
448
+ moe_mlp_impl: sparse
449
+ moe_num_experts: 8
450
+ moe_shared_expert: false
451
+ moe_top_k: 2
452
+ moe_zloss_weight: null
453
+ multi_annotation_weighting: root_subsegments
454
+ n_heads: 28
455
+ n_kv_heads: 4
456
+ n_layers: 28
457
+ new_embedding_init_range: 0.02
458
+ norm_after: false
459
+ normalize_input_embeds: false
460
+ num_diffusion_inference_steps: 30
461
+ num_diffusion_steps: 1000
462
+ overlap_margins:
463
+ - 4
464
+ - 4
465
+ pad_tokenizer: true
466
+ pad_value: 0
467
+ precision: amp_bf16
468
+ prompt_type: uber_model
469
+ qkv_bias: true
470
+ residual_dropout: 0.1
471
+ response_residual_dropout: 0
472
+ rope: true
473
+ rope_full_precision: true
474
+ rope_theta: 1e+06
475
+ scale_logits: false
476
+ system_prompt_kind: demo_or_style
477
+ tokenizer:
478
+ identifier: Qwen/Qwen2-7B
479
+ tokenizer_dir: null
480
+ use_col_tokens: true
481
+ use_position_ids: true
482
+ use_proprio: true
483
+ vision_backbone:
484
+ attention_dropout: 0
485
+ fsdp_wrap: false
486
+ image_default_input_size:
487
+ - 336
488
+ - 336
489
+ image_dropout_rate: 0
490
+ image_emb_dim: 1024
491
+ image_head_dim: 64
492
+ image_mlp_activations: quick_gelu
493
+ image_mlp_dim: 4096
494
+ image_model_type: openai
495
+ image_norm_eps: 1e-05
496
+ image_num_heads: 16
497
+ image_num_key_value_heads: 16
498
+ image_num_layers: 23
499
+ image_num_pos: 577
500
+ image_patch_size: 14
501
+ image_pos_patch_size: 14
502
+ initializer_range: 0.02
503
+ residual_dropout: 0
504
+ resize_mode: default
505
+ vit_layers:
506
+ - -2
507
+ - -9
508
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
509
+ vocab_size: 152064
510
+ weight_tying: false
511
+ multi_component_grad_norm:
512
+ value: true
513
+ no_pre_train_checkpoint:
514
+ value: true
515
+ optimizer:
516
+ value:
517
+ betas:
518
+ - 0.9
519
+ - 0.95
520
+ connector_betas:
521
+ - 0.9
522
+ - 0.95
523
+ connector_eps: 1e-06
524
+ connector_learning_rate: 0.0002
525
+ connector_weight_decay: 0
526
+ eps: 1e-05
527
+ learning_rate: 0.0001
528
+ llm_betas:
529
+ - 0.9
530
+ - 0.95
531
+ llm_eps: 1e-06
532
+ llm_learning_rate: 5e-05
533
+ llm_weight_decay: 0
534
+ metrics_log_interval: 20
535
+ name: adamw
536
+ vit_betas:
537
+ - 0.9
538
+ - 0.95
539
+ vit_eps: 1e-06
540
+ vit_learning_rate: 6e-06
541
+ vit_weight_decay: 0
542
+ weight_decay: 0.01
543
+ precision:
544
+ value: amp_bf16
545
+ python_profiling:
546
+ value: false
547
+ remote_save_folder:
548
+ value: null
549
+ reset_dataloader_state:
550
+ value: false
551
+ reset_optimizer_state:
552
+ value: false
553
+ reset_trainer_state:
554
+ value: false
555
+ restore_dataloader:
556
+ value: true
557
+ run_name:
558
+ value: glue_20250924_074844
559
+ save_dataloader_state:
560
+ value: false
561
+ save_folder:
562
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
563
+ save_interval:
564
+ value: 500
565
+ save_interval_action_head:
566
+ value: 500
567
+ save_interval_ephemeral:
568
+ value: null
569
+ save_interval_unsharded:
570
+ value: 500
571
+ save_num_action_head_checkpoints_to_keep:
572
+ value: 2
573
+ save_num_checkpoints_to_keep:
574
+ value: 1
575
+ save_num_unsharded_checkpoints_to_keep:
576
+ value: 1
577
+ save_overwrite:
578
+ value: true
579
+ scheduler:
580
+ value:
581
+ alpha_f: 0.1
582
+ connector_t_warmup: 200
583
+ grad_clip_warmup_factor: null
584
+ grad_clip_warmup_steps: null
585
+ llm_t_warmup: 2000
586
+ name: multimodal
587
+ t_max: null
588
+ t_warmup: 100
589
+ units: steps
590
+ vit_t_warmup: 2000
591
+ warmup_min_lr: 0
592
+ seed:
593
+ value: 6198
594
+ sharded_checkpointer:
595
+ value: torch_legacy
596
+ softmax_auxiliary_loss:
597
+ value: true
598
+ softmax_auxiliary_loss_scale:
599
+ value: 0.0001
600
+ speed_monitor:
601
+ value:
602
+ gpu_flops_available: null
603
+ window_size: 20
604
+ stop_after:
605
+ value: null
606
+ stop_at:
607
+ value: 500000
608
+ time_limit:
609
+ value: null
610
+ torch_profiling:
611
+ value: false
612
+ train_exit_random_layer:
613
+ value: false
614
+ use_lora:
615
+ value: true
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/output.log ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 09/24 [07:49:29] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': None, 'path': None, 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best', 0.6, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 0.4, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 0.1, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ ****** Skip RLDS main; path not found: None
10
+ ****** start build LeRobot main...
11
+ build_tokenizer, cache_dir None tokenizer_dir None
12
+ 09/24 [07:49:31] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:128
13
+ 09/24 [07:49:32] INFO | >> Loading train dataset: vla_dataset_realworld/train __init__.py:434
14
+ ****** before LeRobot dataset...
15
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/a1/Lerobot_Glue_best
16
+ ****** length of the dataset: 17698
17
+ ****** Skip RLDS open-source-real-world; mixture 'a1_real_world' not found under: /vast/users/xiaodan/zhangjian/datasets/OXE
18
+ ****** Expect one of: []
19
+ ****** path: None
20
+ ****** Skip AgiBotWorld-Alpha open-source-real-world; path not found: None
21
+ ****** After build vla train dataset...
22
+ ****** iterable_sources: [<olmo.data.dataset.IterableDatasetWrapper object at 0x7f68482ee830>]
23
+ ****** Before build mixed iterable dataset...
24
+ ****** Build vla train dataloader successfully!
25
+ ************************* Build train_dataloader successful!
26
+ ************************* Before build_inf_evaluators
27
+ 09/24 [07:49:49] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
28
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
29
+ warnings.warn( # warn only once
30
+
31
+ ************************* Build evaluators successful!
32
+ ************************* Early exit flags: early_exit=False
33
+ ************************* Initialize model successful!
34
+ ***** state_dict_path: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924/model.pt
35
+ ***** Load checkpoint successful!
36
+ missing keys: ['action_head.state_proj.weight', 'action_head.state_proj.bias', 'action_head.action_in_proj.weight', 'action_head.action_in_proj.bias', 'action_head.action_time_in.weight', 'action_head.action_time_in.bias', 'action_head.action_time_out.weight', 'action_head.action_time_out.bias', 'action_head.memory_proj.weight', 'action_head.memory_proj.bias', 'action_head.gemma.model.layers.0.self_attn.q_proj.weight', 'action_head.gemma.model.layers.0.self_attn.k_proj.weight', 'action_head.gemma.model.layers.0.self_attn.v_proj.weight', 'action_head.gemma.model.layers.0.self_attn.o_proj.weight', 'action_head.gemma.model.layers.0.mlp.gate_proj.weight', 'action_head.gemma.model.layers.0.mlp.up_proj.weight', 'action_head.gemma.model.layers.0.mlp.down_proj.weight', 'action_head.gemma.model.layers.0.input_layernorm.weight', 'action_head.gemma.model.layers.0.post_attention_layernorm.weight', 'action_head.gemma.model.layers.1.self_attn.q_proj.weight', 'action_head.gemma.model.layers.1.self_attn.k_proj.weight', 'action_head.gemma.model.layers.1.self_attn.v_proj.weight', 'action_head.gemma.model.layers.1.self_attn.o_proj.weight', 'action_head.gemma.model.layers.1.mlp.gate_proj.weight', 'action_head.gemma.model.layers.1.mlp.up_proj.weight', 'action_head.gemma.model.layers.1.mlp.down_proj.weight', 'action_head.gemma.model.layers.1.input_layernorm.weight', 'action_head.gemma.model.layers.1.post_attention_layernorm.weight', 'action_head.gemma.model.layers.2.self_attn.q_proj.weight', 'action_head.gemma.model.layers.2.self_attn.k_proj.weight', 'action_head.gemma.model.layers.2.self_attn.v_proj.weight', 'action_head.gemma.model.layers.2.self_attn.o_proj.weight', 'action_head.gemma.model.layers.2.mlp.gate_proj.weight', 'action_head.gemma.model.layers.2.mlp.up_proj.weight', 'action_head.gemma.model.layers.2.mlp.down_proj.weight', 'action_head.gemma.model.layers.2.input_layernorm.weight', 'action_head.gemma.model.layers.2.post_attention_layernorm.weight', 
'action_head.gemma.model.layers.3.self_attn.q_proj.weight', 'action_head.gemma.model.layers.3.self_attn.k_proj.weight', 'action_head.gemma.model.layers.3.self_attn.v_proj.weight', 'action_head.gemma.model.layers.3.self_attn.o_proj.weight', 'action_head.gemma.model.layers.3.mlp.gate_proj.weight', 'action_head.gemma.model.layers.3.mlp.up_proj.weight', 'action_head.gemma.model.layers.3.mlp.down_proj.weight', 'action_head.gemma.model.layers.3.input_layernorm.weight', 'action_head.gemma.model.layers.3.post_attention_layernorm.weight', 'action_head.gemma.model.layers.4.self_attn.q_proj.weight', 'action_head.gemma.model.layers.4.self_attn.k_proj.weight', 'action_head.gemma.model.layers.4.self_attn.v_proj.weight', 'action_head.gemma.model.layers.4.self_attn.o_proj.weight', 'action_head.gemma.model.layers.4.mlp.gate_proj.weight', 'action_head.gemma.model.layers.4.mlp.up_proj.weight', 'action_head.gemma.model.layers.4.mlp.down_proj.weight', 'action_head.gemma.model.layers.4.input_layernorm.weight', 'action_head.gemma.model.layers.4.post_attention_layernorm.weight', 'action_head.gemma.model.layers.5.self_attn.q_proj.weight', 'action_head.gemma.model.layers.5.self_attn.k_proj.weight', 'action_head.gemma.model.layers.5.self_attn.v_proj.weight', 'action_head.gemma.model.layers.5.self_attn.o_proj.weight', 'action_head.gemma.model.layers.5.mlp.gate_proj.weight', 'action_head.gemma.model.layers.5.mlp.up_proj.weight', 'action_head.gemma.model.layers.5.mlp.down_proj.weight', 'action_head.gemma.model.layers.5.input_layernorm.weight', 'action_head.gemma.model.layers.5.post_attention_layernorm.weight', 'action_head.gemma.model.layers.6.self_attn.q_proj.weight', 'action_head.gemma.model.layers.6.self_attn.k_proj.weight', 'action_head.gemma.model.layers.6.self_attn.v_proj.weight', 'action_head.gemma.model.layers.6.self_attn.o_proj.weight', 'action_head.gemma.model.layers.6.mlp.gate_proj.weight', 'action_head.gemma.model.layers.6.mlp.up_proj.weight', 
'action_head.gemma.model.layers.6.mlp.down_proj.weight', 'action_head.gemma.model.layers.6.input_layernorm.weight', 'action_head.gemma.model.layers.6.post_attention_
37
+ unexpected keys: []
38
+ ************************* Initialize model successful!
39
+ ************************* LoRA flags: use_lora=True, lora_llm=True, lora_vit=False, lora_connector=False
40
+ ************************* Before add lora to model
41
+ ************************* Add lora to model.transformer successful!
42
+ ************************* Before FSDP model wrapping
43
+ ************************* FSDP model wrapping successful!
44
+ ************************* Before building optimizer and scheduler
45
+ ************* Before get lora params
46
+ ************* get lora params name: llm_params[0]: blocks.6._fsdp_wrapped_module.ff_out.parametrizations.weight.0.lora_A
47
+ ************* After get lora params successfully
48
+ 09/24 [07:51:21] INFO | >> Constructing optimizer with 1 param groups optim.py:1283
49
+ **************************************************
50
+ After building optimizer and scheduler and model, before training, peak GPU memory (MB): 39071
51
+ ************************* VLATrainer initialized successfully!
52
+ ************************* Before trainer.fit()
53
+ Pre-train system metrics
54
+ System/Peak GPU Memory (MB)=39,071
55
+ !!!Training failed:
56
+ Traceback (most recent call last):
57
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 593, in main
58
+ trainer.fit()
59
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/train.py", line 2277, in fit
60
+ for batch in self.train_loader:
61
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 734, in __next__
62
+ data = self._next_data()
63
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 790, in _next_data
64
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
65
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 43, in fetch
66
+ return self.collate_fn(data)
67
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py", line 158, in __call__
68
+ self._add_action_tokens_to_batch(batch)
69
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py", line 254, in _add_action_tokens_to_batch
70
+ action_tokens = self._build_action_tokens()
71
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py", line 337, in _build_action_tokens
72
+ assert len(right_eef_tokens) == ACTION_DIMS_MAPPING['right_end_effector']
73
+ AssertionError
74
+ wandb: WARNING The `quiet` argument to `wandb.run.finish()` is deprecated, use `wandb.Settings(quiet=...)` to set this instead.
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/requirements.txt ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ tensorflow-rocm==2.16.2
15
+ termcolor==3.1.0
16
+ Werkzeug==3.1.3
17
+ Brotli==1.1.0
18
+ Farama-Notifications==0.0.4
19
+ MarkupSafe==2.1.5
20
+ PyYAML==6.0.2
21
+ absl-py==2.3.1
22
+ accelerate==1.10.1
23
+ ai2-molmo==0.0.0
24
+ aiofiles==24.1.0
25
+ aiohappyeyeballs==2.6.1
26
+ aiohttp==3.12.15
27
+ aiosignal==1.4.0
28
+ annotated-types==0.7.0
29
+ antlr4-python3-runtime==4.9.3
30
+ anyio==4.10.0
31
+ array_record==0.8.1
32
+ async-timeout==5.0.1
33
+ attrs==25.3.0
34
+ av==15.1.0
35
+ backports.tarfile==1.2.0
36
+ beaker-gantry==3.2.0
37
+ beaker-py==2.5.0
38
+ black==23.12.1
39
+ blinker==1.9.0
40
+ boltons==25.0.0
41
+ boto3==1.40.33
42
+ botocore==1.40.33
43
+ build==1.3.0
44
+ cached_path==1.7.3
45
+ cached-property==2.0.1
46
+ cachetools==5.5.2
47
+ certifi==2025.8.3
48
+ cffi==2.0.0
49
+ charset-normalizer==3.4.3
50
+ click==8.2.1
51
+ click-help-colors==0.9.4
52
+ click-option-group==0.5.7
53
+ cloudpickle==3.1.1
54
+ cmake==4.1.0
55
+ contourpy==1.3.2
56
+ cryptography==46.0.1
57
+ cycler==0.12.1
58
+ dataclass-extensions==0.2.3
59
+ datasets==3.6.0
60
+ decorator==5.2.1
61
+ deepdiff==8.6.1
62
+ diffusers==0.35.1
63
+ dill==0.3.8
64
+ distro==1.9.0
65
+ dlimp==0.0.1
66
+ dm-tree==0.1.9
67
+ docutils==0.22.1
68
+ draccus==0.10.0
69
+ editdistance==0.8.1
70
+ einops==0.8.1
71
+ einops-exts==0.0.4
72
+ etils==1.13.0
73
+ evdev==1.9.2
74
+ exceptiongroup==1.3.0
75
+ face==24.0.0
76
+ fastapi==0.116.2
77
+ ffmpy==0.6.1
78
+ fiddle==0.3.0
79
+ filelock==3.13.1
80
+ Flask==3.1.2
81
+ fonttools==4.60.0
82
+ frozenlist==1.7.0
83
+ fsspec==2023.9.2
84
+ ftfy==6.3.1
85
+ gcsfs==2023.9.2
86
+ gitdb==4.0.12
87
+ GitPython==3.1.45
88
+ glom==24.11.0
89
+ google-api-core==2.25.1
90
+ google-auth==2.40.3
91
+ google-auth-oauthlib==1.2.2
92
+ google-cloud-core==2.4.3
93
+ google-cloud-storage==2.19.0
94
+ google-crc32c==1.7.1
95
+ google-resumable-media==2.7.2
96
+ googleapis-common-protos==1.70.0
97
+ gradio==5.46.0
98
+ gradio_client==1.13.0
99
+ graphviz==0.21
100
+ groovy==0.1.2
101
+ grpcio==1.75.0
102
+ gymnasium==0.29.1
103
+ h11==0.16.0
104
+ hf_transfer==0.1.9
105
+ hf-xet==1.1.10
106
+ httpcore==1.0.9
107
+ httpx==0.28.1
108
+ huggingface-hub==0.35.0
109
+ id==1.5.0
110
+ idna==3.10
111
+ imageio==2.37.0
112
+ imageio-ffmpeg==0.6.0
113
+ importlib_metadata==8.7.0
114
+ importlib_resources==6.5.2
115
+ iniconfig==2.1.0
116
+ inquirerpy==0.3.4
117
+ isort==5.12.0
118
+ itsdangerous==2.2.0
119
+ jaraco.classes==3.4.0
120
+ jaraco.context==6.0.1
121
+ jaraco.functools==4.3.0
122
+ jeepney==0.9.0
123
+ Jinja2==3.1.4
124
+ jiter==0.11.0
125
+ jmespath==1.0.1
126
+ joblib==1.5.2
127
+ jsonlines==4.0.0
128
+ keras==2.15.0
129
+ keyring==25.6.0
130
+ kiwisolver==1.4.9
131
+ latex2sympy2_extended==1.10.2
132
+ lerobot==0.3.4
133
+ Levenshtein==0.27.1
134
+ libcst==1.8.4
135
+ lightning-utilities==0.15.2
136
+ markdown-it-py==4.0.0
137
+ math-verify==0.8.0
138
+ matplotlib==3.10.6
139
+ mdurl==0.1.2
140
+ mergedeep==1.3.4
141
+ ml-dtypes==0.2.0
142
+ ml_dtypes==0.5.3
143
+ more-itertools==10.8.0
144
+ mpmath==1.3.0
145
+ msgspec==0.19.0
146
+ multidict==6.6.4
147
+ multiprocess==0.70.16
148
+ mypy==1.3.0
149
+ mypy_extensions==1.1.0
150
+ necessary==0.4.3
151
+ networkx==3.3
152
+ nh3==0.3.0
153
+ nltk==3.9.1
154
+ numpy==1.26.4
155
+ oauthlib==3.3.1
156
+ omegaconf==2.3.0
157
+ openai==1.108.0
158
+ opencv-python-headless==4.12.0.88
159
+ OpenEXR==3.4.0
160
+ orderly-set==5.5.0
161
+ orjson==3.11.3
162
+ packaging==25.0
163
+ pandas==2.3.2
164
+ pathspec==0.12.1
165
+ petname==2.6
166
+ pfzy==0.3.4
167
+ pillow==11.0.0
168
+ pip==25.2
169
+ platformdirs==4.4.0
170
+ pluggy==1.6.0
171
+ promise==2.3
172
+ prompt_toolkit==3.0.52
173
+ propcache==0.3.2
174
+ proto-plus==1.26.1
175
+ protobuf==4.21.12
176
+ protobuf==6.32.1
177
+ psutil==7.1.0
178
+ pyarrow==21.0.0
179
+ pyasn1==0.6.1
180
+ pyasn1_modules==0.4.2
181
+ pycparser==2.23
182
+ pydantic==2.11.9
183
+ pydantic_core==2.33.2
184
+ pydub==0.25.1
185
+ Pygments==2.19.2
186
+ pynput==1.8.1
187
+ pyparsing==3.2.4
188
+ pyproject_hooks==1.2.0
189
+ pyserial==3.5
190
+ pytest==8.4.2
191
+ pytest-sphinx==0.6.3
192
+ python-dateutil==2.9.0.post0
193
+ python-Levenshtein==0.27.1
194
+ python-multipart==0.0.20
195
+ python-xlib==0.33
196
+ pytorch-triton-rocm==3.4.0
197
+ pytz==2025.2
198
+ pyyaml-include==1.4.1
199
+ RapidFuzz==3.14.1
200
+ readme_renderer==44.0
201
+ regex==2025.9.1
202
+ requests==2.32.5
203
+ requests-oauthlib==2.0.0
204
+ requests-toolbelt==1.0.0
205
+ requirements-parser==0.13.0
206
+ rerun-sdk==0.22.1
207
+ rfc3986==2.0.0
208
+ rich==13.9.4
209
+ rsa==4.9.1
210
+ ruff==0.13.0
211
+ s3transfer==0.14.0
212
+ safehttpx==0.1.6
213
+ safetensors==0.6.2
214
+ scikit-learn==1.7.2
215
+ scipy==1.15.3
216
+ SecretStorage==3.4.0
217
+ semantic-version==2.10.0
218
+ sentencepiece==0.2.1
219
+ sentry-sdk==2.38.0
220
+ setuptools==78.1.1
221
+ shellingham==1.5.4
222
+ six==1.17.0
223
+ smart_open==7.3.1
224
+ smashed==0.21.5
225
+ smmap==5.0.2
226
+ sniffio==1.3.1
227
+ starlette==0.48.0
228
+ sympy==1.13.3
229
+ tensorboard==2.15.2
230
+ tensorboard==2.19.0
231
+ tensorflow==2.15.0
232
+ tensorflow-addons==0.23.0
233
+ tensorflow-datasets==4.9.3
234
+ tensorflow-estimator==2.15.0
235
+ tensorflow-graphics==2021.12.3
236
+ tensorflow-metadata==1.17.2
237
+ threadpoolctl==3.6.0
238
+ timm==1.0.19
239
+ tokenizers==0.22.0
240
+ toml==0.10.2
241
+ tomli==2.2.1
242
+ tomlkit==0.13.3
243
+ torch==2.8.0+rocm6.4
244
+ torchcodec==0.5
245
+ torchmetrics==1.8.2
246
+ torchvision==0.23.0+rocm6.4
247
+ tqdm==4.67.1
248
+ transformers==4.56.1
249
+ trimesh==4.8.2
250
+ trouting==0.3.3
251
+ twine==6.2.0
252
+ typeguard==2.13.3
253
+ typer==0.17.4
254
+ typing_extensions==4.15.0
255
+ typing-inspect==0.9.0
256
+ typing-inspection==0.4.1
257
+ tzdata==2025.2
258
+ urllib3==2.5.0
259
+ uvicorn==0.35.0
260
+ wandb==0.21.4
261
+ wcwidth==0.2.13
262
+ websockets==15.0.1
263
+ wheel==0.45.1
264
+ wrapt==1.14.2
265
+ xxhash==3.5.0
266
+ yarl==1.20.1
267
+ zipp==3.23.0
268
+ lerobot==0.3.4
269
+ minLoRA==0.1.0
270
+ autocommand==2.2.2
271
+ backports.tarfile==1.2.0
272
+ importlib_metadata==8.0.0
273
+ inflect==7.3.1
274
+ jaraco.collections==5.1.0
275
+ jaraco.context==5.3.0
276
+ jaraco.functools==4.0.1
277
+ jaraco.text==3.12.1
278
+ more-itertools==10.3.0
279
+ packaging==24.2
280
+ platformdirs==4.2.2
281
+ tomli==2.0.1
282
+ typeguard==4.3.0
283
+ typing_extensions==4.12.2
284
+ wheel==0.45.1
285
+ zipp==3.19.2
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-09-24T07:49:27.634943Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "flow_matching",
12
+ "--seq_len",
13
+ "768",
14
+ "--lora_rank",
15
+ "32",
16
+ "--lora_llm",
17
+ "--checkpoint",
18
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
19
+ "--device_train_microbatch_size",
20
+ "22",
21
+ "--global_batch_size",
22
+ "176",
23
+ "--dataset",
24
+ "vla_dataset_realworld",
25
+ "--llm_learning_rate",
26
+ "5e-5",
27
+ "--wandb_entity",
28
+ "henryeap",
29
+ "--wandb_project",
30
+ "a1-realworld",
31
+ "--wandb_run_name",
32
+ "glue",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "c13f2763af61e0d729a8b5ab4bdefc512205bcc5"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb",
44
+ "host": "auh7-1b-gpu-295",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "470343073792",
53
+ "used": "50552754176"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606931968"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "2",
62
+ "uniqueId": "0xab01f34fc0edbb6e",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "4",
75
+ "uniqueId": "0x37e5d0f3d8682cca",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "1",
88
+ "uniqueId": "0xf07610cbfae55ec0",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "3",
101
+ "uniqueId": "0x9a76422d710d96e6",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "7",
114
+ "uniqueId": "0xe1b4f7497ad1d2db",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "5",
127
+ "uniqueId": "0x2e0c5f8d27fbe8f1",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "0",
140
+ "uniqueId": "0xca50e2816c5058ba",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "6",
153
+ "uniqueId": "0xfaa84ccf6c76f5e3",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1758959301",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "1610",
177
+ "job_name": "realworld_mh",
178
+ "job_nodelist": "auh7-1b-gpu-295",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1758700101",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "1610",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-295",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "3944397",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-295",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "1pt1kzn3156onku1dbmbvrh2eyknhblo"
204
+ }
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime":113.732812458,"System/Peak GPU Memory (MB)":39071.62890625,"_timestamp":1.7587002818870535e+09,"_step":0,"_wandb":{"runtime":113}}
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T07:49:27.889617724Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpsbiq8ish/port-3944478.txt","pid":3944478,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-09-24T07:49:27.891870094Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3944478}
3
+ {"time":"2025-09-24T07:49:27.892998519Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-3944478-3944654-2489211586/socket","Net":"unix"}}
4
+ {"time":"2025-09-24T07:49:28.091672549Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-09-24T07:49:28.107596353Z","level":"INFO","msg":"handleInformInit: received","streamId":"rwm1qqvr","id":"1(@)"}
6
+ {"time":"2025-09-24T07:49:29.254468509Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"rwm1qqvr","id":"1(@)"}
7
+ {"time":"2025-09-24T07:51:25.363561879Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"rwm1qqvr","id":"1(@)"}
8
+ {"time":"2025-09-24T07:51:25.367101887Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"rwm1qqvr","id":"1(@)"}
9
+ {"time":"2025-09-24T07:51:25.367107627Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-09-24T07:51:25.367113717Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-09-24T07:51:25.367121007Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-09-24T07:51:25.367120867Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2025-09-24T07:51:25.367156617Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2025-09-24T07:51:25.367160127Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2025-09-24T07:51:25.367206328Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-3944478-3944654-2489211586/socket","Net":"unix"}}
16
+ {"time":"2025-09-24T07:51:25.367223268Z","level":"INFO","msg":"server is closed"}
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-24T07:49:28.109477949Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-09-24T07:49:29.254414398Z","level":"INFO","msg":"stream: created new stream","id":"rwm1qqvr"}
3
+ {"time":"2025-09-24T07:49:29.254462629Z","level":"INFO","msg":"stream: started","id":"rwm1qqvr"}
4
+ {"time":"2025-09-24T07:49:29.254479529Z","level":"INFO","msg":"writer: started","stream_id":"rwm1qqvr"}
5
+ {"time":"2025-09-24T07:49:29.254487509Z","level":"INFO","msg":"handler: started","stream_id":"rwm1qqvr"}
6
+ {"time":"2025-09-24T07:49:29.254512839Z","level":"INFO","msg":"sender: started","stream_id":"rwm1qqvr"}
7
+ {"time":"2025-09-24T07:51:23.341950674Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":1.454277139}],"total_operations":1}}
8
+ {"time":"2025-09-24T07:51:24.999785419Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-09-24T07:51:25.363888344Z","level":"INFO","msg":"stream: closing","id":"rwm1qqvr"}
10
+ {"time":"2025-09-24T07:51:25.363901414Z","level":"INFO","msg":"handler: closed","stream_id":"rwm1qqvr"}
11
+ {"time":"2025-09-24T07:51:25.36514555Z","level":"INFO","msg":"sender: closed","stream_id":"rwm1qqvr"}
12
+ {"time":"2025-09-24T07:51:25.36515125Z","level":"INFO","msg":"stream: closed","id":"rwm1qqvr"}
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/logs/debug.log ADDED
File without changes
all_flow_matching/glue_best/wandb/wandb/run-20250924_074927-rwm1qqvr/run-rwm1qqvr.wandb ADDED
Binary file (54.6 kB). View file
 
all_flow_matching/glue_best/wandb/wandb/run-20250924_075956-zoletkkn/files/config.yaml ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ 89offtg18nkl0daugw7ob6ogc3vo0r47:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - flow_matching
13
+ - --seq_len
14
+ - "768"
15
+ - --lora_rank
16
+ - "32"
17
+ - --lora_llm
18
+ - --checkpoint
19
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
20
+ - --device_train_microbatch_size
21
+ - "22"
22
+ - --global_batch_size
23
+ - "176"
24
+ - --dataset
25
+ - vla_dataset_realworld
26
+ - --llm_learning_rate
27
+ - "5e-5"
28
+ - --wandb_entity
29
+ - henryeap
30
+ - --wandb_project
31
+ - a1-realworld
32
+ - --wandb_run_name
33
+ - glue
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "50870067200"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: c13f2763af61e0d729a8b5ab4bdefc512205bcc5
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "1"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x51514ecc6ede157"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "2"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0xf3ef7b4642ab85b4"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "0"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0x7f3568312f929f55"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "3"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0x78c4870668ca6f3c"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "6"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0xa5b5be8f3bb8ee59"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "4"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0xd7645877fbcaeda9"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "5"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0x29dc055d2883ffc3"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "7"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0xa98ff96823c37f37"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-310
140
+ memory:
141
+ total: "2434606936064"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1758959956"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "1619"
158
+ job_name: realworld_mh
159
+ job_nodelist: auh7-1b-gpu-310
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1758700756"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "1619"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-310
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "1391516"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-310
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-09-24T07:59:56.864946Z"
184
+ writerId: 89offtg18nkl0daugw7ob6ogc3vo0r47
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 3
191
+ - 5
192
+ - 11
193
+ - 41
194
+ - 49
195
+ - 51
196
+ - 53
197
+ - 63
198
+ - 71
199
+ - 83
200
+ - 95
201
+ - 105
202
+ "2":
203
+ - 1
204
+ - 3
205
+ - 5
206
+ - 11
207
+ - 41
208
+ - 49
209
+ - 51
210
+ - 53
211
+ - 63
212
+ - 71
213
+ - 83
214
+ - 95
215
+ - 105
216
+ "3":
217
+ - 2
218
+ - 13
219
+ - 15
220
+ - 16
221
+ - 61
222
+ "4": 3.10.18
223
+ "5": 0.21.4
224
+ "6": 4.56.1
225
+ "10":
226
+ - 19
227
+ "12": 0.21.4
228
+ "13": linux-x86_64
229
+ activation_checkpointing:
230
+ value: whole_layer
231
+ allow_resume:
232
+ value: false
233
+ batch_divisor:
234
+ value: global_batch
235
+ canceled_check_interval:
236
+ value: 50
237
+ checkpoint_dir:
238
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
239
+ compile:
240
+ value: null
241
+ console_log_interval:
242
+ value: 1
243
+ data:
244
+ value:
245
+ dataset: vla_dataset_realworld
246
+ drop_last: true
247
+ for_inference: false
248
+ lerobot_episode_index_end: null
249
+ lerobot_episode_index_start: null
250
+ mixture: null
251
+ multi_modal: torch
252
+ num_workers: 0
253
+ pad: to_max
254
+ persistent_workers: false
255
+ pin_memory: true
256
+ prefetch_factor: null
257
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
258
+ rlds_dataset_name: libero_4_task_suites_no_noops
259
+ rlds_read_threads: 8
260
+ rlds_shuffle_buffer_size: 100000
261
+ rlds_traj_threads: 8
262
+ root_size_mixture: null
263
+ seed: 95818
264
+ sequence_length: 768
265
+ shuffle: true
266
+ shuffle_messages: false
267
+ split: train
268
+ timeout: 0
269
+ use_proprio: true
270
+ use_wrist_image: true
271
+ device_eval_batch_size:
272
+ value: 4
273
+ device_inf_eval_batch_size:
274
+ value: 16
275
+ device_train_batch_size:
276
+ value: 22
277
+ device_train_grad_accum:
278
+ value: 1
279
+ device_train_microbatch_size:
280
+ value: 22
281
+ dry_run:
282
+ value: false
283
+ early_exit:
284
+ value: false
285
+ epoch:
286
+ value: null
287
+ eval_interval:
288
+ value: 0
289
+ eval_on_load:
290
+ value: false
291
+ eval_subset_num_batches:
292
+ value: -1
293
+ evaluators:
294
+ value:
295
+ - data:
296
+ dataset: vla_dataset_realworld
297
+ drop_last: true
298
+ for_inference: false
299
+ lerobot_episode_index_end: 765
300
+ lerobot_episode_index_start: 353
301
+ mixture: null
302
+ multi_modal: torch
303
+ num_workers: 0
304
+ pad: to_max
305
+ persistent_workers: true
306
+ pin_memory: true
307
+ prefetch_factor: null
308
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
309
+ rlds_dataset_name: libero_4_task_suites_no_noops
310
+ rlds_read_threads: 8
311
+ rlds_shuffle_buffer_size: 256000
312
+ rlds_traj_threads: 8
313
+ root_size_mixture: null
314
+ seed: null
315
+ sequence_length: 768
316
+ shuffle: false
317
+ shuffle_messages: false
318
+ split: validation
319
+ timeout: 0
320
+ use_proprio: true
321
+ use_wrist_image: true
322
+ device_eval_batch_size: null
323
+ eval_name: null
324
+ label: val
325
+ max_examples: null
326
+ max_new_tokens: 448
327
+ mm_evaluator: null
328
+ save_dir: null
329
+ save_to_checkpoint_dir: false
330
+ skip_if_metrics_cached: true
331
+ subset_num_batches: 64
332
+ extra_steps_after_cancel:
333
+ value: 10
334
+ fast_forward_batches:
335
+ value: null
336
+ force_save_unsharded:
337
+ value: false
338
+ fsdp:
339
+ value:
340
+ hybrid_sharding_num_model_replicas: null
341
+ precision: float
342
+ sharding_strategy: FULL_SHARD
343
+ use_orig_params: true
344
+ wrapping_strategy: by_block_and_size
345
+ ft_connector:
346
+ value: false
347
+ ft_embedding:
348
+ value: lm_head
349
+ ft_llm:
350
+ value: false
351
+ ft_vit:
352
+ value: false
353
+ fused_loss:
354
+ value: null
355
+ gen1_gc_interval:
356
+ value: 1
357
+ global_train_batch_size:
358
+ value: 176
359
+ inf_eval_interval:
360
+ value: -1
361
+ inf_eval_subset_num_batches:
362
+ value: -1
363
+ inf_evaluators:
364
+ value: []
365
+ initial_model_checkpoint:
366
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
367
+ keep_lr_on_load:
368
+ value: true
369
+ load_model_config:
370
+ value: null
371
+ load_path:
372
+ value: null
373
+ load_path_sharded_checkpointer:
374
+ value: null
375
+ lora:
376
+ value: false
377
+ lora_connector:
378
+ value: false
379
+ lora_llm:
380
+ value: true
381
+ lora_rank:
382
+ value: 32
383
+ lora_vit:
384
+ value: false
385
+ max_duration:
386
+ value: 500000
387
+ max_grad_norm:
388
+ value: 1
389
+ max_grad_norm_ratio:
390
+ value: null
391
+ model:
392
+ value:
393
+ action_head: flow_matching
394
+ action_head_dit_depth: 28
395
+ action_head_dit_hidden_size: 1152
396
+ action_head_dit_num_heads: 16
397
+ action_use_left_eef: false
398
+ action_use_mobile_base: false
399
+ activation_type: swiglu
400
+ additional_vocab_size: 128
401
+ always_start_with_space: true
402
+ attention_dropout: 0
403
+ attention_layer_norm: false
404
+ attention_layer_norm_with_affine: true
405
+ attention_type: sdpa
406
+ bias_for_layer_norm: null
407
+ block_group_size: 1
408
+ block_type: sequential
409
+ clip_qkv: null
410
+ crop_mode: overlap-and-resize-c2
411
+ d_model: 3584
412
+ default_inference_len: 65
413
+ embedding_dropout: 0
414
+ embedding_size: 152064
415
+ fix_image_padding: true
416
+ float32_attention: true
417
+ head_dim: null
418
+ image_feature_dropout: 0
419
+ image_padding_embed: pad_and_partial_pad
420
+ image_pooling_2d: attention_meanq
421
+ image_pooling_h: 2
422
+ image_pooling_w: 2
423
+ image_projector: mlp
424
+ include_bias: false
425
+ init_cutoff_factor: null
426
+ init_device: null
427
+ init_fn: normal
428
+ init_std: 0.02
429
+ initializer_range: 0.02
430
+ layer_norm_eps: 1e-06
431
+ layer_norm_type: rms
432
+ layer_norm_with_affine: true
433
+ llm_causal_attention: false
434
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
435
+ low_cpu_fsdp: true
436
+ max_crops: 12
437
+ max_position_embeddings: null
438
+ max_sequence_length: 4096
439
+ message_formatting: role
440
+ mlp_hidden_size: 37888
441
+ mlp_ratio: 4
442
+ moe_capacity_factor: 1.25
443
+ moe_dropless: true
444
+ moe_interleave: false
445
+ moe_lbl_in_fp32: false
446
+ moe_log_expert_assignment: false
447
+ moe_loss_weight: 0.1
448
+ moe_mlp_impl: sparse
449
+ moe_num_experts: 8
450
+ moe_shared_expert: false
451
+ moe_top_k: 2
452
+ moe_zloss_weight: null
453
+ multi_annotation_weighting: root_subsegments
454
+ n_heads: 28
455
+ n_kv_heads: 4
456
+ n_layers: 28
457
+ new_embedding_init_range: 0.02
458
+ norm_after: false
459
+ normalize_input_embeds: false
460
+ num_diffusion_inference_steps: 30
461
+ num_diffusion_steps: 1000
462
+ overlap_margins:
463
+ - 4
464
+ - 4
465
+ pad_tokenizer: true
466
+ pad_value: 0
467
+ precision: amp_bf16
468
+ prompt_type: uber_model
469
+ qkv_bias: true
470
+ residual_dropout: 0.1
471
+ response_residual_dropout: 0
472
+ rope: true
473
+ rope_full_precision: true
474
+ rope_theta: 1e+06
475
+ scale_logits: false
476
+ system_prompt_kind: demo_or_style
477
+ tokenizer:
478
+ identifier: Qwen/Qwen2-7B
479
+ tokenizer_dir: null
480
+ use_col_tokens: true
481
+ use_position_ids: true
482
+ use_proprio: true
483
+ vision_backbone:
484
+ attention_dropout: 0
485
+ fsdp_wrap: false
486
+ image_default_input_size:
487
+ - 336
488
+ - 336
489
+ image_dropout_rate: 0
490
+ image_emb_dim: 1024
491
+ image_head_dim: 64
492
+ image_mlp_activations: quick_gelu
493
+ image_mlp_dim: 4096
494
+ image_model_type: openai
495
+ image_norm_eps: 1e-05
496
+ image_num_heads: 16
497
+ image_num_key_value_heads: 16
498
+ image_num_layers: 23
499
+ image_num_pos: 577
500
+ image_patch_size: 14
501
+ image_pos_patch_size: 14
502
+ initializer_range: 0.02
503
+ residual_dropout: 0
504
+ resize_mode: default
505
+ vit_layers:
506
+ - -2
507
+ - -9
508
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
509
+ vocab_size: 152064
510
+ weight_tying: false
511
+ multi_component_grad_norm:
512
+ value: true
513
+ no_pre_train_checkpoint:
514
+ value: true
515
+ optimizer:
516
+ value:
517
+ betas:
518
+ - 0.9
519
+ - 0.95
520
+ connector_betas:
521
+ - 0.9
522
+ - 0.95
523
+ connector_eps: 1e-06
524
+ connector_learning_rate: 0.0002
525
+ connector_weight_decay: 0
526
+ eps: 1e-05
527
+ learning_rate: 0.0001
528
+ llm_betas:
529
+ - 0.9
530
+ - 0.95
531
+ llm_eps: 1e-06
532
+ llm_learning_rate: 5e-05
533
+ llm_weight_decay: 0
534
+ metrics_log_interval: 20
535
+ name: adamw
536
+ vit_betas:
537
+ - 0.9
538
+ - 0.95
539
+ vit_eps: 1e-06
540
+ vit_learning_rate: 6e-06
541
+ vit_weight_decay: 0
542
+ weight_decay: 0.01
543
+ precision:
544
+ value: amp_bf16
545
+ python_profiling:
546
+ value: false
547
+ remote_save_folder:
548
+ value: null
549
+ reset_dataloader_state:
550
+ value: false
551
+ reset_optimizer_state:
552
+ value: false
553
+ reset_trainer_state:
554
+ value: false
555
+ restore_dataloader:
556
+ value: true
557
+ run_name:
558
+ value: glue_20250924_075928
559
+ save_dataloader_state:
560
+ value: false
561
+ save_folder:
562
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt
563
+ save_interval:
564
+ value: 500
565
+ save_interval_action_head:
566
+ value: 500
567
+ save_interval_ephemeral:
568
+ value: null
569
+ save_interval_unsharded:
570
+ value: 500
571
+ save_num_action_head_checkpoints_to_keep:
572
+ value: 2
573
+ save_num_checkpoints_to_keep:
574
+ value: 1
575
+ save_num_unsharded_checkpoints_to_keep:
576
+ value: 1
577
+ save_overwrite:
578
+ value: true
579
+ scheduler:
580
+ value:
581
+ alpha_f: 0.1
582
+ connector_t_warmup: 200
583
+ grad_clip_warmup_factor: null
584
+ grad_clip_warmup_steps: null
585
+ llm_t_warmup: 2000
586
+ name: multimodal
587
+ t_max: null
588
+ t_warmup: 100
589
+ units: steps
590
+ vit_t_warmup: 2000
591
+ warmup_min_lr: 0
592
+ seed:
593
+ value: 6198
594
+ sharded_checkpointer:
595
+ value: torch_legacy
596
+ softmax_auxiliary_loss:
597
+ value: true
598
+ softmax_auxiliary_loss_scale:
599
+ value: 0.0001
600
+ speed_monitor:
601
+ value:
602
+ gpu_flops_available: null
603
+ window_size: 20
604
+ stop_after:
605
+ value: null
606
+ stop_at:
607
+ value: 500000
608
+ time_limit:
609
+ value: null
610
+ torch_profiling:
611
+ value: false
612
+ train_exit_random_layer:
613
+ value: false
614
+ use_lora:
615
+ value: true