MayerZhu1995 commited on
Commit
fcc199b
·
verified ·
1 Parent(s): e324844

Upload qwen3vl-0.6B libero-object checkpoint

Browse files
.gitattributes CHANGED
@@ -43,3 +43,4 @@ openvla_dino_siglip_llama2_libero_10_full_finetune_bs64/openvla_dino_siglip_llam
43
  gr00t_eagle_3b_robocasa_gr1_24x30_finetune_bs64/gr00t_eagle_3b_robocasa_posttrain_24x30_official_aug_2026_05_26_16_59_09.jsonl filter=lfs diff=lfs merge=lfs -text
44
  gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/gr00t_qwen3vl_0.6b_libero_10_0605_2026_06_05_10_26_17.jsonl filter=lfs diff=lfs merge=lfs -text
45
  gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
43
  gr00t_eagle_3b_robocasa_gr1_24x30_finetune_bs64/gr00t_eagle_3b_robocasa_posttrain_24x30_official_aug_2026_05_26_16_59_09.jsonl filter=lfs diff=lfs merge=lfs -text
44
  gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/gr00t_qwen3vl_0.6b_libero_10_0605_2026_06_05_10_26_17.jsonl filter=lfs diff=lfs merge=lfs -text
45
  gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/checkpoints/step-025128-epoch-24-loss=0.0245.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aee147a607cc396ce49aad3164ec5a1197dbf4d0ed9e32577e5e652a08a83e17
3
+ size 8850981096
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/config.json ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_qwen3vl_vla_ckpt": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors",
3
+ "_qwen3vl_vlm_config": {
4
+ "architectures": [
5
+ "Qwen3VLAForConditionalGeneration"
6
+ ],
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151645,
9
+ "image_token_id": 151655,
10
+ "model_type": "qwen3_vl",
11
+ "pad_token_id": 151643,
12
+ "pos_skipping_range": 4096,
13
+ "text_config": {
14
+ "attention_bias": false,
15
+ "attention_dropout": 0.0,
16
+ "bos_token_id": 151643,
17
+ "dtype": "bfloat16",
18
+ "eos_token_id": 151645,
19
+ "head_dim": 128,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 1024,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 3072,
24
+ "max_position_embeddings": 262144,
25
+ "model_type": "qwen3_vl_text",
26
+ "num_attention_heads": 16,
27
+ "num_hidden_layers": 28,
28
+ "num_key_value_heads": 8,
29
+ "pad_token_id": null,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_parameters": {
32
+ "mrope_interleaved": true,
33
+ "mrope_section": [
34
+ 24,
35
+ 20,
36
+ 20
37
+ ],
38
+ "rope_theta": 5000000,
39
+ "rope_type": "default"
40
+ },
41
+ "tie_word_embeddings": true,
42
+ "use_cache": true,
43
+ "vocab_size": 151936
44
+ },
45
+ "tie_word_embeddings": true,
46
+ "use_another_LLM_path": "",
47
+ "use_pos_skipping": false,
48
+ "vision_config": {
49
+ "deepstack_visual_indexes": [
50
+ 5,
51
+ 11,
52
+ 17
53
+ ],
54
+ "depth": 24,
55
+ "dtype": "bfloat16",
56
+ "hidden_act": "gelu_pytorch_tanh",
57
+ "hidden_size": 1024,
58
+ "in_channels": 3,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "model_type": "qwen3_vl",
62
+ "num_heads": 16,
63
+ "num_position_embeddings": 2304,
64
+ "out_hidden_size": 1024,
65
+ "patch_size": 16,
66
+ "spatial_merge_size": 2,
67
+ "temporal_patch_size": 2
68
+ }
69
+ },
70
+ "eval": {
71
+ "dataset": {
72
+ "transforms": [
73
+ {
74
+ "embodiment_id": 2,
75
+ "img_keys": [
76
+ "agentview_image",
77
+ "robot0_eye_in_hand_image"
78
+ ],
79
+ "type": "ProcessLiberoEvalInputs"
80
+ },
81
+ {
82
+ "type": "ConvertPILImageToNumpyArray"
83
+ },
84
+ {
85
+ "image_mean": [
86
+ 0.48145466,
87
+ 0.4578275,
88
+ 0.40821073
89
+ ],
90
+ "image_std": [
91
+ 0.26862954,
92
+ 0.26130258,
93
+ 0.27577711
94
+ ],
95
+ "img_key": "pixel_values",
96
+ "max_pixels": 1003520,
97
+ "merge_size": 2,
98
+ "min_pixels": 3136,
99
+ "patch_size": 16,
100
+ "temporal_patch_size": 2,
101
+ "to_tensor": true,
102
+ "type": "QWen2VLImageTransform"
103
+ },
104
+ {
105
+ "tokenizer": {
106
+ "model_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/",
107
+ "type": "PretrainedTokenizer"
108
+ },
109
+ "type": "LiberoPromptFromInputs"
110
+ },
111
+ {
112
+ "gripper_key": "robot0_gripper_qpos",
113
+ "norm_type": "mean_std",
114
+ "out_key": "states",
115
+ "pos_key": "robot0_eef_pos",
116
+ "quat_key": "robot0_eef_quat",
117
+ "state_dim": 64,
118
+ "type": "LiberoProprioFromInputs"
119
+ }
120
+ ],
121
+ "type": "LiberoParquetEvalDataset"
122
+ },
123
+ "denormalize_action": {
124
+ "norm_type": "mean_std",
125
+ "type": "DenormalizeLiberoAction"
126
+ },
127
+ "eval_chunk_size": 10,
128
+ "model_family": "pi0",
129
+ "num_steps_wait": 10,
130
+ "num_trials_per_task": 50,
131
+ "resize_size": 224,
132
+ "seed": 7,
133
+ "task_suite_name": "libero_object",
134
+ "type": "LiberoEvalRunner"
135
+ },
136
+ "inference_model": {
137
+ "freeze_projector": false,
138
+ "freeze_vlm_backbone": false,
139
+ "name_mapping": null,
140
+ "pretrained_name_or_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors",
141
+ "type": "LlavaVLA",
142
+ "vla_head": {
143
+ "action_dim": 32,
144
+ "backbone_embedding_dim": 2048,
145
+ "diffusion_model_cfg": {
146
+ "attention_head_dim": 48,
147
+ "cross_attention_dim": 2048,
148
+ "dropout": 0.2,
149
+ "final_dropout": true,
150
+ "interleave_self_attention": true,
151
+ "norm_type": "ada_norm",
152
+ "num_attention_heads": 32,
153
+ "num_layers": 16,
154
+ "output_dim": 1024,
155
+ "positional_embeddings": null
156
+ },
157
+ "hidden_size": 1024,
158
+ "input_embedding_dim": 1536,
159
+ "num_heads": 4,
160
+ "num_inference_timesteps": 4,
161
+ "num_layers": 1,
162
+ "ori_action_dim": 7,
163
+ "state_dim": 64,
164
+ "traj_length": 10,
165
+ "type": "FlowMatchingHead",
166
+ "vl_self_attention_cfg": {
167
+ "attention_head_dim": 64,
168
+ "dropout": 0.2,
169
+ "final_dropout": true,
170
+ "num_attention_heads": 32,
171
+ "num_layers": 4,
172
+ "positional_embeddings": null
173
+ }
174
+ },
175
+ "vlm_backbone": {
176
+ "attn_implementation": "sdpa",
177
+ "projection_output_dim": 2048,
178
+ "projection_type": "linear",
179
+ "type": "Qwen3VL",
180
+ "use_projection": true,
181
+ "vlm_backbone_id": "qwen3_0.6b_vl_pt",
182
+ "vlm_config": {
183
+ "architectures": [
184
+ "Qwen3VLAForConditionalGeneration"
185
+ ],
186
+ "dtype": "bfloat16",
187
+ "eos_token_id": 151645,
188
+ "image_token_id": 151655,
189
+ "model_type": "qwen3_vl",
190
+ "pad_token_id": 151643,
191
+ "pos_skipping_range": 4096,
192
+ "text_config": {
193
+ "attention_bias": false,
194
+ "attention_dropout": 0.0,
195
+ "bos_token_id": 151643,
196
+ "dtype": "bfloat16",
197
+ "eos_token_id": 151645,
198
+ "head_dim": 128,
199
+ "hidden_act": "silu",
200
+ "hidden_size": 1024,
201
+ "initializer_range": 0.02,
202
+ "intermediate_size": 3072,
203
+ "max_position_embeddings": 262144,
204
+ "model_type": "qwen3_vl_text",
205
+ "num_attention_heads": 16,
206
+ "num_hidden_layers": 28,
207
+ "num_key_value_heads": 8,
208
+ "pad_token_id": null,
209
+ "rms_norm_eps": 1e-06,
210
+ "rope_parameters": {
211
+ "mrope_interleaved": true,
212
+ "mrope_section": [
213
+ 24,
214
+ 20,
215
+ 20
216
+ ],
217
+ "rope_theta": 5000000,
218
+ "rope_type": "default"
219
+ },
220
+ "tie_word_embeddings": true,
221
+ "use_cache": true,
222
+ "vocab_size": 151936
223
+ },
224
+ "tie_word_embeddings": true,
225
+ "use_another_LLM_path": "",
226
+ "use_pos_skipping": false,
227
+ "vision_config": {
228
+ "deepstack_visual_indexes": [
229
+ 5,
230
+ 11,
231
+ 17
232
+ ],
233
+ "depth": 24,
234
+ "dtype": "bfloat16",
235
+ "hidden_act": "gelu_pytorch_tanh",
236
+ "hidden_size": 1024,
237
+ "in_channels": 3,
238
+ "initializer_range": 0.02,
239
+ "intermediate_size": 4096,
240
+ "model_type": "qwen3_vl",
241
+ "num_heads": 16,
242
+ "num_position_embeddings": 2304,
243
+ "out_hidden_size": 1024,
244
+ "patch_size": 16,
245
+ "spatial_merge_size": 2,
246
+ "temporal_patch_size": 2
247
+ }
248
+ },
249
+ "vlm_path": null
250
+ }
251
+ },
252
+ "model": {
253
+ "freeze_projector": false,
254
+ "freeze_vlm_backbone": false,
255
+ "name_mapping": null,
256
+ "pretrained_name_or_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors",
257
+ "strict_mapping": false,
258
+ "type": "LlavaVLA",
259
+ "vla_head": {
260
+ "action_dim": 32,
261
+ "backbone_embedding_dim": 2048,
262
+ "diffusion_model_cfg": {
263
+ "attention_head_dim": 48,
264
+ "cross_attention_dim": 2048,
265
+ "dropout": 0.2,
266
+ "final_dropout": true,
267
+ "interleave_self_attention": true,
268
+ "norm_type": "ada_norm",
269
+ "num_attention_heads": 32,
270
+ "num_layers": 16,
271
+ "output_dim": 1024,
272
+ "positional_embeddings": null
273
+ },
274
+ "hidden_size": 1024,
275
+ "input_embedding_dim": 1536,
276
+ "num_heads": 4,
277
+ "num_inference_timesteps": 4,
278
+ "num_layers": 1,
279
+ "ori_action_dim": 7,
280
+ "state_dim": 64,
281
+ "traj_length": 10,
282
+ "type": "FlowMatchingHead",
283
+ "vl_self_attention_cfg": {
284
+ "attention_head_dim": 64,
285
+ "dropout": 0.2,
286
+ "final_dropout": true,
287
+ "num_attention_heads": 32,
288
+ "num_layers": 4,
289
+ "positional_embeddings": null
290
+ }
291
+ },
292
+ "vlm_backbone": {
293
+ "attn_implementation": "sdpa",
294
+ "projection_output_dim": 2048,
295
+ "projection_type": "linear",
296
+ "type": "Qwen3VL",
297
+ "use_projection": true,
298
+ "vlm_backbone_id": "qwen3_0.6b_vl_pt",
299
+ "vlm_config": {
300
+ "architectures": [
301
+ "Qwen3VLAForConditionalGeneration"
302
+ ],
303
+ "dtype": "bfloat16",
304
+ "eos_token_id": 151645,
305
+ "image_token_id": 151655,
306
+ "model_type": "qwen3_vl",
307
+ "pad_token_id": 151643,
308
+ "pos_skipping_range": 4096,
309
+ "text_config": {
310
+ "attention_bias": false,
311
+ "attention_dropout": 0.0,
312
+ "bos_token_id": 151643,
313
+ "dtype": "bfloat16",
314
+ "eos_token_id": 151645,
315
+ "head_dim": 128,
316
+ "hidden_act": "silu",
317
+ "hidden_size": 1024,
318
+ "initializer_range": 0.02,
319
+ "intermediate_size": 3072,
320
+ "max_position_embeddings": 262144,
321
+ "model_type": "qwen3_vl_text",
322
+ "num_attention_heads": 16,
323
+ "num_hidden_layers": 28,
324
+ "num_key_value_heads": 8,
325
+ "pad_token_id": null,
326
+ "rms_norm_eps": 1e-06,
327
+ "rope_parameters": {
328
+ "mrope_interleaved": true,
329
+ "mrope_section": [
330
+ 24,
331
+ 20,
332
+ 20
333
+ ],
334
+ "rope_theta": 5000000,
335
+ "rope_type": "default"
336
+ },
337
+ "tie_word_embeddings": true,
338
+ "use_cache": true,
339
+ "vocab_size": 151936
340
+ },
341
+ "tie_word_embeddings": true,
342
+ "use_another_LLM_path": "",
343
+ "use_pos_skipping": false,
344
+ "vision_config": {
345
+ "deepstack_visual_indexes": [
346
+ 5,
347
+ 11,
348
+ 17
349
+ ],
350
+ "depth": 24,
351
+ "dtype": "bfloat16",
352
+ "hidden_act": "gelu_pytorch_tanh",
353
+ "hidden_size": 1024,
354
+ "in_channels": 3,
355
+ "initializer_range": 0.02,
356
+ "intermediate_size": 4096,
357
+ "model_type": "qwen3_vl",
358
+ "num_heads": 16,
359
+ "num_position_embeddings": 2304,
360
+ "out_hidden_size": 1024,
361
+ "patch_size": 16,
362
+ "spatial_merge_size": 2,
363
+ "temporal_patch_size": 2
364
+ }
365
+ },
366
+ "vlm_path": null
367
+ }
368
+ },
369
+ "runner": {
370
+ "change_key_name": false,
371
+ "collator": {
372
+ "keys": [
373
+ "states",
374
+ "observation.eepose",
375
+ "timestamp",
376
+ "images",
377
+ "img_masks",
378
+ "lang_tokens",
379
+ "lang_masks",
380
+ "actions",
381
+ "action_masks",
382
+ "embodiment_ids",
383
+ "image_grid_thw"
384
+ ],
385
+ "meta_keys": [
386
+ "task_description",
387
+ "prompt",
388
+ "info",
389
+ "stats"
390
+ ],
391
+ "type": "DictCollator"
392
+ },
393
+ "enable_gradient_checkpointing": false,
394
+ "enable_mixed_precision_training": true,
395
+ "learning_rate": 3e-05,
396
+ "lr_scheduler_type": "linear-warmup+cosine-decay",
397
+ "max_epochs": 24,
398
+ "max_grad_norm": 1.0,
399
+ "metric": {
400
+ "active_trackers": [
401
+ "jsonl",
402
+ "wandb"
403
+ ],
404
+ "grad_accumulation_steps": 1,
405
+ "run_dir": "work_dirs",
406
+ "type": "VLAMetric",
407
+ "window_size": 1
408
+ },
409
+ "mixed_precision_dtype": "bf16",
410
+ "sampler": null,
411
+ "sharding_strategy": "full-shard",
412
+ "tokenizer": {
413
+ "model_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/",
414
+ "type": "PretrainedTokenizer"
415
+ },
416
+ "type": "FSDPTrainRunner",
417
+ "warmup_ratio": 0.03,
418
+ "weight_decay": 0.0
419
+ },
420
+ "train_dataloader": {
421
+ "dataset": {
422
+ "datasets": {
423
+ "action_key": "action",
424
+ "action_window_size": 10,
425
+ "data_root_path": [
426
+ "/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1"
427
+ ],
428
+ "statistic_name": "libero_object_no_noops",
429
+ "transforms": [
430
+ {
431
+ "embodiment_id": 2,
432
+ "name_mappings": {
433
+ "actions": [
434
+ "actions"
435
+ ],
436
+ "observation.state": [
437
+ "states"
438
+ ]
439
+ },
440
+ "parquet_keys": [
441
+ "observation.state",
442
+ "timestamp",
443
+ "actions",
444
+ "info",
445
+ "stats",
446
+ "action_masks"
447
+ ],
448
+ "type": "ProcessParquetInputs",
449
+ "video_keys": [
450
+ "observation.images.image",
451
+ "observation.images.wrist_image"
452
+ ]
453
+ },
454
+ {
455
+ "type": "ParquetPrompter"
456
+ },
457
+ {
458
+ "tokenizer": {
459
+ "model_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/",
460
+ "type": "PretrainedTokenizer"
461
+ },
462
+ "type": "ProcessPrompts"
463
+ },
464
+ {
465
+ "height": 224,
466
+ "type": "ResizeImages",
467
+ "width": 224
468
+ },
469
+ {
470
+ "image_mean": [
471
+ 0.48145466,
472
+ 0.4578275,
473
+ 0.40821073
474
+ ],
475
+ "image_std": [
476
+ 0.26862954,
477
+ 0.26130258,
478
+ 0.27577711
479
+ ],
480
+ "max_pixels": 1003520,
481
+ "merge_size": 2,
482
+ "min_pixels": 3136,
483
+ "patch_size": 16,
484
+ "temporal_patch_size": 2,
485
+ "type": "QWen2VLImageTransform"
486
+ },
487
+ {
488
+ "action_dim": 32,
489
+ "action_key": "action",
490
+ "norm_type": "mean_std",
491
+ "state_dim": 64,
492
+ "state_key": "proprio",
493
+ "type": "NormalizeStatesAndActions"
494
+ }
495
+ ],
496
+ "type": "ParquetDataset",
497
+ "use_delta": false,
498
+ "window_start_idx": 0
499
+ },
500
+ "name_mappings": {
501
+ "action": [
502
+ "action"
503
+ ],
504
+ "observation.state": [
505
+ "proprio"
506
+ ]
507
+ },
508
+ "statistic_keys": [
509
+ "observation.state",
510
+ "timestamp",
511
+ "action"
512
+ ],
513
+ "statistic_name": "libero_object_no_noops",
514
+ "type": "DistributedRepeatingDataset"
515
+ },
516
+ "per_device_batch_size": 8,
517
+ "per_device_num_workers": 4
518
+ }
519
+ }
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/config.yaml ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _qwen3vl_vla_ckpt: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors
2
+ _qwen3vl_vlm_config:
3
+ architectures:
4
+ - Qwen3VLAForConditionalGeneration
5
+ dtype: bfloat16
6
+ eos_token_id: 151645
7
+ image_token_id: 151655
8
+ model_type: qwen3_vl
9
+ pad_token_id: 151643
10
+ pos_skipping_range: 4096
11
+ text_config:
12
+ attention_bias: false
13
+ attention_dropout: 0.0
14
+ bos_token_id: 151643
15
+ dtype: bfloat16
16
+ eos_token_id: 151645
17
+ head_dim: 128
18
+ hidden_act: silu
19
+ hidden_size: 1024
20
+ initializer_range: 0.02
21
+ intermediate_size: 3072
22
+ max_position_embeddings: 262144
23
+ model_type: qwen3_vl_text
24
+ num_attention_heads: 16
25
+ num_hidden_layers: 28
26
+ num_key_value_heads: 8
27
+ pad_token_id: null
28
+ rms_norm_eps: 1.0e-06
29
+ rope_parameters:
30
+ mrope_interleaved: true
31
+ mrope_section:
32
+ - 24
33
+ - 20
34
+ - 20
35
+ rope_theta: 5000000
36
+ rope_type: default
37
+ tie_word_embeddings: true
38
+ use_cache: true
39
+ vocab_size: 151936
40
+ tie_word_embeddings: true
41
+ use_another_LLM_path: ''
42
+ use_pos_skipping: false
43
+ vision_config:
44
+ deepstack_visual_indexes:
45
+ - 5
46
+ - 11
47
+ - 17
48
+ depth: 24
49
+ dtype: bfloat16
50
+ hidden_act: gelu_pytorch_tanh
51
+ hidden_size: 1024
52
+ in_channels: 3
53
+ initializer_range: 0.02
54
+ intermediate_size: 4096
55
+ model_type: qwen3_vl
56
+ num_heads: 16
57
+ num_position_embeddings: 2304
58
+ out_hidden_size: 1024
59
+ patch_size: 16
60
+ spatial_merge_size: 2
61
+ temporal_patch_size: 2
62
+ eval:
63
+ dataset:
64
+ transforms:
65
+ - embodiment_id: 2
66
+ img_keys:
67
+ - agentview_image
68
+ - robot0_eye_in_hand_image
69
+ type: ProcessLiberoEvalInputs
70
+ - type: ConvertPILImageToNumpyArray
71
+ - image_mean:
72
+ - 0.48145466
73
+ - 0.4578275
74
+ - 0.40821073
75
+ image_std:
76
+ - 0.26862954
77
+ - 0.26130258
78
+ - 0.27577711
79
+ img_key: pixel_values
80
+ max_pixels: 1003520
81
+ merge_size: 2
82
+ min_pixels: 3136
83
+ patch_size: 16
84
+ temporal_patch_size: 2
85
+ to_tensor: true
86
+ type: QWen2VLImageTransform
87
+ - tokenizer:
88
+ model_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/
89
+ type: PretrainedTokenizer
90
+ type: LiberoPromptFromInputs
91
+ - gripper_key: robot0_gripper_qpos
92
+ norm_type: mean_std
93
+ out_key: states
94
+ pos_key: robot0_eef_pos
95
+ quat_key: robot0_eef_quat
96
+ state_dim: 64
97
+ type: LiberoProprioFromInputs
98
+ type: LiberoParquetEvalDataset
99
+ denormalize_action:
100
+ norm_type: mean_std
101
+ type: DenormalizeLiberoAction
102
+ eval_chunk_size: 10
103
+ model_family: pi0
104
+ num_steps_wait: 10
105
+ num_trials_per_task: 50
106
+ resize_size: 224
107
+ seed: 7
108
+ task_suite_name: libero_object
109
+ type: LiberoEvalRunner
110
+ inference_model:
111
+ freeze_projector: false
112
+ freeze_vlm_backbone: false
113
+ name_mapping: null
114
+ pretrained_name_or_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors
115
+ type: LlavaVLA
116
+ vla_head:
117
+ action_dim: 32
118
+ backbone_embedding_dim: 2048
119
+ diffusion_model_cfg:
120
+ attention_head_dim: 48
121
+ cross_attention_dim: 2048
122
+ dropout: 0.2
123
+ final_dropout: true
124
+ interleave_self_attention: true
125
+ norm_type: ada_norm
126
+ num_attention_heads: 32
127
+ num_layers: 16
128
+ output_dim: 1024
129
+ positional_embeddings: null
130
+ hidden_size: 1024
131
+ input_embedding_dim: 1536
132
+ num_heads: 4
133
+ num_inference_timesteps: 4
134
+ num_layers: 1
135
+ ori_action_dim: 7
136
+ state_dim: 64
137
+ traj_length: 10
138
+ type: FlowMatchingHead
139
+ vl_self_attention_cfg:
140
+ attention_head_dim: 64
141
+ dropout: 0.2
142
+ final_dropout: true
143
+ num_attention_heads: 32
144
+ num_layers: 4
145
+ positional_embeddings: null
146
+ vlm_backbone:
147
+ attn_implementation: sdpa
148
+ projection_output_dim: 2048
149
+ projection_type: linear
150
+ type: Qwen3VL
151
+ use_projection: true
152
+ vlm_backbone_id: qwen3_0.6b_vl_pt
153
+ vlm_config:
154
+ architectures:
155
+ - Qwen3VLAForConditionalGeneration
156
+ dtype: bfloat16
157
+ eos_token_id: 151645
158
+ image_token_id: 151655
159
+ model_type: qwen3_vl
160
+ pad_token_id: 151643
161
+ pos_skipping_range: 4096
162
+ text_config:
163
+ attention_bias: false
164
+ attention_dropout: 0.0
165
+ bos_token_id: 151643
166
+ dtype: bfloat16
167
+ eos_token_id: 151645
168
+ head_dim: 128
169
+ hidden_act: silu
170
+ hidden_size: 1024
171
+ initializer_range: 0.02
172
+ intermediate_size: 3072
173
+ max_position_embeddings: 262144
174
+ model_type: qwen3_vl_text
175
+ num_attention_heads: 16
176
+ num_hidden_layers: 28
177
+ num_key_value_heads: 8
178
+ pad_token_id: null
179
+ rms_norm_eps: 1.0e-06
180
+ rope_parameters:
181
+ mrope_interleaved: true
182
+ mrope_section:
183
+ - 24
184
+ - 20
185
+ - 20
186
+ rope_theta: 5000000
187
+ rope_type: default
188
+ tie_word_embeddings: true
189
+ use_cache: true
190
+ vocab_size: 151936
191
+ tie_word_embeddings: true
192
+ use_another_LLM_path: ''
193
+ use_pos_skipping: false
194
+ vision_config:
195
+ deepstack_visual_indexes:
196
+ - 5
197
+ - 11
198
+ - 17
199
+ depth: 24
200
+ dtype: bfloat16
201
+ hidden_act: gelu_pytorch_tanh
202
+ hidden_size: 1024
203
+ in_channels: 3
204
+ initializer_range: 0.02
205
+ intermediate_size: 4096
206
+ model_type: qwen3_vl
207
+ num_heads: 16
208
+ num_position_embeddings: 2304
209
+ out_hidden_size: 1024
210
+ patch_size: 16
211
+ spatial_merge_size: 2
212
+ temporal_patch_size: 2
213
+ vlm_path: null
214
+ model:
215
+ freeze_projector: false
216
+ freeze_vlm_backbone: false
217
+ name_mapping: null
218
+ pretrained_name_or_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors
219
+ strict_mapping: false
220
+ type: LlavaVLA
221
+ vla_head:
222
+ action_dim: 32
223
+ backbone_embedding_dim: 2048
224
+ diffusion_model_cfg:
225
+ attention_head_dim: 48
226
+ cross_attention_dim: 2048
227
+ dropout: 0.2
228
+ final_dropout: true
229
+ interleave_self_attention: true
230
+ norm_type: ada_norm
231
+ num_attention_heads: 32
232
+ num_layers: 16
233
+ output_dim: 1024
234
+ positional_embeddings: null
235
+ hidden_size: 1024
236
+ input_embedding_dim: 1536
237
+ num_heads: 4
238
+ num_inference_timesteps: 4
239
+ num_layers: 1
240
+ ori_action_dim: 7
241
+ state_dim: 64
242
+ traj_length: 10
243
+ type: FlowMatchingHead
244
+ vl_self_attention_cfg:
245
+ attention_head_dim: 64
246
+ dropout: 0.2
247
+ final_dropout: true
248
+ num_attention_heads: 32
249
+ num_layers: 4
250
+ positional_embeddings: null
251
+ vlm_backbone:
252
+ attn_implementation: sdpa
253
+ projection_output_dim: 2048
254
+ projection_type: linear
255
+ type: Qwen3VL
256
+ use_projection: true
257
+ vlm_backbone_id: qwen3_0.6b_vl_pt
258
+ vlm_config:
259
+ architectures:
260
+ - Qwen3VLAForConditionalGeneration
261
+ dtype: bfloat16
262
+ eos_token_id: 151645
263
+ image_token_id: 151655
264
+ model_type: qwen3_vl
265
+ pad_token_id: 151643
266
+ pos_skipping_range: 4096
267
+ text_config:
268
+ attention_bias: false
269
+ attention_dropout: 0.0
270
+ bos_token_id: 151643
271
+ dtype: bfloat16
272
+ eos_token_id: 151645
273
+ head_dim: 128
274
+ hidden_act: silu
275
+ hidden_size: 1024
276
+ initializer_range: 0.02
277
+ intermediate_size: 3072
278
+ max_position_embeddings: 262144
279
+ model_type: qwen3_vl_text
280
+ num_attention_heads: 16
281
+ num_hidden_layers: 28
282
+ num_key_value_heads: 8
283
+ pad_token_id: null
284
+ rms_norm_eps: 1.0e-06
285
+ rope_parameters:
286
+ mrope_interleaved: true
287
+ mrope_section:
288
+ - 24
289
+ - 20
290
+ - 20
291
+ rope_theta: 5000000
292
+ rope_type: default
293
+ tie_word_embeddings: true
294
+ use_cache: true
295
+ vocab_size: 151936
296
+ tie_word_embeddings: true
297
+ use_another_LLM_path: ''
298
+ use_pos_skipping: false
299
+ vision_config:
300
+ deepstack_visual_indexes:
301
+ - 5
302
+ - 11
303
+ - 17
304
+ depth: 24
305
+ dtype: bfloat16
306
+ hidden_act: gelu_pytorch_tanh
307
+ hidden_size: 1024
308
+ in_channels: 3
309
+ initializer_range: 0.02
310
+ intermediate_size: 4096
311
+ model_type: qwen3_vl
312
+ num_heads: 16
313
+ num_position_embeddings: 2304
314
+ out_hidden_size: 1024
315
+ patch_size: 16
316
+ spatial_merge_size: 2
317
+ temporal_patch_size: 2
318
+ vlm_path: null
319
+ runner:
320
+ change_key_name: false
321
+ collator:
322
+ keys:
323
+ - states
324
+ - observation.eepose
325
+ - timestamp
326
+ - images
327
+ - img_masks
328
+ - lang_tokens
329
+ - lang_masks
330
+ - actions
331
+ - action_masks
332
+ - embodiment_ids
333
+ - image_grid_thw
334
+ meta_keys:
335
+ - task_description
336
+ - prompt
337
+ - info
338
+ - stats
339
+ type: DictCollator
340
+ enable_gradient_checkpointing: false
341
+ enable_mixed_precision_training: true
342
+ learning_rate: 3.0e-05
343
+ lr_scheduler_type: linear-warmup+cosine-decay
344
+ max_epochs: 24
345
+ max_grad_norm: 1.0
346
+ metric:
347
+ active_trackers:
348
+ - jsonl
349
+ - wandb
350
+ grad_accumulation_steps: 1
351
+ run_dir: work_dirs
352
+ type: VLAMetric
353
+ window_size: 1
354
+ mixed_precision_dtype: bf16
355
+ sampler: null
356
+ sharding_strategy: full-shard
357
+ tokenizer:
358
+ model_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/
359
+ type: PretrainedTokenizer
360
+ type: FSDPTrainRunner
361
+ warmup_ratio: 0.03
362
+ weight_decay: 0.0
363
+ train_dataloader:
364
+ dataset:
365
+ datasets:
366
+ action_key: action
367
+ action_window_size: 10
368
+ data_root_path:
369
+ - /limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1
370
+ statistic_name: libero_object_no_noops
371
+ transforms:
372
+ - embodiment_id: 2
373
+ name_mappings:
374
+ actions:
375
+ - actions
376
+ observation.state:
377
+ - states
378
+ parquet_keys:
379
+ - observation.state
380
+ - timestamp
381
+ - actions
382
+ - info
383
+ - stats
384
+ - action_masks
385
+ type: ProcessParquetInputs
386
+ video_keys:
387
+ - observation.images.image
388
+ - observation.images.wrist_image
389
+ - type: ParquetPrompter
390
+ - tokenizer:
391
+ model_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/
392
+ type: PretrainedTokenizer
393
+ type: ProcessPrompts
394
+ - height: 224
395
+ type: ResizeImages
396
+ width: 224
397
+ - image_mean:
398
+ - 0.48145466
399
+ - 0.4578275
400
+ - 0.40821073
401
+ image_std:
402
+ - 0.26862954
403
+ - 0.26130258
404
+ - 0.27577711
405
+ max_pixels: 1003520
406
+ merge_size: 2
407
+ min_pixels: 3136
408
+ patch_size: 16
409
+ temporal_patch_size: 2
410
+ type: QWen2VLImageTransform
411
+ - action_dim: 32
412
+ action_key: action
413
+ norm_type: mean_std
414
+ state_dim: 64
415
+ state_key: proprio
416
+ type: NormalizeStatesAndActions
417
+ type: ParquetDataset
418
+ use_delta: false
419
+ window_start_idx: 0
420
+ name_mappings:
421
+ action:
422
+ - action
423
+ observation.state:
424
+ - proprio
425
+ statistic_keys:
426
+ - observation.state
427
+ - timestamp
428
+ - action
429
+ statistic_name: libero_object_no_noops
430
+ type: DistributedRepeatingDataset
431
+ per_device_batch_size: 8
432
+ per_device_num_workers: 4
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/dataset_statistics.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "libero_object_no_noops": {
3
+ "proprio": {
4
+ "mean": [
5
+ -0.029990377887890714,
6
+ -0.007947119348036638,
7
+ 0.20293400450543442,
8
+ 3.108609864126749,
9
+ -0.2140478258736818,
10
+ -0.11307033080181891,
11
+ 0.02938040086729137,
12
+ -0.03055662046031239
13
+ ],
14
+ "std": [
15
+ 0.023670072817660013,
16
+ 0.06225550550101929,
17
+ 0.027602195887468282,
18
+ 0.030705662709939595,
19
+ 0.11858388544011475,
20
+ 0.0732862116780689,
21
+ 0.0033820150919409114,
22
+ 0.003251806898346789
23
+ ],
24
+ "min": [
25
+ -0.1765444278717041,
26
+ -0.29457300901412964,
27
+ 0.008128180168569088,
28
+ 2.2890501022338867,
29
+ -1.883241891860962,
30
+ -1.0600427389144897,
31
+ 0.0006495157140307128,
32
+ -0.041782498359680176
33
+ ],
34
+ "max": [
35
+ 0.14580604434013367,
36
+ 0.33216384053230286,
37
+ 0.3857804834842682,
38
+ 3.4003844261169434,
39
+ 0.7954911589622498,
40
+ 0.6642207503318787,
41
+ 0.04104341194033623,
42
+ -0.00018117300351150334
43
+ ],
44
+ "q01": null,
45
+ "q99": null
46
+ },
47
+ "timestamp": {
48
+ "mean": [
49
+ 3.721695479517497
50
+ ],
51
+ "std": [
52
+ 2.237081841546431
53
+ ],
54
+ "min": [
55
+ 0.0
56
+ ],
57
+ "max": [
58
+ 12.65
59
+ ],
60
+ "q01": null,
61
+ "q99": null
62
+ },
63
+ "action": {
64
+ "mean": [
65
+ 0.07096490746267721,
66
+ 0.13498889685796536,
67
+ -0.046013733641776924,
68
+ 0.0012352044345171392,
69
+ 0.006998803721298765,
70
+ -0.015027527802288103,
71
+ 0.46428998075465666
72
+ ],
73
+ "std": [
74
+ 0.10133946158044306,
75
+ 0.165716399861371,
76
+ 0.16914353294024564,
77
+ 0.009240558533809633,
78
+ 0.018657116474914717,
79
+ 0.015913625946349673,
80
+ 0.18849963395480163
81
+ ],
82
+ "min": [
83
+ -0.8839285969734192,
84
+ -0.9375,
85
+ -0.9375,
86
+ -0.15000000596046448,
87
+ -0.29035714268684387,
88
+ -0.32892856001853943,
89
+ 0.0
90
+ ],
91
+ "max": [
92
+ 0.9375,
93
+ 0.8919642567634583,
94
+ 0.9375,
95
+ 0.17678570747375488,
96
+ 0.35035714507102966,
97
+ 0.1810714304447174,
98
+ 1.0
99
+ ],
100
+ "q01": null,
101
+ "q99": null
102
+ }
103
+ }
104
+ }
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/run-metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hparams": "{'_qwen3vl_vla_ckpt': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', '_qwen3vl_vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'strict_mapping': False, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': ['/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1'], 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids', 'image_grid_thw'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {'_qwen3vl_vla_ckpt': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', '_qwen3vl_vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'strict_mapping': False, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': ['/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1'], 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids', 'image_grid_thw'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'run_id': 'gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'sharding_strategy': 'full-shard', 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'args': Namespace(config='/limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py', work_dir='/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', cfg_options={'runner.learning_rate': 3e-05, 'runner.max_epochs': 24}, eval_after_train=False, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'ConvertPILImageToNumpyArray'}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711], 'img_key': 'pixel_values', 'to_tensor': True}, {'type': 'LiberoPromptFromInputs', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'run_id': 'gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'sharding_strategy': 'full-shard', 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {'_qwen3vl_vla_ckpt': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', '_qwen3vl_vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'strict_mapping': False, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': ['/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1'], 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids', 'image_grid_thw'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'run_id': 'gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'sharding_strategy': 'full-shard', 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'args': Namespace(config='/limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py', work_dir='/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', cfg_options={'runner.learning_rate': 3e-05, 'runner.max_epochs': 24}, eval_after_train=False, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'ConvertPILImageToNumpyArray'}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711], 'img_key': 'pixel_values', 'to_tensor': True}, {'type': 'LiberoPromptFromInputs', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'args': Namespace(config='/limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py', work_dir='/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', cfg_options={'runner.learning_rate': 3e-05, 'runner.max_epochs': 24}, eval_after_train=False, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'ConvertPILImageToNumpyArray'}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711], 'img_key': 'pixel_values', 'to_tensor': True}, {'type': 'LiberoPromptFromInputs', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}", "run_id": "gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30"}
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<action_end>": 151675,
6
+ "<action_pad>": 151673,
7
+ "<action_start>": 151674,
8
+ "<action_video>": 151669,
9
+ "<box2d_end>": 151679,
10
+ "<box2d_start>": 151678,
11
+ "<future_video_pad>": 151670,
12
+ "<future_vision_end>": 151672,
13
+ "<future_vision_start>": 151671,
14
+ "<ignore_pad>": 151688,
15
+ "<point2d_end>": 151681,
16
+ "<point2d_start>": 151680,
17
+ "<ref_end>": 151683,
18
+ "<ref_keypoint_end>": 151685,
19
+ "<ref_keypoint_start>": 151684,
20
+ "<ref_start>": 151682,
21
+ "<think>": 151667,
22
+ "<think_end>": 151677,
23
+ "<think_start>": 151676,
24
+ "<tool_call>": 151657,
25
+ "<tool_response>": 151665,
26
+ "<traj2d_end>": 151687,
27
+ "<traj2d_start>": 151686,
28
+ "<|box_end|>": 151649,
29
+ "<|box_start|>": 151648,
30
+ "<|endoftext|>": 151643,
31
+ "<|file_sep|>": 151664,
32
+ "<|fim_middle|>": 151660,
33
+ "<|fim_pad|>": 151662,
34
+ "<|fim_prefix|>": 151659,
35
+ "<|fim_suffix|>": 151661,
36
+ "<|im_end|>": 151645,
37
+ "<|im_start|>": 151644,
38
+ "<|image_pad|>": 151655,
39
+ "<|object_ref_end|>": 151647,
40
+ "<|object_ref_start|>": 151646,
41
+ "<|quad_end|>": 151651,
42
+ "<|quad_start|>": 151650,
43
+ "<|repo_name|>": 151663,
44
+ "<|video_pad|>": 151656,
45
+ "<|vision_end|>": 151653,
46
+ "<|vision_pad|>": 151654,
47
+ "<|vision_start|>": 151652
48
+ }
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7a88bc82340dd4205b97b9c287df826cf386b31ac9ecd9e648073d940355e1
3
+ size 11426476
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<action_video>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<future_video_pad>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<future_vision_start>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<future_vision_end>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<action_pad>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<action_start>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<action_end>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "151676": {
270
+ "content": "<think_start>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<think_end>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<box2d_start>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<box2d_end>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<point2d_start>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<point2d_end>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<ref_start>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<ref_end>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<ref_keypoint_start>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<ref_keypoint_end>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<traj2d_start>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<traj2d_end>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<ignore_pad>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ }
373
+ },
374
+ "additional_special_tokens": [
375
+ "<|im_start|>",
376
+ "<|im_end|>",
377
+ "<|object_ref_start|>",
378
+ "<|object_ref_end|>",
379
+ "<|box_start|>",
380
+ "<|box_end|>",
381
+ "<|quad_start|>",
382
+ "<|quad_end|>",
383
+ "<|vision_start|>",
384
+ "<|vision_end|>",
385
+ "<|vision_pad|>",
386
+ "<|image_pad|>",
387
+ "<|video_pad|>"
388
+ ],
389
+ "bos_token": null,
390
+ "clean_up_tokenization_spaces": false,
391
+ "eos_token": "<|im_end|>",
392
+ "errors": "replace",
393
+ "extra_special_tokens": {},
394
+ "model_max_length": 32768,
395
+ "pad_token": "<|endoftext|>",
396
+ "padding_side": "right",
397
+ "processor_class": "Qwen3VLAProcessor",
398
+ "split_special_tokens": false,
399
+ "tokenizer_class": "Qwen2Tokenizer",
400
+ "unk_token": null
401
+ }
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/vlm_backbone_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLAForConditionalGeneration"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "eos_token_id": 151645,
7
+ "image_token_id": 151655,
8
+ "model_type": "qwen3_vl",
9
+ "pad_token_id": 151643,
10
+ "pos_skipping_range": 4096,
11
+ "text_config": {
12
+ "attention_bias": false,
13
+ "attention_dropout": 0.0,
14
+ "bos_token_id": 151643,
15
+ "dtype": "bfloat16",
16
+ "eos_token_id": 151645,
17
+ "head_dim": 128,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 1024,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 3072,
22
+ "max_position_embeddings": 262144,
23
+ "model_type": "qwen3_vl_text",
24
+ "num_attention_heads": 16,
25
+ "num_hidden_layers": 28,
26
+ "num_key_value_heads": 8,
27
+ "pad_token_id": null,
28
+ "rms_norm_eps": 1e-06,
29
+ "rope_parameters": {
30
+ "mrope_interleaved": true,
31
+ "mrope_section": [
32
+ 24,
33
+ 20,
34
+ 20
35
+ ],
36
+ "rope_theta": 5000000,
37
+ "rope_type": "default"
38
+ },
39
+ "tie_word_embeddings": true,
40
+ "use_cache": true,
41
+ "vocab_size": 151936
42
+ },
43
+ "tie_word_embeddings": true,
44
+ "transformers_version": "5.3.0",
45
+ "use_another_LLM_path": "",
46
+ "use_pos_skipping": false,
47
+ "video_token_id": 151656,
48
+ "vision_config": {
49
+ "deepstack_visual_indexes": [
50
+ 5,
51
+ 11,
52
+ 17
53
+ ],
54
+ "depth": 24,
55
+ "dtype": "bfloat16",
56
+ "hidden_act": "gelu_pytorch_tanh",
57
+ "hidden_size": 1024,
58
+ "in_channels": 3,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "model_type": "qwen3_vl",
62
+ "num_heads": 16,
63
+ "num_position_embeddings": 2304,
64
+ "out_hidden_size": 1024,
65
+ "patch_size": 16,
66
+ "spatial_merge_size": 2,
67
+ "temporal_patch_size": 2
68
+ },
69
+ "vision_end_token_id": 151653,
70
+ "vision_start_token_id": 151652
71
+ }