Junyi42 commited on
Commit
fdd4373
·
verified ·
1 Parent(s): cd6e0d0

Upload checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins

Browse files
checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/wandb/offline-run-20260126_192812-checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins-run0/files/output.log CHANGED
@@ -1,173 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (vit_model): SiglipVisionModel(
51
- (vision_model): FullyShardedDataParallel(
52
- (_fsdp_wrapped_module): SiglipVisionTransformer(
53
- (embeddings): SiglipVisionEmbeddings(
54
- (position_embedding): Embedding(4900, 1152)
55
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
56
- )
57
- (encoder): SiglipEncoder(
58
- (layers): ModuleList(
59
- (0-25): 26 x FullyShardedDataParallel(
60
- (_fsdp_wrapped_module): CheckpointWrapper(
61
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
62
- (self_attn): SiglipFlashAttention2(
63
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
64
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
65
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
66
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
67
- )
68
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
69
- (mlp): SiglipMLP(
70
- (activation_fn): PytorchGELUTanh()
71
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
72
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
73
- )
74
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
75
- )
76
- )
77
- )
78
- )
79
- )
80
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
81
- )
82
- )
83
- )
84
- (connector): FullyShardedDataParallel(
85
- (_fsdp_wrapped_module): CheckpointWrapper(
86
- (_checkpoint_wrapped_module): MLPconnector(
87
- (activation_fn): PytorchGELUTanh()
88
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
89
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
90
- )
91
- )
92
- )
93
- (vit_pos_embed): FullyShardedDataParallel(
94
- (_fsdp_wrapped_module): PositionEmbedding()
95
- )
96
- )
97
- )
98
- _flat_param True
99
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
100
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
101
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
102
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
103
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
104
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
105
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
106
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
107
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
108
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
109
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
110
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
111
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
112
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
113
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
128
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
142
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
143
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_pos_embed._fsdp_wrapped_module._flat_param False
156
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse/vlm_gym_match_equation_sos_train
157
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step0
158
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
159
- [eval debug] first 3 batch fingerprints:
160
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
161
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
162
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
163
- ce_avg: 1.5381660461425781, mse_avg: 0.0
164
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step500
165
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
166
- [eval debug] first 3 batch fingerprints:
167
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
168
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
169
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
170
- ce_avg: 0.06237730756402016, mse_avg: 0.0
171
  wandb: Detected [huggingface_hub.inference] in use.
172
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
173
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1252,6 +1082,176 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1252
  [2026-01-26 19:53:23] (step=0001071) Train Loss mse: 0.0000, Train Loss ce: 0.0635, Train Steps/Sec: 1.02,
1253
  [2026-01-26 19:53:24] (step=0001072) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 1.01,
1254
  [2026-01-26 19:53:25] (step=0001073) Train Loss mse: 0.0000, Train Loss ce: 0.0489, Train Steps/Sec: 1.01,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1255
  [2026-01-26 19:53:26] (step=0001074) Train Loss mse: 0.0000, Train Loss ce: 0.0603, Train Steps/Sec: 1.02,
1256
  [2026-01-26 19:53:27] (step=0001075) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.76,
1257
  [2026-01-26 19:53:28] (step=0001076) Train Loss mse: 0.0000, Train Loss ce: 0.0544, Train Steps/Sec: 0.82,
@@ -1289,27 +1289,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1289
  [2026-01-26 19:54:02] (step=0001108) Train Loss mse: 0.0000, Train Loss ce: 0.0604, Train Steps/Sec: 1.01,
1290
  [2026-01-26 19:54:03] (step=0001109) Train Loss mse: 0.0000, Train Loss ce: 0.0570, Train Steps/Sec: 0.81,
1291
  [2026-01-26 19:54:04] (step=0001110) Train Loss mse: 0.0000, Train Loss ce: 0.0559, Train Steps/Sec: 1.01,
1292
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step1500
1293
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
1294
- [eval debug] first 3 batch fingerprints:
1295
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1296
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1297
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1298
- ce_avg: 0.05330345034599304, mse_avg: 0.0
1299
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step2000
1300
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
1301
- [eval debug] first 3 batch fingerprints:
1302
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1303
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1304
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1305
- ce_avg: 0.05127852410078049, mse_avg: 0.0
1306
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step2500
1307
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
1308
- [eval debug] first 3 batch fingerprints:
1309
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1310
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1311
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1312
- ce_avg: 0.05304804816842079, mse_avg: 0.0
1313
  [2026-01-26 19:54:05] (step=0001111) Train Loss mse: 0.0000, Train Loss ce: 0.0425, Train Steps/Sec: 1.01,
1314
  [2026-01-26 19:54:06] (step=0001112) Train Loss mse: 0.0000, Train Loss ce: 0.0792, Train Steps/Sec: 1.01,
1315
  [2026-01-26 19:54:08] (step=0001113) Train Loss mse: 0.0000, Train Loss ce: 0.0488, Train Steps/Sec: 0.76,
@@ -2764,6 +2743,27 @@ ce_avg: 0.05304804816842079, mse_avg: 0.0
2764
  [2026-01-26 20:19:55] (step=0002562) Train Loss mse: 0.0000, Train Loss ce: 0.0472, Train Steps/Sec: 1.02,
2765
  [2026-01-26 20:19:56] (step=0002563) Train Loss mse: 0.0000, Train Loss ce: 0.0490, Train Steps/Sec: 1.02,
2766
  [2026-01-26 20:19:57] (step=0002564) Train Loss mse: 0.0000, Train Loss ce: 0.0634, Train Steps/Sec: 1.02,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2767
  [2026-01-26 20:19:58] (step=0002565) Train Loss mse: 0.0000, Train Loss ce: 0.0394, Train Steps/Sec: 1.02,
2768
  [2026-01-26 20:19:59] (step=0002566) Train Loss mse: 0.0000, Train Loss ce: 0.0322, Train Steps/Sec: 1.01,
2769
  [2026-01-26 20:20:00] (step=0002567) Train Loss mse: 0.0000, Train Loss ce: 0.0585, Train Steps/Sec: 0.76,
@@ -2813,20 +2813,6 @@ ce_avg: 0.05304804816842079, mse_avg: 0.0
2813
  [2026-01-26 20:20:47] (step=0002611) Train Loss mse: 0.0000, Train Loss ce: 0.0525, Train Steps/Sec: 1.01,
2814
  [2026-01-26 20:20:48] (step=0002612) Train Loss mse: 0.0000, Train Loss ce: 0.0433, Train Steps/Sec: 0.99,
2815
  [2026-01-26 20:20:49] (step=0002613) Train Loss mse: 0.0000, Train Loss ce: 0.0471, Train Steps/Sec: 0.76,
2816
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step3000
2817
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
2818
- [eval debug] first 3 batch fingerprints:
2819
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2820
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2821
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2822
- ce_avg: 0.058707185089588165, mse_avg: 0.0
2823
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step3500
2824
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
2825
- [eval debug] first 3 batch fingerprints:
2826
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2827
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2828
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2829
- ce_avg: 0.10416685044765472, mse_avg: 0.0
2830
  [2026-01-26 20:20:50] (step=0002614) Train Loss mse: 0.0000, Train Loss ce: 0.0586, Train Steps/Sec: 1.01,
2831
  [2026-01-26 20:20:51] (step=0002615) Train Loss mse: 0.0000, Train Loss ce: 0.0357, Train Steps/Sec: 1.01,
2832
  [2026-01-26 20:20:52] (step=0002616) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 1.00,
@@ -3800,6 +3786,48 @@ ce_avg: 0.10416685044765472, mse_avg: 0.0
3800
  [2026-01-26 20:38:10] (step=0003584) Train Loss mse: 0.0000, Train Loss ce: 0.0436, Train Steps/Sec: 1.01,
3801
  [2026-01-26 20:38:11] (step=0003585) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 1.01,
3802
  [2026-01-26 20:38:12] (step=0003586) Train Loss mse: 0.0000, Train Loss ce: 0.0235, Train Steps/Sec: 0.81,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3803
  [2026-01-26 20:38:13] (step=0003587) Train Loss mse: 0.0000, Train Loss ce: 0.0596, Train Steps/Sec: 1.01,
3804
  [2026-01-26 20:38:14] (step=0003588) Train Loss mse: 0.0000, Train Loss ce: 0.0493, Train Steps/Sec: 1.01,
3805
  [2026-01-26 20:38:15] (step=0003589) Train Loss mse: 0.0000, Train Loss ce: 0.0348, Train Steps/Sec: 1.01,
@@ -3907,27 +3935,27 @@ ce_avg: 0.10416685044765472, mse_avg: 0.0
3907
  [2026-01-26 20:40:04] (step=0003691) Train Loss mse: 0.0000, Train Loss ce: 0.0398, Train Steps/Sec: 1.00,
3908
  [2026-01-26 20:40:05] (step=0003692) Train Loss mse: 0.0000, Train Loss ce: 0.0272, Train Steps/Sec: 1.02,
3909
  [2026-01-26 20:40:06] (step=0003693) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 1.02,
3910
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step4000
3911
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3912
- [eval debug] first 3 batch fingerprints:
3913
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3914
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3915
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3916
- ce_avg: 0.09295430034399033, mse_avg: 0.0
3917
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step4500
3918
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3919
- [eval debug] first 3 batch fingerprints:
3920
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3921
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3922
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3923
- ce_avg: 0.09719827771186829, mse_avg: 0.0
3924
- base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step5000
3925
- Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3926
- [eval debug] first 3 batch fingerprints:
3927
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3928
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3929
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3930
- ce_avg: 0.10010946542024612, mse_avg: 0.0
3931
  [2026-01-26 20:40:30] (step=0003715) Train Loss mse: 0.0000, Train Loss ce: 0.0235, Train Steps/Sec: 1.01,
3932
  [2026-01-26 20:40:31] (step=0003716) Train Loss mse: 0.0000, Train Loss ce: 0.0562, Train Steps/Sec: 1.01,
3933
  [2026-01-26 20:40:32] (step=0003717) Train Loss mse: 0.0000, Train Loss ce: 0.0530, Train Steps/Sec: 0.99,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
1082
  [2026-01-26 19:53:23] (step=0001071) Train Loss mse: 0.0000, Train Loss ce: 0.0635, Train Steps/Sec: 1.02,
1083
  [2026-01-26 19:53:24] (step=0001072) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 1.01,
1084
  [2026-01-26 19:53:25] (step=0001073) Train Loss mse: 0.0000, Train Loss ce: 0.0489, Train Steps/Sec: 1.01,
1085
+ FullyShardedDataParallel(
1086
+ (_fsdp_wrapped_module): Bagel(
1087
+ (language_model): Qwen2ForCausalLM(
1088
+ (model): Qwen2Model(
1089
+ (embed_tokens): Embedding(152064, 3584)
1090
+ (layers): ModuleList(
1091
+ (0-27): 28 x FullyShardedDataParallel(
1092
+ (_fsdp_wrapped_module): CheckpointWrapper(
1093
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
1094
+ (self_attn): PackedAttentionMoT(
1095
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
1096
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
1097
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
1098
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
1099
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
1100
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
1101
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1102
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1103
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
1104
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1105
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1106
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
1107
+ )
1108
+ (mlp): Qwen2MLP(
1109
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1110
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1111
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1112
+ (act_fn): SiLU()
1113
+ )
1114
+ (mlp_moe_gen): Qwen2MLP(
1115
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1116
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1117
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1118
+ (act_fn): SiLU()
1119
+ )
1120
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1121
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1122
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1123
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1124
+ )
1125
+ )
1126
+ )
1127
+ )
1128
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
1129
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1130
+ (rotary_emb): Qwen2RotaryEmbedding()
1131
+ )
1132
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
1133
+ )
1134
+ (vit_model): SiglipVisionModel(
1135
+ (vision_model): FullyShardedDataParallel(
1136
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
1137
+ (embeddings): SiglipVisionEmbeddings(
1138
+ (position_embedding): Embedding(4900, 1152)
1139
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
1140
+ )
1141
+ (encoder): SiglipEncoder(
1142
+ (layers): ModuleList(
1143
+ (0-25): 26 x FullyShardedDataParallel(
1144
+ (_fsdp_wrapped_module): CheckpointWrapper(
1145
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
1146
+ (self_attn): SiglipFlashAttention2(
1147
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
1148
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
1149
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
1150
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
1151
+ )
1152
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1153
+ (mlp): SiglipMLP(
1154
+ (activation_fn): PytorchGELUTanh()
1155
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
1156
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
1157
+ )
1158
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1159
+ )
1160
+ )
1161
+ )
1162
+ )
1163
+ )
1164
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1165
+ )
1166
+ )
1167
+ )
1168
+ (connector): FullyShardedDataParallel(
1169
+ (_fsdp_wrapped_module): CheckpointWrapper(
1170
+ (_checkpoint_wrapped_module): MLPconnector(
1171
+ (activation_fn): PytorchGELUTanh()
1172
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
1173
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
1174
+ )
1175
+ )
1176
+ )
1177
+ (vit_pos_embed): FullyShardedDataParallel(
1178
+ (_fsdp_wrapped_module): PositionEmbedding()
1179
+ )
1180
+ )
1181
+ )
1182
+ _flat_param True
1183
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1184
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1185
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1186
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1187
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1188
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1189
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1190
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1191
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1192
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1193
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1194
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1195
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1196
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1197
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1198
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1199
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1200
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1201
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1202
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1203
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1204
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1205
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1206
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1207
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1208
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1209
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1210
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1211
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
1212
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1213
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1214
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1215
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1216
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1217
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1218
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1219
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1220
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1221
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1222
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1223
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1224
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1225
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1226
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1227
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1228
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1229
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1230
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1231
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1232
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1233
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1234
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1235
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1236
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1237
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1238
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1239
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
1240
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse/vlm_gym_match_equation_sos_train
1241
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step0
1242
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
1243
+ [eval debug] first 3 batch fingerprints:
1244
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1245
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1246
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1247
+ ce_avg: 1.5381660461425781, mse_avg: 0.0
1248
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step500
1249
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
1250
+ [eval debug] first 3 batch fingerprints:
1251
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1252
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1253
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
1254
+ ce_avg: 0.06237730756402016, mse_avg: 0.0
1255
  [2026-01-26 19:53:26] (step=0001074) Train Loss mse: 0.0000, Train Loss ce: 0.0603, Train Steps/Sec: 1.02,
1256
  [2026-01-26 19:53:27] (step=0001075) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.76,
1257
  [2026-01-26 19:53:28] (step=0001076) Train Loss mse: 0.0000, Train Loss ce: 0.0544, Train Steps/Sec: 0.82,
 
1289
  [2026-01-26 19:54:02] (step=0001108) Train Loss mse: 0.0000, Train Loss ce: 0.0604, Train Steps/Sec: 1.01,
1290
  [2026-01-26 19:54:03] (step=0001109) Train Loss mse: 0.0000, Train Loss ce: 0.0570, Train Steps/Sec: 0.81,
1291
  [2026-01-26 19:54:04] (step=0001110) Train Loss mse: 0.0000, Train Loss ce: 0.0559, Train Steps/Sec: 1.01,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1292
  [2026-01-26 19:54:05] (step=0001111) Train Loss mse: 0.0000, Train Loss ce: 0.0425, Train Steps/Sec: 1.01,
1293
  [2026-01-26 19:54:06] (step=0001112) Train Loss mse: 0.0000, Train Loss ce: 0.0792, Train Steps/Sec: 1.01,
1294
  [2026-01-26 19:54:08] (step=0001113) Train Loss mse: 0.0000, Train Loss ce: 0.0488, Train Steps/Sec: 0.76,
 
2743
  [2026-01-26 20:19:55] (step=0002562) Train Loss mse: 0.0000, Train Loss ce: 0.0472, Train Steps/Sec: 1.02,
2744
  [2026-01-26 20:19:56] (step=0002563) Train Loss mse: 0.0000, Train Loss ce: 0.0490, Train Steps/Sec: 1.02,
2745
  [2026-01-26 20:19:57] (step=0002564) Train Loss mse: 0.0000, Train Loss ce: 0.0634, Train Steps/Sec: 1.02,
2746
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step1000
2747
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
2748
+ [eval debug] first 3 batch fingerprints:
2749
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2750
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2751
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2752
+ ce_avg: 0.05690578743815422, mse_avg: 0.0
2753
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step1500
2754
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
2755
+ [eval debug] first 3 batch fingerprints:
2756
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2757
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2758
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2759
+ ce_avg: 0.05330345034599304, mse_avg: 0.0
2760
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step2000
2761
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
2762
+ [eval debug] first 3 batch fingerprints:
2763
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2764
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2765
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
2766
+ ce_avg: 0.05127852410078049, mse_avg: 0.0
2767
  [2026-01-26 20:19:58] (step=0002565) Train Loss mse: 0.0000, Train Loss ce: 0.0394, Train Steps/Sec: 1.02,
2768
  [2026-01-26 20:19:59] (step=0002566) Train Loss mse: 0.0000, Train Loss ce: 0.0322, Train Steps/Sec: 1.01,
2769
  [2026-01-26 20:20:00] (step=0002567) Train Loss mse: 0.0000, Train Loss ce: 0.0585, Train Steps/Sec: 0.76,
 
2813
  [2026-01-26 20:20:47] (step=0002611) Train Loss mse: 0.0000, Train Loss ce: 0.0525, Train Steps/Sec: 1.01,
2814
  [2026-01-26 20:20:48] (step=0002612) Train Loss mse: 0.0000, Train Loss ce: 0.0433, Train Steps/Sec: 0.99,
2815
  [2026-01-26 20:20:49] (step=0002613) Train Loss mse: 0.0000, Train Loss ce: 0.0471, Train Steps/Sec: 0.76,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2816
  [2026-01-26 20:20:50] (step=0002614) Train Loss mse: 0.0000, Train Loss ce: 0.0586, Train Steps/Sec: 1.01,
2817
  [2026-01-26 20:20:51] (step=0002615) Train Loss mse: 0.0000, Train Loss ce: 0.0357, Train Steps/Sec: 1.01,
2818
  [2026-01-26 20:20:52] (step=0002616) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 1.00,
 
3786
  [2026-01-26 20:38:10] (step=0003584) Train Loss mse: 0.0000, Train Loss ce: 0.0436, Train Steps/Sec: 1.01,
3787
  [2026-01-26 20:38:11] (step=0003585) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 1.01,
3788
  [2026-01-26 20:38:12] (step=0003586) Train Loss mse: 0.0000, Train Loss ce: 0.0235, Train Steps/Sec: 0.81,
3789
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step2500
3790
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3791
+ [eval debug] first 3 batch fingerprints:
3792
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3793
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3794
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3795
+ ce_avg: 0.05304804816842079, mse_avg: 0.0
3796
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step3000
3797
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3798
+ [eval debug] first 3 batch fingerprints:
3799
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3800
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3801
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3802
+ ce_avg: 0.058707185089588165, mse_avg: 0.0
3803
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step3500
3804
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3805
+ [eval debug] first 3 batch fingerprints:
3806
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3807
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3808
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3809
+ ce_avg: 0.10416685044765472, mse_avg: 0.0
3810
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step4000
3811
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3812
+ [eval debug] first 3 batch fingerprints:
3813
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3814
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3815
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3816
+ ce_avg: 0.09295430034399033, mse_avg: 0.0
3817
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step4500
3818
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3819
+ [eval debug] first 3 batch fingerprints:
3820
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3821
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3822
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3823
+ ce_avg: 0.09719827771186829, mse_avg: 0.0
3824
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_no_mse_ins_step5000
3825
+ Preparing Dataset vlm_gym_match_equation_sos_celoss_no_mse_evalonce/vlm_gym_match_equation_sos_val
3826
+ [eval debug] first 3 batch fingerprints:
3827
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3828
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3829
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_no_mse_evalonce'}]
3830
+ ce_avg: 0.10010946542024612, mse_avg: 0.0
3831
  [2026-01-26 20:38:13] (step=0003587) Train Loss mse: 0.0000, Train Loss ce: 0.0596, Train Steps/Sec: 1.01,
3832
  [2026-01-26 20:38:14] (step=0003588) Train Loss mse: 0.0000, Train Loss ce: 0.0493, Train Steps/Sec: 1.01,
3833
  [2026-01-26 20:38:15] (step=0003589) Train Loss mse: 0.0000, Train Loss ce: 0.0348, Train Steps/Sec: 1.01,
 
3935
  [2026-01-26 20:40:04] (step=0003691) Train Loss mse: 0.0000, Train Loss ce: 0.0398, Train Steps/Sec: 1.00,
3936
  [2026-01-26 20:40:05] (step=0003692) Train Loss mse: 0.0000, Train Loss ce: 0.0272, Train Steps/Sec: 1.02,
3937
  [2026-01-26 20:40:06] (step=0003693) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 1.02,
3938
+ [2026-01-26 20:40:07] (step=0003694) Train Loss mse: 0.0000, Train Loss ce: 0.0528, Train Steps/Sec: 1.02,
3939
+ [2026-01-26 20:40:08] (step=0003695) Train Loss mse: 0.0000, Train Loss ce: 0.0489, Train Steps/Sec: 1.02,
3940
+ [2026-01-26 20:40:09] (step=0003696) Train Loss mse: 0.0000, Train Loss ce: 0.0267, Train Steps/Sec: 0.79,
3941
+ [2026-01-26 20:40:10] (step=0003697) Train Loss mse: 0.0000, Train Loss ce: 0.0365, Train Steps/Sec: 0.74,
3942
+ [2026-01-26 20:40:11] (step=0003698) Train Loss mse: 0.0000, Train Loss ce: 0.0334, Train Steps/Sec: 1.01,
3943
+ [2026-01-26 20:40:13] (step=0003699) Train Loss mse: 0.0000, Train Loss ce: 0.0528, Train Steps/Sec: 0.97,
3944
+ [2026-01-26 20:40:13] (step=0003700) Train Loss mse: 0.0000, Train Loss ce: 0.0260, Train Steps/Sec: 1.01,
3945
+ [2026-01-26 20:40:14] (step=0003701) Train Loss mse: 0.0000, Train Loss ce: 0.0514, Train Steps/Sec: 1.01,
3946
+ [2026-01-26 20:40:15] (step=0003702) Train Loss mse: 0.0000, Train Loss ce: 0.0590, Train Steps/Sec: 1.01,
3947
+ [2026-01-26 20:40:16] (step=0003703) Train Loss mse: 0.0000, Train Loss ce: 0.0275, Train Steps/Sec: 1.01,
3948
+ [2026-01-26 20:40:18] (step=0003704) Train Loss mse: 0.0000, Train Loss ce: 0.0255, Train Steps/Sec: 0.80,
3949
+ [2026-01-26 20:40:19] (step=0003705) Train Loss mse: 0.0000, Train Loss ce: 0.0308, Train Steps/Sec: 0.74,
3950
+ [2026-01-26 20:40:20] (step=0003706) Train Loss mse: 0.0000, Train Loss ce: 0.0407, Train Steps/Sec: 1.00,
3951
+ [2026-01-26 20:40:21] (step=0003707) Train Loss mse: 0.0000, Train Loss ce: 0.0359, Train Steps/Sec: 1.00,
3952
+ [2026-01-26 20:40:22] (step=0003708) Train Loss mse: 0.0000, Train Loss ce: 0.0362, Train Steps/Sec: 1.01,
3953
+ [2026-01-26 20:40:23] (step=0003709) Train Loss mse: 0.0000, Train Loss ce: 0.0446, Train Steps/Sec: 1.02,
3954
+ [2026-01-26 20:40:24] (step=0003710) Train Loss mse: 0.0000, Train Loss ce: 0.0333, Train Steps/Sec: 1.01,
3955
+ [2026-01-26 20:40:25] (step=0003711) Train Loss mse: 0.0000, Train Loss ce: 0.0461, Train Steps/Sec: 0.80,
3956
+ [2026-01-26 20:40:27] (step=0003712) Train Loss mse: 0.0000, Train Loss ce: 0.0473, Train Steps/Sec: 0.75,
3957
+ [2026-01-26 20:40:28] (step=0003713) Train Loss mse: 0.0000, Train Loss ce: 0.0338, Train Steps/Sec: 1.00,
3958
+ [2026-01-26 20:40:29] (step=0003714) Train Loss mse: 0.0000, Train Loss ce: 0.0298, Train Steps/Sec: 1.00,
3959
  [2026-01-26 20:40:30] (step=0003715) Train Loss mse: 0.0000, Train Loss ce: 0.0235, Train Steps/Sec: 1.01,
3960
  [2026-01-26 20:40:31] (step=0003716) Train Loss mse: 0.0000, Train Loss ce: 0.0562, Train Steps/Sec: 1.01,
3961
  [2026-01-26 20:40:32] (step=0003717) Train Loss mse: 0.0000, Train Loss ce: 0.0530, Train Steps/Sec: 0.99,