Junyi42 commited on
Commit
8c6e6d8
·
verified ·
1 Parent(s): f7209ad

Upload checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins

Browse files
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/wandb/offline-run-20260118_210409-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins-run0/files/output.log CHANGED
@@ -1145,6 +1145,9 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1145
  [2026-01-18 23:17:06] (step=0000964) Train Loss mse: 0.0000, Train Loss ce: 0.0772, Train Steps/Sec: 0.13,
1146
  [2026-01-18 23:17:14] (step=0000965) Train Loss mse: 0.0000, Train Loss ce: 0.0769, Train Steps/Sec: 0.13,
1147
  [2026-01-18 23:17:21] (step=0000966) Train Loss mse: 0.0000, Train Loss ce: 0.0778, Train Steps/Sec: 0.13,
 
 
 
1148
  base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step1000
1149
  Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
1150
  [eval debug] first 3 batch fingerprints:
@@ -1166,9 +1169,6 @@ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap
1166
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1167
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1168
  ce_avg: 0.1431104689836502, mse_avg: 0.0
1169
- [2026-01-18 23:17:29] (step=0000967) Train Loss mse: 0.0000, Train Loss ce: 0.0770, Train Steps/Sec: 0.13,
1170
- [2026-01-18 23:17:37] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0793, Train Steps/Sec: 0.13,
1171
- [2026-01-18 23:17:45] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0751, Train Steps/Sec: 0.12,
1172
  [2026-01-18 23:17:53] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0761, Train Steps/Sec: 0.13,
1173
  [2026-01-18 23:18:01] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0746, Train Steps/Sec: 0.13,
1174
  [2026-01-18 23:18:08] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0773, Train Steps/Sec: 0.13,
@@ -2472,20 +2472,6 @@ ce_avg: 0.1431104689836502, mse_avg: 0.0
2472
  [2026-01-19 02:08:28] (step=0002270) Train Loss mse: 0.0000, Train Loss ce: 0.0758, Train Steps/Sec: 0.13,
2473
  [2026-01-19 02:08:36] (step=0002271) Train Loss mse: 0.0000, Train Loss ce: 0.0735, Train Steps/Sec: 0.12,
2474
  [2026-01-19 02:08:44] (step=0002272) Train Loss mse: 0.0000, Train Loss ce: 0.0754, Train Steps/Sec: 0.13,
2475
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2500
2476
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2477
- [eval debug] first 3 batch fingerprints:
2478
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2479
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2480
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2481
- ce_avg: 0.15242451429367065, mse_avg: 0.0
2482
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3000
2483
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2484
- [eval debug] first 3 batch fingerprints:
2485
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2486
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2487
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2488
- ce_avg: 0.07420127838850021, mse_avg: 0.0
2489
  [2026-01-19 02:08:52] (step=0002273) Train Loss mse: 0.0000, Train Loss ce: 0.0747, Train Steps/Sec: 0.13,
2490
  [2026-01-19 02:09:00] (step=0002274) Train Loss mse: 0.0000, Train Loss ce: 0.0752, Train Steps/Sec: 0.13,
2491
  [2026-01-19 02:09:08] (step=0002275) Train Loss mse: 0.0000, Train Loss ce: 0.0736, Train Steps/Sec: 0.13,
@@ -2616,6 +2602,20 @@ ce_avg: 0.07420127838850021, mse_avg: 0.0
2616
  [2026-01-19 02:25:33] (step=0002400) Train Loss mse: 0.0000, Train Loss ce: 0.0746, Train Steps/Sec: 0.13,
2617
  [2026-01-19 02:25:41] (step=0002401) Train Loss mse: 0.0000, Train Loss ce: 0.0732, Train Steps/Sec: 0.13,
2618
  [2026-01-19 02:25:49] (step=0002402) Train Loss mse: 0.0000, Train Loss ce: 0.0755, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2619
  [2026-01-19 02:25:57] (step=0002403) Train Loss mse: 0.0000, Train Loss ce: 0.0755, Train Steps/Sec: 0.13,
2620
  [2026-01-19 02:26:05] (step=0002404) Train Loss mse: 0.0000, Train Loss ce: 0.0755, Train Steps/Sec: 0.13,
2621
  [2026-01-19 02:26:13] (step=0002405) Train Loss mse: 0.0000, Train Loss ce: 0.0739, Train Steps/Sec: 0.13,
 
1145
  [2026-01-18 23:17:06] (step=0000964) Train Loss mse: 0.0000, Train Loss ce: 0.0772, Train Steps/Sec: 0.13,
1146
  [2026-01-18 23:17:14] (step=0000965) Train Loss mse: 0.0000, Train Loss ce: 0.0769, Train Steps/Sec: 0.13,
1147
  [2026-01-18 23:17:21] (step=0000966) Train Loss mse: 0.0000, Train Loss ce: 0.0778, Train Steps/Sec: 0.13,
1148
+ [2026-01-18 23:17:29] (step=0000967) Train Loss mse: 0.0000, Train Loss ce: 0.0770, Train Steps/Sec: 0.13,
1149
+ [2026-01-18 23:17:37] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0793, Train Steps/Sec: 0.13,
1150
+ [2026-01-18 23:17:45] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0751, Train Steps/Sec: 0.12,
1151
  base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step1000
1152
  Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
1153
  [eval debug] first 3 batch fingerprints:
 
1169
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1170
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1171
  ce_avg: 0.1431104689836502, mse_avg: 0.0
 
 
 
1172
  [2026-01-18 23:17:53] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0761, Train Steps/Sec: 0.13,
1173
  [2026-01-18 23:18:01] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0746, Train Steps/Sec: 0.13,
1174
  [2026-01-18 23:18:08] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0773, Train Steps/Sec: 0.13,
 
2472
  [2026-01-19 02:08:28] (step=0002270) Train Loss mse: 0.0000, Train Loss ce: 0.0758, Train Steps/Sec: 0.13,
2473
  [2026-01-19 02:08:36] (step=0002271) Train Loss mse: 0.0000, Train Loss ce: 0.0735, Train Steps/Sec: 0.12,
2474
  [2026-01-19 02:08:44] (step=0002272) Train Loss mse: 0.0000, Train Loss ce: 0.0754, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2475
  [2026-01-19 02:08:52] (step=0002273) Train Loss mse: 0.0000, Train Loss ce: 0.0747, Train Steps/Sec: 0.13,
2476
  [2026-01-19 02:09:00] (step=0002274) Train Loss mse: 0.0000, Train Loss ce: 0.0752, Train Steps/Sec: 0.13,
2477
  [2026-01-19 02:09:08] (step=0002275) Train Loss mse: 0.0000, Train Loss ce: 0.0736, Train Steps/Sec: 0.13,
 
2602
  [2026-01-19 02:25:33] (step=0002400) Train Loss mse: 0.0000, Train Loss ce: 0.0746, Train Steps/Sec: 0.13,
2603
  [2026-01-19 02:25:41] (step=0002401) Train Loss mse: 0.0000, Train Loss ce: 0.0732, Train Steps/Sec: 0.13,
2604
  [2026-01-19 02:25:49] (step=0002402) Train Loss mse: 0.0000, Train Loss ce: 0.0755, Train Steps/Sec: 0.13,
2605
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2500
2606
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2607
+ [eval debug] first 3 batch fingerprints:
2608
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2609
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2610
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2611
+ ce_avg: 0.15242451429367065, mse_avg: 0.0
2612
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3000
2613
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2614
+ [eval debug] first 3 batch fingerprints:
2615
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2616
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2617
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2618
+ ce_avg: 0.07420127838850021, mse_avg: 0.0
2619
  [2026-01-19 02:25:57] (step=0002403) Train Loss mse: 0.0000, Train Loss ce: 0.0755, Train Steps/Sec: 0.13,
2620
  [2026-01-19 02:26:05] (step=0002404) Train Loss mse: 0.0000, Train Loss ce: 0.0755, Train Steps/Sec: 0.13,
2621
  [2026-01-19 02:26:13] (step=0002405) Train Loss mse: 0.0000, Train Loss ce: 0.0739, Train Steps/Sec: 0.13,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/wandb/offline-run-20260119_053756-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins-run0/files/output.log CHANGED
@@ -1213,6 +1213,20 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1213
  [2026-01-19 08:05:14] (step=0001025) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.12,
1214
  [2026-01-19 08:05:22] (step=0001026) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.12,
1215
  [2026-01-19 08:05:31] (step=0001027) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216
  [2026-01-19 08:05:39] (step=0001028) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.12,
1217
  [2026-01-19 08:05:47] (step=0001029) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.13,
1218
  [2026-01-19 08:05:55] (step=0001030) Train Loss mse: 0.0000, Train Loss ce: 0.0728, Train Steps/Sec: 0.12,
@@ -1229,20 +1243,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1229
  [2026-01-19 08:07:25] (step=0001041) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.12,
1230
  [2026-01-19 08:07:34] (step=0001042) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.12,
1231
  [2026-01-19 08:07:42] (step=0001043) Train Loss mse: 0.0000, Train Loss ce: 0.0700, Train Steps/Sec: 0.12,
1232
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step1500
1233
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
1234
- [eval debug] first 3 batch fingerprints:
1235
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1236
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1237
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1238
- ce_avg: 0.1267234981060028, mse_avg: 0.0
1239
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2000
1240
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
1241
- [eval debug] first 3 batch fingerprints:
1242
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1243
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1244
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1245
- ce_avg: 0.1393442451953888, mse_avg: 0.0
1246
  [2026-01-19 08:07:50] (step=0001044) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.12,
1247
  [2026-01-19 08:07:59] (step=0001045) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.12,
1248
  [2026-01-19 08:08:07] (step=0001046) Train Loss mse: 0.0000, Train Loss ce: 0.0718, Train Steps/Sec: 0.12,
@@ -2593,27 +2593,6 @@ ce_avg: 0.1393442451953888, mse_avg: 0.0
2593
  [2026-01-19 11:12:34] (step=0002391) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
2594
  [2026-01-19 11:12:43] (step=0002392) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
2595
  [2026-01-19 11:12:51] (step=0002393) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.12,
2596
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2500
2597
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2598
- [eval debug] first 3 batch fingerprints:
2599
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2600
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2601
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2602
- ce_avg: 0.14870576560497284, mse_avg: 0.0
2603
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3000
2604
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2605
- [eval debug] first 3 batch fingerprints:
2606
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2607
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2608
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2609
- ce_avg: 0.07034339010715485, mse_avg: 0.0
2610
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3500
2611
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2612
- [eval debug] first 3 batch fingerprints:
2613
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2614
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2615
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2616
- ce_avg: 0.07060375064611435, mse_avg: 0.0
2617
  [2026-01-19 11:12:59] (step=0002394) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
2618
  [2026-01-19 11:13:07] (step=0002395) Train Loss mse: 0.0000, Train Loss ce: 0.0696, Train Steps/Sec: 0.12,
2619
  [2026-01-19 11:13:16] (step=0002396) Train Loss mse: 0.0000, Train Loss ce: 0.0689, Train Steps/Sec: 0.12,
@@ -2664,6 +2643,27 @@ ce_avg: 0.07060375064611435, mse_avg: 0.0
2664
  [2026-01-19 11:19:26] (step=0002441) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.12,
2665
  [2026-01-19 11:19:35] (step=0002442) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.12,
2666
  [2026-01-19 11:19:43] (step=0002443) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2667
  [2026-01-19 11:19:51] (step=0002444) Train Loss mse: 0.0000, Train Loss ce: 0.0689, Train Steps/Sec: 0.12,
2668
  [2026-01-19 11:19:59] (step=0002445) Train Loss mse: 0.0000, Train Loss ce: 0.0693, Train Steps/Sec: 0.12,
2669
  [2026-01-19 11:20:08] (step=0002446) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.12,
@@ -3624,6 +3624,17 @@ ce_avg: 0.07060375064611435, mse_avg: 0.0
3624
  [2026-01-19 13:33:33] (step=0003398) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.12,
3625
  [2026-01-19 13:33:42] (step=0003399) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.11,
3626
  [2026-01-19 13:33:50] (step=0003400) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
3627
  base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step4000
3628
  Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
3629
  [eval debug] first 3 batch fingerprints:
@@ -3638,49 +3649,6 @@ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap
3638
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3639
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3640
  ce_avg: 0.07141611725091934, mse_avg: 0.0
3641
- [2026-01-19 13:33:58] (step=0003401) Train Loss mse: 0.0000, Train Loss ce: 0.0673, Train Steps/Sec: 0.12,
3642
- [2026-01-19 13:34:06] (step=0003402) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.12,
3643
- [2026-01-19 13:34:14] (step=0003403) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.12,
3644
- [2026-01-19 13:34:23] (step=0003404) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.12,
3645
- [2026-01-19 13:34:31] (step=0003405) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.12,
3646
- [2026-01-19 13:34:39] (step=0003406) Train Loss mse: 0.0000, Train Loss ce: 0.0702, Train Steps/Sec: 0.12,
3647
- [2026-01-19 13:34:47] (step=0003407) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.12,
3648
- [2026-01-19 13:34:55] (step=0003408) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.12,
3649
- [2026-01-19 13:35:03] (step=0003409) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.13,
3650
- [2026-01-19 13:35:12] (step=0003410) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.12,
3651
- [2026-01-19 13:35:20] (step=0003411) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.12,
3652
- [2026-01-19 13:35:28] (step=0003412) Train Loss mse: 0.0000, Train Loss ce: 0.0677, Train Steps/Sec: 0.12,
3653
- [2026-01-19 13:35:36] (step=0003413) Train Loss mse: 0.0000, Train Loss ce: 0.0674, Train Steps/Sec: 0.12,
3654
- [2026-01-19 13:35:45] (step=0003414) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.12,
3655
- [2026-01-19 13:35:53] (step=0003415) Train Loss mse: 0.0000, Train Loss ce: 0.0684, Train Steps/Sec: 0.12,
3656
- [2026-01-19 13:36:01] (step=0003416) Train Loss mse: 0.0000, Train Loss ce: 0.0669, Train Steps/Sec: 0.12,
3657
- [2026-01-19 13:36:10] (step=0003417) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
3658
- [2026-01-19 13:36:18] (step=0003418) Train Loss mse: 0.0000, Train Loss ce: 0.0684, Train Steps/Sec: 0.12,
3659
- [2026-01-19 13:36:26] (step=0003419) Train Loss mse: 0.0000, Train Loss ce: 0.0675, Train Steps/Sec: 0.12,
3660
- [2026-01-19 13:36:34] (step=0003420) Train Loss mse: 0.0000, Train Loss ce: 0.0674, Train Steps/Sec: 0.12,
3661
- [2026-01-19 13:36:43] (step=0003421) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.12,
3662
- [2026-01-19 13:36:51] (step=0003422) Train Loss mse: 0.0000, Train Loss ce: 0.0681, Train Steps/Sec: 0.12,
3663
- [2026-01-19 13:36:59] (step=0003423) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.12,
3664
- [2026-01-19 13:37:07] (step=0003424) Train Loss mse: 0.0000, Train Loss ce: 0.0695, Train Steps/Sec: 0.12,
3665
- [2026-01-19 13:37:15] (step=0003425) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.12,
3666
- [2026-01-19 13:37:23] (step=0003426) Train Loss mse: 0.0000, Train Loss ce: 0.0681, Train Steps/Sec: 0.12,
3667
- [2026-01-19 13:37:32] (step=0003427) Train Loss mse: 0.0000, Train Loss ce: 0.0674, Train Steps/Sec: 0.12,
3668
- [2026-01-19 13:37:40] (step=0003428) Train Loss mse: 0.0000, Train Loss ce: 0.0667, Train Steps/Sec: 0.12,
3669
- [2026-01-19 13:37:48] (step=0003429) Train Loss mse: 0.0000, Train Loss ce: 0.0689, Train Steps/Sec: 0.12,
3670
- [2026-01-19 13:37:56] (step=0003430) Train Loss mse: 0.0000, Train Loss ce: 0.0693, Train Steps/Sec: 0.13,
3671
- [2026-01-19 13:38:04] (step=0003431) Train Loss mse: 0.0000, Train Loss ce: 0.0684, Train Steps/Sec: 0.12,
3672
- [2026-01-19 13:38:13] (step=0003432) Train Loss mse: 0.0000, Train Loss ce: 0.0675, Train Steps/Sec: 0.12,
3673
- [2026-01-19 13:38:21] (step=0003433) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.12,
3674
- [2026-01-19 13:38:29] (step=0003434) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.12,
3675
- [2026-01-19 13:38:37] (step=0003435) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.12,
3676
- [2026-01-19 13:38:46] (step=0003436) Train Loss mse: 0.0000, Train Loss ce: 0.0670, Train Steps/Sec: 0.12,
3677
- [2026-01-19 13:38:54] (step=0003437) Train Loss mse: 0.0000, Train Loss ce: 0.0668, Train Steps/Sec: 0.12,
3678
- [2026-01-19 13:39:03] (step=0003438) Train Loss mse: 0.0000, Train Loss ce: 0.0673, Train Steps/Sec: 0.12,
3679
- [2026-01-19 13:39:11] (step=0003439) Train Loss mse: 0.0000, Train Loss ce: 0.0693, Train Steps/Sec: 0.12,
3680
- [2026-01-19 13:39:19] (step=0003440) Train Loss mse: 0.0000, Train Loss ce: 0.0674, Train Steps/Sec: 0.12,
3681
- [2026-01-19 13:39:27] (step=0003441) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.12,
3682
- [2026-01-19 13:39:35] (step=0003442) Train Loss mse: 0.0000, Train Loss ce: 0.0674, Train Steps/Sec: 0.12,
3683
- [2026-01-19 13:39:43] (step=0003443) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.12,
3684
  [2026-01-19 13:39:52] (step=0003444) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.12,
3685
  [2026-01-19 13:40:00] (step=0003445) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.12,
3686
  [2026-01-19 13:40:08] (step=0003446) Train Loss mse: 0.0000, Train Loss ce: 0.0681, Train Steps/Sec: 0.12,
@@ -4996,6 +4964,13 @@ ce_avg: 0.07141611725091934, mse_avg: 0.0
4996
  [2026-01-19 16:40:06] (step=0004756) Train Loss mse: 0.0000, Train Loss ce: 0.0654, Train Steps/Sec: 0.12,
4997
  [2026-01-19 16:40:14] (step=0004757) Train Loss mse: 0.0000, Train Loss ce: 0.0667, Train Steps/Sec: 0.12,
4998
  [2026-01-19 16:40:22] (step=0004758) Train Loss mse: 0.0000, Train Loss ce: 0.0655, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
4999
  [2026-01-19 16:40:31] (step=0004759) Train Loss mse: 0.0000, Train Loss ce: 0.0645, Train Steps/Sec: 0.12,
5000
  [2026-01-19 16:40:39] (step=0004760) Train Loss mse: 0.0000, Train Loss ce: 0.0656, Train Steps/Sec: 0.12,
5001
  [2026-01-19 16:40:47] (step=0004761) Train Loss mse: 0.0000, Train Loss ce: 0.0654, Train Steps/Sec: 0.12,
@@ -5018,13 +4993,6 @@ ce_avg: 0.07141611725091934, mse_avg: 0.0
5018
  [2026-01-19 16:43:06] (step=0004778) Train Loss mse: 0.0000, Train Loss ce: 0.0658, Train Steps/Sec: 0.12,
5019
  [2026-01-19 16:43:14] (step=0004779) Train Loss mse: 0.0000, Train Loss ce: 0.0661, Train Steps/Sec: 0.12,
5020
  [2026-01-19 16:43:22] (step=0004780) Train Loss mse: 0.0000, Train Loss ce: 0.0658, Train Steps/Sec: 0.12,
5021
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step5000
5022
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
5023
- [eval debug] first 3 batch fingerprints:
5024
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
5025
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
5026
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
5027
- ce_avg: 0.07195709645748138, mse_avg: 0.0
5028
  [2026-01-19 16:43:31] (step=0004781) Train Loss mse: 0.0000, Train Loss ce: 0.0664, Train Steps/Sec: 0.12,
5029
  [2026-01-19 16:43:39] (step=0004782) Train Loss mse: 0.0000, Train Loss ce: 0.0653, Train Steps/Sec: 0.12,
5030
  [2026-01-19 16:43:47] (step=0004783) Train Loss mse: 0.0000, Train Loss ce: 0.0657, Train Steps/Sec: 0.12,
 
1213
  [2026-01-19 08:05:14] (step=0001025) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.12,
1214
  [2026-01-19 08:05:22] (step=0001026) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.12,
1215
  [2026-01-19 08:05:31] (step=0001027) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.12,
1216
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step1500
1217
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
1218
+ [eval debug] first 3 batch fingerprints:
1219
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1220
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1221
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1222
+ ce_avg: 0.1267234981060028, mse_avg: 0.0
1223
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2000
1224
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
1225
+ [eval debug] first 3 batch fingerprints:
1226
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1227
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1228
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1229
+ ce_avg: 0.1393442451953888, mse_avg: 0.0
1230
  [2026-01-19 08:05:39] (step=0001028) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.12,
1231
  [2026-01-19 08:05:47] (step=0001029) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.13,
1232
  [2026-01-19 08:05:55] (step=0001030) Train Loss mse: 0.0000, Train Loss ce: 0.0728, Train Steps/Sec: 0.12,
 
1243
  [2026-01-19 08:07:25] (step=0001041) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.12,
1244
  [2026-01-19 08:07:34] (step=0001042) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.12,
1245
  [2026-01-19 08:07:42] (step=0001043) Train Loss mse: 0.0000, Train Loss ce: 0.0700, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1246
  [2026-01-19 08:07:50] (step=0001044) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.12,
1247
  [2026-01-19 08:07:59] (step=0001045) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.12,
1248
  [2026-01-19 08:08:07] (step=0001046) Train Loss mse: 0.0000, Train Loss ce: 0.0718, Train Steps/Sec: 0.12,
 
2593
  [2026-01-19 11:12:34] (step=0002391) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
2594
  [2026-01-19 11:12:43] (step=0002392) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
2595
  [2026-01-19 11:12:51] (step=0002393) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2596
  [2026-01-19 11:12:59] (step=0002394) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
2597
  [2026-01-19 11:13:07] (step=0002395) Train Loss mse: 0.0000, Train Loss ce: 0.0696, Train Steps/Sec: 0.12,
2598
  [2026-01-19 11:13:16] (step=0002396) Train Loss mse: 0.0000, Train Loss ce: 0.0689, Train Steps/Sec: 0.12,
 
2643
  [2026-01-19 11:19:26] (step=0002441) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.12,
2644
  [2026-01-19 11:19:35] (step=0002442) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.12,
2645
  [2026-01-19 11:19:43] (step=0002443) Train Loss mse: 0.0000, Train Loss ce: 0.0692, Train Steps/Sec: 0.12,
2646
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2500
2647
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2648
+ [eval debug] first 3 batch fingerprints:
2649
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2650
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2651
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2652
+ ce_avg: 0.14870576560497284, mse_avg: 0.0
2653
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3000
2654
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2655
+ [eval debug] first 3 batch fingerprints:
2656
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2657
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2658
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2659
+ ce_avg: 0.07034339010715485, mse_avg: 0.0
2660
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3500
2661
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2662
+ [eval debug] first 3 batch fingerprints:
2663
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2664
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2665
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2666
+ ce_avg: 0.07060375064611435, mse_avg: 0.0
2667
  [2026-01-19 11:19:51] (step=0002444) Train Loss mse: 0.0000, Train Loss ce: 0.0689, Train Steps/Sec: 0.12,
2668
  [2026-01-19 11:19:59] (step=0002445) Train Loss mse: 0.0000, Train Loss ce: 0.0693, Train Steps/Sec: 0.12,
2669
  [2026-01-19 11:20:08] (step=0002446) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.12,
 
3624
  [2026-01-19 13:33:33] (step=0003398) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.12,
3625
  [2026-01-19 13:33:42] (step=0003399) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.11,
3626
  [2026-01-19 13:33:50] (step=0003400) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.12,
3627
+ [2026-01-19 13:33:58] (step=0003401) Train Loss mse: 0.0000, Train Loss ce: 0.0673, Train Steps/Sec: 0.12,
3628
+ [2026-01-19 13:34:06] (step=0003402) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.12,
3629
+ [2026-01-19 13:34:14] (step=0003403) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.12,
3630
+ [2026-01-19 13:34:23] (step=0003404) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.12,
3631
+ [2026-01-19 13:34:31] (step=0003405) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.12,
3632
+ [2026-01-19 13:34:39] (step=0003406) Train Loss mse: 0.0000, Train Loss ce: 0.0702, Train Steps/Sec: 0.12,
3633
+ [2026-01-19 13:34:47] (step=0003407) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.12,
3634
+ [2026-01-19 13:34:55] (step=0003408) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.12,
3635
+ [2026-01-19 13:35:03] (step=0003409) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.13,
3636
+ [2026-01-19 13:35:12] (step=0003410) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.12,
3637
+ [2026-01-19 13:35:20] (step=0003411) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.12,
3638
  base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step4000
3639
  Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
3640
  [eval debug] first 3 batch fingerprints:
 
3649
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3650
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3651
  ce_avg: 0.07141611725091934, mse_avg: 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3652
  [2026-01-19 13:39:52] (step=0003444) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.12,
3653
  [2026-01-19 13:40:00] (step=0003445) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.12,
3654
  [2026-01-19 13:40:08] (step=0003446) Train Loss mse: 0.0000, Train Loss ce: 0.0681, Train Steps/Sec: 0.12,
 
4964
  [2026-01-19 16:40:06] (step=0004756) Train Loss mse: 0.0000, Train Loss ce: 0.0654, Train Steps/Sec: 0.12,
4965
  [2026-01-19 16:40:14] (step=0004757) Train Loss mse: 0.0000, Train Loss ce: 0.0667, Train Steps/Sec: 0.12,
4966
  [2026-01-19 16:40:22] (step=0004758) Train Loss mse: 0.0000, Train Loss ce: 0.0655, Train Steps/Sec: 0.12,
4967
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step5000
4968
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
4969
+ [eval debug] first 3 batch fingerprints:
4970
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4971
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4972
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4973
+ ce_avg: 0.07195709645748138, mse_avg: 0.0
4974
  [2026-01-19 16:40:31] (step=0004759) Train Loss mse: 0.0000, Train Loss ce: 0.0645, Train Steps/Sec: 0.12,
4975
  [2026-01-19 16:40:39] (step=0004760) Train Loss mse: 0.0000, Train Loss ce: 0.0656, Train Steps/Sec: 0.12,
4976
  [2026-01-19 16:40:47] (step=0004761) Train Loss mse: 0.0000, Train Loss ce: 0.0654, Train Steps/Sec: 0.12,
 
4993
  [2026-01-19 16:43:06] (step=0004778) Train Loss mse: 0.0000, Train Loss ce: 0.0658, Train Steps/Sec: 0.12,
4994
  [2026-01-19 16:43:14] (step=0004779) Train Loss mse: 0.0000, Train Loss ce: 0.0661, Train Steps/Sec: 0.12,
4995
  [2026-01-19 16:43:22] (step=0004780) Train Loss mse: 0.0000, Train Loss ce: 0.0658, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
4996
  [2026-01-19 16:43:31] (step=0004781) Train Loss mse: 0.0000, Train Loss ce: 0.0664, Train Steps/Sec: 0.12,
4997
  [2026-01-19 16:43:39] (step=0004782) Train Loss mse: 0.0000, Train Loss ce: 0.0653, Train Steps/Sec: 0.12,
4998
  [2026-01-19 16:43:47] (step=0004783) Train Loss mse: 0.0000, Train Loss ce: 0.0657, Train Steps/Sec: 0.12,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/wandb/offline-run-20260122_153153-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins-run0/files/output.log CHANGED
@@ -871,6 +871,129 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
871
  [2026-01-22 17:18:30] (step=0000860) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.15,
872
  [2026-01-22 17:18:37] (step=0000861) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
873
  [2026-01-22 17:18:44] (step=0000862) Train Loss mse: 0.0000, Train Loss ce: 0.0702, Train Steps/Sec: 0.15,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
874
  FullyShardedDataParallel(
875
  (_fsdp_wrapped_module): Bagel(
876
  (language_model): Qwen2ForCausalLM(
@@ -1055,129 +1178,6 @@ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap
1055
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1056
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1057
  ce_avg: 0.14597271382808685, mse_avg: 0.0
1058
- [2026-01-22 17:18:51] (step=0000863) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
1059
- [2026-01-22 17:18:58] (step=0000864) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
1060
- [2026-01-22 17:19:04] (step=0000865) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.15,
1061
- [2026-01-22 17:19:12] (step=0000866) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.13,
1062
- [2026-01-22 17:19:19] (step=0000867) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
1063
- [2026-01-22 17:19:25] (step=0000868) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.15,
1064
- [2026-01-22 17:19:32] (step=0000869) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
1065
- [2026-01-22 17:19:39] (step=0000870) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.15,
1066
- [2026-01-22 17:19:46] (step=0000871) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.15,
1067
- [2026-01-22 17:19:54] (step=0000872) Train Loss mse: 0.0000, Train Loss ce: 0.0704, Train Steps/Sec: 0.13,
1068
- [2026-01-22 17:20:01] (step=0000873) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.14,
1069
- [2026-01-22 17:20:07] (step=0000874) Train Loss mse: 0.0000, Train Loss ce: 0.0723, Train Steps/Sec: 0.15,
1070
- [2026-01-22 17:20:14] (step=0000875) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.15,
1071
- [2026-01-22 17:20:20] (step=0000876) Train Loss mse: 0.0000, Train Loss ce: 0.0735, Train Steps/Sec: 0.15,
1072
- [2026-01-22 17:20:27] (step=0000877) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.15,
1073
- [2026-01-22 17:20:34] (step=0000878) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
1074
- [2026-01-22 17:20:41] (step=0000879) Train Loss mse: 0.0000, Train Loss ce: 0.0729, Train Steps/Sec: 0.13,
1075
- [2026-01-22 17:20:49] (step=0000880) Train Loss mse: 0.0000, Train Loss ce: 0.0727, Train Steps/Sec: 0.13,
1076
- [2026-01-22 17:20:56] (step=0000881) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.14,
1077
- [2026-01-22 17:21:03] (step=0000882) Train Loss mse: 0.0000, Train Loss ce: 0.0722, Train Steps/Sec: 0.14,
1078
- [2026-01-22 17:21:10] (step=0000883) Train Loss mse: 0.0000, Train Loss ce: 0.0699, Train Steps/Sec: 0.14,
1079
- [2026-01-22 17:21:17] (step=0000884) Train Loss mse: 0.0000, Train Loss ce: 0.0729, Train Steps/Sec: 0.14,
1080
- [2026-01-22 17:21:24] (step=0000885) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.14,
1081
- [2026-01-22 17:21:31] (step=0000886) Train Loss mse: 0.0000, Train Loss ce: 0.0698, Train Steps/Sec: 0.15,
1082
- [2026-01-22 17:21:38] (step=0000887) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.15,
1083
- [2026-01-22 17:21:44] (step=0000888) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.15,
1084
- [2026-01-22 17:21:51] (step=0000889) Train Loss mse: 0.0000, Train Loss ce: 0.0704, Train Steps/Sec: 0.15,
1085
- [2026-01-22 17:21:58] (step=0000890) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.14,
1086
- [2026-01-22 17:22:06] (step=0000891) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.13,
1087
- [2026-01-22 17:22:12] (step=0000892) Train Loss mse: 0.0000, Train Loss ce: 0.0741, Train Steps/Sec: 0.15,
1088
- [2026-01-22 17:22:19] (step=0000893) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.15,
1089
- [2026-01-22 17:22:26] (step=0000894) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.14,
1090
- [2026-01-22 17:22:33] (step=0000895) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.15,
1091
- [2026-01-22 17:22:40] (step=0000896) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.13,
1092
- [2026-01-22 17:22:47] (step=0000897) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
1093
- [2026-01-22 17:22:54] (step=0000898) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
1094
- [2026-01-22 17:23:01] (step=0000899) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.15,
1095
- [2026-01-22 17:23:08] (step=0000900) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.15,
1096
- [2026-01-22 17:23:14] (step=0000901) Train Loss mse: 0.0000, Train Loss ce: 0.0731, Train Steps/Sec: 0.15,
1097
- [2026-01-22 17:23:22] (step=0000902) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.14,
1098
- [2026-01-22 17:23:29] (step=0000903) Train Loss mse: 0.0000, Train Loss ce: 0.0726, Train Steps/Sec: 0.14,
1099
- [2026-01-22 17:23:36] (step=0000904) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
1100
- [2026-01-22 17:23:42] (step=0000905) Train Loss mse: 0.0000, Train Loss ce: 0.0736, Train Steps/Sec: 0.15,
1101
- [2026-01-22 17:23:49] (step=0000906) Train Loss mse: 0.0000, Train Loss ce: 0.0708, Train Steps/Sec: 0.14,
1102
- [2026-01-22 17:23:57] (step=0000907) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
1103
- [2026-01-22 17:24:04] (step=0000908) Train Loss mse: 0.0000, Train Loss ce: 0.0736, Train Steps/Sec: 0.14,
1104
- [2026-01-22 17:24:11] (step=0000909) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.14,
1105
- [2026-01-22 17:24:17] (step=0000910) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
1106
- [2026-01-22 17:24:25] (step=0000911) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.13,
1107
- [2026-01-22 17:24:32] (step=0000912) Train Loss mse: 0.0000, Train Loss ce: 0.0745, Train Steps/Sec: 0.15,
1108
- [2026-01-22 17:24:38] (step=0000913) Train Loss mse: 0.0000, Train Loss ce: 0.0722, Train Steps/Sec: 0.15,
1109
- [2026-01-22 17:24:45] (step=0000914) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.14,
1110
- [2026-01-22 17:24:53] (step=0000915) Train Loss mse: 0.0000, Train Loss ce: 0.0704, Train Steps/Sec: 0.14,
1111
- [2026-01-22 17:24:59] (step=0000916) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.15,
1112
- [2026-01-22 17:25:06] (step=0000917) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
1113
- [2026-01-22 17:25:13] (step=0000918) Train Loss mse: 0.0000, Train Loss ce: 0.0727, Train Steps/Sec: 0.15,
1114
- [2026-01-22 17:25:20] (step=0000919) Train Loss mse: 0.0000, Train Loss ce: 0.0725, Train Steps/Sec: 0.14,
1115
- [2026-01-22 17:25:27] (step=0000920) Train Loss mse: 0.0000, Train Loss ce: 0.0723, Train Steps/Sec: 0.14,
1116
- [2026-01-22 17:25:35] (step=0000921) Train Loss mse: 0.0000, Train Loss ce: 0.0734, Train Steps/Sec: 0.14,
1117
- [2026-01-22 17:25:41] (step=0000922) Train Loss mse: 0.0000, Train Loss ce: 0.0729, Train Steps/Sec: 0.15,
1118
- [2026-01-22 17:25:48] (step=0000923) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.15,
1119
- [2026-01-22 17:25:55] (step=0000924) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.14,
1120
- [2026-01-22 17:26:02] (step=0000925) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.15,
1121
- [2026-01-22 17:26:09] (step=0000926) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.15,
1122
- [2026-01-22 17:26:16] (step=0000927) Train Loss mse: 0.0000, Train Loss ce: 0.0732, Train Steps/Sec: 0.14,
1123
- [2026-01-22 17:26:23] (step=0000928) Train Loss mse: 0.0000, Train Loss ce: 0.0733, Train Steps/Sec: 0.15,
1124
- [2026-01-22 17:26:29] (step=0000929) Train Loss mse: 0.0000, Train Loss ce: 0.0700, Train Steps/Sec: 0.15,
1125
- [2026-01-22 17:26:36] (step=0000930) Train Loss mse: 0.0000, Train Loss ce: 0.0698, Train Steps/Sec: 0.16,
1126
- [2026-01-22 17:26:43] (step=0000931) Train Loss mse: 0.0000, Train Loss ce: 0.0708, Train Steps/Sec: 0.15,
1127
- [2026-01-22 17:26:50] (step=0000932) Train Loss mse: 0.0000, Train Loss ce: 0.0699, Train Steps/Sec: 0.13,
1128
- [2026-01-22 17:26:58] (step=0000933) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.13,
1129
- [2026-01-22 17:27:05] (step=0000934) Train Loss mse: 0.0000, Train Loss ce: 0.0708, Train Steps/Sec: 0.14,
1130
- [2026-01-22 17:27:12] (step=0000935) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
1131
- [2026-01-22 17:27:19] (step=0000936) Train Loss mse: 0.0000, Train Loss ce: 0.0724, Train Steps/Sec: 0.13,
1132
- [2026-01-22 17:27:26] (step=0000937) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.15,
1133
- [2026-01-22 17:27:32] (step=0000938) Train Loss mse: 0.0000, Train Loss ce: 0.0724, Train Steps/Sec: 0.15,
1134
- [2026-01-22 17:27:40] (step=0000939) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.14,
1135
- [2026-01-22 17:27:46] (step=0000940) Train Loss mse: 0.0000, Train Loss ce: 0.0695, Train Steps/Sec: 0.15,
1136
- [2026-01-22 17:27:53] (step=0000941) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
1137
- [2026-01-22 17:28:00] (step=0000942) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
1138
- [2026-01-22 17:28:07] (step=0000943) Train Loss mse: 0.0000, Train Loss ce: 0.0744, Train Steps/Sec: 0.15,
1139
- [2026-01-22 17:28:14] (step=0000944) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
1140
- [2026-01-22 17:28:21] (step=0000945) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
1141
- [2026-01-22 17:28:29] (step=0000946) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.13,
1142
- [2026-01-22 17:28:35] (step=0000947) Train Loss mse: 0.0000, Train Loss ce: 0.0723, Train Steps/Sec: 0.15,
1143
- [2026-01-22 17:28:42] (step=0000948) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.15,
1144
- [2026-01-22 17:28:49] (step=0000949) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.14,
1145
- [2026-01-22 17:28:56] (step=0000950) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
1146
- [2026-01-22 17:29:03] (step=0000951) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.14,
1147
- [2026-01-22 17:29:11] (step=0000952) Train Loss mse: 0.0000, Train Loss ce: 0.0703, Train Steps/Sec: 0.13,
1148
- [2026-01-22 17:29:18] (step=0000953) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.13,
1149
- [2026-01-22 17:29:26] (step=0000954) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.14,
1150
- [2026-01-22 17:29:32] (step=0000955) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
1151
- [2026-01-22 17:29:39] (step=0000956) Train Loss mse: 0.0000, Train Loss ce: 0.0696, Train Steps/Sec: 0.15,
1152
- [2026-01-22 17:29:46] (step=0000957) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.14,
1153
- [2026-01-22 17:29:53] (step=0000958) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
1154
- [2026-01-22 17:30:01] (step=0000959) Train Loss mse: 0.0000, Train Loss ce: 0.0727, Train Steps/Sec: 0.13,
1155
- [2026-01-22 17:30:08] (step=0000960) Train Loss mse: 0.0000, Train Loss ce: 0.0722, Train Steps/Sec: 0.13,
1156
- [2026-01-22 17:30:15] (step=0000961) Train Loss mse: 0.0000, Train Loss ce: 0.0740, Train Steps/Sec: 0.15,
1157
- [2026-01-22 17:30:22] (step=0000962) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.14,
1158
- [2026-01-22 17:30:30] (step=0000963) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.13,
1159
- [2026-01-22 17:30:37] (step=0000964) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
1160
- [2026-01-22 17:30:44] (step=0000965) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.14,
1161
- [2026-01-22 17:30:51] (step=0000966) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.15,
1162
- [2026-01-22 17:30:57] (step=0000967) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.15,
1163
- [2026-01-22 17:31:04] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.15,
1164
- [2026-01-22 17:31:11] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
1165
- [2026-01-22 17:31:18] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.14,
1166
- [2026-01-22 17:31:25] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
1167
- [2026-01-22 17:31:32] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0696, Train Steps/Sec: 0.15,
1168
- [2026-01-22 17:31:39] (step=0000973) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
1169
- [2026-01-22 17:31:46] (step=0000974) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.14,
1170
- [2026-01-22 17:31:52] (step=0000975) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.15,
1171
- [2026-01-22 17:32:00] (step=0000976) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
1172
- [2026-01-22 17:32:07] (step=0000977) Train Loss mse: 0.0000, Train Loss ce: 0.0726, Train Steps/Sec: 0.14,
1173
- [2026-01-22 17:32:13] (step=0000978) Train Loss mse: 0.0000, Train Loss ce: 0.0716, Train Steps/Sec: 0.15,
1174
- [2026-01-22 17:32:20] (step=0000979) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.15,
1175
- [2026-01-22 17:32:27] (step=0000980) Train Loss mse: 0.0000, Train Loss ce: 0.0738, Train Steps/Sec: 0.14,
1176
- [2026-01-22 17:32:34] (step=0000981) Train Loss mse: 0.0000, Train Loss ce: 0.0728, Train Steps/Sec: 0.14,
1177
- [2026-01-22 17:32:42] (step=0000982) Train Loss mse: 0.0000, Train Loss ce: 0.0739, Train Steps/Sec: 0.14,
1178
- [2026-01-22 17:32:49] (step=0000983) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.14,
1179
- [2026-01-22 17:32:55] (step=0000984) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.15,
1180
- [2026-01-22 17:33:02] (step=0000985) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.14,
1181
  [2026-01-22 17:33:10] (step=0000986) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.14,
1182
  [2026-01-22 17:33:16] (step=0000987) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.16,
1183
  [2026-01-22 17:33:23] (step=0000988) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
@@ -2426,20 +2426,6 @@ ce_avg: 0.14597271382808685, mse_avg: 0.0
2426
  [2026-01-22 19:57:55] (step=0002231) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.14,
2427
  [2026-01-22 19:58:01] (step=0002232) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.15,
2428
  [2026-01-22 19:58:09] (step=0002233) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.13,
2429
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2500
2430
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2431
- [eval debug] first 3 batch fingerprints:
2432
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2433
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2434
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2435
- ce_avg: 0.1571701020002365, mse_avg: 0.0
2436
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3000
2437
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2438
- [eval debug] first 3 batch fingerprints:
2439
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2440
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2441
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2442
- ce_avg: 0.0698586106300354, mse_avg: 0.0
2443
  [2026-01-22 19:58:15] (step=0002234) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.15,
2444
  [2026-01-22 19:58:23] (step=0002235) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.14,
2445
  [2026-01-22 19:58:29] (step=0002236) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.15,
@@ -2567,6 +2553,27 @@ ce_avg: 0.0698586106300354, mse_avg: 0.0
2567
  [2026-01-22 20:12:43] (step=0002358) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.14,
2568
  [2026-01-22 20:12:50] (step=0002359) Train Loss mse: 0.0000, Train Loss ce: 0.0684, Train Steps/Sec: 0.14,
2569
  [2026-01-22 20:12:57] (step=0002360) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.14,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2570
  [2026-01-22 20:13:03] (step=0002361) Train Loss mse: 0.0000, Train Loss ce: 0.0687, Train Steps/Sec: 0.15,
2571
  [2026-01-22 20:13:10] (step=0002362) Train Loss mse: 0.0000, Train Loss ce: 0.0672, Train Steps/Sec: 0.15,
2572
  [2026-01-22 20:13:17] (step=0002363) Train Loss mse: 0.0000, Train Loss ce: 0.0698, Train Steps/Sec: 0.14,
@@ -3521,6 +3528,20 @@ ce_avg: 0.0698586106300354, mse_avg: 0.0
3521
  [2026-01-22 22:06:04] (step=0003309) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.15,
3522
  [2026-01-22 22:06:11] (step=0003310) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.14,
3523
  [2026-01-22 22:06:18] (step=0003311) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.14,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3524
  [2026-01-22 22:06:26] (step=0003312) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.13,
3525
  [2026-01-22 22:06:33] (step=0003313) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.15,
3526
  [2026-01-22 22:06:39] (step=0003314) Train Loss mse: 0.0000, Train Loss ce: 0.0677, Train Steps/Sec: 0.14,
@@ -3597,27 +3618,6 @@ ce_avg: 0.0698586106300354, mse_avg: 0.0
3597
  [2026-01-22 22:14:53] (step=0003385) Train Loss mse: 0.0000, Train Loss ce: 0.0689, Train Steps/Sec: 0.15,
3598
  [2026-01-22 22:15:00] (step=0003386) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.15,
3599
  [2026-01-22 22:15:07] (step=0003387) Train Loss mse: 0.0000, Train Loss ce: 0.0687, Train Steps/Sec: 0.15,
3600
- [
3601
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3500
3602
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
3603
- [eval debug] first 3 batch fingerprints:
3604
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3605
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3606
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3607
- ce_avg: 0.07012835144996643, mse_avg: 0.0
3608
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step4000
3609
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
3610
- [eval debug] first 3 batch fingerprints:
3611
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3612
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3613
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3614
- ce_avg: 0.06947071850299835, mse_avg: 0.0
3615
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step4500
3616
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
3617
- [eval debug] first 3 batch fingerprints:
3618
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3619
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3620
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3621
  [2026-01-22 22:15:14] (step=0003388) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.13,
3622
  [2026-01-22 22:15:22] (step=0003389) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.13,
3623
  [2026-01-22 22:15:29] (step=0003390) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.14,
@@ -4930,13 +4930,6 @@ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap
4930
  [2026-01-23 00:48:12] (step=0004697) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.13,
4931
  [2026-01-23 00:48:19] (step=0004698) Train Loss mse: 0.0000, Train Loss ce: 0.0700, Train Steps/Sec: 0.15,
4932
  [2026-01-23 00:48:25] (step=0004699) Train Loss mse: 0.0000, Train Loss ce: 0.0676, Train Steps/Sec: 0.15,
4933
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step5000
4934
- Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
4935
- [eval debug] first 3 batch fingerprints:
4936
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4937
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4938
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4939
- ce_avg: 0.07005834579467773, mse_avg: 0.0
4940
  [2026-01-23 00:48:32] (step=0004700) Train Loss mse: 0.0000, Train Loss ce: 0.0675, Train Steps/Sec: 0.14,
4941
  [2026-01-23 00:48:40] (step=0004701) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.14,
4942
  [2026-01-23 00:48:47] (step=0004702) Train Loss mse: 0.0000, Train Loss ce: 0.0690, Train Steps/Sec: 0.14,
@@ -4965,6 +4958,13 @@ ce_avg: 0.07005834579467773, mse_avg: 0.0
4965
  [2026-01-23 00:51:29] (step=0004725) Train Loss mse: 0.0000, Train Loss ce: 0.0672, Train Steps/Sec: 0.14,
4966
  [2026-01-23 00:51:37] (step=0004726) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.14,
4967
  [2026-01-23 00:51:44] (step=0004727) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.14,
 
 
 
 
 
 
 
4968
  [2026-01-23 00:51:51] (step=0004728) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.14,
4969
  [2026-01-23 00:51:57] (step=0004729) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.15,
4970
  [2026-01-23 00:52:04] (step=0004730) Train Loss mse: 0.0000, Train Loss ce: 0.0677, Train Steps/Sec: 0.14,
 
871
  [2026-01-22 17:18:30] (step=0000860) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.15,
872
  [2026-01-22 17:18:37] (step=0000861) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
873
  [2026-01-22 17:18:44] (step=0000862) Train Loss mse: 0.0000, Train Loss ce: 0.0702, Train Steps/Sec: 0.15,
874
+ [2026-01-22 17:18:51] (step=0000863) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
875
+ [2026-01-22 17:18:58] (step=0000864) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
876
+ [2026-01-22 17:19:04] (step=0000865) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.15,
877
+ [2026-01-22 17:19:12] (step=0000866) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.13,
878
+ [2026-01-22 17:19:19] (step=0000867) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
879
+ [2026-01-22 17:19:25] (step=0000868) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.15,
880
+ [2026-01-22 17:19:32] (step=0000869) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
881
+ [2026-01-22 17:19:39] (step=0000870) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.15,
882
+ [2026-01-22 17:19:46] (step=0000871) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.15,
883
+ [2026-01-22 17:19:54] (step=0000872) Train Loss mse: 0.0000, Train Loss ce: 0.0704, Train Steps/Sec: 0.13,
884
+ [2026-01-22 17:20:01] (step=0000873) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.14,
885
+ [2026-01-22 17:20:07] (step=0000874) Train Loss mse: 0.0000, Train Loss ce: 0.0723, Train Steps/Sec: 0.15,
886
+ [2026-01-22 17:20:14] (step=0000875) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.15,
887
+ [2026-01-22 17:20:20] (step=0000876) Train Loss mse: 0.0000, Train Loss ce: 0.0735, Train Steps/Sec: 0.15,
888
+ [2026-01-22 17:20:27] (step=0000877) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.15,
889
+ [2026-01-22 17:20:34] (step=0000878) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
890
+ [2026-01-22 17:20:41] (step=0000879) Train Loss mse: 0.0000, Train Loss ce: 0.0729, Train Steps/Sec: 0.13,
891
+ [2026-01-22 17:20:49] (step=0000880) Train Loss mse: 0.0000, Train Loss ce: 0.0727, Train Steps/Sec: 0.13,
892
+ [2026-01-22 17:20:56] (step=0000881) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.14,
893
+ [2026-01-22 17:21:03] (step=0000882) Train Loss mse: 0.0000, Train Loss ce: 0.0722, Train Steps/Sec: 0.14,
894
+ [2026-01-22 17:21:10] (step=0000883) Train Loss mse: 0.0000, Train Loss ce: 0.0699, Train Steps/Sec: 0.14,
895
+ [2026-01-22 17:21:17] (step=0000884) Train Loss mse: 0.0000, Train Loss ce: 0.0729, Train Steps/Sec: 0.14,
896
+ [2026-01-22 17:21:24] (step=0000885) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.14,
897
+ [2026-01-22 17:21:31] (step=0000886) Train Loss mse: 0.0000, Train Loss ce: 0.0698, Train Steps/Sec: 0.15,
898
+ [2026-01-22 17:21:38] (step=0000887) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.15,
899
+ [2026-01-22 17:21:44] (step=0000888) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.15,
900
+ [2026-01-22 17:21:51] (step=0000889) Train Loss mse: 0.0000, Train Loss ce: 0.0704, Train Steps/Sec: 0.15,
901
+ [2026-01-22 17:21:58] (step=0000890) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.14,
902
+ [2026-01-22 17:22:06] (step=0000891) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.13,
903
+ [2026-01-22 17:22:12] (step=0000892) Train Loss mse: 0.0000, Train Loss ce: 0.0741, Train Steps/Sec: 0.15,
904
+ [2026-01-22 17:22:19] (step=0000893) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.15,
905
+ [2026-01-22 17:22:26] (step=0000894) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.14,
906
+ [2026-01-22 17:22:33] (step=0000895) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.15,
907
+ [2026-01-22 17:22:40] (step=0000896) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.13,
908
+ [2026-01-22 17:22:47] (step=0000897) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
909
+ [2026-01-22 17:22:54] (step=0000898) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
910
+ [2026-01-22 17:23:01] (step=0000899) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.15,
911
+ [2026-01-22 17:23:08] (step=0000900) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.15,
912
+ [2026-01-22 17:23:14] (step=0000901) Train Loss mse: 0.0000, Train Loss ce: 0.0731, Train Steps/Sec: 0.15,
913
+ [2026-01-22 17:23:22] (step=0000902) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.14,
914
+ [2026-01-22 17:23:29] (step=0000903) Train Loss mse: 0.0000, Train Loss ce: 0.0726, Train Steps/Sec: 0.14,
915
+ [2026-01-22 17:23:36] (step=0000904) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
916
+ [2026-01-22 17:23:42] (step=0000905) Train Loss mse: 0.0000, Train Loss ce: 0.0736, Train Steps/Sec: 0.15,
917
+ [2026-01-22 17:23:49] (step=0000906) Train Loss mse: 0.0000, Train Loss ce: 0.0708, Train Steps/Sec: 0.14,
918
+ [2026-01-22 17:23:57] (step=0000907) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
919
+ [2026-01-22 17:24:04] (step=0000908) Train Loss mse: 0.0000, Train Loss ce: 0.0736, Train Steps/Sec: 0.14,
920
+ [2026-01-22 17:24:11] (step=0000909) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.14,
921
+ [2026-01-22 17:24:17] (step=0000910) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
922
+ [2026-01-22 17:24:25] (step=0000911) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.13,
923
+ [2026-01-22 17:24:32] (step=0000912) Train Loss mse: 0.0000, Train Loss ce: 0.0745, Train Steps/Sec: 0.15,
924
+ [2026-01-22 17:24:38] (step=0000913) Train Loss mse: 0.0000, Train Loss ce: 0.0722, Train Steps/Sec: 0.15,
925
+ [2026-01-22 17:24:45] (step=0000914) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.14,
926
+ [2026-01-22 17:24:53] (step=0000915) Train Loss mse: 0.0000, Train Loss ce: 0.0704, Train Steps/Sec: 0.14,
927
+ [2026-01-22 17:24:59] (step=0000916) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.15,
928
+ [2026-01-22 17:25:06] (step=0000917) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
929
+ [2026-01-22 17:25:13] (step=0000918) Train Loss mse: 0.0000, Train Loss ce: 0.0727, Train Steps/Sec: 0.15,
930
+ [2026-01-22 17:25:20] (step=0000919) Train Loss mse: 0.0000, Train Loss ce: 0.0725, Train Steps/Sec: 0.14,
931
+ [2026-01-22 17:25:27] (step=0000920) Train Loss mse: 0.0000, Train Loss ce: 0.0723, Train Steps/Sec: 0.14,
932
+ [2026-01-22 17:25:35] (step=0000921) Train Loss mse: 0.0000, Train Loss ce: 0.0734, Train Steps/Sec: 0.14,
933
+ [2026-01-22 17:25:41] (step=0000922) Train Loss mse: 0.0000, Train Loss ce: 0.0729, Train Steps/Sec: 0.15,
934
+ [2026-01-22 17:25:48] (step=0000923) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.15,
935
+ [2026-01-22 17:25:55] (step=0000924) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.14,
936
+ [2026-01-22 17:26:02] (step=0000925) Train Loss mse: 0.0000, Train Loss ce: 0.0730, Train Steps/Sec: 0.15,
937
+ [2026-01-22 17:26:09] (step=0000926) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.15,
938
+ [2026-01-22 17:26:16] (step=0000927) Train Loss mse: 0.0000, Train Loss ce: 0.0732, Train Steps/Sec: 0.14,
939
+ [2026-01-22 17:26:23] (step=0000928) Train Loss mse: 0.0000, Train Loss ce: 0.0733, Train Steps/Sec: 0.15,
940
+ [2026-01-22 17:26:29] (step=0000929) Train Loss mse: 0.0000, Train Loss ce: 0.0700, Train Steps/Sec: 0.15,
941
+ [2026-01-22 17:26:36] (step=0000930) Train Loss mse: 0.0000, Train Loss ce: 0.0698, Train Steps/Sec: 0.16,
942
+ [2026-01-22 17:26:43] (step=0000931) Train Loss mse: 0.0000, Train Loss ce: 0.0708, Train Steps/Sec: 0.15,
943
+ [2026-01-22 17:26:50] (step=0000932) Train Loss mse: 0.0000, Train Loss ce: 0.0699, Train Steps/Sec: 0.13,
944
+ [2026-01-22 17:26:58] (step=0000933) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.13,
945
+ [2026-01-22 17:27:05] (step=0000934) Train Loss mse: 0.0000, Train Loss ce: 0.0708, Train Steps/Sec: 0.14,
946
+ [2026-01-22 17:27:12] (step=0000935) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
947
+ [2026-01-22 17:27:19] (step=0000936) Train Loss mse: 0.0000, Train Loss ce: 0.0724, Train Steps/Sec: 0.13,
948
+ [2026-01-22 17:27:26] (step=0000937) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.15,
949
+ [2026-01-22 17:27:32] (step=0000938) Train Loss mse: 0.0000, Train Loss ce: 0.0724, Train Steps/Sec: 0.15,
950
+ [2026-01-22 17:27:40] (step=0000939) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.14,
951
+ [2026-01-22 17:27:46] (step=0000940) Train Loss mse: 0.0000, Train Loss ce: 0.0695, Train Steps/Sec: 0.15,
952
+ [2026-01-22 17:27:53] (step=0000941) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
953
+ [2026-01-22 17:28:00] (step=0000942) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
954
+ [2026-01-22 17:28:07] (step=0000943) Train Loss mse: 0.0000, Train Loss ce: 0.0744, Train Steps/Sec: 0.15,
955
+ [2026-01-22 17:28:14] (step=0000944) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
956
+ [2026-01-22 17:28:21] (step=0000945) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
957
+ [2026-01-22 17:28:29] (step=0000946) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.13,
958
+ [2026-01-22 17:28:35] (step=0000947) Train Loss mse: 0.0000, Train Loss ce: 0.0723, Train Steps/Sec: 0.15,
959
+ [2026-01-22 17:28:42] (step=0000948) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.15,
960
+ [2026-01-22 17:28:49] (step=0000949) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.14,
961
+ [2026-01-22 17:28:56] (step=0000950) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
962
+ [2026-01-22 17:29:03] (step=0000951) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.14,
963
+ [2026-01-22 17:29:11] (step=0000952) Train Loss mse: 0.0000, Train Loss ce: 0.0703, Train Steps/Sec: 0.13,
964
+ [2026-01-22 17:29:18] (step=0000953) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.13,
965
+ [2026-01-22 17:29:26] (step=0000954) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.14,
966
+ [2026-01-22 17:29:32] (step=0000955) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.15,
967
+ [2026-01-22 17:29:39] (step=0000956) Train Loss mse: 0.0000, Train Loss ce: 0.0696, Train Steps/Sec: 0.15,
968
+ [2026-01-22 17:29:46] (step=0000957) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.14,
969
+ [2026-01-22 17:29:53] (step=0000958) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
970
+ [2026-01-22 17:30:01] (step=0000959) Train Loss mse: 0.0000, Train Loss ce: 0.0727, Train Steps/Sec: 0.13,
971
+ [2026-01-22 17:30:08] (step=0000960) Train Loss mse: 0.0000, Train Loss ce: 0.0722, Train Steps/Sec: 0.13,
972
+ [2026-01-22 17:30:15] (step=0000961) Train Loss mse: 0.0000, Train Loss ce: 0.0740, Train Steps/Sec: 0.15,
973
+ [2026-01-22 17:30:22] (step=0000962) Train Loss mse: 0.0000, Train Loss ce: 0.0714, Train Steps/Sec: 0.14,
974
+ [2026-01-22 17:30:30] (step=0000963) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.13,
975
+ [2026-01-22 17:30:37] (step=0000964) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
976
+ [2026-01-22 17:30:44] (step=0000965) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.14,
977
+ [2026-01-22 17:30:51] (step=0000966) Train Loss mse: 0.0000, Train Loss ce: 0.0707, Train Steps/Sec: 0.15,
978
+ [2026-01-22 17:30:57] (step=0000967) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.15,
979
+ [2026-01-22 17:31:04] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.15,
980
+ [2026-01-22 17:31:11] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0712, Train Steps/Sec: 0.14,
981
+ [2026-01-22 17:31:18] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0717, Train Steps/Sec: 0.14,
982
+ [2026-01-22 17:31:25] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.15,
983
+ [2026-01-22 17:31:32] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0696, Train Steps/Sec: 0.15,
984
+ [2026-01-22 17:31:39] (step=0000973) Train Loss mse: 0.0000, Train Loss ce: 0.0715, Train Steps/Sec: 0.14,
985
+ [2026-01-22 17:31:46] (step=0000974) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.14,
986
+ [2026-01-22 17:31:52] (step=0000975) Train Loss mse: 0.0000, Train Loss ce: 0.0721, Train Steps/Sec: 0.15,
987
+ [2026-01-22 17:32:00] (step=0000976) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.14,
988
+ [2026-01-22 17:32:07] (step=0000977) Train Loss mse: 0.0000, Train Loss ce: 0.0726, Train Steps/Sec: 0.14,
989
+ [2026-01-22 17:32:13] (step=0000978) Train Loss mse: 0.0000, Train Loss ce: 0.0716, Train Steps/Sec: 0.15,
990
+ [2026-01-22 17:32:20] (step=0000979) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.15,
991
+ [2026-01-22 17:32:27] (step=0000980) Train Loss mse: 0.0000, Train Loss ce: 0.0738, Train Steps/Sec: 0.14,
992
+ [2026-01-22 17:32:34] (step=0000981) Train Loss mse: 0.0000, Train Loss ce: 0.0728, Train Steps/Sec: 0.14,
993
+ [2026-01-22 17:32:42] (step=0000982) Train Loss mse: 0.0000, Train Loss ce: 0.0739, Train Steps/Sec: 0.14,
994
+ [2026-01-22 17:32:49] (step=0000983) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.14,
995
+ [2026-01-22 17:32:55] (step=0000984) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.15,
996
+ [2026-01-22 17:33:02] (step=0000985) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.14,
997
  FullyShardedDataParallel(
998
  (_fsdp_wrapped_module): Bagel(
999
  (language_model): Qwen2ForCausalLM(
 
1178
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1179
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
1180
  ce_avg: 0.14597271382808685, mse_avg: 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1181
  [2026-01-22 17:33:10] (step=0000986) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.14,
1182
  [2026-01-22 17:33:16] (step=0000987) Train Loss mse: 0.0000, Train Loss ce: 0.0720, Train Steps/Sec: 0.16,
1183
  [2026-01-22 17:33:23] (step=0000988) Train Loss mse: 0.0000, Train Loss ce: 0.0711, Train Steps/Sec: 0.15,
 
2426
  [2026-01-22 19:57:55] (step=0002231) Train Loss mse: 0.0000, Train Loss ce: 0.0706, Train Steps/Sec: 0.14,
2427
  [2026-01-22 19:58:01] (step=0002232) Train Loss mse: 0.0000, Train Loss ce: 0.0705, Train Steps/Sec: 0.15,
2428
  [2026-01-22 19:58:09] (step=0002233) Train Loss mse: 0.0000, Train Loss ce: 0.0713, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2429
  [2026-01-22 19:58:15] (step=0002234) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.15,
2430
  [2026-01-22 19:58:23] (step=0002235) Train Loss mse: 0.0000, Train Loss ce: 0.0701, Train Steps/Sec: 0.14,
2431
  [2026-01-22 19:58:29] (step=0002236) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.15,
 
2553
  [2026-01-22 20:12:43] (step=0002358) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.14,
2554
  [2026-01-22 20:12:50] (step=0002359) Train Loss mse: 0.0000, Train Loss ce: 0.0684, Train Steps/Sec: 0.14,
2555
  [2026-01-22 20:12:57] (step=0002360) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.14,
2556
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step2500
2557
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2558
+ [eval debug] first 3 batch fingerprints:
2559
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2560
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2561
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2562
+ ce_avg: 0.1571701020002365, mse_avg: 0.0
2563
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3000
2564
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2565
+ [eval debug] first 3 batch fingerprints:
2566
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2567
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2568
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2569
+ ce_avg: 0.0698586106300354, mse_avg: 0.0
2570
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step3500
2571
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
2572
+ [eval debug] first 3 batch fingerprints:
2573
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2574
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2575
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
2576
+ ce_avg: 0.07012835144996643, mse_avg: 0.0
2577
  [2026-01-22 20:13:03] (step=0002361) Train Loss mse: 0.0000, Train Loss ce: 0.0687, Train Steps/Sec: 0.15,
2578
  [2026-01-22 20:13:10] (step=0002362) Train Loss mse: 0.0000, Train Loss ce: 0.0672, Train Steps/Sec: 0.15,
2579
  [2026-01-22 20:13:17] (step=0002363) Train Loss mse: 0.0000, Train Loss ce: 0.0698, Train Steps/Sec: 0.14,
 
3528
  [2026-01-22 22:06:04] (step=0003309) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.15,
3529
  [2026-01-22 22:06:11] (step=0003310) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.14,
3530
  [2026-01-22 22:06:18] (step=0003311) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.14,
3531
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step4000
3532
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
3533
+ [eval debug] first 3 batch fingerprints:
3534
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3535
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3536
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3537
+ ce_avg: 0.06947071850299835, mse_avg: 0.0
3538
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step4500
3539
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
3540
+ [eval debug] first 3 batch fingerprints:
3541
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3542
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3543
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
3544
+ ce_avg: 0.069743812084198, mse_avg: 0.0
3545
  [2026-01-22 22:06:26] (step=0003312) Train Loss mse: 0.0000, Train Loss ce: 0.0709, Train Steps/Sec: 0.13,
3546
  [2026-01-22 22:06:33] (step=0003313) Train Loss mse: 0.0000, Train Loss ce: 0.0682, Train Steps/Sec: 0.15,
3547
  [2026-01-22 22:06:39] (step=0003314) Train Loss mse: 0.0000, Train Loss ce: 0.0677, Train Steps/Sec: 0.14,
 
3618
  [2026-01-22 22:14:53] (step=0003385) Train Loss mse: 0.0000, Train Loss ce: 0.0689, Train Steps/Sec: 0.15,
3619
  [2026-01-22 22:15:00] (step=0003386) Train Loss mse: 0.0000, Train Loss ce: 0.0678, Train Steps/Sec: 0.15,
3620
  [2026-01-22 22:15:07] (step=0003387) Train Loss mse: 0.0000, Train Loss ce: 0.0687, Train Steps/Sec: 0.15,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3621
  [2026-01-22 22:15:14] (step=0003388) Train Loss mse: 0.0000, Train Loss ce: 0.0697, Train Steps/Sec: 0.13,
3622
  [2026-01-22 22:15:22] (step=0003389) Train Loss mse: 0.0000, Train Loss ce: 0.0691, Train Steps/Sec: 0.13,
3623
  [2026-01-22 22:15:29] (step=0003390) Train Loss mse: 0.0000, Train Loss ce: 0.0710, Train Steps/Sec: 0.14,
 
4930
  [2026-01-23 00:48:12] (step=0004697) Train Loss mse: 0.0000, Train Loss ce: 0.0683, Train Steps/Sec: 0.13,
4931
  [2026-01-23 00:48:19] (step=0004698) Train Loss mse: 0.0000, Train Loss ce: 0.0700, Train Steps/Sec: 0.15,
4932
  [2026-01-23 00:48:25] (step=0004699) Train Loss mse: 0.0000, Train Loss ce: 0.0676, Train Steps/Sec: 0.15,
 
 
 
 
 
 
 
4933
  [2026-01-23 00:48:32] (step=0004700) Train Loss mse: 0.0000, Train Loss ce: 0.0675, Train Steps/Sec: 0.14,
4934
  [2026-01-23 00:48:40] (step=0004701) Train Loss mse: 0.0000, Train Loss ce: 0.0679, Train Steps/Sec: 0.14,
4935
  [2026-01-23 00:48:47] (step=0004702) Train Loss mse: 0.0000, Train Loss ce: 0.0690, Train Steps/Sec: 0.14,
 
4958
  [2026-01-23 00:51:29] (step=0004725) Train Loss mse: 0.0000, Train Loss ce: 0.0672, Train Steps/Sec: 0.14,
4959
  [2026-01-23 00:51:37] (step=0004726) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.14,
4960
  [2026-01-23 00:51:44] (step=0004727) Train Loss mse: 0.0000, Train Loss ce: 0.0688, Train Steps/Sec: 0.14,
4961
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ins_step5000
4962
+ Preparing Dataset vlm_gym_jigsaw_swap_celoss_no_mse_evalonce/vlm_gym_jigsaw_swap_val
4963
+ [eval debug] first 3 batch fingerprints:
4964
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4965
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4966
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_swap_celoss_no_mse_evalonce'}]
4967
+ ce_avg: 0.07005834579467773, mse_avg: 0.0
4968
  [2026-01-23 00:51:51] (step=0004728) Train Loss mse: 0.0000, Train Loss ce: 0.0680, Train Steps/Sec: 0.14,
4969
  [2026-01-23 00:51:57] (step=0004729) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.15,
4970
  [2026-01-23 00:52:04] (step=0004730) Train Loss mse: 0.0000, Train Loss ce: 0.0677, Train Steps/Sec: 0.14,