Junyi42 committed on
Commit
e3d786a
·
verified ·
1 Parent(s): 3074b08

Upload checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed

Browse files
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/wandb/offline-run-20260111_233308-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed-run0/files/output.log CHANGED
@@ -819,35 +819,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
819
  [2026-01-12 02:47:03] (step=0000808) Train Loss mse: 0.0447, Train Loss ce: 0.0600, Train Steps/Sec: 0.09,
820
  [2026-01-12 02:47:19] (step=0000809) Train Loss mse: 0.0325, Train Loss ce: 0.0641, Train Steps/Sec: 0.06,
821
  [2026-01-12 02:47:31] (step=0000810) Train Loss mse: 0.0435, Train Loss ce: 0.0588, Train Steps/Sec: 0.08,
822
- [2026-01-12 02:47:43] (step=0000811) Train Loss mse: 0.0598, Train Loss ce: 0.0609, Train Steps/Sec: 0.08,
823
- [2026-01-12 02:47:55] (step=0000812) Train Loss mse: 0.0389, Train Loss ce: 0.0585, Train Steps/Sec: 0.09,
824
- [2026-01-12 02:48:08] (step=0000813) Train Loss mse: 0.0492, Train Loss ce: 0.0590, Train Steps/Sec: 0.07,
825
- [2026-01-12 02:48:21] (step=0000814) Train Loss mse: 0.0543, Train Loss ce: 0.0658, Train Steps/Sec: 0.08,
826
- [2026-01-12 02:48:38] (step=0000815) Train Loss mse: 0.0495, Train Loss ce: 0.0667, Train Steps/Sec: 0.06,
827
- [2026-01-12 02:48:53] (step=0000816) Train Loss mse: 0.0343, Train Loss ce: 0.0635, Train Steps/Sec: 0.06,
828
- [2026-01-12 02:49:09] (step=0000817) Train Loss mse: 0.0452, Train Loss ce: 0.0661, Train Steps/Sec: 0.06,
829
- [2026-01-12 02:49:23] (step=0000818) Train Loss mse: 0.0347, Train Loss ce: 0.0633, Train Steps/Sec: 0.07,
830
- [2026-01-12 02:49:34] (step=0000819) Train Loss mse: 0.0487, Train Loss ce: 0.0630, Train Steps/Sec: 0.09,
831
- [2026-01-12 02:49:45] (step=0000820) Train Loss mse: 0.0425, Train Loss ce: 0.0600, Train Steps/Sec: 0.09,
832
- [2026-01-12 02:49:58] (step=0000821) Train Loss mse: 0.0287, Train Loss ce: 0.0599, Train Steps/Sec: 0.07,
833
- [2026-01-12 02:50:09] (step=0000822) Train Loss mse: 0.0400, Train Loss ce: 0.0655, Train Steps/Sec: 0.09,
834
- [2026-01-12 02:50:22] (step=0000823) Train Loss mse: 0.0512, Train Loss ce: 0.0626, Train Steps/Sec: 0.08,
835
- [2026-01-12 02:50:38] (step=0000824) Train Loss mse: 0.0549, Train Loss ce: 0.0732, Train Steps/Sec: 0.06,
836
- [2026-01-12 02:50:51] (step=0000825) Train Loss mse: 0.0471, Train Loss ce: 0.0684, Train Steps/Sec: 0.07,
837
- [2026-01-12 02:51:03] (step=0000826) Train Loss mse: 0.0601, Train Loss ce: 0.0645, Train Steps/Sec: 0.08,
838
- [2026-01-12 02:51:14] (step=0000827) Train Loss mse: 0.0475, Train Loss ce: 0.0641, Train Steps/Sec: 0.09,
839
- [2026-01-12 02:51:28] (step=0000828) Train Loss mse: 0.0399, Train Loss ce: 0.0642, Train Steps/Sec: 0.07,
840
- [2026-01-12 02:51:44] (step=0000829) Train Loss mse: 0.0344, Train Loss ce: 0.0644, Train Steps/Sec: 0.06,
841
- [2026-01-12 02:51:55] (step=0000830) Train Loss mse: 0.0483, Train Loss ce: 0.0616, Train Steps/Sec: 0.09,
842
- [2026-01-12 02:52:07] (step=0000831) Train Loss mse: 0.0494, Train Loss ce: 0.0635, Train Steps/Sec: 0.08,
843
- [2026-01-12 02:52:20] (step=0000832) Train Loss mse: 0.0520, Train Loss ce: 0.0662, Train Steps/Sec: 0.08,
844
- [2026-01-12 02:52:35] (step=0000833) Train Loss mse: 0.0426, Train Loss ce: 0.0628, Train Steps/Sec: 0.06,
845
- [2026-01-12 02:52:47] (step=0000834) Train Loss mse: 0.0460, Train Loss ce: 0.0740, Train Steps/Sec: 0.08,
846
- [2026-01-12 02:53:03] (step=0000835) Train Loss mse: 0.0393, Train Loss ce: 0.0605, Train Steps/Sec: 0.06,
847
- [2026-01-12 02:53:17] (step=0000836) Train Loss mse: 0.0367, Train Loss ce: 0.0604, Train Steps/Sec: 0.07,
848
- [2026-01-12 02:53:29] (step=0000837) Train Loss mse: 0.0563, Train Loss ce: 0.0676, Train Steps/Sec: 0.08,
849
- [2026-01-12 02:53:45] (step=0000838) Train Loss mse: 0.0371, Train Loss ce: 0.0716, Train Steps/Sec: 0.06,
850
- [2026-01-12 02:53:59] (step=0000839) Train Loss mse: 0.0382, Train Loss ce: 0.0621, Train Steps/Sec: 0.07,
851
  FullyShardedDataParallel(
852
  (_fsdp_wrapped_module): Bagel(
853
  (language_model): Qwen2ForCausalLM(
@@ -1034,27 +1005,35 @@ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
1034
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1035
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1036
  ce_avg: 0.389182448387146, mse_avg: 0.06303369253873825
1037
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step1000
1038
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
1039
- [eval debug] first 3 batch fingerprints:
1040
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1041
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1042
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1043
- ce_avg: 0.28593534231185913, mse_avg: 0.058155789971351624
1044
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step1500
1045
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
1046
- [eval debug] first 3 batch fingerprints:
1047
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1048
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1049
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1050
- ce_avg: 0.13890723884105682, mse_avg: 0.054675184190273285
1051
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step2000
1052
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
1053
- [eval debug] first 3 batch fingerprints:
1054
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1055
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1056
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1057
- ce_avg: 0.09731192141771317, mse_avg: 0.049818094819784164
 
 
 
 
 
 
 
 
1058
  [2026-01-12 02:54:15] (step=0000840) Train Loss mse: 0.0466, Train Loss ce: 0.0669, Train Steps/Sec: 0.06,
1059
  [2026-01-12 02:54:28] (step=0000841) Train Loss mse: 0.0454, Train Loss ce: 0.0709, Train Steps/Sec: 0.08,
1060
  [2026-01-12 02:54:40] (step=0000842) Train Loss mse: 0.0398, Train Loss ce: 0.0702, Train Steps/Sec: 0.08,
@@ -2093,6 +2072,20 @@ ce_avg: 0.09731192141771317, mse_avg: 0.049818094819784164
2093
  [2026-01-12 06:52:48] (step=0001875) Train Loss mse: 0.0338, Train Loss ce: 0.0594, Train Steps/Sec: 0.08,
2094
  [2026-01-12 06:53:05] (step=0001876) Train Loss mse: 0.0441, Train Loss ce: 0.0604, Train Steps/Sec: 0.06,
2095
  [2026-01-12 06:53:18] (step=0001877) Train Loss mse: 0.0328, Train Loss ce: 0.0611, Train Steps/Sec: 0.07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2096
  [2026-01-12 06:53:31] (step=0001878) Train Loss mse: 0.0325, Train Loss ce: 0.0575, Train Steps/Sec: 0.07,
2097
  [2026-01-12 06:53:45] (step=0001879) Train Loss mse: 0.0372, Train Loss ce: 0.0609, Train Steps/Sec: 0.08,
2098
  [2026-01-12 06:53:58] (step=0001880) Train Loss mse: 0.0411, Train Loss ce: 0.0609, Train Steps/Sec: 0.07,
@@ -2303,20 +2296,6 @@ ce_avg: 0.09731192141771317, mse_avg: 0.049818094819784164
2303
  [2026-01-12 07:41:52] (step=0002085) Train Loss mse: 0.0384, Train Loss ce: 0.0615, Train Steps/Sec: 0.08,
2304
  [2026-01-12 07:42:04] (step=0002086) Train Loss mse: 0.0376, Train Loss ce: 0.0600, Train Steps/Sec: 0.09,
2305
  [2026-01-12 07:42:19] (step=0002087) Train Loss mse: 0.0531, Train Loss ce: 0.0583, Train Steps/Sec: 0.07,
2306
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step2500
2307
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
2308
- [eval debug] first 3 batch fingerprints:
2309
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2310
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2311
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2312
- ce_avg: 0.0866585522890091, mse_avg: 0.04947679862380028
2313
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step3000
2314
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
2315
- [eval debug] first 3 batch fingerprints:
2316
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2317
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2318
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2319
- ce_avg: 0.07865892350673676, mse_avg: 0.049521658569574356
2320
  [2026-01-12 07:42:35] (step=0002088) Train Loss mse: 0.0440, Train Loss ce: 0.0601, Train Steps/Sec: 0.06,
2321
  [2026-01-12 07:42:48] (step=0002089) Train Loss mse: 0.0460, Train Loss ce: 0.0582, Train Steps/Sec: 0.08,
2322
  [2026-01-12 07:43:04] (step=0002090) Train Loss mse: 0.0397, Train Loss ce: 0.0569, Train Steps/Sec: 0.06,
@@ -3104,6 +3083,20 @@ ce_avg: 0.07865892350673676, mse_avg: 0.049521658569574356
3104
  [2026-01-12 10:44:53] (step=0002869) Train Loss mse: 0.0422, Train Loss ce: 0.0587, Train Steps/Sec: 0.06,
3105
  [2026-01-12 10:45:05] (step=0002870) Train Loss mse: 0.0456, Train Loss ce: 0.0590, Train Steps/Sec: 0.08,
3106
  [2026-01-12 10:45:19] (step=0002871) Train Loss mse: 0.0379, Train Loss ce: 0.0585, Train Steps/Sec: 0.07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3107
  [2026-01-12 10:45:32] (step=0002872) Train Loss mse: 0.0395, Train Loss ce: 0.0561, Train Steps/Sec: 0.08,
3108
  [2026-01-12 10:45:46] (step=0002873) Train Loss mse: 0.0464, Train Loss ce: 0.0642, Train Steps/Sec: 0.07,
3109
  [2026-01-12 10:46:00] (step=0002874) Train Loss mse: 0.0353, Train Loss ce: 0.0605, Train Steps/Sec: 0.07,
@@ -3211,20 +3204,6 @@ ce_avg: 0.07865892350673676, mse_avg: 0.049521658569574356
3211
  [2026-01-12 11:08:58] (step=0002976) Train Loss mse: 0.0449, Train Loss ce: 0.0565, Train Steps/Sec: 0.07,
3212
  [2026-01-12 11:09:14] (step=0002977) Train Loss mse: 0.0384, Train Loss ce: 0.0630, Train Steps/Sec: 0.06,
3213
  [2026-01-12 11:09:28] (step=0002978) Train Loss mse: 0.0437, Train Loss ce: 0.0567, Train Steps/Sec: 0.07,
3214
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step3500
3215
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
3216
- [eval debug] first 3 batch fingerprints:
3217
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3218
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3219
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3220
- ce_avg: 0.07448671013116837, mse_avg: 0.045566461980342865
3221
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step4000
3222
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
3223
- [eval debug] first 3 batch fingerprints:
3224
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3225
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3226
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3227
- ce_avg: 0.07164748013019562, mse_avg: 0.045982010662555695
3228
  [2026-01-12 11:09:39] (step=0002979) Train Loss mse: 0.0554, Train Loss ce: 0.0585, Train Steps/Sec: 0.09,
3229
  [2026-01-12 11:09:55] (step=0002980) Train Loss mse: 0.0392, Train Loss ce: 0.0592, Train Steps/Sec: 0.06,
3230
  [2026-01-12 11:10:09] (step=0002981) Train Loss mse: 0.0305, Train Loss ce: 0.0559, Train Steps/Sec: 0.07,
@@ -4194,6 +4173,41 @@ ce_avg: 0.07164748013019562, mse_avg: 0.045982010662555695
4194
  [2026-01-12 14:52:27] (step=0003945) Train Loss mse: 0.0261, Train Loss ce: 0.0557, Train Steps/Sec: 0.09,
4195
  [2026-01-12 14:52:38] (step=0003946) Train Loss mse: 0.0454, Train Loss ce: 0.0571, Train Steps/Sec: 0.09,
4196
  [2026-01-12 14:52:50] (step=0003947) Train Loss mse: 0.0509, Train Loss ce: 0.0586, Train Steps/Sec: 0.08,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4197
  [2026-01-12 14:53:03] (step=0003948) Train Loss mse: 0.0378, Train Loss ce: 0.0602, Train Steps/Sec: 0.08,
4198
  [2026-01-12 14:53:17] (step=0003949) Train Loss mse: 0.0307, Train Loss ce: 0.0570, Train Steps/Sec: 0.07,
4199
  [2026-01-12 14:53:29] (step=0003950) Train Loss mse: 0.0482, Train Loss ce: 0.0563, Train Steps/Sec: 0.08,
@@ -4506,20 +4520,6 @@ ce_avg: 0.07164748013019562, mse_avg: 0.045982010662555695
4506
  [2026-01-12 16:05:02] (step=0004257) Train Loss mse: 0.0372, Train Loss ce: 0.0558, Train Steps/Sec: 0.09,
4507
  [2026-01-12 16:05:15] (step=0004258) Train Loss mse: 0.0310, Train Loss ce: 0.0591, Train Steps/Sec: 0.07,
4508
  [2026-01-12 16:05:31] (step=0004259) Train Loss mse: 0.0422, Train Loss ce: 0.0573, Train Steps/Sec: 0.06,
4509
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step4500
4510
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
4511
- [eval debug] first 3 batch fingerprints:
4512
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4513
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4514
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4515
- ce_avg: 0.06939579546451569, mse_avg: 0.04529677331447601
4516
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step5000
4517
- Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
4518
- [eval debug] first 3 batch fingerprints:
4519
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4520
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4521
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4522
- ce_avg: 0.06746137142181396, mse_avg: 0.044675178825855255
4523
  [2026-01-12 16:05:44] (step=0004260) Train Loss mse: 0.0486, Train Loss ce: 0.0621, Train Steps/Sec: 0.08,
4524
  [2026-01-12 16:05:55] (step=0004261) Train Loss mse: 0.0576, Train Loss ce: 0.0519, Train Steps/Sec: 0.09,
4525
  [2026-01-12 16:06:11] (step=0004262) Train Loss mse: 0.0290, Train Loss ce: 0.0579, Train Steps/Sec: 0.06,
 
819
  [2026-01-12 02:47:03] (step=0000808) Train Loss mse: 0.0447, Train Loss ce: 0.0600, Train Steps/Sec: 0.09,
820
  [2026-01-12 02:47:19] (step=0000809) Train Loss mse: 0.0325, Train Loss ce: 0.0641, Train Steps/Sec: 0.06,
821
  [2026-01-12 02:47:31] (step=0000810) Train Loss mse: 0.0435, Train Loss ce: 0.0588, Train Steps/Sec: 0.08,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
822
  FullyShardedDataParallel(
823
  (_fsdp_wrapped_module): Bagel(
824
  (language_model): Qwen2ForCausalLM(
 
1005
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1006
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
1007
  ce_avg: 0.389182448387146, mse_avg: 0.06303369253873825
1008
+ [2026-01-12 02:47:43] (step=0000811) Train Loss mse: 0.0598, Train Loss ce: 0.0609, Train Steps/Sec: 0.08,
1009
+ [2026-01-12 02:47:55] (step=0000812) Train Loss mse: 0.0389, Train Loss ce: 0.0585, Train Steps/Sec: 0.09,
1010
+ [2026-01-12 02:48:08] (step=0000813) Train Loss mse: 0.0492, Train Loss ce: 0.0590, Train Steps/Sec: 0.07,
1011
+ [2026-01-12 02:48:21] (step=0000814) Train Loss mse: 0.0543, Train Loss ce: 0.0658, Train Steps/Sec: 0.08,
1012
+ [2026-01-12 02:48:38] (step=0000815) Train Loss mse: 0.0495, Train Loss ce: 0.0667, Train Steps/Sec: 0.06,
1013
+ [2026-01-12 02:48:53] (step=0000816) Train Loss mse: 0.0343, Train Loss ce: 0.0635, Train Steps/Sec: 0.06,
1014
+ [2026-01-12 02:49:09] (step=0000817) Train Loss mse: 0.0452, Train Loss ce: 0.0661, Train Steps/Sec: 0.06,
1015
+ [2026-01-12 02:49:23] (step=0000818) Train Loss mse: 0.0347, Train Loss ce: 0.0633, Train Steps/Sec: 0.07,
1016
+ [2026-01-12 02:49:34] (step=0000819) Train Loss mse: 0.0487, Train Loss ce: 0.0630, Train Steps/Sec: 0.09,
1017
+ [2026-01-12 02:49:45] (step=0000820) Train Loss mse: 0.0425, Train Loss ce: 0.0600, Train Steps/Sec: 0.09,
1018
+ [2026-01-12 02:49:58] (step=0000821) Train Loss mse: 0.0287, Train Loss ce: 0.0599, Train Steps/Sec: 0.07,
1019
+ [2026-01-12 02:50:09] (step=0000822) Train Loss mse: 0.0400, Train Loss ce: 0.0655, Train Steps/Sec: 0.09,
1020
+ [2026-01-12 02:50:22] (step=0000823) Train Loss mse: 0.0512, Train Loss ce: 0.0626, Train Steps/Sec: 0.08,
1021
+ [2026-01-12 02:50:38] (step=0000824) Train Loss mse: 0.0549, Train Loss ce: 0.0732, Train Steps/Sec: 0.06,
1022
+ [2026-01-12 02:50:51] (step=0000825) Train Loss mse: 0.0471, Train Loss ce: 0.0684, Train Steps/Sec: 0.07,
1023
+ [2026-01-12 02:51:03] (step=0000826) Train Loss mse: 0.0601, Train Loss ce: 0.0645, Train Steps/Sec: 0.08,
1024
+ [2026-01-12 02:51:14] (step=0000827) Train Loss mse: 0.0475, Train Loss ce: 0.0641, Train Steps/Sec: 0.09,
1025
+ [2026-01-12 02:51:28] (step=0000828) Train Loss mse: 0.0399, Train Loss ce: 0.0642, Train Steps/Sec: 0.07,
1026
+ [2026-01-12 02:51:44] (step=0000829) Train Loss mse: 0.0344, Train Loss ce: 0.0644, Train Steps/Sec: 0.06,
1027
+ [2026-01-12 02:51:55] (step=0000830) Train Loss mse: 0.0483, Train Loss ce: 0.0616, Train Steps/Sec: 0.09,
1028
+ [2026-01-12 02:52:07] (step=0000831) Train Loss mse: 0.0494, Train Loss ce: 0.0635, Train Steps/Sec: 0.08,
1029
+ [2026-01-12 02:52:20] (step=0000832) Train Loss mse: 0.0520, Train Loss ce: 0.0662, Train Steps/Sec: 0.08,
1030
+ [2026-01-12 02:52:35] (step=0000833) Train Loss mse: 0.0426, Train Loss ce: 0.0628, Train Steps/Sec: 0.06,
1031
+ [2026-01-12 02:52:47] (step=0000834) Train Loss mse: 0.0460, Train Loss ce: 0.0740, Train Steps/Sec: 0.08,
1032
+ [2026-01-12 02:53:03] (step=0000835) Train Loss mse: 0.0393, Train Loss ce: 0.0605, Train Steps/Sec: 0.06,
1033
+ [2026-01-12 02:53:17] (step=0000836) Train Loss mse: 0.0367, Train Loss ce: 0.0604, Train Steps/Sec: 0.07,
1034
+ [2026-01-12 02:53:29] (step=0000837) Train Loss mse: 0.0563, Train Loss ce: 0.0676, Train Steps/Sec: 0.08,
1035
+ [2026-01-12 02:53:45] (step=0000838) Train Loss mse: 0.0371, Train Loss ce: 0.0716, Train Steps/Sec: 0.06,
1036
+ [2026-01-12 02:53:59] (step=0000839) Train Loss mse: 0.0382, Train Loss ce: 0.0621, Train Steps/Sec: 0.07,
1037
  [2026-01-12 02:54:15] (step=0000840) Train Loss mse: 0.0466, Train Loss ce: 0.0669, Train Steps/Sec: 0.06,
1038
  [2026-01-12 02:54:28] (step=0000841) Train Loss mse: 0.0454, Train Loss ce: 0.0709, Train Steps/Sec: 0.08,
1039
  [2026-01-12 02:54:40] (step=0000842) Train Loss mse: 0.0398, Train Loss ce: 0.0702, Train Steps/Sec: 0.08,
 
2072
  [2026-01-12 06:52:48] (step=0001875) Train Loss mse: 0.0338, Train Loss ce: 0.0594, Train Steps/Sec: 0.08,
2073
  [2026-01-12 06:53:05] (step=0001876) Train Loss mse: 0.0441, Train Loss ce: 0.0604, Train Steps/Sec: 0.06,
2074
  [2026-01-12 06:53:18] (step=0001877) Train Loss mse: 0.0328, Train Loss ce: 0.0611, Train Steps/Sec: 0.07,
2075
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step1000
2076
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
2077
+ [eval debug] first 3 batch fingerprints:
2078
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2079
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2080
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2081
+ ce_avg: 0.28593534231185913, mse_avg: 0.058155789971351624
2082
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step1500
2083
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
2084
+ [eval debug] first 3 batch fingerprints:
2085
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2086
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2087
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
2088
+ ce_avg: 0.13890723884105682, mse_avg: 0.054675184190273285
2089
  [2026-01-12 06:53:31] (step=0001878) Train Loss mse: 0.0325, Train Loss ce: 0.0575, Train Steps/Sec: 0.07,
2090
  [2026-01-12 06:53:45] (step=0001879) Train Loss mse: 0.0372, Train Loss ce: 0.0609, Train Steps/Sec: 0.08,
2091
  [2026-01-12 06:53:58] (step=0001880) Train Loss mse: 0.0411, Train Loss ce: 0.0609, Train Steps/Sec: 0.07,
 
2296
  [2026-01-12 07:41:52] (step=0002085) Train Loss mse: 0.0384, Train Loss ce: 0.0615, Train Steps/Sec: 0.08,
2297
  [2026-01-12 07:42:04] (step=0002086) Train Loss mse: 0.0376, Train Loss ce: 0.0600, Train Steps/Sec: 0.09,
2298
  [2026-01-12 07:42:19] (step=0002087) Train Loss mse: 0.0531, Train Loss ce: 0.0583, Train Steps/Sec: 0.07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2299
  [2026-01-12 07:42:35] (step=0002088) Train Loss mse: 0.0440, Train Loss ce: 0.0601, Train Steps/Sec: 0.06,
2300
  [2026-01-12 07:42:48] (step=0002089) Train Loss mse: 0.0460, Train Loss ce: 0.0582, Train Steps/Sec: 0.08,
2301
  [2026-01-12 07:43:04] (step=0002090) Train Loss mse: 0.0397, Train Loss ce: 0.0569, Train Steps/Sec: 0.06,
 
3083
  [2026-01-12 10:44:53] (step=0002869) Train Loss mse: 0.0422, Train Loss ce: 0.0587, Train Steps/Sec: 0.06,
3084
  [2026-01-12 10:45:05] (step=0002870) Train Loss mse: 0.0456, Train Loss ce: 0.0590, Train Steps/Sec: 0.08,
3085
  [2026-01-12 10:45:19] (step=0002871) Train Loss mse: 0.0379, Train Loss ce: 0.0585, Train Steps/Sec: 0.07,
3086
+ [2026-01-12 10:45:32
3087
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step2000
3088
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
3089
+ [eval debug] first 3 batch fingerprints:
3090
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3091
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3092
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3093
+ ce_avg: 0.09731192141771317, mse_avg: 0.049818094819784164
3094
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step2500
3095
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
3096
+ [eval debug] first 3 batch fingerprints:
3097
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3098
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3099
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
3100
  [2026-01-12 10:45:32] (step=0002872) Train Loss mse: 0.0395, Train Loss ce: 0.0561, Train Steps/Sec: 0.08,
3101
  [2026-01-12 10:45:46] (step=0002873) Train Loss mse: 0.0464, Train Loss ce: 0.0642, Train Steps/Sec: 0.07,
3102
  [2026-01-12 10:46:00] (step=0002874) Train Loss mse: 0.0353, Train Loss ce: 0.0605, Train Steps/Sec: 0.07,
 
3204
  [2026-01-12 11:08:58] (step=0002976) Train Loss mse: 0.0449, Train Loss ce: 0.0565, Train Steps/Sec: 0.07,
3205
  [2026-01-12 11:09:14] (step=0002977) Train Loss mse: 0.0384, Train Loss ce: 0.0630, Train Steps/Sec: 0.06,
3206
  [2026-01-12 11:09:28] (step=0002978) Train Loss mse: 0.0437, Train Loss ce: 0.0567, Train Steps/Sec: 0.07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3207
  [2026-01-12 11:09:39] (step=0002979) Train Loss mse: 0.0554, Train Loss ce: 0.0585, Train Steps/Sec: 0.09,
3208
  [2026-01-12 11:09:55] (step=0002980) Train Loss mse: 0.0392, Train Loss ce: 0.0592, Train Steps/Sec: 0.06,
3209
  [2026-01-12 11:10:09] (step=0002981) Train Loss mse: 0.0305, Train Loss ce: 0.0559, Train Steps/Sec: 0.07,
 
4173
  [2026-01-12 14:52:27] (step=0003945) Train Loss mse: 0.0261, Train Loss ce: 0.0557, Train Steps/Sec: 0.09,
4174
  [2026-01-12 14:52:38] (step=0003946) Train Loss mse: 0.0454, Train Loss ce: 0.0571, Train Steps/Sec: 0.09,
4175
  [2026-01-12 14:52:50] (step=0003947) Train Loss mse: 0.0509, Train Loss ce: 0.0586, Train Steps/Sec: 0.08,
4176
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step3000
4177
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
4178
+ [eval debug] first 3 batch fingerprints:
4179
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4180
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4181
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4182
+ ce_avg: 0.07865892350673676, mse_avg: 0.049521658569574356
4183
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step3500
4184
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
4185
+ [eval debug] first 3 batch fingerprints:
4186
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4187
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4188
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4189
+ ce_avg: 0.07448671013116837, mse_avg: 0.045566461980342865
4190
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step4000
4191
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
4192
+ [eval debug] first 3 batch fingerprints:
4193
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4194
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4195
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4196
+ ce_avg: 0.07164748013019562, mse_avg: 0.045982010662555695
4197
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step4500
4198
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
4199
+ [eval debug] first 3 batch fingerprints:
4200
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4201
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4202
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4203
+ ce_avg: 0.06939579546451569, mse_avg: 0.04529677331447601
4204
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema9999_hashed_step5000
4205
+ Preparing Dataset vlm_gym_jigsaw_celoss_evalonce/vlm_gym_jigsaw_val
4206
+ [eval debug] first 3 batch fingerprints:
4207
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4208
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4209
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_evalonce'}]
4210
+ ce_avg: 0.06746137142181396, mse_avg: 0.044675178825855255
4211
  [2026-01-12 14:53:03] (step=0003948) Train Loss mse: 0.0378, Train Loss ce: 0.0602, Train Steps/Sec: 0.08,
4212
  [2026-01-12 14:53:17] (step=0003949) Train Loss mse: 0.0307, Train Loss ce: 0.0570, Train Steps/Sec: 0.07,
4213
  [2026-01-12 14:53:29] (step=0003950) Train Loss mse: 0.0482, Train Loss ce: 0.0563, Train Steps/Sec: 0.08,
 
4520
  [2026-01-12 16:05:02] (step=0004257) Train Loss mse: 0.0372, Train Loss ce: 0.0558, Train Steps/Sec: 0.09,
4521
  [2026-01-12 16:05:15] (step=0004258) Train Loss mse: 0.0310, Train Loss ce: 0.0591, Train Steps/Sec: 0.07,
4522
  [2026-01-12 16:05:31] (step=0004259) Train Loss mse: 0.0422, Train Loss ce: 0.0573, Train Steps/Sec: 0.06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4523
  [2026-01-12 16:05:44] (step=0004260) Train Loss mse: 0.0486, Train Loss ce: 0.0621, Train Steps/Sec: 0.08,
4524
  [2026-01-12 16:05:55] (step=0004261) Train Loss mse: 0.0576, Train Loss ce: 0.0519, Train Steps/Sec: 0.09,
4525
  [2026-01-12 16:06:11] (step=0004262) Train Loss mse: 0.0290, Train Loss ce: 0.0579, Train Steps/Sec: 0.06,