Junyi42 commited on
Commit
23aa4ef
·
verified ·
1 Parent(s): 77c6e91

Upload checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse

Browse files
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/wandb/offline-run-20251227_170556-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse-run0/files/output.log CHANGED
@@ -572,165 +572,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
572
  [2025-12-27 18:26:02] (step=0000561) Train Loss mse: 0.0000, Train Loss ce: 0.0664, Train Steps/Sec: 0.13,
573
  [2025-12-27 18:26:10] (step=0000562) Train Loss mse: 0.0000, Train Loss ce: 0.0635, Train Steps/Sec: 0.13,
574
  [2025-12-27 18:26:18] (step=0000563) Train Loss mse: 0.0000, Train Loss ce: 0.0685, Train Steps/Sec: 0.13,
575
- FullyShardedDataParallel(
576
- (_fsdp_wrapped_module): Bagel(
577
- (language_model): Qwen2ForCausalLM(
578
- (model): Qwen2Model(
579
- (embed_tokens): Embedding(152064, 3584)
580
- (layers): ModuleList(
581
- (0-27): 28 x FullyShardedDataParallel(
582
- (_fsdp_wrapped_module): CheckpointWrapper(
583
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
584
- (self_attn): PackedAttentionMoT(
585
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
586
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
587
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
588
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
589
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
590
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
591
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
592
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
593
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
594
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
595
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
596
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
597
- )
598
- (mlp): Qwen2MLP(
599
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
600
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
601
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
602
- (act_fn): SiLU()
603
- )
604
- (mlp_moe_gen): Qwen2MLP(
605
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
606
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
607
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
608
- (act_fn): SiLU()
609
- )
610
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
611
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
612
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
613
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
614
- )
615
- )
616
- )
617
- )
618
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
619
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
620
- (rotary_emb): Qwen2RotaryEmbedding()
621
- )
622
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
623
- )
624
- (vit_model): SiglipVisionModel(
625
- (vision_model): FullyShardedDataParallel(
626
- (_fsdp_wrapped_module): SiglipVisionTransformer(
627
- (embeddings): SiglipVisionEmbeddings(
628
- (position_embedding): Embedding(4900, 1152)
629
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
630
- )
631
- (encoder): SiglipEncoder(
632
- (layers): ModuleList(
633
- (0-25): 26 x FullyShardedDataParallel(
634
- (_fsdp_wrapped_module): CheckpointWrapper(
635
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
636
- (self_attn): SiglipFlashAttention2(
637
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
638
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
639
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
640
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
641
- )
642
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
643
- (mlp): SiglipMLP(
644
- (activation_fn): PytorchGELUTanh()
645
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
646
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
647
- )
648
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
649
- )
650
- )
651
- )
652
- )
653
- )
654
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
655
- )
656
- )
657
- )
658
- (connector): FullyShardedDataParallel(
659
- (_fsdp_wrapped_module): CheckpointWrapper(
660
- (_checkpoint_wrapped_module): MLPconnector(
661
- (activation_fn): PytorchGELUTanh()
662
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
663
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
664
- )
665
- )
666
- )
667
- (vit_pos_embed): FullyShardedDataParallel(
668
- (_fsdp_wrapped_module): PositionEmbedding()
669
- )
670
- )
671
- )
672
- _flat_param True
673
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
674
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
675
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
676
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
677
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
678
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
679
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
680
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
681
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
682
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
683
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
684
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
685
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
686
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
687
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
688
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
689
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
690
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
691
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
692
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
693
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
694
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
695
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
696
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
697
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
698
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
699
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
700
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
701
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
702
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
703
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
704
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
705
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
706
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
707
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
708
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
709
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
710
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
711
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
712
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
713
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
714
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
715
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
716
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
717
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
718
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
719
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
720
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
721
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
722
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
723
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
724
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
725
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
726
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
727
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
728
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
729
- vit_pos_embed._fsdp_wrapped_module._flat_param False
730
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_train
731
- Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
732
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_val
733
- Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
734
  [2025-12-27 18:26:26] (step=0000564) Train Loss mse: 0.0000, Train Loss ce: 0.0656, Train Steps/Sec: 0.13,
735
  [2025-12-27 18:26:34] (step=0000565) Train Loss mse: 0.0000, Train Loss ce: 0.0642, Train Steps/Sec: 0.13,
736
  [2025-12-27 18:26:41] (step=0000566) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.13,
@@ -1083,6 +924,165 @@ Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10
1083
  [2025-12-27 19:12:07] (step=0000913) Train Loss mse: 0.0000, Train Loss ce: 0.0636, Train Steps/Sec: 0.13,
1084
  [2025-12-27 19:12:14] (step=0000914) Train Loss mse: 0.0000, Train Loss ce: 0.0631, Train Steps/Sec: 0.13,
1085
  [2025-12-27 19:12:23] (step=0000915) Train Loss mse: 0.0000, Train Loss ce: 0.0649, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1086
  [2025-12-27 19:12:30] (step=0000916) Train Loss mse: 0.0000, Train Loss ce: 0.0641, Train Steps/Sec: 0.13,
1087
  [2025-12-27 19:12:38] (step=0000917) Train Loss mse: 0.0000, Train Loss ce: 0.0652, Train Steps/Sec: 0.13,
1088
  [2025-12-27 19:12:46] (step=0000918) Train Loss mse: 0.0000, Train Loss ce: 0.0645, Train Steps/Sec: 0.13,
 
572
  [2025-12-27 18:26:02] (step=0000561) Train Loss mse: 0.0000, Train Loss ce: 0.0664, Train Steps/Sec: 0.13,
573
  [2025-12-27 18:26:10] (step=0000562) Train Loss mse: 0.0000, Train Loss ce: 0.0635, Train Steps/Sec: 0.13,
574
  [2025-12-27 18:26:18] (step=0000563) Train Loss mse: 0.0000, Train Loss ce: 0.0685, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  [2025-12-27 18:26:26] (step=0000564) Train Loss mse: 0.0000, Train Loss ce: 0.0656, Train Steps/Sec: 0.13,
576
  [2025-12-27 18:26:34] (step=0000565) Train Loss mse: 0.0000, Train Loss ce: 0.0642, Train Steps/Sec: 0.13,
577
  [2025-12-27 18:26:41] (step=0000566) Train Loss mse: 0.0000, Train Loss ce: 0.0686, Train Steps/Sec: 0.13,
 
924
  [2025-12-27 19:12:07] (step=0000913) Train Loss mse: 0.0000, Train Loss ce: 0.0636, Train Steps/Sec: 0.13,
925
  [2025-12-27 19:12:14] (step=0000914) Train Loss mse: 0.0000, Train Loss ce: 0.0631, Train Steps/Sec: 0.13,
926
  [2025-12-27 19:12:23] (step=0000915) Train Loss mse: 0.0000, Train Loss ce: 0.0649, Train Steps/Sec: 0.12,
927
+ FullyShardedDataParallel(
928
+ (_fsdp_wrapped_module): Bagel(
929
+ (language_model): Qwen2ForCausalLM(
930
+ (model): Qwen2Model(
931
+ (embed_tokens): Embedding(152064, 3584)
932
+ (layers): ModuleList(
933
+ (0-27): 28 x FullyShardedDataParallel(
934
+ (_fsdp_wrapped_module): CheckpointWrapper(
935
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
936
+ (self_attn): PackedAttentionMoT(
937
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
938
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
939
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
940
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
941
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
942
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
943
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
944
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
945
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
946
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
947
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
948
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
949
+ )
950
+ (mlp): Qwen2MLP(
951
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
952
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
953
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
954
+ (act_fn): SiLU()
955
+ )
956
+ (mlp_moe_gen): Qwen2MLP(
957
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
958
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
959
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
960
+ (act_fn): SiLU()
961
+ )
962
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
963
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
964
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
965
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
966
+ )
967
+ )
968
+ )
969
+ )
970
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
971
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
972
+ (rotary_emb): Qwen2RotaryEmbedding()
973
+ )
974
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
975
+ )
976
+ (vit_model): SiglipVisionModel(
977
+ (vision_model): FullyShardedDataParallel(
978
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
979
+ (embeddings): SiglipVisionEmbeddings(
980
+ (position_embedding): Embedding(4900, 1152)
981
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
982
+ )
983
+ (encoder): SiglipEncoder(
984
+ (layers): ModuleList(
985
+ (0-25): 26 x FullyShardedDataParallel(
986
+ (_fsdp_wrapped_module): CheckpointWrapper(
987
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
988
+ (self_attn): SiglipFlashAttention2(
989
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
990
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
991
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
992
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
993
+ )
994
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
995
+ (mlp): SiglipMLP(
996
+ (activation_fn): PytorchGELUTanh()
997
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
998
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
999
+ )
1000
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1001
+ )
1002
+ )
1003
+ )
1004
+ )
1005
+ )
1006
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1007
+ )
1008
+ )
1009
+ )
1010
+ (connector): FullyShardedDataParallel(
1011
+ (_fsdp_wrapped_module): CheckpointWrapper(
1012
+ (_checkpoint_wrapped_module): MLPconnector(
1013
+ (activation_fn): PytorchGELUTanh()
1014
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
1015
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
1016
+ )
1017
+ )
1018
+ )
1019
+ (vit_pos_embed): FullyShardedDataParallel(
1020
+ (_fsdp_wrapped_module): PositionEmbedding()
1021
+ )
1022
+ )
1023
+ )
1024
+ _flat_param True
1025
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1026
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1027
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1028
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1029
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1030
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1031
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1032
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1033
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1034
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1035
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1036
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1037
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1038
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1039
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1040
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1041
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1042
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1043
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1044
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1045
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1046
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1047
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1048
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1049
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1050
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1051
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1052
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1053
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
1054
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1055
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1056
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1057
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1058
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1059
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1060
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1061
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1062
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1063
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1064
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1065
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1066
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1067
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1068
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1069
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1070
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1071
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1072
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1073
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1074
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1075
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1076
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1077
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1078
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1079
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1080
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1081
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
1082
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_train
1083
+ Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
1084
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_val
1085
+ Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
1086
  [2025-12-27 19:12:30] (step=0000916) Train Loss mse: 0.0000, Train Loss ce: 0.0641, Train Steps/Sec: 0.13,
1087
  [2025-12-27 19:12:38] (step=0000917) Train Loss mse: 0.0000, Train Loss ce: 0.0652, Train Steps/Sec: 0.13,
1088
  [2025-12-27 19:12:46] (step=0000918) Train Loss mse: 0.0000, Train Loss ce: 0.0645, Train Steps/Sec: 0.13,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/wandb/offline-run-20251230_022852-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993-run0/files/output.log CHANGED
@@ -1,3 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -979,165 +1138,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
979
  [2025-12-30 04:42:35] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0645, Train Steps/Sec: 0.13,
980
  [2025-12-30 04:42:43] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0641, Train Steps/Sec: 0.13,
981
  [2025-12-30 04:42:51] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0653, Train Steps/Sec: 0.13,
982
- FullyShardedDataParallel(
983
- (_fsdp_wrapped_module): Bagel(
984
- (language_model): Qwen2ForCausalLM(
985
- (model): Qwen2Model(
986
- (embed_tokens): Embedding(152064, 3584)
987
- (layers): ModuleList(
988
- (0-27): 28 x FullyShardedDataParallel(
989
- (_fsdp_wrapped_module): CheckpointWrapper(
990
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
991
- (self_attn): PackedAttentionMoT(
992
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
993
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
994
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
995
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
996
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
997
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
998
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
999
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1000
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
1001
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1002
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1003
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
1004
- )
1005
- (mlp): Qwen2MLP(
1006
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1007
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1008
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1009
- (act_fn): SiLU()
1010
- )
1011
- (mlp_moe_gen): Qwen2MLP(
1012
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1013
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1014
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1015
- (act_fn): SiLU()
1016
- )
1017
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1018
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1019
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1020
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1021
- )
1022
- )
1023
- )
1024
- )
1025
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
1026
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1027
- (rotary_emb): Qwen2RotaryEmbedding()
1028
- )
1029
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
1030
- )
1031
- (vit_model): SiglipVisionModel(
1032
- (vision_model): FullyShardedDataParallel(
1033
- (_fsdp_wrapped_module): SiglipVisionTransformer(
1034
- (embeddings): SiglipVisionEmbeddings(
1035
- (position_embedding): Embedding(4900, 1152)
1036
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
1037
- )
1038
- (encoder): SiglipEncoder(
1039
- (layers): ModuleList(
1040
- (0-25): 26 x FullyShardedDataParallel(
1041
- (_fsdp_wrapped_module): CheckpointWrapper(
1042
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
1043
- (self_attn): SiglipFlashAttention2(
1044
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
1045
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
1046
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
1047
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
1048
- )
1049
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1050
- (mlp): SiglipMLP(
1051
- (activation_fn): PytorchGELUTanh()
1052
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
1053
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
1054
- )
1055
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1056
- )
1057
- )
1058
- )
1059
- )
1060
- )
1061
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1062
- )
1063
- )
1064
- )
1065
- (connector): FullyShardedDataParallel(
1066
- (_fsdp_wrapped_module): CheckpointWrapper(
1067
- (_checkpoint_wrapped_module): MLPconnector(
1068
- (activation_fn): PytorchGELUTanh()
1069
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
1070
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
1071
- )
1072
- )
1073
- )
1074
- (vit_pos_embed): FullyShardedDataParallel(
1075
- (_fsdp_wrapped_module): PositionEmbedding()
1076
- )
1077
- )
1078
- )
1079
- _flat_param True
1080
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1081
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1082
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1083
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1084
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1085
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1086
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1087
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1088
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1089
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1090
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1091
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1092
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1093
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1094
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1095
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1096
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1097
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1098
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1099
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1100
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1101
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1102
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1103
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1104
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1105
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1106
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1107
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1108
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
1109
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1110
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1111
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1112
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1113
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1114
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1115
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1116
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1117
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1118
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1119
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1120
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1121
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1122
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1123
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1124
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1125
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1126
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1127
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1128
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1129
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1130
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1131
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1132
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1133
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1134
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1135
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1136
- vit_pos_embed._fsdp_wrapped_module._flat_param False
1137
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_train
1138
- Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
1139
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_val
1140
- Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
1141
  [2025-12-30 04:42:58] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0651, Train Steps/Sec: 0.13,
1142
  [2025-12-30 04:43:06] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0663, Train Steps/Sec: 0.12,
1143
  [2025-12-30 04:43:14] (step=0000973) Train Loss mse: 0.0000, Train Loss ce: 0.0629, Train Steps/Sec: 0.13,
@@ -3368,85 +3368,6 @@ Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10
3368
  [2025-12-30 09:37:59] (step=0003195) Train Loss mse: 0.0000, Train Loss ce: 0.0602, Train Steps/Sec: 0.12,
3369
  [2025-12-30 09:38:06] (step=0003196) Train Loss mse: 0.0000, Train Loss ce: 0.0614, Train Steps/Sec: 0.13,
3370
  [2025-12-30 09:38:14] (step=0003197) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3371
- [2025-12-30 09:38:22] (step=0003198) Train Loss mse: 0.0000, Train Loss ce: 0.0613, Train Steps/Sec: 0.13,
3372
- [2025-12-30 09:38:30] (step=0003199) Train Loss mse: 0.0000, Train Loss ce: 0.0623, Train Steps/Sec: 0.13,
3373
- [2025-12-30 09:38:38] (step=0003200) Train Loss mse: 0.0000, Train Loss ce: 0.0629, Train Steps/Sec: 0.12,
3374
- [2025-12-30 09:38:46] (step=0003201) Train Loss mse: 0.0000, Train Loss ce: 0.0629, Train Steps/Sec: 0.13,
3375
- [2025-12-30 09:38:54] (step=0003202) Train Loss mse: 0.0000, Train Loss ce: 0.0619, Train Steps/Sec: 0.13,
3376
- [2025-12-30 09:39:01] (step=0003203) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3377
- [2025-12-30 09:39:10] (step=0003204) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.12,
3378
- [2025-12-30 09:39:18] (step=0003205) Train Loss mse: 0.0000, Train Loss ce: 0.0624, Train Steps/Sec: 0.12,
3379
- [2025-12-30 09:39:26] (step=0003206) Train Loss mse: 0.0000, Train Loss ce: 0.0594, Train Steps/Sec: 0.12,
3380
- [2025-12-30 09:39:34] (step=0003207) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3381
- [2025-12-30 09:39:41] (step=0003208) Train Loss mse: 0.0000, Train Loss ce: 0.0626, Train Steps/Sec: 0.13,
3382
- [2025-12-30 09:39:50] (step=0003209) Train Loss mse: 0.0000, Train Loss ce: 0.0605, Train Steps/Sec: 0.12,
3383
- [2025-12-30 09:39:57] (step=0003210) Train Loss mse: 0.0000, Train Loss ce: 0.0614, Train Steps/Sec: 0.13,
3384
- [2025-12-30 09:40:05] (step=0003211) Train Loss mse: 0.0000, Train Loss ce: 0.0634, Train Steps/Sec: 0.13,
3385
- [2025-12-30 09:40:13] (step=0003212) Train Loss mse: 0.0000, Train Loss ce: 0.0594, Train Steps/Sec: 0.13,
3386
- [2025-12-30 09:40:21] (step=0003213) Train Loss mse: 0.0000, Train Loss ce: 0.0626, Train Steps/Sec: 0.12,
3387
- [2025-12-30 09:40:29] (step=0003214) Train Loss mse: 0.0000, Train Loss ce: 0.0615, Train Steps/Sec: 0.12,
3388
- [2025-12-30 09:40:37] (step=0003215) Train Loss mse: 0.0000, Train Loss ce: 0.0627, Train Steps/Sec: 0.13,
3389
- [2025-12-30 09:40:45] (step=0003216) Train Loss mse: 0.0000, Train Loss ce: 0.0613, Train Steps/Sec: 0.13,
3390
- [2025-12-30 09:40:53] (step=0003217) Train Loss mse: 0.0000, Train Loss ce: 0.0627, Train Steps/Sec: 0.13,
3391
- [2025-12-30 09:41:01] (step=0003218) Train Loss mse: 0.0000, Train Loss ce: 0.0615, Train Steps/Sec: 0.12,
3392
- [2025-12-30 09:41:08] (step=0003219) Train Loss mse: 0.0000, Train Loss ce: 0.0612, Train Steps/Sec: 0.13,
3393
- [2025-12-30 09:41:16] (step=0003220) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3394
- [2025-12-30 09:41:24] (step=0003221) Train Loss mse: 0.0000, Train Loss ce: 0.0610, Train Steps/Sec: 0.13,
3395
- [2025-12-30 09:41:32] (step=0003222) Train Loss mse: 0.0000, Train Loss ce: 0.0621, Train Steps/Sec: 0.12,
3396
- [2025-12-30 09:41:40] (step=0003223) Train Loss mse: 0.0000, Train Loss ce: 0.0612, Train Steps/Sec: 0.13,
3397
- [2025-12-30 09:41:48] (step=0003224) Train Loss mse: 0.0000, Train Loss ce: 0.0619, Train Steps/Sec: 0.13,
3398
- [2025-12-30 09:41:55] (step=0003225) Train Loss mse: 0.0000, Train Loss ce: 0.0628, Train Steps/Sec: 0.13,
3399
- [2025-12-30 09:42:03] (step=0003226) Train Loss mse: 0.0000, Train Loss ce: 0.0623, Train Steps/Sec: 0.13,
3400
- [2025-12-30 09:42:11] (step=0003227) Train Loss mse: 0.0000, Train Loss ce: 0.0622, Train Steps/Sec: 0.12,
3401
- [2025-12-30 09:42:19] (step=0003228) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3402
- [2025-12-30 09:42:27] (step=0003229) Train Loss mse: 0.0000, Train Loss ce: 0.0625, Train Steps/Sec: 0.13,
3403
- [2025-12-30 09:42:35] (step=0003230) Train Loss mse: 0.0000, Train Loss ce: 0.0613, Train Steps/Sec: 0.13,
3404
- [2025-12-30 09:42:43] (step=0003231) Train Loss mse: 0.0000, Train Loss ce: 0.0626, Train Steps/Sec: 0.13,
3405
- [2025-12-30 09:42:50] (step=0003232) Train Loss mse: 0.0000, Train Loss ce: 0.0622, Train Steps/Sec: 0.13,
3406
- [2025-12-30 09:42:58] (step=0003233) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.13,
3407
- [2025-12-30 09:43:06] (step=0003234) Train Loss mse: 0.0000, Train Loss ce: 0.0612, Train Steps/Sec: 0.13,
3408
- [2025-12-30 09:43:14] (step=0003235) Train Loss mse: 0.0000, Train Loss ce: 0.0609, Train Steps/Sec: 0.12,
3409
- [2025-12-30 09:43:22] (step=0003236) Train Loss mse: 0.0000, Train Loss ce: 0.0607, Train Steps/Sec: 0.13,
3410
- [2025-12-30 09:43:30] (step=0003237) Train Loss mse: 0.0000, Train Loss ce: 0.0610, Train Steps/Sec: 0.12,
3411
- [2025-12-30 09:43:38] (step=0003238) Train Loss mse: 0.0000, Train Loss ce: 0.0629, Train Steps/Sec: 0.13,
3412
- [2025-12-30 09:43:46] (step=0003239) Train Loss mse: 0.0000, Train Loss ce: 0.0622, Train Steps/Sec: 0.13,
3413
- [2025-12-30 09:43:54] (step=0003240) Train Loss mse: 0.0000, Train Loss ce: 0.0606, Train Steps/Sec: 0.12,
3414
- [2025-12-30 09:44:02] (step=0003241) Train Loss mse: 0.0000, Train Loss ce: 0.0603, Train Steps/Sec: 0.12,
3415
- [2025-12-30 09:44:10] (step=0003242) Train Loss mse: 0.0000, Train Loss ce: 0.0612, Train Steps/Sec: 0.13,
3416
- [2025-12-30 09:44:17] (step=0003243) Train Loss mse: 0.0000, Train Loss ce: 0.0627, Train Steps/Sec: 0.13,
3417
- [2025-12-30 09:44:25] (step=0003244) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3418
- [2025-12-30 09:44:33] (step=0003245) Train Loss mse: 0.0000, Train Loss ce: 0.0617, Train Steps/Sec: 0.13,
3419
- [2025-12-30 09:44:41] (step=0003246) Train Loss mse: 0.0000, Train Loss ce: 0.0613, Train Steps/Sec: 0.12,
3420
- [2025-12-30 09:44:49] (step=0003247) Train Loss mse: 0.0000, Train Loss ce: 0.0607, Train Steps/Sec: 0.13,
3421
- [2025-12-30 09:44:57] (step=0003248) Train Loss mse: 0.0000, Train Loss ce: 0.0602, Train Steps/Sec: 0.13,
3422
- [2025-12-30 09:45:05] (step=0003249) Train Loss mse: 0.0000, Train Loss ce: 0.0614, Train Steps/Sec: 0.13,
3423
- [2025-12-30 09:45:13] (step=0003250) Train Loss mse: 0.0000, Train Loss ce: 0.0606, Train Steps/Sec: 0.12,
3424
- [2025-12-30 09:45:21] (step=0003251) Train Loss mse: 0.0000, Train Loss ce: 0.0617, Train Steps/Sec: 0.13,
3425
- [2025-12-30 09:45:29] (step=0003252) Train Loss mse: 0.0000, Train Loss ce: 0.0617, Train Steps/Sec: 0.13,
3426
- [2025-12-30 09:45:36] (step=0003253) Train Loss mse: 0.0000, Train Loss ce: 0.0615, Train Steps/Sec: 0.13,
3427
- [2025-12-30 09:45:44] (step=0003254) Train Loss mse: 0.0000, Train Loss ce: 0.0600, Train Steps/Sec: 0.13,
3428
- [2025-12-30 09:45:52] (step=0003255) Train Loss mse: 0.0000, Train Loss ce: 0.0606, Train Steps/Sec: 0.13,
3429
- [2025-12-30 09:46:00] (step=0003256) Train Loss mse: 0.0000, Train Loss ce: 0.0624, Train Steps/Sec: 0.12,
3430
- [2025-12-30 09:46:08] (step=0003257) Train Loss mse: 0.0000, Train Loss ce: 0.0624, Train Steps/Sec: 0.13,
3431
- [2025-12-30 09:46:16] (step=0003258) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3432
- [2025-12-30 09:46:24] (step=0003259) Train Loss mse: 0.0000, Train Loss ce: 0.0618, Train Steps/Sec: 0.13,
3433
- [2025-12-30 09:46:32] (step=0003260) Train Loss mse: 0.0000, Train Loss ce: 0.0614, Train Steps/Sec: 0.13,
3434
- [2025-12-30 09:46:40] (step=0003261) Train Loss mse: 0.0000, Train Loss ce: 0.0621, Train Steps/Sec: 0.13,
3435
- [2025-12-30 09:46:47] (step=0003262) Train Loss mse: 0.0000, Train Loss ce: 0.0609, Train Steps/Sec: 0.13,
3436
- [2025-12-30 09:46:55] (step=0003263) Train Loss mse: 0.0000, Train Loss ce: 0.0600, Train Steps/Sec: 0.13,
3437
- [2025-12-30 09:47:03] (step=0003264) Train Loss mse: 0.0000, Train Loss ce: 0.0619, Train Steps/Sec: 0.13,
3438
- [2025-12-30 09:47:11] (step=0003265) Train Loss mse: 0.0000, Train Loss ce: 0.0624, Train Steps/Sec: 0.12,
3439
- [2025-12-30 09:47:19] (step=0003266) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
3440
- [2025-12-30 09:47:27] (step=0003267) Train Loss mse: 0.0000, Train Loss ce: 0.0631, Train Steps/Sec: 0.13,
3441
- [2025-12-30 09:47:34] (step=0003268) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.13,
3442
- [2025-12-30 09:47:42] (step=0003269) Train Loss mse: 0.0000, Train Loss ce: 0.0620, Train Steps/Sec: 0.13,
3443
- [2025-12-30 09:47:50] (step=0003270) Train Loss mse: 0.0000, Train Loss ce: 0.0617, Train Steps/Sec: 0.13,
3444
- [2025-12-30 09:47:58] (step=0003271) Train Loss mse: 0.0000, Train Loss ce: 0.0622, Train Steps/Sec: 0.13,
3445
- [2025-12-30 09:48:06] (step=0003272) Train Loss mse: 0.0000, Train Loss ce: 0.0613, Train Steps/Sec: 0.12,
3446
- [2025-12-30 09:48:14] (step=0003273) Train Loss mse: 0.0000, Train Loss ce: 0.0610, Train Steps/Sec: 0.13,
3447
- [2025-12-30 09:48:22] (step=0003274) Train Loss mse: 0.0000, Train Loss ce: 0.0608, Train Steps/Sec: 0.13,
3448
- [2025-12-30 09:48:29] (step=0003275) Train Loss mse: 0.0000, Train Loss ce: 0.0609, Train Steps/Sec: 0.13,
3449
- [2025-12-30 09:48:37] (step=0003276) Train Loss mse: 0.0000, Train Loss ce: 0.0613, Train Steps/Sec: 0.13,
3450
  [2025-12-30 09:48:45] (step=0003277) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.13,
3451
  [2025-12-30 09:48:53] (step=0003278) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.12,
3452
  [2025-12-30 09:49:01] (step=0003279) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.13,
 
1
+ FullyShardedDataParallel(
2
+ (_fsdp_wrapped_module): Bagel(
3
+ (language_model): Qwen2ForCausalLM(
4
+ (model): Qwen2Model(
5
+ (embed_tokens): Embedding(152064, 3584)
6
+ (layers): ModuleList(
7
+ (0-27): 28 x FullyShardedDataParallel(
8
+ (_fsdp_wrapped_module): CheckpointWrapper(
9
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
+ (self_attn): PackedAttentionMoT(
11
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
+ )
24
+ (mlp): Qwen2MLP(
25
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
+ (act_fn): SiLU()
29
+ )
30
+ (mlp_moe_gen): Qwen2MLP(
31
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
+ (act_fn): SiLU()
35
+ )
36
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
+ )
41
+ )
42
+ )
43
+ )
44
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
+ (rotary_emb): Qwen2RotaryEmbedding()
47
+ )
48
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
+ )
50
+ (vit_model): SiglipVisionModel(
51
+ (vision_model): FullyShardedDataParallel(
52
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
53
+ (embeddings): SiglipVisionEmbeddings(
54
+ (position_embedding): Embedding(4900, 1152)
55
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
56
+ )
57
+ (encoder): SiglipEncoder(
58
+ (layers): ModuleList(
59
+ (0-25): 26 x FullyShardedDataParallel(
60
+ (_fsdp_wrapped_module): CheckpointWrapper(
61
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
62
+ (self_attn): SiglipFlashAttention2(
63
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
64
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
65
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
66
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
67
+ )
68
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
69
+ (mlp): SiglipMLP(
70
+ (activation_fn): PytorchGELUTanh()
71
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
72
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
73
+ )
74
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
75
+ )
76
+ )
77
+ )
78
+ )
79
+ )
80
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
81
+ )
82
+ )
83
+ )
84
+ (connector): FullyShardedDataParallel(
85
+ (_fsdp_wrapped_module): CheckpointWrapper(
86
+ (_checkpoint_wrapped_module): MLPconnector(
87
+ (activation_fn): PytorchGELUTanh()
88
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
89
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
90
+ )
91
+ )
92
+ )
93
+ (vit_pos_embed): FullyShardedDataParallel(
94
+ (_fsdp_wrapped_module): PositionEmbedding()
95
+ )
96
+ )
97
+ )
98
+ _flat_param True
99
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
100
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
101
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
102
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
103
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
104
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
105
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
106
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
107
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
108
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
109
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
110
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
111
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
112
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
113
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
128
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
142
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
143
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
144
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
156
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_train
157
+ Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
158
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_val
159
+ Warning: failed loading hashes from /home/jiaxin/bagel_train/hashes_test_set_v10.json: [Errno 2] No such file or directory: '/home/jiaxin/bagel_train/hashes_test_set_v10.json'
160
  wandb: Detected [huggingface_hub.inference] in use.
161
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
162
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
1138
  [2025-12-30 04:42:35] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0645, Train Steps/Sec: 0.13,
1139
  [2025-12-30 04:42:43] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0641, Train Steps/Sec: 0.13,
1140
  [2025-12-30 04:42:51] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0653, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1141
  [2025-12-30 04:42:58] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0651, Train Steps/Sec: 0.13,
1142
  [2025-12-30 04:43:06] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0663, Train Steps/Sec: 0.12,
1143
  [2025-12-30 04:43:14] (step=0000973) Train Loss mse: 0.0000, Train Loss ce: 0.0629, Train Steps/Sec: 0.13,
 
3368
  [2025-12-30 09:37:59] (step=0003195) Train Loss mse: 0.0000, Train Loss ce: 0.0602, Train Steps/Sec: 0.12,
3369
  [2025-12-30 09:38:06] (step=0003196) Train Loss mse: 0.0000, Train Loss ce: 0.0614, Train Steps/Sec: 0.13,
3370
  [2025-12-30 09:38:14] (step=0003197) Train Loss mse: 0.0000, Train Loss ce: 0.0616, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3371
  [2025-12-30 09:48:45] (step=0003277) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.13,
3372
  [2025-12-30 09:48:53] (step=0003278) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.12,
3373
  [2025-12-30 09:49:01] (step=0003279) Train Loss mse: 0.0000, Train Loss ce: 0.0611, Train Steps/Sec: 0.13,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/wandb/offline-run-20251230_024203-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed-run0/files/output.log CHANGED
@@ -1,160 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (vit_model): SiglipVisionModel(
51
- (vision_model): FullyShardedDataParallel(
52
- (_fsdp_wrapped_module): SiglipVisionTransformer(
53
- (embeddings): SiglipVisionEmbeddings(
54
- (position_embedding): Embedding(4900, 1152)
55
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
56
- )
57
- (encoder): SiglipEncoder(
58
- (layers): ModuleList(
59
- (0-25): 26 x FullyShardedDataParallel(
60
- (_fsdp_wrapped_module): CheckpointWrapper(
61
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
62
- (self_attn): SiglipFlashAttention2(
63
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
64
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
65
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
66
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
67
- )
68
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
69
- (mlp): SiglipMLP(
70
- (activation_fn): PytorchGELUTanh()
71
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
72
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
73
- )
74
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
75
- )
76
- )
77
- )
78
- )
79
- )
80
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
81
- )
82
- )
83
- )
84
- (connector): FullyShardedDataParallel(
85
- (_fsdp_wrapped_module): CheckpointWrapper(
86
- (_checkpoint_wrapped_module): MLPconnector(
87
- (activation_fn): PytorchGELUTanh()
88
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
89
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
90
- )
91
- )
92
- )
93
- (vit_pos_embed): FullyShardedDataParallel(
94
- (_fsdp_wrapped_module): PositionEmbedding()
95
- )
96
- )
97
- )
98
- _flat_param True
99
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
100
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
101
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
102
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
103
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
104
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
105
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
106
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
107
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
108
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
109
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
110
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
111
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
112
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
113
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
128
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
142
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
143
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_pos_embed._fsdp_wrapped_module._flat_param False
156
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_train
157
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_val
158
  wandb: Detected [huggingface_hub.inference] in use.
159
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
160
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -442,4 +285,161 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
442
  [2025-12-30 03:24:38] (step=0000274) Train Loss mse: 0.0000, Train Loss ce: 0.0703, Train Steps/Sec: 0.13,
443
  [2025-12-30 03:24:46] (step=0000275) Train Loss mse: 0.0000, Train Loss ce: 0.0741, Train Steps/Sec: 0.13,
444
  [2025-12-30 03:24:54] (step=0000276) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.12,
445
- [2025-12-30 03:25:02] (step=0000277) Train Loss mse: 0.0000, Train Loss ce: 0.0748, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
285
  [2025-12-30 03:24:38] (step=0000274) Train Loss mse: 0.0000, Train Loss ce: 0.0703, Train Steps/Sec: 0.13,
286
  [2025-12-30 03:24:46] (step=0000275) Train Loss mse: 0.0000, Train Loss ce: 0.0741, Train Steps/Sec: 0.13,
287
  [2025-12-30 03:24:54] (step=0000276) Train Loss mse: 0.0000, Train Loss ce: 0.0719, Train Steps/Sec: 0.12,
288
+ [2025-12-30 03:25:02] (step=0000277) Train Loss mse: 0.0000, Train Loss ce: 0.0748, Train Steps/Sec: 0.13,
289
+ FullyShardedDataParallel(
290
+ (_fsdp_wrapped_module): Bagel(
291
+ (language_model): Qwen2ForCausalLM(
292
+ (model): Qwen2Model(
293
+ (embed_tokens): Embedding(152064, 3584)
294
+ (layers): ModuleList(
295
+ (0-27): 28 x FullyShardedDataParallel(
296
+ (_fsdp_wrapped_module): CheckpointWrapper(
297
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
298
+ (self_attn): PackedAttentionMoT(
299
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
300
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
301
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
302
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
303
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
304
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
305
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
306
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
307
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
308
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
309
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
310
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
311
+ )
312
+ (mlp): Qwen2MLP(
313
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
314
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
315
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
316
+ (act_fn): SiLU()
317
+ )
318
+ (mlp_moe_gen): Qwen2MLP(
319
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
320
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
321
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
322
+ (act_fn): SiLU()
323
+ )
324
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
325
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
326
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
327
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
328
+ )
329
+ )
330
+ )
331
+ )
332
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
333
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
334
+ (rotary_emb): Qwen2RotaryEmbedding()
335
+ )
336
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
337
+ )
338
+ (vit_model): SiglipVisionModel(
339
+ (vision_model): FullyShardedDataParallel(
340
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
341
+ (embeddings): SiglipVisionEmbeddings(
342
+ (position_embedding): Embedding(4900, 1152)
343
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
344
+ )
345
+ (encoder): SiglipEncoder(
346
+ (layers): ModuleList(
347
+ (0-25): 26 x FullyShardedDataParallel(
348
+ (_fsdp_wrapped_module): CheckpointWrapper(
349
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
350
+ (self_attn): SiglipFlashAttention2(
351
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
352
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
353
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
354
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
355
+ )
356
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
357
+ (mlp): SiglipMLP(
358
+ (activation_fn): PytorchGELUTanh()
359
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
360
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
361
+ )
362
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
363
+ )
364
+ )
365
+ )
366
+ )
367
+ )
368
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
369
+ )
370
+ )
371
+ )
372
+ (connector): FullyShardedDataParallel(
373
+ (_fsdp_wrapped_module): CheckpointWrapper(
374
+ (_checkpoint_wrapped_module): MLPconnector(
375
+ (activation_fn): PytorchGELUTanh()
376
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
377
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
378
+ )
379
+ )
380
+ )
381
+ (vit_pos_embed): FullyShardedDataParallel(
382
+ (_fsdp_wrapped_module): PositionEmbedding()
383
+ )
384
+ )
385
+ )
386
+ _flat_param True
387
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
388
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
389
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
390
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
391
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
392
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
393
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
394
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
395
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
396
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
397
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
398
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
399
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
400
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
401
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
402
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
403
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
404
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
405
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
406
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
407
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
408
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
409
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
410
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
411
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
412
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
413
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
414
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
415
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
416
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
417
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
418
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
419
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
420
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
421
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
422
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
423
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
424
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
425
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
426
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
427
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
428
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
429
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
430
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
431
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
432
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
433
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
434
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
435
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
436
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
437
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
438
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
439
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
440
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
441
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
442
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
443
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
444
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_train
445
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_junyi/vlm_gym_jigsaw_val
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/wandb/offline-run-20260105_043345-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed-run0/files/output.log CHANGED
@@ -25484,835 +25484,4 @@ vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25484
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25485
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25486
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25487
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25488
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25489
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25490
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25491
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25492
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25493
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25494
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25495
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25496
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25497
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25498
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25499
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25500
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25501
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25502
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25503
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25504
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25505
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25506
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25507
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25508
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25509
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25510
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25511
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25512
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25513
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25514
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25515
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25516
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25517
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25518
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25519
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25520
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25521
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25522
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25523
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25524
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25525
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25526
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25527
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25528
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25529
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25530
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25531
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25532
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25533
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25534
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25535
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25536
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25537
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25538
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25539
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25540
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25541
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25542
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25543
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25544
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25545
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25546
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25547
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25548
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25549
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25550
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25551
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25552
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25553
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25554
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25555
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25556
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25557
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25558
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25559
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25560
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25561
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25562
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25563
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25564
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25565
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25566
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25567
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25568
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25569
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25570
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25571
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25572
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25573
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25574
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25575
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25576
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25577
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25578
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25579
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25580
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25581
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25582
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25583
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25584
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25585
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25586
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25587
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25588
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25589
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25590
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25591
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25592
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25593
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25594
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25595
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25596
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25597
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25598
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25599
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25600
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25601
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25602
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25603
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25604
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25605
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25606
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25607
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25608
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25609
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25610
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25611
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25612
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25613
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25614
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25615
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25616
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25617
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25618
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25619
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25620
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25621
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25622
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25623
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25624
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25625
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25626
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25627
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25628
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25629
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25630
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25631
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25632
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25633
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25634
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25635
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25636
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25637
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25638
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25639
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25640
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25641
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25642
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25643
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25644
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25645
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25646
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25647
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25648
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25649
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25650
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25651
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25652
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25653
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25654
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25655
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25656
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25657
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25658
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25659
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25660
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25661
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25662
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25663
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25664
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25665
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25666
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25667
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25668
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25669
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25670
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25671
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25672
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25673
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25674
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25675
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25676
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25677
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25678
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25679
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25680
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25681
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25682
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25683
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25684
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25685
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25686
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25687
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25688
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25689
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25690
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25691
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25692
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25693
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25694
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25695
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25696
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25697
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25698
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25699
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25700
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25701
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25702
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25703
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25704
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25705
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25706
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25707
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25708
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25709
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25710
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25711
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25712
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25713
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25714
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25715
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25716
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25717
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25718
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25719
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25720
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25721
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25722
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25723
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25724
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25725
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25726
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25727
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25728
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25729
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25730
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25731
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25732
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25733
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25734
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25735
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25736
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25737
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25738
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25739
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25740
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25741
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25742
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25743
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25744
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25745
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25746
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25747
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25748
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25749
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25750
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25751
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25752
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25753
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25754
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25755
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25756
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25757
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25758
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25759
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25760
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25761
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25762
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25763
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25764
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25765
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25766
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25767
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25768
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25769
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25770
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25771
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25772
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25773
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25774
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25775
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25776
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25777
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25778
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25779
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25780
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25781
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25782
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25783
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25784
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25785
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25786
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25787
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25788
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25789
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25790
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25791
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25792
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25793
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25794
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25795
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25796
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25797
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25798
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25799
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25800
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25801
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25802
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25803
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25804
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25805
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25806
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25807
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25808
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25809
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25810
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25811
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25812
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25813
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25814
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25815
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25816
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25817
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25818
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25819
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25820
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25821
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25822
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25823
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25824
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25825
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25826
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25827
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25828
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25829
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25830
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25831
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25832
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25833
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25834
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25835
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25836
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25837
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25838
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25839
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25840
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25841
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25842
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25843
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25844
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25845
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25846
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25847
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25848
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25849
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25850
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25851
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25852
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25853
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25854
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25855
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25856
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25857
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25858
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25859
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25860
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25861
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25862
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25863
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25864
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25865
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25866
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25867
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25868
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25869
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25870
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25871
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25872
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25873
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25874
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25875
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25876
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25877
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25878
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25879
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25880
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25881
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25882
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25883
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25884
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25885
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25886
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25887
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25888
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25889
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25890
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25891
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25892
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25893
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25894
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25895
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25896
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25897
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25898
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25899
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25900
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25901
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25902
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25903
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25904
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25905
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25906
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25907
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25908
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25909
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25910
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25911
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25912
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25913
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25914
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25915
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25916
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25917
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25918
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25919
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25920
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25921
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25922
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25923
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25924
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25925
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25926
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25927
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25928
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25929
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25930
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25931
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25932
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25933
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25934
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25935
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25936
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25937
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25938
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25939
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25940
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25941
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25942
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25943
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25944
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25945
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25946
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25947
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25948
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25949
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25950
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25951
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25952
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25953
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25954
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25955
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25956
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25957
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25958
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25959
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25960
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25961
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25962
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25963
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25964
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25965
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25966
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25967
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25968
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25969
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25970
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25971
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25972
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25973
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25974
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25975
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25976
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25977
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25978
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25979
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25980
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25981
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25982
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25983
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25984
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25985
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25986
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25987
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25988
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25989
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25990
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25991
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25992
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25993
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25994
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25995
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25996
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25997
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25998
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25999
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26000
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26001
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26002
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26003
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26004
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26005
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26006
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26007
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26008
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26009
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26010
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26011
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26012
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26013
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26014
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26015
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26016
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26017
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26018
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26019
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26020
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26021
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26022
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26023
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26024
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26025
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26026
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26027
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26028
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26029
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26030
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26031
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26032
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26033
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26034
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26035
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26036
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26037
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26038
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26039
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26040
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26041
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26042
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26043
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26044
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26045
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26046
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26047
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26048
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26049
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26050
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26051
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26052
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26053
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26054
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26055
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26056
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26057
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26058
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26059
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26060
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26061
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26062
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26063
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26064
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26065
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26066
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26067
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26068
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26069
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26070
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26071
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26072
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26073
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26074
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26075
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26076
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26077
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26078
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26079
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26080
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26081
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26082
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26083
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26084
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26085
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26086
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26087
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26088
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26089
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26090
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26091
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26092
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26093
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26094
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26095
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26096
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26097
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26098
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26099
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26100
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26101
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26102
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26103
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26104
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26105
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26106
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26107
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26108
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26109
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26110
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26111
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26112
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26113
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26114
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26115
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26116
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26117
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26118
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26119
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26120
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26121
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26122
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26123
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26124
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26125
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26126
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26127
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26128
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26129
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26130
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26131
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26132
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26133
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26134
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26135
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26136
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26137
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26138
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26139
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26140
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26141
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26142
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26143
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26144
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26145
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26146
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26147
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26148
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26149
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26150
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26151
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26152
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26153
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26154
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26155
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26156
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26157
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26158
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26159
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26160
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26161
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26162
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26163
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26164
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26165
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26166
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26167
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26168
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26169
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26170
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26171
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26172
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26173
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26174
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26175
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26176
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26177
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26178
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26179
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26180
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26181
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26182
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26183
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26184
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26185
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26186
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26187
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26188
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26189
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26190
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26191
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26192
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26193
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26194
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26195
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26196
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26197
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26198
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26199
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26200
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26201
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26202
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26203
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26204
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26205
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26206
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26207
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26208
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26209
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26210
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26211
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26212
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26213
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26214
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26215
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26216
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26217
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26218
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26219
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26220
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26221
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26222
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26223
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26224
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26225
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26226
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26227
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26228
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26229
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26230
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26231
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26232
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26233
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26234
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26235
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26236
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26237
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26238
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26239
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26240
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26241
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26242
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26243
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26244
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26245
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26246
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26247
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26248
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26249
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26250
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26251
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26252
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26253
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26254
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26255
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26256
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26257
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26258
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26259
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26260
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26261
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26262
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26263
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26264
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26265
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26266
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26267
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26268
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26269
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26270
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26271
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26272
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26273
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26274
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26275
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26276
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26277
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26278
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26279
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26280
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26281
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26282
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26283
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26284
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26285
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26286
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26287
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26288
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26289
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26290
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26291
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26292
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26293
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26294
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26295
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26296
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26297
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26298
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26299
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26300
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26301
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26302
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26303
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26304
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26305
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26306
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26307
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26308
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26309
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26310
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26311
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26312
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26313
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26314
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26315
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26316
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26317
- vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
26318
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
 
25484
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25485
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
25486
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25487
  vlm_gym_jigsaw_celoss_no_mse_junyi repeat in rank-0 worker-0
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/wandb/offline-run-20260107_185004-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed-run0/files/output.log CHANGED
@@ -168,13 +168,6 @@ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
168
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
169
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
170
  ce_avg: 0.06184878200292587, mse_avg: 0.0
171
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step1000
172
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
173
- [eval debug] first 3 batch fingerprints:
174
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
175
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
176
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
177
- ce_avg: 0.08925717324018478, mse_avg: 0.0
178
  wandb: Detected [huggingface_hub.inference] in use.
179
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
180
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1089,20 +1082,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1089
  [2026-01-07 21:00:31] (step=0000901) Train Loss mse: 0.0000, Train Loss ce: 0.0586, Train Steps/Sec: 0.12,
1090
  [2026-01-07 21:00:39] (step=0000902) Train Loss mse: 0.0000, Train Loss ce: 0.0604, Train Steps/Sec: 0.12,
1091
  [2026-01-07 21:00:47] (step=0000903) Train Loss mse: 0.0000, Train Loss ce: 0.0591, Train Steps/Sec: 0.12,
1092
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step1500
1093
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
1094
- [eval debug] first 3 batch fingerprints:
1095
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1096
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1097
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1098
- ce_avg: 0.1071508377790451, mse_avg: 0.0
1099
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step2000
1100
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
1101
- [eval debug] first 3 batch fingerprints:
1102
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1103
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1104
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1105
- ce_avg: 0.1155419573187828, mse_avg: 0.0
1106
  [2026-01-07 21:00:55] (step=0000904) Train Loss mse: 0.0000, Train Loss ce: 0.0592, Train Steps/Sec: 0.12,
1107
  [2026-01-07 21:01:04] (step=0000905) Train Loss mse: 0.0000, Train Loss ce: 0.0595, Train Steps/Sec: 0.12,
1108
  [2026-01-07 21:01:12] (step=0000906) Train Loss mse: 0.0000, Train Loss ce: 0.0606, Train Steps/Sec: 0.12,
@@ -1155,6 +1134,20 @@ ce_avg: 0.1155419573187828, mse_avg: 0.0
1155
  [2026-01-07 21:07:39] (step=0000953) Train Loss mse: 0.0000, Train Loss ce: 0.0580, Train Steps/Sec: 0.12,
1156
  [2026-01-07 21:07:47] (step=0000954) Train Loss mse: 0.0000, Train Loss ce: 0.0604, Train Steps/Sec: 0.12,
1157
  [2026-01-07 21:07:55] (step=0000955) Train Loss mse: 0.0000, Train Loss ce: 0.0597, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1158
  [2026-01-07 21:08:03] (step=0000956) Train Loss mse: 0.0000, Train Loss ce: 0.0595, Train Steps/Sec: 0.12,
1159
  [2026-01-07 21:08:12] (step=0000957) Train Loss mse: 0.0000, Train Loss ce: 0.0602, Train Steps/Sec: 0.12,
1160
  [2026-01-07 21:08:20] (step=0000958) Train Loss mse: 0.0000, Train Loss ce: 0.0602, Train Steps/Sec: 0.12,
@@ -2406,6 +2399,27 @@ ce_avg: 0.1155419573187828, mse_avg: 0.0
2406
  [2026-01-07 23:59:46] (step=0002204) Train Loss mse: 0.0000, Train Loss ce: 0.0569, Train Steps/Sec: 0.12,
2407
  [2026-01-07 23:59:54] (step=0002205) Train Loss mse: 0.0000, Train Loss ce: 0.0562, Train Steps/Sec: 0.12,
2408
  [2026-01-08 00:00:02] (step=0002206) Train Loss mse: 0.0000, Train Loss ce: 0.0587, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2409
  [2026-01-08 00:00:10] (step=0002207) Train Loss mse: 0.0000, Train Loss ce: 0.0568, Train Steps/Sec: 0.12,
2410
  [2026-01-08 00:00:19] (step=0002208) Train Loss mse: 0.0000, Train Loss ce: 0.0573, Train Steps/Sec: 0.12,
2411
  [2026-01-08 00:00:27] (step=0002209) Train Loss mse: 0.0000, Train Loss ce: 0.0578, Train Steps/Sec: 0.12,
@@ -2470,20 +2484,6 @@ ce_avg: 0.1155419573187828, mse_avg: 0.0
2470
  [2026-01-08 00:08:33] (step=0002268) Train Loss mse: 0.0000, Train Loss ce: 0.0577, Train Steps/Sec: 0.12,
2471
  [2026-01-08 00:08:41] (step=0002269) Train Loss mse: 0.0000, Train Loss ce: 0.0576, Train Steps/Sec: 0.12,
2472
  [2026-01-08 00:08:49] (step=0002270) Train Loss mse: 0.0000, Train Loss ce: 0.0572, Train Steps/Sec: 0.12,
2473
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step2500
2474
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
2475
- [eval debug] first 3 batch fingerprints:
2476
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2477
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2478
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2479
- ce_avg: 0.12304135411977768, mse_avg: 0.0
2480
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step3000
2481
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
2482
- [eval debug] first 3 batch fingerprints:
2483
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2484
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2485
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2486
- ce_avg: 0.05864739045500755, mse_avg: 0.0
2487
  [2026-01-08 00:08:58] (step=0002271) Train Loss mse: 0.0000, Train Loss ce: 0.0576, Train Steps/Sec: 0.12,
2488
  [2026-01-08 00:09:06] (step=0002272) Train Loss mse: 0.0000, Train Loss ce: 0.0577, Train Steps/Sec: 0.12,
2489
  [2026-01-08 00:09:14] (step=0002273) Train Loss mse: 0.0000, Train Loss ce: 0.0565, Train Steps/Sec: 0.12,
@@ -3289,6 +3289,20 @@ ce_avg: 0.05864739045500755, mse_avg: 0.0
3289
  [2026-01-08 02:01:24] (step=0003070) Train Loss mse: 0.0000, Train Loss ce: 0.0575, Train Steps/Sec: 0.12,
3290
  [2026-01-08 02:01:32] (step=0003071) Train Loss mse: 0.0000, Train Loss ce: 0.0564, Train Steps/Sec: 0.12,
3291
  [2026-01-08 02:01:41] (step=0003072) Train Loss mse: 0.0000, Train Loss ce: 0.0571, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3292
  [2026-01-08 02:01:49] (step=0003073) Train Loss mse: 0.0000, Train Loss ce: 0.0556, Train Steps/Sec: 0.12,
3293
  [2026-01-08 02:01:57] (step=0003074) Train Loss mse: 0.0000, Train Loss ce: 0.0567, Train Steps/Sec: 0.12,
3294
  [2026-01-08 02:02:05] (step=0003075) Train Loss mse: 0.0000, Train Loss ce: 0.0563, Train Steps/Sec: 0.12,
@@ -3448,27 +3462,6 @@ ce_avg: 0.05864739045500755, mse_avg: 0.0
3448
  [2026-01-08 02:23:13] (step=0003229) Train Loss mse: 0.0000, Train Loss ce: 0.0575, Train Steps/Sec: 0.12,
3449
  [2026-01-08 02:23:21] (step=0003230) Train Loss mse: 0.0000, Train Loss ce: 0.0566, Train Steps/Sec: 0.12,
3450
  [2026-01-08 02:23:29] (step=0003231) Train Loss mse: 0.0000, Train Loss ce: 0.0567, Train Steps/Sec: 0.12,
3451
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step3500
3452
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
3453
- [eval debug] first 3 batch fingerprints:
3454
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3455
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3456
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3457
- ce_avg: 0.058637309819459915, mse_avg: 0.0
3458
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step4000
3459
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
3460
- [eval debug] first 3 batch fingerprints:
3461
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3462
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3463
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3464
- ce_avg: 0.059058357030153275, mse_avg: 0.0
3465
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step4500
3466
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
3467
- [eval debug] first 3 batch fingerprints:
3468
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3469
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3470
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3471
- ce_avg: 0.05962677672505379, mse_avg: 0.0
3472
  [2026-01-08 02:23:38] (step=0003232) Train Loss mse: 0.0000, Train Loss ce: 0.0555, Train Steps/Sec: 0.12,
3473
  [2026-01-08 02:23:46] (step=0003233) Train Loss mse: 0.0000, Train Loss ce: 0.0562, Train Steps/Sec: 0.12,
3474
  [2026-01-08 02:23:54] (step=0003234) Train Loss mse: 0.0000, Train Loss ce: 0.0572, Train Steps/Sec: 0.12,
@@ -4641,6 +4634,20 @@ ce_avg: 0.05962677672505379, mse_avg: 0.0
4641
  [2026-01-08 05:04:14] (step=0004401) Train Loss mse: 0.0000, Train Loss ce: 0.0541, Train Steps/Sec: 0.12,
4642
  [2026-01-08 05:04:23] (step=0004402) Train Loss mse: 0.0000, Train Loss ce: 0.0553, Train Steps/Sec: 0.12,
4643
  [2026-01-08 05:04:31] (step=0004403) Train Loss mse: 0.0000, Train Loss ce: 0.0548, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4644
  [2026-01-08 05:04:39] (step=0004404) Train Loss mse: 0.0000, Train Loss ce: 0.0545, Train Steps/Sec: 0.13,
4645
  [2026-01-08 05:04:47] (step=0004405) Train Loss mse: 0.0000, Train Loss ce: 0.0554, Train Steps/Sec: 0.12,
4646
  [2026-01-08 05:04:55] (step=0004406) Train Loss mse: 0.0000, Train Loss ce: 0.0546, Train Steps/Sec: 0.12,
@@ -4851,13 +4858,6 @@ ce_avg: 0.05962677672505379, mse_avg: 0.0
4851
  [2026-01-08 05:33:09] (step=0004611) Train Loss mse: 0.0000, Train Loss ce: 0.0549, Train Steps/Sec: 0.12,
4852
  [2026-01-08 05:33:18] (step=0004612) Train Loss mse: 0.0000, Train Loss ce: 0.0548, Train Steps/Sec: 0.12,
4853
  [2026-01-08 05:33:26] (step=0004613) Train Loss mse: 0.0000, Train Loss ce: 0.0545, Train Steps/Sec: 0.12,
4854
- base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step5000
4855
- Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
4856
- [eval debug] first 3 batch fingerprints:
4857
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4858
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4859
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4860
- ce_avg: 0.060032669454813004, mse_avg: 0.0
4861
  [2026-01-08 05:33:34] (step=0004614) Train Loss mse: 0.0000, Train Loss ce: 0.0549, Train Steps/Sec: 0.12,
4862
  [2026-01-08 05:33:42] (step=0004615) Train Loss mse: 0.0000, Train Loss ce: 0.0540, Train Steps/Sec: 0.12,
4863
  [2026-01-08 05:33:51] (step=0004616) Train Loss mse: 0.0000, Train Loss ce: 0.0555, Train Steps/Sec: 0.12,
 
168
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
169
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
170
  ce_avg: 0.06184878200292587, mse_avg: 0.0
 
 
 
 
 
 
 
171
  wandb: Detected [huggingface_hub.inference] in use.
172
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
173
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
1082
  [2026-01-07 21:00:31] (step=0000901) Train Loss mse: 0.0000, Train Loss ce: 0.0586, Train Steps/Sec: 0.12,
1083
  [2026-01-07 21:00:39] (step=0000902) Train Loss mse: 0.0000, Train Loss ce: 0.0604, Train Steps/Sec: 0.12,
1084
  [2026-01-07 21:00:47] (step=0000903) Train Loss mse: 0.0000, Train Loss ce: 0.0591, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1085
  [2026-01-07 21:00:55] (step=0000904) Train Loss mse: 0.0000, Train Loss ce: 0.0592, Train Steps/Sec: 0.12,
1086
  [2026-01-07 21:01:04] (step=0000905) Train Loss mse: 0.0000, Train Loss ce: 0.0595, Train Steps/Sec: 0.12,
1087
  [2026-01-07 21:01:12] (step=0000906) Train Loss mse: 0.0000, Train Loss ce: 0.0606, Train Steps/Sec: 0.12,
 
1134
  [2026-01-07 21:07:39] (step=0000953) Train Loss mse: 0.0000, Train Loss ce: 0.0580, Train Steps/Sec: 0.12,
1135
  [2026-01-07 21:07:47] (step=0000954) Train Loss mse: 0.0000, Train Loss ce: 0.0604, Train Steps/Sec: 0.12,
1136
  [2026-01-07 21:07:55] (step=0000955) Train Loss mse: 0.0000, Train Loss ce: 0.0597, Train Steps/Sec: 0.12,
1137
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step1000
1138
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
1139
+ [eval debug] first 3 batch fingerprints:
1140
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1141
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1142
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1143
+ ce_avg: 0.08925717324018478, mse_avg: 0.0
1144
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step1500
1145
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
1146
+ [eval debug] first 3 batch fingerprints:
1147
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1148
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1149
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
1150
+ ce_avg: 0.1071508377790451, mse_avg: 0.0
1151
  [2026-01-07 21:08:03] (step=0000956) Train Loss mse: 0.0000, Train Loss ce: 0.0595, Train Steps/Sec: 0.12,
1152
  [2026-01-07 21:08:12] (step=0000957) Train Loss mse: 0.0000, Train Loss ce: 0.0602, Train Steps/Sec: 0.12,
1153
  [2026-01-07 21:08:20] (step=0000958) Train Loss mse: 0.0000, Train Loss ce: 0.0602, Train Steps/Sec: 0.12,
 
2399
  [2026-01-07 23:59:46] (step=0002204) Train Loss mse: 0.0000, Train Loss ce: 0.0569, Train Steps/Sec: 0.12,
2400
  [2026-01-07 23:59:54] (step=0002205) Train Loss mse: 0.0000, Train Loss ce: 0.0562, Train Steps/Sec: 0.12,
2401
  [2026-01-08 00:00:02] (step=0002206) Train Loss mse: 0.0000, Train Loss ce: 0.0587, Train Steps/Sec: 0.12,
2402
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step2000
2403
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
2404
+ [eval debug] first 3 batch fingerprints:
2405
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2406
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2407
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2408
+ ce_avg: 0.1155419573187828, mse_avg: 0.0
2409
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step2500
2410
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
2411
+ [eval debug] first 3 batch fingerprints:
2412
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2413
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2414
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2415
+ ce_avg: 0.12304135411977768, mse_avg: 0.0
2416
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step3000
2417
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
2418
+ [eval debug] first 3 batch fingerprints:
2419
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2420
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2421
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
2422
+ ce_avg: 0.05864739045500755, mse_avg: 0.0
2423
  [2026-01-08 00:00:10] (step=0002207) Train Loss mse: 0.0000, Train Loss ce: 0.0568, Train Steps/Sec: 0.12,
2424
  [2026-01-08 00:00:19] (step=0002208) Train Loss mse: 0.0000, Train Loss ce: 0.0573, Train Steps/Sec: 0.12,
2425
  [2026-01-08 00:00:27] (step=0002209) Train Loss mse: 0.0000, Train Loss ce: 0.0578, Train Steps/Sec: 0.12,
 
2484
  [2026-01-08 00:08:33] (step=0002268) Train Loss mse: 0.0000, Train Loss ce: 0.0577, Train Steps/Sec: 0.12,
2485
  [2026-01-08 00:08:41] (step=0002269) Train Loss mse: 0.0000, Train Loss ce: 0.0576, Train Steps/Sec: 0.12,
2486
  [2026-01-08 00:08:49] (step=0002270) Train Loss mse: 0.0000, Train Loss ce: 0.0572, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2487
  [2026-01-08 00:08:58] (step=0002271) Train Loss mse: 0.0000, Train Loss ce: 0.0576, Train Steps/Sec: 0.12,
2488
  [2026-01-08 00:09:06] (step=0002272) Train Loss mse: 0.0000, Train Loss ce: 0.0577, Train Steps/Sec: 0.12,
2489
  [2026-01-08 00:09:14] (step=0002273) Train Loss mse: 0.0000, Train Loss ce: 0.0565, Train Steps/Sec: 0.12,
 
3289
  [2026-01-08 02:01:24] (step=0003070) Train Loss mse: 0.0000, Train Loss ce: 0.0575, Train Steps/Sec: 0.12,
3290
  [2026-01-08 02:01:32] (step=0003071) Train Loss mse: 0.0000, Train Loss ce: 0.0564, Train Steps/Sec: 0.12,
3291
  [2026-01-08 02:01:41] (step=0003072) Train Loss mse: 0.0000, Train Loss ce: 0.0571, Train Steps/Sec: 0.12,
3292
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step3500
3293
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
3294
+ [eval debug] first 3 batch fingerprints:
3295
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3296
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3297
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3298
+ ce_avg: 0.058637309819459915, mse_avg: 0.0
3299
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step4000
3300
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
3301
+ [eval debug] first 3 batch fingerprints:
3302
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3303
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3304
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
3305
+ ce_avg: 0.059058357030153275, mse_avg: 0.0
3306
  [2026-01-08 02:01:49] (step=0003073) Train Loss mse: 0.0000, Train Loss ce: 0.0556, Train Steps/Sec: 0.12,
3307
  [2026-01-08 02:01:57] (step=0003074) Train Loss mse: 0.0000, Train Loss ce: 0.0567, Train Steps/Sec: 0.12,
3308
  [2026-01-08 02:02:05] (step=0003075) Train Loss mse: 0.0000, Train Loss ce: 0.0563, Train Steps/Sec: 0.12,
 
3462
  [2026-01-08 02:23:13] (step=0003229) Train Loss mse: 0.0000, Train Loss ce: 0.0575, Train Steps/Sec: 0.12,
3463
  [2026-01-08 02:23:21] (step=0003230) Train Loss mse: 0.0000, Train Loss ce: 0.0566, Train Steps/Sec: 0.12,
3464
  [2026-01-08 02:23:29] (step=0003231) Train Loss mse: 0.0000, Train Loss ce: 0.0567, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3465
  [2026-01-08 02:23:38] (step=0003232) Train Loss mse: 0.0000, Train Loss ce: 0.0555, Train Steps/Sec: 0.12,
3466
  [2026-01-08 02:23:46] (step=0003233) Train Loss mse: 0.0000, Train Loss ce: 0.0562, Train Steps/Sec: 0.12,
3467
  [2026-01-08 02:23:54] (step=0003234) Train Loss mse: 0.0000, Train Loss ce: 0.0572, Train Steps/Sec: 0.12,
 
4634
  [2026-01-08 05:04:14] (step=0004401) Train Loss mse: 0.0000, Train Loss ce: 0.0541, Train Steps/Sec: 0.12,
4635
  [2026-01-08 05:04:23] (step=0004402) Train Loss mse: 0.0000, Train Loss ce: 0.0553, Train Steps/Sec: 0.12,
4636
  [2026-01-08 05:04:31] (step=0004403) Train Loss mse: 0.0000, Train Loss ce: 0.0548, Train Steps/Sec: 0.12,
4637
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step4500
4638
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
4639
+ [eval debug] first 3 batch fingerprints:
4640
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4641
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4642
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4643
+ ce_avg: 0.05962677672505379, mse_avg: 0.0
4644
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/eval_used_rows, step_tag is checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed_step5000
4645
+ Preparing Dataset vlm_gym_jigsaw_celoss_no_mse_evalonce/vlm_gym_jigsaw_val
4646
+ [eval debug] first 3 batch fingerprints:
4647
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4648
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4649
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_jigsaw_celoss_no_mse_evalonce'}]
4650
+ ce_avg: 0.060032669454813004, mse_avg: 0.0
4651
  [2026-01-08 05:04:39] (step=0004404) Train Loss mse: 0.0000, Train Loss ce: 0.0545, Train Steps/Sec: 0.13,
4652
  [2026-01-08 05:04:47] (step=0004405) Train Loss mse: 0.0000, Train Loss ce: 0.0554, Train Steps/Sec: 0.12,
4653
  [2026-01-08 05:04:55] (step=0004406) Train Loss mse: 0.0000, Train Loss ce: 0.0546, Train Steps/Sec: 0.12,
 
4858
  [2026-01-08 05:33:09] (step=0004611) Train Loss mse: 0.0000, Train Loss ce: 0.0549, Train Steps/Sec: 0.12,
4859
  [2026-01-08 05:33:18] (step=0004612) Train Loss mse: 0.0000, Train Loss ce: 0.0548, Train Steps/Sec: 0.12,
4860
  [2026-01-08 05:33:26] (step=0004613) Train Loss mse: 0.0000, Train Loss ce: 0.0545, Train Steps/Sec: 0.12,
 
 
 
 
 
 
 
4861
  [2026-01-08 05:33:34] (step=0004614) Train Loss mse: 0.0000, Train Loss ce: 0.0549, Train Steps/Sec: 0.12,
4862
  [2026-01-08 05:33:42] (step=0004615) Train Loss mse: 0.0000, Train Loss ce: 0.0540, Train Steps/Sec: 0.12,
4863
  [2026-01-08 05:33:51] (step=0004616) Train Loss mse: 0.0000, Train Loss ce: 0.0555, Train Steps/Sec: 0.12,