Junyi42 commited on
Commit
85f82ad
·
verified ·
1 Parent(s): 37c8558

Upload checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins

Browse files
checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/wandb/offline-run-20260125_170309-vlm_gym_colorization_one_img_lr2e_5_mse_only_ins-run0/files/output.log CHANGED
@@ -1,189 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (time_embedder): FullyShardedDataParallel(
51
- (_fsdp_wrapped_module): TimestepEmbedder(
52
- (mlp): Sequential(
53
- (0): Linear(in_features=256, out_features=3584, bias=True)
54
- (1): SiLU()
55
- (2): Linear(in_features=3584, out_features=3584, bias=True)
56
- )
57
- )
58
- )
59
- (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
60
- (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
61
- (latent_pos_embed): FullyShardedDataParallel(
62
- (_fsdp_wrapped_module): PositionEmbedding()
63
- )
64
- (vit_model): SiglipVisionModel(
65
- (vision_model): FullyShardedDataParallel(
66
- (_fsdp_wrapped_module): SiglipVisionTransformer(
67
- (embeddings): SiglipVisionEmbeddings(
68
- (position_embedding): Embedding(4900, 1152)
69
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
70
- )
71
- (encoder): SiglipEncoder(
72
- (layers): ModuleList(
73
- (0-25): 26 x FullyShardedDataParallel(
74
- (_fsdp_wrapped_module): CheckpointWrapper(
75
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
76
- (self_attn): SiglipFlashAttention2(
77
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
78
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
79
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
80
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
81
- )
82
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
83
- (mlp): SiglipMLP(
84
- (activation_fn): PytorchGELUTanh()
85
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
86
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
87
- )
88
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
89
- )
90
- )
91
- )
92
- )
93
- )
94
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
95
- )
96
- )
97
- )
98
- (connector): FullyShardedDataParallel(
99
- (_fsdp_wrapped_module): CheckpointWrapper(
100
- (_checkpoint_wrapped_module): MLPconnector(
101
- (activation_fn): PytorchGELUTanh()
102
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
103
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
104
- )
105
- )
106
- )
107
- (vit_pos_embed): FullyShardedDataParallel(
108
- (_fsdp_wrapped_module): PositionEmbedding()
109
- )
110
- )
111
- )
112
- _flat_param True
113
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
128
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- time_embedder._fsdp_wrapped_module._flat_param True
142
- latent_pos_embed._fsdp_wrapped_module._flat_param False
143
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
156
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
157
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
158
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
159
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
160
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
161
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
162
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
163
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
164
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
165
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
166
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
167
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
168
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
169
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
170
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
171
- vit_pos_embed._fsdp_wrapped_module._flat_param False
172
- Preparing Dataset vlm_gym_colorization_mse_loss_only/vlm_gym_colorization_train
173
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step0
174
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
175
- [eval debug] first 3 batch fingerprints:
176
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
177
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
178
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
179
- ce_avg: 0.0, mse_avg: 0.05326032266020775
180
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step500
181
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
182
- [eval debug] first 3 batch fingerprints:
183
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
184
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
185
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
186
- ce_avg: 0.0, mse_avg: 0.007997258566319942
187
  wandb: Detected [huggingface_hub.inference] in use.
188
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
189
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -781,4 +595,244 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
781
  [2026-01-25 20:42:14] (step=0000584) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
782
  [2026-01-25 20:42:37] (step=0000585) Train Loss mse: 0.0083, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
783
  [2026-01-25 20:42:58] (step=0000586) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
784
- [2026-01-25 20:43:20] (step=0000587) Train Loss mse: 0.0078, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
595
  [2026-01-25 20:42:14] (step=0000584) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
596
  [2026-01-25 20:42:37] (step=0000585) Train Loss mse: 0.0083, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
597
  [2026-01-25 20:42:58] (step=0000586) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
598
+ [2026-01-25 20:43:20] (step=0000587) Train Loss mse: 0.0078, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
599
+ [2026-01-25 20:43:44] (step=0000588) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
600
+ [2026-01-25 20:44:05] (step=0000589) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
601
+ [2026-01-25 20:44:26] (step=0000590) Train Loss mse: 0.0075, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
602
+ [2026-01-25 20:44:46] (step=0000591) Train Loss mse: 0.0082, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
603
+ [2026-01-25 20:45:10] (step=0000592) Train Loss mse: 0.0075, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
604
+ [2026-01-25 20:45:32] (step=0000593) Train Loss mse: 0.0071, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
605
+ [2026-01-25 20:45:54] (step=0000594) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
606
+ [2026-01-25 20:46:13] (step=0000595) Train Loss mse: 0.0072, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
607
+ [2026-01-25 20:46:33] (step=0000596) Train Loss mse: 0.0071, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
608
+ [2026-01-25 20:46:55] (step=0000597) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
609
+ [2026-01-25 20:47:12] (step=0000598) Train Loss mse: 0.0088, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
610
+ [2026-01-25 20:47:33] (step=0000599) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
611
+ [2026-01-25 20:47:53] (step=0000600) Train Loss mse: 0.0074, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
612
+ [2026-01-25 20:48:14] (step=0000601) Train Loss mse: 0.0085, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
613
+ [2026-01-25 20:48:35] (step=0000602) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
614
+ [2026-01-25 20:48:58] (step=0000603) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
615
+ [2026-01-25 20:49:25] (step=0000604) Train Loss mse: 0.0071, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
616
+ [2026-01-25 20:49:46] (step=0000605) Train Loss mse: 0.0083, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
617
+ [2026-01-25 20:50:07] (step=0000606) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
618
+ [2026-01-25 20:50:28] (step=0000607) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
619
+ [2026-01-25 20:50:47] (step=0000608) Train Loss mse: 0.0075, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
620
+ [2026-01-25 20:51:07] (step=0000609) Train Loss mse: 0.0082, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
621
+ [2026-01-25 20:51:29] (step=0000610) Train Loss mse: 0.0088, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
622
+ [2026-01-25 20:51:49] (step=0000611) Train Loss mse: 0.0083, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
623
+ [2026-01-25 20:52:22] (step=0000612) Train Loss mse: 0.0068, Train Loss ce: 0.0000, Train Steps/Sec: 0.03,
624
+ [2026-01-25 20:52:45] (step=0000613) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
625
+ [2026-01-25 20:53:07] (step=0000614) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
626
+ [2026-01-25 20:53:28] (step=0000615) Train Loss mse: 0.0090, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
627
+ [2026-01-25 20:53:49] (step=0000616) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
628
+ FullyShardedDataParallel(
629
+ (_fsdp_wrapped_module): Bagel(
630
+ (language_model): Qwen2ForCausalLM(
631
+ (model): Qwen2Model(
632
+ (embed_tokens): Embedding(152064, 3584)
633
+ (layers): ModuleList(
634
+ (0-27): 28 x FullyShardedDataParallel(
635
+ (_fsdp_wrapped_module): CheckpointWrapper(
636
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
637
+ (self_attn): PackedAttentionMoT(
638
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
639
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
640
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
641
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
642
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
643
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
644
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
645
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
646
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
647
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
648
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
649
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
650
+ )
651
+ (mlp): Qwen2MLP(
652
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
653
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
654
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
655
+ (act_fn): SiLU()
656
+ )
657
+ (mlp_moe_gen): Qwen2MLP(
658
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
659
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
660
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
661
+ (act_fn): SiLU()
662
+ )
663
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
664
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
665
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
666
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
667
+ )
668
+ )
669
+ )
670
+ )
671
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
672
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
673
+ (rotary_emb): Qwen2RotaryEmbedding()
674
+ )
675
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
676
+ )
677
+ (time_embedder): FullyShardedDataParallel(
678
+ (_fsdp_wrapped_module): TimestepEmbedder(
679
+ (mlp): Sequential(
680
+ (0): Linear(in_features=256, out_features=3584, bias=True)
681
+ (1): SiLU()
682
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
683
+ )
684
+ )
685
+ )
686
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
687
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
688
+ (latent_pos_embed): FullyShardedDataParallel(
689
+ (_fsdp_wrapped_module): PositionEmbedding()
690
+ )
691
+ (vit_model): SiglipVisionModel(
692
+ (vision_model): FullyShardedDataParallel(
693
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
694
+ (embeddings): SiglipVisionEmbeddings(
695
+ (position_embedding): Embedding(4900, 1152)
696
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
697
+ )
698
+ (encoder): SiglipEncoder(
699
+ (layers): ModuleList(
700
+ (0-25): 26 x FullyShardedDataParallel(
701
+ (_fsdp_wrapped_module): CheckpointWrapper(
702
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
703
+ (self_attn): SiglipFlashAttention2(
704
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
705
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
706
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
707
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
708
+ )
709
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
710
+ (mlp): SiglipMLP(
711
+ (activation_fn): PytorchGELUTanh()
712
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
713
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
714
+ )
715
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
716
+ )
717
+ )
718
+ )
719
+ )
720
+ )
721
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
722
+ )
723
+ )
724
+ )
725
+ (connector): FullyShardedDataParallel(
726
+ (_fsdp_wrapped_module): CheckpointWrapper(
727
+ (_checkpoint_wrapped_module): MLPconnector(
728
+ (activation_fn): PytorchGELUTanh()
729
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
730
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
731
+ )
732
+ )
733
+ )
734
+ (vit_pos_embed): FullyShardedDataParallel(
735
+ (_fsdp_wrapped_module): PositionEmbedding()
736
+ )
737
+ )
738
+ )
739
+ _flat_param True
740
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
741
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
742
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
743
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
744
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
745
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
746
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
747
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
748
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
749
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
750
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
751
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
752
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
753
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
754
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
755
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
756
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
757
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
758
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
759
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
760
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
761
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
762
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
763
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
764
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
765
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
766
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
767
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
768
+ time_embedder._fsdp_wrapped_module._flat_param True
769
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
770
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
771
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
772
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
773
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
774
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
775
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
776
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
777
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
778
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
779
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
780
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
781
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
782
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
783
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
784
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
785
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
786
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
787
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
788
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
789
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
790
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
791
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
792
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
793
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
794
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
795
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
796
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
797
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
798
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
799
+ Preparing Dataset vlm_gym_colorization_mse_loss_only/vlm_gym_colorization_train
800
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step0
801
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
802
+ [eval debug] first 3 batch fingerprints:
803
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
804
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
805
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
806
+ ce_avg: 0.0, mse_avg: 0.05326032266020775
807
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step500
808
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
809
+ [eval debug] first 3 batch fingerprints:
810
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
811
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
812
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
813
+ ce_avg: 0.0, mse_avg: 0.007997258566319942
814
+ [2026-01-25 20:54:14] (step=0000617) Train Loss mse: 0.0085, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
815
+ [2026-01-25 20:54:35] (step=0000618) Train Loss mse: 0.0074, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
816
+ [2026-01-25 20:54:54] (step=0000619) Train Loss mse: 0.0086, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
817
+ [2026-01-25 20:55:14] (step=0000620) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
818
+ [2026-01-25 20:55:37] (step=0000621) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
819
+ [2026-01-25 20:55:59] (step=0000622) Train Loss mse: 0.0078, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
820
+ [2026-01-25 20:56:22] (step=0000623) Train Loss mse: 0.0089, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
821
+ [2026-01-25 20:56:45] (step=0000624) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
822
+ [2026-01-25 20:57:06] (step=0000625) Train Loss mse: 0.0091, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
823
+ [2026-01-25 20:57:30] (step=0000626) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
824
+ [2026-01-25 20:57:51] (step=0000627) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
825
+ [2026-01-25 20:58:14] (step=0000628) Train Loss mse: 0.0086, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
826
+ [2026-01-25 20:58:34] (step=0000629) Train Loss mse: 0.0078, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
827
+ [2026-01-25 20:58:58] (step=0000630) Train Loss mse: 0.0082, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
828
+ [2026-01-25 20:59:18] (step=0000631) Train Loss mse: 0.0086, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
829
+ [2026-01-25 20:59:40] (step=0000632) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
830
+ [2026-01-25 20:59:59] (step=0000633) Train Loss mse: 0.0090, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
831
+ [2026-01-25 21:00:21] (step=0000634) Train Loss mse: 0.0082, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
832
+ [2026-01-25 21:00:44] (step=0000635) Train Loss mse: 0.0091, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
833
+ [2026-01-25 21:01:04] (step=0000636) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
834
+ [2026-01-25 21:01:25] (step=0000637) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
835
+ [2026-01-25 21:01:50] (step=0000638) Train Loss mse: 0.0064, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
836
+ [2026-01-25 21:02:12] (step=0000639) Train Loss mse: 0.0084, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
837
+ [2026-01-25 21:02:34] (step=0000640) Train Loss mse: 0.0087, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
838
+ [2026-01-25 21:02:56] (step=0000641) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/wandb/offline-run-20260125_170309-vlm_gym_colorization_one_img_lr2e_5_mse_only_ins-run0/files/wandb-summary.json CHANGED
@@ -1 +1 @@
1
- {"_runtime": 13210.631898663, "lr": 1.981747237555166e-05, "total_ce_tokens": 0, "total_norm": 0.12721827626228333, "total_samples": 16, "eval/ce": 0, "ce": 0, "total_mse_tokens": 87744, "mem_allocated": 59549.859375, "mem_cache": 77350, "_step": 586, "eval/mse": 0.007997258566319942, "_timestamp": 1769373778.5024066, "mse": 0.0079260915517807}
 
1
+ {"_runtime": 14386.023717632, "lr": 1.9742649624205542e-05, "total_ce_tokens": 0, "total_norm": 0.13556422293186188, "total_samples": 16, "eval/ce": 0, "ce": 0, "total_mse_tokens": 76256, "mem_allocated": 59549.859375, "mem_cache": 77430, "_step": 640, "eval/mse": 0.007997258566319942, "_timestamp": 1769374954.148881, "mse": 0.008700057864189148}
checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/wandb/offline-run-20260125_170309-vlm_gym_colorization_one_img_lr2e_5_mse_only_ins-run0/run-vlm_gym_colorization_one_img_lr2e_5_mse_only_ins-run0.wandb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4281b748958d71bb7fd2acb4ba10fe4a6c796f5027202b05e70bbd8ef2e16891
3
- size 4030464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d2a6ef470a0f28027b3f864fd146368428ba578fc86e600d4d011f46973322a
3
+ size 4358144