Junyi42 commited on
Commit
532998d
·
verified ·
1 Parent(s): 0947539

Upload checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins

Browse files
checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/wandb/offline-run-20260125_170309-vlm_gym_colorization_one_img_lr2e_5_mse_only_ins-run0/files/output.log CHANGED
@@ -1,189 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (time_embedder): FullyShardedDataParallel(
51
- (_fsdp_wrapped_module): TimestepEmbedder(
52
- (mlp): Sequential(
53
- (0): Linear(in_features=256, out_features=3584, bias=True)
54
- (1): SiLU()
55
- (2): Linear(in_features=3584, out_features=3584, bias=True)
56
- )
57
- )
58
- )
59
- (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
60
- (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
61
- (latent_pos_embed): FullyShardedDataParallel(
62
- (_fsdp_wrapped_module): PositionEmbedding()
63
- )
64
- (vit_model): SiglipVisionModel(
65
- (vision_model): FullyShardedDataParallel(
66
- (_fsdp_wrapped_module): SiglipVisionTransformer(
67
- (embeddings): SiglipVisionEmbeddings(
68
- (position_embedding): Embedding(4900, 1152)
69
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
70
- )
71
- (encoder): SiglipEncoder(
72
- (layers): ModuleList(
73
- (0-25): 26 x FullyShardedDataParallel(
74
- (_fsdp_wrapped_module): CheckpointWrapper(
75
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
76
- (self_attn): SiglipFlashAttention2(
77
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
78
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
79
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
80
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
81
- )
82
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
83
- (mlp): SiglipMLP(
84
- (activation_fn): PytorchGELUTanh()
85
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
86
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
87
- )
88
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
89
- )
90
- )
91
- )
92
- )
93
- )
94
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
95
- )
96
- )
97
- )
98
- (connector): FullyShardedDataParallel(
99
- (_fsdp_wrapped_module): CheckpointWrapper(
100
- (_checkpoint_wrapped_module): MLPconnector(
101
- (activation_fn): PytorchGELUTanh()
102
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
103
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
104
- )
105
- )
106
- )
107
- (vit_pos_embed): FullyShardedDataParallel(
108
- (_fsdp_wrapped_module): PositionEmbedding()
109
- )
110
- )
111
- )
112
- _flat_param True
113
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
128
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- time_embedder._fsdp_wrapped_module._flat_param True
142
- latent_pos_embed._fsdp_wrapped_module._flat_param False
143
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
156
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
157
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
158
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
159
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
160
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
161
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
162
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
163
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
164
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
165
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
166
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
167
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
168
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
169
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
170
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
171
- vit_pos_embed._fsdp_wrapped_module._flat_param False
172
- Preparing Dataset vlm_gym_colorization_mse_loss_only/vlm_gym_colorization_train
173
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step0
174
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
175
- [eval debug] first 3 batch fingerprints:
176
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
177
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
178
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
179
- ce_avg: 0.0, mse_avg: 0.05326032266020775
180
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step500
181
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
182
- [eval debug] first 3 batch fingerprints:
183
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
184
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
185
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
186
- ce_avg: 0.0, mse_avg: 0.007997258566319942
187
  wandb: Detected [huggingface_hub.inference] in use.
188
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
189
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -920,20 +734,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
920
  [2026-01-25 21:32:29] (step=0000723) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
921
  [2026-01-25 21:32:54] (step=0000724) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
922
  [2026-01-25 21:33:17] (step=0000725) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
923
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step1000
924
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
925
- [eval debug] first 3 batch fingerprints:
926
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
927
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
928
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
929
- ce_avg: 0.0, mse_avg: 0.007652191445231438
930
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step1500
931
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
932
- [eval debug] first 3 batch fingerprints:
933
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
934
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
935
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
936
- ce_avg: 0.0, mse_avg: 0.00800316222012043
937
  [2026-01-25 21:33:38] (step=0000726) Train Loss mse: 0.0078, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
938
  [2026-01-25 21:33:57] (step=0000727) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
939
  [2026-01-25 21:34:18] (step=0000728) Train Loss mse: 0.0087, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -967,6 +767,192 @@ ce_avg: 0.0, mse_avg: 0.00800316222012043
967
  [2026-01-25 21:44:25] (step=0000756) Train Loss mse: 0.0091, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
968
  [2026-01-25 21:44:45] (step=0000757) Train Loss mse: 0.0084, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
969
  [2026-01-25 21:45:11] (step=0000758) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
  [2026-01-25 21:45:29] (step=0000759) Train Loss mse: 0.0082, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
971
  [2026-01-25 21:45:51] (step=0000760) Train Loss mse: 0.0091, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
972
  [2026-01-25 21:46:13] (step=0000761) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -1963,20 +1949,6 @@ ce_avg: 0.0, mse_avg: 0.00800316222012043
1963
  [2026-01-26 03:41:23] (step=0001752) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
1964
  [2026-01-26 03:41:43] (step=0001753) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
1965
  [2026-01-26 03:42:05] (step=0001754) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
1966
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step2000
1967
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
1968
- [eval debug] first 3 batch fingerprints:
1969
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
1970
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
1971
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
1972
- ce_avg: 0.0, mse_avg: 0.0081106498837471
1973
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step2500
1974
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
1975
- [eval debug] first 3 batch fingerprints:
1976
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
1977
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
1978
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
1979
- ce_avg: 0.0, mse_avg: 0.007652428932487965
1980
  [2026-01-26 03:42:23] (step=0001755) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
1981
  [2026-01-26 03:42:41] (step=0001756) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1982
  [2026-01-26 03:43:02] (step=0001757) Train Loss mse: 0.0065, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -2043,6 +2015,34 @@ ce_avg: 0.0, mse_avg: 0.007652428932487965
2043
  [2026-01-26 04:04:48] (step=0001818) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
2044
  [2026-01-26 04:05:17] (step=0001819) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2045
  [2026-01-26 04:05:40] (step=0001820) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2046
  [2026-01-26 04:05:59] (step=0001821) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
2047
  [2026-01-26 04:06:23] (step=0001822) Train Loss mse: 0.0059, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2048
  [2026-01-26 04:06:47] (step=0001823) Train Loss mse: 0.0062, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
@@ -2988,20 +2988,6 @@ ce_avg: 0.0, mse_avg: 0.007652428932487965
2988
  [2026-01-26 09:44:17] (step=0002763) Train Loss mse: 0.0071, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
2989
  [2026-01-26 09:44:39] (step=0002764) Train Loss mse: 0.0068, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2990
  [2026-01-26 09:44:56] (step=0002765) Train Loss mse: 0.0072, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
2991
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step3000
2992
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
2993
- [eval debug] first 3 batch fingerprints:
2994
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2995
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2996
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2997
- ce_avg: 0.0, mse_avg: 0.007834003306925297
2998
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step3500
2999
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
3000
- [eval debug] first 3 batch fingerprints:
3001
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3002
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3003
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3004
- ce_avg: 0.0, mse_avg: 0.007766008842736483
3005
  [2026-01-26 09:45:20] (step=0002766) Train Loss mse: 0.0084, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3006
  [2026-01-26 09:45:43] (step=0002767) Train Loss mse: 0.0068, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3007
  [2026-01-26 09:46:03] (step=0002768) Train Loss mse: 0.0074, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -3098,6 +3084,20 @@ ce_avg: 0.0, mse_avg: 0.007766008842736483
3098
  [2026-01-26 10:18:28] (step=0002859) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3099
  [2026-01-26 10:18:49] (step=0002860) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3100
  [2026-01-26 10:19:12] (step=0002861) Train Loss mse: 0.0067, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3101
  [2026-01-26 10:19:31] (step=0002862) Train Loss mse: 0.0065, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3102
  [2026-01-26 10:19:53] (step=0002863) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3103
  [2026-01-26 10:20:13] (step=0002864) Train Loss mse: 0.0084, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -3740,20 +3740,6 @@ ce_avg: 0.0, mse_avg: 0.007766008842736483
3740
  [2026-01-26 14:09:22] (step=0003501) Train Loss mse: 0.0066, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3741
  [2026-01-26 14:09:44] (step=0003502) Train Loss mse: 0.0062, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3742
  [2026-01-26 14:10:07] (step=0003503) Train Loss mse: 0.0075, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3743
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step4000
3744
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
3745
- [eval debug] first 3 batch fingerprints:
3746
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3747
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3748
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3749
- ce_avg: 0.0, mse_avg: 0.007558991201221943
3750
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step4500
3751
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
3752
- [eval debug] first 3 batch fingerprints:
3753
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3754
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3755
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3756
- ce_avg: 0.0, mse_avg: 0.007897508330643177
3757
  [2026-01-26 14:10:28] (step=0003504) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3758
  [2026-01-26 14:10:53] (step=0003505) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3759
  [2026-01-26 14:11:12] (step=0003506) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -3870,6 +3856,20 @@ ce_avg: 0.0, mse_avg: 0.007897508330643177
3870
  [2026-01-26 14:51:12] (step=0003617) Train Loss mse: 0.0059, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3871
  [2026-01-26 14:51:33] (step=0003618) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3872
  [2026-01-26 14:51:54] (step=0003619) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3873
  [2026-01-26 14:52:15] (step=0003620) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3874
  [2026-01-26 14:52:35] (step=0003621) Train Loss mse: 0.0074, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3875
  [2026-01-26 14:52:53] (step=0003622) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -4848,13 +4848,6 @@ ce_avg: 0.0, mse_avg: 0.007897508330643177
4848
  [2026-01-26 20:43:22] (step=0004595) Train Loss mse: 0.0062, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4849
  [2026-01-26 20:43:40] (step=0004596) Train Loss mse: 0.0086, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4850
  [2026-01-26 20:44:01] (step=0004597) Train Loss mse: 0.0064, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4851
- base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step5000
4852
- Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
4853
- [eval debug] first 3 batch fingerprints:
4854
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
4855
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
4856
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
4857
- ce_avg: 0.0, mse_avg: 0.007832281291484833
4858
  [2026-01-26 20:44:23] (step=0004598) Train Loss mse: 0.0061, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4859
  [2026-01-26 20:44:48] (step=0004599) Train Loss mse: 0.0056, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4860
  [2026-01-26 20:45:10] (step=0004600) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
@@ -4951,6 +4944,13 @@ ce_avg: 0.0, mse_avg: 0.007832281291484833
4951
  [2026-01-26 21:17:01] (step=0004691) Train Loss mse: 0.0067, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4952
  [2026-01-26 21:17:21] (step=0004692) Train Loss mse: 0.0072, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4953
  [2026-01-26 21:17:43] (step=0004693) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
4954
  [2026-01-26 21:18:07] (step=0004694) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4955
  [2026-01-26 21:18:31] (step=0004695) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4956
  [2026-01-26 21:18:52] (step=0004696) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
734
  [2026-01-25 21:32:29] (step=0000723) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
735
  [2026-01-25 21:32:54] (step=0000724) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
736
  [2026-01-25 21:33:17] (step=0000725) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  [2026-01-25 21:33:38] (step=0000726) Train Loss mse: 0.0078, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
738
  [2026-01-25 21:33:57] (step=0000727) Train Loss mse: 0.0079, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
739
  [2026-01-25 21:34:18] (step=0000728) Train Loss mse: 0.0087, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
767
  [2026-01-25 21:44:25] (step=0000756) Train Loss mse: 0.0091, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
768
  [2026-01-25 21:44:45] (step=0000757) Train Loss mse: 0.0084, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
769
  [2026-01-25 21:45:11] (step=0000758) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
770
+ FullyShardedDataParallel(
771
+ (_fsdp_wrapped_module): Bagel(
772
+ (language_model): Qwen2ForCausalLM(
773
+ (model): Qwen2Model(
774
+ (embed_tokens): Embedding(152064, 3584)
775
+ (layers): ModuleList(
776
+ (0-27): 28 x FullyShardedDataParallel(
777
+ (_fsdp_wrapped_module): CheckpointWrapper(
778
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
779
+ (self_attn): PackedAttentionMoT(
780
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
781
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
782
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
783
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
784
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
785
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
786
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
787
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
788
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
789
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
790
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
791
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
792
+ )
793
+ (mlp): Qwen2MLP(
794
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
795
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
796
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
797
+ (act_fn): SiLU()
798
+ )
799
+ (mlp_moe_gen): Qwen2MLP(
800
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
801
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
802
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
803
+ (act_fn): SiLU()
804
+ )
805
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
806
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
807
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
808
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
809
+ )
810
+ )
811
+ )
812
+ )
813
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
814
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
815
+ (rotary_emb): Qwen2RotaryEmbedding()
816
+ )
817
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
818
+ )
819
+ (time_embedder): FullyShardedDataParallel(
820
+ (_fsdp_wrapped_module): TimestepEmbedder(
821
+ (mlp): Sequential(
822
+ (0): Linear(in_features=256, out_features=3584, bias=True)
823
+ (1): SiLU()
824
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
825
+ )
826
+ )
827
+ )
828
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
829
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
830
+ (latent_pos_embed): FullyShardedDataParallel(
831
+ (_fsdp_wrapped_module): PositionEmbedding()
832
+ )
833
+ (vit_model): SiglipVisionModel(
834
+ (vision_model): FullyShardedDataParallel(
835
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
836
+ (embeddings): SiglipVisionEmbeddings(
837
+ (position_embedding): Embedding(4900, 1152)
838
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
839
+ )
840
+ (encoder): SiglipEncoder(
841
+ (layers): ModuleList(
842
+ (0-25): 26 x FullyShardedDataParallel(
843
+ (_fsdp_wrapped_module): CheckpointWrapper(
844
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
845
+ (self_attn): SiglipFlashAttention2(
846
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
847
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
848
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
849
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
850
+ )
851
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
852
+ (mlp): SiglipMLP(
853
+ (activation_fn): PytorchGELUTanh()
854
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
855
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
856
+ )
857
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
858
+ )
859
+ )
860
+ )
861
+ )
862
+ )
863
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
864
+ )
865
+ )
866
+ )
867
+ (connector): FullyShardedDataParallel(
868
+ (_fsdp_wrapped_module): CheckpointWrapper(
869
+ (_checkpoint_wrapped_module): MLPconnector(
870
+ (activation_fn): PytorchGELUTanh()
871
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
872
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
873
+ )
874
+ )
875
+ )
876
+ (vit_pos_embed): FullyShardedDataParallel(
877
+ (_fsdp_wrapped_module): PositionEmbedding()
878
+ )
879
+ )
880
+ )
881
+ _flat_param True
882
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
883
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
884
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
885
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
886
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
887
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
888
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
889
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
890
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
891
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
892
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
893
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
894
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
895
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
896
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
897
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
898
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
899
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
900
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
901
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
902
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
903
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
904
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
905
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
906
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
907
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
908
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
909
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
910
+ time_embedder._fsdp_wrapped_module._flat_param True
911
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
912
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
913
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
914
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
915
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
916
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
917
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
918
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
919
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
920
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
921
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
922
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
923
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
924
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
925
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
926
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
927
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
928
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
929
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
930
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
931
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
932
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
933
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
934
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
935
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
936
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
937
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
938
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
939
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
940
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
941
+ Preparing Dataset vlm_gym_colorization_mse_loss_only/vlm_gym_colorization_train
942
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step0
943
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
944
+ [eval debug] first 3 batch fingerprints:
945
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
946
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
947
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
948
+ ce_avg: 0.0, mse_avg: 0.05326032266020775
949
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step500
950
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
951
+ [eval debug] first 3 batch fingerprints:
952
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
953
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
954
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
955
+ ce_avg: 0.0, mse_avg: 0.007997258566319942
956
  [2026-01-25 21:45:29] (step=0000759) Train Loss mse: 0.0082, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
957
  [2026-01-25 21:45:51] (step=0000760) Train Loss mse: 0.0091, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
958
  [2026-01-25 21:46:13] (step=0000761) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
1949
  [2026-01-26 03:41:23] (step=0001752) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
1950
  [2026-01-26 03:41:43] (step=0001753) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
1951
  [2026-01-26 03:42:05] (step=0001754) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1952
  [2026-01-26 03:42:23] (step=0001755) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
1953
  [2026-01-26 03:42:41] (step=0001756) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1954
  [2026-01-26 03:43:02] (step=0001757) Train Loss mse: 0.0065, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
2015
  [2026-01-26 04:04:48] (step=0001818) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
2016
  [2026-01-26 04:05:17] (step=0001819) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2017
  [2026-01-26 04:05:40] (step=0001820) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2018
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step1000
2019
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
2020
+ [eval debug] first 3 batch fingerprints:
2021
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2022
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2023
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2024
+ ce_avg: 0.0, mse_avg: 0.007652191445231438
2025
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step1500
2026
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
2027
+ [eval debug] first 3 batch fingerprints:
2028
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2029
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2030
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2031
+ ce_avg: 0.0, mse_avg: 0.00800316222012043
2032
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step2000
2033
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
2034
+ [eval debug] first 3 batch fingerprints:
2035
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2036
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2037
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2038
+ ce_avg: 0.0, mse_avg: 0.0081106498837471
2039
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step2500
2040
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
2041
+ [eval debug] first 3 batch fingerprints:
2042
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2043
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2044
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
2045
+ ce_avg: 0.0, mse_avg: 0.007652428932487965
2046
  [2026-01-26 04:05:59] (step=0001821) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
2047
  [2026-01-26 04:06:23] (step=0001822) Train Loss mse: 0.0059, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2048
  [2026-01-26 04:06:47] (step=0001823) Train Loss mse: 0.0062, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
2988
  [2026-01-26 09:44:17] (step=0002763) Train Loss mse: 0.0071, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
2989
  [2026-01-26 09:44:39] (step=0002764) Train Loss mse: 0.0068, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2990
  [2026-01-26 09:44:56] (step=0002765) Train Loss mse: 0.0072, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2991
  [2026-01-26 09:45:20] (step=0002766) Train Loss mse: 0.0084, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2992
  [2026-01-26 09:45:43] (step=0002767) Train Loss mse: 0.0068, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
2993
  [2026-01-26 09:46:03] (step=0002768) Train Loss mse: 0.0074, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
3084
  [2026-01-26 10:18:28] (step=0002859) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3085
  [2026-01-26 10:18:49] (step=0002860) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3086
  [2026-01-26 10:19:12] (step=0002861) Train Loss mse: 0.0067, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3087
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step3000
3088
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
3089
+ [eval debug] first 3 batch fingerprints:
3090
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3091
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3092
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3093
+ ce_avg: 0.0, mse_avg: 0.007834003306925297
3094
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step3500
3095
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
3096
+ [eval debug] first 3 batch fingerprints:
3097
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3098
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3099
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3100
+ ce_avg: 0.0, mse_avg: 0.007766008842736483
3101
  [2026-01-26 10:19:31] (step=0002862) Train Loss mse: 0.0065, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3102
  [2026-01-26 10:19:53] (step=0002863) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3103
  [2026-01-26 10:20:13] (step=0002864) Train Loss mse: 0.0084, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
3740
  [2026-01-26 14:09:22] (step=0003501) Train Loss mse: 0.0066, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3741
  [2026-01-26 14:09:44] (step=0003502) Train Loss mse: 0.0062, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3742
  [2026-01-26 14:10:07] (step=0003503) Train Loss mse: 0.0075, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3743
  [2026-01-26 14:10:28] (step=0003504) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3744
  [2026-01-26 14:10:53] (step=0003505) Train Loss mse: 0.0080, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
3745
  [2026-01-26 14:11:12] (step=0003506) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
3856
  [2026-01-26 14:51:12] (step=0003617) Train Loss mse: 0.0059, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3857
  [2026-01-26 14:51:33] (step=0003618) Train Loss mse: 0.0073, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3858
  [2026-01-26 14:51:54] (step=0003619) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3859
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step4000
3860
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
3861
+ [eval debug] first 3 batch fingerprints:
3862
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3863
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3864
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3865
+ ce_avg: 0.0, mse_avg: 0.007558991201221943
3866
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step4500
3867
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
3868
+ [eval debug] first 3 batch fingerprints:
3869
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3870
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3871
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
3872
+ ce_avg: 0.0, mse_avg: 0.007897508330643177
3873
  [2026-01-26 14:52:15] (step=0003620) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3874
  [2026-01-26 14:52:35] (step=0003621) Train Loss mse: 0.0074, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
3875
  [2026-01-26 14:52:53] (step=0003622) Train Loss mse: 0.0076, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
4848
  [2026-01-26 20:43:22] (step=0004595) Train Loss mse: 0.0062, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4849
  [2026-01-26 20:43:40] (step=0004596) Train Loss mse: 0.0086, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4850
  [2026-01-26 20:44:01] (step=0004597) Train Loss mse: 0.0064, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
 
 
 
 
 
 
4851
  [2026-01-26 20:44:23] (step=0004598) Train Loss mse: 0.0061, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4852
  [2026-01-26 20:44:48] (step=0004599) Train Loss mse: 0.0056, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4853
  [2026-01-26 20:45:10] (step=0004600) Train Loss mse: 0.0070, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
 
4944
  [2026-01-26 21:17:01] (step=0004691) Train Loss mse: 0.0067, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4945
  [2026-01-26 21:17:21] (step=0004692) Train Loss mse: 0.0072, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,
4946
  [2026-01-26 21:17:43] (step=0004693) Train Loss mse: 0.0069, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4947
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_colorization_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_colorization_one_img_lr2e_5_mse_only_ins_step5000
4948
+ Preparing Dataset vlm_gym_colorization_mse_loss_only_evalonce/vlm_gym_colorization_val
4949
+ [eval debug] first 3 batch fingerprints:
4950
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
4951
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
4952
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_colorization_mse_loss_only_evalonce'}]
4953
+ ce_avg: 0.0, mse_avg: 0.007832281291484833
4954
  [2026-01-26 21:18:07] (step=0004694) Train Loss mse: 0.0077, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4955
  [2026-01-26 21:18:31] (step=0004695) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.04,
4956
  [2026-01-26 21:18:52] (step=0004696) Train Loss mse: 0.0081, Train Loss ce: 0.0000, Train Steps/Sec: 0.05,