Training checkpoint at step 12000
Browse files- trainer_state.json +186 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4148,6 +4148,186 @@
|
|
| 4148 |
"eval_samples_per_second": 2.464,
|
| 4149 |
"eval_steps_per_second": 1.232,
|
| 4150 |
"step": 11500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4151 |
}
|
| 4152 |
],
|
| 4153 |
"logging_steps": 25,
|
|
@@ -4167,7 +4347,7 @@
|
|
| 4167 |
"attributes": {}
|
| 4168 |
}
|
| 4169 |
},
|
| 4170 |
-
"total_flos": 2.
|
| 4171 |
"train_batch_size": 1,
|
| 4172 |
"trial_name": null,
|
| 4173 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 12000,
|
| 3 |
+
"best_metric": 2.538311243057251,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-12000",
|
| 5 |
+
"epoch": 0.24,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 12000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4148 |
"eval_samples_per_second": 2.464,
|
| 4149 |
"eval_steps_per_second": 1.232,
|
| 4150 |
"step": 11500
|
| 4151 |
+
},
|
| 4152 |
+
{
|
| 4153 |
+
"epoch": 0.2305,
|
| 4154 |
+
"grad_norm": 2.155444422697904,
|
| 4155 |
+
"learning_rate": 8.550222222222223e-06,
|
| 4156 |
+
"loss": 2.543,
|
| 4157 |
+
"step": 11525
|
| 4158 |
+
},
|
| 4159 |
+
{
|
| 4160 |
+
"epoch": 0.231,
|
| 4161 |
+
"grad_norm": 2.5216609928964706,
|
| 4162 |
+
"learning_rate": 8.544666666666668e-06,
|
| 4163 |
+
"loss": 2.5339,
|
| 4164 |
+
"step": 11550
|
| 4165 |
+
},
|
| 4166 |
+
{
|
| 4167 |
+
"epoch": 0.2315,
|
| 4168 |
+
"grad_norm": 3.2141643729123826,
|
| 4169 |
+
"learning_rate": 8.539111111111112e-06,
|
| 4170 |
+
"loss": 2.5311,
|
| 4171 |
+
"step": 11575
|
| 4172 |
+
},
|
| 4173 |
+
{
|
| 4174 |
+
"epoch": 0.232,
|
| 4175 |
+
"grad_norm": 2.779033714093245,
|
| 4176 |
+
"learning_rate": 8.533555555555557e-06,
|
| 4177 |
+
"loss": 2.5367,
|
| 4178 |
+
"step": 11600
|
| 4179 |
+
},
|
| 4180 |
+
{
|
| 4181 |
+
"epoch": 0.232,
|
| 4182 |
+
"eval_loss": 2.539663553237915,
|
| 4183 |
+
"eval_runtime": 42.1104,
|
| 4184 |
+
"eval_samples_per_second": 2.47,
|
| 4185 |
+
"eval_steps_per_second": 1.235,
|
| 4186 |
+
"step": 11600
|
| 4187 |
+
},
|
| 4188 |
+
{
|
| 4189 |
+
"epoch": 0.2325,
|
| 4190 |
+
"grad_norm": 2.0599049344871134,
|
| 4191 |
+
"learning_rate": 8.528e-06,
|
| 4192 |
+
"loss": 2.5406,
|
| 4193 |
+
"step": 11625
|
| 4194 |
+
},
|
| 4195 |
+
{
|
| 4196 |
+
"epoch": 0.233,
|
| 4197 |
+
"grad_norm": 2.1617162796171536,
|
| 4198 |
+
"learning_rate": 8.522444444444446e-06,
|
| 4199 |
+
"loss": 2.5244,
|
| 4200 |
+
"step": 11650
|
| 4201 |
+
},
|
| 4202 |
+
{
|
| 4203 |
+
"epoch": 0.2335,
|
| 4204 |
+
"grad_norm": 2.4286224889340926,
|
| 4205 |
+
"learning_rate": 8.51688888888889e-06,
|
| 4206 |
+
"loss": 2.5364,
|
| 4207 |
+
"step": 11675
|
| 4208 |
+
},
|
| 4209 |
+
{
|
| 4210 |
+
"epoch": 0.234,
|
| 4211 |
+
"grad_norm": 2.0435359432545424,
|
| 4212 |
+
"learning_rate": 8.511333333333334e-06,
|
| 4213 |
+
"loss": 2.5332,
|
| 4214 |
+
"step": 11700
|
| 4215 |
+
},
|
| 4216 |
+
{
|
| 4217 |
+
"epoch": 0.234,
|
| 4218 |
+
"eval_loss": 2.539963960647583,
|
| 4219 |
+
"eval_runtime": 42.1502,
|
| 4220 |
+
"eval_samples_per_second": 2.467,
|
| 4221 |
+
"eval_steps_per_second": 1.234,
|
| 4222 |
+
"step": 11700
|
| 4223 |
+
},
|
| 4224 |
+
{
|
| 4225 |
+
"epoch": 0.2345,
|
| 4226 |
+
"grad_norm": 2.6031764141012195,
|
| 4227 |
+
"learning_rate": 8.505777777777778e-06,
|
| 4228 |
+
"loss": 2.5292,
|
| 4229 |
+
"step": 11725
|
| 4230 |
+
},
|
| 4231 |
+
{
|
| 4232 |
+
"epoch": 0.235,
|
| 4233 |
+
"grad_norm": 2.2484621657042427,
|
| 4234 |
+
"learning_rate": 8.500222222222223e-06,
|
| 4235 |
+
"loss": 2.523,
|
| 4236 |
+
"step": 11750
|
| 4237 |
+
},
|
| 4238 |
+
{
|
| 4239 |
+
"epoch": 0.2355,
|
| 4240 |
+
"grad_norm": 2.854177673999505,
|
| 4241 |
+
"learning_rate": 8.494666666666668e-06,
|
| 4242 |
+
"loss": 2.5218,
|
| 4243 |
+
"step": 11775
|
| 4244 |
+
},
|
| 4245 |
+
{
|
| 4246 |
+
"epoch": 0.236,
|
| 4247 |
+
"grad_norm": 2.0770100967771055,
|
| 4248 |
+
"learning_rate": 8.489111111111112e-06,
|
| 4249 |
+
"loss": 2.534,
|
| 4250 |
+
"step": 11800
|
| 4251 |
+
},
|
| 4252 |
+
{
|
| 4253 |
+
"epoch": 0.236,
|
| 4254 |
+
"eval_loss": 2.538536548614502,
|
| 4255 |
+
"eval_runtime": 42.3875,
|
| 4256 |
+
"eval_samples_per_second": 2.454,
|
| 4257 |
+
"eval_steps_per_second": 1.227,
|
| 4258 |
+
"step": 11800
|
| 4259 |
+
},
|
| 4260 |
+
{
|
| 4261 |
+
"epoch": 0.2365,
|
| 4262 |
+
"grad_norm": 2.391823444522325,
|
| 4263 |
+
"learning_rate": 8.483555555555556e-06,
|
| 4264 |
+
"loss": 2.5211,
|
| 4265 |
+
"step": 11825
|
| 4266 |
+
},
|
| 4267 |
+
{
|
| 4268 |
+
"epoch": 0.237,
|
| 4269 |
+
"grad_norm": 2.333238897849914,
|
| 4270 |
+
"learning_rate": 8.478e-06,
|
| 4271 |
+
"loss": 2.5238,
|
| 4272 |
+
"step": 11850
|
| 4273 |
+
},
|
| 4274 |
+
{
|
| 4275 |
+
"epoch": 0.2375,
|
| 4276 |
+
"grad_norm": 2.1636671466235256,
|
| 4277 |
+
"learning_rate": 8.472444444444446e-06,
|
| 4278 |
+
"loss": 2.5378,
|
| 4279 |
+
"step": 11875
|
| 4280 |
+
},
|
| 4281 |
+
{
|
| 4282 |
+
"epoch": 0.238,
|
| 4283 |
+
"grad_norm": 2.5877564973697607,
|
| 4284 |
+
"learning_rate": 8.46688888888889e-06,
|
| 4285 |
+
"loss": 2.5415,
|
| 4286 |
+
"step": 11900
|
| 4287 |
+
},
|
| 4288 |
+
{
|
| 4289 |
+
"epoch": 0.238,
|
| 4290 |
+
"eval_loss": 2.538837194442749,
|
| 4291 |
+
"eval_runtime": 42.2059,
|
| 4292 |
+
"eval_samples_per_second": 2.464,
|
| 4293 |
+
"eval_steps_per_second": 1.232,
|
| 4294 |
+
"step": 11900
|
| 4295 |
+
},
|
| 4296 |
+
{
|
| 4297 |
+
"epoch": 0.2385,
|
| 4298 |
+
"grad_norm": 2.1416643296031785,
|
| 4299 |
+
"learning_rate": 8.461333333333333e-06,
|
| 4300 |
+
"loss": 2.525,
|
| 4301 |
+
"step": 11925
|
| 4302 |
+
},
|
| 4303 |
+
{
|
| 4304 |
+
"epoch": 0.239,
|
| 4305 |
+
"grad_norm": 2.213813959028046,
|
| 4306 |
+
"learning_rate": 8.455777777777778e-06,
|
| 4307 |
+
"loss": 2.5416,
|
| 4308 |
+
"step": 11950
|
| 4309 |
+
},
|
| 4310 |
+
{
|
| 4311 |
+
"epoch": 0.2395,
|
| 4312 |
+
"grad_norm": 2.759854381361929,
|
| 4313 |
+
"learning_rate": 8.450222222222224e-06,
|
| 4314 |
+
"loss": 2.5355,
|
| 4315 |
+
"step": 11975
|
| 4316 |
+
},
|
| 4317 |
+
{
|
| 4318 |
+
"epoch": 0.24,
|
| 4319 |
+
"grad_norm": 2.050520488248713,
|
| 4320 |
+
"learning_rate": 8.444666666666667e-06,
|
| 4321 |
+
"loss": 2.5263,
|
| 4322 |
+
"step": 12000
|
| 4323 |
+
},
|
| 4324 |
+
{
|
| 4325 |
+
"epoch": 0.24,
|
| 4326 |
+
"eval_loss": 2.538311243057251,
|
| 4327 |
+
"eval_runtime": 42.2256,
|
| 4328 |
+
"eval_samples_per_second": 2.463,
|
| 4329 |
+
"eval_steps_per_second": 1.231,
|
| 4330 |
+
"step": 12000
|
| 4331 |
}
|
| 4332 |
],
|
| 4333 |
"logging_steps": 25,
|
|
|
|
| 4347 |
"attributes": {}
|
| 4348 |
}
|
| 4349 |
},
|
| 4350 |
+
"total_flos": 2.6930632229499437e+19,
|
| 4351 |
"train_batch_size": 1,
|
| 4352 |
"trial_name": null,
|
| 4353 |
"trial_params": null
|