irodkin commited on
Commit
705e8a1
·
verified ·
1 Parent(s): aff0a04

Training checkpoint at step 12000

Browse files
Files changed (1) hide show
  1. trainer_state.json +186 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 11500,
3
- "best_metric": 2.541316032409668,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-11500",
5
- "epoch": 0.23,
6
  "eval_steps": 100,
7
- "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4148,6 +4148,186 @@
4148
  "eval_samples_per_second": 2.464,
4149
  "eval_steps_per_second": 1.232,
4150
  "step": 11500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4151
  }
4152
  ],
4153
  "logging_steps": 25,
@@ -4167,7 +4347,7 @@
4167
  "attributes": {}
4168
  }
4169
  },
4170
- "total_flos": 2.5808522545396187e+19,
4171
  "train_batch_size": 1,
4172
  "trial_name": null,
4173
  "trial_params": null
 
1
  {
2
+ "best_global_step": 12000,
3
+ "best_metric": 2.538311243057251,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-12000",
5
+ "epoch": 0.24,
6
  "eval_steps": 100,
7
+ "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4148
  "eval_samples_per_second": 2.464,
4149
  "eval_steps_per_second": 1.232,
4150
  "step": 11500
4151
+ },
4152
+ {
4153
+ "epoch": 0.2305,
4154
+ "grad_norm": 2.155444422697904,
4155
+ "learning_rate": 8.550222222222223e-06,
4156
+ "loss": 2.543,
4157
+ "step": 11525
4158
+ },
4159
+ {
4160
+ "epoch": 0.231,
4161
+ "grad_norm": 2.5216609928964706,
4162
+ "learning_rate": 8.544666666666668e-06,
4163
+ "loss": 2.5339,
4164
+ "step": 11550
4165
+ },
4166
+ {
4167
+ "epoch": 0.2315,
4168
+ "grad_norm": 3.2141643729123826,
4169
+ "learning_rate": 8.539111111111112e-06,
4170
+ "loss": 2.5311,
4171
+ "step": 11575
4172
+ },
4173
+ {
4174
+ "epoch": 0.232,
4175
+ "grad_norm": 2.779033714093245,
4176
+ "learning_rate": 8.533555555555557e-06,
4177
+ "loss": 2.5367,
4178
+ "step": 11600
4179
+ },
4180
+ {
4181
+ "epoch": 0.232,
4182
+ "eval_loss": 2.539663553237915,
4183
+ "eval_runtime": 42.1104,
4184
+ "eval_samples_per_second": 2.47,
4185
+ "eval_steps_per_second": 1.235,
4186
+ "step": 11600
4187
+ },
4188
+ {
4189
+ "epoch": 0.2325,
4190
+ "grad_norm": 2.0599049344871134,
4191
+ "learning_rate": 8.528e-06,
4192
+ "loss": 2.5406,
4193
+ "step": 11625
4194
+ },
4195
+ {
4196
+ "epoch": 0.233,
4197
+ "grad_norm": 2.1617162796171536,
4198
+ "learning_rate": 8.522444444444446e-06,
4199
+ "loss": 2.5244,
4200
+ "step": 11650
4201
+ },
4202
+ {
4203
+ "epoch": 0.2335,
4204
+ "grad_norm": 2.4286224889340926,
4205
+ "learning_rate": 8.51688888888889e-06,
4206
+ "loss": 2.5364,
4207
+ "step": 11675
4208
+ },
4209
+ {
4210
+ "epoch": 0.234,
4211
+ "grad_norm": 2.0435359432545424,
4212
+ "learning_rate": 8.511333333333334e-06,
4213
+ "loss": 2.5332,
4214
+ "step": 11700
4215
+ },
4216
+ {
4217
+ "epoch": 0.234,
4218
+ "eval_loss": 2.539963960647583,
4219
+ "eval_runtime": 42.1502,
4220
+ "eval_samples_per_second": 2.467,
4221
+ "eval_steps_per_second": 1.234,
4222
+ "step": 11700
4223
+ },
4224
+ {
4225
+ "epoch": 0.2345,
4226
+ "grad_norm": 2.6031764141012195,
4227
+ "learning_rate": 8.505777777777778e-06,
4228
+ "loss": 2.5292,
4229
+ "step": 11725
4230
+ },
4231
+ {
4232
+ "epoch": 0.235,
4233
+ "grad_norm": 2.2484621657042427,
4234
+ "learning_rate": 8.500222222222223e-06,
4235
+ "loss": 2.523,
4236
+ "step": 11750
4237
+ },
4238
+ {
4239
+ "epoch": 0.2355,
4240
+ "grad_norm": 2.854177673999505,
4241
+ "learning_rate": 8.494666666666668e-06,
4242
+ "loss": 2.5218,
4243
+ "step": 11775
4244
+ },
4245
+ {
4246
+ "epoch": 0.236,
4247
+ "grad_norm": 2.0770100967771055,
4248
+ "learning_rate": 8.489111111111112e-06,
4249
+ "loss": 2.534,
4250
+ "step": 11800
4251
+ },
4252
+ {
4253
+ "epoch": 0.236,
4254
+ "eval_loss": 2.538536548614502,
4255
+ "eval_runtime": 42.3875,
4256
+ "eval_samples_per_second": 2.454,
4257
+ "eval_steps_per_second": 1.227,
4258
+ "step": 11800
4259
+ },
4260
+ {
4261
+ "epoch": 0.2365,
4262
+ "grad_norm": 2.391823444522325,
4263
+ "learning_rate": 8.483555555555556e-06,
4264
+ "loss": 2.5211,
4265
+ "step": 11825
4266
+ },
4267
+ {
4268
+ "epoch": 0.237,
4269
+ "grad_norm": 2.333238897849914,
4270
+ "learning_rate": 8.478e-06,
4271
+ "loss": 2.5238,
4272
+ "step": 11850
4273
+ },
4274
+ {
4275
+ "epoch": 0.2375,
4276
+ "grad_norm": 2.1636671466235256,
4277
+ "learning_rate": 8.472444444444446e-06,
4278
+ "loss": 2.5378,
4279
+ "step": 11875
4280
+ },
4281
+ {
4282
+ "epoch": 0.238,
4283
+ "grad_norm": 2.5877564973697607,
4284
+ "learning_rate": 8.46688888888889e-06,
4285
+ "loss": 2.5415,
4286
+ "step": 11900
4287
+ },
4288
+ {
4289
+ "epoch": 0.238,
4290
+ "eval_loss": 2.538837194442749,
4291
+ "eval_runtime": 42.2059,
4292
+ "eval_samples_per_second": 2.464,
4293
+ "eval_steps_per_second": 1.232,
4294
+ "step": 11900
4295
+ },
4296
+ {
4297
+ "epoch": 0.2385,
4298
+ "grad_norm": 2.1416643296031785,
4299
+ "learning_rate": 8.461333333333333e-06,
4300
+ "loss": 2.525,
4301
+ "step": 11925
4302
+ },
4303
+ {
4304
+ "epoch": 0.239,
4305
+ "grad_norm": 2.213813959028046,
4306
+ "learning_rate": 8.455777777777778e-06,
4307
+ "loss": 2.5416,
4308
+ "step": 11950
4309
+ },
4310
+ {
4311
+ "epoch": 0.2395,
4312
+ "grad_norm": 2.759854381361929,
4313
+ "learning_rate": 8.450222222222224e-06,
4314
+ "loss": 2.5355,
4315
+ "step": 11975
4316
+ },
4317
+ {
4318
+ "epoch": 0.24,
4319
+ "grad_norm": 2.050520488248713,
4320
+ "learning_rate": 8.444666666666667e-06,
4321
+ "loss": 2.5263,
4322
+ "step": 12000
4323
+ },
4324
+ {
4325
+ "epoch": 0.24,
4326
+ "eval_loss": 2.538311243057251,
4327
+ "eval_runtime": 42.2256,
4328
+ "eval_samples_per_second": 2.463,
4329
+ "eval_steps_per_second": 1.231,
4330
+ "step": 12000
4331
  }
4332
  ],
4333
  "logging_steps": 25,
 
4347
  "attributes": {}
4348
  }
4349
  },
4350
+ "total_flos": 2.6930632229499437e+19,
4351
  "train_batch_size": 1,
4352
  "trial_name": null,
4353
  "trial_params": null