irodkin committed on
Commit
89e5626
·
verified ·
1 Parent(s): 73e384d

Training checkpoint at step 12000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 10900,
3
- "best_metric": 2.4055566787719727,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-9000",
5
- "epoch": 0.22,
6
  "eval_steps": 100,
7
- "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3968,6 +3968,366 @@
3968
  "eval_samples_per_second": 3.217,
3969
  "eval_steps_per_second": 1.609,
3970
  "step": 11000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3971
  }
3972
  ],
3973
  "logging_steps": 25,
@@ -3987,7 +4347,7 @@
3987
  "attributes": {}
3988
  }
3989
  },
3990
- "total_flos": 3.5015225281599242e+19,
3991
  "train_batch_size": 1,
3992
  "trial_name": null,
3993
  "trial_params": null
 
1
  {
2
+ "best_global_step": 12000,
3
+ "best_metric": 2.4031572341918945,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-12000",
5
+ "epoch": 0.24,
6
  "eval_steps": 100,
7
+ "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3968
  "eval_samples_per_second": 3.217,
3969
  "eval_steps_per_second": 1.609,
3970
  "step": 11000
3971
+ },
3972
+ {
3973
+ "epoch": 0.2205,
3974
+ "grad_norm": 0.619520274382602,
3975
+ "learning_rate": 8.661333333333335e-06,
3976
+ "loss": 2.3868,
3977
+ "step": 11025
3978
+ },
3979
+ {
3980
+ "epoch": 0.221,
3981
+ "grad_norm": 0.5973378822756289,
3982
+ "learning_rate": 8.655777777777778e-06,
3983
+ "loss": 2.398,
3984
+ "step": 11050
3985
+ },
3986
+ {
3987
+ "epoch": 0.2215,
3988
+ "grad_norm": 0.6143187669490118,
3989
+ "learning_rate": 8.650222222222223e-06,
3990
+ "loss": 2.387,
3991
+ "step": 11075
3992
+ },
3993
+ {
3994
+ "epoch": 0.222,
3995
+ "grad_norm": 0.5804040103557917,
3996
+ "learning_rate": 8.644666666666669e-06,
3997
+ "loss": 2.3951,
3998
+ "step": 11100
3999
+ },
4000
+ {
4001
+ "epoch": 0.222,
4002
+ "eval_loss": 2.4050545692443848,
4003
+ "eval_runtime": 31.7713,
4004
+ "eval_samples_per_second": 3.21,
4005
+ "eval_steps_per_second": 1.605,
4006
+ "step": 11100
4007
+ },
4008
+ {
4009
+ "epoch": 0.2225,
4010
+ "grad_norm": 0.5833158956225722,
4011
+ "learning_rate": 8.639111111111112e-06,
4012
+ "loss": 2.3854,
4013
+ "step": 11125
4014
+ },
4015
+ {
4016
+ "epoch": 0.223,
4017
+ "grad_norm": 0.5741811771851818,
4018
+ "learning_rate": 8.633555555555556e-06,
4019
+ "loss": 2.3866,
4020
+ "step": 11150
4021
+ },
4022
+ {
4023
+ "epoch": 0.2235,
4024
+ "grad_norm": 0.5856955103294486,
4025
+ "learning_rate": 8.628000000000001e-06,
4026
+ "loss": 2.4058,
4027
+ "step": 11175
4028
+ },
4029
+ {
4030
+ "epoch": 0.224,
4031
+ "grad_norm": 0.5685596699989746,
4032
+ "learning_rate": 8.622444444444446e-06,
4033
+ "loss": 2.3953,
4034
+ "step": 11200
4035
+ },
4036
+ {
4037
+ "epoch": 0.224,
4038
+ "eval_loss": 2.4051928520202637,
4039
+ "eval_runtime": 35.481,
4040
+ "eval_samples_per_second": 2.875,
4041
+ "eval_steps_per_second": 1.437,
4042
+ "step": 11200
4043
+ },
4044
+ {
4045
+ "epoch": 0.2245,
4046
+ "grad_norm": 0.5854297741723825,
4047
+ "learning_rate": 8.61688888888889e-06,
4048
+ "loss": 2.3977,
4049
+ "step": 11225
4050
+ },
4051
+ {
4052
+ "epoch": 0.225,
4053
+ "grad_norm": 0.582929503102295,
4054
+ "learning_rate": 8.611333333333333e-06,
4055
+ "loss": 2.3948,
4056
+ "step": 11250
4057
+ },
4058
+ {
4059
+ "epoch": 0.2255,
4060
+ "grad_norm": 0.5839207937169353,
4061
+ "learning_rate": 8.605777777777779e-06,
4062
+ "loss": 2.4104,
4063
+ "step": 11275
4064
+ },
4065
+ {
4066
+ "epoch": 0.226,
4067
+ "grad_norm": 0.5568849917729087,
4068
+ "learning_rate": 8.600222222222224e-06,
4069
+ "loss": 2.4011,
4070
+ "step": 11300
4071
+ },
4072
+ {
4073
+ "epoch": 0.226,
4074
+ "eval_loss": 2.404717445373535,
4075
+ "eval_runtime": 31.9835,
4076
+ "eval_samples_per_second": 3.189,
4077
+ "eval_steps_per_second": 1.595,
4078
+ "step": 11300
4079
+ },
4080
+ {
4081
+ "epoch": 0.2265,
4082
+ "grad_norm": 0.5549969270675909,
4083
+ "learning_rate": 8.594666666666668e-06,
4084
+ "loss": 2.3965,
4085
+ "step": 11325
4086
+ },
4087
+ {
4088
+ "epoch": 0.227,
4089
+ "grad_norm": 0.5606539732290856,
4090
+ "learning_rate": 8.589111111111111e-06,
4091
+ "loss": 2.3921,
4092
+ "step": 11350
4093
+ },
4094
+ {
4095
+ "epoch": 0.2275,
4096
+ "grad_norm": 0.5626929771754517,
4097
+ "learning_rate": 8.583555555555556e-06,
4098
+ "loss": 2.3912,
4099
+ "step": 11375
4100
+ },
4101
+ {
4102
+ "epoch": 0.228,
4103
+ "grad_norm": 0.5731631708828652,
4104
+ "learning_rate": 8.578000000000002e-06,
4105
+ "loss": 2.3926,
4106
+ "step": 11400
4107
+ },
4108
+ {
4109
+ "epoch": 0.228,
4110
+ "eval_loss": 2.4047322273254395,
4111
+ "eval_runtime": 31.8245,
4112
+ "eval_samples_per_second": 3.205,
4113
+ "eval_steps_per_second": 1.603,
4114
+ "step": 11400
4115
+ },
4116
+ {
4117
+ "epoch": 0.2285,
4118
+ "grad_norm": 0.5661654100374769,
4119
+ "learning_rate": 8.572444444444445e-06,
4120
+ "loss": 2.3951,
4121
+ "step": 11425
4122
+ },
4123
+ {
4124
+ "epoch": 0.229,
4125
+ "grad_norm": 0.5602181256620924,
4126
+ "learning_rate": 8.56688888888889e-06,
4127
+ "loss": 2.3812,
4128
+ "step": 11450
4129
+ },
4130
+ {
4131
+ "epoch": 0.2295,
4132
+ "grad_norm": 0.5950733473289397,
4133
+ "learning_rate": 8.561333333333334e-06,
4134
+ "loss": 2.3963,
4135
+ "step": 11475
4136
+ },
4137
+ {
4138
+ "epoch": 0.23,
4139
+ "grad_norm": 0.5733938863696743,
4140
+ "learning_rate": 8.55577777777778e-06,
4141
+ "loss": 2.3932,
4142
+ "step": 11500
4143
+ },
4144
+ {
4145
+ "epoch": 0.23,
4146
+ "eval_loss": 2.403830051422119,
4147
+ "eval_runtime": 31.7862,
4148
+ "eval_samples_per_second": 3.209,
4149
+ "eval_steps_per_second": 1.604,
4150
+ "step": 11500
4151
+ },
4152
+ {
4153
+ "epoch": 0.2305,
4154
+ "grad_norm": 0.5702512759518216,
4155
+ "learning_rate": 8.550222222222223e-06,
4156
+ "loss": 2.3824,
4157
+ "step": 11525
4158
+ },
4159
+ {
4160
+ "epoch": 0.231,
4161
+ "grad_norm": 0.5749933738625221,
4162
+ "learning_rate": 8.544666666666668e-06,
4163
+ "loss": 2.3674,
4164
+ "step": 11550
4165
+ },
4166
+ {
4167
+ "epoch": 0.2315,
4168
+ "grad_norm": 0.563814842108926,
4169
+ "learning_rate": 8.539111111111112e-06,
4170
+ "loss": 2.3866,
4171
+ "step": 11575
4172
+ },
4173
+ {
4174
+ "epoch": 0.232,
4175
+ "grad_norm": 0.601764608458657,
4176
+ "learning_rate": 8.533555555555557e-06,
4177
+ "loss": 2.3949,
4178
+ "step": 11600
4179
+ },
4180
+ {
4181
+ "epoch": 0.232,
4182
+ "eval_loss": 2.4035561084747314,
4183
+ "eval_runtime": 31.7077,
4184
+ "eval_samples_per_second": 3.217,
4185
+ "eval_steps_per_second": 1.608,
4186
+ "step": 11600
4187
+ },
4188
+ {
4189
+ "epoch": 0.2325,
4190
+ "grad_norm": 0.5674229084100237,
4191
+ "learning_rate": 8.528e-06,
4192
+ "loss": 2.3782,
4193
+ "step": 11625
4194
+ },
4195
+ {
4196
+ "epoch": 0.233,
4197
+ "grad_norm": 0.5660025767055805,
4198
+ "learning_rate": 8.522444444444446e-06,
4199
+ "loss": 2.3811,
4200
+ "step": 11650
4201
+ },
4202
+ {
4203
+ "epoch": 0.2335,
4204
+ "grad_norm": 0.5776196117388842,
4205
+ "learning_rate": 8.51688888888889e-06,
4206
+ "loss": 2.3964,
4207
+ "step": 11675
4208
+ },
4209
+ {
4210
+ "epoch": 0.234,
4211
+ "grad_norm": 0.5815076886720436,
4212
+ "learning_rate": 8.511333333333334e-06,
4213
+ "loss": 2.3907,
4214
+ "step": 11700
4215
+ },
4216
+ {
4217
+ "epoch": 0.234,
4218
+ "eval_loss": 2.4035725593566895,
4219
+ "eval_runtime": 31.7541,
4220
+ "eval_samples_per_second": 3.212,
4221
+ "eval_steps_per_second": 1.606,
4222
+ "step": 11700
4223
+ },
4224
+ {
4225
+ "epoch": 0.2345,
4226
+ "grad_norm": 0.5810635532925048,
4227
+ "learning_rate": 8.505777777777778e-06,
4228
+ "loss": 2.3921,
4229
+ "step": 11725
4230
+ },
4231
+ {
4232
+ "epoch": 0.235,
4233
+ "grad_norm": 0.5635380257098753,
4234
+ "learning_rate": 8.500222222222223e-06,
4235
+ "loss": 2.4062,
4236
+ "step": 11750
4237
+ },
4238
+ {
4239
+ "epoch": 0.2355,
4240
+ "grad_norm": 0.5985004911332629,
4241
+ "learning_rate": 8.494666666666668e-06,
4242
+ "loss": 2.3853,
4243
+ "step": 11775
4244
+ },
4245
+ {
4246
+ "epoch": 0.236,
4247
+ "grad_norm": 0.580078413647693,
4248
+ "learning_rate": 8.489111111111112e-06,
4249
+ "loss": 2.3826,
4250
+ "step": 11800
4251
+ },
4252
+ {
4253
+ "epoch": 0.236,
4254
+ "eval_loss": 2.403505325317383,
4255
+ "eval_runtime": 31.7265,
4256
+ "eval_samples_per_second": 3.215,
4257
+ "eval_steps_per_second": 1.607,
4258
+ "step": 11800
4259
+ },
4260
+ {
4261
+ "epoch": 0.2365,
4262
+ "grad_norm": 0.5560334145179444,
4263
+ "learning_rate": 8.483555555555556e-06,
4264
+ "loss": 2.3829,
4265
+ "step": 11825
4266
+ },
4267
+ {
4268
+ "epoch": 0.237,
4269
+ "grad_norm": 0.5870934042209253,
4270
+ "learning_rate": 8.478e-06,
4271
+ "loss": 2.374,
4272
+ "step": 11850
4273
+ },
4274
+ {
4275
+ "epoch": 0.2375,
4276
+ "grad_norm": 0.5745342448568999,
4277
+ "learning_rate": 8.472444444444446e-06,
4278
+ "loss": 2.3797,
4279
+ "step": 11875
4280
+ },
4281
+ {
4282
+ "epoch": 0.238,
4283
+ "grad_norm": 0.5676573173578097,
4284
+ "learning_rate": 8.46688888888889e-06,
4285
+ "loss": 2.3867,
4286
+ "step": 11900
4287
+ },
4288
+ {
4289
+ "epoch": 0.238,
4290
+ "eval_loss": 2.403400421142578,
4291
+ "eval_runtime": 31.8105,
4292
+ "eval_samples_per_second": 3.206,
4293
+ "eval_steps_per_second": 1.603,
4294
+ "step": 11900
4295
+ },
4296
+ {
4297
+ "epoch": 0.2385,
4298
+ "grad_norm": 0.5701256243606029,
4299
+ "learning_rate": 8.461333333333333e-06,
4300
+ "loss": 2.3832,
4301
+ "step": 11925
4302
+ },
4303
+ {
4304
+ "epoch": 0.239,
4305
+ "grad_norm": 0.5839965205220576,
4306
+ "learning_rate": 8.455777777777778e-06,
4307
+ "loss": 2.3939,
4308
+ "step": 11950
4309
+ },
4310
+ {
4311
+ "epoch": 0.2395,
4312
+ "grad_norm": 0.581600775004578,
4313
+ "learning_rate": 8.450222222222224e-06,
4314
+ "loss": 2.382,
4315
+ "step": 11975
4316
+ },
4317
+ {
4318
+ "epoch": 0.24,
4319
+ "grad_norm": 0.5945113931788275,
4320
+ "learning_rate": 8.444666666666667e-06,
4321
+ "loss": 2.3947,
4322
+ "step": 12000
4323
+ },
4324
+ {
4325
+ "epoch": 0.24,
4326
+ "eval_loss": 2.4031572341918945,
4327
+ "eval_runtime": 31.7154,
4328
+ "eval_samples_per_second": 3.216,
4329
+ "eval_steps_per_second": 1.608,
4330
+ "step": 12000
4331
  }
4332
  ],
4333
  "logging_steps": 25,
 
4347
  "attributes": {}
4348
  }
4349
  },
4350
+ "total_flos": 3.819842757992645e+19,
4351
  "train_batch_size": 1,
4352
  "trial_name": null,
4353
  "trial_params": null