Plofski commited on
Commit
19583b8
·
verified ·
1 Parent(s): f65ea43

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84f66cc33d9cd5915476a96d4590c19f424c7a30752f1c8fbfea7813b99ddcec
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6e1468ef363199a8ce8dceeee806e0cd1265dabba9569f802d5e0ffdf55cf29
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff96785f117afab51789d3a95126f7a57e937335d9e00258dde9f7269e32c788
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:979fd7f70ce82e647328d9ca181635fd358343ae3c4356518a994deb8d2c7554
3
  size 1072594443
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b42a9ebffc25267408092ad255514977530cb80117fd8185edcad8326726d7b8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8c6451e983e45b2059a969443ca799e62ce60a9d34862e6b02e6b5034f66233
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9067096514205117,
6
  "eval_steps": 500,
7
- "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4058,6 +4058,456 @@
4058
  "mean_token_accuracy": 0.8125901579856872,
4059
  "num_tokens": 4982012.0,
4060
  "step": 4500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4061
  }
4062
  ],
4063
  "logging_steps": 10,
@@ -4077,7 +4527,7 @@
4077
  "attributes": {}
4078
  }
4079
  },
4080
- "total_flos": 6031407455969280.0,
4081
  "train_batch_size": 8,
4082
  "trial_name": null,
4083
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.007455168245013,
6
  "eval_steps": 500,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4058
  "mean_token_accuracy": 0.8125901579856872,
4059
  "num_tokens": 4982012.0,
4060
  "step": 4500
4061
+ },
4062
+ {
4063
+ "epoch": 0.9087245617570018,
4064
+ "grad_norm": 10.875,
4065
+ "learning_rate": 1.3943179528510981e-05,
4066
+ "loss": 0.9166,
4067
+ "mean_token_accuracy": 0.7789366781711579,
4068
+ "num_tokens": 4992691.0,
4069
+ "step": 4510
4070
+ },
4071
+ {
4072
+ "epoch": 0.9107394720934918,
4073
+ "grad_norm": 10.6875,
4074
+ "learning_rate": 1.3929746792934382e-05,
4075
+ "loss": 0.8536,
4076
+ "mean_token_accuracy": 0.7905638337135314,
4077
+ "num_tokens": 5004578.0,
4078
+ "step": 4520
4079
+ },
4080
+ {
4081
+ "epoch": 0.9127543824299819,
4082
+ "grad_norm": 14.25,
4083
+ "learning_rate": 1.3916314057357782e-05,
4084
+ "loss": 0.976,
4085
+ "mean_token_accuracy": 0.7722863137722016,
4086
+ "num_tokens": 5015364.0,
4087
+ "step": 4530
4088
+ },
4089
+ {
4090
+ "epoch": 0.9147692927664719,
4091
+ "grad_norm": 12.9375,
4092
+ "learning_rate": 1.3902881321781181e-05,
4093
+ "loss": 0.8254,
4094
+ "mean_token_accuracy": 0.7985598504543304,
4095
+ "num_tokens": 5026777.0,
4096
+ "step": 4540
4097
+ },
4098
+ {
4099
+ "epoch": 0.9167842031029619,
4100
+ "grad_norm": 14.3125,
4101
+ "learning_rate": 1.3889448586204582e-05,
4102
+ "loss": 0.8435,
4103
+ "mean_token_accuracy": 0.7964738607406616,
4104
+ "num_tokens": 5038156.0,
4105
+ "step": 4550
4106
+ },
4107
+ {
4108
+ "epoch": 0.918799113439452,
4109
+ "grad_norm": 10.8125,
4110
+ "learning_rate": 1.3876015850627982e-05,
4111
+ "loss": 0.9532,
4112
+ "mean_token_accuracy": 0.7702659487724304,
4113
+ "num_tokens": 5049182.0,
4114
+ "step": 4560
4115
+ },
4116
+ {
4117
+ "epoch": 0.920814023775942,
4118
+ "grad_norm": 10.375,
4119
+ "learning_rate": 1.3862583115051383e-05,
4120
+ "loss": 0.749,
4121
+ "mean_token_accuracy": 0.8094150185585022,
4122
+ "num_tokens": 5060508.0,
4123
+ "step": 4570
4124
+ },
4125
+ {
4126
+ "epoch": 0.9228289341124319,
4127
+ "grad_norm": 11.1875,
4128
+ "learning_rate": 1.384915037947478e-05,
4129
+ "loss": 0.9101,
4130
+ "mean_token_accuracy": 0.780214524269104,
4131
+ "num_tokens": 5072338.0,
4132
+ "step": 4580
4133
+ },
4134
+ {
4135
+ "epoch": 0.924843844448922,
4136
+ "grad_norm": 11.0625,
4137
+ "learning_rate": 1.383571764389818e-05,
4138
+ "loss": 0.8272,
4139
+ "mean_token_accuracy": 0.7955898463726043,
4140
+ "num_tokens": 5083561.0,
4141
+ "step": 4590
4142
+ },
4143
+ {
4144
+ "epoch": 0.926858754785412,
4145
+ "grad_norm": 11.125,
4146
+ "learning_rate": 1.382228490832158e-05,
4147
+ "loss": 0.8138,
4148
+ "mean_token_accuracy": 0.797929847240448,
4149
+ "num_tokens": 5095032.0,
4150
+ "step": 4600
4151
+ },
4152
+ {
4153
+ "epoch": 0.9288736651219021,
4154
+ "grad_norm": 13.5,
4155
+ "learning_rate": 1.380885217274498e-05,
4156
+ "loss": 0.8166,
4157
+ "mean_token_accuracy": 0.7943983316421509,
4158
+ "num_tokens": 5105707.0,
4159
+ "step": 4610
4160
+ },
4161
+ {
4162
+ "epoch": 0.9308885754583921,
4163
+ "grad_norm": 12.0625,
4164
+ "learning_rate": 1.379541943716838e-05,
4165
+ "loss": 0.8926,
4166
+ "mean_token_accuracy": 0.7872261703014374,
4167
+ "num_tokens": 5116277.0,
4168
+ "step": 4620
4169
+ },
4170
+ {
4171
+ "epoch": 0.9329034857948821,
4172
+ "grad_norm": 10.125,
4173
+ "learning_rate": 1.378198670159178e-05,
4174
+ "loss": 0.8729,
4175
+ "mean_token_accuracy": 0.7849249064922332,
4176
+ "num_tokens": 5128524.0,
4177
+ "step": 4630
4178
+ },
4179
+ {
4180
+ "epoch": 0.9349183961313722,
4181
+ "grad_norm": 10.625,
4182
+ "learning_rate": 1.376855396601518e-05,
4183
+ "loss": 0.8558,
4184
+ "mean_token_accuracy": 0.7900417923927308,
4185
+ "num_tokens": 5139328.0,
4186
+ "step": 4640
4187
+ },
4188
+ {
4189
+ "epoch": 0.9369333064678622,
4190
+ "grad_norm": 10.625,
4191
+ "learning_rate": 1.375512123043858e-05,
4192
+ "loss": 0.8806,
4193
+ "mean_token_accuracy": 0.7793081521987915,
4194
+ "num_tokens": 5152047.0,
4195
+ "step": 4650
4196
+ },
4197
+ {
4198
+ "epoch": 0.9389482168043523,
4199
+ "grad_norm": 10.6875,
4200
+ "learning_rate": 1.374168849486198e-05,
4201
+ "loss": 0.9167,
4202
+ "mean_token_accuracy": 0.7699385344982147,
4203
+ "num_tokens": 5163236.0,
4204
+ "step": 4660
4205
+ },
4206
+ {
4207
+ "epoch": 0.9409631271408422,
4208
+ "grad_norm": 12.6875,
4209
+ "learning_rate": 1.3728255759285381e-05,
4210
+ "loss": 0.778,
4211
+ "mean_token_accuracy": 0.8066167533397675,
4212
+ "num_tokens": 5173182.0,
4213
+ "step": 4670
4214
+ },
4215
+ {
4216
+ "epoch": 0.9429780374773322,
4217
+ "grad_norm": 10.8125,
4218
+ "learning_rate": 1.3714823023708778e-05,
4219
+ "loss": 0.8731,
4220
+ "mean_token_accuracy": 0.7907391846179962,
4221
+ "num_tokens": 5184212.0,
4222
+ "step": 4680
4223
+ },
4224
+ {
4225
+ "epoch": 0.9449929478138223,
4226
+ "grad_norm": 14.1875,
4227
+ "learning_rate": 1.3701390288132179e-05,
4228
+ "loss": 0.8561,
4229
+ "mean_token_accuracy": 0.7872300326824189,
4230
+ "num_tokens": 5195178.0,
4231
+ "step": 4690
4232
+ },
4233
+ {
4234
+ "epoch": 0.9470078581503123,
4235
+ "grad_norm": 10.0625,
4236
+ "learning_rate": 1.368795755255558e-05,
4237
+ "loss": 0.8959,
4238
+ "mean_token_accuracy": 0.785253643989563,
4239
+ "num_tokens": 5206174.0,
4240
+ "step": 4700
4241
+ },
4242
+ {
4243
+ "epoch": 0.9490227684868023,
4244
+ "grad_norm": 10.1875,
4245
+ "learning_rate": 1.3674524816978978e-05,
4246
+ "loss": 0.82,
4247
+ "mean_token_accuracy": 0.8018651187419892,
4248
+ "num_tokens": 5216839.0,
4249
+ "step": 4710
4250
+ },
4251
+ {
4252
+ "epoch": 0.9510376788232924,
4253
+ "grad_norm": 12.375,
4254
+ "learning_rate": 1.3661092081402379e-05,
4255
+ "loss": 1.0637,
4256
+ "mean_token_accuracy": 0.7501500964164733,
4257
+ "num_tokens": 5228271.0,
4258
+ "step": 4720
4259
+ },
4260
+ {
4261
+ "epoch": 0.9530525891597824,
4262
+ "grad_norm": 12.625,
4263
+ "learning_rate": 1.364765934582578e-05,
4264
+ "loss": 0.9295,
4265
+ "mean_token_accuracy": 0.7782152414321899,
4266
+ "num_tokens": 5238828.0,
4267
+ "step": 4730
4268
+ },
4269
+ {
4270
+ "epoch": 0.9550674994962725,
4271
+ "grad_norm": 12.0625,
4272
+ "learning_rate": 1.363422661024918e-05,
4273
+ "loss": 0.9794,
4274
+ "mean_token_accuracy": 0.7655089437961579,
4275
+ "num_tokens": 5250865.0,
4276
+ "step": 4740
4277
+ },
4278
+ {
4279
+ "epoch": 0.9570824098327625,
4280
+ "grad_norm": 12.75,
4281
+ "learning_rate": 1.3620793874672577e-05,
4282
+ "loss": 0.8559,
4283
+ "mean_token_accuracy": 0.783417934179306,
4284
+ "num_tokens": 5261161.0,
4285
+ "step": 4750
4286
+ },
4287
+ {
4288
+ "epoch": 0.9590973201692524,
4289
+ "grad_norm": 13.125,
4290
+ "learning_rate": 1.3607361139095977e-05,
4291
+ "loss": 0.8129,
4292
+ "mean_token_accuracy": 0.8023806989192963,
4293
+ "num_tokens": 5271735.0,
4294
+ "step": 4760
4295
+ },
4296
+ {
4297
+ "epoch": 0.9611122305057425,
4298
+ "grad_norm": 10.3125,
4299
+ "learning_rate": 1.3593928403519378e-05,
4300
+ "loss": 0.9024,
4301
+ "mean_token_accuracy": 0.7816856026649475,
4302
+ "num_tokens": 5282948.0,
4303
+ "step": 4770
4304
+ },
4305
+ {
4306
+ "epoch": 0.9631271408422325,
4307
+ "grad_norm": 16.375,
4308
+ "learning_rate": 1.3580495667942777e-05,
4309
+ "loss": 0.7899,
4310
+ "mean_token_accuracy": 0.8060350120067596,
4311
+ "num_tokens": 5293161.0,
4312
+ "step": 4780
4313
+ },
4314
+ {
4315
+ "epoch": 0.9651420511787225,
4316
+ "grad_norm": 12.0625,
4317
+ "learning_rate": 1.3567062932366177e-05,
4318
+ "loss": 0.9497,
4319
+ "mean_token_accuracy": 0.7778710544109344,
4320
+ "num_tokens": 5303823.0,
4321
+ "step": 4790
4322
+ },
4323
+ {
4324
+ "epoch": 0.9671569615152126,
4325
+ "grad_norm": 12.375,
4326
+ "learning_rate": 1.3553630196789578e-05,
4327
+ "loss": 0.8374,
4328
+ "mean_token_accuracy": 0.7918814778327942,
4329
+ "num_tokens": 5314457.0,
4330
+ "step": 4800
4331
+ },
4332
+ {
4333
+ "epoch": 0.9691718718517026,
4334
+ "grad_norm": 13.4375,
4335
+ "learning_rate": 1.3540197461212977e-05,
4336
+ "loss": 1.0195,
4337
+ "mean_token_accuracy": 0.7530766189098358,
4338
+ "num_tokens": 5324766.0,
4339
+ "step": 4810
4340
+ },
4341
+ {
4342
+ "epoch": 0.9711867821881927,
4343
+ "grad_norm": 12.8125,
4344
+ "learning_rate": 1.3526764725636377e-05,
4345
+ "loss": 0.8813,
4346
+ "mean_token_accuracy": 0.7811478495597839,
4347
+ "num_tokens": 5336093.0,
4348
+ "step": 4820
4349
+ },
4350
+ {
4351
+ "epoch": 0.9732016925246827,
4352
+ "grad_norm": 10.4375,
4353
+ "learning_rate": 1.3513331990059778e-05,
4354
+ "loss": 0.8947,
4355
+ "mean_token_accuracy": 0.7876034200191497,
4356
+ "num_tokens": 5346706.0,
4357
+ "step": 4830
4358
+ },
4359
+ {
4360
+ "epoch": 0.9752166028611727,
4361
+ "grad_norm": 15.25,
4362
+ "learning_rate": 1.3499899254483178e-05,
4363
+ "loss": 0.8773,
4364
+ "mean_token_accuracy": 0.7921059668064118,
4365
+ "num_tokens": 5357534.0,
4366
+ "step": 4840
4367
+ },
4368
+ {
4369
+ "epoch": 0.9772315131976627,
4370
+ "grad_norm": 10.75,
4371
+ "learning_rate": 1.3486466518906575e-05,
4372
+ "loss": 0.895,
4373
+ "mean_token_accuracy": 0.7818942189216613,
4374
+ "num_tokens": 5370003.0,
4375
+ "step": 4850
4376
+ },
4377
+ {
4378
+ "epoch": 0.9792464235341527,
4379
+ "grad_norm": 11.3125,
4380
+ "learning_rate": 1.3473033783329976e-05,
4381
+ "loss": 0.7589,
4382
+ "mean_token_accuracy": 0.8098963499069214,
4383
+ "num_tokens": 5381355.0,
4384
+ "step": 4860
4385
+ },
4386
+ {
4387
+ "epoch": 0.9812613338706427,
4388
+ "grad_norm": 12.0625,
4389
+ "learning_rate": 1.3459601047753376e-05,
4390
+ "loss": 0.793,
4391
+ "mean_token_accuracy": 0.8019460260868072,
4392
+ "num_tokens": 5392541.0,
4393
+ "step": 4870
4394
+ },
4395
+ {
4396
+ "epoch": 0.9832762442071328,
4397
+ "grad_norm": 14.5,
4398
+ "learning_rate": 1.3446168312176775e-05,
4399
+ "loss": 0.9046,
4400
+ "mean_token_accuracy": 0.781462025642395,
4401
+ "num_tokens": 5403838.0,
4402
+ "step": 4880
4403
+ },
4404
+ {
4405
+ "epoch": 0.9852911545436228,
4406
+ "grad_norm": 11.25,
4407
+ "learning_rate": 1.3432735576600176e-05,
4408
+ "loss": 0.9039,
4409
+ "mean_token_accuracy": 0.7840433418750763,
4410
+ "num_tokens": 5414995.0,
4411
+ "step": 4890
4412
+ },
4413
+ {
4414
+ "epoch": 0.9873060648801129,
4415
+ "grad_norm": 9.5,
4416
+ "learning_rate": 1.3419302841023576e-05,
4417
+ "loss": 0.8909,
4418
+ "mean_token_accuracy": 0.7859670460224152,
4419
+ "num_tokens": 5426001.0,
4420
+ "step": 4900
4421
+ },
4422
+ {
4423
+ "epoch": 0.9893209752166029,
4424
+ "grad_norm": 13.375,
4425
+ "learning_rate": 1.3405870105446977e-05,
4426
+ "loss": 0.8171,
4427
+ "mean_token_accuracy": 0.7865999937057495,
4428
+ "num_tokens": 5438081.0,
4429
+ "step": 4910
4430
+ },
4431
+ {
4432
+ "epoch": 0.9913358855530929,
4433
+ "grad_norm": 14.3125,
4434
+ "learning_rate": 1.3392437369870374e-05,
4435
+ "loss": 0.9477,
4436
+ "mean_token_accuracy": 0.773062938451767,
4437
+ "num_tokens": 5449464.0,
4438
+ "step": 4920
4439
+ },
4440
+ {
4441
+ "epoch": 0.9933507958895829,
4442
+ "grad_norm": 11.3125,
4443
+ "learning_rate": 1.3379004634293775e-05,
4444
+ "loss": 0.9291,
4445
+ "mean_token_accuracy": 0.7776412189006805,
4446
+ "num_tokens": 5461080.0,
4447
+ "step": 4930
4448
+ },
4449
+ {
4450
+ "epoch": 0.9953657062260729,
4451
+ "grad_norm": 11.625,
4452
+ "learning_rate": 1.3365571898717175e-05,
4453
+ "loss": 0.8828,
4454
+ "mean_token_accuracy": 0.7811066091060639,
4455
+ "num_tokens": 5472144.0,
4456
+ "step": 4940
4457
+ },
4458
+ {
4459
+ "epoch": 0.9973806165625629,
4460
+ "grad_norm": 9.875,
4461
+ "learning_rate": 1.3352139163140574e-05,
4462
+ "loss": 0.8439,
4463
+ "mean_token_accuracy": 0.7922865450382233,
4464
+ "num_tokens": 5484117.0,
4465
+ "step": 4950
4466
+ },
4467
+ {
4468
+ "epoch": 0.999395526899053,
4469
+ "grad_norm": 10.5,
4470
+ "learning_rate": 1.3338706427563974e-05,
4471
+ "loss": 0.8985,
4472
+ "mean_token_accuracy": 0.7755892872810364,
4473
+ "num_tokens": 5495916.0,
4474
+ "step": 4960
4475
+ },
4476
+ {
4477
+ "epoch": 1.001410437235543,
4478
+ "grad_norm": 12.875,
4479
+ "learning_rate": 1.3325273691987375e-05,
4480
+ "loss": 0.8131,
4481
+ "mean_token_accuracy": 0.7963548183441163,
4482
+ "num_tokens": 5506891.0,
4483
+ "step": 4970
4484
+ },
4485
+ {
4486
+ "epoch": 1.003425347572033,
4487
+ "grad_norm": 10.3125,
4488
+ "learning_rate": 1.3311840956410774e-05,
4489
+ "loss": 0.7879,
4490
+ "mean_token_accuracy": 0.7989233016967774,
4491
+ "num_tokens": 5519454.0,
4492
+ "step": 4980
4493
+ },
4494
+ {
4495
+ "epoch": 1.005440257908523,
4496
+ "grad_norm": 11.875,
4497
+ "learning_rate": 1.3298408220834174e-05,
4498
+ "loss": 0.7878,
4499
+ "mean_token_accuracy": 0.8067593216896057,
4500
+ "num_tokens": 5529707.0,
4501
+ "step": 4990
4502
+ },
4503
+ {
4504
+ "epoch": 1.007455168245013,
4505
+ "grad_norm": 11.1875,
4506
+ "learning_rate": 1.3284975485257575e-05,
4507
+ "loss": 0.7955,
4508
+ "mean_token_accuracy": 0.800259780883789,
4509
+ "num_tokens": 5541015.0,
4510
+ "step": 5000
4511
  }
4512
  ],
4513
  "logging_steps": 10,
 
4527
  "attributes": {}
4528
  }
4529
  },
4530
+ "total_flos": 6697551334397952.0,
4531
  "train_batch_size": 8,
4532
  "trial_name": null,
4533
  "trial_params": null