gg1 / trainer_state.json
giang16GG11's picture
Upload folder using huggingface_hub
539ac5c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7500875043752186,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00035001750087504374,
"grad_norm": 7.988319396972656,
"learning_rate": 5e-05,
"loss": 0.2364,
"mean_token_accuracy": 0.8999999761581421,
"num_tokens": 452.0,
"step": 1
},
{
"epoch": 0.0035001750087504373,
"grad_norm": 0.8047283291816711,
"learning_rate": 4.991e-05,
"loss": 0.1339,
"mean_token_accuracy": 0.966666665342119,
"num_tokens": 4674.0,
"step": 10
},
{
"epoch": 0.007000350017500875,
"grad_norm": 0.08694641292095184,
"learning_rate": 4.981e-05,
"loss": 0.0457,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 9193.0,
"step": 20
},
{
"epoch": 0.010500525026251312,
"grad_norm": 9.807371139526367,
"learning_rate": 4.9710000000000003e-05,
"loss": 0.1587,
"mean_token_accuracy": 0.9599999904632568,
"num_tokens": 13953.0,
"step": 30
},
{
"epoch": 0.01400070003500175,
"grad_norm": 0.12450232356786728,
"learning_rate": 4.961e-05,
"loss": 0.107,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 18471.0,
"step": 40
},
{
"epoch": 0.01750087504375219,
"grad_norm": 3.2105650901794434,
"learning_rate": 4.951e-05,
"loss": 0.1113,
"mean_token_accuracy": 0.9649999916553498,
"num_tokens": 23060.0,
"step": 50
},
{
"epoch": 0.021001050052502624,
"grad_norm": 0.6051918268203735,
"learning_rate": 4.941e-05,
"loss": 0.0627,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 27672.0,
"step": 60
},
{
"epoch": 0.024501225061253063,
"grad_norm": 5.309004306793213,
"learning_rate": 4.931e-05,
"loss": 0.125,
"mean_token_accuracy": 0.9649999916553498,
"num_tokens": 32267.0,
"step": 70
},
{
"epoch": 0.0280014000700035,
"grad_norm": 3.3586971759796143,
"learning_rate": 4.921e-05,
"loss": 0.0377,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 36884.0,
"step": 80
},
{
"epoch": 0.03150157507875394,
"grad_norm": 4.870711803436279,
"learning_rate": 4.911e-05,
"loss": 0.0644,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 41583.0,
"step": 90
},
{
"epoch": 0.03500175008750438,
"grad_norm": 0.014425868168473244,
"learning_rate": 4.901e-05,
"loss": 0.0432,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 46184.0,
"step": 100
},
{
"epoch": 0.038501925096254816,
"grad_norm": 8.833477020263672,
"learning_rate": 4.891e-05,
"loss": 0.093,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 50776.0,
"step": 110
},
{
"epoch": 0.04200210010500525,
"grad_norm": 0.19166199862957,
"learning_rate": 4.881e-05,
"loss": 0.1521,
"mean_token_accuracy": 0.9649999976158142,
"num_tokens": 55443.0,
"step": 120
},
{
"epoch": 0.04550227511375569,
"grad_norm": 0.3452470302581787,
"learning_rate": 4.871e-05,
"loss": 0.0972,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 60036.0,
"step": 130
},
{
"epoch": 0.049002450122506126,
"grad_norm": 4.509720802307129,
"learning_rate": 4.861e-05,
"loss": 0.1143,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 64565.0,
"step": 140
},
{
"epoch": 0.052502625131256565,
"grad_norm": 0.35976719856262207,
"learning_rate": 4.851e-05,
"loss": 0.032,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 69185.0,
"step": 150
},
{
"epoch": 0.056002800140007,
"grad_norm": 5.863715648651123,
"learning_rate": 4.841e-05,
"loss": 0.0522,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 73698.0,
"step": 160
},
{
"epoch": 0.05950297514875744,
"grad_norm": 0.1498999446630478,
"learning_rate": 4.8309999999999997e-05,
"loss": 0.1335,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 78163.0,
"step": 170
},
{
"epoch": 0.06300315015750788,
"grad_norm": 1.365043044090271,
"learning_rate": 4.821e-05,
"loss": 0.0818,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 82718.0,
"step": 180
},
{
"epoch": 0.06650332516625831,
"grad_norm": 7.134900093078613,
"learning_rate": 4.8110000000000005e-05,
"loss": 0.0881,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 87209.0,
"step": 190
},
{
"epoch": 0.07000350017500875,
"grad_norm": 1.4992774724960327,
"learning_rate": 4.801e-05,
"loss": 0.0612,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 91760.0,
"step": 200
},
{
"epoch": 0.07350367518375919,
"grad_norm": 0.02836497873067856,
"learning_rate": 4.791000000000001e-05,
"loss": 0.0553,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 96423.0,
"step": 210
},
{
"epoch": 0.07700385019250963,
"grad_norm": 0.020807797089219093,
"learning_rate": 4.7810000000000005e-05,
"loss": 0.0672,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 100968.0,
"step": 220
},
{
"epoch": 0.08050402520126006,
"grad_norm": 0.03391553834080696,
"learning_rate": 4.771e-05,
"loss": 0.0326,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 105637.0,
"step": 230
},
{
"epoch": 0.0840042002100105,
"grad_norm": 10.723307609558105,
"learning_rate": 4.761000000000001e-05,
"loss": 0.0897,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 110219.0,
"step": 240
},
{
"epoch": 0.08750437521876094,
"grad_norm": 1.3650544881820679,
"learning_rate": 4.7510000000000004e-05,
"loss": 0.0266,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 114796.0,
"step": 250
},
{
"epoch": 0.09100455022751137,
"grad_norm": 0.07940108329057693,
"learning_rate": 4.741e-05,
"loss": 0.01,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 119315.0,
"step": 260
},
{
"epoch": 0.09450472523626181,
"grad_norm": 0.05688886716961861,
"learning_rate": 4.7310000000000006e-05,
"loss": 0.0224,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 123868.0,
"step": 270
},
{
"epoch": 0.09800490024501225,
"grad_norm": 0.00870002806186676,
"learning_rate": 4.7210000000000004e-05,
"loss": 0.012,
"mean_token_accuracy": 1.0,
"num_tokens": 128591.0,
"step": 280
},
{
"epoch": 0.10150507525376269,
"grad_norm": 55.893104553222656,
"learning_rate": 4.711e-05,
"loss": 0.0297,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 133283.0,
"step": 290
},
{
"epoch": 0.10500525026251313,
"grad_norm": 0.003983665257692337,
"learning_rate": 4.7010000000000006e-05,
"loss": 0.2061,
"mean_token_accuracy": 0.9649999916553498,
"num_tokens": 138008.0,
"step": 300
},
{
"epoch": 0.10850542527126357,
"grad_norm": 7.029219627380371,
"learning_rate": 4.691e-05,
"loss": 0.0849,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 142566.0,
"step": 310
},
{
"epoch": 0.112005600280014,
"grad_norm": 2.8149940967559814,
"learning_rate": 4.681e-05,
"loss": 0.0667,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 147213.0,
"step": 320
},
{
"epoch": 0.11550577528876443,
"grad_norm": 2.192121982574463,
"learning_rate": 4.6710000000000005e-05,
"loss": 0.0638,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 151831.0,
"step": 330
},
{
"epoch": 0.11900595029751487,
"grad_norm": 15.69092082977295,
"learning_rate": 4.661e-05,
"loss": 0.0256,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 156368.0,
"step": 340
},
{
"epoch": 0.12250612530626531,
"grad_norm": 0.544373095035553,
"learning_rate": 4.651e-05,
"loss": 0.0368,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 160915.0,
"step": 350
},
{
"epoch": 0.12600630031501575,
"grad_norm": 0.04070553556084633,
"learning_rate": 4.6410000000000005e-05,
"loss": 0.0729,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 165461.0,
"step": 360
},
{
"epoch": 0.1295064753237662,
"grad_norm": 0.006062925793230534,
"learning_rate": 4.631e-05,
"loss": 0.0372,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 170199.0,
"step": 370
},
{
"epoch": 0.13300665033251663,
"grad_norm": 0.04721317067742348,
"learning_rate": 4.6210000000000006e-05,
"loss": 0.0353,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 174899.0,
"step": 380
},
{
"epoch": 0.13650682534126707,
"grad_norm": 0.01112948078662157,
"learning_rate": 4.6110000000000004e-05,
"loss": 0.0565,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 179419.0,
"step": 390
},
{
"epoch": 0.1400070003500175,
"grad_norm": 3.867860794067383,
"learning_rate": 4.601e-05,
"loss": 0.0537,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 183987.0,
"step": 400
},
{
"epoch": 0.14350717535876795,
"grad_norm": 10.329545974731445,
"learning_rate": 4.5910000000000006e-05,
"loss": 0.0888,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 188517.0,
"step": 410
},
{
"epoch": 0.14700735036751839,
"grad_norm": 0.04144367575645447,
"learning_rate": 4.5810000000000004e-05,
"loss": 0.0723,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 193090.0,
"step": 420
},
{
"epoch": 0.15050752537626882,
"grad_norm": 13.018311500549316,
"learning_rate": 4.571e-05,
"loss": 0.0799,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 197726.0,
"step": 430
},
{
"epoch": 0.15400770038501926,
"grad_norm": 7.063663959503174,
"learning_rate": 4.5610000000000005e-05,
"loss": 0.0295,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 202376.0,
"step": 440
},
{
"epoch": 0.15750787539376968,
"grad_norm": 8.98883056640625,
"learning_rate": 4.551e-05,
"loss": 0.0624,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 207026.0,
"step": 450
},
{
"epoch": 0.16100805040252011,
"grad_norm": 13.842345237731934,
"learning_rate": 4.541e-05,
"loss": 0.0715,
"mean_token_accuracy": 0.975,
"num_tokens": 211776.0,
"step": 460
},
{
"epoch": 0.16450822541127055,
"grad_norm": 9.97155475616455,
"learning_rate": 4.5310000000000005e-05,
"loss": 0.0661,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 216432.0,
"step": 470
},
{
"epoch": 0.168008400420021,
"grad_norm": 7.468666076660156,
"learning_rate": 4.521e-05,
"loss": 0.0367,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 220948.0,
"step": 480
},
{
"epoch": 0.17150857542877143,
"grad_norm": 4.366839408874512,
"learning_rate": 4.511e-05,
"loss": 0.0627,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 225605.0,
"step": 490
},
{
"epoch": 0.17500875043752187,
"grad_norm": 1.5920456647872925,
"learning_rate": 4.5010000000000004e-05,
"loss": 0.0625,
"step": 500
},
{
"epoch": 0.17500875043752187,
"eval_accuracy": 0.42072015161086546,
"eval_f1": 0.35817992606791954,
"eval_loss": 0.052172355353832245,
"eval_mean_token_accuracy": 0.9854798049035699,
"eval_num_tokens": 230290.0,
"eval_precision": 0.4422081376879308,
"eval_recall": 0.3790056922240466,
"eval_runtime": 244.5865,
"eval_samples_per_second": 6.472,
"eval_steps_per_second": 0.81,
"step": 500
},
{
"epoch": 0.1785089254462723,
"grad_norm": 0.04973801597952843,
"learning_rate": 4.491e-05,
"loss": 0.082,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 234977.0,
"step": 510
},
{
"epoch": 0.18200910045502275,
"grad_norm": 0.0434698760509491,
"learning_rate": 4.481e-05,
"loss": 0.0479,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 239628.0,
"step": 520
},
{
"epoch": 0.1855092754637732,
"grad_norm": 11.246657371520996,
"learning_rate": 4.4710000000000004e-05,
"loss": 0.0361,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 244331.0,
"step": 530
},
{
"epoch": 0.18900945047252363,
"grad_norm": 2.8165736198425293,
"learning_rate": 4.461e-05,
"loss": 0.0622,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 249178.0,
"step": 540
},
{
"epoch": 0.19250962548127407,
"grad_norm": 1.3719075918197632,
"learning_rate": 4.451e-05,
"loss": 0.0498,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 253815.0,
"step": 550
},
{
"epoch": 0.1960098004900245,
"grad_norm": 0.8937302827835083,
"learning_rate": 4.4410000000000003e-05,
"loss": 0.0782,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 258433.0,
"step": 560
},
{
"epoch": 0.19950997549877494,
"grad_norm": 0.01865805685520172,
"learning_rate": 4.431e-05,
"loss": 0.0096,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 263057.0,
"step": 570
},
{
"epoch": 0.20301015050752538,
"grad_norm": 0.6028000712394714,
"learning_rate": 4.421e-05,
"loss": 0.0144,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 267618.0,
"step": 580
},
{
"epoch": 0.20651032551627582,
"grad_norm": 0.013873261399567127,
"learning_rate": 4.411e-05,
"loss": 0.0576,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 272288.0,
"step": 590
},
{
"epoch": 0.21001050052502626,
"grad_norm": 6.103112697601318,
"learning_rate": 4.401e-05,
"loss": 0.1198,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 276910.0,
"step": 600
},
{
"epoch": 0.2135106755337767,
"grad_norm": 0.640934944152832,
"learning_rate": 4.391e-05,
"loss": 0.0378,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 281446.0,
"step": 610
},
{
"epoch": 0.21701085054252714,
"grad_norm": 0.31448185443878174,
"learning_rate": 4.381e-05,
"loss": 0.0696,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 286106.0,
"step": 620
},
{
"epoch": 0.22051102555127755,
"grad_norm": 0.03195786848664284,
"learning_rate": 4.371e-05,
"loss": 0.124,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 290743.0,
"step": 630
},
{
"epoch": 0.224011200560028,
"grad_norm": 0.8114803433418274,
"learning_rate": 4.361e-05,
"loss": 0.1201,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 295315.0,
"step": 640
},
{
"epoch": 0.22751137556877843,
"grad_norm": 0.16202567517757416,
"learning_rate": 4.351e-05,
"loss": 0.0553,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 299840.0,
"step": 650
},
{
"epoch": 0.23101155057752887,
"grad_norm": 4.016778469085693,
"learning_rate": 4.341e-05,
"loss": 0.0692,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 304471.0,
"step": 660
},
{
"epoch": 0.2345117255862793,
"grad_norm": 0.056026436388492584,
"learning_rate": 4.3310000000000004e-05,
"loss": 0.0395,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 309003.0,
"step": 670
},
{
"epoch": 0.23801190059502975,
"grad_norm": 0.4657319188117981,
"learning_rate": 4.321e-05,
"loss": 0.0094,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 313702.0,
"step": 680
},
{
"epoch": 0.24151207560378019,
"grad_norm": 21.3116397857666,
"learning_rate": 4.311e-05,
"loss": 0.0468,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 318425.0,
"step": 690
},
{
"epoch": 0.24501225061253062,
"grad_norm": 0.024263957515358925,
"learning_rate": 4.301e-05,
"loss": 0.0576,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 323080.0,
"step": 700
},
{
"epoch": 0.24851242562128106,
"grad_norm": 0.039419762790203094,
"learning_rate": 4.291e-05,
"loss": 0.0503,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 327692.0,
"step": 710
},
{
"epoch": 0.2520126006300315,
"grad_norm": 0.06194750592112541,
"learning_rate": 4.281e-05,
"loss": 0.0379,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 332238.0,
"step": 720
},
{
"epoch": 0.25551277563878194,
"grad_norm": 0.015114092268049717,
"learning_rate": 4.271e-05,
"loss": 0.0729,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 336826.0,
"step": 730
},
{
"epoch": 0.2590129506475324,
"grad_norm": 0.05118599534034729,
"learning_rate": 4.261e-05,
"loss": 0.0773,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 341422.0,
"step": 740
},
{
"epoch": 0.2625131256562828,
"grad_norm": 0.13388273119926453,
"learning_rate": 4.251e-05,
"loss": 0.07,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 346078.0,
"step": 750
},
{
"epoch": 0.26601330066503326,
"grad_norm": 0.043984536081552505,
"learning_rate": 4.241e-05,
"loss": 0.121,
"mean_token_accuracy": 0.9649999916553498,
"num_tokens": 350615.0,
"step": 760
},
{
"epoch": 0.2695134756737837,
"grad_norm": 3.5789825916290283,
"learning_rate": 4.231e-05,
"loss": 0.0646,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 355229.0,
"step": 770
},
{
"epoch": 0.27301365068253414,
"grad_norm": 0.15532948076725006,
"learning_rate": 4.221e-05,
"loss": 0.0809,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 359810.0,
"step": 780
},
{
"epoch": 0.2765138256912846,
"grad_norm": 4.205129146575928,
"learning_rate": 4.211e-05,
"loss": 0.0478,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 364370.0,
"step": 790
},
{
"epoch": 0.280014000700035,
"grad_norm": 0.06457880884408951,
"learning_rate": 4.201e-05,
"loss": 0.0381,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 368854.0,
"step": 800
},
{
"epoch": 0.28351417570878545,
"grad_norm": 0.06110011041164398,
"learning_rate": 4.191e-05,
"loss": 0.0362,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 373402.0,
"step": 810
},
{
"epoch": 0.2870143507175359,
"grad_norm": 0.6663037538528442,
"learning_rate": 4.181000000000001e-05,
"loss": 0.053,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 378067.0,
"step": 820
},
{
"epoch": 0.29051452572628633,
"grad_norm": 0.019796814769506454,
"learning_rate": 4.1710000000000006e-05,
"loss": 0.0813,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 382674.0,
"step": 830
},
{
"epoch": 0.29401470073503677,
"grad_norm": 6.284240245819092,
"learning_rate": 4.161e-05,
"loss": 0.1022,
"mean_token_accuracy": 0.9599999904632568,
"num_tokens": 387372.0,
"step": 840
},
{
"epoch": 0.2975148757437872,
"grad_norm": 0.050411708652973175,
"learning_rate": 4.151000000000001e-05,
"loss": 0.0286,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 392048.0,
"step": 850
},
{
"epoch": 0.30101505075253765,
"grad_norm": 0.13556945323944092,
"learning_rate": 4.1410000000000005e-05,
"loss": 0.056,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 396846.0,
"step": 860
},
{
"epoch": 0.3045152257612881,
"grad_norm": 0.2066652923822403,
"learning_rate": 4.131e-05,
"loss": 0.0304,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 401427.0,
"step": 870
},
{
"epoch": 0.3080154007700385,
"grad_norm": 0.21275383234024048,
"learning_rate": 4.121000000000001e-05,
"loss": 0.0971,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 406019.0,
"step": 880
},
{
"epoch": 0.31151557577878897,
"grad_norm": 0.0494910404086113,
"learning_rate": 4.1110000000000005e-05,
"loss": 0.0054,
"mean_token_accuracy": 1.0,
"num_tokens": 410600.0,
"step": 890
},
{
"epoch": 0.31501575078753935,
"grad_norm": 0.06328645348548889,
"learning_rate": 4.101e-05,
"loss": 0.0584,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 415202.0,
"step": 900
},
{
"epoch": 0.3185159257962898,
"grad_norm": 0.011447213590145111,
"learning_rate": 4.0910000000000006e-05,
"loss": 0.0842,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 419881.0,
"step": 910
},
{
"epoch": 0.32201610080504023,
"grad_norm": 0.11036702245473862,
"learning_rate": 4.0810000000000004e-05,
"loss": 0.023,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 424461.0,
"step": 920
},
{
"epoch": 0.32551627581379067,
"grad_norm": 5.421338081359863,
"learning_rate": 4.071e-05,
"loss": 0.0802,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 429057.0,
"step": 930
},
{
"epoch": 0.3290164508225411,
"grad_norm": 0.3822776675224304,
"learning_rate": 4.0610000000000006e-05,
"loss": 0.0204,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 433654.0,
"step": 940
},
{
"epoch": 0.33251662583129155,
"grad_norm": 0.39122045040130615,
"learning_rate": 4.0510000000000003e-05,
"loss": 0.0318,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 438323.0,
"step": 950
},
{
"epoch": 0.336016800840042,
"grad_norm": 0.16552428901195526,
"learning_rate": 4.041e-05,
"loss": 0.0472,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 442957.0,
"step": 960
},
{
"epoch": 0.3395169758487924,
"grad_norm": 0.028619434684515,
"learning_rate": 4.0310000000000005e-05,
"loss": 0.0433,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 447752.0,
"step": 970
},
{
"epoch": 0.34301715085754286,
"grad_norm": 7.8463053703308105,
"learning_rate": 4.021e-05,
"loss": 0.0688,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 452474.0,
"step": 980
},
{
"epoch": 0.3465173258662933,
"grad_norm": 0.012101550586521626,
"learning_rate": 4.011e-05,
"loss": 0.0084,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 457143.0,
"step": 990
},
{
"epoch": 0.35001750087504374,
"grad_norm": 0.013838861137628555,
"learning_rate": 4.0010000000000005e-05,
"loss": 0.0593,
"step": 1000
},
{
"epoch": 0.35001750087504374,
"eval_accuracy": 0.441566645609602,
"eval_f1": 0.3728091111256171,
"eval_loss": 0.057350896298885345,
"eval_mean_token_accuracy": 0.9861111180348829,
"eval_num_tokens": 461707.0,
"eval_precision": 0.4527947168630948,
"eval_recall": 0.39396244890143234,
"eval_runtime": 244.5385,
"eval_samples_per_second": 6.473,
"eval_steps_per_second": 0.81,
"step": 1000
},
{
"epoch": 0.3535176758837942,
"grad_norm": 0.0760878473520279,
"learning_rate": 3.991e-05,
"loss": 0.1017,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 466311.0,
"step": 1010
},
{
"epoch": 0.3570178508925446,
"grad_norm": 6.497073173522949,
"learning_rate": 3.981e-05,
"loss": 0.0353,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 470821.0,
"step": 1020
},
{
"epoch": 0.36051802590129506,
"grad_norm": 8.943822860717773,
"learning_rate": 3.9710000000000004e-05,
"loss": 0.0868,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 475416.0,
"step": 1030
},
{
"epoch": 0.3640182009100455,
"grad_norm": 0.10018932819366455,
"learning_rate": 3.961e-05,
"loss": 0.0314,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 480046.0,
"step": 1040
},
{
"epoch": 0.36751837591879594,
"grad_norm": 0.058345384895801544,
"learning_rate": 3.951e-05,
"loss": 0.0198,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 484728.0,
"step": 1050
},
{
"epoch": 0.3710185509275464,
"grad_norm": 0.059850409626960754,
"learning_rate": 3.9410000000000004e-05,
"loss": 0.0561,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 489334.0,
"step": 1060
},
{
"epoch": 0.3745187259362968,
"grad_norm": 0.03875022009015083,
"learning_rate": 3.931e-05,
"loss": 0.0893,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 493949.0,
"step": 1070
},
{
"epoch": 0.37801890094504725,
"grad_norm": 0.7719871997833252,
"learning_rate": 3.921e-05,
"loss": 0.0427,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 498518.0,
"step": 1080
},
{
"epoch": 0.3815190759537977,
"grad_norm": 0.05535457283258438,
"learning_rate": 3.911e-05,
"loss": 0.0117,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 503111.0,
"step": 1090
},
{
"epoch": 0.38501925096254813,
"grad_norm": 6.557998180389404,
"learning_rate": 3.901e-05,
"loss": 0.0805,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 507707.0,
"step": 1100
},
{
"epoch": 0.38851942597129857,
"grad_norm": 0.6564468145370483,
"learning_rate": 3.8910000000000005e-05,
"loss": 0.0841,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 512285.0,
"step": 1110
},
{
"epoch": 0.392019600980049,
"grad_norm": 8.401987075805664,
"learning_rate": 3.881e-05,
"loss": 0.0721,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 516852.0,
"step": 1120
},
{
"epoch": 0.39551977598879945,
"grad_norm": 1.693769931793213,
"learning_rate": 3.871e-05,
"loss": 0.0807,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 521413.0,
"step": 1130
},
{
"epoch": 0.3990199509975499,
"grad_norm": 3.10587739944458,
"learning_rate": 3.8610000000000005e-05,
"loss": 0.059,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 525992.0,
"step": 1140
},
{
"epoch": 0.4025201260063003,
"grad_norm": 0.17380690574645996,
"learning_rate": 3.851e-05,
"loss": 0.0357,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 530679.0,
"step": 1150
},
{
"epoch": 0.40602030101505077,
"grad_norm": 0.33141088485717773,
"learning_rate": 3.841e-05,
"loss": 0.0045,
"mean_token_accuracy": 1.0,
"num_tokens": 535218.0,
"step": 1160
},
{
"epoch": 0.4095204760238012,
"grad_norm": 0.0494840107858181,
"learning_rate": 3.8310000000000004e-05,
"loss": 0.0594,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 539817.0,
"step": 1170
},
{
"epoch": 0.41302065103255164,
"grad_norm": 0.013975823298096657,
"learning_rate": 3.821e-05,
"loss": 0.0022,
"mean_token_accuracy": 1.0,
"num_tokens": 544490.0,
"step": 1180
},
{
"epoch": 0.4165208260413021,
"grad_norm": 0.09675566107034683,
"learning_rate": 3.811e-05,
"loss": 0.0019,
"mean_token_accuracy": 1.0,
"num_tokens": 549043.0,
"step": 1190
},
{
"epoch": 0.4200210010500525,
"grad_norm": 0.00722131785005331,
"learning_rate": 3.8010000000000004e-05,
"loss": 0.0806,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 553578.0,
"step": 1200
},
{
"epoch": 0.42352117605880296,
"grad_norm": 0.022663407027721405,
"learning_rate": 3.791e-05,
"loss": 0.0042,
"mean_token_accuracy": 1.0,
"num_tokens": 558226.0,
"step": 1210
},
{
"epoch": 0.4270213510675534,
"grad_norm": 0.012322783470153809,
"learning_rate": 3.781e-05,
"loss": 0.007,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 562862.0,
"step": 1220
},
{
"epoch": 0.43052152607630384,
"grad_norm": 0.016185415908694267,
"learning_rate": 3.771e-05,
"loss": 0.0943,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 567480.0,
"step": 1230
},
{
"epoch": 0.4340217010850543,
"grad_norm": 0.0974903255701065,
"learning_rate": 3.761e-05,
"loss": 0.0773,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 572014.0,
"step": 1240
},
{
"epoch": 0.43752187609380466,
"grad_norm": 0.028429092839360237,
"learning_rate": 3.751e-05,
"loss": 0.0779,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 576638.0,
"step": 1250
},
{
"epoch": 0.4410220511025551,
"grad_norm": 2.4505090713500977,
"learning_rate": 3.741e-05,
"loss": 0.0153,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 581221.0,
"step": 1260
},
{
"epoch": 0.44452222611130554,
"grad_norm": 0.11989375203847885,
"learning_rate": 3.731e-05,
"loss": 0.0748,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 585878.0,
"step": 1270
},
{
"epoch": 0.448022401120056,
"grad_norm": 0.06575898826122284,
"learning_rate": 3.721e-05,
"loss": 0.0247,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 590474.0,
"step": 1280
},
{
"epoch": 0.4515225761288064,
"grad_norm": 17.148649215698242,
"learning_rate": 3.711e-05,
"loss": 0.0701,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 594994.0,
"step": 1290
},
{
"epoch": 0.45502275113755686,
"grad_norm": 0.022335920482873917,
"learning_rate": 3.701e-05,
"loss": 0.0447,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 599605.0,
"step": 1300
},
{
"epoch": 0.4585229261463073,
"grad_norm": 0.16378021240234375,
"learning_rate": 3.691e-05,
"loss": 0.0934,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 604331.0,
"step": 1310
},
{
"epoch": 0.46202310115505774,
"grad_norm": 4.628612995147705,
"learning_rate": 3.681e-05,
"loss": 0.1114,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 608859.0,
"step": 1320
},
{
"epoch": 0.4655232761638082,
"grad_norm": 4.558804035186768,
"learning_rate": 3.671e-05,
"loss": 0.0527,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 613472.0,
"step": 1330
},
{
"epoch": 0.4690234511725586,
"grad_norm": 7.380437850952148,
"learning_rate": 3.661e-05,
"loss": 0.0211,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 618003.0,
"step": 1340
},
{
"epoch": 0.47252362618130905,
"grad_norm": 0.054671116173267365,
"learning_rate": 3.651e-05,
"loss": 0.0896,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 622602.0,
"step": 1350
},
{
"epoch": 0.4760238011900595,
"grad_norm": 0.22701649367809296,
"learning_rate": 3.641e-05,
"loss": 0.0546,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 627312.0,
"step": 1360
},
{
"epoch": 0.47952397619880993,
"grad_norm": 9.734682083129883,
"learning_rate": 3.6309999999999996e-05,
"loss": 0.0576,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 632167.0,
"step": 1370
},
{
"epoch": 0.48302415120756037,
"grad_norm": 10.223374366760254,
"learning_rate": 3.621e-05,
"loss": 0.0569,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 636822.0,
"step": 1380
},
{
"epoch": 0.4865243262163108,
"grad_norm": 0.22119201719760895,
"learning_rate": 3.611e-05,
"loss": 0.052,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 641324.0,
"step": 1390
},
{
"epoch": 0.49002450122506125,
"grad_norm": 0.09743613004684448,
"learning_rate": 3.601e-05,
"loss": 0.0326,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 645990.0,
"step": 1400
},
{
"epoch": 0.4935246762338117,
"grad_norm": 4.46646785736084,
"learning_rate": 3.591e-05,
"loss": 0.0845,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 650642.0,
"step": 1410
},
{
"epoch": 0.4970248512425621,
"grad_norm": 0.3847590386867523,
"learning_rate": 3.581e-05,
"loss": 0.0276,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 655313.0,
"step": 1420
},
{
"epoch": 0.5005250262513126,
"grad_norm": 3.9029712677001953,
"learning_rate": 3.571e-05,
"loss": 0.0479,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 659845.0,
"step": 1430
},
{
"epoch": 0.504025201260063,
"grad_norm": 5.140905380249023,
"learning_rate": 3.5610000000000006e-05,
"loss": 0.0274,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 664526.0,
"step": 1440
},
{
"epoch": 0.5075253762688134,
"grad_norm": 0.0748833566904068,
"learning_rate": 3.5510000000000004e-05,
"loss": 0.112,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 669184.0,
"step": 1450
},
{
"epoch": 0.5110255512775639,
"grad_norm": 0.06513810157775879,
"learning_rate": 3.541e-05,
"loss": 0.0294,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 673829.0,
"step": 1460
},
{
"epoch": 0.5145257262863143,
"grad_norm": 0.14687716960906982,
"learning_rate": 3.5310000000000006e-05,
"loss": 0.039,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 678451.0,
"step": 1470
},
{
"epoch": 0.5180259012950648,
"grad_norm": 0.04928717017173767,
"learning_rate": 3.5210000000000003e-05,
"loss": 0.0881,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 682982.0,
"step": 1480
},
{
"epoch": 0.5215260763038152,
"grad_norm": 0.05730545148253441,
"learning_rate": 3.511e-05,
"loss": 0.0094,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 687510.0,
"step": 1490
},
{
"epoch": 0.5250262513125656,
"grad_norm": 0.024362344294786453,
"learning_rate": 3.5010000000000005e-05,
"loss": 0.0119,
"step": 1500
},
{
"epoch": 0.5250262513125656,
"eval_accuracy": 0.5596967782691092,
"eval_f1": 0.4608174078043728,
"eval_loss": 0.047076478600502014,
"eval_mean_token_accuracy": 0.9872474811895929,
"eval_num_tokens": 692119.0,
"eval_precision": 0.5081716761653752,
"eval_recall": 0.49738319415052024,
"eval_runtime": 244.3841,
"eval_samples_per_second": 6.478,
"eval_steps_per_second": 0.81,
"step": 1500
},
{
"epoch": 0.5285264263213161,
"grad_norm": 0.03228575736284256,
"learning_rate": 3.491e-05,
"loss": 0.0282,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 696712.0,
"step": 1510
},
{
"epoch": 0.5320266013300665,
"grad_norm": 15.16336441040039,
"learning_rate": 3.481e-05,
"loss": 0.0758,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 701390.0,
"step": 1520
},
{
"epoch": 0.535526776338817,
"grad_norm": 19.84299087524414,
"learning_rate": 3.4710000000000005e-05,
"loss": 0.0911,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 705965.0,
"step": 1530
},
{
"epoch": 0.5390269513475674,
"grad_norm": 0.051229000091552734,
"learning_rate": 3.461e-05,
"loss": 0.0982,
"mean_token_accuracy": 0.9599999904632568,
"num_tokens": 710601.0,
"step": 1540
},
{
"epoch": 0.5425271263563178,
"grad_norm": 6.5445756912231445,
"learning_rate": 3.451000000000001e-05,
"loss": 0.037,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 715211.0,
"step": 1550
},
{
"epoch": 0.5460273013650683,
"grad_norm": 4.0851874351501465,
"learning_rate": 3.4410000000000004e-05,
"loss": 0.0403,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 719818.0,
"step": 1560
},
{
"epoch": 0.5495274763738187,
"grad_norm": 0.20048797130584717,
"learning_rate": 3.431e-05,
"loss": 0.0598,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 724418.0,
"step": 1570
},
{
"epoch": 0.5530276513825692,
"grad_norm": 8.350198745727539,
"learning_rate": 3.4210000000000006e-05,
"loss": 0.0261,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 729079.0,
"step": 1580
},
{
"epoch": 0.5565278263913196,
"grad_norm": 7.64754056930542,
"learning_rate": 3.4110000000000004e-05,
"loss": 0.0205,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 733776.0,
"step": 1590
},
{
"epoch": 0.56002800140007,
"grad_norm": 11.657675743103027,
"learning_rate": 3.401e-05,
"loss": 0.0254,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 738301.0,
"step": 1600
},
{
"epoch": 0.5635281764088205,
"grad_norm": 0.044835012406110764,
"learning_rate": 3.3910000000000006e-05,
"loss": 0.0675,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 743020.0,
"step": 1610
},
{
"epoch": 0.5670283514175709,
"grad_norm": 0.08898824453353882,
"learning_rate": 3.381e-05,
"loss": 0.0438,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 747522.0,
"step": 1620
},
{
"epoch": 0.5705285264263213,
"grad_norm": 19.048906326293945,
"learning_rate": 3.371e-05,
"loss": 0.0462,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 752073.0,
"step": 1630
},
{
"epoch": 0.5740287014350718,
"grad_norm": 5.376831531524658,
"learning_rate": 3.3610000000000005e-05,
"loss": 0.0067,
"mean_token_accuracy": 1.0,
"num_tokens": 756668.0,
"step": 1640
},
{
"epoch": 0.5775288764438222,
"grad_norm": 0.003997461870312691,
"learning_rate": 3.351e-05,
"loss": 0.0093,
"mean_token_accuracy": 1.0,
"num_tokens": 761349.0,
"step": 1650
},
{
"epoch": 0.5810290514525727,
"grad_norm": 1.1141142845153809,
"learning_rate": 3.341e-05,
"loss": 0.0453,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 766087.0,
"step": 1660
},
{
"epoch": 0.5845292264613231,
"grad_norm": 0.09356174618005753,
"learning_rate": 3.3310000000000005e-05,
"loss": 0.0415,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 770676.0,
"step": 1670
},
{
"epoch": 0.5880294014700735,
"grad_norm": 16.47395133972168,
"learning_rate": 3.321e-05,
"loss": 0.0726,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 775287.0,
"step": 1680
},
{
"epoch": 0.591529576478824,
"grad_norm": 1.1543943881988525,
"learning_rate": 3.311e-05,
"loss": 0.0262,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 779907.0,
"step": 1690
},
{
"epoch": 0.5950297514875744,
"grad_norm": 0.6417059898376465,
"learning_rate": 3.3010000000000004e-05,
"loss": 0.1458,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 784527.0,
"step": 1700
},
{
"epoch": 0.5985299264963249,
"grad_norm": 0.03735469654202461,
"learning_rate": 3.291e-05,
"loss": 0.0517,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 789037.0,
"step": 1710
},
{
"epoch": 0.6020301015050753,
"grad_norm": 7.025692462921143,
"learning_rate": 3.281e-05,
"loss": 0.0912,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 793581.0,
"step": 1720
},
{
"epoch": 0.6055302765138257,
"grad_norm": 0.046697914600372314,
"learning_rate": 3.2710000000000004e-05,
"loss": 0.0516,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 798206.0,
"step": 1730
},
{
"epoch": 0.6090304515225762,
"grad_norm": 1.4056965112686157,
"learning_rate": 3.261e-05,
"loss": 0.0658,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 802791.0,
"step": 1740
},
{
"epoch": 0.6125306265313266,
"grad_norm": 15.819257736206055,
"learning_rate": 3.251e-05,
"loss": 0.0332,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 807423.0,
"step": 1750
},
{
"epoch": 0.616030801540077,
"grad_norm": 0.15242606401443481,
"learning_rate": 3.241e-05,
"loss": 0.0116,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 812014.0,
"step": 1760
},
{
"epoch": 0.6195309765488275,
"grad_norm": 0.8969595432281494,
"learning_rate": 3.231e-05,
"loss": 0.0697,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 816484.0,
"step": 1770
},
{
"epoch": 0.6230311515575779,
"grad_norm": 13.24059772491455,
"learning_rate": 3.221e-05,
"loss": 0.0475,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 821141.0,
"step": 1780
},
{
"epoch": 0.6265313265663283,
"grad_norm": 0.0862284004688263,
"learning_rate": 3.211e-05,
"loss": 0.0133,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 825825.0,
"step": 1790
},
{
"epoch": 0.6300315015750787,
"grad_norm": 6.188477993011475,
"learning_rate": 3.201e-05,
"loss": 0.0941,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 830440.0,
"step": 1800
},
{
"epoch": 0.6335316765838291,
"grad_norm": 0.047075141221284866,
"learning_rate": 3.191e-05,
"loss": 0.0152,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 835064.0,
"step": 1810
},
{
"epoch": 0.6370318515925796,
"grad_norm": 8.754451751708984,
"learning_rate": 3.181e-05,
"loss": 0.034,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 839606.0,
"step": 1820
},
{
"epoch": 0.64053202660133,
"grad_norm": 0.6907691955566406,
"learning_rate": 3.171e-05,
"loss": 0.0215,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 844205.0,
"step": 1830
},
{
"epoch": 0.6440322016100805,
"grad_norm": 0.06890915334224701,
"learning_rate": 3.1610000000000004e-05,
"loss": 0.0804,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 848857.0,
"step": 1840
},
{
"epoch": 0.6475323766188309,
"grad_norm": 0.04362496733665466,
"learning_rate": 3.151e-05,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 853505.0,
"step": 1850
},
{
"epoch": 0.6510325516275813,
"grad_norm": 0.032738834619522095,
"learning_rate": 3.141e-05,
"loss": 0.0738,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 857949.0,
"step": 1860
},
{
"epoch": 0.6545327266363318,
"grad_norm": 0.0720139741897583,
"learning_rate": 3.1310000000000003e-05,
"loss": 0.0198,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 862481.0,
"step": 1870
},
{
"epoch": 0.6580329016450822,
"grad_norm": 0.3373511731624603,
"learning_rate": 3.121e-05,
"loss": 0.0232,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 867070.0,
"step": 1880
},
{
"epoch": 0.6615330766538327,
"grad_norm": 0.03332596644759178,
"learning_rate": 3.111e-05,
"loss": 0.0255,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 871710.0,
"step": 1890
},
{
"epoch": 0.6650332516625831,
"grad_norm": 0.02673097886145115,
"learning_rate": 3.101e-05,
"loss": 0.1023,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 876374.0,
"step": 1900
},
{
"epoch": 0.6685334266713335,
"grad_norm": 29.00749969482422,
"learning_rate": 3.091e-05,
"loss": 0.0775,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 880967.0,
"step": 1910
},
{
"epoch": 0.672033601680084,
"grad_norm": 0.013920117169618607,
"learning_rate": 3.081e-05,
"loss": 0.0095,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 885539.0,
"step": 1920
},
{
"epoch": 0.6755337766888344,
"grad_norm": 0.004398212768137455,
"learning_rate": 3.071e-05,
"loss": 0.0107,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 890118.0,
"step": 1930
},
{
"epoch": 0.6790339516975848,
"grad_norm": 0.11914502084255219,
"learning_rate": 3.061e-05,
"loss": 0.0353,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 894652.0,
"step": 1940
},
{
"epoch": 0.6825341267063353,
"grad_norm": 0.06763932853937149,
"learning_rate": 3.051e-05,
"loss": 0.0155,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 899187.0,
"step": 1950
},
{
"epoch": 0.6860343017150857,
"grad_norm": 0.03659069910645485,
"learning_rate": 3.041e-05,
"loss": 0.0309,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 903727.0,
"step": 1960
},
{
"epoch": 0.6895344767238362,
"grad_norm": 5.335174083709717,
"learning_rate": 3.031e-05,
"loss": 0.1113,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 908334.0,
"step": 1970
},
{
"epoch": 0.6930346517325866,
"grad_norm": 2.4410805702209473,
"learning_rate": 3.021e-05,
"loss": 0.0128,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 913065.0,
"step": 1980
},
{
"epoch": 0.696534826741337,
"grad_norm": 0.05332425609230995,
"learning_rate": 3.0109999999999998e-05,
"loss": 0.0443,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 917618.0,
"step": 1990
},
{
"epoch": 0.7000350017500875,
"grad_norm": 0.012657753191888332,
"learning_rate": 3.001e-05,
"loss": 0.0267,
"step": 2000
},
{
"epoch": 0.7000350017500875,
"eval_accuracy": 0.5710675931775111,
"eval_f1": 0.4674227263281785,
"eval_loss": 0.052520181983709335,
"eval_mean_token_accuracy": 0.9876262681050734,
"eval_num_tokens": 922286.0,
"eval_precision": 0.5114304763470895,
"eval_recall": 0.5012130900032767,
"eval_runtime": 244.1164,
"eval_samples_per_second": 6.485,
"eval_steps_per_second": 0.811,
"step": 2000
},
{
"epoch": 0.7035351767588379,
"grad_norm": 5.11520528793335,
"learning_rate": 2.991e-05,
"loss": 0.0078,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 926980.0,
"step": 2010
},
{
"epoch": 0.7070353517675884,
"grad_norm": 0.03029199317097664,
"learning_rate": 2.9809999999999997e-05,
"loss": 0.0963,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 931724.0,
"step": 2020
},
{
"epoch": 0.7105355267763388,
"grad_norm": 0.5081428289413452,
"learning_rate": 2.971e-05,
"loss": 0.0361,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 936266.0,
"step": 2030
},
{
"epoch": 0.7140357017850892,
"grad_norm": 0.04822823032736778,
"learning_rate": 2.961e-05,
"loss": 0.1017,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 940886.0,
"step": 2040
},
{
"epoch": 0.7175358767938397,
"grad_norm": 0.2854156494140625,
"learning_rate": 2.951e-05,
"loss": 0.0186,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 945433.0,
"step": 2050
},
{
"epoch": 0.7210360518025901,
"grad_norm": 0.3434739112854004,
"learning_rate": 2.9409999999999998e-05,
"loss": 0.0587,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 949930.0,
"step": 2060
},
{
"epoch": 0.7245362268113406,
"grad_norm": 0.03626574948430061,
"learning_rate": 2.9310000000000006e-05,
"loss": 0.1057,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 954492.0,
"step": 2070
},
{
"epoch": 0.728036401820091,
"grad_norm": 11.993911743164062,
"learning_rate": 2.9210000000000003e-05,
"loss": 0.1327,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 958986.0,
"step": 2080
},
{
"epoch": 0.7315365768288414,
"grad_norm": 0.07597003877162933,
"learning_rate": 2.9110000000000004e-05,
"loss": 0.018,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 963614.0,
"step": 2090
},
{
"epoch": 0.7350367518375919,
"grad_norm": 10.063232421875,
"learning_rate": 2.9010000000000005e-05,
"loss": 0.1227,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 968140.0,
"step": 2100
},
{
"epoch": 0.7385369268463423,
"grad_norm": 0.1807040572166443,
"learning_rate": 2.8910000000000003e-05,
"loss": 0.0323,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 972835.0,
"step": 2110
},
{
"epoch": 0.7420371018550928,
"grad_norm": 0.17890332639217377,
"learning_rate": 2.8810000000000004e-05,
"loss": 0.0207,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 977445.0,
"step": 2120
},
{
"epoch": 0.7455372768638432,
"grad_norm": 9.020623207092285,
"learning_rate": 2.8710000000000005e-05,
"loss": 0.0899,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 982140.0,
"step": 2130
},
{
"epoch": 0.7490374518725936,
"grad_norm": 3.116069793701172,
"learning_rate": 2.8610000000000002e-05,
"loss": 0.0603,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 986817.0,
"step": 2140
},
{
"epoch": 0.7525376268813441,
"grad_norm": 14.557893753051758,
"learning_rate": 2.8510000000000003e-05,
"loss": 0.0326,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 991362.0,
"step": 2150
},
{
"epoch": 0.7560378018900945,
"grad_norm": 5.8639140129089355,
"learning_rate": 2.8410000000000004e-05,
"loss": 0.0533,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 995958.0,
"step": 2160
},
{
"epoch": 0.759537976898845,
"grad_norm": 0.08902487903833389,
"learning_rate": 2.8310000000000002e-05,
"loss": 0.0089,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1000561.0,
"step": 2170
},
{
"epoch": 0.7630381519075954,
"grad_norm": 0.021990323439240456,
"learning_rate": 2.8210000000000003e-05,
"loss": 0.0398,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1005102.0,
"step": 2180
},
{
"epoch": 0.7665383269163458,
"grad_norm": 0.0434272363781929,
"learning_rate": 2.8110000000000004e-05,
"loss": 0.0367,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1009721.0,
"step": 2190
},
{
"epoch": 0.7700385019250963,
"grad_norm": 7.773507595062256,
"learning_rate": 2.8010000000000005e-05,
"loss": 0.0529,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1014234.0,
"step": 2200
},
{
"epoch": 0.7735386769338467,
"grad_norm": 11.276909828186035,
"learning_rate": 2.7910000000000002e-05,
"loss": 0.0828,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 1018943.0,
"step": 2210
},
{
"epoch": 0.7770388519425971,
"grad_norm": 0.2111329585313797,
"learning_rate": 2.7810000000000003e-05,
"loss": 0.0168,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1023659.0,
"step": 2220
},
{
"epoch": 0.7805390269513476,
"grad_norm": 0.09295608103275299,
"learning_rate": 2.7710000000000004e-05,
"loss": 0.0426,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1028221.0,
"step": 2230
},
{
"epoch": 0.784039201960098,
"grad_norm": 0.05695830285549164,
"learning_rate": 2.761e-05,
"loss": 0.0321,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1032825.0,
"step": 2240
},
{
"epoch": 0.7875393769688485,
"grad_norm": 0.03428833931684494,
"learning_rate": 2.7510000000000003e-05,
"loss": 0.1096,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1037389.0,
"step": 2250
},
{
"epoch": 0.7910395519775989,
"grad_norm": 0.052995167672634125,
"learning_rate": 2.7410000000000004e-05,
"loss": 0.0808,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1041941.0,
"step": 2260
},
{
"epoch": 0.7945397269863493,
"grad_norm": 0.1979517787694931,
"learning_rate": 2.731e-05,
"loss": 0.0145,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1046451.0,
"step": 2270
},
{
"epoch": 0.7980399019950998,
"grad_norm": 0.024557696655392647,
"learning_rate": 2.7210000000000002e-05,
"loss": 0.0534,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1051071.0,
"step": 2280
},
{
"epoch": 0.8015400770038502,
"grad_norm": 7.660386085510254,
"learning_rate": 2.7110000000000003e-05,
"loss": 0.1042,
"mean_token_accuracy": 0.9649999916553498,
"num_tokens": 1055716.0,
"step": 2290
},
{
"epoch": 0.8050402520126007,
"grad_norm": 3.119615316390991,
"learning_rate": 2.701e-05,
"loss": 0.061,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1060266.0,
"step": 2300
},
{
"epoch": 0.8085404270213511,
"grad_norm": 6.7030158042907715,
"learning_rate": 2.691e-05,
"loss": 0.0265,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1064858.0,
"step": 2310
},
{
"epoch": 0.8120406020301015,
"grad_norm": 0.08051805198192596,
"learning_rate": 2.6810000000000003e-05,
"loss": 0.0091,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1069403.0,
"step": 2320
},
{
"epoch": 0.815540777038852,
"grad_norm": 0.08621969074010849,
"learning_rate": 2.671e-05,
"loss": 0.0043,
"mean_token_accuracy": 1.0,
"num_tokens": 1074041.0,
"step": 2330
},
{
"epoch": 0.8190409520476024,
"grad_norm": 7.230138778686523,
"learning_rate": 2.661e-05,
"loss": 0.0891,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1078603.0,
"step": 2340
},
{
"epoch": 0.8225411270563528,
"grad_norm": 1.925933837890625,
"learning_rate": 2.6510000000000002e-05,
"loss": 0.1072,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1083208.0,
"step": 2350
},
{
"epoch": 0.8260413020651033,
"grad_norm": 0.06855742633342743,
"learning_rate": 2.6410000000000003e-05,
"loss": 0.0987,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1087939.0,
"step": 2360
},
{
"epoch": 0.8295414770738537,
"grad_norm": 4.232824802398682,
"learning_rate": 2.631e-05,
"loss": 0.0537,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1092618.0,
"step": 2370
},
{
"epoch": 0.8330416520826042,
"grad_norm": 0.054919663816690445,
"learning_rate": 2.621e-05,
"loss": 0.0309,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1097232.0,
"step": 2380
},
{
"epoch": 0.8365418270913546,
"grad_norm": 8.129829406738281,
"learning_rate": 2.6110000000000002e-05,
"loss": 0.0761,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1101883.0,
"step": 2390
},
{
"epoch": 0.840042002100105,
"grad_norm": 7.850025653839111,
"learning_rate": 2.601e-05,
"loss": 0.0357,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1106608.0,
"step": 2400
},
{
"epoch": 0.8435421771088555,
"grad_norm": 0.504169762134552,
"learning_rate": 2.591e-05,
"loss": 0.0419,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1111214.0,
"step": 2410
},
{
"epoch": 0.8470423521176059,
"grad_norm": 0.02623009867966175,
"learning_rate": 2.5810000000000002e-05,
"loss": 0.0353,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1115761.0,
"step": 2420
},
{
"epoch": 0.8505425271263564,
"grad_norm": 0.2593607008457184,
"learning_rate": 2.571e-05,
"loss": 0.0757,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1120497.0,
"step": 2430
},
{
"epoch": 0.8540427021351068,
"grad_norm": 0.09586932510137558,
"learning_rate": 2.561e-05,
"loss": 0.0709,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1125050.0,
"step": 2440
},
{
"epoch": 0.8575428771438572,
"grad_norm": 0.03755811229348183,
"learning_rate": 2.551e-05,
"loss": 0.0462,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1129583.0,
"step": 2450
},
{
"epoch": 0.8610430521526077,
"grad_norm": 0.01429970283061266,
"learning_rate": 2.541e-05,
"loss": 0.0176,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1134231.0,
"step": 2460
},
{
"epoch": 0.8645432271613581,
"grad_norm": 0.1092047318816185,
"learning_rate": 2.531e-05,
"loss": 0.0348,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1138841.0,
"step": 2470
},
{
"epoch": 0.8680434021701086,
"grad_norm": 0.04420563951134682,
"learning_rate": 2.521e-05,
"loss": 0.0602,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1143627.0,
"step": 2480
},
{
"epoch": 0.871543577178859,
"grad_norm": 0.056809134781360626,
"learning_rate": 2.5110000000000002e-05,
"loss": 0.0113,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1148140.0,
"step": 2490
},
{
"epoch": 0.8750437521876093,
"grad_norm": 0.39837557077407837,
"learning_rate": 2.501e-05,
"loss": 0.0467,
"step": 2500
},
{
"epoch": 0.8750437521876093,
"eval_accuracy": 0.44535691724573595,
"eval_f1": 0.3790299221602013,
"eval_loss": 0.05089215189218521,
"eval_mean_token_accuracy": 0.9880050568267552,
"eval_num_tokens": 1152749.0,
"eval_precision": 0.454011773226288,
"eval_recall": 0.4125268353595752,
"eval_runtime": 244.5484,
"eval_samples_per_second": 6.473,
"eval_steps_per_second": 0.81,
"step": 2500
},
{
"epoch": 0.8785439271963598,
"grad_norm": 0.028136901557445526,
"learning_rate": 2.491e-05,
"loss": 0.016,
"mean_token_accuracy": 0.9874999970197678,
"num_tokens": 1157289.0,
"step": 2510
},
{
"epoch": 0.8820441022051102,
"grad_norm": 8.768643379211426,
"learning_rate": 2.481e-05,
"loss": 0.0547,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1161827.0,
"step": 2520
},
{
"epoch": 0.8855442772138606,
"grad_norm": 4.318042755126953,
"learning_rate": 2.471e-05,
"loss": 0.0548,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1166441.0,
"step": 2530
},
{
"epoch": 0.8890444522226111,
"grad_norm": 0.032455261796712875,
"learning_rate": 2.4610000000000003e-05,
"loss": 0.0342,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1171123.0,
"step": 2540
},
{
"epoch": 0.8925446272313615,
"grad_norm": 1.8352007865905762,
"learning_rate": 2.451e-05,
"loss": 0.0399,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1175686.0,
"step": 2550
},
{
"epoch": 0.896044802240112,
"grad_norm": 0.11914759129285812,
"learning_rate": 2.4410000000000002e-05,
"loss": 0.089,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1180208.0,
"step": 2560
},
{
"epoch": 0.8995449772488624,
"grad_norm": 0.09686534851789474,
"learning_rate": 2.4310000000000003e-05,
"loss": 0.0889,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1184783.0,
"step": 2570
},
{
"epoch": 0.9030451522576128,
"grad_norm": 0.06705299764871597,
"learning_rate": 2.4210000000000004e-05,
"loss": 0.0801,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1189400.0,
"step": 2580
},
{
"epoch": 0.9065453272663633,
"grad_norm": 0.04434126242995262,
"learning_rate": 2.411e-05,
"loss": 0.0036,
"mean_token_accuracy": 1.0,
"num_tokens": 1194035.0,
"step": 2590
},
{
"epoch": 0.9100455022751137,
"grad_norm": 0.03630208596587181,
"learning_rate": 2.4010000000000002e-05,
"loss": 0.0326,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1198629.0,
"step": 2600
},
{
"epoch": 0.9135456772838642,
"grad_norm": 0.031141789630055428,
"learning_rate": 2.3910000000000003e-05,
"loss": 0.0436,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1203254.0,
"step": 2610
},
{
"epoch": 0.9170458522926146,
"grad_norm": 0.33005213737487793,
"learning_rate": 2.381e-05,
"loss": 0.066,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1207907.0,
"step": 2620
},
{
"epoch": 0.920546027301365,
"grad_norm": 8.107050895690918,
"learning_rate": 2.371e-05,
"loss": 0.0418,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1212537.0,
"step": 2630
},
{
"epoch": 0.9240462023101155,
"grad_norm": 0.28287169337272644,
"learning_rate": 2.3610000000000003e-05,
"loss": 0.0683,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1217120.0,
"step": 2640
},
{
"epoch": 0.9275463773188659,
"grad_norm": 0.03412451222538948,
"learning_rate": 2.351e-05,
"loss": 0.0101,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1221671.0,
"step": 2650
},
{
"epoch": 0.9310465523276164,
"grad_norm": 6.6741814613342285,
"learning_rate": 2.341e-05,
"loss": 0.085,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1226356.0,
"step": 2660
},
{
"epoch": 0.9345467273363668,
"grad_norm": 4.757784366607666,
"learning_rate": 2.3310000000000002e-05,
"loss": 0.1041,
"mean_token_accuracy": 0.9649999916553498,
"num_tokens": 1230966.0,
"step": 2670
},
{
"epoch": 0.9380469023451172,
"grad_norm": 0.1483946591615677,
"learning_rate": 2.321e-05,
"loss": 0.0043,
"mean_token_accuracy": 1.0,
"num_tokens": 1235570.0,
"step": 2680
},
{
"epoch": 0.9415470773538677,
"grad_norm": 4.1783976554870605,
"learning_rate": 2.311e-05,
"loss": 0.0297,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1240120.0,
"step": 2690
},
{
"epoch": 0.9450472523626181,
"grad_norm": 0.006953865755349398,
"learning_rate": 2.301e-05,
"loss": 0.0036,
"mean_token_accuracy": 1.0,
"num_tokens": 1244744.0,
"step": 2700
},
{
"epoch": 0.9485474273713685,
"grad_norm": 7.843381881713867,
"learning_rate": 2.2910000000000003e-05,
"loss": 0.0747,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1249304.0,
"step": 2710
},
{
"epoch": 0.952047602380119,
"grad_norm": 2.691250801086426,
"learning_rate": 2.281e-05,
"loss": 0.0341,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1253884.0,
"step": 2720
},
{
"epoch": 0.9555477773888694,
"grad_norm": 0.048404838889837265,
"learning_rate": 2.271e-05,
"loss": 0.0236,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1258643.0,
"step": 2730
},
{
"epoch": 0.9590479523976199,
"grad_norm": 0.10087752342224121,
"learning_rate": 2.2610000000000002e-05,
"loss": 0.0454,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1263197.0,
"step": 2740
},
{
"epoch": 0.9625481274063703,
"grad_norm": 0.66507887840271,
"learning_rate": 2.251e-05,
"loss": 0.0573,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1267826.0,
"step": 2750
},
{
"epoch": 0.9660483024151207,
"grad_norm": 0.0337546281516552,
"learning_rate": 2.241e-05,
"loss": 0.0534,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1272318.0,
"step": 2760
},
{
"epoch": 0.9695484774238712,
"grad_norm": 0.022819435223937035,
"learning_rate": 2.231e-05,
"loss": 0.0262,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1276943.0,
"step": 2770
},
{
"epoch": 0.9730486524326216,
"grad_norm": 0.023641835898160934,
"learning_rate": 2.221e-05,
"loss": 0.0152,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1281549.0,
"step": 2780
},
{
"epoch": 0.9765488274413721,
"grad_norm": 3.202338695526123,
"learning_rate": 2.211e-05,
"loss": 0.0755,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1286053.0,
"step": 2790
},
{
"epoch": 0.9800490024501225,
"grad_norm": 0.052319396287202835,
"learning_rate": 2.201e-05,
"loss": 0.0376,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1290719.0,
"step": 2800
},
{
"epoch": 0.9835491774588729,
"grad_norm": 10.303972244262695,
"learning_rate": 2.191e-05,
"loss": 0.0052,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1295220.0,
"step": 2810
},
{
"epoch": 0.9870493524676234,
"grad_norm": 0.11650484800338745,
"learning_rate": 2.181e-05,
"loss": 0.0036,
"mean_token_accuracy": 1.0,
"num_tokens": 1299736.0,
"step": 2820
},
{
"epoch": 0.9905495274763738,
"grad_norm": 0.016379429027438164,
"learning_rate": 2.171e-05,
"loss": 0.0403,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1304307.0,
"step": 2830
},
{
"epoch": 0.9940497024851243,
"grad_norm": 0.07140190899372101,
"learning_rate": 2.1609999999999998e-05,
"loss": 0.0358,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1308761.0,
"step": 2840
},
{
"epoch": 0.9975498774938747,
"grad_norm": 0.3014475405216217,
"learning_rate": 2.1510000000000002e-05,
"loss": 0.0319,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1313336.0,
"step": 2850
},
{
"epoch": 1.0010500525026251,
"grad_norm": 0.06447609513998032,
"learning_rate": 2.1410000000000003e-05,
"loss": 0.0348,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1317756.0,
"step": 2860
},
{
"epoch": 1.0045502275113756,
"grad_norm": 0.02268841676414013,
"learning_rate": 2.131e-05,
"loss": 0.0218,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1322331.0,
"step": 2870
},
{
"epoch": 1.008050402520126,
"grad_norm": 0.042231637984514236,
"learning_rate": 2.1210000000000002e-05,
"loss": 0.0023,
"mean_token_accuracy": 1.0,
"num_tokens": 1326941.0,
"step": 2880
},
{
"epoch": 1.0115505775288764,
"grad_norm": 5.811006546020508,
"learning_rate": 2.1110000000000003e-05,
"loss": 0.0024,
"mean_token_accuracy": 1.0,
"num_tokens": 1331630.0,
"step": 2890
},
{
"epoch": 1.0150507525376269,
"grad_norm": 0.0456203818321228,
"learning_rate": 2.101e-05,
"loss": 0.0011,
"mean_token_accuracy": 1.0,
"num_tokens": 1336316.0,
"step": 2900
},
{
"epoch": 1.0185509275463773,
"grad_norm": 0.2723388671875,
"learning_rate": 2.091e-05,
"loss": 0.0114,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1341005.0,
"step": 2910
},
{
"epoch": 1.0220511025551278,
"grad_norm": 0.0164664164185524,
"learning_rate": 2.0810000000000002e-05,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 1345651.0,
"step": 2920
},
{
"epoch": 1.0255512775638782,
"grad_norm": 0.38593819737434387,
"learning_rate": 2.0710000000000003e-05,
"loss": 0.0395,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1350333.0,
"step": 2930
},
{
"epoch": 1.0290514525726286,
"grad_norm": 17.3580379486084,
"learning_rate": 2.061e-05,
"loss": 0.103,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1354894.0,
"step": 2940
},
{
"epoch": 1.032551627581379,
"grad_norm": 0.019541358575224876,
"learning_rate": 2.0510000000000002e-05,
"loss": 0.0496,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1359645.0,
"step": 2950
},
{
"epoch": 1.0360518025901295,
"grad_norm": 1.1783517599105835,
"learning_rate": 2.0410000000000003e-05,
"loss": 0.06,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1364197.0,
"step": 2960
},
{
"epoch": 1.03955197759888,
"grad_norm": 0.03887060657143593,
"learning_rate": 2.031e-05,
"loss": 0.0118,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1368857.0,
"step": 2970
},
{
"epoch": 1.0430521526076304,
"grad_norm": 0.11780918389558792,
"learning_rate": 2.021e-05,
"loss": 0.0336,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1373483.0,
"step": 2980
},
{
"epoch": 1.0465523276163808,
"grad_norm": 0.06139334291219711,
"learning_rate": 2.0110000000000002e-05,
"loss": 0.0924,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1378101.0,
"step": 2990
},
{
"epoch": 1.0500525026251313,
"grad_norm": 0.0635937973856926,
"learning_rate": 2.001e-05,
"loss": 0.0472,
"step": 3000
},
{
"epoch": 1.0500525026251313,
"eval_accuracy": 0.44662034112444726,
"eval_f1": 0.3874674882536518,
"eval_loss": 0.041092198342084885,
"eval_mean_token_accuracy": 0.9890151577766495,
"eval_num_tokens": 1382635.0,
"eval_precision": 0.4642550079051411,
"eval_recall": 0.4463958399837153,
"eval_runtime": 243.8353,
"eval_samples_per_second": 6.492,
"eval_steps_per_second": 0.812,
"step": 3000
},
{
"epoch": 1.0535526776338817,
"grad_norm": 0.05304880812764168,
"learning_rate": 1.991e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1387247.0,
"step": 3010
},
{
"epoch": 1.0570528526426322,
"grad_norm": 0.09396978467702866,
"learning_rate": 1.9810000000000002e-05,
"loss": 0.03,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1391905.0,
"step": 3020
},
{
"epoch": 1.0605530276513826,
"grad_norm": 0.02283914014697075,
"learning_rate": 1.971e-05,
"loss": 0.0412,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1396551.0,
"step": 3030
},
{
"epoch": 1.064053202660133,
"grad_norm": 0.14994072914123535,
"learning_rate": 1.961e-05,
"loss": 0.0387,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1401184.0,
"step": 3040
},
{
"epoch": 1.0675533776688835,
"grad_norm": 0.04891595244407654,
"learning_rate": 1.951e-05,
"loss": 0.04,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1405794.0,
"step": 3050
},
{
"epoch": 1.071053552677634,
"grad_norm": 8.5429105758667,
"learning_rate": 1.941e-05,
"loss": 0.0338,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1410352.0,
"step": 3060
},
{
"epoch": 1.0745537276863844,
"grad_norm": 6.096926212310791,
"learning_rate": 1.931e-05,
"loss": 0.0823,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1414922.0,
"step": 3070
},
{
"epoch": 1.0780539026951348,
"grad_norm": 0.01014970988035202,
"learning_rate": 1.921e-05,
"loss": 0.0596,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1419518.0,
"step": 3080
},
{
"epoch": 1.0815540777038852,
"grad_norm": 8.101456642150879,
"learning_rate": 1.911e-05,
"loss": 0.0309,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1424063.0,
"step": 3090
},
{
"epoch": 1.0850542527126357,
"grad_norm": 0.03248458355665207,
"learning_rate": 1.901e-05,
"loss": 0.0081,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1428657.0,
"step": 3100
},
{
"epoch": 1.088554427721386,
"grad_norm": 0.284970760345459,
"learning_rate": 1.891e-05,
"loss": 0.0035,
"mean_token_accuracy": 1.0,
"num_tokens": 1433340.0,
"step": 3110
},
{
"epoch": 1.0920546027301365,
"grad_norm": 5.219287872314453,
"learning_rate": 1.881e-05,
"loss": 0.0339,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1437947.0,
"step": 3120
},
{
"epoch": 1.095554777738887,
"grad_norm": 0.021635359153151512,
"learning_rate": 1.871e-05,
"loss": 0.0533,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1442589.0,
"step": 3130
},
{
"epoch": 1.0990549527476374,
"grad_norm": 0.05187542736530304,
"learning_rate": 1.861e-05,
"loss": 0.0297,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1447150.0,
"step": 3140
},
{
"epoch": 1.1025551277563879,
"grad_norm": 0.05536261200904846,
"learning_rate": 1.851e-05,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 1451702.0,
"step": 3150
},
{
"epoch": 1.1060553027651383,
"grad_norm": 4.796628475189209,
"learning_rate": 1.841e-05,
"loss": 0.0073,
"mean_token_accuracy": 1.0,
"num_tokens": 1456224.0,
"step": 3160
},
{
"epoch": 1.1095554777738887,
"grad_norm": 4.390865325927734,
"learning_rate": 1.8310000000000003e-05,
"loss": 0.0532,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1460755.0,
"step": 3170
},
{
"epoch": 1.1130556527826392,
"grad_norm": 0.12759913504123688,
"learning_rate": 1.8210000000000004e-05,
"loss": 0.0183,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1465282.0,
"step": 3180
},
{
"epoch": 1.1165558277913896,
"grad_norm": 0.023097023367881775,
"learning_rate": 1.811e-05,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 1469858.0,
"step": 3190
},
{
"epoch": 1.12005600280014,
"grad_norm": 0.013977882452309132,
"learning_rate": 1.8010000000000002e-05,
"loss": 0.0598,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1474377.0,
"step": 3200
},
{
"epoch": 1.1235561778088905,
"grad_norm": 0.03361167758703232,
"learning_rate": 1.7910000000000003e-05,
"loss": 0.0238,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1478946.0,
"step": 3210
},
{
"epoch": 1.127056352817641,
"grad_norm": 0.08658773452043533,
"learning_rate": 1.781e-05,
"loss": 0.0542,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1483522.0,
"step": 3220
},
{
"epoch": 1.1305565278263914,
"grad_norm": 0.030420592054724693,
"learning_rate": 1.771e-05,
"loss": 0.0509,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1488282.0,
"step": 3230
},
{
"epoch": 1.1340567028351418,
"grad_norm": 0.10281772166490555,
"learning_rate": 1.7610000000000002e-05,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 1492851.0,
"step": 3240
},
{
"epoch": 1.1375568778438923,
"grad_norm": 0.061314165592193604,
"learning_rate": 1.751e-05,
"loss": 0.0397,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1497490.0,
"step": 3250
},
{
"epoch": 1.1410570528526427,
"grad_norm": 0.05558156967163086,
"learning_rate": 1.741e-05,
"loss": 0.0341,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1502033.0,
"step": 3260
},
{
"epoch": 1.1445572278613931,
"grad_norm": 0.7785694003105164,
"learning_rate": 1.7310000000000002e-05,
"loss": 0.0501,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1506574.0,
"step": 3270
},
{
"epoch": 1.1480574028701436,
"grad_norm": 1.054373025894165,
"learning_rate": 1.721e-05,
"loss": 0.0073,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1511166.0,
"step": 3280
},
{
"epoch": 1.151557577878894,
"grad_norm": 13.361648559570312,
"learning_rate": 1.711e-05,
"loss": 0.0203,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1515897.0,
"step": 3290
},
{
"epoch": 1.1550577528876445,
"grad_norm": 9.834617614746094,
"learning_rate": 1.701e-05,
"loss": 0.0487,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1520628.0,
"step": 3300
},
{
"epoch": 1.158557927896395,
"grad_norm": 0.03448121249675751,
"learning_rate": 1.6910000000000002e-05,
"loss": 0.0257,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1525210.0,
"step": 3310
},
{
"epoch": 1.1620581029051453,
"grad_norm": 0.11401532590389252,
"learning_rate": 1.681e-05,
"loss": 0.0379,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1529651.0,
"step": 3320
},
{
"epoch": 1.1655582779138958,
"grad_norm": 3.9310457706451416,
"learning_rate": 1.671e-05,
"loss": 0.0212,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1534286.0,
"step": 3330
},
{
"epoch": 1.1690584529226462,
"grad_norm": 0.012804349884390831,
"learning_rate": 1.6610000000000002e-05,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 1538944.0,
"step": 3340
},
{
"epoch": 1.1725586279313966,
"grad_norm": 0.7828325033187866,
"learning_rate": 1.651e-05,
"loss": 0.0346,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1543591.0,
"step": 3350
},
{
"epoch": 1.176058802940147,
"grad_norm": 0.027147287502884865,
"learning_rate": 1.641e-05,
"loss": 0.0021,
"mean_token_accuracy": 1.0,
"num_tokens": 1548149.0,
"step": 3360
},
{
"epoch": 1.1795589779488975,
"grad_norm": 0.05930430442094803,
"learning_rate": 1.631e-05,
"loss": 0.0335,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1552807.0,
"step": 3370
},
{
"epoch": 1.183059152957648,
"grad_norm": 0.03868912532925606,
"learning_rate": 1.621e-05,
"loss": 0.0871,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1557375.0,
"step": 3380
},
{
"epoch": 1.1865593279663984,
"grad_norm": 0.038131892681121826,
"learning_rate": 1.611e-05,
"loss": 0.0129,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1561984.0,
"step": 3390
},
{
"epoch": 1.1900595029751488,
"grad_norm": 0.3329053521156311,
"learning_rate": 1.601e-05,
"loss": 0.0173,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1566645.0,
"step": 3400
},
{
"epoch": 1.1935596779838993,
"grad_norm": 0.08648809045553207,
"learning_rate": 1.591e-05,
"loss": 0.0916,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1571275.0,
"step": 3410
},
{
"epoch": 1.1970598529926497,
"grad_norm": 9.256734848022461,
"learning_rate": 1.581e-05,
"loss": 0.0156,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1575848.0,
"step": 3420
},
{
"epoch": 1.2005600280014002,
"grad_norm": 0.20919840037822723,
"learning_rate": 1.571e-05,
"loss": 0.028,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1580383.0,
"step": 3430
},
{
"epoch": 1.2040602030101506,
"grad_norm": 8.012375831604004,
"learning_rate": 1.561e-05,
"loss": 0.0042,
"mean_token_accuracy": 1.0,
"num_tokens": 1584986.0,
"step": 3440
},
{
"epoch": 1.207560378018901,
"grad_norm": 0.024143755435943604,
"learning_rate": 1.551e-05,
"loss": 0.0021,
"mean_token_accuracy": 1.0,
"num_tokens": 1589541.0,
"step": 3450
},
{
"epoch": 1.2110605530276515,
"grad_norm": 3.953441619873047,
"learning_rate": 1.541e-05,
"loss": 0.043,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1594177.0,
"step": 3460
},
{
"epoch": 1.214560728036402,
"grad_norm": 3.8087575435638428,
"learning_rate": 1.531e-05,
"loss": 0.0538,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1598836.0,
"step": 3470
},
{
"epoch": 1.2180609030451524,
"grad_norm": 0.01786259561777115,
"learning_rate": 1.5210000000000002e-05,
"loss": 0.0583,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1603561.0,
"step": 3480
},
{
"epoch": 1.2215610780539028,
"grad_norm": 0.026305489242076874,
"learning_rate": 1.5110000000000003e-05,
"loss": 0.0121,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1608176.0,
"step": 3490
},
{
"epoch": 1.2250612530626532,
"grad_norm": 0.026074456050992012,
"learning_rate": 1.5010000000000002e-05,
"loss": 0.0616,
"step": 3500
},
{
"epoch": 1.2250612530626532,
"eval_accuracy": 0.4403032217308907,
"eval_f1": 0.3717722608671718,
"eval_loss": 0.04926339536905289,
"eval_mean_token_accuracy": 0.9881313193326045,
"eval_num_tokens": 1612812.0,
"eval_precision": 0.44875078714612676,
"eval_recall": 0.39254694591331135,
"eval_runtime": 245.1895,
"eval_samples_per_second": 6.456,
"eval_steps_per_second": 0.808,
"step": 3500
},
{
"epoch": 1.2285614280714037,
"grad_norm": 0.0484970398247242,
"learning_rate": 1.4910000000000001e-05,
"loss": 0.0593,
"mean_token_accuracy": 0.9874999970197678,
"num_tokens": 1617379.0,
"step": 3510
},
{
"epoch": 1.232061603080154,
"grad_norm": 12.298089981079102,
"learning_rate": 1.4810000000000002e-05,
"loss": 0.0659,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1622039.0,
"step": 3520
},
{
"epoch": 1.2355617780889045,
"grad_norm": 0.022822504863142967,
"learning_rate": 1.4710000000000001e-05,
"loss": 0.0392,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1626629.0,
"step": 3530
},
{
"epoch": 1.239061953097655,
"grad_norm": 0.19993631541728973,
"learning_rate": 1.461e-05,
"loss": 0.003,
"mean_token_accuracy": 1.0,
"num_tokens": 1631184.0,
"step": 3540
},
{
"epoch": 1.2425621281064054,
"grad_norm": 0.01650061272084713,
"learning_rate": 1.4510000000000002e-05,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 1635787.0,
"step": 3550
},
{
"epoch": 1.2460623031151559,
"grad_norm": 0.09447409212589264,
"learning_rate": 1.4410000000000001e-05,
"loss": 0.0111,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1640489.0,
"step": 3560
},
{
"epoch": 1.2495624781239063,
"grad_norm": 1.348664402961731,
"learning_rate": 1.4310000000000002e-05,
"loss": 0.0037,
"mean_token_accuracy": 1.0,
"num_tokens": 1645122.0,
"step": 3570
},
{
"epoch": 1.2530626531326567,
"grad_norm": 0.02807781472802162,
"learning_rate": 1.4210000000000001e-05,
"loss": 0.0101,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1649714.0,
"step": 3580
},
{
"epoch": 1.2565628281414072,
"grad_norm": 0.0278321523219347,
"learning_rate": 1.411e-05,
"loss": 0.0317,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1654253.0,
"step": 3590
},
{
"epoch": 1.2600630031501576,
"grad_norm": 0.05552316829562187,
"learning_rate": 1.4010000000000001e-05,
"loss": 0.0338,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1658870.0,
"step": 3600
},
{
"epoch": 1.263563178158908,
"grad_norm": 0.5879592895507812,
"learning_rate": 1.391e-05,
"loss": 0.0768,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1663540.0,
"step": 3610
},
{
"epoch": 1.2670633531676585,
"grad_norm": 0.23051026463508606,
"learning_rate": 1.381e-05,
"loss": 0.0366,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1668164.0,
"step": 3620
},
{
"epoch": 1.270563528176409,
"grad_norm": 7.013516426086426,
"learning_rate": 1.3710000000000001e-05,
"loss": 0.0831,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1672778.0,
"step": 3630
},
{
"epoch": 1.2740637031851594,
"grad_norm": 0.20101211965084076,
"learning_rate": 1.361e-05,
"loss": 0.0336,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1677340.0,
"step": 3640
},
{
"epoch": 1.2775638781939098,
"grad_norm": 0.3626852035522461,
"learning_rate": 1.3510000000000001e-05,
"loss": 0.043,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1681942.0,
"step": 3650
},
{
"epoch": 1.2810640532026603,
"grad_norm": 0.053018514066934586,
"learning_rate": 1.341e-05,
"loss": 0.045,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1686613.0,
"step": 3660
},
{
"epoch": 1.2845642282114107,
"grad_norm": 7.274749755859375,
"learning_rate": 1.331e-05,
"loss": 0.1322,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1691194.0,
"step": 3670
},
{
"epoch": 1.2880644032201611,
"grad_norm": 0.05607298016548157,
"learning_rate": 1.321e-05,
"loss": 0.0384,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1695910.0,
"step": 3680
},
{
"epoch": 1.2915645782289116,
"grad_norm": 0.03872371464967728,
"learning_rate": 1.311e-05,
"loss": 0.0138,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1700440.0,
"step": 3690
},
{
"epoch": 1.295064753237662,
"grad_norm": 0.042605865746736526,
"learning_rate": 1.301e-05,
"loss": 0.0845,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1705135.0,
"step": 3700
},
{
"epoch": 1.2985649282464125,
"grad_norm": 4.082870006561279,
"learning_rate": 1.291e-05,
"loss": 0.0633,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1709718.0,
"step": 3710
},
{
"epoch": 1.302065103255163,
"grad_norm": 5.0214691162109375,
"learning_rate": 1.281e-05,
"loss": 0.0522,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1714292.0,
"step": 3720
},
{
"epoch": 1.3055652782639133,
"grad_norm": 0.05840720981359482,
"learning_rate": 1.271e-05,
"loss": 0.0667,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1718744.0,
"step": 3730
},
{
"epoch": 1.3090654532726638,
"grad_norm": 0.3673993647098541,
"learning_rate": 1.261e-05,
"loss": 0.0463,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1723373.0,
"step": 3740
},
{
"epoch": 1.312565628281414,
"grad_norm": 0.06860412657260895,
"learning_rate": 1.2509999999999999e-05,
"loss": 0.0414,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1727984.0,
"step": 3750
},
{
"epoch": 1.3160658032901644,
"grad_norm": 0.03777327015995979,
"learning_rate": 1.2410000000000001e-05,
"loss": 0.0116,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1732531.0,
"step": 3760
},
{
"epoch": 1.3195659782989149,
"grad_norm": 0.7017369270324707,
"learning_rate": 1.231e-05,
"loss": 0.0685,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1737147.0,
"step": 3770
},
{
"epoch": 1.3230661533076653,
"grad_norm": 8.006946563720703,
"learning_rate": 1.221e-05,
"loss": 0.0483,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1741794.0,
"step": 3780
},
{
"epoch": 1.3265663283164157,
"grad_norm": 7.42986536026001,
"learning_rate": 1.2110000000000001e-05,
"loss": 0.0691,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1746457.0,
"step": 3790
},
{
"epoch": 1.3300665033251662,
"grad_norm": 0.08513722568750381,
"learning_rate": 1.201e-05,
"loss": 0.0096,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1751031.0,
"step": 3800
},
{
"epoch": 1.3335666783339166,
"grad_norm": 5.149372577667236,
"learning_rate": 1.1910000000000001e-05,
"loss": 0.1008,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1755649.0,
"step": 3810
},
{
"epoch": 1.337066853342667,
"grad_norm": 0.10420811176300049,
"learning_rate": 1.181e-05,
"loss": 0.0901,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1760094.0,
"step": 3820
},
{
"epoch": 1.3405670283514175,
"grad_norm": 0.15396250784397125,
"learning_rate": 1.171e-05,
"loss": 0.006,
"mean_token_accuracy": 1.0,
"num_tokens": 1764739.0,
"step": 3830
},
{
"epoch": 1.344067203360168,
"grad_norm": 0.08703949302434921,
"learning_rate": 1.161e-05,
"loss": 0.0277,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1769426.0,
"step": 3840
},
{
"epoch": 1.3475673783689184,
"grad_norm": 2.2800724506378174,
"learning_rate": 1.151e-05,
"loss": 0.0215,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1774042.0,
"step": 3850
},
{
"epoch": 1.3510675533776688,
"grad_norm": 7.864820957183838,
"learning_rate": 1.141e-05,
"loss": 0.0396,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1778764.0,
"step": 3860
},
{
"epoch": 1.3545677283864193,
"grad_norm": 18.59937286376953,
"learning_rate": 1.1310000000000002e-05,
"loss": 0.0501,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1783410.0,
"step": 3870
},
{
"epoch": 1.3580679033951697,
"grad_norm": 0.06370130181312561,
"learning_rate": 1.1210000000000001e-05,
"loss": 0.0804,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1788086.0,
"step": 3880
},
{
"epoch": 1.3615680784039201,
"grad_norm": 3.136486053466797,
"learning_rate": 1.111e-05,
"loss": 0.0549,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1792576.0,
"step": 3890
},
{
"epoch": 1.3650682534126706,
"grad_norm": 0.080386683344841,
"learning_rate": 1.1010000000000001e-05,
"loss": 0.0105,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1797137.0,
"step": 3900
},
{
"epoch": 1.368568428421421,
"grad_norm": 0.311697393655777,
"learning_rate": 1.091e-05,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 1801751.0,
"step": 3910
},
{
"epoch": 1.3720686034301715,
"grad_norm": 0.04613969102501869,
"learning_rate": 1.081e-05,
"loss": 0.0236,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1806217.0,
"step": 3920
},
{
"epoch": 1.375568778438922,
"grad_norm": 2.0834603309631348,
"learning_rate": 1.071e-05,
"loss": 0.0064,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1810797.0,
"step": 3930
},
{
"epoch": 1.3790689534476723,
"grad_norm": 4.339105129241943,
"learning_rate": 1.061e-05,
"loss": 0.0572,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1815366.0,
"step": 3940
},
{
"epoch": 1.3825691284564228,
"grad_norm": 0.03018569014966488,
"learning_rate": 1.0510000000000001e-05,
"loss": 0.0121,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1819910.0,
"step": 3950
},
{
"epoch": 1.3860693034651732,
"grad_norm": 0.02608495019376278,
"learning_rate": 1.041e-05,
"loss": 0.0203,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1824620.0,
"step": 3960
},
{
"epoch": 1.3895694784739236,
"grad_norm": 0.028722476214170456,
"learning_rate": 1.031e-05,
"loss": 0.0524,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1829297.0,
"step": 3970
},
{
"epoch": 1.393069653482674,
"grad_norm": 3.794125556945801,
"learning_rate": 1.021e-05,
"loss": 0.1004,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1833884.0,
"step": 3980
},
{
"epoch": 1.3965698284914245,
"grad_norm": 0.0639004036784172,
"learning_rate": 1.011e-05,
"loss": 0.0432,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1838429.0,
"step": 3990
},
{
"epoch": 1.400070003500175,
"grad_norm": 0.08853046596050262,
"learning_rate": 1.001e-05,
"loss": 0.0531,
"step": 4000
},
{
"epoch": 1.400070003500175,
"eval_accuracy": 0.4491471888818699,
"eval_f1": 0.3831962155491568,
"eval_loss": 0.04581384360790253,
"eval_mean_token_accuracy": 0.988113282003788,
"eval_num_tokens": 1843093.0,
"eval_precision": 0.4551209732080744,
"eval_recall": 0.42343303927833725,
"eval_runtime": 243.4032,
"eval_samples_per_second": 6.504,
"eval_steps_per_second": 0.813,
"step": 4000
},
{
"epoch": 1.4035701785089254,
"grad_norm": 1.6858179569244385,
"learning_rate": 9.91e-06,
"loss": 0.0544,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1847649.0,
"step": 4010
},
{
"epoch": 1.4070703535176758,
"grad_norm": 4.908888339996338,
"learning_rate": 9.810000000000001e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1852259.0,
"step": 4020
},
{
"epoch": 1.4105705285264263,
"grad_norm": 0.08344841003417969,
"learning_rate": 9.71e-06,
"loss": 0.0251,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1856863.0,
"step": 4030
},
{
"epoch": 1.4140707035351767,
"grad_norm": 0.43226656317710876,
"learning_rate": 9.610000000000001e-06,
"loss": 0.0363,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1861413.0,
"step": 4040
},
{
"epoch": 1.4175708785439272,
"grad_norm": 5.785868167877197,
"learning_rate": 9.51e-06,
"loss": 0.069,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1865973.0,
"step": 4050
},
{
"epoch": 1.4210710535526776,
"grad_norm": 0.880620002746582,
"learning_rate": 9.410000000000001e-06,
"loss": 0.0436,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1870553.0,
"step": 4060
},
{
"epoch": 1.424571228561428,
"grad_norm": 6.6892218589782715,
"learning_rate": 9.31e-06,
"loss": 0.0627,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1875126.0,
"step": 4070
},
{
"epoch": 1.4280714035701785,
"grad_norm": 0.048246119171381,
"learning_rate": 9.21e-06,
"loss": 0.0428,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1879726.0,
"step": 4080
},
{
"epoch": 1.431571578578929,
"grad_norm": 0.07305438071489334,
"learning_rate": 9.110000000000001e-06,
"loss": 0.0387,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1884286.0,
"step": 4090
},
{
"epoch": 1.4350717535876794,
"grad_norm": 11.415247917175293,
"learning_rate": 9.01e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1888772.0,
"step": 4100
},
{
"epoch": 1.4385719285964298,
"grad_norm": 0.0724155455827713,
"learning_rate": 8.910000000000001e-06,
"loss": 0.0189,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1893335.0,
"step": 4110
},
{
"epoch": 1.4420721036051802,
"grad_norm": 0.11276718974113464,
"learning_rate": 8.81e-06,
"loss": 0.0218,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1898013.0,
"step": 4120
},
{
"epoch": 1.4455722786139307,
"grad_norm": 0.07353251427412033,
"learning_rate": 8.71e-06,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 1902625.0,
"step": 4130
},
{
"epoch": 1.4490724536226811,
"grad_norm": 0.031495820730924606,
"learning_rate": 8.61e-06,
"loss": 0.025,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1907228.0,
"step": 4140
},
{
"epoch": 1.4525726286314316,
"grad_norm": 13.788881301879883,
"learning_rate": 8.51e-06,
"loss": 0.0804,
"mean_token_accuracy": 0.975,
"num_tokens": 1911797.0,
"step": 4150
},
{
"epoch": 1.456072803640182,
"grad_norm": 0.05939871817827225,
"learning_rate": 8.409999999999999e-06,
"loss": 0.0432,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1916403.0,
"step": 4160
},
{
"epoch": 1.4595729786489324,
"grad_norm": 11.028568267822266,
"learning_rate": 8.31e-06,
"loss": 0.0425,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1920937.0,
"step": 4170
},
{
"epoch": 1.4630731536576829,
"grad_norm": 0.048251356929540634,
"learning_rate": 8.210000000000001e-06,
"loss": 0.0328,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1925503.0,
"step": 4180
},
{
"epoch": 1.4665733286664333,
"grad_norm": 0.05744925141334534,
"learning_rate": 8.11e-06,
"loss": 0.0581,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1930161.0,
"step": 4190
},
{
"epoch": 1.4700735036751837,
"grad_norm": 4.82538366317749,
"learning_rate": 8.010000000000001e-06,
"loss": 0.0637,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1934851.0,
"step": 4200
},
{
"epoch": 1.4735736786839342,
"grad_norm": 0.015897316858172417,
"learning_rate": 7.91e-06,
"loss": 0.0553,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1939517.0,
"step": 4210
},
{
"epoch": 1.4770738536926846,
"grad_norm": 0.4943805932998657,
"learning_rate": 7.810000000000001e-06,
"loss": 0.064,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1944221.0,
"step": 4220
},
{
"epoch": 1.480574028701435,
"grad_norm": 0.8401426672935486,
"learning_rate": 7.71e-06,
"loss": 0.004,
"mean_token_accuracy": 1.0,
"num_tokens": 1948737.0,
"step": 4230
},
{
"epoch": 1.4840742037101855,
"grad_norm": 8.93281364440918,
"learning_rate": 7.610000000000001e-06,
"loss": 0.0687,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1953245.0,
"step": 4240
},
{
"epoch": 1.487574378718936,
"grad_norm": 24.4106388092041,
"learning_rate": 7.51e-06,
"loss": 0.0928,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 1957794.0,
"step": 4250
},
{
"epoch": 1.4910745537276864,
"grad_norm": 0.38299062848091125,
"learning_rate": 7.41e-06,
"loss": 0.0376,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1962327.0,
"step": 4260
},
{
"epoch": 1.4945747287364368,
"grad_norm": 0.06252578645944595,
"learning_rate": 7.31e-06,
"loss": 0.0196,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1967028.0,
"step": 4270
},
{
"epoch": 1.4980749037451873,
"grad_norm": 1.5602178573608398,
"learning_rate": 7.2100000000000004e-06,
"loss": 0.023,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 1971740.0,
"step": 4280
},
{
"epoch": 1.5015750787539377,
"grad_norm": 0.031557030975818634,
"learning_rate": 7.11e-06,
"loss": 0.0233,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1976313.0,
"step": 4290
},
{
"epoch": 1.5050752537626881,
"grad_norm": 0.027841169387102127,
"learning_rate": 7.01e-06,
"loss": 0.022,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1980963.0,
"step": 4300
},
{
"epoch": 1.5085754287714386,
"grad_norm": 0.62822425365448,
"learning_rate": 6.91e-06,
"loss": 0.0481,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 1985494.0,
"step": 4310
},
{
"epoch": 1.512075603780189,
"grad_norm": 0.05204153060913086,
"learning_rate": 6.81e-06,
"loss": 0.0123,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 1990229.0,
"step": 4320
},
{
"epoch": 1.5155757787889395,
"grad_norm": 0.030517544597387314,
"learning_rate": 6.710000000000001e-06,
"loss": 0.0037,
"mean_token_accuracy": 1.0,
"num_tokens": 1994888.0,
"step": 4330
},
{
"epoch": 1.51907595379769,
"grad_norm": 0.03292595595121384,
"learning_rate": 6.610000000000001e-06,
"loss": 0.0739,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 1999584.0,
"step": 4340
},
{
"epoch": 1.5225761288064403,
"grad_norm": 0.04422605782747269,
"learning_rate": 6.510000000000001e-06,
"loss": 0.0677,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2004217.0,
"step": 4350
},
{
"epoch": 1.5260763038151908,
"grad_norm": 0.03554658591747284,
"learning_rate": 6.4100000000000005e-06,
"loss": 0.0764,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 2008889.0,
"step": 4360
},
{
"epoch": 1.5295764788239412,
"grad_norm": 3.288350820541382,
"learning_rate": 6.3100000000000006e-06,
"loss": 0.0361,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2013499.0,
"step": 4370
},
{
"epoch": 1.5330766538326916,
"grad_norm": 0.06462374329566956,
"learning_rate": 6.210000000000001e-06,
"loss": 0.0048,
"mean_token_accuracy": 1.0,
"num_tokens": 2018094.0,
"step": 4380
},
{
"epoch": 1.536576828841442,
"grad_norm": 0.1262829601764679,
"learning_rate": 6.110000000000001e-06,
"loss": 0.0165,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2022654.0,
"step": 4390
},
{
"epoch": 1.5400770038501925,
"grad_norm": 1.0548720359802246,
"learning_rate": 6.01e-06,
"loss": 0.0459,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2027308.0,
"step": 4400
},
{
"epoch": 1.543577178858943,
"grad_norm": 10.028485298156738,
"learning_rate": 5.91e-06,
"loss": 0.0683,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 2031892.0,
"step": 4410
},
{
"epoch": 1.5470773538676934,
"grad_norm": 9.958955764770508,
"learning_rate": 5.81e-06,
"loss": 0.0637,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 2036478.0,
"step": 4420
},
{
"epoch": 1.5505775288764438,
"grad_norm": 0.04276181757450104,
"learning_rate": 5.71e-06,
"loss": 0.0782,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2041074.0,
"step": 4430
},
{
"epoch": 1.5540777038851943,
"grad_norm": 0.047367651015520096,
"learning_rate": 5.61e-06,
"loss": 0.0377,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2045668.0,
"step": 4440
},
{
"epoch": 1.5575778788939447,
"grad_norm": 4.863480091094971,
"learning_rate": 5.510000000000001e-06,
"loss": 0.0285,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2050297.0,
"step": 4450
},
{
"epoch": 1.5610780539026952,
"grad_norm": 6.1144537925720215,
"learning_rate": 5.410000000000001e-06,
"loss": 0.0646,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2054920.0,
"step": 4460
},
{
"epoch": 1.5645782289114456,
"grad_norm": 0.05516400188207626,
"learning_rate": 5.31e-06,
"loss": 0.0097,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2059412.0,
"step": 4470
},
{
"epoch": 1.568078403920196,
"grad_norm": 0.06957421451807022,
"learning_rate": 5.21e-06,
"loss": 0.0453,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2063933.0,
"step": 4480
},
{
"epoch": 1.5715785789289465,
"grad_norm": 0.59195876121521,
"learning_rate": 5.11e-06,
"loss": 0.0545,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2068549.0,
"step": 4490
},
{
"epoch": 1.575078753937697,
"grad_norm": 4.435554504394531,
"learning_rate": 5.01e-06,
"loss": 0.0137,
"step": 4500
},
{
"epoch": 1.575078753937697,
"eval_accuracy": 0.44662034112444726,
"eval_f1": 0.3821042468723695,
"eval_loss": 0.04538816958665848,
"eval_mean_token_accuracy": 0.9878607569920896,
"eval_num_tokens": 2073095.0,
"eval_precision": 0.45302304542991134,
"eval_recall": 0.42551771400250105,
"eval_runtime": 243.4481,
"eval_samples_per_second": 6.502,
"eval_steps_per_second": 0.813,
"step": 4500
},
{
"epoch": 1.5785789289464474,
"grad_norm": 0.03762364760041237,
"learning_rate": 4.9100000000000004e-06,
"loss": 0.026,
"mean_token_accuracy": 0.9924999982118606,
"num_tokens": 2077816.0,
"step": 4510
},
{
"epoch": 1.5820791039551978,
"grad_norm": 0.038492508232593536,
"learning_rate": 4.81e-06,
"loss": 0.037,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2082369.0,
"step": 4520
},
{
"epoch": 1.5855792789639482,
"grad_norm": 0.04851048067212105,
"learning_rate": 4.710000000000001e-06,
"loss": 0.1037,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 2086980.0,
"step": 4530
},
{
"epoch": 1.5890794539726987,
"grad_norm": 0.06226026266813278,
"learning_rate": 4.610000000000001e-06,
"loss": 0.0316,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2091582.0,
"step": 4540
},
{
"epoch": 1.5925796289814491,
"grad_norm": 0.012553819455206394,
"learning_rate": 4.51e-06,
"loss": 0.0426,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2096137.0,
"step": 4550
},
{
"epoch": 1.5960798039901996,
"grad_norm": 0.12090373784303665,
"learning_rate": 4.41e-06,
"loss": 0.0035,
"mean_token_accuracy": 1.0,
"num_tokens": 2100974.0,
"step": 4560
},
{
"epoch": 1.59957997899895,
"grad_norm": 0.024477414786815643,
"learning_rate": 4.31e-06,
"loss": 0.0199,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2105553.0,
"step": 4570
},
{
"epoch": 1.6030801540077004,
"grad_norm": 9.273329734802246,
"learning_rate": 4.21e-06,
"loss": 0.0197,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2110182.0,
"step": 4580
},
{
"epoch": 1.6065803290164509,
"grad_norm": 6.43629264831543,
"learning_rate": 4.11e-06,
"loss": 0.0163,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2114818.0,
"step": 4590
},
{
"epoch": 1.6100805040252013,
"grad_norm": 0.047764312475919724,
"learning_rate": 4.01e-06,
"loss": 0.0112,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2119456.0,
"step": 4600
},
{
"epoch": 1.6135806790339517,
"grad_norm": 3.2197811603546143,
"learning_rate": 3.910000000000001e-06,
"loss": 0.0357,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2124191.0,
"step": 4610
},
{
"epoch": 1.6170808540427022,
"grad_norm": 0.04559561237692833,
"learning_rate": 3.8100000000000004e-06,
"loss": 0.0896,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2128813.0,
"step": 4620
},
{
"epoch": 1.6205810290514526,
"grad_norm": 0.03245115652680397,
"learning_rate": 3.7100000000000005e-06,
"loss": 0.0167,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2133363.0,
"step": 4630
},
{
"epoch": 1.624081204060203,
"grad_norm": 0.0637376606464386,
"learning_rate": 3.61e-06,
"loss": 0.0161,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2137919.0,
"step": 4640
},
{
"epoch": 1.6275813790689533,
"grad_norm": 0.10170795023441315,
"learning_rate": 3.5100000000000003e-06,
"loss": 0.0619,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2142475.0,
"step": 4650
},
{
"epoch": 1.6310815540777037,
"grad_norm": 0.11928985267877579,
"learning_rate": 3.41e-06,
"loss": 0.0317,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2147153.0,
"step": 4660
},
{
"epoch": 1.6345817290864542,
"grad_norm": 0.31450000405311584,
"learning_rate": 3.31e-06,
"loss": 0.0262,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2151803.0,
"step": 4670
},
{
"epoch": 1.6380819040952046,
"grad_norm": 0.025399642065167427,
"learning_rate": 3.2099999999999998e-06,
"loss": 0.029,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2156302.0,
"step": 4680
},
{
"epoch": 1.641582079103955,
"grad_norm": 0.07148288935422897,
"learning_rate": 3.11e-06,
"loss": 0.058,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2160964.0,
"step": 4690
},
{
"epoch": 1.6450822541127055,
"grad_norm": 0.043584585189819336,
"learning_rate": 3.01e-06,
"loss": 0.0118,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2165614.0,
"step": 4700
},
{
"epoch": 1.648582429121456,
"grad_norm": 0.021303439512848854,
"learning_rate": 2.91e-06,
"loss": 0.0593,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2170098.0,
"step": 4710
},
{
"epoch": 1.6520826041302064,
"grad_norm": 3.4671308994293213,
"learning_rate": 2.81e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2174607.0,
"step": 4720
},
{
"epoch": 1.6555827791389568,
"grad_norm": 0.03900500759482384,
"learning_rate": 2.71e-06,
"loss": 0.0029,
"mean_token_accuracy": 1.0,
"num_tokens": 2179286.0,
"step": 4730
},
{
"epoch": 1.6590829541477072,
"grad_norm": 0.02918989770114422,
"learning_rate": 2.6100000000000004e-06,
"loss": 0.0892,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 2183836.0,
"step": 4740
},
{
"epoch": 1.6625831291564577,
"grad_norm": 0.029129987582564354,
"learning_rate": 2.51e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 2188430.0,
"step": 4750
},
{
"epoch": 1.6660833041652081,
"grad_norm": 0.08811552822589874,
"learning_rate": 2.4100000000000002e-06,
"loss": 0.007,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2193026.0,
"step": 4760
},
{
"epoch": 1.6695834791739586,
"grad_norm": 3.6819851398468018,
"learning_rate": 2.31e-06,
"loss": 0.016,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2197606.0,
"step": 4770
},
{
"epoch": 1.673083654182709,
"grad_norm": 0.0475095734000206,
"learning_rate": 2.2100000000000004e-06,
"loss": 0.0318,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2202131.0,
"step": 4780
},
{
"epoch": 1.6765838291914594,
"grad_norm": 5.221133708953857,
"learning_rate": 2.11e-06,
"loss": 0.0138,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2206887.0,
"step": 4790
},
{
"epoch": 1.6800840042002099,
"grad_norm": 8.505605697631836,
"learning_rate": 2.0100000000000002e-06,
"loss": 0.0216,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2211562.0,
"step": 4800
},
{
"epoch": 1.6835841792089603,
"grad_norm": 0.05636419355869293,
"learning_rate": 1.91e-06,
"loss": 0.0568,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2216043.0,
"step": 4810
},
{
"epoch": 1.6870843542177107,
"grad_norm": 0.06148410961031914,
"learning_rate": 1.8100000000000002e-06,
"loss": 0.0313,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2220686.0,
"step": 4820
},
{
"epoch": 1.6905845292264612,
"grad_norm": 0.1644497960805893,
"learning_rate": 1.7100000000000001e-06,
"loss": 0.0611,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 2225264.0,
"step": 4830
},
{
"epoch": 1.6940847042352116,
"grad_norm": 8.40280532836914,
"learning_rate": 1.61e-06,
"loss": 0.0134,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2229777.0,
"step": 4840
},
{
"epoch": 1.697584879243962,
"grad_norm": 0.4285930097103119,
"learning_rate": 1.5100000000000002e-06,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 2234477.0,
"step": 4850
},
{
"epoch": 1.7010850542527125,
"grad_norm": 0.05217473581433296,
"learning_rate": 1.41e-06,
"loss": 0.0159,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2239087.0,
"step": 4860
},
{
"epoch": 1.704585229261463,
"grad_norm": 17.20269012451172,
"learning_rate": 1.3100000000000002e-06,
"loss": 0.0332,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2243751.0,
"step": 4870
},
{
"epoch": 1.7080854042702134,
"grad_norm": 0.019595852121710777,
"learning_rate": 1.21e-06,
"loss": 0.0438,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2248371.0,
"step": 4880
},
{
"epoch": 1.7115855792789638,
"grad_norm": 0.021653831005096436,
"learning_rate": 1.1100000000000002e-06,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 2252963.0,
"step": 4890
},
{
"epoch": 1.7150857542877143,
"grad_norm": 0.06774129718542099,
"learning_rate": 1.01e-06,
"loss": 0.0086,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2257627.0,
"step": 4900
},
{
"epoch": 1.7185859292964647,
"grad_norm": 0.17008963227272034,
"learning_rate": 9.100000000000001e-07,
"loss": 0.0511,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2262204.0,
"step": 4910
},
{
"epoch": 1.7220861043052151,
"grad_norm": 0.05181132256984711,
"learning_rate": 8.1e-07,
"loss": 0.102,
"mean_token_accuracy": 0.9699999928474426,
"num_tokens": 2266782.0,
"step": 4920
},
{
"epoch": 1.7255862793139656,
"grad_norm": 0.019397318363189697,
"learning_rate": 7.100000000000001e-07,
"loss": 0.0653,
"mean_token_accuracy": 0.9799999952316284,
"num_tokens": 2271407.0,
"step": 4930
},
{
"epoch": 1.729086454322716,
"grad_norm": 12.307579040527344,
"learning_rate": 6.100000000000001e-07,
"loss": 0.0463,
"mean_token_accuracy": 0.9899999976158143,
"num_tokens": 2276067.0,
"step": 4940
},
{
"epoch": 1.7325866293314665,
"grad_norm": 0.9262644648551941,
"learning_rate": 5.100000000000001e-07,
"loss": 0.1174,
"mean_token_accuracy": 0.9749999940395355,
"num_tokens": 2280720.0,
"step": 4950
},
{
"epoch": 1.736086804340217,
"grad_norm": 10.878531455993652,
"learning_rate": 4.1000000000000004e-07,
"loss": 0.0366,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2285258.0,
"step": 4960
},
{
"epoch": 1.7395869793489673,
"grad_norm": 0.015575112774968147,
"learning_rate": 3.1e-07,
"loss": 0.013,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2289807.0,
"step": 4970
},
{
"epoch": 1.7430871543577178,
"grad_norm": 6.684645652770996,
"learning_rate": 2.1e-07,
"loss": 0.048,
"mean_token_accuracy": 0.9849999964237213,
"num_tokens": 2294427.0,
"step": 4980
},
{
"epoch": 1.7465873293664682,
"grad_norm": 0.2887522280216217,
"learning_rate": 1.1e-07,
"loss": 0.0268,
"mean_token_accuracy": 0.9949999988079071,
"num_tokens": 2299106.0,
"step": 4990
},
{
"epoch": 1.7500875043752186,
"grad_norm": 3.4086289405822754,
"learning_rate": 1e-08,
"loss": 0.0635,
"step": 5000
},
{
"epoch": 1.7500875043752186,
"eval_accuracy": 0.4516740366392925,
"eval_f1": 0.3847962989063844,
"eval_loss": 0.04607350006699562,
"eval_mean_token_accuracy": 0.9878607566910561,
"eval_num_tokens": 2303714.0,
"eval_precision": 0.4551334955315969,
"eval_recall": 0.42526555766959756,
"eval_runtime": 245.308,
"eval_samples_per_second": 6.453,
"eval_steps_per_second": 0.807,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.477983256915968e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}