POLAR-g-eCe-train-args / trainer_state.json
CocoRoF's picture
Upload checkpoint-1491 contents
2b3d88b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994972347913524,
"eval_steps": 750,
"global_step": 1491,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033517680576504107,
"grad_norm": 14.694869995117188,
"learning_rate": 6.666666666666667e-06,
"loss": 53.6406,
"mean_token_accuracy": 0.5338318642228842,
"step": 5
},
{
"epoch": 0.006703536115300821,
"grad_norm": 14.033230781555176,
"learning_rate": 1.3333333333333333e-05,
"loss": 52.3838,
"mean_token_accuracy": 0.5248840853571892,
"step": 10
},
{
"epoch": 0.010055304172951232,
"grad_norm": 6.804769039154053,
"learning_rate": 2e-05,
"loss": 47.9105,
"mean_token_accuracy": 0.5399681400507689,
"step": 15
},
{
"epoch": 0.013407072230601643,
"grad_norm": 7.750083923339844,
"learning_rate": 2.6666666666666667e-05,
"loss": 41.8861,
"mean_token_accuracy": 0.55653104968369,
"step": 20
},
{
"epoch": 0.01675884028825205,
"grad_norm": 6.184543132781982,
"learning_rate": 3.3333333333333335e-05,
"loss": 37.33,
"mean_token_accuracy": 0.5655230440199375,
"step": 25
},
{
"epoch": 0.020110608345902465,
"grad_norm": 4.537179946899414,
"learning_rate": 4e-05,
"loss": 32.7503,
"mean_token_accuracy": 0.587661711126566,
"step": 30
},
{
"epoch": 0.023462376403552875,
"grad_norm": 3.6645753383636475,
"learning_rate": 4.666666666666667e-05,
"loss": 29.1892,
"mean_token_accuracy": 0.6075583577156067,
"step": 35
},
{
"epoch": 0.026814144461203285,
"grad_norm": 3.7526533603668213,
"learning_rate": 5.333333333333333e-05,
"loss": 26.3524,
"mean_token_accuracy": 0.6198613092303276,
"step": 40
},
{
"epoch": 0.030165912518853696,
"grad_norm": 3.0561397075653076,
"learning_rate": 6e-05,
"loss": 24.1513,
"mean_token_accuracy": 0.6353930421173573,
"step": 45
},
{
"epoch": 0.0335176805765041,
"grad_norm": 2.857618808746338,
"learning_rate": 6.666666666666667e-05,
"loss": 23.5029,
"mean_token_accuracy": 0.6437373287975788,
"step": 50
},
{
"epoch": 0.03686944863415452,
"grad_norm": 2.7901978492736816,
"learning_rate": 7.333333333333333e-05,
"loss": 22.9387,
"mean_token_accuracy": 0.646886795759201,
"step": 55
},
{
"epoch": 0.04022121669180493,
"grad_norm": 2.8266501426696777,
"learning_rate": 8e-05,
"loss": 22.0359,
"mean_token_accuracy": 0.6525138475000858,
"step": 60
},
{
"epoch": 0.04357298474945534,
"grad_norm": 2.5010733604431152,
"learning_rate": 8.666666666666667e-05,
"loss": 21.5158,
"mean_token_accuracy": 0.6548139773309231,
"step": 65
},
{
"epoch": 0.04692475280710575,
"grad_norm": 2.5834386348724365,
"learning_rate": 9.333333333333334e-05,
"loss": 21.5409,
"mean_token_accuracy": 0.6478891499340534,
"step": 70
},
{
"epoch": 0.05027652086475616,
"grad_norm": 2.6927576065063477,
"learning_rate": 0.0001,
"loss": 20.1017,
"mean_token_accuracy": 0.6757474772632122,
"step": 75
},
{
"epoch": 0.05362828892240657,
"grad_norm": 2.0276572704315186,
"learning_rate": 9.964689265536724e-05,
"loss": 19.9912,
"mean_token_accuracy": 0.6763999305665493,
"step": 80
},
{
"epoch": 0.05698005698005698,
"grad_norm": 2.4628567695617676,
"learning_rate": 9.929378531073446e-05,
"loss": 19.9089,
"mean_token_accuracy": 0.672279854118824,
"step": 85
},
{
"epoch": 0.06033182503770739,
"grad_norm": 2.258838415145874,
"learning_rate": 9.89406779661017e-05,
"loss": 19.7132,
"mean_token_accuracy": 0.6713059276342392,
"step": 90
},
{
"epoch": 0.0636835930953578,
"grad_norm": 2.447565793991089,
"learning_rate": 9.858757062146892e-05,
"loss": 18.7631,
"mean_token_accuracy": 0.6825208596885204,
"step": 95
},
{
"epoch": 0.0670353611530082,
"grad_norm": 2.1105902194976807,
"learning_rate": 9.823446327683616e-05,
"loss": 19.4631,
"mean_token_accuracy": 0.6674435302615166,
"step": 100
},
{
"epoch": 0.07038712921065862,
"grad_norm": 2.309248447418213,
"learning_rate": 9.78813559322034e-05,
"loss": 19.0249,
"mean_token_accuracy": 0.6734571024775505,
"step": 105
},
{
"epoch": 0.07373889726830904,
"grad_norm": 2.101681709289551,
"learning_rate": 9.752824858757063e-05,
"loss": 18.593,
"mean_token_accuracy": 0.6875097192823887,
"step": 110
},
{
"epoch": 0.07709066532595944,
"grad_norm": 2.157726526260376,
"learning_rate": 9.717514124293787e-05,
"loss": 18.5973,
"mean_token_accuracy": 0.6829216606914997,
"step": 115
},
{
"epoch": 0.08044243338360986,
"grad_norm": 2.0711209774017334,
"learning_rate": 9.682203389830509e-05,
"loss": 19.1541,
"mean_token_accuracy": 0.6785640828311443,
"step": 120
},
{
"epoch": 0.08379420144126026,
"grad_norm": 2.015594959259033,
"learning_rate": 9.646892655367233e-05,
"loss": 18.9493,
"mean_token_accuracy": 0.6861244946718216,
"step": 125
},
{
"epoch": 0.08714596949891068,
"grad_norm": 2.1295998096466064,
"learning_rate": 9.611581920903955e-05,
"loss": 18.5125,
"mean_token_accuracy": 0.6793887488543987,
"step": 130
},
{
"epoch": 0.09049773755656108,
"grad_norm": 2.2496395111083984,
"learning_rate": 9.576271186440679e-05,
"loss": 18.4019,
"mean_token_accuracy": 0.6890006221830844,
"step": 135
},
{
"epoch": 0.0938495056142115,
"grad_norm": 2.1168577671051025,
"learning_rate": 9.540960451977402e-05,
"loss": 18.7305,
"mean_token_accuracy": 0.6841622419655323,
"step": 140
},
{
"epoch": 0.0972012736718619,
"grad_norm": 1.8554915189743042,
"learning_rate": 9.505649717514125e-05,
"loss": 18.6606,
"mean_token_accuracy": 0.6859239712357521,
"step": 145
},
{
"epoch": 0.10055304172951232,
"grad_norm": 1.9698066711425781,
"learning_rate": 9.470338983050848e-05,
"loss": 19.1065,
"mean_token_accuracy": 0.6759489566087723,
"step": 150
},
{
"epoch": 0.10390480978716272,
"grad_norm": 2.2483623027801514,
"learning_rate": 9.43502824858757e-05,
"loss": 18.8041,
"mean_token_accuracy": 0.68142851293087,
"step": 155
},
{
"epoch": 0.10725657784481314,
"grad_norm": 1.8570690155029297,
"learning_rate": 9.399717514124294e-05,
"loss": 18.8862,
"mean_token_accuracy": 0.6791303649544715,
"step": 160
},
{
"epoch": 0.11060834590246355,
"grad_norm": 2.143021583557129,
"learning_rate": 9.364406779661016e-05,
"loss": 18.7605,
"mean_token_accuracy": 0.681893227249384,
"step": 165
},
{
"epoch": 0.11396011396011396,
"grad_norm": 1.8951307535171509,
"learning_rate": 9.32909604519774e-05,
"loss": 18.3005,
"mean_token_accuracy": 0.6897541806101799,
"step": 170
},
{
"epoch": 0.11731188201776437,
"grad_norm": 1.971745252609253,
"learning_rate": 9.293785310734464e-05,
"loss": 18.8995,
"mean_token_accuracy": 0.6820204116404056,
"step": 175
},
{
"epoch": 0.12066365007541478,
"grad_norm": 1.910328984260559,
"learning_rate": 9.258474576271187e-05,
"loss": 18.8808,
"mean_token_accuracy": 0.6812884464859963,
"step": 180
},
{
"epoch": 0.12401541813306519,
"grad_norm": 1.730974555015564,
"learning_rate": 9.223163841807911e-05,
"loss": 18.0871,
"mean_token_accuracy": 0.6907590143382549,
"step": 185
},
{
"epoch": 0.1273671861907156,
"grad_norm": 2.125452995300293,
"learning_rate": 9.187853107344633e-05,
"loss": 18.1569,
"mean_token_accuracy": 0.689236406236887,
"step": 190
},
{
"epoch": 0.13071895424836602,
"grad_norm": 2.0234949588775635,
"learning_rate": 9.152542372881357e-05,
"loss": 18.3342,
"mean_token_accuracy": 0.6902932204306126,
"step": 195
},
{
"epoch": 0.1340707223060164,
"grad_norm": 1.9802364110946655,
"learning_rate": 9.11723163841808e-05,
"loss": 18.7942,
"mean_token_accuracy": 0.6788501650094986,
"step": 200
},
{
"epoch": 0.13742249036366683,
"grad_norm": 1.8897534608840942,
"learning_rate": 9.081920903954803e-05,
"loss": 18.4679,
"mean_token_accuracy": 0.6900524459779263,
"step": 205
},
{
"epoch": 0.14077425842131724,
"grad_norm": 1.9040635824203491,
"learning_rate": 9.046610169491526e-05,
"loss": 18.0058,
"mean_token_accuracy": 0.690093420445919,
"step": 210
},
{
"epoch": 0.14412602647896766,
"grad_norm": 2.0558955669403076,
"learning_rate": 9.011299435028249e-05,
"loss": 17.5489,
"mean_token_accuracy": 0.7006829999387264,
"step": 215
},
{
"epoch": 0.14747779453661808,
"grad_norm": 1.7952055931091309,
"learning_rate": 8.975988700564972e-05,
"loss": 18.2907,
"mean_token_accuracy": 0.6876891441643238,
"step": 220
},
{
"epoch": 0.15082956259426847,
"grad_norm": 1.8588192462921143,
"learning_rate": 8.940677966101694e-05,
"loss": 18.4005,
"mean_token_accuracy": 0.6897859051823616,
"step": 225
},
{
"epoch": 0.15418133065191889,
"grad_norm": 1.9269477128982544,
"learning_rate": 8.905367231638418e-05,
"loss": 18.2096,
"mean_token_accuracy": 0.6909494370222091,
"step": 230
},
{
"epoch": 0.1575330987095693,
"grad_norm": 1.8693301677703857,
"learning_rate": 8.870056497175142e-05,
"loss": 18.394,
"mean_token_accuracy": 0.6836515329778194,
"step": 235
},
{
"epoch": 0.16088486676721972,
"grad_norm": 1.787061333656311,
"learning_rate": 8.834745762711864e-05,
"loss": 18.1503,
"mean_token_accuracy": 0.6907145738601684,
"step": 240
},
{
"epoch": 0.1642366348248701,
"grad_norm": 1.8895225524902344,
"learning_rate": 8.799435028248588e-05,
"loss": 18.3026,
"mean_token_accuracy": 0.6878940775990486,
"step": 245
},
{
"epoch": 0.16758840288252053,
"grad_norm": 1.835693120956421,
"learning_rate": 8.764124293785311e-05,
"loss": 17.9347,
"mean_token_accuracy": 0.6917316012084485,
"step": 250
},
{
"epoch": 0.17094017094017094,
"grad_norm": 1.7408661842346191,
"learning_rate": 8.728813559322035e-05,
"loss": 18.0051,
"mean_token_accuracy": 0.689583633840084,
"step": 255
},
{
"epoch": 0.17429193899782136,
"grad_norm": 1.9096996784210205,
"learning_rate": 8.693502824858759e-05,
"loss": 17.6064,
"mean_token_accuracy": 0.6965925216674804,
"step": 260
},
{
"epoch": 0.17764370705547175,
"grad_norm": 1.9822146892547607,
"learning_rate": 8.658192090395481e-05,
"loss": 17.6301,
"mean_token_accuracy": 0.7005406267940998,
"step": 265
},
{
"epoch": 0.18099547511312217,
"grad_norm": 1.8383901119232178,
"learning_rate": 8.622881355932204e-05,
"loss": 17.9114,
"mean_token_accuracy": 0.6876685306429863,
"step": 270
},
{
"epoch": 0.18434724317077258,
"grad_norm": 1.7920355796813965,
"learning_rate": 8.587570621468927e-05,
"loss": 18.1271,
"mean_token_accuracy": 0.689356567710638,
"step": 275
},
{
"epoch": 0.187699011228423,
"grad_norm": 1.6455663442611694,
"learning_rate": 8.55225988700565e-05,
"loss": 17.787,
"mean_token_accuracy": 0.6919776491820813,
"step": 280
},
{
"epoch": 0.1910507792860734,
"grad_norm": 1.9442647695541382,
"learning_rate": 8.516949152542373e-05,
"loss": 17.6019,
"mean_token_accuracy": 0.6980393722653389,
"step": 285
},
{
"epoch": 0.1944025473437238,
"grad_norm": 2.294377565383911,
"learning_rate": 8.481638418079096e-05,
"loss": 17.8778,
"mean_token_accuracy": 0.6954585202038288,
"step": 290
},
{
"epoch": 0.19775431540137423,
"grad_norm": 1.8009259700775146,
"learning_rate": 8.44632768361582e-05,
"loss": 17.5257,
"mean_token_accuracy": 0.6998075112700463,
"step": 295
},
{
"epoch": 0.20110608345902464,
"grad_norm": 2.015516757965088,
"learning_rate": 8.411016949152542e-05,
"loss": 17.7554,
"mean_token_accuracy": 0.6968327619135379,
"step": 300
},
{
"epoch": 0.20445785151667506,
"grad_norm": 1.5640082359313965,
"learning_rate": 8.375706214689266e-05,
"loss": 17.3438,
"mean_token_accuracy": 0.69996168166399,
"step": 305
},
{
"epoch": 0.20780961957432545,
"grad_norm": 1.9527899026870728,
"learning_rate": 8.340395480225988e-05,
"loss": 17.6883,
"mean_token_accuracy": 0.6988407798111439,
"step": 310
},
{
"epoch": 0.21116138763197587,
"grad_norm": 1.8222606182098389,
"learning_rate": 8.305084745762712e-05,
"loss": 17.0646,
"mean_token_accuracy": 0.7061679445207119,
"step": 315
},
{
"epoch": 0.21451315568962628,
"grad_norm": 1.8560868501663208,
"learning_rate": 8.269774011299435e-05,
"loss": 17.8875,
"mean_token_accuracy": 0.6941629223525524,
"step": 320
},
{
"epoch": 0.2178649237472767,
"grad_norm": 1.7588037252426147,
"learning_rate": 8.234463276836159e-05,
"loss": 17.6412,
"mean_token_accuracy": 0.6954927705228329,
"step": 325
},
{
"epoch": 0.2212166918049271,
"grad_norm": 1.738242268562317,
"learning_rate": 8.199152542372883e-05,
"loss": 17.8251,
"mean_token_accuracy": 0.6898994512856007,
"step": 330
},
{
"epoch": 0.2245684598625775,
"grad_norm": 1.8485089540481567,
"learning_rate": 8.163841807909605e-05,
"loss": 17.3078,
"mean_token_accuracy": 0.7000270999968052,
"step": 335
},
{
"epoch": 0.22792022792022792,
"grad_norm": 1.8579105138778687,
"learning_rate": 8.128531073446328e-05,
"loss": 17.3078,
"mean_token_accuracy": 0.6995702408254146,
"step": 340
},
{
"epoch": 0.23127199597787834,
"grad_norm": 1.7994352579116821,
"learning_rate": 8.093220338983051e-05,
"loss": 17.7557,
"mean_token_accuracy": 0.6928035505115986,
"step": 345
},
{
"epoch": 0.23462376403552873,
"grad_norm": 1.9240634441375732,
"learning_rate": 8.057909604519774e-05,
"loss": 17.4329,
"mean_token_accuracy": 0.6960855178534985,
"step": 350
},
{
"epoch": 0.23797553209317915,
"grad_norm": 1.6718952655792236,
"learning_rate": 8.022598870056498e-05,
"loss": 17.5951,
"mean_token_accuracy": 0.6947735913097859,
"step": 355
},
{
"epoch": 0.24132730015082957,
"grad_norm": 1.6835826635360718,
"learning_rate": 7.98728813559322e-05,
"loss": 18.1085,
"mean_token_accuracy": 0.6882089108228684,
"step": 360
},
{
"epoch": 0.24467906820847998,
"grad_norm": 1.7387073040008545,
"learning_rate": 7.951977401129944e-05,
"loss": 17.799,
"mean_token_accuracy": 0.6932998545467853,
"step": 365
},
{
"epoch": 0.24803083626613037,
"grad_norm": 2.0071725845336914,
"learning_rate": 7.916666666666666e-05,
"loss": 17.4076,
"mean_token_accuracy": 0.6961173862218857,
"step": 370
},
{
"epoch": 0.2513826043237808,
"grad_norm": 2.326915740966797,
"learning_rate": 7.88135593220339e-05,
"loss": 17.3121,
"mean_token_accuracy": 0.7005321949720382,
"step": 375
},
{
"epoch": 0.2547343723814312,
"grad_norm": 2.1876060962677,
"learning_rate": 7.846045197740113e-05,
"loss": 17.9069,
"mean_token_accuracy": 0.6906426399946213,
"step": 380
},
{
"epoch": 0.2580861404390816,
"grad_norm": 1.849671483039856,
"learning_rate": 7.810734463276837e-05,
"loss": 17.483,
"mean_token_accuracy": 0.7000573620200157,
"step": 385
},
{
"epoch": 0.26143790849673204,
"grad_norm": 1.6676862239837646,
"learning_rate": 7.775423728813561e-05,
"loss": 16.8936,
"mean_token_accuracy": 0.7045633904635906,
"step": 390
},
{
"epoch": 0.26478967655438246,
"grad_norm": 1.6702505350112915,
"learning_rate": 7.740112994350283e-05,
"loss": 17.904,
"mean_token_accuracy": 0.6874841086566448,
"step": 395
},
{
"epoch": 0.2681414446120328,
"grad_norm": 1.7280704975128174,
"learning_rate": 7.704802259887007e-05,
"loss": 17.4515,
"mean_token_accuracy": 0.7018027983605861,
"step": 400
},
{
"epoch": 0.27149321266968324,
"grad_norm": 1.8801991939544678,
"learning_rate": 7.669491525423729e-05,
"loss": 17.43,
"mean_token_accuracy": 0.7009049601852894,
"step": 405
},
{
"epoch": 0.27484498072733365,
"grad_norm": 1.9758073091506958,
"learning_rate": 7.634180790960453e-05,
"loss": 17.5984,
"mean_token_accuracy": 0.6948069363832474,
"step": 410
},
{
"epoch": 0.27819674878498407,
"grad_norm": 1.5747147798538208,
"learning_rate": 7.598870056497176e-05,
"loss": 18.3079,
"mean_token_accuracy": 0.6853139907121658,
"step": 415
},
{
"epoch": 0.2815485168426345,
"grad_norm": 1.6292234659194946,
"learning_rate": 7.563559322033898e-05,
"loss": 17.4527,
"mean_token_accuracy": 0.697540608048439,
"step": 420
},
{
"epoch": 0.2849002849002849,
"grad_norm": 1.6185086965560913,
"learning_rate": 7.528248587570622e-05,
"loss": 17.4193,
"mean_token_accuracy": 0.7012022204697133,
"step": 425
},
{
"epoch": 0.2882520529579353,
"grad_norm": 1.8361762762069702,
"learning_rate": 7.492937853107344e-05,
"loss": 17.4544,
"mean_token_accuracy": 0.698820473998785,
"step": 430
},
{
"epoch": 0.29160382101558574,
"grad_norm": 1.7740592956542969,
"learning_rate": 7.457627118644068e-05,
"loss": 18.0507,
"mean_token_accuracy": 0.6881603226065636,
"step": 435
},
{
"epoch": 0.29495558907323616,
"grad_norm": 1.8252911567687988,
"learning_rate": 7.42231638418079e-05,
"loss": 17.155,
"mean_token_accuracy": 0.7065504610538482,
"step": 440
},
{
"epoch": 0.2983073571308865,
"grad_norm": 1.8424382209777832,
"learning_rate": 7.387005649717514e-05,
"loss": 17.3055,
"mean_token_accuracy": 0.6978819817304611,
"step": 445
},
{
"epoch": 0.30165912518853694,
"grad_norm": 1.7494243383407593,
"learning_rate": 7.351694915254238e-05,
"loss": 16.8365,
"mean_token_accuracy": 0.7099504336714745,
"step": 450
},
{
"epoch": 0.30501089324618735,
"grad_norm": 1.936540961265564,
"learning_rate": 7.316384180790961e-05,
"loss": 18.2753,
"mean_token_accuracy": 0.6913827233016491,
"step": 455
},
{
"epoch": 0.30836266130383777,
"grad_norm": 1.810272216796875,
"learning_rate": 7.281073446327685e-05,
"loss": 17.0536,
"mean_token_accuracy": 0.6986232809722424,
"step": 460
},
{
"epoch": 0.3117144293614882,
"grad_norm": 1.6832094192504883,
"learning_rate": 7.245762711864407e-05,
"loss": 17.2231,
"mean_token_accuracy": 0.702030860632658,
"step": 465
},
{
"epoch": 0.3150661974191386,
"grad_norm": 1.8872151374816895,
"learning_rate": 7.21045197740113e-05,
"loss": 17.5502,
"mean_token_accuracy": 0.6932449921965599,
"step": 470
},
{
"epoch": 0.318417965476789,
"grad_norm": 1.788021445274353,
"learning_rate": 7.175141242937854e-05,
"loss": 16.8596,
"mean_token_accuracy": 0.7096694305539131,
"step": 475
},
{
"epoch": 0.32176973353443944,
"grad_norm": 1.8025559186935425,
"learning_rate": 7.139830508474577e-05,
"loss": 16.662,
"mean_token_accuracy": 0.7063573338091373,
"step": 480
},
{
"epoch": 0.3251215015920898,
"grad_norm": 2.274674654006958,
"learning_rate": 7.1045197740113e-05,
"loss": 17.5965,
"mean_token_accuracy": 0.6934389650821686,
"step": 485
},
{
"epoch": 0.3284732696497402,
"grad_norm": 1.6426053047180176,
"learning_rate": 7.069209039548022e-05,
"loss": 17.0914,
"mean_token_accuracy": 0.7049042917788029,
"step": 490
},
{
"epoch": 0.33182503770739064,
"grad_norm": 1.6252586841583252,
"learning_rate": 7.033898305084746e-05,
"loss": 17.6078,
"mean_token_accuracy": 0.6924709647893905,
"step": 495
},
{
"epoch": 0.33517680576504105,
"grad_norm": 1.7185930013656616,
"learning_rate": 6.998587570621468e-05,
"loss": 17.314,
"mean_token_accuracy": 0.7039985358715057,
"step": 500
},
{
"epoch": 0.33852857382269147,
"grad_norm": 1.7891852855682373,
"learning_rate": 6.963276836158192e-05,
"loss": 17.2188,
"mean_token_accuracy": 0.6977060906589031,
"step": 505
},
{
"epoch": 0.3418803418803419,
"grad_norm": 1.9103929996490479,
"learning_rate": 6.927966101694916e-05,
"loss": 17.4467,
"mean_token_accuracy": 0.6982413403689861,
"step": 510
},
{
"epoch": 0.3452321099379923,
"grad_norm": 1.8996375799179077,
"learning_rate": 6.892655367231638e-05,
"loss": 16.9608,
"mean_token_accuracy": 0.7054095402359962,
"step": 515
},
{
"epoch": 0.3485838779956427,
"grad_norm": 2.0335419178009033,
"learning_rate": 6.857344632768362e-05,
"loss": 17.3361,
"mean_token_accuracy": 0.7016568422317505,
"step": 520
},
{
"epoch": 0.35193564605329314,
"grad_norm": 1.9008755683898926,
"learning_rate": 6.822033898305085e-05,
"loss": 16.9694,
"mean_token_accuracy": 0.7059390284121037,
"step": 525
},
{
"epoch": 0.3552874141109435,
"grad_norm": 1.8340988159179688,
"learning_rate": 6.786723163841809e-05,
"loss": 17.3528,
"mean_token_accuracy": 0.7033507622778415,
"step": 530
},
{
"epoch": 0.3586391821685939,
"grad_norm": 1.6903594732284546,
"learning_rate": 6.751412429378532e-05,
"loss": 17.3021,
"mean_token_accuracy": 0.7001501135528088,
"step": 535
},
{
"epoch": 0.36199095022624433,
"grad_norm": 1.8101950883865356,
"learning_rate": 6.716101694915255e-05,
"loss": 17.938,
"mean_token_accuracy": 0.6908830553293228,
"step": 540
},
{
"epoch": 0.36534271828389475,
"grad_norm": 1.6470075845718384,
"learning_rate": 6.680790960451978e-05,
"loss": 17.6612,
"mean_token_accuracy": 0.6923478744924069,
"step": 545
},
{
"epoch": 0.36869448634154517,
"grad_norm": 2.1860337257385254,
"learning_rate": 6.6454802259887e-05,
"loss": 17.5684,
"mean_token_accuracy": 0.6983748801052571,
"step": 550
},
{
"epoch": 0.3720462543991956,
"grad_norm": 1.717653512954712,
"learning_rate": 6.610169491525424e-05,
"loss": 17.1166,
"mean_token_accuracy": 0.7025655619800091,
"step": 555
},
{
"epoch": 0.375398022456846,
"grad_norm": 1.9525723457336426,
"learning_rate": 6.574858757062147e-05,
"loss": 17.2908,
"mean_token_accuracy": 0.6997996769845486,
"step": 560
},
{
"epoch": 0.3787497905144964,
"grad_norm": 1.6053602695465088,
"learning_rate": 6.53954802259887e-05,
"loss": 17.3894,
"mean_token_accuracy": 0.698741364479065,
"step": 565
},
{
"epoch": 0.3821015585721468,
"grad_norm": 1.7356934547424316,
"learning_rate": 6.504237288135594e-05,
"loss": 17.1546,
"mean_token_accuracy": 0.7013543620705605,
"step": 570
},
{
"epoch": 0.3854533266297972,
"grad_norm": 1.7188559770584106,
"learning_rate": 6.468926553672316e-05,
"loss": 17.7637,
"mean_token_accuracy": 0.6936320647597313,
"step": 575
},
{
"epoch": 0.3888050946874476,
"grad_norm": 1.8413478136062622,
"learning_rate": 6.43361581920904e-05,
"loss": 17.8498,
"mean_token_accuracy": 0.695782047510147,
"step": 580
},
{
"epoch": 0.39215686274509803,
"grad_norm": 1.5715190172195435,
"learning_rate": 6.398305084745762e-05,
"loss": 17.4304,
"mean_token_accuracy": 0.6989135831594467,
"step": 585
},
{
"epoch": 0.39550863080274845,
"grad_norm": 1.8729442358016968,
"learning_rate": 6.362994350282486e-05,
"loss": 16.9125,
"mean_token_accuracy": 0.708356649428606,
"step": 590
},
{
"epoch": 0.39886039886039887,
"grad_norm": 2.099592685699463,
"learning_rate": 6.327683615819209e-05,
"loss": 17.542,
"mean_token_accuracy": 0.6888726130127907,
"step": 595
},
{
"epoch": 0.4022121669180493,
"grad_norm": 1.6204314231872559,
"learning_rate": 6.292372881355933e-05,
"loss": 16.9305,
"mean_token_accuracy": 0.7038852870464325,
"step": 600
},
{
"epoch": 0.4055639349756997,
"grad_norm": 2.12034010887146,
"learning_rate": 6.257062146892656e-05,
"loss": 17.0389,
"mean_token_accuracy": 0.704576326906681,
"step": 605
},
{
"epoch": 0.4089157030333501,
"grad_norm": 1.6821502447128296,
"learning_rate": 6.221751412429379e-05,
"loss": 16.788,
"mean_token_accuracy": 0.7000284940004349,
"step": 610
},
{
"epoch": 0.4122674710910005,
"grad_norm": 1.8137435913085938,
"learning_rate": 6.186440677966102e-05,
"loss": 17.5926,
"mean_token_accuracy": 0.6961537927389145,
"step": 615
},
{
"epoch": 0.4156192391486509,
"grad_norm": 1.6652235984802246,
"learning_rate": 6.151129943502825e-05,
"loss": 17.3539,
"mean_token_accuracy": 0.7028377398848533,
"step": 620
},
{
"epoch": 0.4189710072063013,
"grad_norm": 1.766480803489685,
"learning_rate": 6.115819209039548e-05,
"loss": 17.529,
"mean_token_accuracy": 0.6905739739537239,
"step": 625
},
{
"epoch": 0.42232277526395173,
"grad_norm": 1.6319854259490967,
"learning_rate": 6.080508474576272e-05,
"loss": 16.9847,
"mean_token_accuracy": 0.7060947254300117,
"step": 630
},
{
"epoch": 0.42567454332160215,
"grad_norm": 2.1006696224212646,
"learning_rate": 6.045197740112994e-05,
"loss": 16.9317,
"mean_token_accuracy": 0.7015593230724335,
"step": 635
},
{
"epoch": 0.42902631137925257,
"grad_norm": 1.7353427410125732,
"learning_rate": 6.009887005649718e-05,
"loss": 17.4744,
"mean_token_accuracy": 0.7001501567661762,
"step": 640
},
{
"epoch": 0.432378079436903,
"grad_norm": 1.9449700117111206,
"learning_rate": 5.974576271186441e-05,
"loss": 16.8705,
"mean_token_accuracy": 0.7026407413184643,
"step": 645
},
{
"epoch": 0.4357298474945534,
"grad_norm": 1.6030067205429077,
"learning_rate": 5.9392655367231644e-05,
"loss": 16.8924,
"mean_token_accuracy": 0.702277285605669,
"step": 650
},
{
"epoch": 0.43908161555220376,
"grad_norm": 1.5722424983978271,
"learning_rate": 5.903954802259888e-05,
"loss": 17.364,
"mean_token_accuracy": 0.6959278948605061,
"step": 655
},
{
"epoch": 0.4424333836098542,
"grad_norm": 1.8168216943740845,
"learning_rate": 5.86864406779661e-05,
"loss": 16.704,
"mean_token_accuracy": 0.7045813865959645,
"step": 660
},
{
"epoch": 0.4457851516675046,
"grad_norm": 1.905402660369873,
"learning_rate": 5.833333333333334e-05,
"loss": 16.8896,
"mean_token_accuracy": 0.7026248089969158,
"step": 665
},
{
"epoch": 0.449136919725155,
"grad_norm": 1.7437454462051392,
"learning_rate": 5.798022598870056e-05,
"loss": 17.0496,
"mean_token_accuracy": 0.702862861007452,
"step": 670
},
{
"epoch": 0.45248868778280543,
"grad_norm": 1.7496871948242188,
"learning_rate": 5.76271186440678e-05,
"loss": 16.7024,
"mean_token_accuracy": 0.7073140636086463,
"step": 675
},
{
"epoch": 0.45584045584045585,
"grad_norm": 1.6521803140640259,
"learning_rate": 5.727401129943503e-05,
"loss": 17.4437,
"mean_token_accuracy": 0.6910906590521335,
"step": 680
},
{
"epoch": 0.45919222389810627,
"grad_norm": 1.7904677391052246,
"learning_rate": 5.6920903954802264e-05,
"loss": 17.4803,
"mean_token_accuracy": 0.6987466789782047,
"step": 685
},
{
"epoch": 0.4625439919557567,
"grad_norm": 2.4545388221740723,
"learning_rate": 5.65677966101695e-05,
"loss": 17.2987,
"mean_token_accuracy": 0.699196208268404,
"step": 690
},
{
"epoch": 0.46589576001340705,
"grad_norm": 1.6428866386413574,
"learning_rate": 5.6214689265536723e-05,
"loss": 16.7636,
"mean_token_accuracy": 0.7029999569058418,
"step": 695
},
{
"epoch": 0.46924752807105746,
"grad_norm": 1.9685977697372437,
"learning_rate": 5.586158192090396e-05,
"loss": 17.3887,
"mean_token_accuracy": 0.6938736639916897,
"step": 700
},
{
"epoch": 0.4725992961287079,
"grad_norm": 1.5567928552627563,
"learning_rate": 5.550847457627118e-05,
"loss": 17.1879,
"mean_token_accuracy": 0.7024729043245316,
"step": 705
},
{
"epoch": 0.4759510641863583,
"grad_norm": 1.6846567392349243,
"learning_rate": 5.515536723163842e-05,
"loss": 16.8679,
"mean_token_accuracy": 0.7025640495121479,
"step": 710
},
{
"epoch": 0.4793028322440087,
"grad_norm": 1.6596832275390625,
"learning_rate": 5.480225988700565e-05,
"loss": 16.7137,
"mean_token_accuracy": 0.7031160019338131,
"step": 715
},
{
"epoch": 0.48265460030165913,
"grad_norm": 2.04453444480896,
"learning_rate": 5.4449152542372885e-05,
"loss": 17.0646,
"mean_token_accuracy": 0.7018779084086418,
"step": 720
},
{
"epoch": 0.48600636835930955,
"grad_norm": 1.7244528532028198,
"learning_rate": 5.409604519774012e-05,
"loss": 17.1897,
"mean_token_accuracy": 0.6981223806738853,
"step": 725
},
{
"epoch": 0.48935813641695997,
"grad_norm": 1.6929802894592285,
"learning_rate": 5.3742937853107344e-05,
"loss": 17.2678,
"mean_token_accuracy": 0.6996262572705746,
"step": 730
},
{
"epoch": 0.4927099044746104,
"grad_norm": 1.7945303916931152,
"learning_rate": 5.338983050847458e-05,
"loss": 17.1465,
"mean_token_accuracy": 0.7002299666404724,
"step": 735
},
{
"epoch": 0.49606167253226074,
"grad_norm": 1.5936013460159302,
"learning_rate": 5.30367231638418e-05,
"loss": 17.0265,
"mean_token_accuracy": 0.6998031720519066,
"step": 740
},
{
"epoch": 0.49941344058991116,
"grad_norm": 1.553004264831543,
"learning_rate": 5.268361581920904e-05,
"loss": 16.7301,
"mean_token_accuracy": 0.7022854961454869,
"step": 745
},
{
"epoch": 0.5027652086475616,
"grad_norm": 1.7667690515518188,
"learning_rate": 5.2330508474576275e-05,
"loss": 16.8576,
"mean_token_accuracy": 0.7085686258971691,
"step": 750
},
{
"epoch": 0.5027652086475616,
"eval_loss": 1.0600364208221436,
"eval_mean_token_accuracy": 0.7049777010093035,
"eval_runtime": 1736.5707,
"eval_samples_per_second": 1.392,
"eval_steps_per_second": 0.174,
"step": 750
},
{
"epoch": 0.506116976705212,
"grad_norm": 1.4901829957962036,
"learning_rate": 5.1977401129943505e-05,
"loss": 17.0004,
"mean_token_accuracy": 0.6990960523486137,
"step": 755
},
{
"epoch": 0.5094687447628624,
"grad_norm": 1.8451662063598633,
"learning_rate": 5.162429378531074e-05,
"loss": 17.2012,
"mean_token_accuracy": 0.7007680244743824,
"step": 760
},
{
"epoch": 0.5128205128205128,
"grad_norm": 1.6952011585235596,
"learning_rate": 5.1271186440677964e-05,
"loss": 17.612,
"mean_token_accuracy": 0.6927438467741013,
"step": 765
},
{
"epoch": 0.5161722808781632,
"grad_norm": 1.7307817935943604,
"learning_rate": 5.09180790960452e-05,
"loss": 16.8776,
"mean_token_accuracy": 0.706513649225235,
"step": 770
},
{
"epoch": 0.5195240489358136,
"grad_norm": 1.6692585945129395,
"learning_rate": 5.056497175141243e-05,
"loss": 17.0364,
"mean_token_accuracy": 0.704279126226902,
"step": 775
},
{
"epoch": 0.5228758169934641,
"grad_norm": 1.6963402032852173,
"learning_rate": 5.0211864406779666e-05,
"loss": 16.8957,
"mean_token_accuracy": 0.7085353158414364,
"step": 780
},
{
"epoch": 0.5262275850511144,
"grad_norm": 1.678458571434021,
"learning_rate": 4.9858757062146896e-05,
"loss": 17.7932,
"mean_token_accuracy": 0.6964584030210972,
"step": 785
},
{
"epoch": 0.5295793531087649,
"grad_norm": 1.7449827194213867,
"learning_rate": 4.9505649717514125e-05,
"loss": 16.8765,
"mean_token_accuracy": 0.7036922007799149,
"step": 790
},
{
"epoch": 0.5329311211664153,
"grad_norm": 1.7107524871826172,
"learning_rate": 4.915254237288136e-05,
"loss": 17.243,
"mean_token_accuracy": 0.6997682720422744,
"step": 795
},
{
"epoch": 0.5362828892240656,
"grad_norm": 1.6416223049163818,
"learning_rate": 4.879943502824859e-05,
"loss": 16.7253,
"mean_token_accuracy": 0.7050332672894001,
"step": 800
},
{
"epoch": 0.5396346572817161,
"grad_norm": 1.867213249206543,
"learning_rate": 4.844632768361582e-05,
"loss": 16.8566,
"mean_token_accuracy": 0.7032786093652248,
"step": 805
},
{
"epoch": 0.5429864253393665,
"grad_norm": 1.6539360284805298,
"learning_rate": 4.809322033898305e-05,
"loss": 16.6993,
"mean_token_accuracy": 0.7117977932095527,
"step": 810
},
{
"epoch": 0.546338193397017,
"grad_norm": 1.752715826034546,
"learning_rate": 4.7740112994350286e-05,
"loss": 17.5809,
"mean_token_accuracy": 0.6992670528590679,
"step": 815
},
{
"epoch": 0.5496899614546673,
"grad_norm": 1.806174397468567,
"learning_rate": 4.7387005649717516e-05,
"loss": 17.1588,
"mean_token_accuracy": 0.6960965767502785,
"step": 820
},
{
"epoch": 0.5530417295123178,
"grad_norm": 1.719764232635498,
"learning_rate": 4.703389830508475e-05,
"loss": 16.8685,
"mean_token_accuracy": 0.7025568410754204,
"step": 825
},
{
"epoch": 0.5563934975699681,
"grad_norm": 1.7800629138946533,
"learning_rate": 4.668079096045198e-05,
"loss": 16.8872,
"mean_token_accuracy": 0.6994628652930259,
"step": 830
},
{
"epoch": 0.5597452656276186,
"grad_norm": 1.7011103630065918,
"learning_rate": 4.632768361581921e-05,
"loss": 17.2342,
"mean_token_accuracy": 0.7006913289427757,
"step": 835
},
{
"epoch": 0.563097033685269,
"grad_norm": 1.6887695789337158,
"learning_rate": 4.597457627118644e-05,
"loss": 16.7385,
"mean_token_accuracy": 0.7045929700136184,
"step": 840
},
{
"epoch": 0.5664488017429193,
"grad_norm": 1.9496142864227295,
"learning_rate": 4.562146892655367e-05,
"loss": 16.8387,
"mean_token_accuracy": 0.7083131410181522,
"step": 845
},
{
"epoch": 0.5698005698005698,
"grad_norm": 1.7757388353347778,
"learning_rate": 4.5268361581920906e-05,
"loss": 17.3856,
"mean_token_accuracy": 0.6994826771318913,
"step": 850
},
{
"epoch": 0.5731523378582202,
"grad_norm": 1.7115302085876465,
"learning_rate": 4.491525423728814e-05,
"loss": 16.5993,
"mean_token_accuracy": 0.7093915119767189,
"step": 855
},
{
"epoch": 0.5765041059158706,
"grad_norm": 1.7968231439590454,
"learning_rate": 4.456214689265537e-05,
"loss": 16.8983,
"mean_token_accuracy": 0.7087731070816516,
"step": 860
},
{
"epoch": 0.579855873973521,
"grad_norm": 1.6066899299621582,
"learning_rate": 4.42090395480226e-05,
"loss": 16.7126,
"mean_token_accuracy": 0.7053335346281528,
"step": 865
},
{
"epoch": 0.5832076420311715,
"grad_norm": 1.6380205154418945,
"learning_rate": 4.385593220338983e-05,
"loss": 17.0037,
"mean_token_accuracy": 0.7038719221949578,
"step": 870
},
{
"epoch": 0.5865594100888218,
"grad_norm": 1.8956695795059204,
"learning_rate": 4.350282485875706e-05,
"loss": 16.9679,
"mean_token_accuracy": 0.6983371920883655,
"step": 875
},
{
"epoch": 0.5899111781464723,
"grad_norm": 1.625135064125061,
"learning_rate": 4.314971751412429e-05,
"loss": 17.0642,
"mean_token_accuracy": 0.7067640118300915,
"step": 880
},
{
"epoch": 0.5932629462041227,
"grad_norm": 1.6344581842422485,
"learning_rate": 4.279661016949153e-05,
"loss": 16.3079,
"mean_token_accuracy": 0.7225491903722286,
"step": 885
},
{
"epoch": 0.596614714261773,
"grad_norm": 1.7680976390838623,
"learning_rate": 4.244350282485876e-05,
"loss": 16.7187,
"mean_token_accuracy": 0.7041032016277313,
"step": 890
},
{
"epoch": 0.5999664823194235,
"grad_norm": 1.8056613206863403,
"learning_rate": 4.209039548022599e-05,
"loss": 17.3536,
"mean_token_accuracy": 0.6975419208407402,
"step": 895
},
{
"epoch": 0.6033182503770739,
"grad_norm": 1.8398966789245605,
"learning_rate": 4.173728813559322e-05,
"loss": 16.6245,
"mean_token_accuracy": 0.7088275127112865,
"step": 900
},
{
"epoch": 0.6066700184347243,
"grad_norm": 1.8332566022872925,
"learning_rate": 4.138418079096045e-05,
"loss": 17.0128,
"mean_token_accuracy": 0.7018843114376068,
"step": 905
},
{
"epoch": 0.6100217864923747,
"grad_norm": 1.6582337617874146,
"learning_rate": 4.103107344632768e-05,
"loss": 16.8948,
"mean_token_accuracy": 0.7051651798188686,
"step": 910
},
{
"epoch": 0.6133735545500252,
"grad_norm": 1.7373839616775513,
"learning_rate": 4.067796610169492e-05,
"loss": 16.9138,
"mean_token_accuracy": 0.7022108249366283,
"step": 915
},
{
"epoch": 0.6167253226076755,
"grad_norm": 1.6373577117919922,
"learning_rate": 4.0324858757062154e-05,
"loss": 17.0573,
"mean_token_accuracy": 0.7042267486453057,
"step": 920
},
{
"epoch": 0.620077090665326,
"grad_norm": 1.581024408340454,
"learning_rate": 3.997175141242938e-05,
"loss": 16.6234,
"mean_token_accuracy": 0.7054463028907776,
"step": 925
},
{
"epoch": 0.6234288587229764,
"grad_norm": 1.6900616884231567,
"learning_rate": 3.961864406779661e-05,
"loss": 17.0468,
"mean_token_accuracy": 0.7014504976570606,
"step": 930
},
{
"epoch": 0.6267806267806267,
"grad_norm": 1.6560430526733398,
"learning_rate": 3.926553672316384e-05,
"loss": 16.909,
"mean_token_accuracy": 0.7064756542444229,
"step": 935
},
{
"epoch": 0.6301323948382772,
"grad_norm": 1.8687000274658203,
"learning_rate": 3.891242937853107e-05,
"loss": 17.0047,
"mean_token_accuracy": 0.7055176287889481,
"step": 940
},
{
"epoch": 0.6334841628959276,
"grad_norm": 1.777716040611267,
"learning_rate": 3.855932203389831e-05,
"loss": 16.556,
"mean_token_accuracy": 0.7047871246933937,
"step": 945
},
{
"epoch": 0.636835930953578,
"grad_norm": 1.6830016374588013,
"learning_rate": 3.820621468926554e-05,
"loss": 16.5832,
"mean_token_accuracy": 0.7049862682819367,
"step": 950
},
{
"epoch": 0.6401876990112284,
"grad_norm": 1.5959638357162476,
"learning_rate": 3.7853107344632774e-05,
"loss": 16.8336,
"mean_token_accuracy": 0.7072055459022522,
"step": 955
},
{
"epoch": 0.6435394670688789,
"grad_norm": 1.82794189453125,
"learning_rate": 3.7500000000000003e-05,
"loss": 16.6644,
"mean_token_accuracy": 0.7058505766093731,
"step": 960
},
{
"epoch": 0.6468912351265292,
"grad_norm": 1.6554478406906128,
"learning_rate": 3.714689265536723e-05,
"loss": 16.2796,
"mean_token_accuracy": 0.7101977132260799,
"step": 965
},
{
"epoch": 0.6502430031841796,
"grad_norm": 1.8698370456695557,
"learning_rate": 3.679378531073446e-05,
"loss": 16.1934,
"mean_token_accuracy": 0.7142874717712402,
"step": 970
},
{
"epoch": 0.6535947712418301,
"grad_norm": 1.8040566444396973,
"learning_rate": 3.644067796610169e-05,
"loss": 16.5345,
"mean_token_accuracy": 0.7125143676996231,
"step": 975
},
{
"epoch": 0.6569465392994804,
"grad_norm": 1.6644558906555176,
"learning_rate": 3.608757062146893e-05,
"loss": 16.508,
"mean_token_accuracy": 0.7078846462070942,
"step": 980
},
{
"epoch": 0.6602983073571309,
"grad_norm": 1.7228506803512573,
"learning_rate": 3.573446327683616e-05,
"loss": 16.8474,
"mean_token_accuracy": 0.7084795109927654,
"step": 985
},
{
"epoch": 0.6636500754147813,
"grad_norm": 1.486241102218628,
"learning_rate": 3.5381355932203394e-05,
"loss": 17.1453,
"mean_token_accuracy": 0.6975291892886162,
"step": 990
},
{
"epoch": 0.6670018434724317,
"grad_norm": 1.7130765914916992,
"learning_rate": 3.5028248587570624e-05,
"loss": 16.458,
"mean_token_accuracy": 0.7106956362724304,
"step": 995
},
{
"epoch": 0.6703536115300821,
"grad_norm": 1.863926649093628,
"learning_rate": 3.467514124293785e-05,
"loss": 17.3095,
"mean_token_accuracy": 0.6962033234536648,
"step": 1000
},
{
"epoch": 0.6737053795877326,
"grad_norm": 1.6535072326660156,
"learning_rate": 3.432203389830508e-05,
"loss": 16.6846,
"mean_token_accuracy": 0.7084034703671932,
"step": 1005
},
{
"epoch": 0.6770571476453829,
"grad_norm": 1.7278594970703125,
"learning_rate": 3.396892655367232e-05,
"loss": 16.9805,
"mean_token_accuracy": 0.7026786416769027,
"step": 1010
},
{
"epoch": 0.6804089157030333,
"grad_norm": 1.9055004119873047,
"learning_rate": 3.361581920903955e-05,
"loss": 17.2562,
"mean_token_accuracy": 0.6977267302572727,
"step": 1015
},
{
"epoch": 0.6837606837606838,
"grad_norm": 1.6398614645004272,
"learning_rate": 3.326271186440678e-05,
"loss": 17.3378,
"mean_token_accuracy": 0.6958214737474918,
"step": 1020
},
{
"epoch": 0.6871124518183341,
"grad_norm": 1.926950454711914,
"learning_rate": 3.2909604519774014e-05,
"loss": 16.6536,
"mean_token_accuracy": 0.7083842910826206,
"step": 1025
},
{
"epoch": 0.6904642198759846,
"grad_norm": 1.8061659336090088,
"learning_rate": 3.2556497175141244e-05,
"loss": 16.643,
"mean_token_accuracy": 0.7093963578343392,
"step": 1030
},
{
"epoch": 0.693815987933635,
"grad_norm": 1.6816084384918213,
"learning_rate": 3.2203389830508473e-05,
"loss": 16.9696,
"mean_token_accuracy": 0.7000316813588142,
"step": 1035
},
{
"epoch": 0.6971677559912854,
"grad_norm": 1.630842685699463,
"learning_rate": 3.185028248587571e-05,
"loss": 16.587,
"mean_token_accuracy": 0.7107978977262974,
"step": 1040
},
{
"epoch": 0.7005195240489358,
"grad_norm": 1.755123257637024,
"learning_rate": 3.149717514124294e-05,
"loss": 17.0736,
"mean_token_accuracy": 0.7017260067164898,
"step": 1045
},
{
"epoch": 0.7038712921065863,
"grad_norm": 1.4850029945373535,
"learning_rate": 3.114406779661017e-05,
"loss": 16.3165,
"mean_token_accuracy": 0.7119720429182053,
"step": 1050
},
{
"epoch": 0.7072230601642366,
"grad_norm": 1.916961908340454,
"learning_rate": 3.0790960451977405e-05,
"loss": 17.0237,
"mean_token_accuracy": 0.6976533338427544,
"step": 1055
},
{
"epoch": 0.710574828221887,
"grad_norm": 1.5003294944763184,
"learning_rate": 3.043785310734463e-05,
"loss": 16.8504,
"mean_token_accuracy": 0.7056308597326278,
"step": 1060
},
{
"epoch": 0.7139265962795375,
"grad_norm": 1.9166836738586426,
"learning_rate": 3.0084745762711864e-05,
"loss": 16.8231,
"mean_token_accuracy": 0.7023352533578873,
"step": 1065
},
{
"epoch": 0.7172783643371878,
"grad_norm": 1.7789411544799805,
"learning_rate": 2.97316384180791e-05,
"loss": 17.3132,
"mean_token_accuracy": 0.6994914725422859,
"step": 1070
},
{
"epoch": 0.7206301323948383,
"grad_norm": 1.7289875745773315,
"learning_rate": 2.937853107344633e-05,
"loss": 17.3902,
"mean_token_accuracy": 0.69447166249156,
"step": 1075
},
{
"epoch": 0.7239819004524887,
"grad_norm": 1.4835467338562012,
"learning_rate": 2.902542372881356e-05,
"loss": 16.751,
"mean_token_accuracy": 0.7052346661686897,
"step": 1080
},
{
"epoch": 0.7273336685101391,
"grad_norm": 1.5802119970321655,
"learning_rate": 2.8672316384180792e-05,
"loss": 16.6574,
"mean_token_accuracy": 0.7059398606419564,
"step": 1085
},
{
"epoch": 0.7306854365677895,
"grad_norm": 1.8420851230621338,
"learning_rate": 2.8319209039548022e-05,
"loss": 16.9315,
"mean_token_accuracy": 0.7063411138951778,
"step": 1090
},
{
"epoch": 0.7340372046254399,
"grad_norm": 1.7593777179718018,
"learning_rate": 2.7966101694915255e-05,
"loss": 16.8653,
"mean_token_accuracy": 0.7089171193540096,
"step": 1095
},
{
"epoch": 0.7373889726830903,
"grad_norm": 1.681443452835083,
"learning_rate": 2.7612994350282488e-05,
"loss": 16.9878,
"mean_token_accuracy": 0.7057393230497837,
"step": 1100
},
{
"epoch": 0.7407407407407407,
"grad_norm": 1.6064281463623047,
"learning_rate": 2.725988700564972e-05,
"loss": 16.6153,
"mean_token_accuracy": 0.7038764618337154,
"step": 1105
},
{
"epoch": 0.7440925087983912,
"grad_norm": 1.5632483959197998,
"learning_rate": 2.690677966101695e-05,
"loss": 16.0927,
"mean_token_accuracy": 0.7171440742909908,
"step": 1110
},
{
"epoch": 0.7474442768560415,
"grad_norm": 1.8588156700134277,
"learning_rate": 2.6553672316384183e-05,
"loss": 16.5765,
"mean_token_accuracy": 0.7098327249288559,
"step": 1115
},
{
"epoch": 0.750796044913692,
"grad_norm": 1.5576221942901611,
"learning_rate": 2.6200564971751413e-05,
"loss": 16.6568,
"mean_token_accuracy": 0.7029327027499676,
"step": 1120
},
{
"epoch": 0.7541478129713424,
"grad_norm": 1.645244836807251,
"learning_rate": 2.5847457627118642e-05,
"loss": 16.7294,
"mean_token_accuracy": 0.7060277953743934,
"step": 1125
},
{
"epoch": 0.7574995810289928,
"grad_norm": 1.4038984775543213,
"learning_rate": 2.549435028248588e-05,
"loss": 16.5925,
"mean_token_accuracy": 0.7068064086139202,
"step": 1130
},
{
"epoch": 0.7608513490866432,
"grad_norm": 1.7987641096115112,
"learning_rate": 2.514124293785311e-05,
"loss": 16.6834,
"mean_token_accuracy": 0.7070130936801433,
"step": 1135
},
{
"epoch": 0.7642031171442936,
"grad_norm": 1.5423444509506226,
"learning_rate": 2.478813559322034e-05,
"loss": 16.4551,
"mean_token_accuracy": 0.7121224895119667,
"step": 1140
},
{
"epoch": 0.767554885201944,
"grad_norm": 1.7546942234039307,
"learning_rate": 2.443502824858757e-05,
"loss": 16.9741,
"mean_token_accuracy": 0.7010989025235176,
"step": 1145
},
{
"epoch": 0.7709066532595944,
"grad_norm": 1.8481935262680054,
"learning_rate": 2.4081920903954803e-05,
"loss": 16.6323,
"mean_token_accuracy": 0.7058765202760696,
"step": 1150
},
{
"epoch": 0.7742584213172449,
"grad_norm": 1.6855909824371338,
"learning_rate": 2.3728813559322036e-05,
"loss": 16.6844,
"mean_token_accuracy": 0.7119428858160972,
"step": 1155
},
{
"epoch": 0.7776101893748952,
"grad_norm": 1.9828130006790161,
"learning_rate": 2.3375706214689266e-05,
"loss": 16.866,
"mean_token_accuracy": 0.7036800056695938,
"step": 1160
},
{
"epoch": 0.7809619574325457,
"grad_norm": 1.5005120038986206,
"learning_rate": 2.30225988700565e-05,
"loss": 16.3539,
"mean_token_accuracy": 0.711839384585619,
"step": 1165
},
{
"epoch": 0.7843137254901961,
"grad_norm": 2.262735366821289,
"learning_rate": 2.266949152542373e-05,
"loss": 16.4102,
"mean_token_accuracy": 0.7110463745892048,
"step": 1170
},
{
"epoch": 0.7876654935478465,
"grad_norm": 1.6699568033218384,
"learning_rate": 2.231638418079096e-05,
"loss": 17.1027,
"mean_token_accuracy": 0.7031991191208362,
"step": 1175
},
{
"epoch": 0.7910172616054969,
"grad_norm": 1.6248890161514282,
"learning_rate": 2.196327683615819e-05,
"loss": 16.3399,
"mean_token_accuracy": 0.7143234215676785,
"step": 1180
},
{
"epoch": 0.7943690296631473,
"grad_norm": 1.7570775747299194,
"learning_rate": 2.1610169491525427e-05,
"loss": 16.2255,
"mean_token_accuracy": 0.7123358778655529,
"step": 1185
},
{
"epoch": 0.7977207977207977,
"grad_norm": 1.9391677379608154,
"learning_rate": 2.1257062146892657e-05,
"loss": 16.3472,
"mean_token_accuracy": 0.711616413295269,
"step": 1190
},
{
"epoch": 0.8010725657784481,
"grad_norm": 1.8997981548309326,
"learning_rate": 2.0903954802259886e-05,
"loss": 16.5601,
"mean_token_accuracy": 0.7071553356945515,
"step": 1195
},
{
"epoch": 0.8044243338360986,
"grad_norm": 1.6094359159469604,
"learning_rate": 2.055084745762712e-05,
"loss": 16.622,
"mean_token_accuracy": 0.7043877936899662,
"step": 1200
},
{
"epoch": 0.8077761018937489,
"grad_norm": 1.7940973043441772,
"learning_rate": 2.0197740112994352e-05,
"loss": 16.6535,
"mean_token_accuracy": 0.705554535984993,
"step": 1205
},
{
"epoch": 0.8111278699513994,
"grad_norm": 1.6890041828155518,
"learning_rate": 1.984463276836158e-05,
"loss": 17.2328,
"mean_token_accuracy": 0.6988375537097454,
"step": 1210
},
{
"epoch": 0.8144796380090498,
"grad_norm": 1.5568735599517822,
"learning_rate": 1.9491525423728814e-05,
"loss": 16.9753,
"mean_token_accuracy": 0.7015632651746273,
"step": 1215
},
{
"epoch": 0.8178314060667002,
"grad_norm": 1.7157835960388184,
"learning_rate": 1.9138418079096047e-05,
"loss": 16.3668,
"mean_token_accuracy": 0.7098449252545833,
"step": 1220
},
{
"epoch": 0.8211831741243506,
"grad_norm": 1.7175644636154175,
"learning_rate": 1.8785310734463277e-05,
"loss": 16.8061,
"mean_token_accuracy": 0.7032932281494141,
"step": 1225
},
{
"epoch": 0.824534942182001,
"grad_norm": 1.7225829362869263,
"learning_rate": 1.843220338983051e-05,
"loss": 16.5716,
"mean_token_accuracy": 0.7074852548539639,
"step": 1230
},
{
"epoch": 0.8278867102396514,
"grad_norm": 1.8654727935791016,
"learning_rate": 1.8079096045197743e-05,
"loss": 16.8172,
"mean_token_accuracy": 0.7035241700708866,
"step": 1235
},
{
"epoch": 0.8312384782973018,
"grad_norm": 1.9604694843292236,
"learning_rate": 1.7725988700564972e-05,
"loss": 16.2992,
"mean_token_accuracy": 0.714275274425745,
"step": 1240
},
{
"epoch": 0.8345902463549523,
"grad_norm": 1.7569185495376587,
"learning_rate": 1.7372881355932205e-05,
"loss": 16.6269,
"mean_token_accuracy": 0.7052666112780571,
"step": 1245
},
{
"epoch": 0.8379420144126026,
"grad_norm": 1.6537069082260132,
"learning_rate": 1.7019774011299435e-05,
"loss": 16.5978,
"mean_token_accuracy": 0.708269502967596,
"step": 1250
},
{
"epoch": 0.8412937824702531,
"grad_norm": 1.8623359203338623,
"learning_rate": 1.6666666666666667e-05,
"loss": 16.1831,
"mean_token_accuracy": 0.7164609245955944,
"step": 1255
},
{
"epoch": 0.8446455505279035,
"grad_norm": 1.7004101276397705,
"learning_rate": 1.63135593220339e-05,
"loss": 16.9611,
"mean_token_accuracy": 0.7057129152119159,
"step": 1260
},
{
"epoch": 0.8479973185855538,
"grad_norm": 1.8294973373413086,
"learning_rate": 1.596045197740113e-05,
"loss": 16.8036,
"mean_token_accuracy": 0.7046464517712593,
"step": 1265
},
{
"epoch": 0.8513490866432043,
"grad_norm": 1.7992702722549438,
"learning_rate": 1.5607344632768363e-05,
"loss": 16.139,
"mean_token_accuracy": 0.7126708298921585,
"step": 1270
},
{
"epoch": 0.8547008547008547,
"grad_norm": 2.033846855163574,
"learning_rate": 1.5254237288135596e-05,
"loss": 16.49,
"mean_token_accuracy": 0.707030464708805,
"step": 1275
},
{
"epoch": 0.8580526227585051,
"grad_norm": 1.690617561340332,
"learning_rate": 1.4901129943502825e-05,
"loss": 16.7829,
"mean_token_accuracy": 0.7026272863149643,
"step": 1280
},
{
"epoch": 0.8614043908161555,
"grad_norm": 1.7161706686019897,
"learning_rate": 1.4548022598870056e-05,
"loss": 16.4907,
"mean_token_accuracy": 0.7054763376712799,
"step": 1285
},
{
"epoch": 0.864756158873806,
"grad_norm": 1.5910500288009644,
"learning_rate": 1.419491525423729e-05,
"loss": 16.3073,
"mean_token_accuracy": 0.7165283918380737,
"step": 1290
},
{
"epoch": 0.8681079269314563,
"grad_norm": 1.5939749479293823,
"learning_rate": 1.384180790960452e-05,
"loss": 16.6524,
"mean_token_accuracy": 0.705347529053688,
"step": 1295
},
{
"epoch": 0.8714596949891068,
"grad_norm": 1.7478996515274048,
"learning_rate": 1.3488700564971752e-05,
"loss": 17.1832,
"mean_token_accuracy": 0.6956523738801479,
"step": 1300
},
{
"epoch": 0.8748114630467572,
"grad_norm": 1.6442205905914307,
"learning_rate": 1.3135593220338985e-05,
"loss": 16.3978,
"mean_token_accuracy": 0.7132278561592102,
"step": 1305
},
{
"epoch": 0.8781632311044075,
"grad_norm": 1.7201565504074097,
"learning_rate": 1.2782485875706216e-05,
"loss": 16.3159,
"mean_token_accuracy": 0.711051919311285,
"step": 1310
},
{
"epoch": 0.881514999162058,
"grad_norm": 1.829209327697754,
"learning_rate": 1.2429378531073447e-05,
"loss": 16.7987,
"mean_token_accuracy": 0.7058401651680469,
"step": 1315
},
{
"epoch": 0.8848667672197084,
"grad_norm": 1.4660886526107788,
"learning_rate": 1.2076271186440678e-05,
"loss": 16.7297,
"mean_token_accuracy": 0.7092804253101349,
"step": 1320
},
{
"epoch": 0.8882185352773588,
"grad_norm": 1.4927663803100586,
"learning_rate": 1.172316384180791e-05,
"loss": 15.9333,
"mean_token_accuracy": 0.7158772744238376,
"step": 1325
},
{
"epoch": 0.8915703033350092,
"grad_norm": 1.6522186994552612,
"learning_rate": 1.137005649717514e-05,
"loss": 16.4156,
"mean_token_accuracy": 0.7134528748691082,
"step": 1330
},
{
"epoch": 0.8949220713926597,
"grad_norm": 1.7809523344039917,
"learning_rate": 1.1016949152542374e-05,
"loss": 16.2625,
"mean_token_accuracy": 0.7148336976766586,
"step": 1335
},
{
"epoch": 0.89827383945031,
"grad_norm": 1.8860619068145752,
"learning_rate": 1.0663841807909605e-05,
"loss": 16.6187,
"mean_token_accuracy": 0.7087382405996323,
"step": 1340
},
{
"epoch": 0.9016256075079605,
"grad_norm": 1.854195475578308,
"learning_rate": 1.0310734463276836e-05,
"loss": 16.5843,
"mean_token_accuracy": 0.7144103929400444,
"step": 1345
},
{
"epoch": 0.9049773755656109,
"grad_norm": 1.7052239179611206,
"learning_rate": 9.957627118644067e-06,
"loss": 16.3345,
"mean_token_accuracy": 0.7125584341585636,
"step": 1350
},
{
"epoch": 0.9083291436232612,
"grad_norm": 1.5887420177459717,
"learning_rate": 9.6045197740113e-06,
"loss": 16.2409,
"mean_token_accuracy": 0.7080107174813748,
"step": 1355
},
{
"epoch": 0.9116809116809117,
"grad_norm": 1.6052732467651367,
"learning_rate": 9.251412429378532e-06,
"loss": 16.2373,
"mean_token_accuracy": 0.7137157171964645,
"step": 1360
},
{
"epoch": 0.9150326797385621,
"grad_norm": 1.7612617015838623,
"learning_rate": 8.898305084745763e-06,
"loss": 16.0292,
"mean_token_accuracy": 0.7181592255830764,
"step": 1365
},
{
"epoch": 0.9183844477962125,
"grad_norm": 1.8271749019622803,
"learning_rate": 8.545197740112996e-06,
"loss": 16.8757,
"mean_token_accuracy": 0.701992305368185,
"step": 1370
},
{
"epoch": 0.9217362158538629,
"grad_norm": 1.6350926160812378,
"learning_rate": 8.192090395480225e-06,
"loss": 16.6061,
"mean_token_accuracy": 0.7089238859713077,
"step": 1375
},
{
"epoch": 0.9250879839115134,
"grad_norm": 1.7321621179580688,
"learning_rate": 7.838983050847458e-06,
"loss": 16.2532,
"mean_token_accuracy": 0.7115737572312355,
"step": 1380
},
{
"epoch": 0.9284397519691637,
"grad_norm": 1.8958040475845337,
"learning_rate": 7.48587570621469e-06,
"loss": 16.5068,
"mean_token_accuracy": 0.7108790181577206,
"step": 1385
},
{
"epoch": 0.9317915200268141,
"grad_norm": 1.629992127418518,
"learning_rate": 7.1327683615819206e-06,
"loss": 16.2367,
"mean_token_accuracy": 0.7134776934981346,
"step": 1390
},
{
"epoch": 0.9351432880844646,
"grad_norm": 1.904123067855835,
"learning_rate": 6.779661016949153e-06,
"loss": 16.3444,
"mean_token_accuracy": 0.7045241884887219,
"step": 1395
},
{
"epoch": 0.9384950561421149,
"grad_norm": 1.6319600343704224,
"learning_rate": 6.426553672316385e-06,
"loss": 16.3,
"mean_token_accuracy": 0.7118948072195053,
"step": 1400
},
{
"epoch": 0.9418468241997654,
"grad_norm": 1.6921709775924683,
"learning_rate": 6.073446327683617e-06,
"loss": 16.5816,
"mean_token_accuracy": 0.7079687170684338,
"step": 1405
},
{
"epoch": 0.9451985922574158,
"grad_norm": 1.636551856994629,
"learning_rate": 5.720338983050848e-06,
"loss": 16.785,
"mean_token_accuracy": 0.7054948009550571,
"step": 1410
},
{
"epoch": 0.9485503603150662,
"grad_norm": 1.6171858310699463,
"learning_rate": 5.367231638418079e-06,
"loss": 16.6877,
"mean_token_accuracy": 0.7033485405147075,
"step": 1415
},
{
"epoch": 0.9519021283727166,
"grad_norm": 1.6833641529083252,
"learning_rate": 5.014124293785311e-06,
"loss": 16.5803,
"mean_token_accuracy": 0.706027402728796,
"step": 1420
},
{
"epoch": 0.9552538964303671,
"grad_norm": 2.0238494873046875,
"learning_rate": 4.6610169491525425e-06,
"loss": 16.4305,
"mean_token_accuracy": 0.7110757566988468,
"step": 1425
},
{
"epoch": 0.9586056644880174,
"grad_norm": 1.5262683629989624,
"learning_rate": 4.307909604519774e-06,
"loss": 16.105,
"mean_token_accuracy": 0.7173994883894921,
"step": 1430
},
{
"epoch": 0.9619574325456678,
"grad_norm": 1.6822128295898438,
"learning_rate": 3.954802259887006e-06,
"loss": 17.0064,
"mean_token_accuracy": 0.7033144362270832,
"step": 1435
},
{
"epoch": 0.9653092006033183,
"grad_norm": 2.1382946968078613,
"learning_rate": 3.6016949152542374e-06,
"loss": 16.6567,
"mean_token_accuracy": 0.7085098147392273,
"step": 1440
},
{
"epoch": 0.9686609686609686,
"grad_norm": 1.6137080192565918,
"learning_rate": 3.248587570621469e-06,
"loss": 16.4193,
"mean_token_accuracy": 0.7077061600983143,
"step": 1445
},
{
"epoch": 0.9720127367186191,
"grad_norm": 1.6318018436431885,
"learning_rate": 2.8954802259887007e-06,
"loss": 16.5904,
"mean_token_accuracy": 0.7037704810500145,
"step": 1450
},
{
"epoch": 0.9753645047762695,
"grad_norm": 1.6723519563674927,
"learning_rate": 2.5423728813559323e-06,
"loss": 16.351,
"mean_token_accuracy": 0.715372896194458,
"step": 1455
},
{
"epoch": 0.9787162728339199,
"grad_norm": 2.6915719509124756,
"learning_rate": 2.189265536723164e-06,
"loss": 16.5627,
"mean_token_accuracy": 0.706637478619814,
"step": 1460
},
{
"epoch": 0.9820680408915703,
"grad_norm": 1.9349390268325806,
"learning_rate": 1.8361581920903956e-06,
"loss": 16.7821,
"mean_token_accuracy": 0.7010103747248649,
"step": 1465
},
{
"epoch": 0.9854198089492208,
"grad_norm": 1.6685172319412231,
"learning_rate": 1.4830508474576273e-06,
"loss": 16.7016,
"mean_token_accuracy": 0.7086931586265564,
"step": 1470
},
{
"epoch": 0.9887715770068711,
"grad_norm": 1.7148998975753784,
"learning_rate": 1.129943502824859e-06,
"loss": 16.4809,
"mean_token_accuracy": 0.7131018862128258,
"step": 1475
},
{
"epoch": 0.9921233450645215,
"grad_norm": 1.8873836994171143,
"learning_rate": 7.768361581920904e-07,
"loss": 16.5183,
"mean_token_accuracy": 0.7111847102642059,
"step": 1480
},
{
"epoch": 0.995475113122172,
"grad_norm": 1.8390552997589111,
"learning_rate": 4.2372881355932204e-07,
"loss": 16.1742,
"mean_token_accuracy": 0.7128683432936669,
"step": 1485
},
{
"epoch": 0.9988268811798223,
"grad_norm": 1.8799461126327515,
"learning_rate": 7.062146892655368e-08,
"loss": 17.1633,
"mean_token_accuracy": 0.6963419988751411,
"step": 1490
}
],
"logging_steps": 5,
"max_steps": 1491,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 750,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5012213304045076e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}