VideoMolmo_checkpoints / train_prev_f4_model_bf16_LORA_ckpt600trainer_state.json
Fahad-S's picture
Upload train_prev_f4_model_bf16_LORA_ckpt600trainer_state.json with huggingface_hub
dd63b74 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.1278437465320166,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0035512151814449007,
"grad_norm": 0.09903648495674133,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.4075,
"mean_token_accuracy": 0.03430055791613995,
"step": 1
},
{
"epoch": 0.007102430362889801,
"grad_norm": 0.14180830121040344,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.4487,
"mean_token_accuracy": 0.03482818407428567,
"step": 2
},
{
"epoch": 0.010653645544334702,
"grad_norm": 0.11331801116466522,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.419,
"mean_token_accuracy": 0.030828955586912343,
"step": 3
},
{
"epoch": 0.014204860725779603,
"grad_norm": 0.13718660175800323,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.4512,
"mean_token_accuracy": 0.03115637891824008,
"step": 4
},
{
"epoch": 0.017756075907224503,
"grad_norm": 0.1235477402806282,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.4367,
"mean_token_accuracy": 0.03077795269928174,
"step": 5
},
{
"epoch": 0.021307291088669404,
"grad_norm": 0.11979226022958755,
"learning_rate": 8.571428571428571e-06,
"loss": 0.4685,
"mean_token_accuracy": 0.028771477849659277,
"step": 6
},
{
"epoch": 0.024858506270114305,
"grad_norm": 0.12223777920007706,
"learning_rate": 1e-05,
"loss": 0.4658,
"mean_token_accuracy": 0.028678809732809896,
"step": 7
},
{
"epoch": 0.028409721451559206,
"grad_norm": 0.12957823276519775,
"learning_rate": 9.999980365120307e-06,
"loss": 0.4549,
"mean_token_accuracy": 0.031924477760185255,
"step": 8
},
{
"epoch": 0.0319609366330041,
"grad_norm": 0.11528096348047256,
"learning_rate": 9.999921460635436e-06,
"loss": 0.4568,
"mean_token_accuracy": 0.02935352978965966,
"step": 9
},
{
"epoch": 0.03551215181444901,
"grad_norm": 0.11329038441181183,
"learning_rate": 9.999823287008022e-06,
"loss": 0.4423,
"mean_token_accuracy": 0.031521555294602877,
"step": 10
},
{
"epoch": 0.039063366995893904,
"grad_norm": 0.11604199558496475,
"learning_rate": 9.999685845009114e-06,
"loss": 0.4312,
"mean_token_accuracy": 0.03416921407915652,
"step": 11
},
{
"epoch": 0.04261458217733881,
"grad_norm": 0.10851351916790009,
"learning_rate": 9.999509135718176e-06,
"loss": 0.4357,
"mean_token_accuracy": 0.029943688183266204,
"step": 12
},
{
"epoch": 0.046165797358783706,
"grad_norm": 0.10252473503351212,
"learning_rate": 9.999293160523074e-06,
"loss": 0.4327,
"mean_token_accuracy": 0.03228329481498804,
"step": 13
},
{
"epoch": 0.04971701254022861,
"grad_norm": 0.1350434422492981,
"learning_rate": 9.999037921120068e-06,
"loss": 0.4546,
"mean_token_accuracy": 0.03238412337668706,
"step": 14
},
{
"epoch": 0.05326822772167351,
"grad_norm": 0.10226991772651672,
"learning_rate": 9.998743419513795e-06,
"loss": 0.443,
"mean_token_accuracy": 0.03165558609907748,
"step": 15
},
{
"epoch": 0.05681944290311841,
"grad_norm": 0.13848458230495453,
"learning_rate": 9.998409658017256e-06,
"loss": 0.4851,
"mean_token_accuracy": 0.03008650508854771,
"step": 16
},
{
"epoch": 0.06037065808456331,
"grad_norm": 0.11423337459564209,
"learning_rate": 9.998036639251798e-06,
"loss": 0.4088,
"mean_token_accuracy": 0.030042485868762014,
"step": 17
},
{
"epoch": 0.0639218732660082,
"grad_norm": 0.13896144926548004,
"learning_rate": 9.997624366147094e-06,
"loss": 0.4404,
"mean_token_accuracy": 0.030281073039077455,
"step": 18
},
{
"epoch": 0.06747308844745312,
"grad_norm": 0.10911770910024643,
"learning_rate": 9.997172841941114e-06,
"loss": 0.4448,
"mean_token_accuracy": 0.03207043489237549,
"step": 19
},
{
"epoch": 0.07102430362889801,
"grad_norm": 0.12348262220621109,
"learning_rate": 9.99668207018011e-06,
"loss": 0.4368,
"mean_token_accuracy": 0.033165274380735354,
"step": 20
},
{
"epoch": 0.07457551881034291,
"grad_norm": 0.1128680482506752,
"learning_rate": 9.996152054718579e-06,
"loss": 0.432,
"mean_token_accuracy": 0.030988398069894174,
"step": 21
},
{
"epoch": 0.07812673399178781,
"grad_norm": 0.11654637008905411,
"learning_rate": 9.995582799719237e-06,
"loss": 0.4493,
"mean_token_accuracy": 0.032766436190286186,
"step": 22
},
{
"epoch": 0.08167794917323272,
"grad_norm": 0.11634049564599991,
"learning_rate": 9.994974309652984e-06,
"loss": 0.4752,
"mean_token_accuracy": 0.031991063471650705,
"step": 23
},
{
"epoch": 0.08522916435467762,
"grad_norm": 0.1251709908246994,
"learning_rate": 9.994326589298875e-06,
"loss": 0.4215,
"mean_token_accuracy": 0.036875830935969134,
"step": 24
},
{
"epoch": 0.08878037953612251,
"grad_norm": 0.1075059100985527,
"learning_rate": 9.993639643744071e-06,
"loss": 0.4055,
"mean_token_accuracy": 0.03151970944236382,
"step": 25
},
{
"epoch": 0.09233159471756741,
"grad_norm": 0.11893657594919205,
"learning_rate": 9.99291347838381e-06,
"loss": 0.4635,
"mean_token_accuracy": 0.028766451931005577,
"step": 26
},
{
"epoch": 0.09588280989901232,
"grad_norm": 0.12458056956529617,
"learning_rate": 9.992148098921361e-06,
"loss": 0.465,
"mean_token_accuracy": 0.03177367915850482,
"step": 27
},
{
"epoch": 0.09943402508045722,
"grad_norm": 0.12087491899728775,
"learning_rate": 9.99134351136798e-06,
"loss": 0.4415,
"mean_token_accuracy": 0.034534925398475025,
"step": 28
},
{
"epoch": 0.10298524026190212,
"grad_norm": 0.13300693035125732,
"learning_rate": 9.990499722042852e-06,
"loss": 0.4532,
"mean_token_accuracy": 0.03134459158900427,
"step": 29
},
{
"epoch": 0.10653645544334701,
"grad_norm": 0.11541508138179779,
"learning_rate": 9.989616737573064e-06,
"loss": 0.4373,
"mean_token_accuracy": 0.03319291834850446,
"step": 30
},
{
"epoch": 0.11008767062479193,
"grad_norm": 0.1059102788567543,
"learning_rate": 9.98869456489353e-06,
"loss": 0.4341,
"mean_token_accuracy": 0.03190637141960906,
"step": 31
},
{
"epoch": 0.11363888580623682,
"grad_norm": 0.11466473340988159,
"learning_rate": 9.987733211246952e-06,
"loss": 0.4453,
"mean_token_accuracy": 0.030628334232460475,
"step": 32
},
{
"epoch": 0.11719010098768172,
"grad_norm": 0.11979004740715027,
"learning_rate": 9.986732684183753e-06,
"loss": 0.4449,
"mean_token_accuracy": 0.03580710734240711,
"step": 33
},
{
"epoch": 0.12074131616912662,
"grad_norm": 0.1313788890838623,
"learning_rate": 9.985692991562026e-06,
"loss": 0.4171,
"mean_token_accuracy": 0.035130951946484856,
"step": 34
},
{
"epoch": 0.12429253135057153,
"grad_norm": 0.1338130682706833,
"learning_rate": 9.984614141547468e-06,
"loss": 0.4269,
"mean_token_accuracy": 0.03584297694033012,
"step": 35
},
{
"epoch": 0.1278437465320164,
"grad_norm": 0.13190719485282898,
"learning_rate": 9.983496142613314e-06,
"loss": 0.4456,
"mean_token_accuracy": 0.02804510169880814,
"step": 36
},
{
"epoch": 0.13139496171346132,
"grad_norm": 0.10598088055849075,
"learning_rate": 9.982339003540272e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.036306285659520654,
"step": 37
},
{
"epoch": 0.13494617689490623,
"grad_norm": 0.09667421877384186,
"learning_rate": 9.981142733416457e-06,
"loss": 0.3926,
"mean_token_accuracy": 0.03226046055351617,
"step": 38
},
{
"epoch": 0.13849739207635112,
"grad_norm": 0.12407270818948746,
"learning_rate": 9.97990734163732e-06,
"loss": 0.4234,
"mean_token_accuracy": 0.032570251350989565,
"step": 39
},
{
"epoch": 0.14204860725779603,
"grad_norm": 0.10452189296483994,
"learning_rate": 9.978632837905566e-06,
"loss": 0.4217,
"mean_token_accuracy": 0.034018361457128776,
"step": 40
},
{
"epoch": 0.14559982243924094,
"grad_norm": 0.1165575161576271,
"learning_rate": 9.977319232231088e-06,
"loss": 0.4209,
"mean_token_accuracy": 0.03243047746218508,
"step": 41
},
{
"epoch": 0.14915103762068582,
"grad_norm": 0.12650305032730103,
"learning_rate": 9.975966534930879e-06,
"loss": 0.4372,
"mean_token_accuracy": 0.02945730801729951,
"step": 42
},
{
"epoch": 0.15270225280213073,
"grad_norm": 0.11621509492397308,
"learning_rate": 9.974574756628961e-06,
"loss": 0.452,
"mean_token_accuracy": 0.031811273845960386,
"step": 43
},
{
"epoch": 0.15625346798357562,
"grad_norm": 0.12744253873825073,
"learning_rate": 9.973143908256291e-06,
"loss": 0.4459,
"mean_token_accuracy": 0.029921257570094895,
"step": 44
},
{
"epoch": 0.15980468316502053,
"grad_norm": 0.15649749338626862,
"learning_rate": 9.971674001050687e-06,
"loss": 0.4252,
"mean_token_accuracy": 0.030991621693829075,
"step": 45
},
{
"epoch": 0.16335589834646544,
"grad_norm": 0.11494240909814835,
"learning_rate": 9.970165046556726e-06,
"loss": 0.4232,
"mean_token_accuracy": 0.03496951759007061,
"step": 46
},
{
"epoch": 0.16690711352791032,
"grad_norm": 0.12458129972219467,
"learning_rate": 9.968617056625665e-06,
"loss": 0.4633,
"mean_token_accuracy": 0.02982275520116673,
"step": 47
},
{
"epoch": 0.17045832870935523,
"grad_norm": 0.11398381739854813,
"learning_rate": 9.967030043415345e-06,
"loss": 0.4503,
"mean_token_accuracy": 0.03281566769874189,
"step": 48
},
{
"epoch": 0.17400954389080014,
"grad_norm": 0.10996269434690475,
"learning_rate": 9.965404019390087e-06,
"loss": 0.4218,
"mean_token_accuracy": 0.0321590854000533,
"step": 49
},
{
"epoch": 0.17756075907224503,
"grad_norm": 0.12258810549974442,
"learning_rate": 9.963738997320609e-06,
"loss": 0.4482,
"mean_token_accuracy": 0.027974106487818062,
"step": 50
},
{
"epoch": 0.18111197425368994,
"grad_norm": 0.1071733832359314,
"learning_rate": 9.962034990283912e-06,
"loss": 0.4257,
"mean_token_accuracy": 0.02945839857784449,
"step": 51
},
{
"epoch": 0.18466318943513482,
"grad_norm": 0.12644729018211365,
"learning_rate": 9.960292011663186e-06,
"loss": 0.4792,
"mean_token_accuracy": 0.032736343830038095,
"step": 52
},
{
"epoch": 0.18821440461657973,
"grad_norm": 0.10932295769453049,
"learning_rate": 9.958510075147703e-06,
"loss": 0.4029,
"mean_token_accuracy": 0.029769023109111004,
"step": 53
},
{
"epoch": 0.19176561979802464,
"grad_norm": 0.1045532375574112,
"learning_rate": 9.956689194732702e-06,
"loss": 0.4121,
"mean_token_accuracy": 0.032606568154733395,
"step": 54
},
{
"epoch": 0.19531683497946953,
"grad_norm": 0.11694876849651337,
"learning_rate": 9.954829384719296e-06,
"loss": 0.4416,
"mean_token_accuracy": 0.03312313952847035,
"step": 55
},
{
"epoch": 0.19886805016091444,
"grad_norm": 0.12251029908657074,
"learning_rate": 9.95293065971434e-06,
"loss": 0.4289,
"mean_token_accuracy": 0.03135830574319698,
"step": 56
},
{
"epoch": 0.20241926534235935,
"grad_norm": 0.10416404157876968,
"learning_rate": 9.950993034630328e-06,
"loss": 0.4275,
"mean_token_accuracy": 0.038488676873384975,
"step": 57
},
{
"epoch": 0.20597048052380423,
"grad_norm": 0.1636134833097458,
"learning_rate": 9.949016524685277e-06,
"loss": 0.464,
"mean_token_accuracy": 0.02992881732279784,
"step": 58
},
{
"epoch": 0.20952169570524914,
"grad_norm": 0.12780985236167908,
"learning_rate": 9.947001145402598e-06,
"loss": 0.4541,
"mean_token_accuracy": 0.029724605861702003,
"step": 59
},
{
"epoch": 0.21307291088669403,
"grad_norm": 0.13115057349205017,
"learning_rate": 9.944946912610986e-06,
"loss": 0.462,
"mean_token_accuracy": 0.03166838363176794,
"step": 60
},
{
"epoch": 0.21662412606813894,
"grad_norm": 0.12131655216217041,
"learning_rate": 9.942853842444283e-06,
"loss": 0.4206,
"mean_token_accuracy": 0.030444662748777773,
"step": 61
},
{
"epoch": 0.22017534124958385,
"grad_norm": 0.13489212095737457,
"learning_rate": 9.940721951341365e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.0293623886336718,
"step": 62
},
{
"epoch": 0.22372655643102873,
"grad_norm": 0.11916383355855942,
"learning_rate": 9.938551256046e-06,
"loss": 0.4512,
"mean_token_accuracy": 0.035470658251142595,
"step": 63
},
{
"epoch": 0.22727777161247364,
"grad_norm": 0.13127347826957703,
"learning_rate": 9.936341773606723e-06,
"loss": 0.4424,
"mean_token_accuracy": 0.032253861045319354,
"step": 64
},
{
"epoch": 0.23082898679391856,
"grad_norm": 0.12579481303691864,
"learning_rate": 9.934093521376707e-06,
"loss": 0.4465,
"mean_token_accuracy": 0.03229751772960299,
"step": 65
},
{
"epoch": 0.23438020197536344,
"grad_norm": 0.11594023555517197,
"learning_rate": 9.931806517013612e-06,
"loss": 0.4399,
"mean_token_accuracy": 0.02944770805333974,
"step": 66
},
{
"epoch": 0.23793141715680835,
"grad_norm": 0.10294274985790253,
"learning_rate": 9.929480778479465e-06,
"loss": 0.4186,
"mean_token_accuracy": 0.03471728676959174,
"step": 67
},
{
"epoch": 0.24148263233825323,
"grad_norm": 0.13848905265331268,
"learning_rate": 9.9271163240405e-06,
"loss": 0.4196,
"mean_token_accuracy": 0.03424310322952806,
"step": 68
},
{
"epoch": 0.24503384751969814,
"grad_norm": 0.10381077975034714,
"learning_rate": 9.92471317226703e-06,
"loss": 0.4448,
"mean_token_accuracy": 0.03184566564414126,
"step": 69
},
{
"epoch": 0.24858506270114306,
"grad_norm": 0.13564680516719818,
"learning_rate": 9.922271342033295e-06,
"loss": 0.4259,
"mean_token_accuracy": 0.03590564796468243,
"step": 70
},
{
"epoch": 0.25213627788258797,
"grad_norm": 0.13113564252853394,
"learning_rate": 9.919790852517313e-06,
"loss": 0.4574,
"mean_token_accuracy": 0.034656246283702785,
"step": 71
},
{
"epoch": 0.2556874930640328,
"grad_norm": 0.10604698956012726,
"learning_rate": 9.917271723200725e-06,
"loss": 0.4,
"mean_token_accuracy": 0.03376925739576109,
"step": 72
},
{
"epoch": 0.25923870824547773,
"grad_norm": 0.1232958659529686,
"learning_rate": 9.914713973868654e-06,
"loss": 0.4529,
"mean_token_accuracy": 0.031287746594898636,
"step": 73
},
{
"epoch": 0.26278992342692264,
"grad_norm": 0.12214750051498413,
"learning_rate": 9.91211762460954e-06,
"loss": 0.4576,
"mean_token_accuracy": 0.032098766198032536,
"step": 74
},
{
"epoch": 0.26634113860836756,
"grad_norm": 0.11422038823366165,
"learning_rate": 9.909482695814986e-06,
"loss": 0.424,
"mean_token_accuracy": 0.03233322404776118,
"step": 75
},
{
"epoch": 0.26989235378981247,
"grad_norm": 0.11640136688947678,
"learning_rate": 9.906809208179593e-06,
"loss": 0.4368,
"mean_token_accuracy": 0.03468143657846667,
"step": 76
},
{
"epoch": 0.2734435689712574,
"grad_norm": 0.10775226354598999,
"learning_rate": 9.904097182700806e-06,
"loss": 0.413,
"mean_token_accuracy": 0.0327887820021715,
"step": 77
},
{
"epoch": 0.27699478415270223,
"grad_norm": 0.1443656086921692,
"learning_rate": 9.901346640678744e-06,
"loss": 0.4565,
"mean_token_accuracy": 0.03074216824643372,
"step": 78
},
{
"epoch": 0.28054599933414714,
"grad_norm": 0.13974997401237488,
"learning_rate": 9.898557603716031e-06,
"loss": 0.4522,
"mean_token_accuracy": 0.030708255846548127,
"step": 79
},
{
"epoch": 0.28409721451559206,
"grad_norm": 0.11789468675851822,
"learning_rate": 9.895730093717629e-06,
"loss": 0.4354,
"mean_token_accuracy": 0.029810201263899216,
"step": 80
},
{
"epoch": 0.28764842969703697,
"grad_norm": 0.1083148941397667,
"learning_rate": 9.892864132890663e-06,
"loss": 0.414,
"mean_token_accuracy": 0.032822823130118195,
"step": 81
},
{
"epoch": 0.2911996448784819,
"grad_norm": 0.11463505774736404,
"learning_rate": 9.889959743744253e-06,
"loss": 0.4493,
"mean_token_accuracy": 0.030246941671066452,
"step": 82
},
{
"epoch": 0.29475086005992673,
"grad_norm": 0.1068756952881813,
"learning_rate": 9.887016949089334e-06,
"loss": 0.4228,
"mean_token_accuracy": 0.03203033060890448,
"step": 83
},
{
"epoch": 0.29830207524137164,
"grad_norm": 0.1452023833990097,
"learning_rate": 9.884035772038471e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.03580652122036554,
"step": 84
},
{
"epoch": 0.30185329042281656,
"grad_norm": 0.11701487749814987,
"learning_rate": 9.881016236005686e-06,
"loss": 0.4338,
"mean_token_accuracy": 0.03185546858730959,
"step": 85
},
{
"epoch": 0.30540450560426147,
"grad_norm": 0.123188316822052,
"learning_rate": 9.877958364706269e-06,
"loss": 0.4633,
"mean_token_accuracy": 0.03289600303105544,
"step": 86
},
{
"epoch": 0.3089557207857064,
"grad_norm": 0.10650799423456192,
"learning_rate": 9.874862182156596e-06,
"loss": 0.4365,
"mean_token_accuracy": 0.032173037085158285,
"step": 87
},
{
"epoch": 0.31250693596715123,
"grad_norm": 0.13090448081493378,
"learning_rate": 9.871727712673931e-06,
"loss": 0.4148,
"mean_token_accuracy": 0.032025952023104765,
"step": 88
},
{
"epoch": 0.31605815114859614,
"grad_norm": 0.1158757209777832,
"learning_rate": 9.868554980876253e-06,
"loss": 0.434,
"mean_token_accuracy": 0.03277148631241289,
"step": 89
},
{
"epoch": 0.31960936633004106,
"grad_norm": 0.1502567082643509,
"learning_rate": 9.865344011682038e-06,
"loss": 0.4402,
"mean_token_accuracy": 0.03716896351761534,
"step": 90
},
{
"epoch": 0.32316058151148597,
"grad_norm": 0.10705456882715225,
"learning_rate": 9.86209483031009e-06,
"loss": 0.4428,
"mean_token_accuracy": 0.028792706354579423,
"step": 91
},
{
"epoch": 0.3267117966929309,
"grad_norm": 0.13711455464363098,
"learning_rate": 9.858807462279319e-06,
"loss": 0.4311,
"mean_token_accuracy": 0.03321756741206627,
"step": 92
},
{
"epoch": 0.3302630118743758,
"grad_norm": 0.13615398108959198,
"learning_rate": 9.855481933408557e-06,
"loss": 0.4426,
"mean_token_accuracy": 0.03406765976615134,
"step": 93
},
{
"epoch": 0.33381422705582064,
"grad_norm": 0.13447901606559753,
"learning_rate": 9.852118269816348e-06,
"loss": 0.4807,
"mean_token_accuracy": 0.029012137778408942,
"step": 94
},
{
"epoch": 0.33736544223726556,
"grad_norm": 0.12379316985607147,
"learning_rate": 9.848716497920742e-06,
"loss": 0.4446,
"mean_token_accuracy": 0.030392555565413204,
"step": 95
},
{
"epoch": 0.34091665741871047,
"grad_norm": 0.1602836698293686,
"learning_rate": 9.845276644439093e-06,
"loss": 0.4601,
"mean_token_accuracy": 0.03084721965751669,
"step": 96
},
{
"epoch": 0.3444678726001554,
"grad_norm": 0.08885196596384048,
"learning_rate": 9.841798736387846e-06,
"loss": 0.4065,
"mean_token_accuracy": 0.03453358236947679,
"step": 97
},
{
"epoch": 0.3480190877816003,
"grad_norm": 0.12009483575820923,
"learning_rate": 9.838282801082322e-06,
"loss": 0.473,
"mean_token_accuracy": 0.03147461433763965,
"step": 98
},
{
"epoch": 0.35157030296304514,
"grad_norm": 0.1066279485821724,
"learning_rate": 9.834728866136506e-06,
"loss": 0.4057,
"mean_token_accuracy": 0.03152646504895529,
"step": 99
},
{
"epoch": 0.35512151814449006,
"grad_norm": 0.11169735342264175,
"learning_rate": 9.831136959462835e-06,
"loss": 0.4499,
"mean_token_accuracy": 0.03387650641161599,
"step": 100
},
{
"epoch": 0.35867273332593497,
"grad_norm": 0.12812377512454987,
"learning_rate": 9.82750710927197e-06,
"loss": 0.4314,
"mean_token_accuracy": 0.03172285951222875,
"step": 101
},
{
"epoch": 0.3622239485073799,
"grad_norm": 0.1205376535654068,
"learning_rate": 9.823839344072582e-06,
"loss": 0.4214,
"mean_token_accuracy": 0.03345074072058196,
"step": 102
},
{
"epoch": 0.3657751636888248,
"grad_norm": 0.12378670275211334,
"learning_rate": 9.820133692671116e-06,
"loss": 0.439,
"mean_token_accuracy": 0.029327621996344533,
"step": 103
},
{
"epoch": 0.36932637887026964,
"grad_norm": 0.1197931170463562,
"learning_rate": 9.816390184171587e-06,
"loss": 0.4552,
"mean_token_accuracy": 0.03034232833306305,
"step": 104
},
{
"epoch": 0.37287759405171456,
"grad_norm": 0.13017283380031586,
"learning_rate": 9.812608847975327e-06,
"loss": 0.4246,
"mean_token_accuracy": 0.03273073174932506,
"step": 105
},
{
"epoch": 0.37642880923315947,
"grad_norm": 0.11206940561532974,
"learning_rate": 9.808789713780768e-06,
"loss": 0.43,
"mean_token_accuracy": 0.033182675353600644,
"step": 106
},
{
"epoch": 0.3799800244146044,
"grad_norm": 0.15535788238048553,
"learning_rate": 9.804932811583208e-06,
"loss": 0.4609,
"mean_token_accuracy": 0.029387073122052243,
"step": 107
},
{
"epoch": 0.3835312395960493,
"grad_norm": 0.10909680277109146,
"learning_rate": 9.801038171674571e-06,
"loss": 0.4159,
"mean_token_accuracy": 0.03437764603586402,
"step": 108
},
{
"epoch": 0.3870824547774942,
"grad_norm": 0.09239600598812103,
"learning_rate": 9.797105824643171e-06,
"loss": 0.4191,
"mean_token_accuracy": 0.03158426017398597,
"step": 109
},
{
"epoch": 0.39063366995893906,
"grad_norm": 0.1135198250412941,
"learning_rate": 9.793135801373472e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.029844234144547954,
"step": 110
},
{
"epoch": 0.39418488514038397,
"grad_norm": 0.13279543817043304,
"learning_rate": 9.789128133045846e-06,
"loss": 0.4734,
"mean_token_accuracy": 0.034466699355107266,
"step": 111
},
{
"epoch": 0.3977361003218289,
"grad_norm": 0.1167665496468544,
"learning_rate": 9.785082851136327e-06,
"loss": 0.4249,
"mean_token_accuracy": 0.03815961653708655,
"step": 112
},
{
"epoch": 0.4012873155032738,
"grad_norm": 0.10479965060949326,
"learning_rate": 9.780999987416363e-06,
"loss": 0.4256,
"mean_token_accuracy": 0.03297240539905033,
"step": 113
},
{
"epoch": 0.4048385306847187,
"grad_norm": 0.11566231399774551,
"learning_rate": 9.776879573952573e-06,
"loss": 0.4347,
"mean_token_accuracy": 0.03426292850053869,
"step": 114
},
{
"epoch": 0.40838974586616356,
"grad_norm": 0.13041988015174866,
"learning_rate": 9.772721643106483e-06,
"loss": 0.4171,
"mean_token_accuracy": 0.030951603603170952,
"step": 115
},
{
"epoch": 0.41194096104760847,
"grad_norm": 0.13270442187786102,
"learning_rate": 9.768526227534286e-06,
"loss": 0.4541,
"mean_token_accuracy": 0.030458911915047793,
"step": 116
},
{
"epoch": 0.4154921762290534,
"grad_norm": 0.1140994057059288,
"learning_rate": 9.764293360186568e-06,
"loss": 0.4252,
"mean_token_accuracy": 0.032363708789489465,
"step": 117
},
{
"epoch": 0.4190433914104983,
"grad_norm": 0.10969191789627075,
"learning_rate": 9.760023074308067e-06,
"loss": 0.4403,
"mean_token_accuracy": 0.03104240054017282,
"step": 118
},
{
"epoch": 0.4225946065919432,
"grad_norm": 0.10822800546884537,
"learning_rate": 9.755715403437405e-06,
"loss": 0.4271,
"mean_token_accuracy": 0.027545168155484134,
"step": 119
},
{
"epoch": 0.42614582177338806,
"grad_norm": 0.10963085293769836,
"learning_rate": 9.75137038140682e-06,
"loss": 0.413,
"mean_token_accuracy": 0.030798182297075982,
"step": 120
},
{
"epoch": 0.42969703695483297,
"grad_norm": 0.11720361560583115,
"learning_rate": 9.746988042341907e-06,
"loss": 0.4112,
"mean_token_accuracy": 0.03484758762715501,
"step": 121
},
{
"epoch": 0.4332482521362779,
"grad_norm": 0.12010498344898224,
"learning_rate": 9.742568420661347e-06,
"loss": 0.4248,
"mean_token_accuracy": 0.034901620656455634,
"step": 122
},
{
"epoch": 0.4367994673177228,
"grad_norm": 0.11497566103935242,
"learning_rate": 9.738111551076633e-06,
"loss": 0.4231,
"mean_token_accuracy": 0.03080306958872825,
"step": 123
},
{
"epoch": 0.4403506824991677,
"grad_norm": 0.1160268560051918,
"learning_rate": 9.733617468591806e-06,
"loss": 0.4501,
"mean_token_accuracy": 0.029371788314165315,
"step": 124
},
{
"epoch": 0.4439018976806126,
"grad_norm": 0.11650796234607697,
"learning_rate": 9.729086208503174e-06,
"loss": 0.4168,
"mean_token_accuracy": 0.03356469544087304,
"step": 125
},
{
"epoch": 0.44745311286205747,
"grad_norm": 0.13858026266098022,
"learning_rate": 9.724517806399035e-06,
"loss": 0.4122,
"mean_token_accuracy": 0.030320848196424777,
"step": 126
},
{
"epoch": 0.4510043280435024,
"grad_norm": 0.09430894255638123,
"learning_rate": 9.7199122981594e-06,
"loss": 0.4063,
"mean_token_accuracy": 0.03472123346364242,
"step": 127
},
{
"epoch": 0.4545555432249473,
"grad_norm": 0.11508966982364655,
"learning_rate": 9.715269719955708e-06,
"loss": 0.4464,
"mean_token_accuracy": 0.032851900316018146,
"step": 128
},
{
"epoch": 0.4581067584063922,
"grad_norm": 0.11557421833276749,
"learning_rate": 9.710590108250546e-06,
"loss": 0.4382,
"mean_token_accuracy": 0.03142100031618611,
"step": 129
},
{
"epoch": 0.4616579735878371,
"grad_norm": 0.11115779727697372,
"learning_rate": 9.705873499797358e-06,
"loss": 0.4127,
"mean_token_accuracy": 0.030225146732846042,
"step": 130
},
{
"epoch": 0.46520918876928197,
"grad_norm": 0.13472823798656464,
"learning_rate": 9.701119931640161e-06,
"loss": 0.4499,
"mean_token_accuracy": 0.03345891845674487,
"step": 131
},
{
"epoch": 0.4687604039507269,
"grad_norm": 0.14002355933189392,
"learning_rate": 9.69632944111325e-06,
"loss": 0.4515,
"mean_token_accuracy": 0.02956337868090486,
"step": 132
},
{
"epoch": 0.4723116191321718,
"grad_norm": 0.11660108715295792,
"learning_rate": 9.691502065840905e-06,
"loss": 0.4259,
"mean_token_accuracy": 0.03448187607500586,
"step": 133
},
{
"epoch": 0.4758628343136167,
"grad_norm": 0.1005408763885498,
"learning_rate": 9.686637843737104e-06,
"loss": 0.4246,
"mean_token_accuracy": 0.0317718359438004,
"step": 134
},
{
"epoch": 0.4794140494950616,
"grad_norm": 0.11530707031488419,
"learning_rate": 9.681736813005207e-06,
"loss": 0.4211,
"mean_token_accuracy": 0.028531658732390497,
"step": 135
},
{
"epoch": 0.48296526467650647,
"grad_norm": 0.11586639285087585,
"learning_rate": 9.676799012137678e-06,
"loss": 0.4539,
"mean_token_accuracy": 0.035154912646248704,
"step": 136
},
{
"epoch": 0.4865164798579514,
"grad_norm": 0.11311322450637817,
"learning_rate": 9.671824479915768e-06,
"loss": 0.4323,
"mean_token_accuracy": 0.03284673898087931,
"step": 137
},
{
"epoch": 0.4900676950393963,
"grad_norm": 0.14726871252059937,
"learning_rate": 9.666813255409212e-06,
"loss": 0.4535,
"mean_token_accuracy": 0.02753433164252783,
"step": 138
},
{
"epoch": 0.4936189102208412,
"grad_norm": 0.10065948218107224,
"learning_rate": 9.661765377975924e-06,
"loss": 0.4168,
"mean_token_accuracy": 0.034299176466447534,
"step": 139
},
{
"epoch": 0.4971701254022861,
"grad_norm": 0.13521119952201843,
"learning_rate": 9.656680887261693e-06,
"loss": 0.4413,
"mean_token_accuracy": 0.03242656226575491,
"step": 140
},
{
"epoch": 0.500721340583731,
"grad_norm": 0.11620158702135086,
"learning_rate": 9.651559823199865e-06,
"loss": 0.4219,
"mean_token_accuracy": 0.03236671327249496,
"step": 141
},
{
"epoch": 0.5042725557651759,
"grad_norm": 0.13755175471305847,
"learning_rate": 9.646402226011028e-06,
"loss": 0.4646,
"mean_token_accuracy": 0.027886055768249207,
"step": 142
},
{
"epoch": 0.5078237709466208,
"grad_norm": 0.12309759855270386,
"learning_rate": 9.641208136202705e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.033787832529924344,
"step": 143
},
{
"epoch": 0.5113749861280656,
"grad_norm": 0.09280434995889664,
"learning_rate": 9.635977594569025e-06,
"loss": 0.4072,
"mean_token_accuracy": 0.03402286476557492,
"step": 144
},
{
"epoch": 0.5149262013095106,
"grad_norm": 0.10794156789779663,
"learning_rate": 9.630710642190412e-06,
"loss": 0.4207,
"mean_token_accuracy": 0.03402460503275506,
"step": 145
},
{
"epoch": 0.5184774164909555,
"grad_norm": 0.12627671658992767,
"learning_rate": 9.625407320433257e-06,
"loss": 0.4509,
"mean_token_accuracy": 0.03308519825441181,
"step": 146
},
{
"epoch": 0.5220286316724004,
"grad_norm": 0.11183463037014008,
"learning_rate": 9.620067670949593e-06,
"loss": 0.447,
"mean_token_accuracy": 0.030306155767902965,
"step": 147
},
{
"epoch": 0.5255798468538453,
"grad_norm": 0.12490588426589966,
"learning_rate": 9.614691735676768e-06,
"loss": 0.438,
"mean_token_accuracy": 0.02836215259230812,
"step": 148
},
{
"epoch": 0.5291310620352903,
"grad_norm": 0.09944428503513336,
"learning_rate": 9.609279556837122e-06,
"loss": 0.4048,
"mean_token_accuracy": 0.034694020345341414,
"step": 149
},
{
"epoch": 0.5326822772167351,
"grad_norm": 0.11730767041444778,
"learning_rate": 9.603831176937645e-06,
"loss": 0.4741,
"mean_token_accuracy": 0.03214656777709024,
"step": 150
},
{
"epoch": 0.53623349239818,
"grad_norm": 0.1205916702747345,
"learning_rate": 9.598346638769653e-06,
"loss": 0.4363,
"mean_token_accuracy": 0.02935866809639265,
"step": 151
},
{
"epoch": 0.5397847075796249,
"grad_norm": 0.11929334700107574,
"learning_rate": 9.592825985408443e-06,
"loss": 0.4471,
"mean_token_accuracy": 0.033947633866773685,
"step": 152
},
{
"epoch": 0.5433359227610698,
"grad_norm": 0.12662938237190247,
"learning_rate": 9.58726926021296e-06,
"loss": 0.4543,
"mean_token_accuracy": 0.028889564997371053,
"step": 153
},
{
"epoch": 0.5468871379425148,
"grad_norm": 0.11105228215456009,
"learning_rate": 9.581676506825458e-06,
"loss": 0.4367,
"mean_token_accuracy": 0.03555938474892173,
"step": 154
},
{
"epoch": 0.5504383531239596,
"grad_norm": 0.11740686744451523,
"learning_rate": 9.576047769171154e-06,
"loss": 0.4299,
"mean_token_accuracy": 0.03297806582850171,
"step": 155
},
{
"epoch": 0.5539895683054045,
"grad_norm": 0.11532069742679596,
"learning_rate": 9.57038309145788e-06,
"loss": 0.4662,
"mean_token_accuracy": 0.02882920480624307,
"step": 156
},
{
"epoch": 0.5575407834868494,
"grad_norm": 0.11282172054052353,
"learning_rate": 9.564682518175745e-06,
"loss": 0.4591,
"mean_token_accuracy": 0.03218559510423802,
"step": 157
},
{
"epoch": 0.5610919986682943,
"grad_norm": 0.09830707311630249,
"learning_rate": 9.558946094096773e-06,
"loss": 0.4048,
"mean_token_accuracy": 0.030819052895822097,
"step": 158
},
{
"epoch": 0.5646432138497393,
"grad_norm": 0.13022297620773315,
"learning_rate": 9.553173864274567e-06,
"loss": 0.4528,
"mean_token_accuracy": 0.031908016972010955,
"step": 159
},
{
"epoch": 0.5681944290311841,
"grad_norm": 0.09976419061422348,
"learning_rate": 9.547365874043939e-06,
"loss": 0.4228,
"mean_token_accuracy": 0.03265460357397387,
"step": 160
},
{
"epoch": 0.571745644212629,
"grad_norm": 0.12907098233699799,
"learning_rate": 9.541522169020568e-06,
"loss": 0.4338,
"mean_token_accuracy": 0.03252671978043509,
"step": 161
},
{
"epoch": 0.5752968593940739,
"grad_norm": 0.11115579307079315,
"learning_rate": 9.535642795100628e-06,
"loss": 0.4329,
"mean_token_accuracy": 0.03106809964447166,
"step": 162
},
{
"epoch": 0.5788480745755188,
"grad_norm": 0.10402530431747437,
"learning_rate": 9.529727798460443e-06,
"loss": 0.4158,
"mean_token_accuracy": 0.02999804872160894,
"step": 163
},
{
"epoch": 0.5823992897569638,
"grad_norm": 0.11144474148750305,
"learning_rate": 9.52377722555611e-06,
"loss": 0.4318,
"mean_token_accuracy": 0.031277922937078984,
"step": 164
},
{
"epoch": 0.5859505049384086,
"grad_norm": 0.11141372472047806,
"learning_rate": 9.517791123123141e-06,
"loss": 0.4513,
"mean_token_accuracy": 0.030790336892096093,
"step": 165
},
{
"epoch": 0.5895017201198535,
"grad_norm": 0.11419973522424698,
"learning_rate": 9.5117695381761e-06,
"loss": 0.4478,
"mean_token_accuracy": 0.03272564147846424,
"step": 166
},
{
"epoch": 0.5930529353012984,
"grad_norm": 0.11346927285194397,
"learning_rate": 9.50571251800822e-06,
"loss": 0.4243,
"mean_token_accuracy": 0.034652669322895235,
"step": 167
},
{
"epoch": 0.5966041504827433,
"grad_norm": 0.10645310580730438,
"learning_rate": 9.49962011019105e-06,
"loss": 0.4291,
"mean_token_accuracy": 0.030576513236155733,
"step": 168
},
{
"epoch": 0.6001553656641883,
"grad_norm": 0.13944825530052185,
"learning_rate": 9.493492362574069e-06,
"loss": 0.4506,
"mean_token_accuracy": 0.032292869611410424,
"step": 169
},
{
"epoch": 0.6037065808456331,
"grad_norm": 0.12811113893985748,
"learning_rate": 9.487329323284306e-06,
"loss": 0.4272,
"mean_token_accuracy": 0.028058004420017824,
"step": 170
},
{
"epoch": 0.607257796027078,
"grad_norm": 0.12232954055070877,
"learning_rate": 9.481131040725982e-06,
"loss": 0.4304,
"mean_token_accuracy": 0.030032273742108373,
"step": 171
},
{
"epoch": 0.6108090112085229,
"grad_norm": 0.12088574469089508,
"learning_rate": 9.474897563580105e-06,
"loss": 0.4298,
"mean_token_accuracy": 0.03754826654039789,
"step": 172
},
{
"epoch": 0.6143602263899678,
"grad_norm": 0.14544999599456787,
"learning_rate": 9.468628940804109e-06,
"loss": 0.4611,
"mean_token_accuracy": 0.02830790860025445,
"step": 173
},
{
"epoch": 0.6179114415714128,
"grad_norm": 0.12021893262863159,
"learning_rate": 9.46232522163145e-06,
"loss": 0.4297,
"mean_token_accuracy": 0.031058607095474144,
"step": 174
},
{
"epoch": 0.6214626567528576,
"grad_norm": 0.11498667299747467,
"learning_rate": 9.45598645557124e-06,
"loss": 0.4365,
"mean_token_accuracy": 0.03069377435167553,
"step": 175
},
{
"epoch": 0.6250138719343025,
"grad_norm": 0.11569106578826904,
"learning_rate": 9.44961269240784e-06,
"loss": 0.4524,
"mean_token_accuracy": 0.031692910630226834,
"step": 176
},
{
"epoch": 0.6285650871157474,
"grad_norm": 0.11758638918399811,
"learning_rate": 9.443203982200479e-06,
"loss": 0.4201,
"mean_token_accuracy": 0.028326944633590756,
"step": 177
},
{
"epoch": 0.6321163022971923,
"grad_norm": 0.12534549832344055,
"learning_rate": 9.436760375282858e-06,
"loss": 0.4518,
"mean_token_accuracy": 0.03576031195188989,
"step": 178
},
{
"epoch": 0.6356675174786373,
"grad_norm": 0.12341169267892838,
"learning_rate": 9.430281922262758e-06,
"loss": 0.4191,
"mean_token_accuracy": 0.03796804480589344,
"step": 179
},
{
"epoch": 0.6392187326600821,
"grad_norm": 0.12558291852474213,
"learning_rate": 9.423768674021638e-06,
"loss": 0.4382,
"mean_token_accuracy": 0.030642931031252374,
"step": 180
},
{
"epoch": 0.642769947841527,
"grad_norm": 0.10039878636598587,
"learning_rate": 9.417220681714232e-06,
"loss": 0.4258,
"mean_token_accuracy": 0.031603102244844195,
"step": 181
},
{
"epoch": 0.6463211630229719,
"grad_norm": 0.1142401471734047,
"learning_rate": 9.410637996768161e-06,
"loss": 0.4323,
"mean_token_accuracy": 0.030828532620944316,
"step": 182
},
{
"epoch": 0.6498723782044168,
"grad_norm": 0.1079002246260643,
"learning_rate": 9.404020670883511e-06,
"loss": 0.4113,
"mean_token_accuracy": 0.03495721969375154,
"step": 183
},
{
"epoch": 0.6534235933858618,
"grad_norm": 0.11479309946298599,
"learning_rate": 9.397368756032445e-06,
"loss": 0.444,
"mean_token_accuracy": 0.030968798611866077,
"step": 184
},
{
"epoch": 0.6569748085673066,
"grad_norm": 0.12519249320030212,
"learning_rate": 9.390682304458782e-06,
"loss": 0.4447,
"mean_token_accuracy": 0.03184685645828722,
"step": 185
},
{
"epoch": 0.6605260237487516,
"grad_norm": 0.13242708146572113,
"learning_rate": 9.38396136867759e-06,
"loss": 0.4325,
"mean_token_accuracy": 0.03234595538378926,
"step": 186
},
{
"epoch": 0.6640772389301964,
"grad_norm": 0.1431904435157776,
"learning_rate": 9.377206001474773e-06,
"loss": 0.4637,
"mean_token_accuracy": 0.02888246741349576,
"step": 187
},
{
"epoch": 0.6676284541116413,
"grad_norm": 0.12454501539468765,
"learning_rate": 9.370416255906663e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.03247980809828732,
"step": 188
},
{
"epoch": 0.6711796692930863,
"grad_norm": 0.142369344830513,
"learning_rate": 9.363592185299593e-06,
"loss": 0.4505,
"mean_token_accuracy": 0.03117010975802259,
"step": 189
},
{
"epoch": 0.6747308844745311,
"grad_norm": 0.11705569177865982,
"learning_rate": 9.356733843249487e-06,
"loss": 0.4729,
"mean_token_accuracy": 0.030519108702719677,
"step": 190
},
{
"epoch": 0.6782820996559761,
"grad_norm": 0.10681698471307755,
"learning_rate": 9.349841283621432e-06,
"loss": 0.4472,
"mean_token_accuracy": 0.02811988699613721,
"step": 191
},
{
"epoch": 0.6818333148374209,
"grad_norm": 0.1344628483057022,
"learning_rate": 9.34291456054926e-06,
"loss": 0.4646,
"mean_token_accuracy": 0.03209848375627189,
"step": 192
},
{
"epoch": 0.6853845300188658,
"grad_norm": 0.12343592941761017,
"learning_rate": 9.33595372843512e-06,
"loss": 0.4418,
"mean_token_accuracy": 0.028969483559194487,
"step": 193
},
{
"epoch": 0.6889357452003108,
"grad_norm": 0.10727835446596146,
"learning_rate": 9.328958841949056e-06,
"loss": 0.4168,
"mean_token_accuracy": 0.03168499671301106,
"step": 194
},
{
"epoch": 0.6924869603817556,
"grad_norm": 0.1320902556180954,
"learning_rate": 9.321929956028565e-06,
"loss": 0.4324,
"mean_token_accuracy": 0.030861409675708273,
"step": 195
},
{
"epoch": 0.6960381755632006,
"grad_norm": 0.10983603447675705,
"learning_rate": 9.31486712587818e-06,
"loss": 0.4466,
"mean_token_accuracy": 0.03086903478106251,
"step": 196
},
{
"epoch": 0.6995893907446454,
"grad_norm": 0.12487445771694183,
"learning_rate": 9.307770406969032e-06,
"loss": 0.3974,
"mean_token_accuracy": 0.03599889015458757,
"step": 197
},
{
"epoch": 0.7031406059260903,
"grad_norm": 0.12027975171804428,
"learning_rate": 9.300639855038405e-06,
"loss": 0.4511,
"mean_token_accuracy": 0.03203024027243373,
"step": 198
},
{
"epoch": 0.7066918211075353,
"grad_norm": 0.12009831517934799,
"learning_rate": 9.293475526089316e-06,
"loss": 0.4304,
"mean_token_accuracy": 0.031490876361203846,
"step": 199
},
{
"epoch": 0.7102430362889801,
"grad_norm": 0.12162379175424576,
"learning_rate": 9.286277476390056e-06,
"loss": 0.4525,
"mean_token_accuracy": 0.029639694414072437,
"step": 200
},
{
"epoch": 0.7137942514704251,
"grad_norm": 0.1278138905763626,
"learning_rate": 9.279045762473764e-06,
"loss": 0.4715,
"mean_token_accuracy": 0.030264464585343376,
"step": 201
},
{
"epoch": 0.7173454666518699,
"grad_norm": 0.10495288670063019,
"learning_rate": 9.27178044113797e-06,
"loss": 0.4277,
"mean_token_accuracy": 0.03156335685889644,
"step": 202
},
{
"epoch": 0.7208966818333148,
"grad_norm": 0.12295868992805481,
"learning_rate": 9.264481569444157e-06,
"loss": 0.4545,
"mean_token_accuracy": 0.030497470339469146,
"step": 203
},
{
"epoch": 0.7244478970147598,
"grad_norm": 0.09332438558340073,
"learning_rate": 9.257149204717317e-06,
"loss": 0.3944,
"mean_token_accuracy": 0.03187960746254248,
"step": 204
},
{
"epoch": 0.7279991121962046,
"grad_norm": 0.14773069322109222,
"learning_rate": 9.249783404545488e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.0344977921267855,
"step": 205
},
{
"epoch": 0.7315503273776496,
"grad_norm": 0.12294626981019974,
"learning_rate": 9.242384226779308e-06,
"loss": 0.4572,
"mean_token_accuracy": 0.03103074165119324,
"step": 206
},
{
"epoch": 0.7351015425590944,
"grad_norm": 0.13636770844459534,
"learning_rate": 9.234951729531564e-06,
"loss": 0.4458,
"mean_token_accuracy": 0.0317818928451743,
"step": 207
},
{
"epoch": 0.7386527577405393,
"grad_norm": 0.10994532704353333,
"learning_rate": 9.227485971176734e-06,
"loss": 0.4293,
"mean_token_accuracy": 0.03580091031108168,
"step": 208
},
{
"epoch": 0.7422039729219843,
"grad_norm": 0.13844896852970123,
"learning_rate": 9.219987010350522e-06,
"loss": 0.4661,
"mean_token_accuracy": 0.031334696937847184,
"step": 209
},
{
"epoch": 0.7457551881034291,
"grad_norm": 0.12561361491680145,
"learning_rate": 9.212454905949406e-06,
"loss": 0.4398,
"mean_token_accuracy": 0.03021751243250037,
"step": 210
},
{
"epoch": 0.7493064032848741,
"grad_norm": 0.09732896089553833,
"learning_rate": 9.204889717130172e-06,
"loss": 0.4107,
"mean_token_accuracy": 0.03446503423401737,
"step": 211
},
{
"epoch": 0.7528576184663189,
"grad_norm": 0.1296277940273285,
"learning_rate": 9.197291503309448e-06,
"loss": 0.4356,
"mean_token_accuracy": 0.03087400232834625,
"step": 212
},
{
"epoch": 0.7564088336477638,
"grad_norm": 0.10486706346273422,
"learning_rate": 9.189660324163243e-06,
"loss": 0.418,
"mean_token_accuracy": 0.035182704639737494,
"step": 213
},
{
"epoch": 0.7599600488292088,
"grad_norm": 0.12244177609682083,
"learning_rate": 9.181996239626468e-06,
"loss": 0.4523,
"mean_token_accuracy": 0.031129384631640278,
"step": 214
},
{
"epoch": 0.7635112640106536,
"grad_norm": 0.11858798563480377,
"learning_rate": 9.174299309892474e-06,
"loss": 0.4207,
"mean_token_accuracy": 0.03296038699045312,
"step": 215
},
{
"epoch": 0.7670624791920986,
"grad_norm": 0.11604490876197815,
"learning_rate": 9.166569595412576e-06,
"loss": 0.4627,
"mean_token_accuracy": 0.030237445222155657,
"step": 216
},
{
"epoch": 0.7706136943735434,
"grad_norm": 0.1077352985739708,
"learning_rate": 9.158807156895581e-06,
"loss": 0.4314,
"mean_token_accuracy": 0.031499510438152356,
"step": 217
},
{
"epoch": 0.7741649095549884,
"grad_norm": 0.10301569849252701,
"learning_rate": 9.151012055307308e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.03421672209515236,
"step": 218
},
{
"epoch": 0.7777161247364333,
"grad_norm": 0.10384063422679901,
"learning_rate": 9.14318435187011e-06,
"loss": 0.4272,
"mean_token_accuracy": 0.028132701208960498,
"step": 219
},
{
"epoch": 0.7812673399178781,
"grad_norm": 0.13823232054710388,
"learning_rate": 9.135324108062391e-06,
"loss": 0.4281,
"mean_token_accuracy": 0.037649581041478086,
"step": 220
},
{
"epoch": 0.7848185550993231,
"grad_norm": 0.12410993129014969,
"learning_rate": 9.127431385618129e-06,
"loss": 0.418,
"mean_token_accuracy": 0.03602545694229775,
"step": 221
},
{
"epoch": 0.7883697702807679,
"grad_norm": 0.117218516767025,
"learning_rate": 9.119506246526386e-06,
"loss": 0.445,
"mean_token_accuracy": 0.03329144358031044,
"step": 222
},
{
"epoch": 0.7919209854622129,
"grad_norm": 0.11457571387290955,
"learning_rate": 9.111548753030824e-06,
"loss": 0.4492,
"mean_token_accuracy": 0.031933008820487885,
"step": 223
},
{
"epoch": 0.7954722006436578,
"grad_norm": 0.12239421904087067,
"learning_rate": 9.103558967629211e-06,
"loss": 0.4316,
"mean_token_accuracy": 0.03384177159387036,
"step": 224
},
{
"epoch": 0.7990234158251026,
"grad_norm": 0.1020677462220192,
"learning_rate": 9.09553695307294e-06,
"loss": 0.416,
"mean_token_accuracy": 0.02926015923367231,
"step": 225
},
{
"epoch": 0.8025746310065476,
"grad_norm": 0.13255175948143005,
"learning_rate": 9.087482772366529e-06,
"loss": 0.4609,
"mean_token_accuracy": 0.029086984148307238,
"step": 226
},
{
"epoch": 0.8061258461879924,
"grad_norm": 0.12250262498855591,
"learning_rate": 9.07939648876712e-06,
"loss": 0.4503,
"mean_token_accuracy": 0.028839589653216535,
"step": 227
},
{
"epoch": 0.8096770613694374,
"grad_norm": 0.1015993133187294,
"learning_rate": 9.071278165784001e-06,
"loss": 0.4456,
"mean_token_accuracy": 0.036248502918169834,
"step": 228
},
{
"epoch": 0.8132282765508823,
"grad_norm": 0.11699055135250092,
"learning_rate": 9.063127867178085e-06,
"loss": 0.4456,
"mean_token_accuracy": 0.03173907856398728,
"step": 229
},
{
"epoch": 0.8167794917323271,
"grad_norm": 0.1485580950975418,
"learning_rate": 9.054945656961429e-06,
"loss": 0.4553,
"mean_token_accuracy": 0.031534753976302454,
"step": 230
},
{
"epoch": 0.8203307069137721,
"grad_norm": 0.11307156831026077,
"learning_rate": 9.046731599396716e-06,
"loss": 0.4558,
"mean_token_accuracy": 0.030681278425618075,
"step": 231
},
{
"epoch": 0.8238819220952169,
"grad_norm": 0.13930678367614746,
"learning_rate": 9.03848575899676e-06,
"loss": 0.4537,
"mean_token_accuracy": 0.03255952116523986,
"step": 232
},
{
"epoch": 0.8274331372766619,
"grad_norm": 0.12353396415710449,
"learning_rate": 9.030208200523994e-06,
"loss": 0.4994,
"mean_token_accuracy": 0.02909530242322944,
"step": 233
},
{
"epoch": 0.8309843524581068,
"grad_norm": 0.13621285557746887,
"learning_rate": 9.021898988989966e-06,
"loss": 0.4975,
"mean_token_accuracy": 0.02880604089659755,
"step": 234
},
{
"epoch": 0.8345355676395516,
"grad_norm": 0.12057027965784073,
"learning_rate": 9.013558189654819e-06,
"loss": 0.4231,
"mean_token_accuracy": 0.031169906702416483,
"step": 235
},
{
"epoch": 0.8380867828209966,
"grad_norm": 0.1317608654499054,
"learning_rate": 9.005185868026793e-06,
"loss": 0.4354,
"mean_token_accuracy": 0.032176410291867796,
"step": 236
},
{
"epoch": 0.8416379980024414,
"grad_norm": 0.11058870702981949,
"learning_rate": 8.996782089861699e-06,
"loss": 0.4211,
"mean_token_accuracy": 0.030159930855006678,
"step": 237
},
{
"epoch": 0.8451892131838864,
"grad_norm": 0.11160682886838913,
"learning_rate": 8.988346921162407e-06,
"loss": 0.4608,
"mean_token_accuracy": 0.03140243760572048,
"step": 238
},
{
"epoch": 0.8487404283653313,
"grad_norm": 0.12711158394813538,
"learning_rate": 8.979880428178323e-06,
"loss": 0.425,
"mean_token_accuracy": 0.03014450103364652,
"step": 239
},
{
"epoch": 0.8522916435467761,
"grad_norm": 0.10027121752500534,
"learning_rate": 8.971382677404878e-06,
"loss": 0.4168,
"mean_token_accuracy": 0.0301158644942916,
"step": 240
},
{
"epoch": 0.8558428587282211,
"grad_norm": 0.11261257529258728,
"learning_rate": 8.962853735582996e-06,
"loss": 0.4391,
"mean_token_accuracy": 0.03371476338725188,
"step": 241
},
{
"epoch": 0.8593940739096659,
"grad_norm": 0.11643965542316437,
"learning_rate": 8.95429366969858e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.03266353774779418,
"step": 242
},
{
"epoch": 0.8629452890911109,
"grad_norm": 0.1157926544547081,
"learning_rate": 8.94570254698197e-06,
"loss": 0.4163,
"mean_token_accuracy": 0.03458352739835391,
"step": 243
},
{
"epoch": 0.8664965042725558,
"grad_norm": 0.13001175224781036,
"learning_rate": 8.93708043490743e-06,
"loss": 0.4514,
"mean_token_accuracy": 0.033025608005118556,
"step": 244
},
{
"epoch": 0.8700477194540006,
"grad_norm": 0.10545831173658371,
"learning_rate": 8.928427401192618e-06,
"loss": 0.4254,
"mean_token_accuracy": 0.033079178792831954,
"step": 245
},
{
"epoch": 0.8735989346354456,
"grad_norm": 0.13507139682769775,
"learning_rate": 8.919743513798044e-06,
"loss": 0.4596,
"mean_token_accuracy": 0.0301192174811149,
"step": 246
},
{
"epoch": 0.8771501498168904,
"grad_norm": 0.14613857865333557,
"learning_rate": 8.911028840926537e-06,
"loss": 0.4366,
"mean_token_accuracy": 0.032090271219203714,
"step": 247
},
{
"epoch": 0.8807013649983354,
"grad_norm": 0.16336016356945038,
"learning_rate": 8.902283451022725e-06,
"loss": 0.4568,
"mean_token_accuracy": 0.02893733787277597,
"step": 248
},
{
"epoch": 0.8842525801797803,
"grad_norm": 0.1086035892367363,
"learning_rate": 8.89350741277247e-06,
"loss": 0.4177,
"mean_token_accuracy": 0.031324194565968355,
"step": 249
},
{
"epoch": 0.8878037953612252,
"grad_norm": 0.11843832582235336,
"learning_rate": 8.884700795102365e-06,
"loss": 0.4408,
"mean_token_accuracy": 0.030065572103922023,
"step": 250
},
{
"epoch": 0.8913550105426701,
"grad_norm": 0.11013603955507278,
"learning_rate": 8.875863667179155e-06,
"loss": 0.4411,
"mean_token_accuracy": 0.03253144204427372,
"step": 251
},
{
"epoch": 0.8949062257241149,
"grad_norm": 0.10877636820077896,
"learning_rate": 8.866996098409217e-06,
"loss": 0.4445,
"mean_token_accuracy": 0.030102188491582638,
"step": 252
},
{
"epoch": 0.8984574409055599,
"grad_norm": 0.12473298609256744,
"learning_rate": 8.858098158438013e-06,
"loss": 0.4278,
"mean_token_accuracy": 0.036179308270220645,
"step": 253
},
{
"epoch": 0.9020086560870048,
"grad_norm": 0.13368239998817444,
"learning_rate": 8.849169917149532e-06,
"loss": 0.4418,
"mean_token_accuracy": 0.0318060622739722,
"step": 254
},
{
"epoch": 0.9055598712684497,
"grad_norm": 0.13873310387134552,
"learning_rate": 8.840211444665754e-06,
"loss": 0.4442,
"mean_token_accuracy": 0.031459740581340156,
"step": 255
},
{
"epoch": 0.9091110864498946,
"grad_norm": 0.11932561546564102,
"learning_rate": 8.831222811346088e-06,
"loss": 0.4303,
"mean_token_accuracy": 0.034126964517781744,
"step": 256
},
{
"epoch": 0.9126623016313394,
"grad_norm": 0.10621127486228943,
"learning_rate": 8.822204087786831e-06,
"loss": 0.4131,
"mean_token_accuracy": 0.028078828796424204,
"step": 257
},
{
"epoch": 0.9162135168127844,
"grad_norm": 0.10923238098621368,
"learning_rate": 8.813155344820602e-06,
"loss": 0.4328,
"mean_token_accuracy": 0.03459552870117477,
"step": 258
},
{
"epoch": 0.9197647319942293,
"grad_norm": 0.1389293372631073,
"learning_rate": 8.804076653515792e-06,
"loss": 0.4502,
"mean_token_accuracy": 0.030979796672909288,
"step": 259
},
{
"epoch": 0.9233159471756742,
"grad_norm": 0.12477507442235947,
"learning_rate": 8.794968085176006e-06,
"loss": 0.4523,
"mean_token_accuracy": 0.030272630028775893,
"step": 260
},
{
"epoch": 0.9268671623571191,
"grad_norm": 0.10108153522014618,
"learning_rate": 8.785829711339502e-06,
"loss": 0.4305,
"mean_token_accuracy": 0.032170976937777596,
"step": 261
},
{
"epoch": 0.9304183775385639,
"grad_norm": 0.12623409926891327,
"learning_rate": 8.776661603778629e-06,
"loss": 0.4341,
"mean_token_accuracy": 0.03348069915591623,
"step": 262
},
{
"epoch": 0.9339695927200089,
"grad_norm": 0.1211409643292427,
"learning_rate": 8.767463834499261e-06,
"loss": 0.429,
"mean_token_accuracy": 0.03233881213964196,
"step": 263
},
{
"epoch": 0.9375208079014538,
"grad_norm": 0.12843799591064453,
"learning_rate": 8.758236475740236e-06,
"loss": 0.4286,
"mean_token_accuracy": 0.030366436680196784,
"step": 264
},
{
"epoch": 0.9410720230828987,
"grad_norm": 0.11599805951118469,
"learning_rate": 8.748979599972787e-06,
"loss": 0.4165,
"mean_token_accuracy": 0.030518364896124694,
"step": 265
},
{
"epoch": 0.9446232382643436,
"grad_norm": 0.11995775997638702,
"learning_rate": 8.739693279899969e-06,
"loss": 0.4613,
"mean_token_accuracy": 0.036479957580013433,
"step": 266
},
{
"epoch": 0.9481744534457884,
"grad_norm": 0.11425738781690598,
"learning_rate": 8.730377588456092e-06,
"loss": 0.4501,
"mean_token_accuracy": 0.03359218502737349,
"step": 267
},
{
"epoch": 0.9517256686272334,
"grad_norm": 0.12363097071647644,
"learning_rate": 8.72103259880615e-06,
"loss": 0.383,
"mean_token_accuracy": 0.03248222741785867,
"step": 268
},
{
"epoch": 0.9552768838086783,
"grad_norm": 0.14224040508270264,
"learning_rate": 8.711658384345244e-06,
"loss": 0.4453,
"mean_token_accuracy": 0.02758350678777788,
"step": 269
},
{
"epoch": 0.9588280989901232,
"grad_norm": 0.11947707086801529,
"learning_rate": 8.702255018698e-06,
"loss": 0.4427,
"mean_token_accuracy": 0.030479307537461864,
"step": 270
},
{
"epoch": 0.9623793141715681,
"grad_norm": 0.11489012092351913,
"learning_rate": 8.692822575718e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.032157614528841805,
"step": 271
},
{
"epoch": 0.9659305293530129,
"grad_norm": 0.13005125522613525,
"learning_rate": 8.683361129487198e-06,
"loss": 0.4471,
"mean_token_accuracy": 0.0325647444005881,
"step": 272
},
{
"epoch": 0.9694817445344579,
"grad_norm": 0.1157633364200592,
"learning_rate": 8.673870754315336e-06,
"loss": 0.4213,
"mean_token_accuracy": 0.03665181735595979,
"step": 273
},
{
"epoch": 0.9730329597159028,
"grad_norm": 0.10300834476947784,
"learning_rate": 8.664351524739368e-06,
"loss": 0.4311,
"mean_token_accuracy": 0.030481873731332598,
"step": 274
},
{
"epoch": 0.9765841748973477,
"grad_norm": 0.10374599695205688,
"learning_rate": 8.65480351552286e-06,
"loss": 0.3858,
"mean_token_accuracy": 0.029714216772845248,
"step": 275
},
{
"epoch": 0.9801353900787926,
"grad_norm": 0.10875561088323593,
"learning_rate": 8.645226801655418e-06,
"loss": 0.4303,
"mean_token_accuracy": 0.03389121513828286,
"step": 276
},
{
"epoch": 0.9836866052602374,
"grad_norm": 0.10975569486618042,
"learning_rate": 8.635621458352094e-06,
"loss": 0.4157,
"mean_token_accuracy": 0.030311406939290464,
"step": 277
},
{
"epoch": 0.9872378204416824,
"grad_norm": 0.11089649796485901,
"learning_rate": 8.625987561052789e-06,
"loss": 0.4503,
"mean_token_accuracy": 0.03370039981746231,
"step": 278
},
{
"epoch": 0.9907890356231273,
"grad_norm": 0.13259300589561462,
"learning_rate": 8.616325185421673e-06,
"loss": 0.4693,
"mean_token_accuracy": 0.029472638332663337,
"step": 279
},
{
"epoch": 0.9943402508045722,
"grad_norm": 0.1153963953256607,
"learning_rate": 8.606634407346575e-06,
"loss": 0.4639,
"mean_token_accuracy": 0.033020730788848596,
"step": 280
},
{
"epoch": 0.9978914659860171,
"grad_norm": 0.10748148709535599,
"learning_rate": 8.596915302938403e-06,
"loss": 0.4285,
"mean_token_accuracy": 0.030481149649858708,
"step": 281
},
{
"epoch": 1.0,
"grad_norm": 0.06762544065713882,
"learning_rate": 8.587167948530533e-06,
"loss": 0.2615,
"mean_token_accuracy": 0.028144311064878774,
"step": 282
},
{
"epoch": 1.003551215181445,
"grad_norm": 0.12099552899599075,
"learning_rate": 8.577392420678217e-06,
"loss": 0.4319,
"mean_token_accuracy": 0.03571089356773882,
"step": 283
},
{
"epoch": 1.0071024303628897,
"grad_norm": 0.11393958330154419,
"learning_rate": 8.567588796157983e-06,
"loss": 0.4351,
"mean_token_accuracy": 0.03345001182970009,
"step": 284
},
{
"epoch": 1.0106536455443347,
"grad_norm": 0.12373528629541397,
"learning_rate": 8.557757151967025e-06,
"loss": 0.428,
"mean_token_accuracy": 0.030941195098421304,
"step": 285
},
{
"epoch": 1.0142048607257796,
"grad_norm": 0.10352396219968796,
"learning_rate": 8.547897565322601e-06,
"loss": 0.4078,
"mean_token_accuracy": 0.03218995450879447,
"step": 286
},
{
"epoch": 1.0177560759072246,
"grad_norm": 0.11134106665849686,
"learning_rate": 8.538010113661434e-06,
"loss": 0.4118,
"mean_token_accuracy": 0.033460683858720586,
"step": 287
},
{
"epoch": 1.0213072910886694,
"grad_norm": 0.11754720658063889,
"learning_rate": 8.528094874639092e-06,
"loss": 0.4467,
"mean_token_accuracy": 0.032714871398638934,
"step": 288
},
{
"epoch": 1.0248585062701143,
"grad_norm": 0.12311739474534988,
"learning_rate": 8.518151926129384e-06,
"loss": 0.4248,
"mean_token_accuracy": 0.03665415535760985,
"step": 289
},
{
"epoch": 1.0284097214515593,
"grad_norm": 0.11815127730369568,
"learning_rate": 8.508181346223749e-06,
"loss": 0.4683,
"mean_token_accuracy": 0.03141742098341638,
"step": 290
},
{
"epoch": 1.031960936633004,
"grad_norm": 0.13334578275680542,
"learning_rate": 8.498183213230646e-06,
"loss": 0.4357,
"mean_token_accuracy": 0.034682282523135655,
"step": 291
},
{
"epoch": 1.035512151814449,
"grad_norm": 0.11024576425552368,
"learning_rate": 8.488157605674924e-06,
"loss": 0.4039,
"mean_token_accuracy": 0.03140191955390037,
"step": 292
},
{
"epoch": 1.039063366995894,
"grad_norm": 0.11426277458667755,
"learning_rate": 8.478104602297226e-06,
"loss": 0.4646,
"mean_token_accuracy": 0.0302239565171476,
"step": 293
},
{
"epoch": 1.0426145821773387,
"grad_norm": 0.13004811108112335,
"learning_rate": 8.468024282053357e-06,
"loss": 0.4405,
"mean_token_accuracy": 0.030375482703675516,
"step": 294
},
{
"epoch": 1.0461657973587837,
"grad_norm": 0.11746154725551605,
"learning_rate": 8.457916724113667e-06,
"loss": 0.4623,
"mean_token_accuracy": 0.03180732572218403,
"step": 295
},
{
"epoch": 1.0497170125402286,
"grad_norm": 0.1322462111711502,
"learning_rate": 8.447782007862427e-06,
"loss": 0.4509,
"mean_token_accuracy": 0.03127452934131725,
"step": 296
},
{
"epoch": 1.0532682277216736,
"grad_norm": 0.11387277394533157,
"learning_rate": 8.437620212897213e-06,
"loss": 0.4601,
"mean_token_accuracy": 0.03242337591655087,
"step": 297
},
{
"epoch": 1.0568194429031184,
"grad_norm": 0.1132403239607811,
"learning_rate": 8.427431419028273e-06,
"loss": 0.4225,
"mean_token_accuracy": 0.03050946029179613,
"step": 298
},
{
"epoch": 1.0603706580845633,
"grad_norm": 0.11399143934249878,
"learning_rate": 8.417215706277905e-06,
"loss": 0.4096,
"mean_token_accuracy": 0.030746224825634272,
"step": 299
},
{
"epoch": 1.0639218732660083,
"grad_norm": 0.1122201532125473,
"learning_rate": 8.406973154879826e-06,
"loss": 0.4719,
"mean_token_accuracy": 0.030868174064380582,
"step": 300
},
{
"epoch": 1.067473088447453,
"grad_norm": 0.11641374230384827,
"learning_rate": 8.396703845278537e-06,
"loss": 0.4785,
"mean_token_accuracy": 0.03244481591536896,
"step": 301
},
{
"epoch": 1.071024303628898,
"grad_norm": 0.11136434972286224,
"learning_rate": 8.386407858128707e-06,
"loss": 0.414,
"mean_token_accuracy": 0.03586079240631079,
"step": 302
},
{
"epoch": 1.074575518810343,
"grad_norm": 0.0993858203291893,
"learning_rate": 8.376085274294518e-06,
"loss": 0.4497,
"mean_token_accuracy": 0.031207030300720362,
"step": 303
},
{
"epoch": 1.0781267339917877,
"grad_norm": 0.12647627294063568,
"learning_rate": 8.365736174849053e-06,
"loss": 0.4451,
"mean_token_accuracy": 0.032390626991400495,
"step": 304
},
{
"epoch": 1.0816779491732327,
"grad_norm": 0.10189671069383621,
"learning_rate": 8.355360641073637e-06,
"loss": 0.4146,
"mean_token_accuracy": 0.03132528428795922,
"step": 305
},
{
"epoch": 1.0852291643546776,
"grad_norm": 0.13301630318164825,
"learning_rate": 8.344958754457214e-06,
"loss": 0.4729,
"mean_token_accuracy": 0.030442484636296285,
"step": 306
},
{
"epoch": 1.0887803795361226,
"grad_norm": 0.10021793097257614,
"learning_rate": 8.3345305966957e-06,
"loss": 0.4336,
"mean_token_accuracy": 0.029210989938292187,
"step": 307
},
{
"epoch": 1.0923315947175674,
"grad_norm": 0.1116681918501854,
"learning_rate": 8.324076249691347e-06,
"loss": 0.4579,
"mean_token_accuracy": 0.03265956621180521,
"step": 308
},
{
"epoch": 1.0958828098990123,
"grad_norm": 0.12556776404380798,
"learning_rate": 8.31359579555209e-06,
"loss": 0.4345,
"mean_token_accuracy": 0.03397847878295579,
"step": 309
},
{
"epoch": 1.0994340250804573,
"grad_norm": 0.12205666303634644,
"learning_rate": 8.30308931659091e-06,
"loss": 0.4724,
"mean_token_accuracy": 0.031116866441152524,
"step": 310
},
{
"epoch": 1.102985240261902,
"grad_norm": 0.11637762933969498,
"learning_rate": 8.292556895325195e-06,
"loss": 0.4209,
"mean_token_accuracy": 0.031364490951091284,
"step": 311
},
{
"epoch": 1.106536455443347,
"grad_norm": 0.10156012326478958,
"learning_rate": 8.281998614476066e-06,
"loss": 0.4303,
"mean_token_accuracy": 0.03230309693390154,
"step": 312
},
{
"epoch": 1.110087670624792,
"grad_norm": 0.12292397022247314,
"learning_rate": 8.271414556967758e-06,
"loss": 0.4684,
"mean_token_accuracy": 0.030929933149309363,
"step": 313
},
{
"epoch": 1.1136388858062367,
"grad_norm": 0.12097756564617157,
"learning_rate": 8.260804805926948e-06,
"loss": 0.4372,
"mean_token_accuracy": 0.033056139553082176,
"step": 314
},
{
"epoch": 1.1171901009876817,
"grad_norm": 0.12706424295902252,
"learning_rate": 8.250169444682109e-06,
"loss": 0.4453,
"mean_token_accuracy": 0.028371741809678497,
"step": 315
},
{
"epoch": 1.1207413161691266,
"grad_norm": 0.11325722187757492,
"learning_rate": 8.239508556762857e-06,
"loss": 0.4328,
"mean_token_accuracy": 0.02932896776837879,
"step": 316
},
{
"epoch": 1.1242925313505716,
"grad_norm": 0.11459724605083466,
"learning_rate": 8.228822225899294e-06,
"loss": 0.4533,
"mean_token_accuracy": 0.03276748675489216,
"step": 317
},
{
"epoch": 1.1278437465320164,
"grad_norm": 0.11599034070968628,
"learning_rate": 8.218110536021347e-06,
"loss": 0.4169,
"mean_token_accuracy": 0.032454707339638844,
"step": 318
},
{
"epoch": 1.1313949617134613,
"grad_norm": 0.11511659622192383,
"learning_rate": 8.207373571258113e-06,
"loss": 0.4274,
"mean_token_accuracy": 0.031180168389255414,
"step": 319
},
{
"epoch": 1.1349461768949063,
"grad_norm": 0.1145598515868187,
"learning_rate": 8.196611415937196e-06,
"loss": 0.4306,
"mean_token_accuracy": 0.032077380437840475,
"step": 320
},
{
"epoch": 1.138497392076351,
"grad_norm": 0.12687522172927856,
"learning_rate": 8.18582415458405e-06,
"loss": 0.4119,
"mean_token_accuracy": 0.03120192799178767,
"step": 321
},
{
"epoch": 1.142048607257796,
"grad_norm": 0.11041875183582306,
"learning_rate": 8.1750118719213e-06,
"loss": 0.4385,
"mean_token_accuracy": 0.03349298075772822,
"step": 322
},
{
"epoch": 1.145599822439241,
"grad_norm": 0.099665068089962,
"learning_rate": 8.164174652868097e-06,
"loss": 0.4246,
"mean_token_accuracy": 0.03043448956668726,
"step": 323
},
{
"epoch": 1.149151037620686,
"grad_norm": 0.11485815793275833,
"learning_rate": 8.153312582539438e-06,
"loss": 0.4182,
"mean_token_accuracy": 0.030599489495216403,
"step": 324
},
{
"epoch": 1.1527022528021307,
"grad_norm": 0.13416896760463715,
"learning_rate": 8.142425746245503e-06,
"loss": 0.4498,
"mean_token_accuracy": 0.029845738639778574,
"step": 325
},
{
"epoch": 1.1562534679835756,
"grad_norm": 0.13239556550979614,
"learning_rate": 8.131514229490975e-06,
"loss": 0.507,
"mean_token_accuracy": 0.031054503782797838,
"step": 326
},
{
"epoch": 1.1598046831650206,
"grad_norm": 0.12665045261383057,
"learning_rate": 8.120578117974388e-06,
"loss": 0.4287,
"mean_token_accuracy": 0.03402871212529135,
"step": 327
},
{
"epoch": 1.1633558983464654,
"grad_norm": 0.14665213227272034,
"learning_rate": 8.109617497587429e-06,
"loss": 0.4638,
"mean_token_accuracy": 0.029450454625475686,
"step": 328
},
{
"epoch": 1.1669071135279103,
"grad_norm": 0.1434643715620041,
"learning_rate": 8.098632454414286e-06,
"loss": 0.4413,
"mean_token_accuracy": 0.03347731575922808,
"step": 329
},
{
"epoch": 1.1704583287093553,
"grad_norm": 0.09570540487766266,
"learning_rate": 8.08762307473096e-06,
"loss": 0.4027,
"mean_token_accuracy": 0.028911761146446224,
"step": 330
},
{
"epoch": 1.1740095438908003,
"grad_norm": 0.0962023138999939,
"learning_rate": 8.07658944500459e-06,
"loss": 0.427,
"mean_token_accuracy": 0.03492621044642874,
"step": 331
},
{
"epoch": 1.177560759072245,
"grad_norm": 0.09288761019706726,
"learning_rate": 8.065531651892771e-06,
"loss": 0.4205,
"mean_token_accuracy": 0.03696197941462742,
"step": 332
},
{
"epoch": 1.18111197425369,
"grad_norm": 0.11237376183271408,
"learning_rate": 8.054449782242876e-06,
"loss": 0.4474,
"mean_token_accuracy": 0.029908099084423156,
"step": 333
},
{
"epoch": 1.1846631894351347,
"grad_norm": 0.10639701038599014,
"learning_rate": 8.043343923091382e-06,
"loss": 0.4375,
"mean_token_accuracy": 0.03525051188262296,
"step": 334
},
{
"epoch": 1.1882144046165797,
"grad_norm": 0.10115876793861389,
"learning_rate": 8.03221416166317e-06,
"loss": 0.4197,
"mean_token_accuracy": 0.035662662419781554,
"step": 335
},
{
"epoch": 1.1917656197980246,
"grad_norm": 0.14355993270874023,
"learning_rate": 8.021060585370845e-06,
"loss": 0.4546,
"mean_token_accuracy": 0.03178581687097903,
"step": 336
},
{
"epoch": 1.1953168349794696,
"grad_norm": 0.10448651015758514,
"learning_rate": 8.009883281814066e-06,
"loss": 0.3823,
"mean_token_accuracy": 0.030594974505220307,
"step": 337
},
{
"epoch": 1.1988680501609144,
"grad_norm": 0.14605805277824402,
"learning_rate": 7.998682338778834e-06,
"loss": 0.4644,
"mean_token_accuracy": 0.032786186962766806,
"step": 338
},
{
"epoch": 1.2024192653423593,
"grad_norm": 0.11398659646511078,
"learning_rate": 7.987457844236817e-06,
"loss": 0.4479,
"mean_token_accuracy": 0.030445763823081506,
"step": 339
},
{
"epoch": 1.2059704805238043,
"grad_norm": 0.10509105026721954,
"learning_rate": 7.976209886344654e-06,
"loss": 0.4302,
"mean_token_accuracy": 0.031263302011211636,
"step": 340
},
{
"epoch": 1.209521695705249,
"grad_norm": 0.12267457693815231,
"learning_rate": 7.964938553443267e-06,
"loss": 0.43,
"mean_token_accuracy": 0.03181008769752225,
"step": 341
},
{
"epoch": 1.213072910886694,
"grad_norm": 0.13068453967571259,
"learning_rate": 7.953643934057162e-06,
"loss": 0.4138,
"mean_token_accuracy": 0.02875417193354224,
"step": 342
},
{
"epoch": 1.216624126068139,
"grad_norm": 0.1138904020190239,
"learning_rate": 7.942326116893733e-06,
"loss": 0.4309,
"mean_token_accuracy": 0.035087752894469304,
"step": 343
},
{
"epoch": 1.220175341249584,
"grad_norm": 0.11926258355379105,
"learning_rate": 7.930985190842576e-06,
"loss": 0.4252,
"mean_token_accuracy": 0.034354623672697926,
"step": 344
},
{
"epoch": 1.2237265564310287,
"grad_norm": 0.11906962096691132,
"learning_rate": 7.919621244974773e-06,
"loss": 0.4375,
"mean_token_accuracy": 0.03322149996893131,
"step": 345
},
{
"epoch": 1.2272777716124736,
"grad_norm": 0.1229124665260315,
"learning_rate": 7.908234368542214e-06,
"loss": 0.4282,
"mean_token_accuracy": 0.027521870553755434,
"step": 346
},
{
"epoch": 1.2308289867939186,
"grad_norm": 0.12528809905052185,
"learning_rate": 7.896824650976873e-06,
"loss": 0.4434,
"mean_token_accuracy": 0.033167709574627224,
"step": 347
},
{
"epoch": 1.2343802019753634,
"grad_norm": 0.13237693905830383,
"learning_rate": 7.885392181890126e-06,
"loss": 0.4345,
"mean_token_accuracy": 0.03176790558427456,
"step": 348
},
{
"epoch": 1.2379314171568083,
"grad_norm": 0.11144175380468369,
"learning_rate": 7.873937051072037e-06,
"loss": 0.4237,
"mean_token_accuracy": 0.035554787718865555,
"step": 349
},
{
"epoch": 1.2414826323382533,
"grad_norm": 0.11270050704479218,
"learning_rate": 7.862459348490645e-06,
"loss": 0.4276,
"mean_token_accuracy": 0.03402164916769834,
"step": 350
},
{
"epoch": 1.2450338475196983,
"grad_norm": 0.11199501156806946,
"learning_rate": 7.85095916429128e-06,
"loss": 0.422,
"mean_token_accuracy": 0.029369741489063017,
"step": 351
},
{
"epoch": 1.248585062701143,
"grad_norm": 0.11493431031703949,
"learning_rate": 7.839436588795834e-06,
"loss": 0.4439,
"mean_token_accuracy": 0.030408731843635906,
"step": 352
},
{
"epoch": 1.252136277882588,
"grad_norm": 0.12098958343267441,
"learning_rate": 7.82789171250206e-06,
"loss": 0.4624,
"mean_token_accuracy": 0.03373954394919565,
"step": 353
},
{
"epoch": 1.2556874930640327,
"grad_norm": 0.11300572007894516,
"learning_rate": 7.816324626082864e-06,
"loss": 0.4189,
"mean_token_accuracy": 0.030854827327857492,
"step": 354
},
{
"epoch": 1.2592387082454777,
"grad_norm": 0.11174172908067703,
"learning_rate": 7.804735420385578e-06,
"loss": 0.4238,
"mean_token_accuracy": 0.036562988705554744,
"step": 355
},
{
"epoch": 1.2627899234269226,
"grad_norm": 0.12956643104553223,
"learning_rate": 7.793124186431271e-06,
"loss": 0.4461,
"mean_token_accuracy": 0.0327669634934864,
"step": 356
},
{
"epoch": 1.2663411386083676,
"grad_norm": 0.1083948016166687,
"learning_rate": 7.781491015414018e-06,
"loss": 0.4331,
"mean_token_accuracy": 0.03526076916750753,
"step": 357
},
{
"epoch": 1.2698923537898126,
"grad_norm": 0.1142224371433258,
"learning_rate": 7.769835998700182e-06,
"loss": 0.4384,
"mean_token_accuracy": 0.033927020744158654,
"step": 358
},
{
"epoch": 1.2734435689712573,
"grad_norm": 0.11039263755083084,
"learning_rate": 7.758159227827701e-06,
"loss": 0.4415,
"mean_token_accuracy": 0.03664633895459701,
"step": 359
},
{
"epoch": 1.2769947841527023,
"grad_norm": 0.1170225441455841,
"learning_rate": 7.746460794505375e-06,
"loss": 0.4572,
"mean_token_accuracy": 0.028795534330129158,
"step": 360
},
{
"epoch": 1.280545999334147,
"grad_norm": 0.10929251462221146,
"learning_rate": 7.734740790612137e-06,
"loss": 0.4153,
"mean_token_accuracy": 0.03168763507710537,
"step": 361
},
{
"epoch": 1.284097214515592,
"grad_norm": 0.12044554948806763,
"learning_rate": 7.722999308196329e-06,
"loss": 0.4328,
"mean_token_accuracy": 0.02993700837396318,
"step": 362
},
{
"epoch": 1.287648429697037,
"grad_norm": 0.11714787036180496,
"learning_rate": 7.711236439474991e-06,
"loss": 0.437,
"mean_token_accuracy": 0.033871306681248825,
"step": 363
},
{
"epoch": 1.291199644878482,
"grad_norm": 0.10608412325382233,
"learning_rate": 7.69945227683313e-06,
"loss": 0.4126,
"mean_token_accuracy": 0.0351106549569522,
"step": 364
},
{
"epoch": 1.2947508600599267,
"grad_norm": 0.12553463876247406,
"learning_rate": 7.68764691282299e-06,
"loss": 0.428,
"mean_token_accuracy": 0.03240462405665312,
"step": 365
},
{
"epoch": 1.2983020752413716,
"grad_norm": 0.11900968104600906,
"learning_rate": 7.675820440163334e-06,
"loss": 0.4462,
"mean_token_accuracy": 0.03121896790980827,
"step": 366
},
{
"epoch": 1.3018532904228166,
"grad_norm": 0.13135908544063568,
"learning_rate": 7.663972951738708e-06,
"loss": 0.4563,
"mean_token_accuracy": 0.0299510243057739,
"step": 367
},
{
"epoch": 1.3054045056042614,
"grad_norm": 0.11855065077543259,
"learning_rate": 7.652104540598712e-06,
"loss": 0.4508,
"mean_token_accuracy": 0.03168911819375353,
"step": 368
},
{
"epoch": 1.3089557207857063,
"grad_norm": 0.11534618586301804,
"learning_rate": 7.640215299957283e-06,
"loss": 0.4274,
"mean_token_accuracy": 0.028188226955535356,
"step": 369
},
{
"epoch": 1.3125069359671513,
"grad_norm": 0.14458145201206207,
"learning_rate": 7.628305323191942e-06,
"loss": 0.4678,
"mean_token_accuracy": 0.030351929475727957,
"step": 370
},
{
"epoch": 1.3160581511485963,
"grad_norm": 0.10904058068990707,
"learning_rate": 7.616374703843071e-06,
"loss": 0.4343,
"mean_token_accuracy": 0.027722057624487206,
"step": 371
},
{
"epoch": 1.319609366330041,
"grad_norm": 0.12762950360774994,
"learning_rate": 7.604423535613183e-06,
"loss": 0.4492,
"mean_token_accuracy": 0.030748855464480584,
"step": 372
},
{
"epoch": 1.323160581511486,
"grad_norm": 0.11561151593923569,
"learning_rate": 7.592451912366176e-06,
"loss": 0.4323,
"mean_token_accuracy": 0.03335273687844165,
"step": 373
},
{
"epoch": 1.326711796692931,
"grad_norm": 0.1088724359869957,
"learning_rate": 7.580459928126607e-06,
"loss": 0.4449,
"mean_token_accuracy": 0.02959715071847313,
"step": 374
},
{
"epoch": 1.3302630118743757,
"grad_norm": 0.1027379035949707,
"learning_rate": 7.568447677078937e-06,
"loss": 0.4363,
"mean_token_accuracy": 0.03155246840651671,
"step": 375
},
{
"epoch": 1.3338142270558206,
"grad_norm": 0.12113990634679794,
"learning_rate": 7.556415253566814e-06,
"loss": 0.4348,
"mean_token_accuracy": 0.033069032098865137,
"step": 376
},
{
"epoch": 1.3373654422372656,
"grad_norm": 0.1241699829697609,
"learning_rate": 7.544362752092309e-06,
"loss": 0.451,
"mean_token_accuracy": 0.03430816161016992,
"step": 377
},
{
"epoch": 1.3409166574187106,
"grad_norm": 0.11146720498800278,
"learning_rate": 7.532290267315189e-06,
"loss": 0.4543,
"mean_token_accuracy": 0.034234997216117335,
"step": 378
},
{
"epoch": 1.3444678726001553,
"grad_norm": 0.12195354700088501,
"learning_rate": 7.52019789405217e-06,
"loss": 0.443,
"mean_token_accuracy": 0.030580678501792136,
"step": 379
},
{
"epoch": 1.3480190877816003,
"grad_norm": 0.1010030210018158,
"learning_rate": 7.508085727276169e-06,
"loss": 0.4056,
"mean_token_accuracy": 0.029197495718108257,
"step": 380
},
{
"epoch": 1.351570302963045,
"grad_norm": 0.11852242797613144,
"learning_rate": 7.495953862115561e-06,
"loss": 0.4604,
"mean_token_accuracy": 0.028626239189179614,
"step": 381
},
{
"epoch": 1.35512151814449,
"grad_norm": 0.15514077246189117,
"learning_rate": 7.483802393853431e-06,
"loss": 0.4638,
"mean_token_accuracy": 0.028513169188954635,
"step": 382
},
{
"epoch": 1.358672733325935,
"grad_norm": 0.1145181730389595,
"learning_rate": 7.471631417926826e-06,
"loss": 0.4245,
"mean_token_accuracy": 0.028723615967464866,
"step": 383
},
{
"epoch": 1.36222394850738,
"grad_norm": 0.10029944777488708,
"learning_rate": 7.459441029926006e-06,
"loss": 0.4169,
"mean_token_accuracy": 0.03500634835290839,
"step": 384
},
{
"epoch": 1.365775163688825,
"grad_norm": 0.14268019795417786,
"learning_rate": 7.447231325593689e-06,
"loss": 0.4738,
"mean_token_accuracy": 0.03180533792328788,
"step": 385
},
{
"epoch": 1.3693263788702696,
"grad_norm": 0.09910490363836288,
"learning_rate": 7.435002400824309e-06,
"loss": 0.4006,
"mean_token_accuracy": 0.035282006590932724,
"step": 386
},
{
"epoch": 1.3728775940517146,
"grad_norm": 0.10883248597383499,
"learning_rate": 7.422754351663252e-06,
"loss": 0.4423,
"mean_token_accuracy": 0.03188303730712505,
"step": 387
},
{
"epoch": 1.3764288092331594,
"grad_norm": 0.11308008432388306,
"learning_rate": 7.410487274306104e-06,
"loss": 0.4171,
"mean_token_accuracy": 0.03165900051317294,
"step": 388
},
{
"epoch": 1.3799800244146043,
"grad_norm": 0.12280994653701782,
"learning_rate": 7.398201265097902e-06,
"loss": 0.4296,
"mean_token_accuracy": 0.03297045080034877,
"step": 389
},
{
"epoch": 1.3835312395960493,
"grad_norm": 0.11657087504863739,
"learning_rate": 7.385896420532372e-06,
"loss": 0.4095,
"mean_token_accuracy": 0.031911868369206786,
"step": 390
},
{
"epoch": 1.3870824547774943,
"grad_norm": 0.1274227350950241,
"learning_rate": 7.37357283725117e-06,
"loss": 0.4876,
"mean_token_accuracy": 0.02943271119875135,
"step": 391
},
{
"epoch": 1.390633669958939,
"grad_norm": 0.11406645178794861,
"learning_rate": 7.361230612043125e-06,
"loss": 0.4178,
"mean_token_accuracy": 0.03322120701341191,
"step": 392
},
{
"epoch": 1.394184885140384,
"grad_norm": 0.12713231146335602,
"learning_rate": 7.3488698418434824e-06,
"loss": 0.4538,
"mean_token_accuracy": 0.02631053911318304,
"step": 393
},
{
"epoch": 1.397736100321829,
"grad_norm": 0.1251516342163086,
"learning_rate": 7.3364906237331345e-06,
"loss": 0.4197,
"mean_token_accuracy": 0.03226183277001837,
"step": 394
},
{
"epoch": 1.4012873155032737,
"grad_norm": 0.11285842210054398,
"learning_rate": 7.324093054937864e-06,
"loss": 0.4141,
"mean_token_accuracy": 0.03131841377398814,
"step": 395
},
{
"epoch": 1.4048385306847186,
"grad_norm": 0.11905115842819214,
"learning_rate": 7.311677232827583e-06,
"loss": 0.4419,
"mean_token_accuracy": 0.030178814733517356,
"step": 396
},
{
"epoch": 1.4083897458661636,
"grad_norm": 0.12142392992973328,
"learning_rate": 7.299243254915558e-06,
"loss": 0.4421,
"mean_token_accuracy": 0.03145620572104235,
"step": 397
},
{
"epoch": 1.4119409610476086,
"grad_norm": 0.12110509723424911,
"learning_rate": 7.286791218857654e-06,
"loss": 0.4323,
"mean_token_accuracy": 0.03314107269034139,
"step": 398
},
{
"epoch": 1.4154921762290533,
"grad_norm": 0.11982744932174683,
"learning_rate": 7.274321222451561e-06,
"loss": 0.4694,
"mean_token_accuracy": 0.03136878209625138,
"step": 399
},
{
"epoch": 1.4190433914104983,
"grad_norm": 0.11144396662712097,
"learning_rate": 7.261833363636036e-06,
"loss": 0.4529,
"mean_token_accuracy": 0.03282846643924131,
"step": 400
},
{
"epoch": 1.4225946065919433,
"grad_norm": 0.10743708163499832,
"learning_rate": 7.249327740490114e-06,
"loss": 0.4403,
"mean_token_accuracy": 0.03177920994858141,
"step": 401
},
{
"epoch": 1.426145821773388,
"grad_norm": 0.1281249076128006,
"learning_rate": 7.236804451232364e-06,
"loss": 0.415,
"mean_token_accuracy": 0.0316295579468715,
"step": 402
},
{
"epoch": 1.429697036954833,
"grad_norm": 0.11751551181077957,
"learning_rate": 7.224263594220093e-06,
"loss": 0.4372,
"mean_token_accuracy": 0.03051399137621047,
"step": 403
},
{
"epoch": 1.433248252136278,
"grad_norm": 0.12105882912874222,
"learning_rate": 7.211705267948592e-06,
"loss": 0.4326,
"mean_token_accuracy": 0.034232050165883265,
"step": 404
},
{
"epoch": 1.436799467317723,
"grad_norm": 0.10550647228956223,
"learning_rate": 7.199129571050345e-06,
"loss": 0.424,
"mean_token_accuracy": 0.031322834107413655,
"step": 405
},
{
"epoch": 1.4403506824991676,
"grad_norm": 0.12269502133131027,
"learning_rate": 7.186536602294278e-06,
"loss": 0.461,
"mean_token_accuracy": 0.03530319871424581,
"step": 406
},
{
"epoch": 1.4439018976806126,
"grad_norm": 0.11797191202640533,
"learning_rate": 7.173926460584956e-06,
"loss": 0.426,
"mean_token_accuracy": 0.027765252081735525,
"step": 407
},
{
"epoch": 1.4474531128620574,
"grad_norm": 0.1220933124423027,
"learning_rate": 7.161299244961828e-06,
"loss": 0.4229,
"mean_token_accuracy": 0.03250828143427498,
"step": 408
},
{
"epoch": 1.4510043280435023,
"grad_norm": 0.1393410712480545,
"learning_rate": 7.148655054598436e-06,
"loss": 0.4955,
"mean_token_accuracy": 0.031024973690364277,
"step": 409
},
{
"epoch": 1.4545555432249473,
"grad_norm": 0.12431557476520538,
"learning_rate": 7.135993988801644e-06,
"loss": 0.4545,
"mean_token_accuracy": 0.03278170326666441,
"step": 410
},
{
"epoch": 1.4581067584063923,
"grad_norm": 0.0940745621919632,
"learning_rate": 7.1233161470108525e-06,
"loss": 0.4252,
"mean_token_accuracy": 0.0334139017832058,
"step": 411
},
{
"epoch": 1.4616579735878372,
"grad_norm": 0.12231415510177612,
"learning_rate": 7.110621628797222e-06,
"loss": 0.4495,
"mean_token_accuracy": 0.03126630225597182,
"step": 412
},
{
"epoch": 1.465209188769282,
"grad_norm": 0.11169279366731644,
"learning_rate": 7.097910533862886e-06,
"loss": 0.4242,
"mean_token_accuracy": 0.033264531775785144,
"step": 413
},
{
"epoch": 1.468760403950727,
"grad_norm": 0.09450326859951019,
"learning_rate": 7.085182962040173e-06,
"loss": 0.4242,
"mean_token_accuracy": 0.03063771854431252,
"step": 414
},
{
"epoch": 1.4723116191321717,
"grad_norm": 0.1040828675031662,
"learning_rate": 7.072439013290824e-06,
"loss": 0.4445,
"mean_token_accuracy": 0.030200165703718085,
"step": 415
},
{
"epoch": 1.4758628343136166,
"grad_norm": 0.11325669288635254,
"learning_rate": 7.059678787705191e-06,
"loss": 0.4406,
"mean_token_accuracy": 0.0290958868645248,
"step": 416
},
{
"epoch": 1.4794140494950616,
"grad_norm": 0.11439729481935501,
"learning_rate": 7.046902385501477e-06,
"loss": 0.4131,
"mean_token_accuracy": 0.034142243890528334,
"step": 417
},
{
"epoch": 1.4829652646765066,
"grad_norm": 0.12844893336296082,
"learning_rate": 7.03410990702493e-06,
"loss": 0.4454,
"mean_token_accuracy": 0.0313700257538585,
"step": 418
},
{
"epoch": 1.4865164798579513,
"grad_norm": 0.10952496528625488,
"learning_rate": 7.02130145274706e-06,
"loss": 0.4467,
"mean_token_accuracy": 0.032556907943217084,
"step": 419
},
{
"epoch": 1.4900676950393963,
"grad_norm": 0.10907234251499176,
"learning_rate": 7.008477123264849e-06,
"loss": 0.4287,
"mean_token_accuracy": 0.0329911024782632,
"step": 420
},
{
"epoch": 1.4936189102208413,
"grad_norm": 0.1262599229812622,
"learning_rate": 6.995637019299963e-06,
"loss": 0.4458,
"mean_token_accuracy": 0.03044356228565448,
"step": 421
},
{
"epoch": 1.497170125402286,
"grad_norm": 0.14225925505161285,
"learning_rate": 6.982781241697963e-06,
"loss": 0.4441,
"mean_token_accuracy": 0.03241951846212032,
"step": 422
},
{
"epoch": 1.500721340583731,
"grad_norm": 0.1305040866136551,
"learning_rate": 6.969909891427509e-06,
"loss": 0.4799,
"mean_token_accuracy": 0.030976602058217395,
"step": 423
},
{
"epoch": 1.504272555765176,
"grad_norm": 0.14286646246910095,
"learning_rate": 6.957023069579561e-06,
"loss": 0.4688,
"mean_token_accuracy": 0.03533977083861828,
"step": 424
},
{
"epoch": 1.507823770946621,
"grad_norm": 0.10188660025596619,
"learning_rate": 6.944120877366605e-06,
"loss": 0.4046,
"mean_token_accuracy": 0.03206467899872223,
"step": 425
},
{
"epoch": 1.5113749861280656,
"grad_norm": 0.10866432636976242,
"learning_rate": 6.931203416121831e-06,
"loss": 0.433,
"mean_token_accuracy": 0.03298617543987348,
"step": 426
},
{
"epoch": 1.5149262013095106,
"grad_norm": 0.11402492970228195,
"learning_rate": 6.918270787298361e-06,
"loss": 0.4461,
"mean_token_accuracy": 0.02989783536759205,
"step": 427
},
{
"epoch": 1.5184774164909554,
"grad_norm": 0.10975202918052673,
"learning_rate": 6.90532309246844e-06,
"loss": 0.415,
"mean_token_accuracy": 0.03237479853851255,
"step": 428
},
{
"epoch": 1.5220286316724003,
"grad_norm": 0.14297430217266083,
"learning_rate": 6.89236043332264e-06,
"loss": 0.4613,
"mean_token_accuracy": 0.031666447979660006,
"step": 429
},
{
"epoch": 1.5255798468538453,
"grad_norm": 0.10512206703424454,
"learning_rate": 6.87938291166906e-06,
"loss": 0.4329,
"mean_token_accuracy": 0.032351893107261276,
"step": 430
},
{
"epoch": 1.5291310620352903,
"grad_norm": 0.134112149477005,
"learning_rate": 6.866390629432533e-06,
"loss": 0.4216,
"mean_token_accuracy": 0.03164473375727539,
"step": 431
},
{
"epoch": 1.5326822772167352,
"grad_norm": 0.0910695493221283,
"learning_rate": 6.8533836886538175e-06,
"loss": 0.4107,
"mean_token_accuracy": 0.031041957496199757,
"step": 432
},
{
"epoch": 1.53623349239818,
"grad_norm": 0.0995115339756012,
"learning_rate": 6.840362191488801e-06,
"loss": 0.3981,
"mean_token_accuracy": 0.03637770725617884,
"step": 433
},
{
"epoch": 1.539784707579625,
"grad_norm": 0.12063033878803253,
"learning_rate": 6.8273262402076935e-06,
"loss": 0.4365,
"mean_token_accuracy": 0.03462712019972969,
"step": 434
},
{
"epoch": 1.5433359227610697,
"grad_norm": 0.12645626068115234,
"learning_rate": 6.814275937194233e-06,
"loss": 0.4463,
"mean_token_accuracy": 0.02990383934957208,
"step": 435
},
{
"epoch": 1.5468871379425146,
"grad_norm": 0.11070489883422852,
"learning_rate": 6.801211384944867e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.0330234444263624,
"step": 436
},
{
"epoch": 1.5504383531239596,
"grad_norm": 0.10706287622451782,
"learning_rate": 6.788132686067963e-06,
"loss": 0.4184,
"mean_token_accuracy": 0.036259193846490234,
"step": 437
},
{
"epoch": 1.5539895683054046,
"grad_norm": 0.12582163512706757,
"learning_rate": 6.77503994328299e-06,
"loss": 0.4433,
"mean_token_accuracy": 0.030336310745042283,
"step": 438
},
{
"epoch": 1.5575407834868495,
"grad_norm": 0.09505017101764679,
"learning_rate": 6.761933259419725e-06,
"loss": 0.4364,
"mean_token_accuracy": 0.03261732070313883,
"step": 439
},
{
"epoch": 1.5610919986682943,
"grad_norm": 0.11827551573514938,
"learning_rate": 6.748812737417428e-06,
"loss": 0.4153,
"mean_token_accuracy": 0.03193877744342899,
"step": 440
},
{
"epoch": 1.5646432138497393,
"grad_norm": 0.11618656665086746,
"learning_rate": 6.7356784803240464e-06,
"loss": 0.4355,
"mean_token_accuracy": 0.02966096373529581,
"step": 441
},
{
"epoch": 1.568194429031184,
"grad_norm": 0.12943704426288605,
"learning_rate": 6.722530591295406e-06,
"loss": 0.4683,
"mean_token_accuracy": 0.030868882487993687,
"step": 442
},
{
"epoch": 1.571745644212629,
"grad_norm": 0.10661391913890839,
"learning_rate": 6.709369173594396e-06,
"loss": 0.4248,
"mean_token_accuracy": 0.03413955620635534,
"step": 443
},
{
"epoch": 1.575296859394074,
"grad_norm": 0.13364438712596893,
"learning_rate": 6.6961943305901515e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.03244770221499493,
"step": 444
},
{
"epoch": 1.578848074575519,
"grad_norm": 0.13524463772773743,
"learning_rate": 6.683006165757262e-06,
"loss": 0.4533,
"mean_token_accuracy": 0.033012805608450435,
"step": 445
},
{
"epoch": 1.5823992897569639,
"grad_norm": 0.10790959745645523,
"learning_rate": 6.669804782674937e-06,
"loss": 0.4432,
"mean_token_accuracy": 0.031002847008494427,
"step": 446
},
{
"epoch": 1.5859505049384086,
"grad_norm": 0.14740656316280365,
"learning_rate": 6.656590285026203e-06,
"loss": 0.4616,
"mean_token_accuracy": 0.03030816976024653,
"step": 447
},
{
"epoch": 1.5895017201198534,
"grad_norm": 0.09671668708324432,
"learning_rate": 6.643362776597089e-06,
"loss": 0.4238,
"mean_token_accuracy": 0.031174172054306837,
"step": 448
},
{
"epoch": 1.5930529353012983,
"grad_norm": 0.11425669491291046,
"learning_rate": 6.630122361275811e-06,
"loss": 0.4572,
"mean_token_accuracy": 0.033975018672208535,
"step": 449
},
{
"epoch": 1.5966041504827433,
"grad_norm": 0.09901127219200134,
"learning_rate": 6.6168691430519524e-06,
"loss": 0.4273,
"mean_token_accuracy": 0.031919095879857196,
"step": 450
},
{
"epoch": 1.6001553656641883,
"grad_norm": 0.13218143582344055,
"learning_rate": 6.6036032260156526e-06,
"loss": 0.4082,
"mean_token_accuracy": 0.03161406527215149,
"step": 451
},
{
"epoch": 1.6037065808456332,
"grad_norm": 0.11274532973766327,
"learning_rate": 6.590324714356784e-06,
"loss": 0.4487,
"mean_token_accuracy": 0.030842622476484394,
"step": 452
},
{
"epoch": 1.607257796027078,
"grad_norm": 0.13735929131507874,
"learning_rate": 6.5770337123641405e-06,
"loss": 0.4471,
"mean_token_accuracy": 0.03498955326358555,
"step": 453
},
{
"epoch": 1.610809011208523,
"grad_norm": 0.10870594531297684,
"learning_rate": 6.563730324424609e-06,
"loss": 0.4142,
"mean_token_accuracy": 0.03486323829929461,
"step": 454
},
{
"epoch": 1.6143602263899677,
"grad_norm": 0.12857501208782196,
"learning_rate": 6.55041465502236e-06,
"loss": 0.4341,
"mean_token_accuracy": 0.032055066079919925,
"step": 455
},
{
"epoch": 1.6179114415714126,
"grad_norm": 0.10850770026445389,
"learning_rate": 6.53708680873802e-06,
"loss": 0.4373,
"mean_token_accuracy": 0.032894781605136814,
"step": 456
},
{
"epoch": 1.6214626567528576,
"grad_norm": 0.130377396941185,
"learning_rate": 6.523746890247853e-06,
"loss": 0.4425,
"mean_token_accuracy": 0.03230476283351891,
"step": 457
},
{
"epoch": 1.6250138719343026,
"grad_norm": 0.11864569783210754,
"learning_rate": 6.510395004322937e-06,
"loss": 0.4256,
"mean_token_accuracy": 0.03536723868455738,
"step": 458
},
{
"epoch": 1.6285650871157475,
"grad_norm": 0.11540473997592926,
"learning_rate": 6.49703125582834e-06,
"loss": 0.4048,
"mean_token_accuracy": 0.03272077367910242,
"step": 459
},
{
"epoch": 1.6321163022971923,
"grad_norm": 0.1166347786784172,
"learning_rate": 6.4836557497222995e-06,
"loss": 0.427,
"mean_token_accuracy": 0.038396627987822285,
"step": 460
},
{
"epoch": 1.6356675174786373,
"grad_norm": 0.10623839497566223,
"learning_rate": 6.470268591055398e-06,
"loss": 0.4104,
"mean_token_accuracy": 0.035294696885102894,
"step": 461
},
{
"epoch": 1.639218732660082,
"grad_norm": 0.11819703876972198,
"learning_rate": 6.456869884969738e-06,
"loss": 0.4214,
"mean_token_accuracy": 0.02874127653922187,
"step": 462
},
{
"epoch": 1.642769947841527,
"grad_norm": 0.11898882687091827,
"learning_rate": 6.443459736698106e-06,
"loss": 0.4462,
"mean_token_accuracy": 0.030361584464117186,
"step": 463
},
{
"epoch": 1.646321163022972,
"grad_norm": 0.13163194060325623,
"learning_rate": 6.430038251563166e-06,
"loss": 0.4229,
"mean_token_accuracy": 0.03145527587912511,
"step": 464
},
{
"epoch": 1.649872378204417,
"grad_norm": 0.12886396050453186,
"learning_rate": 6.416605534976614e-06,
"loss": 0.444,
"mean_token_accuracy": 0.03168737835221691,
"step": 465
},
{
"epoch": 1.6534235933858619,
"grad_norm": 0.11800993233919144,
"learning_rate": 6.403161692438364e-06,
"loss": 0.4406,
"mean_token_accuracy": 0.029150941685657017,
"step": 466
},
{
"epoch": 1.6569748085673066,
"grad_norm": 0.15257884562015533,
"learning_rate": 6.3897068295357e-06,
"loss": 0.4646,
"mean_token_accuracy": 0.03103807869774755,
"step": 467
},
{
"epoch": 1.6605260237487516,
"grad_norm": 0.12270724773406982,
"learning_rate": 6.376241051942477e-06,
"loss": 0.4779,
"mean_token_accuracy": 0.03334709817136172,
"step": 468
},
{
"epoch": 1.6640772389301963,
"grad_norm": 0.11115626245737076,
"learning_rate": 6.362764465418258e-06,
"loss": 0.4228,
"mean_token_accuracy": 0.03353900604633964,
"step": 469
},
{
"epoch": 1.6676284541116413,
"grad_norm": 0.12104036659002304,
"learning_rate": 6.349277175807506e-06,
"loss": 0.4093,
"mean_token_accuracy": 0.03223917051946046,
"step": 470
},
{
"epoch": 1.6711796692930863,
"grad_norm": 0.10532081872224808,
"learning_rate": 6.3357792890387485e-06,
"loss": 0.4523,
"mean_token_accuracy": 0.031193107621220406,
"step": 471
},
{
"epoch": 1.6747308844745312,
"grad_norm": 0.107658751308918,
"learning_rate": 6.322270911123734e-06,
"loss": 0.4364,
"mean_token_accuracy": 0.02924270377479843,
"step": 472
},
{
"epoch": 1.6782820996559762,
"grad_norm": 0.14873118698596954,
"learning_rate": 6.308752148156614e-06,
"loss": 0.447,
"mean_token_accuracy": 0.028289265337662073,
"step": 473
},
{
"epoch": 1.681833314837421,
"grad_norm": 0.13524053990840912,
"learning_rate": 6.295223106313104e-06,
"loss": 0.456,
"mean_token_accuracy": 0.03146627026217175,
"step": 474
},
{
"epoch": 1.6853845300188657,
"grad_norm": 0.1123010441660881,
"learning_rate": 6.281683891849645e-06,
"loss": 0.4373,
"mean_token_accuracy": 0.031217788444337202,
"step": 475
},
{
"epoch": 1.6889357452003106,
"grad_norm": 0.1200728714466095,
"learning_rate": 6.268134611102578e-06,
"loss": 0.4294,
"mean_token_accuracy": 0.030439670525083784,
"step": 476
},
{
"epoch": 1.6924869603817556,
"grad_norm": 0.11351709067821503,
"learning_rate": 6.254575370487299e-06,
"loss": 0.454,
"mean_token_accuracy": 0.03279696796744247,
"step": 477
},
{
"epoch": 1.6960381755632006,
"grad_norm": 0.10373629629611969,
"learning_rate": 6.2410062764974366e-06,
"loss": 0.4088,
"mean_token_accuracy": 0.032122904136485886,
"step": 478
},
{
"epoch": 1.6995893907446455,
"grad_norm": 0.11496740579605103,
"learning_rate": 6.227427435703997e-06,
"loss": 0.4425,
"mean_token_accuracy": 0.026922807355731493,
"step": 479
},
{
"epoch": 1.7031406059260903,
"grad_norm": 0.13776014745235443,
"learning_rate": 6.213838954754543e-06,
"loss": 0.4615,
"mean_token_accuracy": 0.03186251565057319,
"step": 480
},
{
"epoch": 1.7066918211075353,
"grad_norm": 0.12256697565317154,
"learning_rate": 6.2002409403723525e-06,
"loss": 0.4266,
"mean_token_accuracy": 0.03119091654662043,
"step": 481
},
{
"epoch": 1.71024303628898,
"grad_norm": 0.13695059716701508,
"learning_rate": 6.186633499355576e-06,
"loss": 0.4413,
"mean_token_accuracy": 0.031419768780324375,
"step": 482
},
{
"epoch": 1.713794251470425,
"grad_norm": 0.11959680169820786,
"learning_rate": 6.173016738576396e-06,
"loss": 0.4069,
"mean_token_accuracy": 0.03165624950270285,
"step": 483
},
{
"epoch": 1.71734546665187,
"grad_norm": 0.12976723909378052,
"learning_rate": 6.159390764980202e-06,
"loss": 0.4587,
"mean_token_accuracy": 0.03226567378442269,
"step": 484
},
{
"epoch": 1.720896681833315,
"grad_norm": 0.12629680335521698,
"learning_rate": 6.145755685584731e-06,
"loss": 0.4318,
"mean_token_accuracy": 0.03316625406660023,
"step": 485
},
{
"epoch": 1.7244478970147599,
"grad_norm": 0.10905484110116959,
"learning_rate": 6.132111607479243e-06,
"loss": 0.3998,
"mean_token_accuracy": 0.03585824224865064,
"step": 486
},
{
"epoch": 1.7279991121962046,
"grad_norm": 0.1086510494351387,
"learning_rate": 6.118458637823669e-06,
"loss": 0.4114,
"mean_token_accuracy": 0.03143865071251639,
"step": 487
},
{
"epoch": 1.7315503273776496,
"grad_norm": 0.12035961449146271,
"learning_rate": 6.104796883847777e-06,
"loss": 0.4376,
"mean_token_accuracy": 0.03243117895544856,
"step": 488
},
{
"epoch": 1.7351015425590943,
"grad_norm": 0.14096233248710632,
"learning_rate": 6.091126452850324e-06,
"loss": 0.4117,
"mean_token_accuracy": 0.03267770476486476,
"step": 489
},
{
"epoch": 1.7386527577405393,
"grad_norm": 0.10478504002094269,
"learning_rate": 6.077447452198219e-06,
"loss": 0.427,
"mean_token_accuracy": 0.027900859065994155,
"step": 490
},
{
"epoch": 1.7422039729219843,
"grad_norm": 0.12248977273702621,
"learning_rate": 6.063759989325673e-06,
"loss": 0.4211,
"mean_token_accuracy": 0.035626769851660356,
"step": 491
},
{
"epoch": 1.7457551881034292,
"grad_norm": 0.11430156230926514,
"learning_rate": 6.050064171733362e-06,
"loss": 0.4506,
"mean_token_accuracy": 0.033796100367908366,
"step": 492
},
{
"epoch": 1.7493064032848742,
"grad_norm": 0.13546644151210785,
"learning_rate": 6.0363601069875755e-06,
"loss": 0.4384,
"mean_token_accuracy": 0.032566652556852205,
"step": 493
},
{
"epoch": 1.752857618466319,
"grad_norm": 0.11047898977994919,
"learning_rate": 6.022647902719384e-06,
"loss": 0.4434,
"mean_token_accuracy": 0.031517542514848174,
"step": 494
},
{
"epoch": 1.7564088336477637,
"grad_norm": 0.11789167672395706,
"learning_rate": 6.008927666623775e-06,
"loss": 0.4371,
"mean_token_accuracy": 0.029138855084966053,
"step": 495
},
{
"epoch": 1.7599600488292086,
"grad_norm": 0.1081371083855629,
"learning_rate": 5.9951995064588245e-06,
"loss": 0.4086,
"mean_token_accuracy": 0.036707406099594664,
"step": 496
},
{
"epoch": 1.7635112640106536,
"grad_norm": 0.1214103102684021,
"learning_rate": 5.981463530044841e-06,
"loss": 0.4525,
"mean_token_accuracy": 0.03377084386738716,
"step": 497
},
{
"epoch": 1.7670624791920986,
"grad_norm": 0.10647572576999664,
"learning_rate": 5.967719845263524e-06,
"loss": 0.403,
"mean_token_accuracy": 0.03056574211223051,
"step": 498
},
{
"epoch": 1.7706136943735435,
"grad_norm": 0.10062088072299957,
"learning_rate": 5.953968560057112e-06,
"loss": 0.4224,
"mean_token_accuracy": 0.03024762358836597,
"step": 499
},
{
"epoch": 1.7741649095549885,
"grad_norm": 0.11051978915929794,
"learning_rate": 5.940209782427535e-06,
"loss": 0.4435,
"mean_token_accuracy": 0.02924332962720655,
"step": 500
},
{
"epoch": 1.7777161247364333,
"grad_norm": 0.10874021053314209,
"learning_rate": 5.926443620435572e-06,
"loss": 0.4216,
"mean_token_accuracy": 0.031223400786984712,
"step": 501
},
{
"epoch": 1.781267339917878,
"grad_norm": 0.10309744626283646,
"learning_rate": 5.912670182199998e-06,
"loss": 0.421,
"mean_token_accuracy": 0.03220628422786831,
"step": 502
},
{
"epoch": 1.784818555099323,
"grad_norm": 0.10336648672819138,
"learning_rate": 5.898889575896731e-06,
"loss": 0.4301,
"mean_token_accuracy": 0.034015969904430676,
"step": 503
},
{
"epoch": 1.788369770280768,
"grad_norm": 0.15445692837238312,
"learning_rate": 5.8851019097579935e-06,
"loss": 0.4494,
"mean_token_accuracy": 0.03278758638043655,
"step": 504
},
{
"epoch": 1.791920985462213,
"grad_norm": 0.1205769032239914,
"learning_rate": 5.871307292071449e-06,
"loss": 0.4608,
"mean_token_accuracy": 0.030953851630329154,
"step": 505
},
{
"epoch": 1.7954722006436579,
"grad_norm": 0.11463318765163422,
"learning_rate": 5.857505831179361e-06,
"loss": 0.4238,
"mean_token_accuracy": 0.03327622167489608,
"step": 506
},
{
"epoch": 1.7990234158251026,
"grad_norm": 0.09844350814819336,
"learning_rate": 5.843697635477742e-06,
"loss": 0.436,
"mean_token_accuracy": 0.032107390790770296,
"step": 507
},
{
"epoch": 1.8025746310065476,
"grad_norm": 0.11865696310997009,
"learning_rate": 5.8298828134154935e-06,
"loss": 0.4604,
"mean_token_accuracy": 0.03441940287120815,
"step": 508
},
{
"epoch": 1.8061258461879923,
"grad_norm": 0.12131454795598984,
"learning_rate": 5.816061473493565e-06,
"loss": 0.4347,
"mean_token_accuracy": 0.029422461073409067,
"step": 509
},
{
"epoch": 1.8096770613694373,
"grad_norm": 0.10119353979825974,
"learning_rate": 5.802233724264094e-06,
"loss": 0.4206,
"mean_token_accuracy": 0.0319925009207509,
"step": 510
},
{
"epoch": 1.8132282765508823,
"grad_norm": 0.12341497838497162,
"learning_rate": 5.788399674329559e-06,
"loss": 0.4275,
"mean_token_accuracy": 0.03254448569350643,
"step": 511
},
{
"epoch": 1.8167794917323272,
"grad_norm": 0.12492340058088303,
"learning_rate": 5.774559432341918e-06,
"loss": 0.428,
"mean_token_accuracy": 0.02870176643045852,
"step": 512
},
{
"epoch": 1.8203307069137722,
"grad_norm": 0.10704270750284195,
"learning_rate": 5.760713107001773e-06,
"loss": 0.4395,
"mean_token_accuracy": 0.030266436016972875,
"step": 513
},
{
"epoch": 1.823881922095217,
"grad_norm": 0.11909916251897812,
"learning_rate": 5.746860807057491e-06,
"loss": 0.4153,
"mean_token_accuracy": 0.03108665631225449,
"step": 514
},
{
"epoch": 1.827433137276662,
"grad_norm": 0.11801562458276749,
"learning_rate": 5.7330026413043726e-06,
"loss": 0.4722,
"mean_token_accuracy": 0.03412359116009611,
"step": 515
},
{
"epoch": 1.8309843524581066,
"grad_norm": 0.12138685584068298,
"learning_rate": 5.719138718583781e-06,
"loss": 0.4266,
"mean_token_accuracy": 0.029616558851557784,
"step": 516
},
{
"epoch": 1.8345355676395516,
"grad_norm": 0.12482289224863052,
"learning_rate": 5.705269147782303e-06,
"loss": 0.4724,
"mean_token_accuracy": 0.029393230830464745,
"step": 517
},
{
"epoch": 1.8380867828209966,
"grad_norm": 0.12985889613628387,
"learning_rate": 5.6913940378308755e-06,
"loss": 0.4399,
"mean_token_accuracy": 0.030709353493875824,
"step": 518
},
{
"epoch": 1.8416379980024415,
"grad_norm": 0.1769595444202423,
"learning_rate": 5.677513497703947e-06,
"loss": 0.4505,
"mean_token_accuracy": 0.032131388477864675,
"step": 519
},
{
"epoch": 1.8451892131838865,
"grad_norm": 0.12460047751665115,
"learning_rate": 5.663627636418611e-06,
"loss": 0.4679,
"mean_token_accuracy": 0.033650853561994154,
"step": 520
},
{
"epoch": 1.8487404283653313,
"grad_norm": 0.10020967572927475,
"learning_rate": 5.649736563033754e-06,
"loss": 0.4295,
"mean_token_accuracy": 0.030336383748363005,
"step": 521
},
{
"epoch": 1.852291643546776,
"grad_norm": 0.146786630153656,
"learning_rate": 5.635840386649197e-06,
"loss": 0.4496,
"mean_token_accuracy": 0.031712467898614705,
"step": 522
},
{
"epoch": 1.855842858728221,
"grad_norm": 0.11752845346927643,
"learning_rate": 5.621939216404842e-06,
"loss": 0.4602,
"mean_token_accuracy": 0.03064700043250923,
"step": 523
},
{
"epoch": 1.859394073909666,
"grad_norm": 0.11091917008161545,
"learning_rate": 5.608033161479811e-06,
"loss": 0.4281,
"mean_token_accuracy": 0.03554350486774638,
"step": 524
},
{
"epoch": 1.862945289091111,
"grad_norm": 0.13093847036361694,
"learning_rate": 5.594122331091591e-06,
"loss": 0.4501,
"mean_token_accuracy": 0.03043439887915156,
"step": 525
},
{
"epoch": 1.8664965042725559,
"grad_norm": 0.10613659024238586,
"learning_rate": 5.580206834495169e-06,
"loss": 0.4009,
"mean_token_accuracy": 0.03187151045858627,
"step": 526
},
{
"epoch": 1.8700477194540006,
"grad_norm": 0.1050243079662323,
"learning_rate": 5.566286780982193e-06,
"loss": 0.4227,
"mean_token_accuracy": 0.03142415892943973,
"step": 527
},
{
"epoch": 1.8735989346354456,
"grad_norm": 0.09940161556005478,
"learning_rate": 5.552362279880091e-06,
"loss": 0.4193,
"mean_token_accuracy": 0.02958502833644161,
"step": 528
},
{
"epoch": 1.8771501498168903,
"grad_norm": 0.1255192905664444,
"learning_rate": 5.538433440551221e-06,
"loss": 0.4215,
"mean_token_accuracy": 0.03073620241775643,
"step": 529
},
{
"epoch": 1.8807013649983353,
"grad_norm": 0.10726135224103928,
"learning_rate": 5.524500372392021e-06,
"loss": 0.4116,
"mean_token_accuracy": 0.031495200710196514,
"step": 530
},
{
"epoch": 1.8842525801797803,
"grad_norm": 0.11809692531824112,
"learning_rate": 5.5105631848321375e-06,
"loss": 0.4421,
"mean_token_accuracy": 0.03594362937474216,
"step": 531
},
{
"epoch": 1.8878037953612252,
"grad_norm": 0.09988775849342346,
"learning_rate": 5.496621987333567e-06,
"loss": 0.4276,
"mean_token_accuracy": 0.03090111272831564,
"step": 532
},
{
"epoch": 1.8913550105426702,
"grad_norm": 0.12656764686107635,
"learning_rate": 5.482676889389808e-06,
"loss": 0.4362,
"mean_token_accuracy": 0.03345424540384556,
"step": 533
},
{
"epoch": 1.894906225724115,
"grad_norm": 0.12253513932228088,
"learning_rate": 5.468728000524987e-06,
"loss": 0.4233,
"mean_token_accuracy": 0.03485227169767313,
"step": 534
},
{
"epoch": 1.89845744090556,
"grad_norm": 0.10752210766077042,
"learning_rate": 5.454775430293008e-06,
"loss": 0.4049,
"mean_token_accuracy": 0.03149543050676584,
"step": 535
},
{
"epoch": 1.9020086560870046,
"grad_norm": 0.11358033865690231,
"learning_rate": 5.440819288276683e-06,
"loss": 0.4491,
"mean_token_accuracy": 0.03557099802492303,
"step": 536
},
{
"epoch": 1.9055598712684496,
"grad_norm": 0.10355143249034882,
"learning_rate": 5.426859684086881e-06,
"loss": 0.4221,
"mean_token_accuracy": 0.029905574676377,
"step": 537
},
{
"epoch": 1.9091110864498946,
"grad_norm": 0.1308654397726059,
"learning_rate": 5.412896727361663e-06,
"loss": 0.4051,
"mean_token_accuracy": 0.03401779759951751,
"step": 538
},
{
"epoch": 1.9126623016313395,
"grad_norm": 0.10328856110572815,
"learning_rate": 5.398930527765416e-06,
"loss": 0.4246,
"mean_token_accuracy": 0.03586249665386276,
"step": 539
},
{
"epoch": 1.9162135168127845,
"grad_norm": 0.12165062874555588,
"learning_rate": 5.384961194988002e-06,
"loss": 0.4367,
"mean_token_accuracy": 0.036468475984293036,
"step": 540
},
{
"epoch": 1.9197647319942293,
"grad_norm": 0.14289544522762299,
"learning_rate": 5.370988838743889e-06,
"loss": 0.467,
"mean_token_accuracy": 0.029684737717616372,
"step": 541
},
{
"epoch": 1.9233159471756742,
"grad_norm": 0.1259216070175171,
"learning_rate": 5.357013568771288e-06,
"loss": 0.4611,
"mean_token_accuracy": 0.030411327827096102,
"step": 542
},
{
"epoch": 1.926867162357119,
"grad_norm": 0.12499464303255081,
"learning_rate": 5.343035494831298e-06,
"loss": 0.475,
"mean_token_accuracy": 0.030978709481132682,
"step": 543
},
{
"epoch": 1.930418377538564,
"grad_norm": 0.11255602538585663,
"learning_rate": 5.32905472670704e-06,
"loss": 0.4276,
"mean_token_accuracy": 0.03142427492639399,
"step": 544
},
{
"epoch": 1.933969592720009,
"grad_norm": 0.10795030742883682,
"learning_rate": 5.315071374202792e-06,
"loss": 0.4334,
"mean_token_accuracy": 0.03035588754573837,
"step": 545
},
{
"epoch": 1.9375208079014539,
"grad_norm": 0.11768075078725815,
"learning_rate": 5.301085547143135e-06,
"loss": 0.4471,
"mean_token_accuracy": 0.03152179718381376,
"step": 546
},
{
"epoch": 1.9410720230828988,
"grad_norm": 0.11674405634403229,
"learning_rate": 5.287097355372079e-06,
"loss": 0.4385,
"mean_token_accuracy": 0.027633604368020315,
"step": 547
},
{
"epoch": 1.9446232382643436,
"grad_norm": 0.15666991472244263,
"learning_rate": 5.273106908752211e-06,
"loss": 0.49,
"mean_token_accuracy": 0.030374082733032992,
"step": 548
},
{
"epoch": 1.9481744534457883,
"grad_norm": 0.10733166337013245,
"learning_rate": 5.259114317163822e-06,
"loss": 0.4397,
"mean_token_accuracy": 0.02997263033830677,
"step": 549
},
{
"epoch": 1.9517256686272333,
"grad_norm": 0.1084926500916481,
"learning_rate": 5.245119690504056e-06,
"loss": 0.4458,
"mean_token_accuracy": 0.03219133894890547,
"step": 550
},
{
"epoch": 1.9552768838086783,
"grad_norm": 0.10508458316326141,
"learning_rate": 5.231123138686036e-06,
"loss": 0.4207,
"mean_token_accuracy": 0.03626753961361828,
"step": 551
},
{
"epoch": 1.9588280989901232,
"grad_norm": 0.12108492106199265,
"learning_rate": 5.217124771638008e-06,
"loss": 0.4703,
"mean_token_accuracy": 0.032320219550456386,
"step": 552
},
{
"epoch": 1.9623793141715682,
"grad_norm": 0.1054980456829071,
"learning_rate": 5.2031246993024705e-06,
"loss": 0.4487,
"mean_token_accuracy": 0.02940154373754922,
"step": 553
},
{
"epoch": 1.965930529353013,
"grad_norm": 0.11068796366453171,
"learning_rate": 5.1891230316353215e-06,
"loss": 0.4127,
"mean_token_accuracy": 0.033622686092712684,
"step": 554
},
{
"epoch": 1.969481744534458,
"grad_norm": 0.11467897891998291,
"learning_rate": 5.1751198786049815e-06,
"loss": 0.4409,
"mean_token_accuracy": 0.03024188138806494,
"step": 555
},
{
"epoch": 1.9730329597159026,
"grad_norm": 0.13145717978477478,
"learning_rate": 5.161115350191543e-06,
"loss": 0.4568,
"mean_token_accuracy": 0.03478666826777044,
"step": 556
},
{
"epoch": 1.9765841748973476,
"grad_norm": 0.11662815511226654,
"learning_rate": 5.147109556385898e-06,
"loss": 0.428,
"mean_token_accuracy": 0.029785827462546877,
"step": 557
},
{
"epoch": 1.9801353900787926,
"grad_norm": 0.12841393053531647,
"learning_rate": 5.133102607188875e-06,
"loss": 0.4369,
"mean_token_accuracy": 0.03490987789336941,
"step": 558
},
{
"epoch": 1.9836866052602375,
"grad_norm": 0.11382108181715012,
"learning_rate": 5.119094612610381e-06,
"loss": 0.4355,
"mean_token_accuracy": 0.031118642207729863,
"step": 559
},
{
"epoch": 1.9872378204416825,
"grad_norm": 0.108616903424263,
"learning_rate": 5.10508568266853e-06,
"loss": 0.4321,
"mean_token_accuracy": 0.029375262431130977,
"step": 560
},
{
"epoch": 1.9907890356231273,
"grad_norm": 0.1363229751586914,
"learning_rate": 5.091075927388785e-06,
"loss": 0.4382,
"mean_token_accuracy": 0.02874040436472569,
"step": 561
},
{
"epoch": 1.9943402508045722,
"grad_norm": 0.11008929461240768,
"learning_rate": 5.077065456803089e-06,
"loss": 0.3951,
"mean_token_accuracy": 0.03383657897938974,
"step": 562
},
{
"epoch": 1.997891465986017,
"grad_norm": 0.1290445476770401,
"learning_rate": 5.063054380949003e-06,
"loss": 0.4386,
"mean_token_accuracy": 0.02915131483678124,
"step": 563
},
{
"epoch": 2.0,
"grad_norm": 0.08082450926303864,
"learning_rate": 5.049042809868845e-06,
"loss": 0.2488,
"mean_token_accuracy": 0.030764293948825645,
"step": 564
},
{
"epoch": 2.003551215181445,
"grad_norm": 0.1310880482196808,
"learning_rate": 5.035030853608817e-06,
"loss": 0.4365,
"mean_token_accuracy": 0.031575486336805625,
"step": 565
},
{
"epoch": 2.00710243036289,
"grad_norm": 0.1138961985707283,
"learning_rate": 5.0210186222181515e-06,
"loss": 0.4157,
"mean_token_accuracy": 0.034992296999917016,
"step": 566
},
{
"epoch": 2.010653645544335,
"grad_norm": 0.11264721304178238,
"learning_rate": 5.007006225748238e-06,
"loss": 0.4476,
"mean_token_accuracy": 0.033530683980643516,
"step": 567
},
{
"epoch": 2.0142048607257794,
"grad_norm": 0.1077028214931488,
"learning_rate": 4.992993774251764e-06,
"loss": 0.4325,
"mean_token_accuracy": 0.03167944007873302,
"step": 568
},
{
"epoch": 2.0177560759072244,
"grad_norm": 0.1369977593421936,
"learning_rate": 4.97898137778185e-06,
"loss": 0.4407,
"mean_token_accuracy": 0.035672826332302066,
"step": 569
},
{
"epoch": 2.0213072910886694,
"grad_norm": 0.11429018527269363,
"learning_rate": 4.964969146391184e-06,
"loss": 0.43,
"mean_token_accuracy": 0.034091444191290066,
"step": 570
},
{
"epoch": 2.0248585062701143,
"grad_norm": 0.1280803382396698,
"learning_rate": 4.950957190131157e-06,
"loss": 0.4254,
"mean_token_accuracy": 0.03439853444797336,
"step": 571
},
{
"epoch": 2.0284097214515593,
"grad_norm": 0.10655295848846436,
"learning_rate": 4.936945619050998e-06,
"loss": 0.4306,
"mean_token_accuracy": 0.031939568114466965,
"step": 572
},
{
"epoch": 2.0319609366330043,
"grad_norm": 0.13407614827156067,
"learning_rate": 4.922934543196912e-06,
"loss": 0.4496,
"mean_token_accuracy": 0.029332300946407486,
"step": 573
},
{
"epoch": 2.0355121518144492,
"grad_norm": 0.1120862066745758,
"learning_rate": 4.908924072611218e-06,
"loss": 0.4399,
"mean_token_accuracy": 0.030439690985076595,
"step": 574
},
{
"epoch": 2.0390633669958937,
"grad_norm": 0.12993109226226807,
"learning_rate": 4.894914317331471e-06,
"loss": 0.4638,
"mean_token_accuracy": 0.03251677861408098,
"step": 575
},
{
"epoch": 2.0426145821773387,
"grad_norm": 0.12582442164421082,
"learning_rate": 4.88090538738962e-06,
"loss": 0.4329,
"mean_token_accuracy": 0.027871505477378378,
"step": 576
},
{
"epoch": 2.0461657973587837,
"grad_norm": 0.11255156993865967,
"learning_rate": 4.866897392811127e-06,
"loss": 0.4614,
"mean_token_accuracy": 0.033189341596880695,
"step": 577
},
{
"epoch": 2.0497170125402286,
"grad_norm": 0.11850475519895554,
"learning_rate": 4.852890443614105e-06,
"loss": 0.4593,
"mean_token_accuracy": 0.03234072294435464,
"step": 578
},
{
"epoch": 2.0532682277216736,
"grad_norm": 0.13706207275390625,
"learning_rate": 4.838884649808458e-06,
"loss": 0.4689,
"mean_token_accuracy": 0.03188859073998174,
"step": 579
},
{
"epoch": 2.0568194429031186,
"grad_norm": 0.1146935224533081,
"learning_rate": 4.82488012139502e-06,
"loss": 0.4204,
"mean_token_accuracy": 0.033560063729964895,
"step": 580
},
{
"epoch": 2.060370658084563,
"grad_norm": 0.09363409876823425,
"learning_rate": 4.810876968364679e-06,
"loss": 0.4026,
"mean_token_accuracy": 0.03469607957958942,
"step": 581
},
{
"epoch": 2.063921873266008,
"grad_norm": 0.12827607989311218,
"learning_rate": 4.796875300697532e-06,
"loss": 0.4439,
"mean_token_accuracy": 0.030342954167281277,
"step": 582
},
{
"epoch": 2.067473088447453,
"grad_norm": 0.1165771633386612,
"learning_rate": 4.782875228361994e-06,
"loss": 0.4475,
"mean_token_accuracy": 0.03175362127512926,
"step": 583
},
{
"epoch": 2.071024303628898,
"grad_norm": 0.12880617380142212,
"learning_rate": 4.7688768613139655e-06,
"loss": 0.4553,
"mean_token_accuracy": 0.028243271208339138,
"step": 584
},
{
"epoch": 2.074575518810343,
"grad_norm": 0.1199556365609169,
"learning_rate": 4.754880309495946e-06,
"loss": 0.4405,
"mean_token_accuracy": 0.03142109113832703,
"step": 585
},
{
"epoch": 2.078126733991788,
"grad_norm": 0.14064113795757294,
"learning_rate": 4.74088568283618e-06,
"loss": 0.447,
"mean_token_accuracy": 0.03218332341930363,
"step": 586
},
{
"epoch": 2.081677949173233,
"grad_norm": 0.10865464061498642,
"learning_rate": 4.726893091247792e-06,
"loss": 0.4187,
"mean_token_accuracy": 0.029737072265561437,
"step": 587
},
{
"epoch": 2.0852291643546774,
"grad_norm": 0.12148632109165192,
"learning_rate": 4.712902644627923e-06,
"loss": 0.4428,
"mean_token_accuracy": 0.03043746904222644,
"step": 588
},
{
"epoch": 2.0887803795361224,
"grad_norm": 0.13806842267513275,
"learning_rate": 4.698914452856866e-06,
"loss": 0.4767,
"mean_token_accuracy": 0.03084335032326635,
"step": 589
},
{
"epoch": 2.0923315947175674,
"grad_norm": 0.1365612894296646,
"learning_rate": 4.684928625797208e-06,
"loss": 0.4354,
"mean_token_accuracy": 0.03496370389621006,
"step": 590
},
{
"epoch": 2.0958828098990123,
"grad_norm": 0.120542012155056,
"learning_rate": 4.6709452732929614e-06,
"loss": 0.4747,
"mean_token_accuracy": 0.03065626499665086,
"step": 591
},
{
"epoch": 2.0994340250804573,
"grad_norm": 0.09649121761322021,
"learning_rate": 4.656964505168703e-06,
"loss": 0.4051,
"mean_token_accuracy": 0.03408970690725255,
"step": 592
},
{
"epoch": 2.1029852402619023,
"grad_norm": 0.09775994718074799,
"learning_rate": 4.642986431228713e-06,
"loss": 0.4257,
"mean_token_accuracy": 0.032279426413879264,
"step": 593
},
{
"epoch": 2.1065364554433472,
"grad_norm": 0.10995833575725555,
"learning_rate": 4.629011161256114e-06,
"loss": 0.4247,
"mean_token_accuracy": 0.03125830645512906,
"step": 594
},
{
"epoch": 2.1100876706247917,
"grad_norm": 0.11631479859352112,
"learning_rate": 4.615038805011999e-06,
"loss": 0.443,
"mean_token_accuracy": 0.031506394774623914,
"step": 595
},
{
"epoch": 2.1136388858062367,
"grad_norm": 0.10540800541639328,
"learning_rate": 4.601069472234584e-06,
"loss": 0.4341,
"mean_token_accuracy": 0.03257749425392831,
"step": 596
},
{
"epoch": 2.1171901009876817,
"grad_norm": 0.11153507232666016,
"learning_rate": 4.587103272638339e-06,
"loss": 0.4463,
"mean_token_accuracy": 0.0294464887920185,
"step": 597
},
{
"epoch": 2.1207413161691266,
"grad_norm": 0.10988204926252365,
"learning_rate": 4.57314031591312e-06,
"loss": 0.4159,
"mean_token_accuracy": 0.03306772064024699,
"step": 598
},
{
"epoch": 2.1242925313505716,
"grad_norm": 0.11690136790275574,
"learning_rate": 4.559180711723318e-06,
"loss": 0.4417,
"mean_token_accuracy": 0.030245611327700317,
"step": 599
},
{
"epoch": 2.1278437465320166,
"grad_norm": 0.11184585839509964,
"learning_rate": 4.545224569706994e-06,
"loss": 0.4132,
"mean_token_accuracy": 0.03208141641152906,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 1124,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.295286349900025e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}