| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.1278437465320166, |
| "eval_steps": 500, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0035512151814449007, |
| "grad_norm": 0.09903648495674133, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.4075, |
| "mean_token_accuracy": 0.03430055791613995, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.007102430362889801, |
| "grad_norm": 0.14180830121040344, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.4487, |
| "mean_token_accuracy": 0.03482818407428567, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.010653645544334702, |
| "grad_norm": 0.11331801116466522, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": 0.419, |
| "mean_token_accuracy": 0.030828955586912343, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.014204860725779603, |
| "grad_norm": 0.13718660175800323, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 0.4512, |
| "mean_token_accuracy": 0.03115637891824008, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.017756075907224503, |
| "grad_norm": 0.1235477402806282, |
| "learning_rate": 7.1428571428571436e-06, |
| "loss": 0.4367, |
| "mean_token_accuracy": 0.03077795269928174, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.021307291088669404, |
| "grad_norm": 0.11979226022958755, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 0.4685, |
| "mean_token_accuracy": 0.028771477849659277, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.024858506270114305, |
| "grad_norm": 0.12223777920007706, |
| "learning_rate": 1e-05, |
| "loss": 0.4658, |
| "mean_token_accuracy": 0.028678809732809896, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.028409721451559206, |
| "grad_norm": 0.12957823276519775, |
| "learning_rate": 9.999980365120307e-06, |
| "loss": 0.4549, |
| "mean_token_accuracy": 0.031924477760185255, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.0319609366330041, |
| "grad_norm": 0.11528096348047256, |
| "learning_rate": 9.999921460635436e-06, |
| "loss": 0.4568, |
| "mean_token_accuracy": 0.02935352978965966, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03551215181444901, |
| "grad_norm": 0.11329038441181183, |
| "learning_rate": 9.999823287008022e-06, |
| "loss": 0.4423, |
| "mean_token_accuracy": 0.031521555294602877, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.039063366995893904, |
| "grad_norm": 0.11604199558496475, |
| "learning_rate": 9.999685845009114e-06, |
| "loss": 0.4312, |
| "mean_token_accuracy": 0.03416921407915652, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.04261458217733881, |
| "grad_norm": 0.10851351916790009, |
| "learning_rate": 9.999509135718176e-06, |
| "loss": 0.4357, |
| "mean_token_accuracy": 0.029943688183266204, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.046165797358783706, |
| "grad_norm": 0.10252473503351212, |
| "learning_rate": 9.999293160523074e-06, |
| "loss": 0.4327, |
| "mean_token_accuracy": 0.03228329481498804, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.04971701254022861, |
| "grad_norm": 0.1350434422492981, |
| "learning_rate": 9.999037921120068e-06, |
| "loss": 0.4546, |
| "mean_token_accuracy": 0.03238412337668706, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.05326822772167351, |
| "grad_norm": 0.10226991772651672, |
| "learning_rate": 9.998743419513795e-06, |
| "loss": 0.443, |
| "mean_token_accuracy": 0.03165558609907748, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05681944290311841, |
| "grad_norm": 0.13848458230495453, |
| "learning_rate": 9.998409658017256e-06, |
| "loss": 0.4851, |
| "mean_token_accuracy": 0.03008650508854771, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.06037065808456331, |
| "grad_norm": 0.11423337459564209, |
| "learning_rate": 9.998036639251798e-06, |
| "loss": 0.4088, |
| "mean_token_accuracy": 0.030042485868762014, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0639218732660082, |
| "grad_norm": 0.13896144926548004, |
| "learning_rate": 9.997624366147094e-06, |
| "loss": 0.4404, |
| "mean_token_accuracy": 0.030281073039077455, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.06747308844745312, |
| "grad_norm": 0.10911770910024643, |
| "learning_rate": 9.997172841941114e-06, |
| "loss": 0.4448, |
| "mean_token_accuracy": 0.03207043489237549, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.07102430362889801, |
| "grad_norm": 0.12348262220621109, |
| "learning_rate": 9.99668207018011e-06, |
| "loss": 0.4368, |
| "mean_token_accuracy": 0.033165274380735354, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.07457551881034291, |
| "grad_norm": 0.1128680482506752, |
| "learning_rate": 9.996152054718579e-06, |
| "loss": 0.432, |
| "mean_token_accuracy": 0.030988398069894174, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.07812673399178781, |
| "grad_norm": 0.11654637008905411, |
| "learning_rate": 9.995582799719237e-06, |
| "loss": 0.4493, |
| "mean_token_accuracy": 0.032766436190286186, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.08167794917323272, |
| "grad_norm": 0.11634049564599991, |
| "learning_rate": 9.994974309652984e-06, |
| "loss": 0.4752, |
| "mean_token_accuracy": 0.031991063471650705, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.08522916435467762, |
| "grad_norm": 0.1251709908246994, |
| "learning_rate": 9.994326589298875e-06, |
| "loss": 0.4215, |
| "mean_token_accuracy": 0.036875830935969134, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.08878037953612251, |
| "grad_norm": 0.1075059100985527, |
| "learning_rate": 9.993639643744071e-06, |
| "loss": 0.4055, |
| "mean_token_accuracy": 0.03151970944236382, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.09233159471756741, |
| "grad_norm": 0.11893657594919205, |
| "learning_rate": 9.99291347838381e-06, |
| "loss": 0.4635, |
| "mean_token_accuracy": 0.028766451931005577, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.09588280989901232, |
| "grad_norm": 0.12458056956529617, |
| "learning_rate": 9.992148098921361e-06, |
| "loss": 0.465, |
| "mean_token_accuracy": 0.03177367915850482, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.09943402508045722, |
| "grad_norm": 0.12087491899728775, |
| "learning_rate": 9.99134351136798e-06, |
| "loss": 0.4415, |
| "mean_token_accuracy": 0.034534925398475025, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.10298524026190212, |
| "grad_norm": 0.13300693035125732, |
| "learning_rate": 9.990499722042852e-06, |
| "loss": 0.4532, |
| "mean_token_accuracy": 0.03134459158900427, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.10653645544334701, |
| "grad_norm": 0.11541508138179779, |
| "learning_rate": 9.989616737573064e-06, |
| "loss": 0.4373, |
| "mean_token_accuracy": 0.03319291834850446, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.11008767062479193, |
| "grad_norm": 0.1059102788567543, |
| "learning_rate": 9.98869456489353e-06, |
| "loss": 0.4341, |
| "mean_token_accuracy": 0.03190637141960906, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.11363888580623682, |
| "grad_norm": 0.11466473340988159, |
| "learning_rate": 9.987733211246952e-06, |
| "loss": 0.4453, |
| "mean_token_accuracy": 0.030628334232460475, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.11719010098768172, |
| "grad_norm": 0.11979004740715027, |
| "learning_rate": 9.986732684183753e-06, |
| "loss": 0.4449, |
| "mean_token_accuracy": 0.03580710734240711, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.12074131616912662, |
| "grad_norm": 0.1313788890838623, |
| "learning_rate": 9.985692991562026e-06, |
| "loss": 0.4171, |
| "mean_token_accuracy": 0.035130951946484856, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.12429253135057153, |
| "grad_norm": 0.1338130682706833, |
| "learning_rate": 9.984614141547468e-06, |
| "loss": 0.4269, |
| "mean_token_accuracy": 0.03584297694033012, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.1278437465320164, |
| "grad_norm": 0.13190719485282898, |
| "learning_rate": 9.983496142613314e-06, |
| "loss": 0.4456, |
| "mean_token_accuracy": 0.02804510169880814, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.13139496171346132, |
| "grad_norm": 0.10598088055849075, |
| "learning_rate": 9.982339003540272e-06, |
| "loss": 0.4531, |
| "mean_token_accuracy": 0.036306285659520654, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.13494617689490623, |
| "grad_norm": 0.09667421877384186, |
| "learning_rate": 9.981142733416457e-06, |
| "loss": 0.3926, |
| "mean_token_accuracy": 0.03226046055351617, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.13849739207635112, |
| "grad_norm": 0.12407270818948746, |
| "learning_rate": 9.97990734163732e-06, |
| "loss": 0.4234, |
| "mean_token_accuracy": 0.032570251350989565, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.14204860725779603, |
| "grad_norm": 0.10452189296483994, |
| "learning_rate": 9.978632837905566e-06, |
| "loss": 0.4217, |
| "mean_token_accuracy": 0.034018361457128776, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.14559982243924094, |
| "grad_norm": 0.1165575161576271, |
| "learning_rate": 9.977319232231088e-06, |
| "loss": 0.4209, |
| "mean_token_accuracy": 0.03243047746218508, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.14915103762068582, |
| "grad_norm": 0.12650305032730103, |
| "learning_rate": 9.975966534930879e-06, |
| "loss": 0.4372, |
| "mean_token_accuracy": 0.02945730801729951, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.15270225280213073, |
| "grad_norm": 0.11621509492397308, |
| "learning_rate": 9.974574756628961e-06, |
| "loss": 0.452, |
| "mean_token_accuracy": 0.031811273845960386, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.15625346798357562, |
| "grad_norm": 0.12744253873825073, |
| "learning_rate": 9.973143908256291e-06, |
| "loss": 0.4459, |
| "mean_token_accuracy": 0.029921257570094895, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.15980468316502053, |
| "grad_norm": 0.15649749338626862, |
| "learning_rate": 9.971674001050687e-06, |
| "loss": 0.4252, |
| "mean_token_accuracy": 0.030991621693829075, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.16335589834646544, |
| "grad_norm": 0.11494240909814835, |
| "learning_rate": 9.970165046556726e-06, |
| "loss": 0.4232, |
| "mean_token_accuracy": 0.03496951759007061, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.16690711352791032, |
| "grad_norm": 0.12458129972219467, |
| "learning_rate": 9.968617056625665e-06, |
| "loss": 0.4633, |
| "mean_token_accuracy": 0.02982275520116673, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.17045832870935523, |
| "grad_norm": 0.11398381739854813, |
| "learning_rate": 9.967030043415345e-06, |
| "loss": 0.4503, |
| "mean_token_accuracy": 0.03281566769874189, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.17400954389080014, |
| "grad_norm": 0.10996269434690475, |
| "learning_rate": 9.965404019390087e-06, |
| "loss": 0.4218, |
| "mean_token_accuracy": 0.0321590854000533, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.17756075907224503, |
| "grad_norm": 0.12258810549974442, |
| "learning_rate": 9.963738997320609e-06, |
| "loss": 0.4482, |
| "mean_token_accuracy": 0.027974106487818062, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.18111197425368994, |
| "grad_norm": 0.1071733832359314, |
| "learning_rate": 9.962034990283912e-06, |
| "loss": 0.4257, |
| "mean_token_accuracy": 0.02945839857784449, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.18466318943513482, |
| "grad_norm": 0.12644729018211365, |
| "learning_rate": 9.960292011663186e-06, |
| "loss": 0.4792, |
| "mean_token_accuracy": 0.032736343830038095, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.18821440461657973, |
| "grad_norm": 0.10932295769453049, |
| "learning_rate": 9.958510075147703e-06, |
| "loss": 0.4029, |
| "mean_token_accuracy": 0.029769023109111004, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.19176561979802464, |
| "grad_norm": 0.1045532375574112, |
| "learning_rate": 9.956689194732702e-06, |
| "loss": 0.4121, |
| "mean_token_accuracy": 0.032606568154733395, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.19531683497946953, |
| "grad_norm": 0.11694876849651337, |
| "learning_rate": 9.954829384719296e-06, |
| "loss": 0.4416, |
| "mean_token_accuracy": 0.03312313952847035, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.19886805016091444, |
| "grad_norm": 0.12251029908657074, |
| "learning_rate": 9.95293065971434e-06, |
| "loss": 0.4289, |
| "mean_token_accuracy": 0.03135830574319698, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.20241926534235935, |
| "grad_norm": 0.10416404157876968, |
| "learning_rate": 9.950993034630328e-06, |
| "loss": 0.4275, |
| "mean_token_accuracy": 0.038488676873384975, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.20597048052380423, |
| "grad_norm": 0.1636134833097458, |
| "learning_rate": 9.949016524685277e-06, |
| "loss": 0.464, |
| "mean_token_accuracy": 0.02992881732279784, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.20952169570524914, |
| "grad_norm": 0.12780985236167908, |
| "learning_rate": 9.947001145402598e-06, |
| "loss": 0.4541, |
| "mean_token_accuracy": 0.029724605861702003, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.21307291088669403, |
| "grad_norm": 0.13115057349205017, |
| "learning_rate": 9.944946912610986e-06, |
| "loss": 0.462, |
| "mean_token_accuracy": 0.03166838363176794, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.21662412606813894, |
| "grad_norm": 0.12131655216217041, |
| "learning_rate": 9.942853842444283e-06, |
| "loss": 0.4206, |
| "mean_token_accuracy": 0.030444662748777773, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.22017534124958385, |
| "grad_norm": 0.13489212095737457, |
| "learning_rate": 9.940721951341365e-06, |
| "loss": 0.4322, |
| "mean_token_accuracy": 0.0293623886336718, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.22372655643102873, |
| "grad_norm": 0.11916383355855942, |
| "learning_rate": 9.938551256046e-06, |
| "loss": 0.4512, |
| "mean_token_accuracy": 0.035470658251142595, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.22727777161247364, |
| "grad_norm": 0.13127347826957703, |
| "learning_rate": 9.936341773606723e-06, |
| "loss": 0.4424, |
| "mean_token_accuracy": 0.032253861045319354, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.23082898679391856, |
| "grad_norm": 0.12579481303691864, |
| "learning_rate": 9.934093521376707e-06, |
| "loss": 0.4465, |
| "mean_token_accuracy": 0.03229751772960299, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.23438020197536344, |
| "grad_norm": 0.11594023555517197, |
| "learning_rate": 9.931806517013612e-06, |
| "loss": 0.4399, |
| "mean_token_accuracy": 0.02944770805333974, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.23793141715680835, |
| "grad_norm": 0.10294274985790253, |
| "learning_rate": 9.929480778479465e-06, |
| "loss": 0.4186, |
| "mean_token_accuracy": 0.03471728676959174, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.24148263233825323, |
| "grad_norm": 0.13848905265331268, |
| "learning_rate": 9.9271163240405e-06, |
| "loss": 0.4196, |
| "mean_token_accuracy": 0.03424310322952806, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.24503384751969814, |
| "grad_norm": 0.10381077975034714, |
| "learning_rate": 9.92471317226703e-06, |
| "loss": 0.4448, |
| "mean_token_accuracy": 0.03184566564414126, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.24858506270114306, |
| "grad_norm": 0.13564680516719818, |
| "learning_rate": 9.922271342033295e-06, |
| "loss": 0.4259, |
| "mean_token_accuracy": 0.03590564796468243, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.25213627788258797, |
| "grad_norm": 0.13113564252853394, |
| "learning_rate": 9.919790852517313e-06, |
| "loss": 0.4574, |
| "mean_token_accuracy": 0.034656246283702785, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.2556874930640328, |
| "grad_norm": 0.10604698956012726, |
| "learning_rate": 9.917271723200725e-06, |
| "loss": 0.4, |
| "mean_token_accuracy": 0.03376925739576109, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.25923870824547773, |
| "grad_norm": 0.1232958659529686, |
| "learning_rate": 9.914713973868654e-06, |
| "loss": 0.4529, |
| "mean_token_accuracy": 0.031287746594898636, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.26278992342692264, |
| "grad_norm": 0.12214750051498413, |
| "learning_rate": 9.91211762460954e-06, |
| "loss": 0.4576, |
| "mean_token_accuracy": 0.032098766198032536, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.26634113860836756, |
| "grad_norm": 0.11422038823366165, |
| "learning_rate": 9.909482695814986e-06, |
| "loss": 0.424, |
| "mean_token_accuracy": 0.03233322404776118, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.26989235378981247, |
| "grad_norm": 0.11640136688947678, |
| "learning_rate": 9.906809208179593e-06, |
| "loss": 0.4368, |
| "mean_token_accuracy": 0.03468143657846667, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.2734435689712574, |
| "grad_norm": 0.10775226354598999, |
| "learning_rate": 9.904097182700806e-06, |
| "loss": 0.413, |
| "mean_token_accuracy": 0.0327887820021715, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.27699478415270223, |
| "grad_norm": 0.1443656086921692, |
| "learning_rate": 9.901346640678744e-06, |
| "loss": 0.4565, |
| "mean_token_accuracy": 0.03074216824643372, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.28054599933414714, |
| "grad_norm": 0.13974997401237488, |
| "learning_rate": 9.898557603716031e-06, |
| "loss": 0.4522, |
| "mean_token_accuracy": 0.030708255846548127, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.28409721451559206, |
| "grad_norm": 0.11789468675851822, |
| "learning_rate": 9.895730093717629e-06, |
| "loss": 0.4354, |
| "mean_token_accuracy": 0.029810201263899216, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.28764842969703697, |
| "grad_norm": 0.1083148941397667, |
| "learning_rate": 9.892864132890663e-06, |
| "loss": 0.414, |
| "mean_token_accuracy": 0.032822823130118195, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.2911996448784819, |
| "grad_norm": 0.11463505774736404, |
| "learning_rate": 9.889959743744253e-06, |
| "loss": 0.4493, |
| "mean_token_accuracy": 0.030246941671066452, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.29475086005992673, |
| "grad_norm": 0.1068756952881813, |
| "learning_rate": 9.887016949089334e-06, |
| "loss": 0.4228, |
| "mean_token_accuracy": 0.03203033060890448, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.29830207524137164, |
| "grad_norm": 0.1452023833990097, |
| "learning_rate": 9.884035772038471e-06, |
| "loss": 0.4389, |
| "mean_token_accuracy": 0.03580652122036554, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.30185329042281656, |
| "grad_norm": 0.11701487749814987, |
| "learning_rate": 9.881016236005686e-06, |
| "loss": 0.4338, |
| "mean_token_accuracy": 0.03185546858730959, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.30540450560426147, |
| "grad_norm": 0.123188316822052, |
| "learning_rate": 9.877958364706269e-06, |
| "loss": 0.4633, |
| "mean_token_accuracy": 0.03289600303105544, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.3089557207857064, |
| "grad_norm": 0.10650799423456192, |
| "learning_rate": 9.874862182156596e-06, |
| "loss": 0.4365, |
| "mean_token_accuracy": 0.032173037085158285, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.31250693596715123, |
| "grad_norm": 0.13090448081493378, |
| "learning_rate": 9.871727712673931e-06, |
| "loss": 0.4148, |
| "mean_token_accuracy": 0.032025952023104765, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.31605815114859614, |
| "grad_norm": 0.1158757209777832, |
| "learning_rate": 9.868554980876253e-06, |
| "loss": 0.434, |
| "mean_token_accuracy": 0.03277148631241289, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.31960936633004106, |
| "grad_norm": 0.1502567082643509, |
| "learning_rate": 9.865344011682038e-06, |
| "loss": 0.4402, |
| "mean_token_accuracy": 0.03716896351761534, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.32316058151148597, |
| "grad_norm": 0.10705456882715225, |
| "learning_rate": 9.86209483031009e-06, |
| "loss": 0.4428, |
| "mean_token_accuracy": 0.028792706354579423, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.3267117966929309, |
| "grad_norm": 0.13711455464363098, |
| "learning_rate": 9.858807462279319e-06, |
| "loss": 0.4311, |
| "mean_token_accuracy": 0.03321756741206627, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.3302630118743758, |
| "grad_norm": 0.13615398108959198, |
| "learning_rate": 9.855481933408557e-06, |
| "loss": 0.4426, |
| "mean_token_accuracy": 0.03406765976615134, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.33381422705582064, |
| "grad_norm": 0.13447901606559753, |
| "learning_rate": 9.852118269816348e-06, |
| "loss": 0.4807, |
| "mean_token_accuracy": 0.029012137778408942, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.33736544223726556, |
| "grad_norm": 0.12379316985607147, |
| "learning_rate": 9.848716497920742e-06, |
| "loss": 0.4446, |
| "mean_token_accuracy": 0.030392555565413204, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.34091665741871047, |
| "grad_norm": 0.1602836698293686, |
| "learning_rate": 9.845276644439093e-06, |
| "loss": 0.4601, |
| "mean_token_accuracy": 0.03084721965751669, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.3444678726001554, |
| "grad_norm": 0.08885196596384048, |
| "learning_rate": 9.841798736387846e-06, |
| "loss": 0.4065, |
| "mean_token_accuracy": 0.03453358236947679, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.3480190877816003, |
| "grad_norm": 0.12009483575820923, |
| "learning_rate": 9.838282801082322e-06, |
| "loss": 0.473, |
| "mean_token_accuracy": 0.03147461433763965, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.35157030296304514, |
| "grad_norm": 0.1066279485821724, |
| "learning_rate": 9.834728866136506e-06, |
| "loss": 0.4057, |
| "mean_token_accuracy": 0.03152646504895529, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.35512151814449006, |
| "grad_norm": 0.11169735342264175, |
| "learning_rate": 9.831136959462835e-06, |
| "loss": 0.4499, |
| "mean_token_accuracy": 0.03387650641161599, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.35867273332593497, |
| "grad_norm": 0.12812377512454987, |
| "learning_rate": 9.82750710927197e-06, |
| "loss": 0.4314, |
| "mean_token_accuracy": 0.03172285951222875, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.3622239485073799, |
| "grad_norm": 0.1205376535654068, |
| "learning_rate": 9.823839344072582e-06, |
| "loss": 0.4214, |
| "mean_token_accuracy": 0.03345074072058196, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.3657751636888248, |
| "grad_norm": 0.12378670275211334, |
| "learning_rate": 9.820133692671116e-06, |
| "loss": 0.439, |
| "mean_token_accuracy": 0.029327621996344533, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.36932637887026964, |
| "grad_norm": 0.1197931170463562, |
| "learning_rate": 9.816390184171587e-06, |
| "loss": 0.4552, |
| "mean_token_accuracy": 0.03034232833306305, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.37287759405171456, |
| "grad_norm": 0.13017283380031586, |
| "learning_rate": 9.812608847975327e-06, |
| "loss": 0.4246, |
| "mean_token_accuracy": 0.03273073174932506, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.37642880923315947, |
| "grad_norm": 0.11206940561532974, |
| "learning_rate": 9.808789713780768e-06, |
| "loss": 0.43, |
| "mean_token_accuracy": 0.033182675353600644, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.3799800244146044, |
| "grad_norm": 0.15535788238048553, |
| "learning_rate": 9.804932811583208e-06, |
| "loss": 0.4609, |
| "mean_token_accuracy": 0.029387073122052243, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.3835312395960493, |
| "grad_norm": 0.10909680277109146, |
| "learning_rate": 9.801038171674571e-06, |
| "loss": 0.4159, |
| "mean_token_accuracy": 0.03437764603586402, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.3870824547774942, |
| "grad_norm": 0.09239600598812103, |
| "learning_rate": 9.797105824643171e-06, |
| "loss": 0.4191, |
| "mean_token_accuracy": 0.03158426017398597, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.39063366995893906, |
| "grad_norm": 0.1135198250412941, |
| "learning_rate": 9.793135801373472e-06, |
| "loss": 0.4322, |
| "mean_token_accuracy": 0.029844234144547954, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.39418488514038397, |
| "grad_norm": 0.13279543817043304, |
| "learning_rate": 9.789128133045846e-06, |
| "loss": 0.4734, |
| "mean_token_accuracy": 0.034466699355107266, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.3977361003218289, |
| "grad_norm": 0.1167665496468544, |
| "learning_rate": 9.785082851136327e-06, |
| "loss": 0.4249, |
| "mean_token_accuracy": 0.03815961653708655, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.4012873155032738, |
| "grad_norm": 0.10479965060949326, |
| "learning_rate": 9.780999987416363e-06, |
| "loss": 0.4256, |
| "mean_token_accuracy": 0.03297240539905033, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.4048385306847187, |
| "grad_norm": 0.11566231399774551, |
| "learning_rate": 9.776879573952573e-06, |
| "loss": 0.4347, |
| "mean_token_accuracy": 0.03426292850053869, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.40838974586616356, |
| "grad_norm": 0.13041988015174866, |
| "learning_rate": 9.772721643106483e-06, |
| "loss": 0.4171, |
| "mean_token_accuracy": 0.030951603603170952, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.41194096104760847, |
| "grad_norm": 0.13270442187786102, |
| "learning_rate": 9.768526227534286e-06, |
| "loss": 0.4541, |
| "mean_token_accuracy": 0.030458911915047793, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.4154921762290534, |
| "grad_norm": 0.1140994057059288, |
| "learning_rate": 9.764293360186568e-06, |
| "loss": 0.4252, |
| "mean_token_accuracy": 0.032363708789489465, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.4190433914104983, |
| "grad_norm": 0.10969191789627075, |
| "learning_rate": 9.760023074308067e-06, |
| "loss": 0.4403, |
| "mean_token_accuracy": 0.03104240054017282, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.4225946065919432, |
| "grad_norm": 0.10822800546884537, |
| "learning_rate": 9.755715403437405e-06, |
| "loss": 0.4271, |
| "mean_token_accuracy": 0.027545168155484134, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.42614582177338806, |
| "grad_norm": 0.10963085293769836, |
| "learning_rate": 9.75137038140682e-06, |
| "loss": 0.413, |
| "mean_token_accuracy": 0.030798182297075982, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.42969703695483297, |
| "grad_norm": 0.11720361560583115, |
| "learning_rate": 9.746988042341907e-06, |
| "loss": 0.4112, |
| "mean_token_accuracy": 0.03484758762715501, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.4332482521362779, |
| "grad_norm": 0.12010498344898224, |
| "learning_rate": 9.742568420661347e-06, |
| "loss": 0.4248, |
| "mean_token_accuracy": 0.034901620656455634, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.4367994673177228, |
| "grad_norm": 0.11497566103935242, |
| "learning_rate": 9.738111551076633e-06, |
| "loss": 0.4231, |
| "mean_token_accuracy": 0.03080306958872825, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.4403506824991677, |
| "grad_norm": 0.1160268560051918, |
| "learning_rate": 9.733617468591806e-06, |
| "loss": 0.4501, |
| "mean_token_accuracy": 0.029371788314165315, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.4439018976806126, |
| "grad_norm": 0.11650796234607697, |
| "learning_rate": 9.729086208503174e-06, |
| "loss": 0.4168, |
| "mean_token_accuracy": 0.03356469544087304, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.44745311286205747, |
| "grad_norm": 0.13858026266098022, |
| "learning_rate": 9.724517806399035e-06, |
| "loss": 0.4122, |
| "mean_token_accuracy": 0.030320848196424777, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.4510043280435024, |
| "grad_norm": 0.09430894255638123, |
| "learning_rate": 9.7199122981594e-06, |
| "loss": 0.4063, |
| "mean_token_accuracy": 0.03472123346364242, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.4545555432249473, |
| "grad_norm": 0.11508966982364655, |
| "learning_rate": 9.715269719955708e-06, |
| "loss": 0.4464, |
| "mean_token_accuracy": 0.032851900316018146, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.4581067584063922, |
| "grad_norm": 0.11557421833276749, |
| "learning_rate": 9.710590108250546e-06, |
| "loss": 0.4382, |
| "mean_token_accuracy": 0.03142100031618611, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.4616579735878371, |
| "grad_norm": 0.11115779727697372, |
| "learning_rate": 9.705873499797358e-06, |
| "loss": 0.4127, |
| "mean_token_accuracy": 0.030225146732846042, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.46520918876928197, |
| "grad_norm": 0.13472823798656464, |
| "learning_rate": 9.701119931640161e-06, |
| "loss": 0.4499, |
| "mean_token_accuracy": 0.03345891845674487, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.4687604039507269, |
| "grad_norm": 0.14002355933189392, |
| "learning_rate": 9.69632944111325e-06, |
| "loss": 0.4515, |
| "mean_token_accuracy": 0.02956337868090486, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.4723116191321718, |
| "grad_norm": 0.11660108715295792, |
| "learning_rate": 9.691502065840905e-06, |
| "loss": 0.4259, |
| "mean_token_accuracy": 0.03448187607500586, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.4758628343136167, |
| "grad_norm": 0.1005408763885498, |
| "learning_rate": 9.686637843737104e-06, |
| "loss": 0.4246, |
| "mean_token_accuracy": 0.0317718359438004, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.4794140494950616, |
| "grad_norm": 0.11530707031488419, |
| "learning_rate": 9.681736813005207e-06, |
| "loss": 0.4211, |
| "mean_token_accuracy": 0.028531658732390497, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.48296526467650647, |
| "grad_norm": 0.11586639285087585, |
| "learning_rate": 9.676799012137678e-06, |
| "loss": 0.4539, |
| "mean_token_accuracy": 0.035154912646248704, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.4865164798579514, |
| "grad_norm": 0.11311322450637817, |
| "learning_rate": 9.671824479915768e-06, |
| "loss": 0.4323, |
| "mean_token_accuracy": 0.03284673898087931, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.4900676950393963, |
| "grad_norm": 0.14726871252059937, |
| "learning_rate": 9.666813255409212e-06, |
| "loss": 0.4535, |
| "mean_token_accuracy": 0.02753433164252783, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.4936189102208412, |
| "grad_norm": 0.10065948218107224, |
| "learning_rate": 9.661765377975924e-06, |
| "loss": 0.4168, |
| "mean_token_accuracy": 0.034299176466447534, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.4971701254022861, |
| "grad_norm": 0.13521119952201843, |
| "learning_rate": 9.656680887261693e-06, |
| "loss": 0.4413, |
| "mean_token_accuracy": 0.03242656226575491, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.500721340583731, |
| "grad_norm": 0.11620158702135086, |
| "learning_rate": 9.651559823199865e-06, |
| "loss": 0.4219, |
| "mean_token_accuracy": 0.03236671327249496, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.5042725557651759, |
| "grad_norm": 0.13755175471305847, |
| "learning_rate": 9.646402226011028e-06, |
| "loss": 0.4646, |
| "mean_token_accuracy": 0.027886055768249207, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.5078237709466208, |
| "grad_norm": 0.12309759855270386, |
| "learning_rate": 9.641208136202705e-06, |
| "loss": 0.4387, |
| "mean_token_accuracy": 0.033787832529924344, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.5113749861280656, |
| "grad_norm": 0.09280434995889664, |
| "learning_rate": 9.635977594569025e-06, |
| "loss": 0.4072, |
| "mean_token_accuracy": 0.03402286476557492, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.5149262013095106, |
| "grad_norm": 0.10794156789779663, |
| "learning_rate": 9.630710642190412e-06, |
| "loss": 0.4207, |
| "mean_token_accuracy": 0.03402460503275506, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.5184774164909555, |
| "grad_norm": 0.12627671658992767, |
| "learning_rate": 9.625407320433257e-06, |
| "loss": 0.4509, |
| "mean_token_accuracy": 0.03308519825441181, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.5220286316724004, |
| "grad_norm": 0.11183463037014008, |
| "learning_rate": 9.620067670949593e-06, |
| "loss": 0.447, |
| "mean_token_accuracy": 0.030306155767902965, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.5255798468538453, |
| "grad_norm": 0.12490588426589966, |
| "learning_rate": 9.614691735676768e-06, |
| "loss": 0.438, |
| "mean_token_accuracy": 0.02836215259230812, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.5291310620352903, |
| "grad_norm": 0.09944428503513336, |
| "learning_rate": 9.609279556837122e-06, |
| "loss": 0.4048, |
| "mean_token_accuracy": 0.034694020345341414, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.5326822772167351, |
| "grad_norm": 0.11730767041444778, |
| "learning_rate": 9.603831176937645e-06, |
| "loss": 0.4741, |
| "mean_token_accuracy": 0.03214656777709024, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.53623349239818, |
| "grad_norm": 0.1205916702747345, |
| "learning_rate": 9.598346638769653e-06, |
| "loss": 0.4363, |
| "mean_token_accuracy": 0.02935866809639265, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.5397847075796249, |
| "grad_norm": 0.11929334700107574, |
| "learning_rate": 9.592825985408443e-06, |
| "loss": 0.4471, |
| "mean_token_accuracy": 0.033947633866773685, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.5433359227610698, |
| "grad_norm": 0.12662938237190247, |
| "learning_rate": 9.58726926021296e-06, |
| "loss": 0.4543, |
| "mean_token_accuracy": 0.028889564997371053, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.5468871379425148, |
| "grad_norm": 0.11105228215456009, |
| "learning_rate": 9.581676506825458e-06, |
| "loss": 0.4367, |
| "mean_token_accuracy": 0.03555938474892173, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.5504383531239596, |
| "grad_norm": 0.11740686744451523, |
| "learning_rate": 9.576047769171154e-06, |
| "loss": 0.4299, |
| "mean_token_accuracy": 0.03297806582850171, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.5539895683054045, |
| "grad_norm": 0.11532069742679596, |
| "learning_rate": 9.57038309145788e-06, |
| "loss": 0.4662, |
| "mean_token_accuracy": 0.02882920480624307, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.5575407834868494, |
| "grad_norm": 0.11282172054052353, |
| "learning_rate": 9.564682518175745e-06, |
| "loss": 0.4591, |
| "mean_token_accuracy": 0.03218559510423802, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.5610919986682943, |
| "grad_norm": 0.09830707311630249, |
| "learning_rate": 9.558946094096773e-06, |
| "loss": 0.4048, |
| "mean_token_accuracy": 0.030819052895822097, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.5646432138497393, |
| "grad_norm": 0.13022297620773315, |
| "learning_rate": 9.553173864274567e-06, |
| "loss": 0.4528, |
| "mean_token_accuracy": 0.031908016972010955, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.5681944290311841, |
| "grad_norm": 0.09976419061422348, |
| "learning_rate": 9.547365874043939e-06, |
| "loss": 0.4228, |
| "mean_token_accuracy": 0.03265460357397387, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.571745644212629, |
| "grad_norm": 0.12907098233699799, |
| "learning_rate": 9.541522169020568e-06, |
| "loss": 0.4338, |
| "mean_token_accuracy": 0.03252671978043509, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.5752968593940739, |
| "grad_norm": 0.11115579307079315, |
| "learning_rate": 9.535642795100628e-06, |
| "loss": 0.4329, |
| "mean_token_accuracy": 0.03106809964447166, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.5788480745755188, |
| "grad_norm": 0.10402530431747437, |
| "learning_rate": 9.529727798460443e-06, |
| "loss": 0.4158, |
| "mean_token_accuracy": 0.02999804872160894, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.5823992897569638, |
| "grad_norm": 0.11144474148750305, |
| "learning_rate": 9.52377722555611e-06, |
| "loss": 0.4318, |
| "mean_token_accuracy": 0.031277922937078984, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.5859505049384086, |
| "grad_norm": 0.11141372472047806, |
| "learning_rate": 9.517791123123141e-06, |
| "loss": 0.4513, |
| "mean_token_accuracy": 0.030790336892096093, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5895017201198535, |
| "grad_norm": 0.11419973522424698, |
| "learning_rate": 9.5117695381761e-06, |
| "loss": 0.4478, |
| "mean_token_accuracy": 0.03272564147846424, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.5930529353012984, |
| "grad_norm": 0.11346927285194397, |
| "learning_rate": 9.50571251800822e-06, |
| "loss": 0.4243, |
| "mean_token_accuracy": 0.034652669322895235, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.5966041504827433, |
| "grad_norm": 0.10645310580730438, |
| "learning_rate": 9.49962011019105e-06, |
| "loss": 0.4291, |
| "mean_token_accuracy": 0.030576513236155733, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.6001553656641883, |
| "grad_norm": 0.13944825530052185, |
| "learning_rate": 9.493492362574069e-06, |
| "loss": 0.4506, |
| "mean_token_accuracy": 0.032292869611410424, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.6037065808456331, |
| "grad_norm": 0.12811113893985748, |
| "learning_rate": 9.487329323284306e-06, |
| "loss": 0.4272, |
| "mean_token_accuracy": 0.028058004420017824, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.607257796027078, |
| "grad_norm": 0.12232954055070877, |
| "learning_rate": 9.481131040725982e-06, |
| "loss": 0.4304, |
| "mean_token_accuracy": 0.030032273742108373, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.6108090112085229, |
| "grad_norm": 0.12088574469089508, |
| "learning_rate": 9.474897563580105e-06, |
| "loss": 0.4298, |
| "mean_token_accuracy": 0.03754826654039789, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.6143602263899678, |
| "grad_norm": 0.14544999599456787, |
| "learning_rate": 9.468628940804109e-06, |
| "loss": 0.4611, |
| "mean_token_accuracy": 0.02830790860025445, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.6179114415714128, |
| "grad_norm": 0.12021893262863159, |
| "learning_rate": 9.46232522163145e-06, |
| "loss": 0.4297, |
| "mean_token_accuracy": 0.031058607095474144, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.6214626567528576, |
| "grad_norm": 0.11498667299747467, |
| "learning_rate": 9.45598645557124e-06, |
| "loss": 0.4365, |
| "mean_token_accuracy": 0.03069377435167553, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.6250138719343025, |
| "grad_norm": 0.11569106578826904, |
| "learning_rate": 9.44961269240784e-06, |
| "loss": 0.4524, |
| "mean_token_accuracy": 0.031692910630226834, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.6285650871157474, |
| "grad_norm": 0.11758638918399811, |
| "learning_rate": 9.443203982200479e-06, |
| "loss": 0.4201, |
| "mean_token_accuracy": 0.028326944633590756, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.6321163022971923, |
| "grad_norm": 0.12534549832344055, |
| "learning_rate": 9.436760375282858e-06, |
| "loss": 0.4518, |
| "mean_token_accuracy": 0.03576031195188989, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.6356675174786373, |
| "grad_norm": 0.12341169267892838, |
| "learning_rate": 9.430281922262758e-06, |
| "loss": 0.4191, |
| "mean_token_accuracy": 0.03796804480589344, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.6392187326600821, |
| "grad_norm": 0.12558291852474213, |
| "learning_rate": 9.423768674021638e-06, |
| "loss": 0.4382, |
| "mean_token_accuracy": 0.030642931031252374, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.642769947841527, |
| "grad_norm": 0.10039878636598587, |
| "learning_rate": 9.417220681714232e-06, |
| "loss": 0.4258, |
| "mean_token_accuracy": 0.031603102244844195, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.6463211630229719, |
| "grad_norm": 0.1142401471734047, |
| "learning_rate": 9.410637996768161e-06, |
| "loss": 0.4323, |
| "mean_token_accuracy": 0.030828532620944316, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.6498723782044168, |
| "grad_norm": 0.1079002246260643, |
| "learning_rate": 9.404020670883511e-06, |
| "loss": 0.4113, |
| "mean_token_accuracy": 0.03495721969375154, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.6534235933858618, |
| "grad_norm": 0.11479309946298599, |
| "learning_rate": 9.397368756032445e-06, |
| "loss": 0.444, |
| "mean_token_accuracy": 0.030968798611866077, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.6569748085673066, |
| "grad_norm": 0.12519249320030212, |
| "learning_rate": 9.390682304458782e-06, |
| "loss": 0.4447, |
| "mean_token_accuracy": 0.03184685645828722, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.6605260237487516, |
| "grad_norm": 0.13242708146572113, |
| "learning_rate": 9.38396136867759e-06, |
| "loss": 0.4325, |
| "mean_token_accuracy": 0.03234595538378926, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.6640772389301964, |
| "grad_norm": 0.1431904435157776, |
| "learning_rate": 9.377206001474773e-06, |
| "loss": 0.4637, |
| "mean_token_accuracy": 0.02888246741349576, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.6676284541116413, |
| "grad_norm": 0.12454501539468765, |
| "learning_rate": 9.370416255906663e-06, |
| "loss": 0.4437, |
| "mean_token_accuracy": 0.03247980809828732, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.6711796692930863, |
| "grad_norm": 0.142369344830513, |
| "learning_rate": 9.363592185299593e-06, |
| "loss": 0.4505, |
| "mean_token_accuracy": 0.03117010975802259, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.6747308844745311, |
| "grad_norm": 0.11705569177865982, |
| "learning_rate": 9.356733843249487e-06, |
| "loss": 0.4729, |
| "mean_token_accuracy": 0.030519108702719677, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6782820996559761, |
| "grad_norm": 0.10681698471307755, |
| "learning_rate": 9.349841283621432e-06, |
| "loss": 0.4472, |
| "mean_token_accuracy": 0.02811988699613721, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.6818333148374209, |
| "grad_norm": 0.1344628483057022, |
| "learning_rate": 9.34291456054926e-06, |
| "loss": 0.4646, |
| "mean_token_accuracy": 0.03209848375627189, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.6853845300188658, |
| "grad_norm": 0.12343592941761017, |
| "learning_rate": 9.33595372843512e-06, |
| "loss": 0.4418, |
| "mean_token_accuracy": 0.028969483559194487, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.6889357452003108, |
| "grad_norm": 0.10727835446596146, |
| "learning_rate": 9.328958841949056e-06, |
| "loss": 0.4168, |
| "mean_token_accuracy": 0.03168499671301106, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.6924869603817556, |
| "grad_norm": 0.1320902556180954, |
| "learning_rate": 9.321929956028565e-06, |
| "loss": 0.4324, |
| "mean_token_accuracy": 0.030861409675708273, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6960381755632006, |
| "grad_norm": 0.10983603447675705, |
| "learning_rate": 9.31486712587818e-06, |
| "loss": 0.4466, |
| "mean_token_accuracy": 0.03086903478106251, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6995893907446454, |
| "grad_norm": 0.12487445771694183, |
| "learning_rate": 9.307770406969032e-06, |
| "loss": 0.3974, |
| "mean_token_accuracy": 0.03599889015458757, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.7031406059260903, |
| "grad_norm": 0.12027975171804428, |
| "learning_rate": 9.300639855038405e-06, |
| "loss": 0.4511, |
| "mean_token_accuracy": 0.03203024027243373, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.7066918211075353, |
| "grad_norm": 0.12009831517934799, |
| "learning_rate": 9.293475526089316e-06, |
| "loss": 0.4304, |
| "mean_token_accuracy": 0.031490876361203846, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.7102430362889801, |
| "grad_norm": 0.12162379175424576, |
| "learning_rate": 9.286277476390056e-06, |
| "loss": 0.4525, |
| "mean_token_accuracy": 0.029639694414072437, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7137942514704251, |
| "grad_norm": 0.1278138905763626, |
| "learning_rate": 9.279045762473764e-06, |
| "loss": 0.4715, |
| "mean_token_accuracy": 0.030264464585343376, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.7173454666518699, |
| "grad_norm": 0.10495288670063019, |
| "learning_rate": 9.27178044113797e-06, |
| "loss": 0.4277, |
| "mean_token_accuracy": 0.03156335685889644, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.7208966818333148, |
| "grad_norm": 0.12295868992805481, |
| "learning_rate": 9.264481569444157e-06, |
| "loss": 0.4545, |
| "mean_token_accuracy": 0.030497470339469146, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.7244478970147598, |
| "grad_norm": 0.09332438558340073, |
| "learning_rate": 9.257149204717317e-06, |
| "loss": 0.3944, |
| "mean_token_accuracy": 0.03187960746254248, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.7279991121962046, |
| "grad_norm": 0.14773069322109222, |
| "learning_rate": 9.249783404545488e-06, |
| "loss": 0.4389, |
| "mean_token_accuracy": 0.0344977921267855, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.7315503273776496, |
| "grad_norm": 0.12294626981019974, |
| "learning_rate": 9.242384226779308e-06, |
| "loss": 0.4572, |
| "mean_token_accuracy": 0.03103074165119324, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.7351015425590944, |
| "grad_norm": 0.13636770844459534, |
| "learning_rate": 9.234951729531564e-06, |
| "loss": 0.4458, |
| "mean_token_accuracy": 0.0317818928451743, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.7386527577405393, |
| "grad_norm": 0.10994532704353333, |
| "learning_rate": 9.227485971176734e-06, |
| "loss": 0.4293, |
| "mean_token_accuracy": 0.03580091031108168, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.7422039729219843, |
| "grad_norm": 0.13844896852970123, |
| "learning_rate": 9.219987010350522e-06, |
| "loss": 0.4661, |
| "mean_token_accuracy": 0.031334696937847184, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.7457551881034291, |
| "grad_norm": 0.12561361491680145, |
| "learning_rate": 9.212454905949406e-06, |
| "loss": 0.4398, |
| "mean_token_accuracy": 0.03021751243250037, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7493064032848741, |
| "grad_norm": 0.09732896089553833, |
| "learning_rate": 9.204889717130172e-06, |
| "loss": 0.4107, |
| "mean_token_accuracy": 0.03446503423401737, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.7528576184663189, |
| "grad_norm": 0.1296277940273285, |
| "learning_rate": 9.197291503309448e-06, |
| "loss": 0.4356, |
| "mean_token_accuracy": 0.03087400232834625, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.7564088336477638, |
| "grad_norm": 0.10486706346273422, |
| "learning_rate": 9.189660324163243e-06, |
| "loss": 0.418, |
| "mean_token_accuracy": 0.035182704639737494, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.7599600488292088, |
| "grad_norm": 0.12244177609682083, |
| "learning_rate": 9.181996239626468e-06, |
| "loss": 0.4523, |
| "mean_token_accuracy": 0.031129384631640278, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.7635112640106536, |
| "grad_norm": 0.11858798563480377, |
| "learning_rate": 9.174299309892474e-06, |
| "loss": 0.4207, |
| "mean_token_accuracy": 0.03296038699045312, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.7670624791920986, |
| "grad_norm": 0.11604490876197815, |
| "learning_rate": 9.166569595412576e-06, |
| "loss": 0.4627, |
| "mean_token_accuracy": 0.030237445222155657, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.7706136943735434, |
| "grad_norm": 0.1077352985739708, |
| "learning_rate": 9.158807156895581e-06, |
| "loss": 0.4314, |
| "mean_token_accuracy": 0.031499510438152356, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.7741649095549884, |
| "grad_norm": 0.10301569849252701, |
| "learning_rate": 9.151012055307308e-06, |
| "loss": 0.4322, |
| "mean_token_accuracy": 0.03421672209515236, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.7777161247364333, |
| "grad_norm": 0.10384063422679901, |
| "learning_rate": 9.14318435187011e-06, |
| "loss": 0.4272, |
| "mean_token_accuracy": 0.028132701208960498, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.7812673399178781, |
| "grad_norm": 0.13823232054710388, |
| "learning_rate": 9.135324108062391e-06, |
| "loss": 0.4281, |
| "mean_token_accuracy": 0.037649581041478086, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7848185550993231, |
| "grad_norm": 0.12410993129014969, |
| "learning_rate": 9.127431385618129e-06, |
| "loss": 0.418, |
| "mean_token_accuracy": 0.03602545694229775, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.7883697702807679, |
| "grad_norm": 0.117218516767025, |
| "learning_rate": 9.119506246526386e-06, |
| "loss": 0.445, |
| "mean_token_accuracy": 0.03329144358031044, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.7919209854622129, |
| "grad_norm": 0.11457571387290955, |
| "learning_rate": 9.111548753030824e-06, |
| "loss": 0.4492, |
| "mean_token_accuracy": 0.031933008820487885, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.7954722006436578, |
| "grad_norm": 0.12239421904087067, |
| "learning_rate": 9.103558967629211e-06, |
| "loss": 0.4316, |
| "mean_token_accuracy": 0.03384177159387036, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.7990234158251026, |
| "grad_norm": 0.1020677462220192, |
| "learning_rate": 9.09553695307294e-06, |
| "loss": 0.416, |
| "mean_token_accuracy": 0.02926015923367231, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.8025746310065476, |
| "grad_norm": 0.13255175948143005, |
| "learning_rate": 9.087482772366529e-06, |
| "loss": 0.4609, |
| "mean_token_accuracy": 0.029086984148307238, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.8061258461879924, |
| "grad_norm": 0.12250262498855591, |
| "learning_rate": 9.07939648876712e-06, |
| "loss": 0.4503, |
| "mean_token_accuracy": 0.028839589653216535, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.8096770613694374, |
| "grad_norm": 0.1015993133187294, |
| "learning_rate": 9.071278165784001e-06, |
| "loss": 0.4456, |
| "mean_token_accuracy": 0.036248502918169834, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.8132282765508823, |
| "grad_norm": 0.11699055135250092, |
| "learning_rate": 9.063127867178085e-06, |
| "loss": 0.4456, |
| "mean_token_accuracy": 0.03173907856398728, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.8167794917323271, |
| "grad_norm": 0.1485580950975418, |
| "learning_rate": 9.054945656961429e-06, |
| "loss": 0.4553, |
| "mean_token_accuracy": 0.031534753976302454, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8203307069137721, |
| "grad_norm": 0.11307156831026077, |
| "learning_rate": 9.046731599396716e-06, |
| "loss": 0.4558, |
| "mean_token_accuracy": 0.030681278425618075, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.8238819220952169, |
| "grad_norm": 0.13930678367614746, |
| "learning_rate": 9.03848575899676e-06, |
| "loss": 0.4537, |
| "mean_token_accuracy": 0.03255952116523986, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.8274331372766619, |
| "grad_norm": 0.12353396415710449, |
| "learning_rate": 9.030208200523994e-06, |
| "loss": 0.4994, |
| "mean_token_accuracy": 0.02909530242322944, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.8309843524581068, |
| "grad_norm": 0.13621285557746887, |
| "learning_rate": 9.021898988989966e-06, |
| "loss": 0.4975, |
| "mean_token_accuracy": 0.02880604089659755, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.8345355676395516, |
| "grad_norm": 0.12057027965784073, |
| "learning_rate": 9.013558189654819e-06, |
| "loss": 0.4231, |
| "mean_token_accuracy": 0.031169906702416483, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.8380867828209966, |
| "grad_norm": 0.1317608654499054, |
| "learning_rate": 9.005185868026793e-06, |
| "loss": 0.4354, |
| "mean_token_accuracy": 0.032176410291867796, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.8416379980024414, |
| "grad_norm": 0.11058870702981949, |
| "learning_rate": 8.996782089861699e-06, |
| "loss": 0.4211, |
| "mean_token_accuracy": 0.030159930855006678, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.8451892131838864, |
| "grad_norm": 0.11160682886838913, |
| "learning_rate": 8.988346921162407e-06, |
| "loss": 0.4608, |
| "mean_token_accuracy": 0.03140243760572048, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.8487404283653313, |
| "grad_norm": 0.12711158394813538, |
| "learning_rate": 8.979880428178323e-06, |
| "loss": 0.425, |
| "mean_token_accuracy": 0.03014450103364652, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.8522916435467761, |
| "grad_norm": 0.10027121752500534, |
| "learning_rate": 8.971382677404878e-06, |
| "loss": 0.4168, |
| "mean_token_accuracy": 0.0301158644942916, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8558428587282211, |
| "grad_norm": 0.11261257529258728, |
| "learning_rate": 8.962853735582996e-06, |
| "loss": 0.4391, |
| "mean_token_accuracy": 0.03371476338725188, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.8593940739096659, |
| "grad_norm": 0.11643965542316437, |
| "learning_rate": 8.95429366969858e-06, |
| "loss": 0.4387, |
| "mean_token_accuracy": 0.03266353774779418, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.8629452890911109, |
| "grad_norm": 0.1157926544547081, |
| "learning_rate": 8.94570254698197e-06, |
| "loss": 0.4163, |
| "mean_token_accuracy": 0.03458352739835391, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.8664965042725558, |
| "grad_norm": 0.13001175224781036, |
| "learning_rate": 8.93708043490743e-06, |
| "loss": 0.4514, |
| "mean_token_accuracy": 0.033025608005118556, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.8700477194540006, |
| "grad_norm": 0.10545831173658371, |
| "learning_rate": 8.928427401192618e-06, |
| "loss": 0.4254, |
| "mean_token_accuracy": 0.033079178792831954, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.8735989346354456, |
| "grad_norm": 0.13507139682769775, |
| "learning_rate": 8.919743513798044e-06, |
| "loss": 0.4596, |
| "mean_token_accuracy": 0.0301192174811149, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.8771501498168904, |
| "grad_norm": 0.14613857865333557, |
| "learning_rate": 8.911028840926537e-06, |
| "loss": 0.4366, |
| "mean_token_accuracy": 0.032090271219203714, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.8807013649983354, |
| "grad_norm": 0.16336016356945038, |
| "learning_rate": 8.902283451022725e-06, |
| "loss": 0.4568, |
| "mean_token_accuracy": 0.02893733787277597, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.8842525801797803, |
| "grad_norm": 0.1086035892367363, |
| "learning_rate": 8.89350741277247e-06, |
| "loss": 0.4177, |
| "mean_token_accuracy": 0.031324194565968355, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.8878037953612252, |
| "grad_norm": 0.11843832582235336, |
| "learning_rate": 8.884700795102365e-06, |
| "loss": 0.4408, |
| "mean_token_accuracy": 0.030065572103922023, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8913550105426701, |
| "grad_norm": 0.11013603955507278, |
| "learning_rate": 8.875863667179155e-06, |
| "loss": 0.4411, |
| "mean_token_accuracy": 0.03253144204427372, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.8949062257241149, |
| "grad_norm": 0.10877636820077896, |
| "learning_rate": 8.866996098409217e-06, |
| "loss": 0.4445, |
| "mean_token_accuracy": 0.030102188491582638, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.8984574409055599, |
| "grad_norm": 0.12473298609256744, |
| "learning_rate": 8.858098158438013e-06, |
| "loss": 0.4278, |
| "mean_token_accuracy": 0.036179308270220645, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.9020086560870048, |
| "grad_norm": 0.13368239998817444, |
| "learning_rate": 8.849169917149532e-06, |
| "loss": 0.4418, |
| "mean_token_accuracy": 0.0318060622739722, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.9055598712684497, |
| "grad_norm": 0.13873310387134552, |
| "learning_rate": 8.840211444665754e-06, |
| "loss": 0.4442, |
| "mean_token_accuracy": 0.031459740581340156, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.9091110864498946, |
| "grad_norm": 0.11932561546564102, |
| "learning_rate": 8.831222811346088e-06, |
| "loss": 0.4303, |
| "mean_token_accuracy": 0.034126964517781744, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.9126623016313394, |
| "grad_norm": 0.10621127486228943, |
| "learning_rate": 8.822204087786831e-06, |
| "loss": 0.4131, |
| "mean_token_accuracy": 0.028078828796424204, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.9162135168127844, |
| "grad_norm": 0.10923238098621368, |
| "learning_rate": 8.813155344820602e-06, |
| "loss": 0.4328, |
| "mean_token_accuracy": 0.03459552870117477, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.9197647319942293, |
| "grad_norm": 0.1389293372631073, |
| "learning_rate": 8.804076653515792e-06, |
| "loss": 0.4502, |
| "mean_token_accuracy": 0.030979796672909288, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.9233159471756742, |
| "grad_norm": 0.12477507442235947, |
| "learning_rate": 8.794968085176006e-06, |
| "loss": 0.4523, |
| "mean_token_accuracy": 0.030272630028775893, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9268671623571191, |
| "grad_norm": 0.10108153522014618, |
| "learning_rate": 8.785829711339502e-06, |
| "loss": 0.4305, |
| "mean_token_accuracy": 0.032170976937777596, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.9304183775385639, |
| "grad_norm": 0.12623409926891327, |
| "learning_rate": 8.776661603778629e-06, |
| "loss": 0.4341, |
| "mean_token_accuracy": 0.03348069915591623, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.9339695927200089, |
| "grad_norm": 0.1211409643292427, |
| "learning_rate": 8.767463834499261e-06, |
| "loss": 0.429, |
| "mean_token_accuracy": 0.03233881213964196, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.9375208079014538, |
| "grad_norm": 0.12843799591064453, |
| "learning_rate": 8.758236475740236e-06, |
| "loss": 0.4286, |
| "mean_token_accuracy": 0.030366436680196784, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.9410720230828987, |
| "grad_norm": 0.11599805951118469, |
| "learning_rate": 8.748979599972787e-06, |
| "loss": 0.4165, |
| "mean_token_accuracy": 0.030518364896124694, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.9446232382643436, |
| "grad_norm": 0.11995775997638702, |
| "learning_rate": 8.739693279899969e-06, |
| "loss": 0.4613, |
| "mean_token_accuracy": 0.036479957580013433, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.9481744534457884, |
| "grad_norm": 0.11425738781690598, |
| "learning_rate": 8.730377588456092e-06, |
| "loss": 0.4501, |
| "mean_token_accuracy": 0.03359218502737349, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.9517256686272334, |
| "grad_norm": 0.12363097071647644, |
| "learning_rate": 8.72103259880615e-06, |
| "loss": 0.383, |
| "mean_token_accuracy": 0.03248222741785867, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.9552768838086783, |
| "grad_norm": 0.14224040508270264, |
| "learning_rate": 8.711658384345244e-06, |
| "loss": 0.4453, |
| "mean_token_accuracy": 0.02758350678777788, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.9588280989901232, |
| "grad_norm": 0.11947707086801529, |
| "learning_rate": 8.702255018698e-06, |
| "loss": 0.4427, |
| "mean_token_accuracy": 0.030479307537461864, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9623793141715681, |
| "grad_norm": 0.11489012092351913, |
| "learning_rate": 8.692822575718e-06, |
| "loss": 0.4322, |
| "mean_token_accuracy": 0.032157614528841805, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.9659305293530129, |
| "grad_norm": 0.13005125522613525, |
| "learning_rate": 8.683361129487198e-06, |
| "loss": 0.4471, |
| "mean_token_accuracy": 0.0325647444005881, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.9694817445344579, |
| "grad_norm": 0.1157633364200592, |
| "learning_rate": 8.673870754315336e-06, |
| "loss": 0.4213, |
| "mean_token_accuracy": 0.03665181735595979, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.9730329597159028, |
| "grad_norm": 0.10300834476947784, |
| "learning_rate": 8.664351524739368e-06, |
| "loss": 0.4311, |
| "mean_token_accuracy": 0.030481873731332598, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.9765841748973477, |
| "grad_norm": 0.10374599695205688, |
| "learning_rate": 8.65480351552286e-06, |
| "loss": 0.3858, |
| "mean_token_accuracy": 0.029714216772845248, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.9801353900787926, |
| "grad_norm": 0.10875561088323593, |
| "learning_rate": 8.645226801655418e-06, |
| "loss": 0.4303, |
| "mean_token_accuracy": 0.03389121513828286, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.9836866052602374, |
| "grad_norm": 0.10975569486618042, |
| "learning_rate": 8.635621458352094e-06, |
| "loss": 0.4157, |
| "mean_token_accuracy": 0.030311406939290464, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.9872378204416824, |
| "grad_norm": 0.11089649796485901, |
| "learning_rate": 8.625987561052789e-06, |
| "loss": 0.4503, |
| "mean_token_accuracy": 0.03370039981746231, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.9907890356231273, |
| "grad_norm": 0.13259300589561462, |
| "learning_rate": 8.616325185421673e-06, |
| "loss": 0.4693, |
| "mean_token_accuracy": 0.029472638332663337, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.9943402508045722, |
| "grad_norm": 0.1153963953256607, |
| "learning_rate": 8.606634407346575e-06, |
| "loss": 0.4639, |
| "mean_token_accuracy": 0.033020730788848596, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9978914659860171, |
| "grad_norm": 0.10748148709535599, |
| "learning_rate": 8.596915302938403e-06, |
| "loss": 0.4285, |
| "mean_token_accuracy": 0.030481149649858708, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.06762544065713882, |
| "learning_rate": 8.587167948530533e-06, |
| "loss": 0.2615, |
| "mean_token_accuracy": 0.028144311064878774, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.003551215181445, |
| "grad_norm": 0.12099552899599075, |
| "learning_rate": 8.577392420678217e-06, |
| "loss": 0.4319, |
| "mean_token_accuracy": 0.03571089356773882, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.0071024303628897, |
| "grad_norm": 0.11393958330154419, |
| "learning_rate": 8.567588796157983e-06, |
| "loss": 0.4351, |
| "mean_token_accuracy": 0.03345001182970009, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.0106536455443347, |
| "grad_norm": 0.12373528629541397, |
| "learning_rate": 8.557757151967025e-06, |
| "loss": 0.428, |
| "mean_token_accuracy": 0.030941195098421304, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.0142048607257796, |
| "grad_norm": 0.10352396219968796, |
| "learning_rate": 8.547897565322601e-06, |
| "loss": 0.4078, |
| "mean_token_accuracy": 0.03218995450879447, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.0177560759072246, |
| "grad_norm": 0.11134106665849686, |
| "learning_rate": 8.538010113661434e-06, |
| "loss": 0.4118, |
| "mean_token_accuracy": 0.033460683858720586, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.0213072910886694, |
| "grad_norm": 0.11754720658063889, |
| "learning_rate": 8.528094874639092e-06, |
| "loss": 0.4467, |
| "mean_token_accuracy": 0.032714871398638934, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.0248585062701143, |
| "grad_norm": 0.12311739474534988, |
| "learning_rate": 8.518151926129384e-06, |
| "loss": 0.4248, |
| "mean_token_accuracy": 0.03665415535760985, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.0284097214515593, |
| "grad_norm": 0.11815127730369568, |
| "learning_rate": 8.508181346223749e-06, |
| "loss": 0.4683, |
| "mean_token_accuracy": 0.03141742098341638, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.031960936633004, |
| "grad_norm": 0.13334578275680542, |
| "learning_rate": 8.498183213230646e-06, |
| "loss": 0.4357, |
| "mean_token_accuracy": 0.034682282523135655, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.035512151814449, |
| "grad_norm": 0.11024576425552368, |
| "learning_rate": 8.488157605674924e-06, |
| "loss": 0.4039, |
| "mean_token_accuracy": 0.03140191955390037, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.039063366995894, |
| "grad_norm": 0.11426277458667755, |
| "learning_rate": 8.478104602297226e-06, |
| "loss": 0.4646, |
| "mean_token_accuracy": 0.0302239565171476, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.0426145821773387, |
| "grad_norm": 0.13004811108112335, |
| "learning_rate": 8.468024282053357e-06, |
| "loss": 0.4405, |
| "mean_token_accuracy": 0.030375482703675516, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.0461657973587837, |
| "grad_norm": 0.11746154725551605, |
| "learning_rate": 8.457916724113667e-06, |
| "loss": 0.4623, |
| "mean_token_accuracy": 0.03180732572218403, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.0497170125402286, |
| "grad_norm": 0.1322462111711502, |
| "learning_rate": 8.447782007862427e-06, |
| "loss": 0.4509, |
| "mean_token_accuracy": 0.03127452934131725, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.0532682277216736, |
| "grad_norm": 0.11387277394533157, |
| "learning_rate": 8.437620212897213e-06, |
| "loss": 0.4601, |
| "mean_token_accuracy": 0.03242337591655087, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.0568194429031184, |
| "grad_norm": 0.1132403239607811, |
| "learning_rate": 8.427431419028273e-06, |
| "loss": 0.4225, |
| "mean_token_accuracy": 0.03050946029179613, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.0603706580845633, |
| "grad_norm": 0.11399143934249878, |
| "learning_rate": 8.417215706277905e-06, |
| "loss": 0.4096, |
| "mean_token_accuracy": 0.030746224825634272, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.0639218732660083, |
| "grad_norm": 0.1122201532125473, |
| "learning_rate": 8.406973154879826e-06, |
| "loss": 0.4719, |
| "mean_token_accuracy": 0.030868174064380582, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.067473088447453, |
| "grad_norm": 0.11641374230384827, |
| "learning_rate": 8.396703845278537e-06, |
| "loss": 0.4785, |
| "mean_token_accuracy": 0.03244481591536896, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.071024303628898, |
| "grad_norm": 0.11136434972286224, |
| "learning_rate": 8.386407858128707e-06, |
| "loss": 0.414, |
| "mean_token_accuracy": 0.03586079240631079, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.074575518810343, |
| "grad_norm": 0.0993858203291893, |
| "learning_rate": 8.376085274294518e-06, |
| "loss": 0.4497, |
| "mean_token_accuracy": 0.031207030300720362, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.0781267339917877, |
| "grad_norm": 0.12647627294063568, |
| "learning_rate": 8.365736174849053e-06, |
| "loss": 0.4451, |
| "mean_token_accuracy": 0.032390626991400495, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.0816779491732327, |
| "grad_norm": 0.10189671069383621, |
| "learning_rate": 8.355360641073637e-06, |
| "loss": 0.4146, |
| "mean_token_accuracy": 0.03132528428795922, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.0852291643546776, |
| "grad_norm": 0.13301630318164825, |
| "learning_rate": 8.344958754457214e-06, |
| "loss": 0.4729, |
| "mean_token_accuracy": 0.030442484636296285, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.0887803795361226, |
| "grad_norm": 0.10021793097257614, |
| "learning_rate": 8.3345305966957e-06, |
| "loss": 0.4336, |
| "mean_token_accuracy": 0.029210989938292187, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.0923315947175674, |
| "grad_norm": 0.1116681918501854, |
| "learning_rate": 8.324076249691347e-06, |
| "loss": 0.4579, |
| "mean_token_accuracy": 0.03265956621180521, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.0958828098990123, |
| "grad_norm": 0.12556776404380798, |
| "learning_rate": 8.31359579555209e-06, |
| "loss": 0.4345, |
| "mean_token_accuracy": 0.03397847878295579, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.0994340250804573, |
| "grad_norm": 0.12205666303634644, |
| "learning_rate": 8.30308931659091e-06, |
| "loss": 0.4724, |
| "mean_token_accuracy": 0.031116866441152524, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.102985240261902, |
| "grad_norm": 0.11637762933969498, |
| "learning_rate": 8.292556895325195e-06, |
| "loss": 0.4209, |
| "mean_token_accuracy": 0.031364490951091284, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.106536455443347, |
| "grad_norm": 0.10156012326478958, |
| "learning_rate": 8.281998614476066e-06, |
| "loss": 0.4303, |
| "mean_token_accuracy": 0.03230309693390154, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.110087670624792, |
| "grad_norm": 0.12292397022247314, |
| "learning_rate": 8.271414556967758e-06, |
| "loss": 0.4684, |
| "mean_token_accuracy": 0.030929933149309363, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.1136388858062367, |
| "grad_norm": 0.12097756564617157, |
| "learning_rate": 8.260804805926948e-06, |
| "loss": 0.4372, |
| "mean_token_accuracy": 0.033056139553082176, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.1171901009876817, |
| "grad_norm": 0.12706424295902252, |
| "learning_rate": 8.250169444682109e-06, |
| "loss": 0.4453, |
| "mean_token_accuracy": 0.028371741809678497, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.1207413161691266, |
| "grad_norm": 0.11325722187757492, |
| "learning_rate": 8.239508556762857e-06, |
| "loss": 0.4328, |
| "mean_token_accuracy": 0.02932896776837879, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.1242925313505716, |
| "grad_norm": 0.11459724605083466, |
| "learning_rate": 8.228822225899294e-06, |
| "loss": 0.4533, |
| "mean_token_accuracy": 0.03276748675489216, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.1278437465320164, |
| "grad_norm": 0.11599034070968628, |
| "learning_rate": 8.218110536021347e-06, |
| "loss": 0.4169, |
| "mean_token_accuracy": 0.032454707339638844, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.1313949617134613, |
| "grad_norm": 0.11511659622192383, |
| "learning_rate": 8.207373571258113e-06, |
| "loss": 0.4274, |
| "mean_token_accuracy": 0.031180168389255414, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.1349461768949063, |
| "grad_norm": 0.1145598515868187, |
| "learning_rate": 8.196611415937196e-06, |
| "loss": 0.4306, |
| "mean_token_accuracy": 0.032077380437840475, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.138497392076351, |
| "grad_norm": 0.12687522172927856, |
| "learning_rate": 8.18582415458405e-06, |
| "loss": 0.4119, |
| "mean_token_accuracy": 0.03120192799178767, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.142048607257796, |
| "grad_norm": 0.11041875183582306, |
| "learning_rate": 8.1750118719213e-06, |
| "loss": 0.4385, |
| "mean_token_accuracy": 0.03349298075772822, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.145599822439241, |
| "grad_norm": 0.099665068089962, |
| "learning_rate": 8.164174652868097e-06, |
| "loss": 0.4246, |
| "mean_token_accuracy": 0.03043448956668726, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.149151037620686, |
| "grad_norm": 0.11485815793275833, |
| "learning_rate": 8.153312582539438e-06, |
| "loss": 0.4182, |
| "mean_token_accuracy": 0.030599489495216403, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.1527022528021307, |
| "grad_norm": 0.13416896760463715, |
| "learning_rate": 8.142425746245503e-06, |
| "loss": 0.4498, |
| "mean_token_accuracy": 0.029845738639778574, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.1562534679835756, |
| "grad_norm": 0.13239556550979614, |
| "learning_rate": 8.131514229490975e-06, |
| "loss": 0.507, |
| "mean_token_accuracy": 0.031054503782797838, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.1598046831650206, |
| "grad_norm": 0.12665045261383057, |
| "learning_rate": 8.120578117974388e-06, |
| "loss": 0.4287, |
| "mean_token_accuracy": 0.03402871212529135, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.1633558983464654, |
| "grad_norm": 0.14665213227272034, |
| "learning_rate": 8.109617497587429e-06, |
| "loss": 0.4638, |
| "mean_token_accuracy": 0.029450454625475686, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.1669071135279103, |
| "grad_norm": 0.1434643715620041, |
| "learning_rate": 8.098632454414286e-06, |
| "loss": 0.4413, |
| "mean_token_accuracy": 0.03347731575922808, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.1704583287093553, |
| "grad_norm": 0.09570540487766266, |
| "learning_rate": 8.08762307473096e-06, |
| "loss": 0.4027, |
| "mean_token_accuracy": 0.028911761146446224, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1740095438908003, |
| "grad_norm": 0.0962023138999939, |
| "learning_rate": 8.07658944500459e-06, |
| "loss": 0.427, |
| "mean_token_accuracy": 0.03492621044642874, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.177560759072245, |
| "grad_norm": 0.09288761019706726, |
| "learning_rate": 8.065531651892771e-06, |
| "loss": 0.4205, |
| "mean_token_accuracy": 0.03696197941462742, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.18111197425369, |
| "grad_norm": 0.11237376183271408, |
| "learning_rate": 8.054449782242876e-06, |
| "loss": 0.4474, |
| "mean_token_accuracy": 0.029908099084423156, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.1846631894351347, |
| "grad_norm": 0.10639701038599014, |
| "learning_rate": 8.043343923091382e-06, |
| "loss": 0.4375, |
| "mean_token_accuracy": 0.03525051188262296, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.1882144046165797, |
| "grad_norm": 0.10115876793861389, |
| "learning_rate": 8.03221416166317e-06, |
| "loss": 0.4197, |
| "mean_token_accuracy": 0.035662662419781554, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.1917656197980246, |
| "grad_norm": 0.14355993270874023, |
| "learning_rate": 8.021060585370845e-06, |
| "loss": 0.4546, |
| "mean_token_accuracy": 0.03178581687097903, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.1953168349794696, |
| "grad_norm": 0.10448651015758514, |
| "learning_rate": 8.009883281814066e-06, |
| "loss": 0.3823, |
| "mean_token_accuracy": 0.030594974505220307, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.1988680501609144, |
| "grad_norm": 0.14605805277824402, |
| "learning_rate": 7.998682338778834e-06, |
| "loss": 0.4644, |
| "mean_token_accuracy": 0.032786186962766806, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.2024192653423593, |
| "grad_norm": 0.11398659646511078, |
| "learning_rate": 7.987457844236817e-06, |
| "loss": 0.4479, |
| "mean_token_accuracy": 0.030445763823081506, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.2059704805238043, |
| "grad_norm": 0.10509105026721954, |
| "learning_rate": 7.976209886344654e-06, |
| "loss": 0.4302, |
| "mean_token_accuracy": 0.031263302011211636, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.209521695705249, |
| "grad_norm": 0.12267457693815231, |
| "learning_rate": 7.964938553443267e-06, |
| "loss": 0.43, |
| "mean_token_accuracy": 0.03181008769752225, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.213072910886694, |
| "grad_norm": 0.13068453967571259, |
| "learning_rate": 7.953643934057162e-06, |
| "loss": 0.4138, |
| "mean_token_accuracy": 0.02875417193354224, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.216624126068139, |
| "grad_norm": 0.1138904020190239, |
| "learning_rate": 7.942326116893733e-06, |
| "loss": 0.4309, |
| "mean_token_accuracy": 0.035087752894469304, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.220175341249584, |
| "grad_norm": 0.11926258355379105, |
| "learning_rate": 7.930985190842576e-06, |
| "loss": 0.4252, |
| "mean_token_accuracy": 0.034354623672697926, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.2237265564310287, |
| "grad_norm": 0.11906962096691132, |
| "learning_rate": 7.919621244974773e-06, |
| "loss": 0.4375, |
| "mean_token_accuracy": 0.03322149996893131, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.2272777716124736, |
| "grad_norm": 0.1229124665260315, |
| "learning_rate": 7.908234368542214e-06, |
| "loss": 0.4282, |
| "mean_token_accuracy": 0.027521870553755434, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.2308289867939186, |
| "grad_norm": 0.12528809905052185, |
| "learning_rate": 7.896824650976873e-06, |
| "loss": 0.4434, |
| "mean_token_accuracy": 0.033167709574627224, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.2343802019753634, |
| "grad_norm": 0.13237693905830383, |
| "learning_rate": 7.885392181890126e-06, |
| "loss": 0.4345, |
| "mean_token_accuracy": 0.03176790558427456, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.2379314171568083, |
| "grad_norm": 0.11144175380468369, |
| "learning_rate": 7.873937051072037e-06, |
| "loss": 0.4237, |
| "mean_token_accuracy": 0.035554787718865555, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.2414826323382533, |
| "grad_norm": 0.11270050704479218, |
| "learning_rate": 7.862459348490645e-06, |
| "loss": 0.4276, |
| "mean_token_accuracy": 0.03402164916769834, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.2450338475196983, |
| "grad_norm": 0.11199501156806946, |
| "learning_rate": 7.85095916429128e-06, |
| "loss": 0.422, |
| "mean_token_accuracy": 0.029369741489063017, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.248585062701143, |
| "grad_norm": 0.11493431031703949, |
| "learning_rate": 7.839436588795834e-06, |
| "loss": 0.4439, |
| "mean_token_accuracy": 0.030408731843635906, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.252136277882588, |
| "grad_norm": 0.12098958343267441, |
| "learning_rate": 7.82789171250206e-06, |
| "loss": 0.4624, |
| "mean_token_accuracy": 0.03373954394919565, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.2556874930640327, |
| "grad_norm": 0.11300572007894516, |
| "learning_rate": 7.816324626082864e-06, |
| "loss": 0.4189, |
| "mean_token_accuracy": 0.030854827327857492, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.2592387082454777, |
| "grad_norm": 0.11174172908067703, |
| "learning_rate": 7.804735420385578e-06, |
| "loss": 0.4238, |
| "mean_token_accuracy": 0.036562988705554744, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.2627899234269226, |
| "grad_norm": 0.12956643104553223, |
| "learning_rate": 7.793124186431271e-06, |
| "loss": 0.4461, |
| "mean_token_accuracy": 0.0327669634934864, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.2663411386083676, |
| "grad_norm": 0.1083948016166687, |
| "learning_rate": 7.781491015414018e-06, |
| "loss": 0.4331, |
| "mean_token_accuracy": 0.03526076916750753, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.2698923537898126, |
| "grad_norm": 0.1142224371433258, |
| "learning_rate": 7.769835998700182e-06, |
| "loss": 0.4384, |
| "mean_token_accuracy": 0.033927020744158654, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.2734435689712573, |
| "grad_norm": 0.11039263755083084, |
| "learning_rate": 7.758159227827701e-06, |
| "loss": 0.4415, |
| "mean_token_accuracy": 0.03664633895459701, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.2769947841527023, |
| "grad_norm": 0.1170225441455841, |
| "learning_rate": 7.746460794505375e-06, |
| "loss": 0.4572, |
| "mean_token_accuracy": 0.028795534330129158, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.280545999334147, |
| "grad_norm": 0.10929251462221146, |
| "learning_rate": 7.734740790612137e-06, |
| "loss": 0.4153, |
| "mean_token_accuracy": 0.03168763507710537, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.284097214515592, |
| "grad_norm": 0.12044554948806763, |
| "learning_rate": 7.722999308196329e-06, |
| "loss": 0.4328, |
| "mean_token_accuracy": 0.02993700837396318, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.287648429697037, |
| "grad_norm": 0.11714787036180496, |
| "learning_rate": 7.711236439474991e-06, |
| "loss": 0.437, |
| "mean_token_accuracy": 0.033871306681248825, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.291199644878482, |
| "grad_norm": 0.10608412325382233, |
| "learning_rate": 7.69945227683313e-06, |
| "loss": 0.4126, |
| "mean_token_accuracy": 0.0351106549569522, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.2947508600599267, |
| "grad_norm": 0.12553463876247406, |
| "learning_rate": 7.68764691282299e-06, |
| "loss": 0.428, |
| "mean_token_accuracy": 0.03240462405665312, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.2983020752413716, |
| "grad_norm": 0.11900968104600906, |
| "learning_rate": 7.675820440163334e-06, |
| "loss": 0.4462, |
| "mean_token_accuracy": 0.03121896790980827, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.3018532904228166, |
| "grad_norm": 0.13135908544063568, |
| "learning_rate": 7.663972951738708e-06, |
| "loss": 0.4563, |
| "mean_token_accuracy": 0.0299510243057739, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.3054045056042614, |
| "grad_norm": 0.11855065077543259, |
| "learning_rate": 7.652104540598712e-06, |
| "loss": 0.4508, |
| "mean_token_accuracy": 0.03168911819375353, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.3089557207857063, |
| "grad_norm": 0.11534618586301804, |
| "learning_rate": 7.640215299957283e-06, |
| "loss": 0.4274, |
| "mean_token_accuracy": 0.028188226955535356, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.3125069359671513, |
| "grad_norm": 0.14458145201206207, |
| "learning_rate": 7.628305323191942e-06, |
| "loss": 0.4678, |
| "mean_token_accuracy": 0.030351929475727957, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3160581511485963, |
| "grad_norm": 0.10904058068990707, |
| "learning_rate": 7.616374703843071e-06, |
| "loss": 0.4343, |
| "mean_token_accuracy": 0.027722057624487206, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.319609366330041, |
| "grad_norm": 0.12762950360774994, |
| "learning_rate": 7.604423535613183e-06, |
| "loss": 0.4492, |
| "mean_token_accuracy": 0.030748855464480584, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.323160581511486, |
| "grad_norm": 0.11561151593923569, |
| "learning_rate": 7.592451912366176e-06, |
| "loss": 0.4323, |
| "mean_token_accuracy": 0.03335273687844165, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.326711796692931, |
| "grad_norm": 0.1088724359869957, |
| "learning_rate": 7.580459928126607e-06, |
| "loss": 0.4449, |
| "mean_token_accuracy": 0.02959715071847313, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.3302630118743757, |
| "grad_norm": 0.1027379035949707, |
| "learning_rate": 7.568447677078937e-06, |
| "loss": 0.4363, |
| "mean_token_accuracy": 0.03155246840651671, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.3338142270558206, |
| "grad_norm": 0.12113990634679794, |
| "learning_rate": 7.556415253566814e-06, |
| "loss": 0.4348, |
| "mean_token_accuracy": 0.033069032098865137, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.3373654422372656, |
| "grad_norm": 0.1241699829697609, |
| "learning_rate": 7.544362752092309e-06, |
| "loss": 0.451, |
| "mean_token_accuracy": 0.03430816161016992, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.3409166574187106, |
| "grad_norm": 0.11146720498800278, |
| "learning_rate": 7.532290267315189e-06, |
| "loss": 0.4543, |
| "mean_token_accuracy": 0.034234997216117335, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.3444678726001553, |
| "grad_norm": 0.12195354700088501, |
| "learning_rate": 7.52019789405217e-06, |
| "loss": 0.443, |
| "mean_token_accuracy": 0.030580678501792136, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.3480190877816003, |
| "grad_norm": 0.1010030210018158, |
| "learning_rate": 7.508085727276169e-06, |
| "loss": 0.4056, |
| "mean_token_accuracy": 0.029197495718108257, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.351570302963045, |
| "grad_norm": 0.11852242797613144, |
| "learning_rate": 7.495953862115561e-06, |
| "loss": 0.4604, |
| "mean_token_accuracy": 0.028626239189179614, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.35512151814449, |
| "grad_norm": 0.15514077246189117, |
| "learning_rate": 7.483802393853431e-06, |
| "loss": 0.4638, |
| "mean_token_accuracy": 0.028513169188954635, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.358672733325935, |
| "grad_norm": 0.1145181730389595, |
| "learning_rate": 7.471631417926826e-06, |
| "loss": 0.4245, |
| "mean_token_accuracy": 0.028723615967464866, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.36222394850738, |
| "grad_norm": 0.10029944777488708, |
| "learning_rate": 7.459441029926006e-06, |
| "loss": 0.4169, |
| "mean_token_accuracy": 0.03500634835290839, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.365775163688825, |
| "grad_norm": 0.14268019795417786, |
| "learning_rate": 7.447231325593689e-06, |
| "loss": 0.4738, |
| "mean_token_accuracy": 0.03180533792328788, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.3693263788702696, |
| "grad_norm": 0.09910490363836288, |
| "learning_rate": 7.435002400824309e-06, |
| "loss": 0.4006, |
| "mean_token_accuracy": 0.035282006590932724, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.3728775940517146, |
| "grad_norm": 0.10883248597383499, |
| "learning_rate": 7.422754351663252e-06, |
| "loss": 0.4423, |
| "mean_token_accuracy": 0.03188303730712505, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.3764288092331594, |
| "grad_norm": 0.11308008432388306, |
| "learning_rate": 7.410487274306104e-06, |
| "loss": 0.4171, |
| "mean_token_accuracy": 0.03165900051317294, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.3799800244146043, |
| "grad_norm": 0.12280994653701782, |
| "learning_rate": 7.398201265097902e-06, |
| "loss": 0.4296, |
| "mean_token_accuracy": 0.03297045080034877, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.3835312395960493, |
| "grad_norm": 0.11657087504863739, |
| "learning_rate": 7.385896420532372e-06, |
| "loss": 0.4095, |
| "mean_token_accuracy": 0.031911868369206786, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.3870824547774943, |
| "grad_norm": 0.1274227350950241, |
| "learning_rate": 7.37357283725117e-06, |
| "loss": 0.4876, |
| "mean_token_accuracy": 0.02943271119875135, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.390633669958939, |
| "grad_norm": 0.11406645178794861, |
| "learning_rate": 7.361230612043125e-06, |
| "loss": 0.4178, |
| "mean_token_accuracy": 0.03322120701341191, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.394184885140384, |
| "grad_norm": 0.12713231146335602, |
| "learning_rate": 7.3488698418434824e-06, |
| "loss": 0.4538, |
| "mean_token_accuracy": 0.02631053911318304, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.397736100321829, |
| "grad_norm": 0.1251516342163086, |
| "learning_rate": 7.3364906237331345e-06, |
| "loss": 0.4197, |
| "mean_token_accuracy": 0.03226183277001837, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.4012873155032737, |
| "grad_norm": 0.11285842210054398, |
| "learning_rate": 7.324093054937864e-06, |
| "loss": 0.4141, |
| "mean_token_accuracy": 0.03131841377398814, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.4048385306847186, |
| "grad_norm": 0.11905115842819214, |
| "learning_rate": 7.311677232827583e-06, |
| "loss": 0.4419, |
| "mean_token_accuracy": 0.030178814733517356, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.4083897458661636, |
| "grad_norm": 0.12142392992973328, |
| "learning_rate": 7.299243254915558e-06, |
| "loss": 0.4421, |
| "mean_token_accuracy": 0.03145620572104235, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.4119409610476086, |
| "grad_norm": 0.12110509723424911, |
| "learning_rate": 7.286791218857654e-06, |
| "loss": 0.4323, |
| "mean_token_accuracy": 0.03314107269034139, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.4154921762290533, |
| "grad_norm": 0.11982744932174683, |
| "learning_rate": 7.274321222451561e-06, |
| "loss": 0.4694, |
| "mean_token_accuracy": 0.03136878209625138, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.4190433914104983, |
| "grad_norm": 0.11144396662712097, |
| "learning_rate": 7.261833363636036e-06, |
| "loss": 0.4529, |
| "mean_token_accuracy": 0.03282846643924131, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4225946065919433, |
| "grad_norm": 0.10743708163499832, |
| "learning_rate": 7.249327740490114e-06, |
| "loss": 0.4403, |
| "mean_token_accuracy": 0.03177920994858141, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.426145821773388, |
| "grad_norm": 0.1281249076128006, |
| "learning_rate": 7.236804451232364e-06, |
| "loss": 0.415, |
| "mean_token_accuracy": 0.0316295579468715, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.429697036954833, |
| "grad_norm": 0.11751551181077957, |
| "learning_rate": 7.224263594220093e-06, |
| "loss": 0.4372, |
| "mean_token_accuracy": 0.03051399137621047, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.433248252136278, |
| "grad_norm": 0.12105882912874222, |
| "learning_rate": 7.211705267948592e-06, |
| "loss": 0.4326, |
| "mean_token_accuracy": 0.034232050165883265, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.436799467317723, |
| "grad_norm": 0.10550647228956223, |
| "learning_rate": 7.199129571050345e-06, |
| "loss": 0.424, |
| "mean_token_accuracy": 0.031322834107413655, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.4403506824991676, |
| "grad_norm": 0.12269502133131027, |
| "learning_rate": 7.186536602294278e-06, |
| "loss": 0.461, |
| "mean_token_accuracy": 0.03530319871424581, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.4439018976806126, |
| "grad_norm": 0.11797191202640533, |
| "learning_rate": 7.173926460584956e-06, |
| "loss": 0.426, |
| "mean_token_accuracy": 0.027765252081735525, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.4474531128620574, |
| "grad_norm": 0.1220933124423027, |
| "learning_rate": 7.161299244961828e-06, |
| "loss": 0.4229, |
| "mean_token_accuracy": 0.03250828143427498, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.4510043280435023, |
| "grad_norm": 0.1393410712480545, |
| "learning_rate": 7.148655054598436e-06, |
| "loss": 0.4955, |
| "mean_token_accuracy": 0.031024973690364277, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.4545555432249473, |
| "grad_norm": 0.12431557476520538, |
| "learning_rate": 7.135993988801644e-06, |
| "loss": 0.4545, |
| "mean_token_accuracy": 0.03278170326666441, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.4581067584063923, |
| "grad_norm": 0.0940745621919632, |
| "learning_rate": 7.1233161470108525e-06, |
| "loss": 0.4252, |
| "mean_token_accuracy": 0.0334139017832058, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.4616579735878372, |
| "grad_norm": 0.12231415510177612, |
| "learning_rate": 7.110621628797222e-06, |
| "loss": 0.4495, |
| "mean_token_accuracy": 0.03126630225597182, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.465209188769282, |
| "grad_norm": 0.11169279366731644, |
| "learning_rate": 7.097910533862886e-06, |
| "loss": 0.4242, |
| "mean_token_accuracy": 0.033264531775785144, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.468760403950727, |
| "grad_norm": 0.09450326859951019, |
| "learning_rate": 7.085182962040173e-06, |
| "loss": 0.4242, |
| "mean_token_accuracy": 0.03063771854431252, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.4723116191321717, |
| "grad_norm": 0.1040828675031662, |
| "learning_rate": 7.072439013290824e-06, |
| "loss": 0.4445, |
| "mean_token_accuracy": 0.030200165703718085, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.4758628343136166, |
| "grad_norm": 0.11325669288635254, |
| "learning_rate": 7.059678787705191e-06, |
| "loss": 0.4406, |
| "mean_token_accuracy": 0.0290958868645248, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.4794140494950616, |
| "grad_norm": 0.11439729481935501, |
| "learning_rate": 7.046902385501477e-06, |
| "loss": 0.4131, |
| "mean_token_accuracy": 0.034142243890528334, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.4829652646765066, |
| "grad_norm": 0.12844893336296082, |
| "learning_rate": 7.03410990702493e-06, |
| "loss": 0.4454, |
| "mean_token_accuracy": 0.0313700257538585, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.4865164798579513, |
| "grad_norm": 0.10952496528625488, |
| "learning_rate": 7.02130145274706e-06, |
| "loss": 0.4467, |
| "mean_token_accuracy": 0.032556907943217084, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.4900676950393963, |
| "grad_norm": 0.10907234251499176, |
| "learning_rate": 7.008477123264849e-06, |
| "loss": 0.4287, |
| "mean_token_accuracy": 0.0329911024782632, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.4936189102208413, |
| "grad_norm": 0.1262599229812622, |
| "learning_rate": 6.995637019299963e-06, |
| "loss": 0.4458, |
| "mean_token_accuracy": 0.03044356228565448, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.497170125402286, |
| "grad_norm": 0.14225925505161285, |
| "learning_rate": 6.982781241697963e-06, |
| "loss": 0.4441, |
| "mean_token_accuracy": 0.03241951846212032, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.500721340583731, |
| "grad_norm": 0.1305040866136551, |
| "learning_rate": 6.969909891427509e-06, |
| "loss": 0.4799, |
| "mean_token_accuracy": 0.030976602058217395, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.504272555765176, |
| "grad_norm": 0.14286646246910095, |
| "learning_rate": 6.957023069579561e-06, |
| "loss": 0.4688, |
| "mean_token_accuracy": 0.03533977083861828, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.507823770946621, |
| "grad_norm": 0.10188660025596619, |
| "learning_rate": 6.944120877366605e-06, |
| "loss": 0.4046, |
| "mean_token_accuracy": 0.03206467899872223, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.5113749861280656, |
| "grad_norm": 0.10866432636976242, |
| "learning_rate": 6.931203416121831e-06, |
| "loss": 0.433, |
| "mean_token_accuracy": 0.03298617543987348, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.5149262013095106, |
| "grad_norm": 0.11402492970228195, |
| "learning_rate": 6.918270787298361e-06, |
| "loss": 0.4461, |
| "mean_token_accuracy": 0.02989783536759205, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.5184774164909554, |
| "grad_norm": 0.10975202918052673, |
| "learning_rate": 6.90532309246844e-06, |
| "loss": 0.415, |
| "mean_token_accuracy": 0.03237479853851255, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.5220286316724003, |
| "grad_norm": 0.14297430217266083, |
| "learning_rate": 6.89236043332264e-06, |
| "loss": 0.4613, |
| "mean_token_accuracy": 0.031666447979660006, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.5255798468538453, |
| "grad_norm": 0.10512206703424454, |
| "learning_rate": 6.87938291166906e-06, |
| "loss": 0.4329, |
| "mean_token_accuracy": 0.032351893107261276, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.5291310620352903, |
| "grad_norm": 0.134112149477005, |
| "learning_rate": 6.866390629432533e-06, |
| "loss": 0.4216, |
| "mean_token_accuracy": 0.03164473375727539, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.5326822772167352, |
| "grad_norm": 0.0910695493221283, |
| "learning_rate": 6.8533836886538175e-06, |
| "loss": 0.4107, |
| "mean_token_accuracy": 0.031041957496199757, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.53623349239818, |
| "grad_norm": 0.0995115339756012, |
| "learning_rate": 6.840362191488801e-06, |
| "loss": 0.3981, |
| "mean_token_accuracy": 0.03637770725617884, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.539784707579625, |
| "grad_norm": 0.12063033878803253, |
| "learning_rate": 6.8273262402076935e-06, |
| "loss": 0.4365, |
| "mean_token_accuracy": 0.03462712019972969, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.5433359227610697, |
| "grad_norm": 0.12645626068115234, |
| "learning_rate": 6.814275937194233e-06, |
| "loss": 0.4463, |
| "mean_token_accuracy": 0.02990383934957208, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.5468871379425146, |
| "grad_norm": 0.11070489883422852, |
| "learning_rate": 6.801211384944867e-06, |
| "loss": 0.4389, |
| "mean_token_accuracy": 0.0330234444263624, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.5504383531239596, |
| "grad_norm": 0.10706287622451782, |
| "learning_rate": 6.788132686067963e-06, |
| "loss": 0.4184, |
| "mean_token_accuracy": 0.036259193846490234, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.5539895683054046, |
| "grad_norm": 0.12582163512706757, |
| "learning_rate": 6.77503994328299e-06, |
| "loss": 0.4433, |
| "mean_token_accuracy": 0.030336310745042283, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.5575407834868495, |
| "grad_norm": 0.09505017101764679, |
| "learning_rate": 6.761933259419725e-06, |
| "loss": 0.4364, |
| "mean_token_accuracy": 0.03261732070313883, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.5610919986682943, |
| "grad_norm": 0.11827551573514938, |
| "learning_rate": 6.748812737417428e-06, |
| "loss": 0.4153, |
| "mean_token_accuracy": 0.03193877744342899, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.5646432138497393, |
| "grad_norm": 0.11618656665086746, |
| "learning_rate": 6.7356784803240464e-06, |
| "loss": 0.4355, |
| "mean_token_accuracy": 0.02966096373529581, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.568194429031184, |
| "grad_norm": 0.12943704426288605, |
| "learning_rate": 6.722530591295406e-06, |
| "loss": 0.4683, |
| "mean_token_accuracy": 0.030868882487993687, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.571745644212629, |
| "grad_norm": 0.10661391913890839, |
| "learning_rate": 6.709369173594396e-06, |
| "loss": 0.4248, |
| "mean_token_accuracy": 0.03413955620635534, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.575296859394074, |
| "grad_norm": 0.13364438712596893, |
| "learning_rate": 6.6961943305901515e-06, |
| "loss": 0.4322, |
| "mean_token_accuracy": 0.03244770221499493, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.578848074575519, |
| "grad_norm": 0.13524463772773743, |
| "learning_rate": 6.683006165757262e-06, |
| "loss": 0.4533, |
| "mean_token_accuracy": 0.033012805608450435, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.5823992897569639, |
| "grad_norm": 0.10790959745645523, |
| "learning_rate": 6.669804782674937e-06, |
| "loss": 0.4432, |
| "mean_token_accuracy": 0.031002847008494427, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.5859505049384086, |
| "grad_norm": 0.14740656316280365, |
| "learning_rate": 6.656590285026203e-06, |
| "loss": 0.4616, |
| "mean_token_accuracy": 0.03030816976024653, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.5895017201198534, |
| "grad_norm": 0.09671668708324432, |
| "learning_rate": 6.643362776597089e-06, |
| "loss": 0.4238, |
| "mean_token_accuracy": 0.031174172054306837, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.5930529353012983, |
| "grad_norm": 0.11425669491291046, |
| "learning_rate": 6.630122361275811e-06, |
| "loss": 0.4572, |
| "mean_token_accuracy": 0.033975018672208535, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.5966041504827433, |
| "grad_norm": 0.09901127219200134, |
| "learning_rate": 6.6168691430519524e-06, |
| "loss": 0.4273, |
| "mean_token_accuracy": 0.031919095879857196, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.6001553656641883, |
| "grad_norm": 0.13218143582344055, |
| "learning_rate": 6.6036032260156526e-06, |
| "loss": 0.4082, |
| "mean_token_accuracy": 0.03161406527215149, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.6037065808456332, |
| "grad_norm": 0.11274532973766327, |
| "learning_rate": 6.590324714356784e-06, |
| "loss": 0.4487, |
| "mean_token_accuracy": 0.030842622476484394, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.607257796027078, |
| "grad_norm": 0.13735929131507874, |
| "learning_rate": 6.5770337123641405e-06, |
| "loss": 0.4471, |
| "mean_token_accuracy": 0.03498955326358555, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.610809011208523, |
| "grad_norm": 0.10870594531297684, |
| "learning_rate": 6.563730324424609e-06, |
| "loss": 0.4142, |
| "mean_token_accuracy": 0.03486323829929461, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.6143602263899677, |
| "grad_norm": 0.12857501208782196, |
| "learning_rate": 6.55041465502236e-06, |
| "loss": 0.4341, |
| "mean_token_accuracy": 0.032055066079919925, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.6179114415714126, |
| "grad_norm": 0.10850770026445389, |
| "learning_rate": 6.53708680873802e-06, |
| "loss": 0.4373, |
| "mean_token_accuracy": 0.032894781605136814, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.6214626567528576, |
| "grad_norm": 0.130377396941185, |
| "learning_rate": 6.523746890247853e-06, |
| "loss": 0.4425, |
| "mean_token_accuracy": 0.03230476283351891, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.6250138719343026, |
| "grad_norm": 0.11864569783210754, |
| "learning_rate": 6.510395004322937e-06, |
| "loss": 0.4256, |
| "mean_token_accuracy": 0.03536723868455738, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.6285650871157475, |
| "grad_norm": 0.11540473997592926, |
| "learning_rate": 6.49703125582834e-06, |
| "loss": 0.4048, |
| "mean_token_accuracy": 0.03272077367910242, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.6321163022971923, |
| "grad_norm": 0.1166347786784172, |
| "learning_rate": 6.4836557497222995e-06, |
| "loss": 0.427, |
| "mean_token_accuracy": 0.038396627987822285, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.6356675174786373, |
| "grad_norm": 0.10623839497566223, |
| "learning_rate": 6.470268591055398e-06, |
| "loss": 0.4104, |
| "mean_token_accuracy": 0.035294696885102894, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.639218732660082, |
| "grad_norm": 0.11819703876972198, |
| "learning_rate": 6.456869884969738e-06, |
| "loss": 0.4214, |
| "mean_token_accuracy": 0.02874127653922187, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.642769947841527, |
| "grad_norm": 0.11898882687091827, |
| "learning_rate": 6.443459736698106e-06, |
| "loss": 0.4462, |
| "mean_token_accuracy": 0.030361584464117186, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.646321163022972, |
| "grad_norm": 0.13163194060325623, |
| "learning_rate": 6.430038251563166e-06, |
| "loss": 0.4229, |
| "mean_token_accuracy": 0.03145527587912511, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.649872378204417, |
| "grad_norm": 0.12886396050453186, |
| "learning_rate": 6.416605534976614e-06, |
| "loss": 0.444, |
| "mean_token_accuracy": 0.03168737835221691, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.6534235933858619, |
| "grad_norm": 0.11800993233919144, |
| "learning_rate": 6.403161692438364e-06, |
| "loss": 0.4406, |
| "mean_token_accuracy": 0.029150941685657017, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.6569748085673066, |
| "grad_norm": 0.15257884562015533, |
| "learning_rate": 6.3897068295357e-06, |
| "loss": 0.4646, |
| "mean_token_accuracy": 0.03103807869774755, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.6605260237487516, |
| "grad_norm": 0.12270724773406982, |
| "learning_rate": 6.376241051942477e-06, |
| "loss": 0.4779, |
| "mean_token_accuracy": 0.03334709817136172, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.6640772389301963, |
| "grad_norm": 0.11115626245737076, |
| "learning_rate": 6.362764465418258e-06, |
| "loss": 0.4228, |
| "mean_token_accuracy": 0.03353900604633964, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.6676284541116413, |
| "grad_norm": 0.12104036659002304, |
| "learning_rate": 6.349277175807506e-06, |
| "loss": 0.4093, |
| "mean_token_accuracy": 0.03223917051946046, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.6711796692930863, |
| "grad_norm": 0.10532081872224808, |
| "learning_rate": 6.3357792890387485e-06, |
| "loss": 0.4523, |
| "mean_token_accuracy": 0.031193107621220406, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.6747308844745312, |
| "grad_norm": 0.107658751308918, |
| "learning_rate": 6.322270911123734e-06, |
| "loss": 0.4364, |
| "mean_token_accuracy": 0.02924270377479843, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.6782820996559762, |
| "grad_norm": 0.14873118698596954, |
| "learning_rate": 6.308752148156614e-06, |
| "loss": 0.447, |
| "mean_token_accuracy": 0.028289265337662073, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.681833314837421, |
| "grad_norm": 0.13524053990840912, |
| "learning_rate": 6.295223106313104e-06, |
| "loss": 0.456, |
| "mean_token_accuracy": 0.03146627026217175, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.6853845300188657, |
| "grad_norm": 0.1123010441660881, |
| "learning_rate": 6.281683891849645e-06, |
| "loss": 0.4373, |
| "mean_token_accuracy": 0.031217788444337202, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.6889357452003106, |
| "grad_norm": 0.1200728714466095, |
| "learning_rate": 6.268134611102578e-06, |
| "loss": 0.4294, |
| "mean_token_accuracy": 0.030439670525083784, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.6924869603817556, |
| "grad_norm": 0.11351709067821503, |
| "learning_rate": 6.254575370487299e-06, |
| "loss": 0.454, |
| "mean_token_accuracy": 0.03279696796744247, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.6960381755632006, |
| "grad_norm": 0.10373629629611969, |
| "learning_rate": 6.2410062764974366e-06, |
| "loss": 0.4088, |
| "mean_token_accuracy": 0.032122904136485886, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.6995893907446455, |
| "grad_norm": 0.11496740579605103, |
| "learning_rate": 6.227427435703997e-06, |
| "loss": 0.4425, |
| "mean_token_accuracy": 0.026922807355731493, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.7031406059260903, |
| "grad_norm": 0.13776014745235443, |
| "learning_rate": 6.213838954754543e-06, |
| "loss": 0.4615, |
| "mean_token_accuracy": 0.03186251565057319, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.7066918211075353, |
| "grad_norm": 0.12256697565317154, |
| "learning_rate": 6.2002409403723525e-06, |
| "loss": 0.4266, |
| "mean_token_accuracy": 0.03119091654662043, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.71024303628898, |
| "grad_norm": 0.13695059716701508, |
| "learning_rate": 6.186633499355576e-06, |
| "loss": 0.4413, |
| "mean_token_accuracy": 0.031419768780324375, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.713794251470425, |
| "grad_norm": 0.11959680169820786, |
| "learning_rate": 6.173016738576396e-06, |
| "loss": 0.4069, |
| "mean_token_accuracy": 0.03165624950270285, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.71734546665187, |
| "grad_norm": 0.12976723909378052, |
| "learning_rate": 6.159390764980202e-06, |
| "loss": 0.4587, |
| "mean_token_accuracy": 0.03226567378442269, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.720896681833315, |
| "grad_norm": 0.12629680335521698, |
| "learning_rate": 6.145755685584731e-06, |
| "loss": 0.4318, |
| "mean_token_accuracy": 0.03316625406660023, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.7244478970147599, |
| "grad_norm": 0.10905484110116959, |
| "learning_rate": 6.132111607479243e-06, |
| "loss": 0.3998, |
| "mean_token_accuracy": 0.03585824224865064, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.7279991121962046, |
| "grad_norm": 0.1086510494351387, |
| "learning_rate": 6.118458637823669e-06, |
| "loss": 0.4114, |
| "mean_token_accuracy": 0.03143865071251639, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.7315503273776496, |
| "grad_norm": 0.12035961449146271, |
| "learning_rate": 6.104796883847777e-06, |
| "loss": 0.4376, |
| "mean_token_accuracy": 0.03243117895544856, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.7351015425590943, |
| "grad_norm": 0.14096233248710632, |
| "learning_rate": 6.091126452850324e-06, |
| "loss": 0.4117, |
| "mean_token_accuracy": 0.03267770476486476, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.7386527577405393, |
| "grad_norm": 0.10478504002094269, |
| "learning_rate": 6.077447452198219e-06, |
| "loss": 0.427, |
| "mean_token_accuracy": 0.027900859065994155, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.7422039729219843, |
| "grad_norm": 0.12248977273702621, |
| "learning_rate": 6.063759989325673e-06, |
| "loss": 0.4211, |
| "mean_token_accuracy": 0.035626769851660356, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.7457551881034292, |
| "grad_norm": 0.11430156230926514, |
| "learning_rate": 6.050064171733362e-06, |
| "loss": 0.4506, |
| "mean_token_accuracy": 0.033796100367908366, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.7493064032848742, |
| "grad_norm": 0.13546644151210785, |
| "learning_rate": 6.0363601069875755e-06, |
| "loss": 0.4384, |
| "mean_token_accuracy": 0.032566652556852205, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.752857618466319, |
| "grad_norm": 0.11047898977994919, |
| "learning_rate": 6.022647902719384e-06, |
| "loss": 0.4434, |
| "mean_token_accuracy": 0.031517542514848174, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.7564088336477637, |
| "grad_norm": 0.11789167672395706, |
| "learning_rate": 6.008927666623775e-06, |
| "loss": 0.4371, |
| "mean_token_accuracy": 0.029138855084966053, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.7599600488292086, |
| "grad_norm": 0.1081371083855629, |
| "learning_rate": 5.9951995064588245e-06, |
| "loss": 0.4086, |
| "mean_token_accuracy": 0.036707406099594664, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.7635112640106536, |
| "grad_norm": 0.1214103102684021, |
| "learning_rate": 5.981463530044841e-06, |
| "loss": 0.4525, |
| "mean_token_accuracy": 0.03377084386738716, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.7670624791920986, |
| "grad_norm": 0.10647572576999664, |
| "learning_rate": 5.967719845263524e-06, |
| "loss": 0.403, |
| "mean_token_accuracy": 0.03056574211223051, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.7706136943735435, |
| "grad_norm": 0.10062088072299957, |
| "learning_rate": 5.953968560057112e-06, |
| "loss": 0.4224, |
| "mean_token_accuracy": 0.03024762358836597, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.7741649095549885, |
| "grad_norm": 0.11051978915929794, |
| "learning_rate": 5.940209782427535e-06, |
| "loss": 0.4435, |
| "mean_token_accuracy": 0.02924332962720655, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7777161247364333, |
| "grad_norm": 0.10874021053314209, |
| "learning_rate": 5.926443620435572e-06, |
| "loss": 0.4216, |
| "mean_token_accuracy": 0.031223400786984712, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.781267339917878, |
| "grad_norm": 0.10309744626283646, |
| "learning_rate": 5.912670182199998e-06, |
| "loss": 0.421, |
| "mean_token_accuracy": 0.03220628422786831, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.784818555099323, |
| "grad_norm": 0.10336648672819138, |
| "learning_rate": 5.898889575896731e-06, |
| "loss": 0.4301, |
| "mean_token_accuracy": 0.034015969904430676, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.788369770280768, |
| "grad_norm": 0.15445692837238312, |
| "learning_rate": 5.8851019097579935e-06, |
| "loss": 0.4494, |
| "mean_token_accuracy": 0.03278758638043655, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.791920985462213, |
| "grad_norm": 0.1205769032239914, |
| "learning_rate": 5.871307292071449e-06, |
| "loss": 0.4608, |
| "mean_token_accuracy": 0.030953851630329154, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.7954722006436579, |
| "grad_norm": 0.11463318765163422, |
| "learning_rate": 5.857505831179361e-06, |
| "loss": 0.4238, |
| "mean_token_accuracy": 0.03327622167489608, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.7990234158251026, |
| "grad_norm": 0.09844350814819336, |
| "learning_rate": 5.843697635477742e-06, |
| "loss": 0.436, |
| "mean_token_accuracy": 0.032107390790770296, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.8025746310065476, |
| "grad_norm": 0.11865696310997009, |
| "learning_rate": 5.8298828134154935e-06, |
| "loss": 0.4604, |
| "mean_token_accuracy": 0.03441940287120815, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.8061258461879923, |
| "grad_norm": 0.12131454795598984, |
| "learning_rate": 5.816061473493565e-06, |
| "loss": 0.4347, |
| "mean_token_accuracy": 0.029422461073409067, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.8096770613694373, |
| "grad_norm": 0.10119353979825974, |
| "learning_rate": 5.802233724264094e-06, |
| "loss": 0.4206, |
| "mean_token_accuracy": 0.0319925009207509, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.8132282765508823, |
| "grad_norm": 0.12341497838497162, |
| "learning_rate": 5.788399674329559e-06, |
| "loss": 0.4275, |
| "mean_token_accuracy": 0.03254448569350643, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.8167794917323272, |
| "grad_norm": 0.12492340058088303, |
| "learning_rate": 5.774559432341918e-06, |
| "loss": 0.428, |
| "mean_token_accuracy": 0.02870176643045852, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.8203307069137722, |
| "grad_norm": 0.10704270750284195, |
| "learning_rate": 5.760713107001773e-06, |
| "loss": 0.4395, |
| "mean_token_accuracy": 0.030266436016972875, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.823881922095217, |
| "grad_norm": 0.11909916251897812, |
| "learning_rate": 5.746860807057491e-06, |
| "loss": 0.4153, |
| "mean_token_accuracy": 0.03108665631225449, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.827433137276662, |
| "grad_norm": 0.11801562458276749, |
| "learning_rate": 5.7330026413043726e-06, |
| "loss": 0.4722, |
| "mean_token_accuracy": 0.03412359116009611, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.8309843524581066, |
| "grad_norm": 0.12138685584068298, |
| "learning_rate": 5.719138718583781e-06, |
| "loss": 0.4266, |
| "mean_token_accuracy": 0.029616558851557784, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.8345355676395516, |
| "grad_norm": 0.12482289224863052, |
| "learning_rate": 5.705269147782303e-06, |
| "loss": 0.4724, |
| "mean_token_accuracy": 0.029393230830464745, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.8380867828209966, |
| "grad_norm": 0.12985889613628387, |
| "learning_rate": 5.6913940378308755e-06, |
| "loss": 0.4399, |
| "mean_token_accuracy": 0.030709353493875824, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.8416379980024415, |
| "grad_norm": 0.1769595444202423, |
| "learning_rate": 5.677513497703947e-06, |
| "loss": 0.4505, |
| "mean_token_accuracy": 0.032131388477864675, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.8451892131838865, |
| "grad_norm": 0.12460047751665115, |
| "learning_rate": 5.663627636418611e-06, |
| "loss": 0.4679, |
| "mean_token_accuracy": 0.033650853561994154, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.8487404283653313, |
| "grad_norm": 0.10020967572927475, |
| "learning_rate": 5.649736563033754e-06, |
| "loss": 0.4295, |
| "mean_token_accuracy": 0.030336383748363005, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.852291643546776, |
| "grad_norm": 0.146786630153656, |
| "learning_rate": 5.635840386649197e-06, |
| "loss": 0.4496, |
| "mean_token_accuracy": 0.031712467898614705, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.855842858728221, |
| "grad_norm": 0.11752845346927643, |
| "learning_rate": 5.621939216404842e-06, |
| "loss": 0.4602, |
| "mean_token_accuracy": 0.03064700043250923, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.859394073909666, |
| "grad_norm": 0.11091917008161545, |
| "learning_rate": 5.608033161479811e-06, |
| "loss": 0.4281, |
| "mean_token_accuracy": 0.03554350486774638, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.862945289091111, |
| "grad_norm": 0.13093847036361694, |
| "learning_rate": 5.594122331091591e-06, |
| "loss": 0.4501, |
| "mean_token_accuracy": 0.03043439887915156, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.8664965042725559, |
| "grad_norm": 0.10613659024238586, |
| "learning_rate": 5.580206834495169e-06, |
| "loss": 0.4009, |
| "mean_token_accuracy": 0.03187151045858627, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.8700477194540006, |
| "grad_norm": 0.1050243079662323, |
| "learning_rate": 5.566286780982193e-06, |
| "loss": 0.4227, |
| "mean_token_accuracy": 0.03142415892943973, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.8735989346354456, |
| "grad_norm": 0.09940161556005478, |
| "learning_rate": 5.552362279880091e-06, |
| "loss": 0.4193, |
| "mean_token_accuracy": 0.02958502833644161, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.8771501498168903, |
| "grad_norm": 0.1255192905664444, |
| "learning_rate": 5.538433440551221e-06, |
| "loss": 0.4215, |
| "mean_token_accuracy": 0.03073620241775643, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.8807013649983353, |
| "grad_norm": 0.10726135224103928, |
| "learning_rate": 5.524500372392021e-06, |
| "loss": 0.4116, |
| "mean_token_accuracy": 0.031495200710196514, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.8842525801797803, |
| "grad_norm": 0.11809692531824112, |
| "learning_rate": 5.5105631848321375e-06, |
| "loss": 0.4421, |
| "mean_token_accuracy": 0.03594362937474216, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.8878037953612252, |
| "grad_norm": 0.09988775849342346, |
| "learning_rate": 5.496621987333567e-06, |
| "loss": 0.4276, |
| "mean_token_accuracy": 0.03090111272831564, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.8913550105426702, |
| "grad_norm": 0.12656764686107635, |
| "learning_rate": 5.482676889389808e-06, |
| "loss": 0.4362, |
| "mean_token_accuracy": 0.03345424540384556, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.894906225724115, |
| "grad_norm": 0.12253513932228088, |
| "learning_rate": 5.468728000524987e-06, |
| "loss": 0.4233, |
| "mean_token_accuracy": 0.03485227169767313, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.89845744090556, |
| "grad_norm": 0.10752210766077042, |
| "learning_rate": 5.454775430293008e-06, |
| "loss": 0.4049, |
| "mean_token_accuracy": 0.03149543050676584, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.9020086560870046, |
| "grad_norm": 0.11358033865690231, |
| "learning_rate": 5.440819288276683e-06, |
| "loss": 0.4491, |
| "mean_token_accuracy": 0.03557099802492303, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.9055598712684496, |
| "grad_norm": 0.10355143249034882, |
| "learning_rate": 5.426859684086881e-06, |
| "loss": 0.4221, |
| "mean_token_accuracy": 0.029905574676377, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.9091110864498946, |
| "grad_norm": 0.1308654397726059, |
| "learning_rate": 5.412896727361663e-06, |
| "loss": 0.4051, |
| "mean_token_accuracy": 0.03401779759951751, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.9126623016313395, |
| "grad_norm": 0.10328856110572815, |
| "learning_rate": 5.398930527765416e-06, |
| "loss": 0.4246, |
| "mean_token_accuracy": 0.03586249665386276, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.9162135168127845, |
| "grad_norm": 0.12165062874555588, |
| "learning_rate": 5.384961194988002e-06, |
| "loss": 0.4367, |
| "mean_token_accuracy": 0.036468475984293036, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.9197647319942293, |
| "grad_norm": 0.14289544522762299, |
| "learning_rate": 5.370988838743889e-06, |
| "loss": 0.467, |
| "mean_token_accuracy": 0.029684737717616372, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.9233159471756742, |
| "grad_norm": 0.1259216070175171, |
| "learning_rate": 5.357013568771288e-06, |
| "loss": 0.4611, |
| "mean_token_accuracy": 0.030411327827096102, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.926867162357119, |
| "grad_norm": 0.12499464303255081, |
| "learning_rate": 5.343035494831298e-06, |
| "loss": 0.475, |
| "mean_token_accuracy": 0.030978709481132682, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.930418377538564, |
| "grad_norm": 0.11255602538585663, |
| "learning_rate": 5.32905472670704e-06, |
| "loss": 0.4276, |
| "mean_token_accuracy": 0.03142427492639399, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.933969592720009, |
| "grad_norm": 0.10795030742883682, |
| "learning_rate": 5.315071374202792e-06, |
| "loss": 0.4334, |
| "mean_token_accuracy": 0.03035588754573837, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.9375208079014539, |
| "grad_norm": 0.11768075078725815, |
| "learning_rate": 5.301085547143135e-06, |
| "loss": 0.4471, |
| "mean_token_accuracy": 0.03152179718381376, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.9410720230828988, |
| "grad_norm": 0.11674405634403229, |
| "learning_rate": 5.287097355372079e-06, |
| "loss": 0.4385, |
| "mean_token_accuracy": 0.027633604368020315, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.9446232382643436, |
| "grad_norm": 0.15666991472244263, |
| "learning_rate": 5.273106908752211e-06, |
| "loss": 0.49, |
| "mean_token_accuracy": 0.030374082733032992, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.9481744534457883, |
| "grad_norm": 0.10733166337013245, |
| "learning_rate": 5.259114317163822e-06, |
| "loss": 0.4397, |
| "mean_token_accuracy": 0.02997263033830677, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.9517256686272333, |
| "grad_norm": 0.1084926500916481, |
| "learning_rate": 5.245119690504056e-06, |
| "loss": 0.4458, |
| "mean_token_accuracy": 0.03219133894890547, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.9552768838086783, |
| "grad_norm": 0.10508458316326141, |
| "learning_rate": 5.231123138686036e-06, |
| "loss": 0.4207, |
| "mean_token_accuracy": 0.03626753961361828, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.9588280989901232, |
| "grad_norm": 0.12108492106199265, |
| "learning_rate": 5.217124771638008e-06, |
| "loss": 0.4703, |
| "mean_token_accuracy": 0.032320219550456386, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.9623793141715682, |
| "grad_norm": 0.1054980456829071, |
| "learning_rate": 5.2031246993024705e-06, |
| "loss": 0.4487, |
| "mean_token_accuracy": 0.02940154373754922, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.965930529353013, |
| "grad_norm": 0.11068796366453171, |
| "learning_rate": 5.1891230316353215e-06, |
| "loss": 0.4127, |
| "mean_token_accuracy": 0.033622686092712684, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.969481744534458, |
| "grad_norm": 0.11467897891998291, |
| "learning_rate": 5.1751198786049815e-06, |
| "loss": 0.4409, |
| "mean_token_accuracy": 0.03024188138806494, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.9730329597159026, |
| "grad_norm": 0.13145717978477478, |
| "learning_rate": 5.161115350191543e-06, |
| "loss": 0.4568, |
| "mean_token_accuracy": 0.03478666826777044, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.9765841748973476, |
| "grad_norm": 0.11662815511226654, |
| "learning_rate": 5.147109556385898e-06, |
| "loss": 0.428, |
| "mean_token_accuracy": 0.029785827462546877, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.9801353900787926, |
| "grad_norm": 0.12841393053531647, |
| "learning_rate": 5.133102607188875e-06, |
| "loss": 0.4369, |
| "mean_token_accuracy": 0.03490987789336941, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.9836866052602375, |
| "grad_norm": 0.11382108181715012, |
| "learning_rate": 5.119094612610381e-06, |
| "loss": 0.4355, |
| "mean_token_accuracy": 0.031118642207729863, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.9872378204416825, |
| "grad_norm": 0.108616903424263, |
| "learning_rate": 5.10508568266853e-06, |
| "loss": 0.4321, |
| "mean_token_accuracy": 0.029375262431130977, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.9907890356231273, |
| "grad_norm": 0.1363229751586914, |
| "learning_rate": 5.091075927388785e-06, |
| "loss": 0.4382, |
| "mean_token_accuracy": 0.02874040436472569, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.9943402508045722, |
| "grad_norm": 0.11008929461240768, |
| "learning_rate": 5.077065456803089e-06, |
| "loss": 0.3951, |
| "mean_token_accuracy": 0.03383657897938974, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.997891465986017, |
| "grad_norm": 0.1290445476770401, |
| "learning_rate": 5.063054380949003e-06, |
| "loss": 0.4386, |
| "mean_token_accuracy": 0.02915131483678124, |
| "step": 563 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.08082450926303864, |
| "learning_rate": 5.049042809868845e-06, |
| "loss": 0.2488, |
| "mean_token_accuracy": 0.030764293948825645, |
| "step": 564 |
| }, |
| { |
| "epoch": 2.003551215181445, |
| "grad_norm": 0.1310880482196808, |
| "learning_rate": 5.035030853608817e-06, |
| "loss": 0.4365, |
| "mean_token_accuracy": 0.031575486336805625, |
| "step": 565 |
| }, |
| { |
| "epoch": 2.00710243036289, |
| "grad_norm": 0.1138961985707283, |
| "learning_rate": 5.0210186222181515e-06, |
| "loss": 0.4157, |
| "mean_token_accuracy": 0.034992296999917016, |
| "step": 566 |
| }, |
| { |
| "epoch": 2.010653645544335, |
| "grad_norm": 0.11264721304178238, |
| "learning_rate": 5.007006225748238e-06, |
| "loss": 0.4476, |
| "mean_token_accuracy": 0.033530683980643516, |
| "step": 567 |
| }, |
| { |
| "epoch": 2.0142048607257794, |
| "grad_norm": 0.1077028214931488, |
| "learning_rate": 4.992993774251764e-06, |
| "loss": 0.4325, |
| "mean_token_accuracy": 0.03167944007873302, |
| "step": 568 |
| }, |
| { |
| "epoch": 2.0177560759072244, |
| "grad_norm": 0.1369977593421936, |
| "learning_rate": 4.97898137778185e-06, |
| "loss": 0.4407, |
| "mean_token_accuracy": 0.035672826332302066, |
| "step": 569 |
| }, |
| { |
| "epoch": 2.0213072910886694, |
| "grad_norm": 0.11429018527269363, |
| "learning_rate": 4.964969146391184e-06, |
| "loss": 0.43, |
| "mean_token_accuracy": 0.034091444191290066, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.0248585062701143, |
| "grad_norm": 0.1280803382396698, |
| "learning_rate": 4.950957190131157e-06, |
| "loss": 0.4254, |
| "mean_token_accuracy": 0.03439853444797336, |
| "step": 571 |
| }, |
| { |
| "epoch": 2.0284097214515593, |
| "grad_norm": 0.10655295848846436, |
| "learning_rate": 4.936945619050998e-06, |
| "loss": 0.4306, |
| "mean_token_accuracy": 0.031939568114466965, |
| "step": 572 |
| }, |
| { |
| "epoch": 2.0319609366330043, |
| "grad_norm": 0.13407614827156067, |
| "learning_rate": 4.922934543196912e-06, |
| "loss": 0.4496, |
| "mean_token_accuracy": 0.029332300946407486, |
| "step": 573 |
| }, |
| { |
| "epoch": 2.0355121518144492, |
| "grad_norm": 0.1120862066745758, |
| "learning_rate": 4.908924072611218e-06, |
| "loss": 0.4399, |
| "mean_token_accuracy": 0.030439690985076595, |
| "step": 574 |
| }, |
| { |
| "epoch": 2.0390633669958937, |
| "grad_norm": 0.12993109226226807, |
| "learning_rate": 4.894914317331471e-06, |
| "loss": 0.4638, |
| "mean_token_accuracy": 0.03251677861408098, |
| "step": 575 |
| }, |
| { |
| "epoch": 2.0426145821773387, |
| "grad_norm": 0.12582442164421082, |
| "learning_rate": 4.88090538738962e-06, |
| "loss": 0.4329, |
| "mean_token_accuracy": 0.027871505477378378, |
| "step": 576 |
| }, |
| { |
| "epoch": 2.0461657973587837, |
| "grad_norm": 0.11255156993865967, |
| "learning_rate": 4.866897392811127e-06, |
| "loss": 0.4614, |
| "mean_token_accuracy": 0.033189341596880695, |
| "step": 577 |
| }, |
| { |
| "epoch": 2.0497170125402286, |
| "grad_norm": 0.11850475519895554, |
| "learning_rate": 4.852890443614105e-06, |
| "loss": 0.4593, |
| "mean_token_accuracy": 0.03234072294435464, |
| "step": 578 |
| }, |
| { |
| "epoch": 2.0532682277216736, |
| "grad_norm": 0.13706207275390625, |
| "learning_rate": 4.838884649808458e-06, |
| "loss": 0.4689, |
| "mean_token_accuracy": 0.03188859073998174, |
| "step": 579 |
| }, |
| { |
| "epoch": 2.0568194429031186, |
| "grad_norm": 0.1146935224533081, |
| "learning_rate": 4.82488012139502e-06, |
| "loss": 0.4204, |
| "mean_token_accuracy": 0.033560063729964895, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.060370658084563, |
| "grad_norm": 0.09363409876823425, |
| "learning_rate": 4.810876968364679e-06, |
| "loss": 0.4026, |
| "mean_token_accuracy": 0.03469607957958942, |
| "step": 581 |
| }, |
| { |
| "epoch": 2.063921873266008, |
| "grad_norm": 0.12827607989311218, |
| "learning_rate": 4.796875300697532e-06, |
| "loss": 0.4439, |
| "mean_token_accuracy": 0.030342954167281277, |
| "step": 582 |
| }, |
| { |
| "epoch": 2.067473088447453, |
| "grad_norm": 0.1165771633386612, |
| "learning_rate": 4.782875228361994e-06, |
| "loss": 0.4475, |
| "mean_token_accuracy": 0.03175362127512926, |
| "step": 583 |
| }, |
| { |
| "epoch": 2.071024303628898, |
| "grad_norm": 0.12880617380142212, |
| "learning_rate": 4.7688768613139655e-06, |
| "loss": 0.4553, |
| "mean_token_accuracy": 0.028243271208339138, |
| "step": 584 |
| }, |
| { |
| "epoch": 2.074575518810343, |
| "grad_norm": 0.1199556365609169, |
| "learning_rate": 4.754880309495946e-06, |
| "loss": 0.4405, |
| "mean_token_accuracy": 0.03142109113832703, |
| "step": 585 |
| }, |
| { |
| "epoch": 2.078126733991788, |
| "grad_norm": 0.14064113795757294, |
| "learning_rate": 4.74088568283618e-06, |
| "loss": 0.447, |
| "mean_token_accuracy": 0.03218332341930363, |
| "step": 586 |
| }, |
| { |
| "epoch": 2.081677949173233, |
| "grad_norm": 0.10865464061498642, |
| "learning_rate": 4.726893091247792e-06, |
| "loss": 0.4187, |
| "mean_token_accuracy": 0.029737072265561437, |
| "step": 587 |
| }, |
| { |
| "epoch": 2.0852291643546774, |
| "grad_norm": 0.12148632109165192, |
| "learning_rate": 4.712902644627923e-06, |
| "loss": 0.4428, |
| "mean_token_accuracy": 0.03043746904222644, |
| "step": 588 |
| }, |
| { |
| "epoch": 2.0887803795361224, |
| "grad_norm": 0.13806842267513275, |
| "learning_rate": 4.698914452856866e-06, |
| "loss": 0.4767, |
| "mean_token_accuracy": 0.03084335032326635, |
| "step": 589 |
| }, |
| { |
| "epoch": 2.0923315947175674, |
| "grad_norm": 0.1365612894296646, |
| "learning_rate": 4.684928625797208e-06, |
| "loss": 0.4354, |
| "mean_token_accuracy": 0.03496370389621006, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.0958828098990123, |
| "grad_norm": 0.120542012155056, |
| "learning_rate": 4.6709452732929614e-06, |
| "loss": 0.4747, |
| "mean_token_accuracy": 0.03065626499665086, |
| "step": 591 |
| }, |
| { |
| "epoch": 2.0994340250804573, |
| "grad_norm": 0.09649121761322021, |
| "learning_rate": 4.656964505168703e-06, |
| "loss": 0.4051, |
| "mean_token_accuracy": 0.03408970690725255, |
| "step": 592 |
| }, |
| { |
| "epoch": 2.1029852402619023, |
| "grad_norm": 0.09775994718074799, |
| "learning_rate": 4.642986431228713e-06, |
| "loss": 0.4257, |
| "mean_token_accuracy": 0.032279426413879264, |
| "step": 593 |
| }, |
| { |
| "epoch": 2.1065364554433472, |
| "grad_norm": 0.10995833575725555, |
| "learning_rate": 4.629011161256114e-06, |
| "loss": 0.4247, |
| "mean_token_accuracy": 0.03125830645512906, |
| "step": 594 |
| }, |
| { |
| "epoch": 2.1100876706247917, |
| "grad_norm": 0.11631479859352112, |
| "learning_rate": 4.615038805011999e-06, |
| "loss": 0.443, |
| "mean_token_accuracy": 0.031506394774623914, |
| "step": 595 |
| }, |
| { |
| "epoch": 2.1136388858062367, |
| "grad_norm": 0.10540800541639328, |
| "learning_rate": 4.601069472234584e-06, |
| "loss": 0.4341, |
| "mean_token_accuracy": 0.03257749425392831, |
| "step": 596 |
| }, |
| { |
| "epoch": 2.1171901009876817, |
| "grad_norm": 0.11153507232666016, |
| "learning_rate": 4.587103272638339e-06, |
| "loss": 0.4463, |
| "mean_token_accuracy": 0.0294464887920185, |
| "step": 597 |
| }, |
| { |
| "epoch": 2.1207413161691266, |
| "grad_norm": 0.10988204926252365, |
| "learning_rate": 4.57314031591312e-06, |
| "loss": 0.4159, |
| "mean_token_accuracy": 0.03306772064024699, |
| "step": 598 |
| }, |
| { |
| "epoch": 2.1242925313505716, |
| "grad_norm": 0.11690136790275574, |
| "learning_rate": 4.559180711723318e-06, |
| "loss": 0.4417, |
| "mean_token_accuracy": 0.030245611327700317, |
| "step": 599 |
| }, |
| { |
| "epoch": 2.1278437465320166, |
| "grad_norm": 0.11184585839509964, |
| "learning_rate": 4.545224569706994e-06, |
| "loss": 0.4132, |
| "mean_token_accuracy": 0.03208141641152906, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1124, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 300, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.295286349900025e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
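The object above is the `trainer_state.json` that the Hugging Face `Trainer` writes alongside a checkpoint; its `log_history` list holds one record per logged step with `step`, `loss`, `learning_rate`, `mean_token_accuracy`, and `epoch`. A minimal sketch of reading that file back and plotting the loss curve is shown below; the path `checkpoint-600/trainer_state.json` is a hypothetical example, and matplotlib is assumed to be installed.

```python
import json
import matplotlib.pyplot as plt

# Hypothetical path: point this at the checkpoint directory that holds the state file.
with open("checkpoint-600/trainer_state.json") as f:
    state = json.load(f)

# Keep only records that actually carry a training loss (eval records may omit it).
history = [rec for rec in state["log_history"] if "loss" in rec]
steps = [rec["step"] for rec in history]
losses = [rec["loss"] for rec in history]

plt.plot(steps, losses, label="train loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")
```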