diff --git "a/train_cosine_ckpt600/trainer_state.json" "b/train_cosine_ckpt600/trainer_state.json" new file mode 100644--- /dev/null +++ "b/train_cosine_ckpt600/trainer_state.json" @@ -0,0 +1,4834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.134946176894906, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0035512151814449007, + "grad_norm": 3.746668577194214, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.03184996646086802, + "step": 1 + }, + { + "epoch": 0.007102430362889801, + "grad_norm": 4.603147029876709, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.03573758571292274, + "step": 2 + }, + { + "epoch": 0.010653645544334702, + "grad_norm": 3.065110206604004, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.031494880182435736, + "step": 3 + }, + { + "epoch": 0.014204860725779603, + "grad_norm": 2.540454864501953, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.033872520692966646, + "step": 4 + }, + { + "epoch": 0.017756075907224503, + "grad_norm": 3.2157845497131348, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.03215573047782527, + "step": 5 + }, + { + "epoch": 0.021307291088669404, + "grad_norm": 2.591473340988159, + "learning_rate": 8.571428571428571e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.031042124777741265, + "step": 6 + }, + { + "epoch": 0.024858506270114305, + "grad_norm": 5.040935516357422, + "learning_rate": 1e-05, + "loss": 0.3018, + "mean_token_accuracy": 0.029873713359847898, + "step": 7 + }, + { + "epoch": 0.028409721451559206, + "grad_norm": 3.1976966857910156, + "learning_rate": 9.999980365120307e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.029250323150336044, + "step": 8 + }, + { + "epoch": 0.0319609366330041, + "grad_norm": 2.1849372386932373, + "learning_rate": 9.999921460635436e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.02684653406322468, + "step": 9 + }, + { + "epoch": 0.03551215181444901, + "grad_norm": 1.1641992330551147, + "learning_rate": 9.999823287008022e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.030458620822173543, + "step": 10 + }, + { + "epoch": 0.039063366995893904, + "grad_norm": 1.8289661407470703, + "learning_rate": 9.999685845009114e-06, + "loss": 0.282, + "mean_token_accuracy": 0.03673582868941594, + "step": 11 + }, + { + "epoch": 0.04261458217733881, + "grad_norm": 2.047220468521118, + "learning_rate": 9.999509135718176e-06, + "loss": 0.285, + "mean_token_accuracy": 0.03312260610982776, + "step": 12 + }, + { + "epoch": 0.046165797358783706, + "grad_norm": 1.0920840501785278, + "learning_rate": 9.999293160523074e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.03169899153363076, + "step": 13 + }, + { + "epoch": 0.04971701254022861, + "grad_norm": 2.3721137046813965, + "learning_rate": 9.999037921120068e-06, + "loss": 0.2752, + "mean_token_accuracy": 0.02908840416057501, + "step": 14 + }, + { + "epoch": 0.05326822772167351, + "grad_norm": 1.869300127029419, + "learning_rate": 9.998743419513795e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.029675625406525796, + "step": 15 + }, + { + "epoch": 0.05681944290311841, + "grad_norm": 1.656387448310852, + "learning_rate": 9.998409658017256e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.028696611592749832, + "step": 16 + }, + { + "epoch": 0.06037065808456331, + "grad_norm": 1.5127822160720825, + "learning_rate": 9.998036639251798e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.03376526683859993, + "step": 17 + }, + { + "epoch": 0.0639218732660082, + "grad_norm": 1.3357048034667969, + "learning_rate": 9.997624366147094e-06, + "loss": 0.2692, + "mean_token_accuracy": 0.03486147220610292, + "step": 18 + }, + { + "epoch": 0.06747308844745312, + "grad_norm": 1.4938886165618896, + "learning_rate": 9.997172841941114e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.03322860744447098, + "step": 19 + }, + { + "epoch": 0.07102430362889801, + "grad_norm": 0.8506525158882141, + "learning_rate": 9.99668207018011e-06, + "loss": 0.2694, + "mean_token_accuracy": 0.032776252537587425, + "step": 20 + }, + { + "epoch": 0.07457551881034291, + "grad_norm": 0.9477233290672302, + "learning_rate": 9.996152054718579e-06, + "loss": 0.263, + "mean_token_accuracy": 0.03463000854389975, + "step": 21 + }, + { + "epoch": 0.07812673399178781, + "grad_norm": 1.0164031982421875, + "learning_rate": 9.995582799719237e-06, + "loss": 0.2696, + "mean_token_accuracy": 0.03465155349476845, + "step": 22 + }, + { + "epoch": 0.08167794917323272, + "grad_norm": 1.2138930559158325, + "learning_rate": 9.994974309652984e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.03345586970317527, + "step": 23 + }, + { + "epoch": 0.08522916435467762, + "grad_norm": 1.0064730644226074, + "learning_rate": 9.994326589298875e-06, + "loss": 0.2538, + "mean_token_accuracy": 0.03691909779263369, + "step": 24 + }, + { + "epoch": 0.08878037953612251, + "grad_norm": 1.5932868719100952, + "learning_rate": 9.993639643744071e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.03495999022561591, + "step": 25 + }, + { + "epoch": 0.09233159471756741, + "grad_norm": 1.5711798667907715, + "learning_rate": 9.99291347838381e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.03424531156633748, + "step": 26 + }, + { + "epoch": 0.09588280989901232, + "grad_norm": 1.2252583503723145, + "learning_rate": 9.992148098921361e-06, + "loss": 0.2445, + "mean_token_accuracy": 0.03593750138679752, + "step": 27 + }, + { + "epoch": 0.09943402508045722, + "grad_norm": 0.9910848140716553, + "learning_rate": 9.99134351136798e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.038970169473032, + "step": 28 + }, + { + "epoch": 0.10298524026190212, + "grad_norm": 0.8660365343093872, + "learning_rate": 9.990499722042852e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.03539915586588904, + "step": 29 + }, + { + "epoch": 0.10653645544334701, + "grad_norm": 1.146954894065857, + "learning_rate": 9.989616737573064e-06, + "loss": 0.266, + "mean_token_accuracy": 0.03540461422744556, + "step": 30 + }, + { + "epoch": 0.11008767062479193, + "grad_norm": 1.0442326068878174, + "learning_rate": 9.98869456489353e-06, + "loss": 0.263, + "mean_token_accuracy": 0.03115065989732102, + "step": 31 + }, + { + "epoch": 0.11363888580623682, + "grad_norm": 1.205544114112854, + "learning_rate": 9.987733211246952e-06, + "loss": 0.2488, + "mean_token_accuracy": 0.0319663421723817, + "step": 32 + }, + { + "epoch": 0.11719010098768172, + "grad_norm": 1.2089142799377441, + "learning_rate": 9.986732684183753e-06, + "loss": 0.2522, + "mean_token_accuracy": 0.03767831435834523, + "step": 33 + }, + { + "epoch": 0.12074131616912662, + "grad_norm": 1.5163798332214355, + "learning_rate": 9.985692991562026e-06, + "loss": 0.2507, + "mean_token_accuracy": 0.039727197097818134, + "step": 34 + }, + { + "epoch": 0.12429253135057153, + "grad_norm": 1.222414255142212, + "learning_rate": 9.984614141547468e-06, + "loss": 0.2449, + "mean_token_accuracy": 0.04050094210106181, + "step": 35 + }, + { + "epoch": 0.1278437465320164, + "grad_norm": 1.3800718784332275, + "learning_rate": 9.983496142613314e-06, + "loss": 0.254, + "mean_token_accuracy": 0.03262984177126782, + "step": 36 + }, + { + "epoch": 0.13139496171346132, + "grad_norm": 1.6306910514831543, + "learning_rate": 9.982339003540272e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.03810747154057026, + "step": 37 + }, + { + "epoch": 0.13494617689490623, + "grad_norm": 1.1124504804611206, + "learning_rate": 9.981142733416457e-06, + "loss": 0.258, + "mean_token_accuracy": 0.03563006369222421, + "step": 38 + }, + { + "epoch": 0.13849739207635112, + "grad_norm": 1.0881694555282593, + "learning_rate": 9.97990734163732e-06, + "loss": 0.2464, + "mean_token_accuracy": 0.0357487567125645, + "step": 39 + }, + { + "epoch": 0.14204860725779603, + "grad_norm": 1.3406707048416138, + "learning_rate": 9.978632837905566e-06, + "loss": 0.2492, + "mean_token_accuracy": 0.03550324517709669, + "step": 40 + }, + { + "epoch": 0.14559982243924094, + "grad_norm": 1.7592324018478394, + "learning_rate": 9.977319232231088e-06, + "loss": 0.2534, + "mean_token_accuracy": 0.03265573883254547, + "step": 41 + }, + { + "epoch": 0.14915103762068582, + "grad_norm": 0.9613096117973328, + "learning_rate": 9.975966534930879e-06, + "loss": 0.2496, + "mean_token_accuracy": 0.03335131827407167, + "step": 42 + }, + { + "epoch": 0.15270225280213073, + "grad_norm": 1.0681848526000977, + "learning_rate": 9.974574756628961e-06, + "loss": 0.2521, + "mean_token_accuracy": 0.033948773343581706, + "step": 43 + }, + { + "epoch": 0.15625346798357562, + "grad_norm": 2.593073844909668, + "learning_rate": 9.973143908256291e-06, + "loss": 0.2533, + "mean_token_accuracy": 0.03512008564939606, + "step": 44 + }, + { + "epoch": 0.15980468316502053, + "grad_norm": 1.112453579902649, + "learning_rate": 9.971674001050687e-06, + "loss": 0.2485, + "mean_token_accuracy": 0.038668419925670605, + "step": 45 + }, + { + "epoch": 0.16335589834646544, + "grad_norm": 1.1808503866195679, + "learning_rate": 9.970165046556726e-06, + "loss": 0.2422, + "mean_token_accuracy": 0.042593366157234414, + "step": 46 + }, + { + "epoch": 0.16690711352791032, + "grad_norm": 1.1615327596664429, + "learning_rate": 9.968617056625665e-06, + "loss": 0.2469, + "mean_token_accuracy": 0.03583648353378521, + "step": 47 + }, + { + "epoch": 0.17045832870935523, + "grad_norm": 0.9273613691329956, + "learning_rate": 9.967030043415345e-06, + "loss": 0.2494, + "mean_token_accuracy": 0.03705584820272634, + "step": 48 + }, + { + "epoch": 0.17400954389080014, + "grad_norm": 1.2233788967132568, + "learning_rate": 9.965404019390087e-06, + "loss": 0.2617, + "mean_token_accuracy": 0.03263758916364168, + "step": 49 + }, + { + "epoch": 0.17756075907224503, + "grad_norm": 1.7746872901916504, + "learning_rate": 9.963738997320609e-06, + "loss": 0.2565, + "mean_token_accuracy": 0.029688541861105477, + "step": 50 + }, + { + "epoch": 0.18111197425368994, + "grad_norm": 1.6324982643127441, + "learning_rate": 9.962034990283912e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.031018283694720594, + "step": 51 + }, + { + "epoch": 0.18466318943513482, + "grad_norm": 1.072430968284607, + "learning_rate": 9.960292011663186e-06, + "loss": 0.2416, + "mean_token_accuracy": 0.03291421085668844, + "step": 52 + }, + { + "epoch": 0.18821440461657973, + "grad_norm": 1.6084487438201904, + "learning_rate": 9.958510075147703e-06, + "loss": 0.2537, + "mean_token_accuracy": 0.03283511256086058, + "step": 53 + }, + { + "epoch": 0.19176561979802464, + "grad_norm": 3.659834623336792, + "learning_rate": 9.956689194732702e-06, + "loss": 0.2606, + "mean_token_accuracy": 0.03444362949267088, + "step": 54 + }, + { + "epoch": 0.19531683497946953, + "grad_norm": 1.3310655355453491, + "learning_rate": 9.954829384719296e-06, + "loss": 0.2436, + "mean_token_accuracy": 0.0340039685870579, + "step": 55 + }, + { + "epoch": 0.19886805016091444, + "grad_norm": 1.2658661603927612, + "learning_rate": 9.95293065971434e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.03461971837168676, + "step": 56 + }, + { + "epoch": 0.20241926534235935, + "grad_norm": 0.9548496007919312, + "learning_rate": 9.950993034630328e-06, + "loss": 0.2563, + "mean_token_accuracy": 0.03424741265553166, + "step": 57 + }, + { + "epoch": 0.20597048052380423, + "grad_norm": 1.5720734596252441, + "learning_rate": 9.949016524685277e-06, + "loss": 0.246, + "mean_token_accuracy": 0.03490980070273508, + "step": 58 + }, + { + "epoch": 0.20952169570524914, + "grad_norm": 1.0101721286773682, + "learning_rate": 9.947001145402598e-06, + "loss": 0.2424, + "mean_token_accuracy": 0.033455477951065404, + "step": 59 + }, + { + "epoch": 0.21307291088669403, + "grad_norm": 0.8857386112213135, + "learning_rate": 9.944946912610986e-06, + "loss": 0.2361, + "mean_token_accuracy": 0.03596452736383071, + "step": 60 + }, + { + "epoch": 0.21662412606813894, + "grad_norm": 1.0570611953735352, + "learning_rate": 9.942853842444283e-06, + "loss": 0.256, + "mean_token_accuracy": 0.03371396200964227, + "step": 61 + }, + { + "epoch": 0.22017534124958385, + "grad_norm": 0.9776428937911987, + "learning_rate": 9.940721951341365e-06, + "loss": 0.2467, + "mean_token_accuracy": 0.037202409857854946, + "step": 62 + }, + { + "epoch": 0.22372655643102873, + "grad_norm": 1.1859705448150635, + "learning_rate": 9.938551256046e-06, + "loss": 0.2418, + "mean_token_accuracy": 0.038250373654591385, + "step": 63 + }, + { + "epoch": 0.22727777161247364, + "grad_norm": 1.2014352083206177, + "learning_rate": 9.936341773606723e-06, + "loss": 0.2399, + "mean_token_accuracy": 0.03623262169639929, + "step": 64 + }, + { + "epoch": 0.23082898679391856, + "grad_norm": 1.0376017093658447, + "learning_rate": 9.934093521376707e-06, + "loss": 0.2344, + "mean_token_accuracy": 0.038730696996935876, + "step": 65 + }, + { + "epoch": 0.23438020197536344, + "grad_norm": 1.1257619857788086, + "learning_rate": 9.931806517013612e-06, + "loss": 0.2415, + "mean_token_accuracy": 0.03321881674128235, + "step": 66 + }, + { + "epoch": 0.23793141715680835, + "grad_norm": 1.2318155765533447, + "learning_rate": 9.929480778479465e-06, + "loss": 0.2437, + "mean_token_accuracy": 0.037453887685842346, + "step": 67 + }, + { + "epoch": 0.24148263233825323, + "grad_norm": 2.6596481800079346, + "learning_rate": 9.9271163240405e-06, + "loss": 0.2399, + "mean_token_accuracy": 0.03390174064406892, + "step": 68 + }, + { + "epoch": 0.24503384751969814, + "grad_norm": 1.1814781427383423, + "learning_rate": 9.92471317226703e-06, + "loss": 0.2411, + "mean_token_accuracy": 0.03444516302442935, + "step": 69 + }, + { + "epoch": 0.24858506270114306, + "grad_norm": 1.9260839223861694, + "learning_rate": 9.922271342033295e-06, + "loss": 0.232, + "mean_token_accuracy": 0.037160640149522806, + "step": 70 + }, + { + "epoch": 0.25213627788258797, + "grad_norm": 0.9885060787200928, + "learning_rate": 9.919790852517313e-06, + "loss": 0.2329, + "mean_token_accuracy": 0.03519195963963284, + "step": 71 + }, + { + "epoch": 0.2556874930640328, + "grad_norm": 1.4856972694396973, + "learning_rate": 9.917271723200725e-06, + "loss": 0.2477, + "mean_token_accuracy": 0.03523147122905357, + "step": 72 + }, + { + "epoch": 0.25923870824547773, + "grad_norm": 1.6875951290130615, + "learning_rate": 9.914713973868654e-06, + "loss": 0.2531, + "mean_token_accuracy": 0.034952133555634646, + "step": 73 + }, + { + "epoch": 0.26278992342692264, + "grad_norm": 1.6447116136550903, + "learning_rate": 9.91211762460954e-06, + "loss": 0.2591, + "mean_token_accuracy": 0.031627864758775104, + "step": 74 + }, + { + "epoch": 0.26634113860836756, + "grad_norm": 1.2278292179107666, + "learning_rate": 9.909482695814986e-06, + "loss": 0.2348, + "mean_token_accuracy": 0.034353334242041456, + "step": 75 + }, + { + "epoch": 0.26989235378981247, + "grad_norm": 1.049460530281067, + "learning_rate": 9.906809208179593e-06, + "loss": 0.2457, + "mean_token_accuracy": 0.038979591363386135, + "step": 76 + }, + { + "epoch": 0.2734435689712574, + "grad_norm": 1.2099536657333374, + "learning_rate": 9.904097182700806e-06, + "loss": 0.2524, + "mean_token_accuracy": 0.0339672717054782, + "step": 77 + }, + { + "epoch": 0.27699478415270223, + "grad_norm": 1.4347492456436157, + "learning_rate": 9.901346640678744e-06, + "loss": 0.2311, + "mean_token_accuracy": 0.03691132989479229, + "step": 78 + }, + { + "epoch": 0.28054599933414714, + "grad_norm": 1.3881317377090454, + "learning_rate": 9.898557603716031e-06, + "loss": 0.2574, + "mean_token_accuracy": 0.03597194645408308, + "step": 79 + }, + { + "epoch": 0.28409721451559206, + "grad_norm": 0.9869890809059143, + "learning_rate": 9.895730093717629e-06, + "loss": 0.2413, + "mean_token_accuracy": 0.034210870442620944, + "step": 80 + }, + { + "epoch": 0.28764842969703697, + "grad_norm": 1.0517185926437378, + "learning_rate": 9.892864132890663e-06, + "loss": 0.2441, + "mean_token_accuracy": 0.036712895660457434, + "step": 81 + }, + { + "epoch": 0.2911996448784819, + "grad_norm": 1.3158183097839355, + "learning_rate": 9.889959743744253e-06, + "loss": 0.2381, + "mean_token_accuracy": 0.035375316572753945, + "step": 82 + }, + { + "epoch": 0.29475086005992673, + "grad_norm": 1.3365075588226318, + "learning_rate": 9.887016949089334e-06, + "loss": 0.2514, + "mean_token_accuracy": 0.03394350171038241, + "step": 83 + }, + { + "epoch": 0.29830207524137164, + "grad_norm": 0.9649895429611206, + "learning_rate": 9.884035772038471e-06, + "loss": 0.236, + "mean_token_accuracy": 0.042694962285168, + "step": 84 + }, + { + "epoch": 0.30185329042281656, + "grad_norm": 1.5863405466079712, + "learning_rate": 9.881016236005686e-06, + "loss": 0.2345, + "mean_token_accuracy": 0.035104533468256705, + "step": 85 + }, + { + "epoch": 0.30540450560426147, + "grad_norm": 0.9601231217384338, + "learning_rate": 9.877958364706269e-06, + "loss": 0.2415, + "mean_token_accuracy": 0.03544433661591029, + "step": 86 + }, + { + "epoch": 0.3089557207857064, + "grad_norm": 1.2249877452850342, + "learning_rate": 9.874862182156596e-06, + "loss": 0.2467, + "mean_token_accuracy": 0.029483627313311445, + "step": 87 + }, + { + "epoch": 0.31250693596715123, + "grad_norm": 0.9038888216018677, + "learning_rate": 9.871727712673931e-06, + "loss": 0.2406, + "mean_token_accuracy": 0.03652138033430674, + "step": 88 + }, + { + "epoch": 0.31605815114859614, + "grad_norm": 1.1572030782699585, + "learning_rate": 9.868554980876253e-06, + "loss": 0.2352, + "mean_token_accuracy": 0.03487317850886029, + "step": 89 + }, + { + "epoch": 0.31960936633004106, + "grad_norm": 1.7031879425048828, + "learning_rate": 9.865344011682038e-06, + "loss": 0.2489, + "mean_token_accuracy": 0.03696917567867786, + "step": 90 + }, + { + "epoch": 0.32316058151148597, + "grad_norm": 1.187495470046997, + "learning_rate": 9.86209483031009e-06, + "loss": 0.244, + "mean_token_accuracy": 0.03272304959318717, + "step": 91 + }, + { + "epoch": 0.3267117966929309, + "grad_norm": 1.3152642250061035, + "learning_rate": 9.858807462279319e-06, + "loss": 0.2464, + "mean_token_accuracy": 0.03474508359613537, + "step": 92 + }, + { + "epoch": 0.3302630118743758, + "grad_norm": 0.9680661559104919, + "learning_rate": 9.855481933408557e-06, + "loss": 0.2378, + "mean_token_accuracy": 0.035624928095785435, + "step": 93 + }, + { + "epoch": 0.33381422705582064, + "grad_norm": 1.0342296361923218, + "learning_rate": 9.852118269816348e-06, + "loss": 0.242, + "mean_token_accuracy": 0.03397064344426326, + "step": 94 + }, + { + "epoch": 0.33736544223726556, + "grad_norm": 1.5446665287017822, + "learning_rate": 9.848716497920742e-06, + "loss": 0.2406, + "mean_token_accuracy": 0.03505690078418411, + "step": 95 + }, + { + "epoch": 0.34091665741871047, + "grad_norm": 1.498883843421936, + "learning_rate": 9.845276644439093e-06, + "loss": 0.2364, + "mean_token_accuracy": 0.036183488669848884, + "step": 96 + }, + { + "epoch": 0.3444678726001554, + "grad_norm": 1.212774395942688, + "learning_rate": 9.841798736387846e-06, + "loss": 0.2499, + "mean_token_accuracy": 0.03257646691054106, + "step": 97 + }, + { + "epoch": 0.3480190877816003, + "grad_norm": 0.9920580983161926, + "learning_rate": 9.838282801082322e-06, + "loss": 0.2363, + "mean_token_accuracy": 0.033475329148132005, + "step": 98 + }, + { + "epoch": 0.35157030296304514, + "grad_norm": 1.1953113079071045, + "learning_rate": 9.834728866136506e-06, + "loss": 0.2534, + "mean_token_accuracy": 0.03439514616547967, + "step": 99 + }, + { + "epoch": 0.35512151814449006, + "grad_norm": 1.2545762062072754, + "learning_rate": 9.831136959462835e-06, + "loss": 0.2405, + "mean_token_accuracy": 0.035495126408932265, + "step": 100 + }, + { + "epoch": 0.35867273332593497, + "grad_norm": 1.419901728630066, + "learning_rate": 9.82750710927197e-06, + "loss": 0.2283, + "mean_token_accuracy": 0.03381325537702651, + "step": 101 + }, + { + "epoch": 0.3622239485073799, + "grad_norm": 0.9885729551315308, + "learning_rate": 9.823839344072582e-06, + "loss": 0.2403, + "mean_token_accuracy": 0.03792576077466947, + "step": 102 + }, + { + "epoch": 0.3657751636888248, + "grad_norm": 0.9770134687423706, + "learning_rate": 9.820133692671116e-06, + "loss": 0.2382, + "mean_token_accuracy": 0.03301898992503993, + "step": 103 + }, + { + "epoch": 0.36932637887026964, + "grad_norm": 1.1737765073776245, + "learning_rate": 9.816390184171587e-06, + "loss": 0.2399, + "mean_token_accuracy": 0.03295345153310336, + "step": 104 + }, + { + "epoch": 0.37287759405171456, + "grad_norm": 1.1783053874969482, + "learning_rate": 9.812608847975327e-06, + "loss": 0.2271, + "mean_token_accuracy": 0.04089054596624919, + "step": 105 + }, + { + "epoch": 0.37642880923315947, + "grad_norm": 1.124003291130066, + "learning_rate": 9.808789713780768e-06, + "loss": 0.2473, + "mean_token_accuracy": 0.038133521022245986, + "step": 106 + }, + { + "epoch": 0.3799800244146044, + "grad_norm": 1.0809928178787231, + "learning_rate": 9.804932811583208e-06, + "loss": 0.2311, + "mean_token_accuracy": 0.037309447398001794, + "step": 107 + }, + { + "epoch": 0.3835312395960493, + "grad_norm": 1.0467100143432617, + "learning_rate": 9.801038171674571e-06, + "loss": 0.2353, + "mean_token_accuracy": 0.03867107529367786, + "step": 108 + }, + { + "epoch": 0.3870824547774942, + "grad_norm": 1.0050996541976929, + "learning_rate": 9.797105824643171e-06, + "loss": 0.2551, + "mean_token_accuracy": 0.030720586502866354, + "step": 109 + }, + { + "epoch": 0.39063366995893906, + "grad_norm": 0.9074941277503967, + "learning_rate": 9.793135801373472e-06, + "loss": 0.2436, + "mean_token_accuracy": 0.034242983878357336, + "step": 110 + }, + { + "epoch": 0.39418488514038397, + "grad_norm": 1.3338314294815063, + "learning_rate": 9.789128133045846e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.04001064739713911, + "step": 111 + }, + { + "epoch": 0.3977361003218289, + "grad_norm": 0.9372747540473938, + "learning_rate": 9.785082851136327e-06, + "loss": 0.23, + "mean_token_accuracy": 0.0371717325233476, + "step": 112 + }, + { + "epoch": 0.4012873155032738, + "grad_norm": 1.2198783159255981, + "learning_rate": 9.780999987416363e-06, + "loss": 0.247, + "mean_token_accuracy": 0.03400729462009622, + "step": 113 + }, + { + "epoch": 0.4048385306847187, + "grad_norm": 1.3612855672836304, + "learning_rate": 9.776879573952573e-06, + "loss": 0.2483, + "mean_token_accuracy": 0.037493000188987935, + "step": 114 + }, + { + "epoch": 0.40838974586616356, + "grad_norm": 1.3870880603790283, + "learning_rate": 9.772721643106483e-06, + "loss": 0.2424, + "mean_token_accuracy": 0.03547792824974749, + "step": 115 + }, + { + "epoch": 0.41194096104760847, + "grad_norm": 0.9454783201217651, + "learning_rate": 9.768526227534286e-06, + "loss": 0.2369, + "mean_token_accuracy": 0.034673961708904244, + "step": 116 + }, + { + "epoch": 0.4154921762290534, + "grad_norm": 1.094822645187378, + "learning_rate": 9.764293360186568e-06, + "loss": 0.2376, + "mean_token_accuracy": 0.03994236302605714, + "step": 117 + }, + { + "epoch": 0.4190433914104983, + "grad_norm": 1.2195683717727661, + "learning_rate": 9.760023074308067e-06, + "loss": 0.2481, + "mean_token_accuracy": 0.035971136057924014, + "step": 118 + }, + { + "epoch": 0.4225946065919432, + "grad_norm": 1.001287579536438, + "learning_rate": 9.755715403437405e-06, + "loss": 0.2396, + "mean_token_accuracy": 0.030299254751298577, + "step": 119 + }, + { + "epoch": 0.42614582177338806, + "grad_norm": 1.384993314743042, + "learning_rate": 9.75137038140682e-06, + "loss": 0.2349, + "mean_token_accuracy": 0.034674650321903755, + "step": 120 + }, + { + "epoch": 0.42969703695483297, + "grad_norm": 1.7163217067718506, + "learning_rate": 9.746988042341907e-06, + "loss": 0.2347, + "mean_token_accuracy": 0.0343936083263543, + "step": 121 + }, + { + "epoch": 0.4332482521362779, + "grad_norm": 1.5516914129257202, + "learning_rate": 9.742568420661347e-06, + "loss": 0.2314, + "mean_token_accuracy": 0.03660766014945693, + "step": 122 + }, + { + "epoch": 0.4367994673177228, + "grad_norm": 1.278739094734192, + "learning_rate": 9.738111551076633e-06, + "loss": 0.2428, + "mean_token_accuracy": 0.0372478184508509, + "step": 123 + }, + { + "epoch": 0.4403506824991677, + "grad_norm": 1.1330451965332031, + "learning_rate": 9.733617468591806e-06, + "loss": 0.2318, + "mean_token_accuracy": 0.03080255860913894, + "step": 124 + }, + { + "epoch": 0.4439018976806126, + "grad_norm": 1.3162201642990112, + "learning_rate": 9.729086208503174e-06, + "loss": 0.2396, + "mean_token_accuracy": 0.037999619937181706, + "step": 125 + }, + { + "epoch": 0.44745311286205747, + "grad_norm": 0.9050405621528625, + "learning_rate": 9.724517806399035e-06, + "loss": 0.2318, + "mean_token_accuracy": 0.03370005947726895, + "step": 126 + }, + { + "epoch": 0.4510043280435024, + "grad_norm": 0.9520599842071533, + "learning_rate": 9.7199122981594e-06, + "loss": 0.2492, + "mean_token_accuracy": 0.03589029877548455, + "step": 127 + }, + { + "epoch": 0.4545555432249473, + "grad_norm": 1.1406325101852417, + "learning_rate": 9.715269719955708e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.033397240742488066, + "step": 128 + }, + { + "epoch": 0.4581067584063922, + "grad_norm": 1.3370436429977417, + "learning_rate": 9.710590108250546e-06, + "loss": 0.2448, + "mean_token_accuracy": 0.03260400308499811, + "step": 129 + }, + { + "epoch": 0.4616579735878371, + "grad_norm": 1.1314901113510132, + "learning_rate": 9.705873499797358e-06, + "loss": 0.2361, + "mean_token_accuracy": 0.03638589083129773, + "step": 130 + }, + { + "epoch": 0.46520918876928197, + "grad_norm": 0.8222520351409912, + "learning_rate": 9.701119931640161e-06, + "loss": 0.2369, + "mean_token_accuracy": 0.039288555715756956, + "step": 131 + }, + { + "epoch": 0.4687604039507269, + "grad_norm": 1.8080474138259888, + "learning_rate": 9.69632944111325e-06, + "loss": 0.2457, + "mean_token_accuracy": 0.03426650948313181, + "step": 132 + }, + { + "epoch": 0.4723116191321718, + "grad_norm": 1.0059517621994019, + "learning_rate": 9.691502065840905e-06, + "loss": 0.2318, + "mean_token_accuracy": 0.041799060640187236, + "step": 133 + }, + { + "epoch": 0.4758628343136167, + "grad_norm": 1.5800079107284546, + "learning_rate": 9.686637843737104e-06, + "loss": 0.2453, + "mean_token_accuracy": 0.035510726302163675, + "step": 134 + }, + { + "epoch": 0.4794140494950616, + "grad_norm": 1.1752355098724365, + "learning_rate": 9.681736813005207e-06, + "loss": 0.243, + "mean_token_accuracy": 0.03367971369516454, + "step": 135 + }, + { + "epoch": 0.48296526467650647, + "grad_norm": 1.1899182796478271, + "learning_rate": 9.676799012137678e-06, + "loss": 0.2333, + "mean_token_accuracy": 0.03657108947663801, + "step": 136 + }, + { + "epoch": 0.4865164798579514, + "grad_norm": 0.9200431704521179, + "learning_rate": 9.671824479915768e-06, + "loss": 0.2228, + "mean_token_accuracy": 0.04016808481901535, + "step": 137 + }, + { + "epoch": 0.4900676950393963, + "grad_norm": 1.828884243965149, + "learning_rate": 9.666813255409212e-06, + "loss": 0.249, + "mean_token_accuracy": 0.03349095129669877, + "step": 138 + }, + { + "epoch": 0.4936189102208412, + "grad_norm": 0.9927622079849243, + "learning_rate": 9.661765377975924e-06, + "loss": 0.2457, + "mean_token_accuracy": 0.035240638426330406, + "step": 139 + }, + { + "epoch": 0.4971701254022861, + "grad_norm": 1.5192276239395142, + "learning_rate": 9.656680887261693e-06, + "loss": 0.2209, + "mean_token_accuracy": 0.034234478349389974, + "step": 140 + }, + { + "epoch": 0.500721340583731, + "grad_norm": 1.0839775800704956, + "learning_rate": 9.651559823199865e-06, + "loss": 0.2449, + "mean_token_accuracy": 0.03251597065900569, + "step": 141 + }, + { + "epoch": 0.5042725557651759, + "grad_norm": 1.23935866355896, + "learning_rate": 9.646402226011028e-06, + "loss": 0.2435, + "mean_token_accuracy": 0.03836617184606439, + "step": 142 + }, + { + "epoch": 0.5078237709466208, + "grad_norm": 1.1805026531219482, + "learning_rate": 9.641208136202705e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.037522376484048436, + "step": 143 + }, + { + "epoch": 0.5113749861280656, + "grad_norm": 1.4856340885162354, + "learning_rate": 9.635977594569025e-06, + "loss": 0.2464, + "mean_token_accuracy": 0.03469312738525332, + "step": 144 + }, + { + "epoch": 0.5149262013095106, + "grad_norm": 1.1366444826126099, + "learning_rate": 9.630710642190412e-06, + "loss": 0.2318, + "mean_token_accuracy": 0.03400819407397648, + "step": 145 + }, + { + "epoch": 0.5184774164909555, + "grad_norm": 1.1727653741836548, + "learning_rate": 9.625407320433257e-06, + "loss": 0.2378, + "mean_token_accuracy": 0.0386261100647971, + "step": 146 + }, + { + "epoch": 0.5220286316724004, + "grad_norm": 1.0548739433288574, + "learning_rate": 9.620067670949593e-06, + "loss": 0.2375, + "mean_token_accuracy": 0.03571181180086569, + "step": 147 + }, + { + "epoch": 0.5255798468538453, + "grad_norm": 1.2281569242477417, + "learning_rate": 9.614691735676768e-06, + "loss": 0.2358, + "mean_token_accuracy": 0.034898955855169334, + "step": 148 + }, + { + "epoch": 0.5291310620352903, + "grad_norm": 1.2215083837509155, + "learning_rate": 9.609279556837122e-06, + "loss": 0.2364, + "mean_token_accuracy": 0.03552666449104436, + "step": 149 + }, + { + "epoch": 0.5326822772167351, + "grad_norm": 1.1264266967773438, + "learning_rate": 9.603831176937645e-06, + "loss": 0.2429, + "mean_token_accuracy": 0.03438397626450751, + "step": 150 + }, + { + "epoch": 0.53623349239818, + "grad_norm": 0.9536157846450806, + "learning_rate": 9.598346638769653e-06, + "loss": 0.2457, + "mean_token_accuracy": 0.03396081876780954, + "step": 151 + }, + { + "epoch": 0.5397847075796249, + "grad_norm": 1.115391731262207, + "learning_rate": 9.592825985408443e-06, + "loss": 0.2324, + "mean_token_accuracy": 0.0359127135889139, + "step": 152 + }, + { + "epoch": 0.5433359227610698, + "grad_norm": 1.0178639888763428, + "learning_rate": 9.58726926021296e-06, + "loss": 0.2312, + "mean_token_accuracy": 0.0330092023978068, + "step": 153 + }, + { + "epoch": 0.5468871379425148, + "grad_norm": 1.2052620649337769, + "learning_rate": 9.581676506825458e-06, + "loss": 0.2263, + "mean_token_accuracy": 0.038123431153508136, + "step": 154 + }, + { + "epoch": 0.5504383531239596, + "grad_norm": 1.4705978631973267, + "learning_rate": 9.576047769171154e-06, + "loss": 0.2442, + "mean_token_accuracy": 0.038363178242434515, + "step": 155 + }, + { + "epoch": 0.5539895683054045, + "grad_norm": 1.043751835823059, + "learning_rate": 9.57038309145788e-06, + "loss": 0.2371, + "mean_token_accuracy": 0.03430595702957362, + "step": 156 + }, + { + "epoch": 0.5575407834868494, + "grad_norm": 1.1250089406967163, + "learning_rate": 9.564682518175745e-06, + "loss": 0.2289, + "mean_token_accuracy": 0.034604653304995736, + "step": 157 + }, + { + "epoch": 0.5610919986682943, + "grad_norm": 1.0135129690170288, + "learning_rate": 9.558946094096773e-06, + "loss": 0.2437, + "mean_token_accuracy": 0.03395201343300869, + "step": 158 + }, + { + "epoch": 0.5646432138497393, + "grad_norm": 1.4492766857147217, + "learning_rate": 9.553173864274567e-06, + "loss": 0.2324, + "mean_token_accuracy": 0.03644484080723487, + "step": 159 + }, + { + "epoch": 0.5681944290311841, + "grad_norm": 1.1282693147659302, + "learning_rate": 9.547365874043939e-06, + "loss": 0.2241, + "mean_token_accuracy": 0.03566149080506875, + "step": 160 + }, + { + "epoch": 0.571745644212629, + "grad_norm": 1.1843982934951782, + "learning_rate": 9.541522169020568e-06, + "loss": 0.2326, + "mean_token_accuracy": 0.03567118446153472, + "step": 161 + }, + { + "epoch": 0.5752968593940739, + "grad_norm": 0.8801372647285461, + "learning_rate": 9.535642795100628e-06, + "loss": 0.227, + "mean_token_accuracy": 0.03749654847706552, + "step": 162 + }, + { + "epoch": 0.5788480745755188, + "grad_norm": 1.0453648567199707, + "learning_rate": 9.529727798460443e-06, + "loss": 0.2282, + "mean_token_accuracy": 0.032709133800381096, + "step": 163 + }, + { + "epoch": 0.5823992897569638, + "grad_norm": 1.3325905799865723, + "learning_rate": 9.52377722555611e-06, + "loss": 0.2356, + "mean_token_accuracy": 0.035057135524766636, + "step": 164 + }, + { + "epoch": 0.5859505049384086, + "grad_norm": 1.281227707862854, + "learning_rate": 9.517791123123141e-06, + "loss": 0.2365, + "mean_token_accuracy": 0.036725746911542956, + "step": 165 + }, + { + "epoch": 0.5895017201198535, + "grad_norm": 1.3783758878707886, + "learning_rate": 9.5117695381761e-06, + "loss": 0.2363, + "mean_token_accuracy": 0.03276101666051545, + "step": 166 + }, + { + "epoch": 0.5930529353012984, + "grad_norm": 1.5050840377807617, + "learning_rate": 9.50571251800822e-06, + "loss": 0.2369, + "mean_token_accuracy": 0.03802061551141378, + "step": 167 + }, + { + "epoch": 0.5966041504827433, + "grad_norm": 0.8776186108589172, + "learning_rate": 9.49962011019105e-06, + "loss": 0.2455, + "mean_token_accuracy": 0.030048082109715324, + "step": 168 + }, + { + "epoch": 0.6001553656641883, + "grad_norm": 1.001824975013733, + "learning_rate": 9.493492362574069e-06, + "loss": 0.2132, + "mean_token_accuracy": 0.04021573403588263, + "step": 169 + }, + { + "epoch": 0.6037065808456331, + "grad_norm": 0.9855964779853821, + "learning_rate": 9.487329323284306e-06, + "loss": 0.2409, + "mean_token_accuracy": 0.03264193019276718, + "step": 170 + }, + { + "epoch": 0.607257796027078, + "grad_norm": 2.301527976989746, + "learning_rate": 9.481131040725982e-06, + "loss": 0.2431, + "mean_token_accuracy": 0.03716234745661495, + "step": 171 + }, + { + "epoch": 0.6108090112085229, + "grad_norm": 1.8789873123168945, + "learning_rate": 9.474897563580105e-06, + "loss": 0.2187, + "mean_token_accuracy": 0.03926722923642956, + "step": 172 + }, + { + "epoch": 0.6143602263899678, + "grad_norm": 1.0674188137054443, + "learning_rate": 9.468628940804109e-06, + "loss": 0.2238, + "mean_token_accuracy": 0.039616512553038774, + "step": 173 + }, + { + "epoch": 0.6179114415714128, + "grad_norm": 1.457808256149292, + "learning_rate": 9.46232522163145e-06, + "loss": 0.2439, + "mean_token_accuracy": 0.03321230758956517, + "step": 174 + }, + { + "epoch": 0.6214626567528576, + "grad_norm": 0.8303464651107788, + "learning_rate": 9.45598645557124e-06, + "loss": 0.2333, + "mean_token_accuracy": 0.035840667918819236, + "step": 175 + }, + { + "epoch": 0.6250138719343025, + "grad_norm": 3.895867347717285, + "learning_rate": 9.44961269240784e-06, + "loss": 0.2339, + "mean_token_accuracy": 0.03322799965098966, + "step": 176 + }, + { + "epoch": 0.6285650871157474, + "grad_norm": 1.2655450105667114, + "learning_rate": 9.443203982200479e-06, + "loss": 0.2463, + "mean_token_accuracy": 0.03559355030665756, + "step": 177 + }, + { + "epoch": 0.6321163022971923, + "grad_norm": 1.4873794317245483, + "learning_rate": 9.436760375282858e-06, + "loss": 0.222, + "mean_token_accuracy": 0.04104303981876001, + "step": 178 + }, + { + "epoch": 0.6356675174786373, + "grad_norm": 1.4663066864013672, + "learning_rate": 9.430281922262758e-06, + "loss": 0.2258, + "mean_token_accuracy": 0.03942336593900109, + "step": 179 + }, + { + "epoch": 0.6392187326600821, + "grad_norm": 0.9148339629173279, + "learning_rate": 9.423768674021638e-06, + "loss": 0.238, + "mean_token_accuracy": 0.03264097062674409, + "step": 180 + }, + { + "epoch": 0.642769947841527, + "grad_norm": 1.171152114868164, + "learning_rate": 9.417220681714232e-06, + "loss": 0.227, + "mean_token_accuracy": 0.034916177624836564, + "step": 181 + }, + { + "epoch": 0.6463211630229719, + "grad_norm": 1.0754197835922241, + "learning_rate": 9.410637996768161e-06, + "loss": 0.2239, + "mean_token_accuracy": 0.036934319799911464, + "step": 182 + }, + { + "epoch": 0.6498723782044168, + "grad_norm": 1.3153674602508545, + "learning_rate": 9.404020670883511e-06, + "loss": 0.2346, + "mean_token_accuracy": 0.03758588138043706, + "step": 183 + }, + { + "epoch": 0.6534235933858618, + "grad_norm": 1.2649421691894531, + "learning_rate": 9.397368756032445e-06, + "loss": 0.2282, + "mean_token_accuracy": 0.03252386118401773, + "step": 184 + }, + { + "epoch": 0.6569748085673066, + "grad_norm": 1.4130192995071411, + "learning_rate": 9.390682304458782e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.03749604838958476, + "step": 185 + }, + { + "epoch": 0.6605260237487516, + "grad_norm": 1.3785449266433716, + "learning_rate": 9.38396136867759e-06, + "loss": 0.2242, + "mean_token_accuracy": 0.037102947433595546, + "step": 186 + }, + { + "epoch": 0.6640772389301964, + "grad_norm": 1.6444662809371948, + "learning_rate": 9.377206001474773e-06, + "loss": 0.2333, + "mean_token_accuracy": 0.03256806965509895, + "step": 187 + }, + { + "epoch": 0.6676284541116413, + "grad_norm": 1.3745763301849365, + "learning_rate": 9.370416255906663e-06, + "loss": 0.2367, + "mean_token_accuracy": 0.034505165538575966, + "step": 188 + }, + { + "epoch": 0.6711796692930863, + "grad_norm": 0.9810280203819275, + "learning_rate": 9.363592185299593e-06, + "loss": 0.2341, + "mean_token_accuracy": 0.03995652306730335, + "step": 189 + }, + { + "epoch": 0.6747308844745311, + "grad_norm": 1.3432363271713257, + "learning_rate": 9.356733843249487e-06, + "loss": 0.2309, + "mean_token_accuracy": 0.030092548069660552, + "step": 190 + }, + { + "epoch": 0.6782820996559761, + "grad_norm": 2.273892641067505, + "learning_rate": 9.349841283621432e-06, + "loss": 0.241, + "mean_token_accuracy": 0.031039203957334394, + "step": 191 + }, + { + "epoch": 0.6818333148374209, + "grad_norm": 1.2541855573654175, + "learning_rate": 9.34291456054926e-06, + "loss": 0.2188, + "mean_token_accuracy": 0.039437778086721664, + "step": 192 + }, + { + "epoch": 0.6853845300188658, + "grad_norm": 1.4611351490020752, + "learning_rate": 9.33595372843512e-06, + "loss": 0.2245, + "mean_token_accuracy": 0.03319703893066617, + "step": 193 + }, + { + "epoch": 0.6889357452003108, + "grad_norm": 1.4521944522857666, + "learning_rate": 9.328958841949056e-06, + "loss": 0.2427, + "mean_token_accuracy": 0.03371610334215802, + "step": 194 + }, + { + "epoch": 0.6924869603817556, + "grad_norm": 1.4048359394073486, + "learning_rate": 9.321929956028565e-06, + "loss": 0.2379, + "mean_token_accuracy": 0.039065972556272754, + "step": 195 + }, + { + "epoch": 0.6960381755632006, + "grad_norm": 1.262329339981079, + "learning_rate": 9.31486712587818e-06, + "loss": 0.2395, + "mean_token_accuracy": 0.03516424228109827, + "step": 196 + }, + { + "epoch": 0.6995893907446454, + "grad_norm": 1.9205256700515747, + "learning_rate": 9.307770406969032e-06, + "loss": 0.235, + "mean_token_accuracy": 0.03766121077205753, + "step": 197 + }, + { + "epoch": 0.7031406059260903, + "grad_norm": 0.861643373966217, + "learning_rate": 9.300639855038405e-06, + "loss": 0.2333, + "mean_token_accuracy": 0.03458166824930231, + "step": 198 + }, + { + "epoch": 0.7066918211075353, + "grad_norm": 1.0324631929397583, + "learning_rate": 9.293475526089316e-06, + "loss": 0.2266, + "mean_token_accuracy": 0.0378209915907064, + "step": 199 + }, + { + "epoch": 0.7102430362889801, + "grad_norm": 1.0572841167449951, + "learning_rate": 9.286277476390056e-06, + "loss": 0.2228, + "mean_token_accuracy": 0.033504162300232565, + "step": 200 + }, + { + "epoch": 0.7137942514704251, + "grad_norm": 0.977691113948822, + "learning_rate": 9.279045762473764e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.03699978294389439, + "step": 201 + }, + { + "epoch": 0.7173454666518699, + "grad_norm": 1.1246163845062256, + "learning_rate": 9.27178044113797e-06, + "loss": 0.2289, + "mean_token_accuracy": 0.03518677899228351, + "step": 202 + }, + { + "epoch": 0.7208966818333148, + "grad_norm": 1.03395676612854, + "learning_rate": 9.264481569444157e-06, + "loss": 0.2276, + "mean_token_accuracy": 0.03365413462961442, + "step": 203 + }, + { + "epoch": 0.7244478970147598, + "grad_norm": 1.1664369106292725, + "learning_rate": 9.257149204717317e-06, + "loss": 0.2322, + "mean_token_accuracy": 0.03404544836485002, + "step": 204 + }, + { + "epoch": 0.7279991121962046, + "grad_norm": 1.9012341499328613, + "learning_rate": 9.249783404545488e-06, + "loss": 0.2295, + "mean_token_accuracy": 0.0391799282260763, + "step": 205 + }, + { + "epoch": 0.7315503273776496, + "grad_norm": 1.7467440366744995, + "learning_rate": 9.242384226779308e-06, + "loss": 0.2284, + "mean_token_accuracy": 0.03427470392125542, + "step": 206 + }, + { + "epoch": 0.7351015425590944, + "grad_norm": 1.0553667545318604, + "learning_rate": 9.234951729531564e-06, + "loss": 0.2225, + "mean_token_accuracy": 0.039189494265883695, + "step": 207 + }, + { + "epoch": 0.7386527577405393, + "grad_norm": 1.2300719022750854, + "learning_rate": 9.227485971176734e-06, + "loss": 0.2304, + "mean_token_accuracy": 0.04073228745619417, + "step": 208 + }, + { + "epoch": 0.7422039729219843, + "grad_norm": 1.1945102214813232, + "learning_rate": 9.219987010350522e-06, + "loss": 0.221, + "mean_token_accuracy": 0.038925422908505425, + "step": 209 + }, + { + "epoch": 0.7457551881034291, + "grad_norm": 0.7722426056861877, + "learning_rate": 9.212454905949406e-06, + "loss": 0.2224, + "mean_token_accuracy": 0.033903196033861605, + "step": 210 + }, + { + "epoch": 0.7493064032848741, + "grad_norm": 1.1328966617584229, + "learning_rate": 9.204889717130172e-06, + "loss": 0.2335, + "mean_token_accuracy": 0.03668181393004488, + "step": 211 + }, + { + "epoch": 0.7528576184663189, + "grad_norm": 1.0629628896713257, + "learning_rate": 9.197291503309448e-06, + "loss": 0.2243, + "mean_token_accuracy": 0.03883554994536098, + "step": 212 + }, + { + "epoch": 0.7564088336477638, + "grad_norm": 1.6294198036193848, + "learning_rate": 9.189660324163243e-06, + "loss": 0.2295, + "mean_token_accuracy": 0.03799285244895145, + "step": 213 + }, + { + "epoch": 0.7599600488292088, + "grad_norm": 0.9800763726234436, + "learning_rate": 9.181996239626468e-06, + "loss": 0.218, + "mean_token_accuracy": 0.03557873377940268, + "step": 214 + }, + { + "epoch": 0.7635112640106536, + "grad_norm": 1.125218391418457, + "learning_rate": 9.174299309892474e-06, + "loss": 0.2264, + "mean_token_accuracy": 0.037647852554073324, + "step": 215 + }, + { + "epoch": 0.7670624791920986, + "grad_norm": 1.0833550691604614, + "learning_rate": 9.166569595412576e-06, + "loss": 0.2169, + "mean_token_accuracy": 0.03527776500050095, + "step": 216 + }, + { + "epoch": 0.7706136943735434, + "grad_norm": 1.386415719985962, + "learning_rate": 9.158807156895581e-06, + "loss": 0.223, + "mean_token_accuracy": 0.03334747435292229, + "step": 217 + }, + { + "epoch": 0.7741649095549884, + "grad_norm": 1.3321865797042847, + "learning_rate": 9.151012055307308e-06, + "loss": 0.2239, + "mean_token_accuracy": 0.036874980345601216, + "step": 218 + }, + { + "epoch": 0.7777161247364333, + "grad_norm": 1.1025806665420532, + "learning_rate": 9.14318435187011e-06, + "loss": 0.2261, + "mean_token_accuracy": 0.03307490728911944, + "step": 219 + }, + { + "epoch": 0.7812673399178781, + "grad_norm": 1.6245715618133545, + "learning_rate": 9.135324108062391e-06, + "loss": 0.2085, + "mean_token_accuracy": 0.04187627647843328, + "step": 220 + }, + { + "epoch": 0.7848185550993231, + "grad_norm": 1.4246594905853271, + "learning_rate": 9.127431385618129e-06, + "loss": 0.2189, + "mean_token_accuracy": 0.040660028342244914, + "step": 221 + }, + { + "epoch": 0.7883697702807679, + "grad_norm": 1.2004705667495728, + "learning_rate": 9.119506246526386e-06, + "loss": 0.2248, + "mean_token_accuracy": 0.034995593878193176, + "step": 222 + }, + { + "epoch": 0.7919209854622129, + "grad_norm": 1.6502361297607422, + "learning_rate": 9.111548753030824e-06, + "loss": 0.223, + "mean_token_accuracy": 0.03552522065365338, + "step": 223 + }, + { + "epoch": 0.7954722006436578, + "grad_norm": 1.5170484781265259, + "learning_rate": 9.103558967629211e-06, + "loss": 0.2267, + "mean_token_accuracy": 0.037829035503818886, + "step": 224 + }, + { + "epoch": 0.7990234158251026, + "grad_norm": 1.460752010345459, + "learning_rate": 9.09553695307294e-06, + "loss": 0.234, + "mean_token_accuracy": 0.028729848068905994, + "step": 225 + }, + { + "epoch": 0.8025746310065476, + "grad_norm": 1.1285916566848755, + "learning_rate": 9.087482772366529e-06, + "loss": 0.2304, + "mean_token_accuracy": 0.033455276032327674, + "step": 226 + }, + { + "epoch": 0.8061258461879924, + "grad_norm": 0.9813740849494934, + "learning_rate": 9.07939648876712e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.03443802075344138, + "step": 227 + }, + { + "epoch": 0.8096770613694374, + "grad_norm": 0.9483611583709717, + "learning_rate": 9.071278165784001e-06, + "loss": 0.2283, + "mean_token_accuracy": 0.03545017533178907, + "step": 228 + }, + { + "epoch": 0.8132282765508823, + "grad_norm": 1.1032304763793945, + "learning_rate": 9.063127867178085e-06, + "loss": 0.2182, + "mean_token_accuracy": 0.04028806747373892, + "step": 229 + }, + { + "epoch": 0.8167794917323271, + "grad_norm": 2.941054344177246, + "learning_rate": 9.054945656961429e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.04179423921959824, + "step": 230 + }, + { + "epoch": 0.8203307069137721, + "grad_norm": 1.0875635147094727, + "learning_rate": 9.046731599396716e-06, + "loss": 0.2295, + "mean_token_accuracy": 0.034964146889251424, + "step": 231 + }, + { + "epoch": 0.8238819220952169, + "grad_norm": 1.1555942296981812, + "learning_rate": 9.03848575899676e-06, + "loss": 0.2269, + "mean_token_accuracy": 0.04241397403529845, + "step": 232 + }, + { + "epoch": 0.8274331372766619, + "grad_norm": 0.948806881904602, + "learning_rate": 9.030208200523994e-06, + "loss": 0.2202, + "mean_token_accuracy": 0.03350217042680015, + "step": 233 + }, + { + "epoch": 0.8309843524581068, + "grad_norm": 1.2038196325302124, + "learning_rate": 9.021898988989966e-06, + "loss": 0.2247, + "mean_token_accuracy": 0.03134763890193426, + "step": 234 + }, + { + "epoch": 0.8345355676395516, + "grad_norm": 1.6858941316604614, + "learning_rate": 9.013558189654819e-06, + "loss": 0.218, + "mean_token_accuracy": 0.03856555292077246, + "step": 235 + }, + { + "epoch": 0.8380867828209966, + "grad_norm": 1.1609561443328857, + "learning_rate": 9.005185868026793e-06, + "loss": 0.2169, + "mean_token_accuracy": 0.036304176566773094, + "step": 236 + }, + { + "epoch": 0.8416379980024414, + "grad_norm": 1.1071784496307373, + "learning_rate": 8.996782089861699e-06, + "loss": 0.2419, + "mean_token_accuracy": 0.032351685425965115, + "step": 237 + }, + { + "epoch": 0.8451892131838864, + "grad_norm": 1.0100809335708618, + "learning_rate": 8.988346921162407e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.03175320574155194, + "step": 238 + }, + { + "epoch": 0.8487404283653313, + "grad_norm": 1.4512320756912231, + "learning_rate": 8.979880428178323e-06, + "loss": 0.2249, + "mean_token_accuracy": 0.03225310727066244, + "step": 239 + }, + { + "epoch": 0.8522916435467761, + "grad_norm": 1.0108217000961304, + "learning_rate": 8.971382677404878e-06, + "loss": 0.233, + "mean_token_accuracy": 0.03236227970774053, + "step": 240 + }, + { + "epoch": 0.8558428587282211, + "grad_norm": 1.3838858604431152, + "learning_rate": 8.962853735582996e-06, + "loss": 0.2155, + "mean_token_accuracy": 0.036608970935049, + "step": 241 + }, + { + "epoch": 0.8593940739096659, + "grad_norm": 1.138848900794983, + "learning_rate": 8.95429366969858e-06, + "loss": 0.2345, + "mean_token_accuracy": 0.0361562248053815, + "step": 242 + }, + { + "epoch": 0.8629452890911109, + "grad_norm": 0.8619561791419983, + "learning_rate": 8.94570254698197e-06, + "loss": 0.2309, + "mean_token_accuracy": 0.04240272083552554, + "step": 243 + }, + { + "epoch": 0.8664965042725558, + "grad_norm": 1.1279878616333008, + "learning_rate": 8.93708043490743e-06, + "loss": 0.2304, + "mean_token_accuracy": 0.03780577134239138, + "step": 244 + }, + { + "epoch": 0.8700477194540006, + "grad_norm": 1.1739652156829834, + "learning_rate": 8.928427401192618e-06, + "loss": 0.2265, + "mean_token_accuracy": 0.037726832346379524, + "step": 245 + }, + { + "epoch": 0.8735989346354456, + "grad_norm": 0.7752535343170166, + "learning_rate": 8.919743513798044e-06, + "loss": 0.2236, + "mean_token_accuracy": 0.03597815223474754, + "step": 246 + }, + { + "epoch": 0.8771501498168904, + "grad_norm": 0.7733471989631653, + "learning_rate": 8.911028840926537e-06, + "loss": 0.2221, + "mean_token_accuracy": 0.04170313518625335, + "step": 247 + }, + { + "epoch": 0.8807013649983354, + "grad_norm": 0.7916595935821533, + "learning_rate": 8.902283451022725e-06, + "loss": 0.2138, + "mean_token_accuracy": 0.03986302729026647, + "step": 248 + }, + { + "epoch": 0.8842525801797803, + "grad_norm": 1.2641276121139526, + "learning_rate": 8.89350741277247e-06, + "loss": 0.2362, + "mean_token_accuracy": 0.03598766483992222, + "step": 249 + }, + { + "epoch": 0.8878037953612252, + "grad_norm": 1.6378071308135986, + "learning_rate": 8.884700795102365e-06, + "loss": 0.2299, + "mean_token_accuracy": 0.03333858217229135, + "step": 250 + }, + { + "epoch": 0.8913550105426701, + "grad_norm": 1.092628836631775, + "learning_rate": 8.875863667179155e-06, + "loss": 0.2276, + "mean_token_accuracy": 0.03222806086341734, + "step": 251 + }, + { + "epoch": 0.8949062257241149, + "grad_norm": 1.4616332054138184, + "learning_rate": 8.866996098409217e-06, + "loss": 0.2212, + "mean_token_accuracy": 0.0341962569837051, + "step": 252 + }, + { + "epoch": 0.8984574409055599, + "grad_norm": 1.8922557830810547, + "learning_rate": 8.858098158438013e-06, + "loss": 0.2308, + "mean_token_accuracy": 0.03757270462301676, + "step": 253 + }, + { + "epoch": 0.9020086560870048, + "grad_norm": 1.3642209768295288, + "learning_rate": 8.849169917149532e-06, + "loss": 0.2239, + "mean_token_accuracy": 0.03927645156363724, + "step": 254 + }, + { + "epoch": 0.9055598712684497, + "grad_norm": 1.0666165351867676, + "learning_rate": 8.840211444665754e-06, + "loss": 0.2302, + "mean_token_accuracy": 0.0400322419773147, + "step": 255 + }, + { + "epoch": 0.9091110864498946, + "grad_norm": 1.0763819217681885, + "learning_rate": 8.831222811346088e-06, + "loss": 0.2227, + "mean_token_accuracy": 0.038276460734778084, + "step": 256 + }, + { + "epoch": 0.9126623016313394, + "grad_norm": 1.3471585512161255, + "learning_rate": 8.822204087786831e-06, + "loss": 0.2316, + "mean_token_accuracy": 0.03097587551746983, + "step": 257 + }, + { + "epoch": 0.9162135168127844, + "grad_norm": 1.2004120349884033, + "learning_rate": 8.813155344820602e-06, + "loss": 0.2179, + "mean_token_accuracy": 0.03698654064646689, + "step": 258 + }, + { + "epoch": 0.9197647319942293, + "grad_norm": 1.486273169517517, + "learning_rate": 8.804076653515792e-06, + "loss": 0.2206, + "mean_token_accuracy": 0.03723590575464186, + "step": 259 + }, + { + "epoch": 0.9233159471756742, + "grad_norm": 0.8779009580612183, + "learning_rate": 8.794968085176006e-06, + "loss": 0.2253, + "mean_token_accuracy": 0.03663652328577882, + "step": 260 + }, + { + "epoch": 0.9268671623571191, + "grad_norm": 0.961213231086731, + "learning_rate": 8.785829711339502e-06, + "loss": 0.2193, + "mean_token_accuracy": 0.03452028890387737, + "step": 261 + }, + { + "epoch": 0.9304183775385639, + "grad_norm": 1.5822545289993286, + "learning_rate": 8.776661603778629e-06, + "loss": 0.2283, + "mean_token_accuracy": 0.03859327645113808, + "step": 262 + }, + { + "epoch": 0.9339695927200089, + "grad_norm": 1.3098887205123901, + "learning_rate": 8.767463834499261e-06, + "loss": 0.2355, + "mean_token_accuracy": 0.03154927755895187, + "step": 263 + }, + { + "epoch": 0.9375208079014538, + "grad_norm": 1.379447102546692, + "learning_rate": 8.758236475740236e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.03622293883745442, + "step": 264 + }, + { + "epoch": 0.9410720230828987, + "grad_norm": 1.2279735803604126, + "learning_rate": 8.748979599972787e-06, + "loss": 0.2245, + "mean_token_accuracy": 0.033197632066730876, + "step": 265 + }, + { + "epoch": 0.9446232382643436, + "grad_norm": 1.2326911687850952, + "learning_rate": 8.739693279899969e-06, + "loss": 0.2176, + "mean_token_accuracy": 0.03476043541013496, + "step": 266 + }, + { + "epoch": 0.9481744534457884, + "grad_norm": 0.9703882336616516, + "learning_rate": 8.730377588456092e-06, + "loss": 0.2202, + "mean_token_accuracy": 0.0382119021815015, + "step": 267 + }, + { + "epoch": 0.9517256686272334, + "grad_norm": 1.0774365663528442, + "learning_rate": 8.72103259880615e-06, + "loss": 0.2243, + "mean_token_accuracy": 0.03840548314838088, + "step": 268 + }, + { + "epoch": 0.9552768838086783, + "grad_norm": 1.117008924484253, + "learning_rate": 8.711658384345244e-06, + "loss": 0.225, + "mean_token_accuracy": 0.035315932909725234, + "step": 269 + }, + { + "epoch": 0.9588280989901232, + "grad_norm": 1.1434279680252075, + "learning_rate": 8.702255018698e-06, + "loss": 0.21, + "mean_token_accuracy": 0.03513298414691235, + "step": 270 + }, + { + "epoch": 0.9623793141715681, + "grad_norm": 0.7816430330276489, + "learning_rate": 8.692822575718e-06, + "loss": 0.2157, + "mean_token_accuracy": 0.03654791028384352, + "step": 271 + }, + { + "epoch": 0.9659305293530129, + "grad_norm": 1.0198558568954468, + "learning_rate": 8.683361129487198e-06, + "loss": 0.2165, + "mean_token_accuracy": 0.04054846076542162, + "step": 272 + }, + { + "epoch": 0.9694817445344579, + "grad_norm": 1.140480637550354, + "learning_rate": 8.673870754315336e-06, + "loss": 0.2168, + "mean_token_accuracy": 0.04233230032332358, + "step": 273 + }, + { + "epoch": 0.9730329597159028, + "grad_norm": 0.9938759803771973, + "learning_rate": 8.664351524739368e-06, + "loss": 0.2304, + "mean_token_accuracy": 0.03541681009664899, + "step": 274 + }, + { + "epoch": 0.9765841748973477, + "grad_norm": 0.8925390243530273, + "learning_rate": 8.65480351552286e-06, + "loss": 0.2295, + "mean_token_accuracy": 0.03779280128037499, + "step": 275 + }, + { + "epoch": 0.9801353900787926, + "grad_norm": 1.2885398864746094, + "learning_rate": 8.645226801655418e-06, + "loss": 0.2223, + "mean_token_accuracy": 0.03629426244151546, + "step": 276 + }, + { + "epoch": 0.9836866052602374, + "grad_norm": 0.9392145872116089, + "learning_rate": 8.635621458352094e-06, + "loss": 0.2313, + "mean_token_accuracy": 0.0358071523696708, + "step": 277 + }, + { + "epoch": 0.9872378204416824, + "grad_norm": 0.9267784357070923, + "learning_rate": 8.625987561052789e-06, + "loss": 0.2288, + "mean_token_accuracy": 0.034536936505901394, + "step": 278 + }, + { + "epoch": 0.9907890356231273, + "grad_norm": 1.0564697980880737, + "learning_rate": 8.616325185421673e-06, + "loss": 0.2313, + "mean_token_accuracy": 0.032076333489385433, + "step": 279 + }, + { + "epoch": 0.9943402508045722, + "grad_norm": 1.2704871892929077, + "learning_rate": 8.606634407346575e-06, + "loss": 0.2254, + "mean_token_accuracy": 0.03875074074676377, + "step": 280 + }, + { + "epoch": 0.9978914659860171, + "grad_norm": 0.9733898043632507, + "learning_rate": 8.596915302938403e-06, + "loss": 0.2252, + "mean_token_accuracy": 0.032296190700435545, + "step": 281 + }, + { + "epoch": 1.0, + "grad_norm": 0.6648575663566589, + "learning_rate": 8.587167948530533e-06, + "loss": 0.1356, + "mean_token_accuracy": 0.03019784249083482, + "step": 282 + }, + { + "epoch": 1.003551215181445, + "grad_norm": 1.3055585622787476, + "learning_rate": 8.577392420678217e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.03929255892217043, + "step": 283 + }, + { + "epoch": 1.0071024303628897, + "grad_norm": 1.0825082063674927, + "learning_rate": 8.567588796157983e-06, + "loss": 0.2265, + "mean_token_accuracy": 0.03449270429700846, + "step": 284 + }, + { + "epoch": 1.0106536455443347, + "grad_norm": 0.9831315279006958, + "learning_rate": 8.557757151967025e-06, + "loss": 0.2257, + "mean_token_accuracy": 0.03714761855371762, + "step": 285 + }, + { + "epoch": 1.0142048607257796, + "grad_norm": 0.7693557739257812, + "learning_rate": 8.547897565322601e-06, + "loss": 0.2208, + "mean_token_accuracy": 0.039464723708078964, + "step": 286 + }, + { + "epoch": 1.0177560759072246, + "grad_norm": 0.9719224572181702, + "learning_rate": 8.538010113661434e-06, + "loss": 0.215, + "mean_token_accuracy": 0.03782110561223817, + "step": 287 + }, + { + "epoch": 1.0213072910886694, + "grad_norm": 1.2199909687042236, + "learning_rate": 8.528094874639092e-06, + "loss": 0.2298, + "mean_token_accuracy": 0.03888447284771246, + "step": 288 + }, + { + "epoch": 1.0248585062701143, + "grad_norm": 1.0379016399383545, + "learning_rate": 8.518151926129384e-06, + "loss": 0.2048, + "mean_token_accuracy": 0.04360219682166644, + "step": 289 + }, + { + "epoch": 1.0284097214515593, + "grad_norm": 0.8994417786598206, + "learning_rate": 8.508181346223749e-06, + "loss": 0.227, + "mean_token_accuracy": 0.03488920487325231, + "step": 290 + }, + { + "epoch": 1.031960936633004, + "grad_norm": 0.9707528948783875, + "learning_rate": 8.498183213230646e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.04115383749740431, + "step": 291 + }, + { + "epoch": 1.035512151814449, + "grad_norm": 1.017004370689392, + "learning_rate": 8.488157605674924e-06, + "loss": 0.217, + "mean_token_accuracy": 0.035543966083423584, + "step": 292 + }, + { + "epoch": 1.039063366995894, + "grad_norm": 1.0191696882247925, + "learning_rate": 8.478104602297226e-06, + "loss": 0.2168, + "mean_token_accuracy": 0.03651970579812769, + "step": 293 + }, + { + "epoch": 1.0426145821773387, + "grad_norm": 1.1888155937194824, + "learning_rate": 8.468024282053357e-06, + "loss": 0.2232, + "mean_token_accuracy": 0.03674552895972738, + "step": 294 + }, + { + "epoch": 1.0461657973587837, + "grad_norm": 1.1544668674468994, + "learning_rate": 8.457916724113667e-06, + "loss": 0.2193, + "mean_token_accuracy": 0.035183957385015674, + "step": 295 + }, + { + "epoch": 1.0497170125402286, + "grad_norm": 1.1487054824829102, + "learning_rate": 8.447782007862427e-06, + "loss": 0.2127, + "mean_token_accuracy": 0.03971767070652277, + "step": 296 + }, + { + "epoch": 1.0532682277216736, + "grad_norm": 1.1387826204299927, + "learning_rate": 8.437620212897213e-06, + "loss": 0.2139, + "mean_token_accuracy": 0.03873823723915848, + "step": 297 + }, + { + "epoch": 1.0568194429031184, + "grad_norm": 1.2510346174240112, + "learning_rate": 8.427431419028273e-06, + "loss": 0.2265, + "mean_token_accuracy": 0.03466822681366466, + "step": 298 + }, + { + "epoch": 1.0603706580845633, + "grad_norm": 1.0624276399612427, + "learning_rate": 8.417215706277905e-06, + "loss": 0.2138, + "mean_token_accuracy": 0.0331218589690252, + "step": 299 + }, + { + "epoch": 1.0639218732660083, + "grad_norm": 1.1180115938186646, + "learning_rate": 8.406973154879826e-06, + "loss": 0.2237, + "mean_token_accuracy": 0.03226224229001673, + "step": 300 + }, + { + "epoch": 1.071024303628898, + "grad_norm": 0.9965167045593262, + "learning_rate": 8.396703845278537e-06, + "loss": 0.221, + "mean_token_accuracy": 0.03691619080200326, + "step": 301 + }, + { + "epoch": 1.074575518810343, + "grad_norm": 1.0591256618499756, + "learning_rate": 8.386407858128707e-06, + "loss": 0.2196, + "mean_token_accuracy": 0.03875752529347665, + "step": 302 + }, + { + "epoch": 1.0781267339917877, + "grad_norm": 1.100635051727295, + "learning_rate": 8.376085274294518e-06, + "loss": 0.2219, + "mean_token_accuracy": 0.037956459491397254, + "step": 303 + }, + { + "epoch": 1.0816779491732327, + "grad_norm": 0.9553704261779785, + "learning_rate": 8.365736174849053e-06, + "loss": 0.2171, + "mean_token_accuracy": 0.03450688702832849, + "step": 304 + }, + { + "epoch": 1.0852291643546776, + "grad_norm": 0.9865755438804626, + "learning_rate": 8.355360641073637e-06, + "loss": 0.2168, + "mean_token_accuracy": 0.036030067367391894, + "step": 305 + }, + { + "epoch": 1.0887803795361226, + "grad_norm": 1.0858834981918335, + "learning_rate": 8.344958754457214e-06, + "loss": 0.2256, + "mean_token_accuracy": 0.03380810684393509, + "step": 306 + }, + { + "epoch": 1.0923315947175674, + "grad_norm": 1.026892066001892, + "learning_rate": 8.3345305966957e-06, + "loss": 0.222, + "mean_token_accuracy": 0.03506634270888753, + "step": 307 + }, + { + "epoch": 1.0958828098990123, + "grad_norm": 0.98395836353302, + "learning_rate": 8.324076249691347e-06, + "loss": 0.2131, + "mean_token_accuracy": 0.037066298340505455, + "step": 308 + }, + { + "epoch": 1.0994340250804573, + "grad_norm": 0.8688275814056396, + "learning_rate": 8.31359579555209e-06, + "loss": 0.2087, + "mean_token_accuracy": 0.035992500863358146, + "step": 309 + }, + { + "epoch": 1.102985240261902, + "grad_norm": 7.995417594909668, + "learning_rate": 8.30308931659091e-06, + "loss": 0.2316, + "mean_token_accuracy": 0.03655194863677025, + "step": 310 + }, + { + "epoch": 1.106536455443347, + "grad_norm": 1.0539637804031372, + "learning_rate": 8.292556895325195e-06, + "loss": 0.227, + "mean_token_accuracy": 0.03512826969381422, + "step": 311 + }, + { + "epoch": 1.110087670624792, + "grad_norm": 0.8968519568443298, + "learning_rate": 8.281998614476066e-06, + "loss": 0.226, + "mean_token_accuracy": 0.037603993361699395, + "step": 312 + }, + { + "epoch": 1.1136388858062367, + "grad_norm": 0.9943398237228394, + "learning_rate": 8.271414556967758e-06, + "loss": 0.2156, + "mean_token_accuracy": 0.03420978486246895, + "step": 313 + }, + { + "epoch": 1.1171901009876817, + "grad_norm": 1.1365629434585571, + "learning_rate": 8.260804805926948e-06, + "loss": 0.2233, + "mean_token_accuracy": 0.03783213975839317, + "step": 314 + }, + { + "epoch": 1.1207413161691266, + "grad_norm": 1.178529977798462, + "learning_rate": 8.250169444682109e-06, + "loss": 0.2198, + "mean_token_accuracy": 0.03976841686562693, + "step": 315 + }, + { + "epoch": 1.1242925313505716, + "grad_norm": 1.0319217443466187, + "learning_rate": 8.239508556762857e-06, + "loss": 0.2241, + "mean_token_accuracy": 0.031951177821611054, + "step": 316 + }, + { + "epoch": 1.1278437465320164, + "grad_norm": 1.2369407415390015, + "learning_rate": 8.228822225899294e-06, + "loss": 0.212, + "mean_token_accuracy": 0.04304049965503509, + "step": 317 + }, + { + "epoch": 1.1313949617134613, + "grad_norm": 1.0376719236373901, + "learning_rate": 8.218110536021347e-06, + "loss": 0.2085, + "mean_token_accuracy": 0.040561975689342944, + "step": 318 + }, + { + "epoch": 1.1349461768949063, + "grad_norm": 1.4454623460769653, + "learning_rate": 8.207373571258113e-06, + "loss": 0.2171, + "mean_token_accuracy": 0.03455675131408498, + "step": 319 + }, + { + "epoch": 1.138497392076351, + "grad_norm": 1.2810287475585938, + "learning_rate": 8.196611415937196e-06, + "loss": 0.2207, + "mean_token_accuracy": 0.03659132699976908, + "step": 320 + }, + { + "epoch": 1.142048607257796, + "grad_norm": 0.9189555048942566, + "learning_rate": 8.18582415458405e-06, + "loss": 0.2257, + "mean_token_accuracy": 0.03751322147945757, + "step": 321 + }, + { + "epoch": 1.145599822439241, + "grad_norm": 1.264859914779663, + "learning_rate": 8.1750118719213e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.037348673798987875, + "step": 322 + }, + { + "epoch": 1.149151037620686, + "grad_norm": 0.9888535737991333, + "learning_rate": 8.164174652868097e-06, + "loss": 0.2255, + "mean_token_accuracy": 0.03571068402743549, + "step": 323 + }, + { + "epoch": 1.1527022528021307, + "grad_norm": 2.5238475799560547, + "learning_rate": 8.153312582539438e-06, + "loss": 0.2119, + "mean_token_accuracy": 0.038603254355621175, + "step": 324 + }, + { + "epoch": 1.1562534679835756, + "grad_norm": 0.9326993227005005, + "learning_rate": 8.142425746245503e-06, + "loss": 0.2127, + "mean_token_accuracy": 0.03710804141519475, + "step": 325 + }, + { + "epoch": 1.1598046831650206, + "grad_norm": 0.9224873185157776, + "learning_rate": 8.131514229490975e-06, + "loss": 0.2231, + "mean_token_accuracy": 0.041035250716959126, + "step": 326 + }, + { + "epoch": 1.1633558983464654, + "grad_norm": 1.1489871740341187, + "learning_rate": 8.120578117974388e-06, + "loss": 0.207, + "mean_token_accuracy": 0.035616049925010884, + "step": 327 + }, + { + "epoch": 1.1669071135279103, + "grad_norm": 1.3951830863952637, + "learning_rate": 8.109617497587429e-06, + "loss": 0.2138, + "mean_token_accuracy": 0.03988678594760131, + "step": 328 + }, + { + "epoch": 1.1704583287093553, + "grad_norm": 0.8216147422790527, + "learning_rate": 8.098632454414286e-06, + "loss": 0.2159, + "mean_token_accuracy": 0.03136701822586474, + "step": 329 + }, + { + "epoch": 1.1740095438908003, + "grad_norm": 0.9959815144538879, + "learning_rate": 8.08762307473096e-06, + "loss": 0.2064, + "mean_token_accuracy": 0.04014438956437516, + "step": 330 + }, + { + "epoch": 1.177560759072245, + "grad_norm": 1.0166196823120117, + "learning_rate": 8.07658944500459e-06, + "loss": 0.2289, + "mean_token_accuracy": 0.03964185667791753, + "step": 331 + }, + { + "epoch": 1.18111197425369, + "grad_norm": 1.4372248649597168, + "learning_rate": 8.065531651892771e-06, + "loss": 0.2184, + "mean_token_accuracy": 0.03303642533865059, + "step": 332 + }, + { + "epoch": 1.1846631894351347, + "grad_norm": 1.0342096090316772, + "learning_rate": 8.054449782242876e-06, + "loss": 0.2117, + "mean_token_accuracy": 0.03587262858854956, + "step": 333 + }, + { + "epoch": 1.1882144046165797, + "grad_norm": 1.0142632722854614, + "learning_rate": 8.043343923091382e-06, + "loss": 0.2144, + "mean_token_accuracy": 0.03768015895911958, + "step": 334 + }, + { + "epoch": 1.1917656197980246, + "grad_norm": 1.0247020721435547, + "learning_rate": 8.03221416166317e-06, + "loss": 0.2094, + "mean_token_accuracy": 0.04144499241374433, + "step": 335 + }, + { + "epoch": 1.1953168349794696, + "grad_norm": 1.1292544603347778, + "learning_rate": 8.021060585370845e-06, + "loss": 0.2211, + "mean_token_accuracy": 0.03466017847676994, + "step": 336 + }, + { + "epoch": 1.1988680501609144, + "grad_norm": 1.0083832740783691, + "learning_rate": 8.009883281814066e-06, + "loss": 0.2099, + "mean_token_accuracy": 0.04081952280466794, + "step": 337 + }, + { + "epoch": 1.2024192653423593, + "grad_norm": 1.3860927820205688, + "learning_rate": 7.998682338778834e-06, + "loss": 0.2076, + "mean_token_accuracy": 0.034925906780699734, + "step": 338 + }, + { + "epoch": 1.2059704805238043, + "grad_norm": 0.8742907643318176, + "learning_rate": 7.987457844236817e-06, + "loss": 0.2178, + "mean_token_accuracy": 0.03645704350856249, + "step": 339 + }, + { + "epoch": 1.209521695705249, + "grad_norm": 1.479430913925171, + "learning_rate": 7.976209886344654e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.03608882729895413, + "step": 340 + }, + { + "epoch": 1.213072910886694, + "grad_norm": 1.5018517971038818, + "learning_rate": 7.964938553443267e-06, + "loss": 0.2275, + "mean_token_accuracy": 0.03744373080553487, + "step": 341 + }, + { + "epoch": 1.216624126068139, + "grad_norm": 1.1072347164154053, + "learning_rate": 7.953643934057162e-06, + "loss": 0.2148, + "mean_token_accuracy": 0.038897628408449236, + "step": 342 + }, + { + "epoch": 1.220175341249584, + "grad_norm": 0.8880918622016907, + "learning_rate": 7.942326116893733e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.03384126560558798, + "step": 343 + }, + { + "epoch": 1.2237265564310287, + "grad_norm": 1.4236310720443726, + "learning_rate": 7.930985190842576e-06, + "loss": 0.2247, + "mean_token_accuracy": 0.038816192594822496, + "step": 344 + }, + { + "epoch": 1.2272777716124736, + "grad_norm": 1.3533358573913574, + "learning_rate": 7.919621244974773e-06, + "loss": 0.2154, + "mean_token_accuracy": 0.03848464526163298, + "step": 345 + }, + { + "epoch": 1.2308289867939186, + "grad_norm": 1.5561343431472778, + "learning_rate": 7.908234368542214e-06, + "loss": 0.1965, + "mean_token_accuracy": 0.04135380086154328, + "step": 346 + }, + { + "epoch": 1.2343802019753634, + "grad_norm": 1.2853002548217773, + "learning_rate": 7.896824650976873e-06, + "loss": 0.2127, + "mean_token_accuracy": 0.04092289448453812, + "step": 347 + }, + { + "epoch": 1.2379314171568083, + "grad_norm": 1.1815927028656006, + "learning_rate": 7.885392181890126e-06, + "loss": 0.2138, + "mean_token_accuracy": 0.03702772545875632, + "step": 348 + }, + { + "epoch": 1.2414826323382533, + "grad_norm": 2.602296829223633, + "learning_rate": 7.873937051072037e-06, + "loss": 0.2198, + "mean_token_accuracy": 0.03560452723831986, + "step": 349 + }, + { + "epoch": 1.2450338475196983, + "grad_norm": 1.52498197555542, + "learning_rate": 7.862459348490645e-06, + "loss": 0.2185, + "mean_token_accuracy": 0.0324377975302923, + "step": 350 + }, + { + "epoch": 1.248585062701143, + "grad_norm": 1.2384191751480103, + "learning_rate": 7.85095916429128e-06, + "loss": 0.2171, + "mean_token_accuracy": 0.03811432257498382, + "step": 351 + }, + { + "epoch": 1.252136277882588, + "grad_norm": 1.357321858406067, + "learning_rate": 7.839436588795834e-06, + "loss": 0.2034, + "mean_token_accuracy": 0.04051168984005926, + "step": 352 + }, + { + "epoch": 1.2556874930640327, + "grad_norm": 1.1218904256820679, + "learning_rate": 7.82789171250206e-06, + "loss": 0.2218, + "mean_token_accuracy": 0.032163290336029604, + "step": 353 + }, + { + "epoch": 1.2592387082454777, + "grad_norm": 1.573211669921875, + "learning_rate": 7.816324626082864e-06, + "loss": 0.2227, + "mean_token_accuracy": 0.04222477459552465, + "step": 354 + }, + { + "epoch": 1.2627899234269226, + "grad_norm": 1.5673032999038696, + "learning_rate": 7.804735420385578e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.035729966661165236, + "step": 355 + }, + { + "epoch": 1.2663411386083676, + "grad_norm": 3.8074378967285156, + "learning_rate": 7.793124186431271e-06, + "loss": 0.2082, + "mean_token_accuracy": 0.04122993563032651, + "step": 356 + }, + { + "epoch": 1.2698923537898126, + "grad_norm": 0.7542282938957214, + "learning_rate": 7.781491015414018e-06, + "loss": 0.2151, + "mean_token_accuracy": 0.03684526444703806, + "step": 357 + }, + { + "epoch": 1.2734435689712573, + "grad_norm": 1.1057549715042114, + "learning_rate": 7.769835998700182e-06, + "loss": 0.216, + "mean_token_accuracy": 0.03879151448927587, + "step": 358 + }, + { + "epoch": 1.2769947841527023, + "grad_norm": 1.0620713233947754, + "learning_rate": 7.758159227827701e-06, + "loss": 0.2275, + "mean_token_accuracy": 0.0345176875598554, + "step": 359 + }, + { + "epoch": 1.280545999334147, + "grad_norm": 0.9399253129959106, + "learning_rate": 7.746460794505375e-06, + "loss": 0.2181, + "mean_token_accuracy": 0.03365673908410827, + "step": 360 + }, + { + "epoch": 1.284097214515592, + "grad_norm": 0.9033060669898987, + "learning_rate": 7.734740790612137e-06, + "loss": 0.2096, + "mean_token_accuracy": 0.038896608664799714, + "step": 361 + }, + { + "epoch": 1.287648429697037, + "grad_norm": 1.473177433013916, + "learning_rate": 7.722999308196329e-06, + "loss": 0.2176, + "mean_token_accuracy": 0.036279349031246966, + "step": 362 + }, + { + "epoch": 1.291199644878482, + "grad_norm": 1.2070612907409668, + "learning_rate": 7.711236439474991e-06, + "loss": 0.2179, + "mean_token_accuracy": 0.04047916533454554, + "step": 363 + }, + { + "epoch": 1.2947508600599267, + "grad_norm": 1.7395224571228027, + "learning_rate": 7.69945227683313e-06, + "loss": 0.2305, + "mean_token_accuracy": 0.03877663587627467, + "step": 364 + }, + { + "epoch": 1.2983020752413716, + "grad_norm": 0.9686111211776733, + "learning_rate": 7.68764691282299e-06, + "loss": 0.2117, + "mean_token_accuracy": 0.04020940974805853, + "step": 365 + }, + { + "epoch": 1.3018532904228166, + "grad_norm": 5.303558349609375, + "learning_rate": 7.675820440163334e-06, + "loss": 0.214, + "mean_token_accuracy": 0.03582368233401212, + "step": 366 + }, + { + "epoch": 1.3054045056042614, + "grad_norm": 1.0394208431243896, + "learning_rate": 7.663972951738708e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.03519305539157358, + "step": 367 + }, + { + "epoch": 1.3089557207857063, + "grad_norm": 1.7033623456954956, + "learning_rate": 7.652104540598712e-06, + "loss": 0.2245, + "mean_token_accuracy": 0.032953111793176504, + "step": 368 + }, + { + "epoch": 1.3125069359671513, + "grad_norm": 1.0205411911010742, + "learning_rate": 7.640215299957283e-06, + "loss": 0.2089, + "mean_token_accuracy": 0.033462781550042564, + "step": 369 + }, + { + "epoch": 1.3160581511485963, + "grad_norm": 1.0846375226974487, + "learning_rate": 7.628305323191942e-06, + "loss": 0.2231, + "mean_token_accuracy": 0.03299743057868909, + "step": 370 + }, + { + "epoch": 1.319609366330041, + "grad_norm": 1.0049022436141968, + "learning_rate": 7.616374703843071e-06, + "loss": 0.2095, + "mean_token_accuracy": 0.03856489048484946, + "step": 371 + }, + { + "epoch": 1.323160581511486, + "grad_norm": 1.0001732110977173, + "learning_rate": 7.604423535613183e-06, + "loss": 0.2082, + "mean_token_accuracy": 0.038502528364915634, + "step": 372 + }, + { + "epoch": 1.326711796692931, + "grad_norm": 0.8959653377532959, + "learning_rate": 7.592451912366176e-06, + "loss": 0.2196, + "mean_token_accuracy": 0.03337086039027781, + "step": 373 + }, + { + "epoch": 1.3302630118743757, + "grad_norm": 0.7602843046188354, + "learning_rate": 7.580459928126607e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.0357423342011316, + "step": 374 + }, + { + "epoch": 1.3338142270558206, + "grad_norm": 0.9946548342704773, + "learning_rate": 7.568447677078937e-06, + "loss": 0.2133, + "mean_token_accuracy": 0.04242287410306744, + "step": 375 + }, + { + "epoch": 1.3373654422372656, + "grad_norm": 0.9074912071228027, + "learning_rate": 7.556415253566814e-06, + "loss": 0.2127, + "mean_token_accuracy": 0.03988895907787082, + "step": 376 + }, + { + "epoch": 1.3409166574187106, + "grad_norm": 0.9580668807029724, + "learning_rate": 7.544362752092309e-06, + "loss": 0.2192, + "mean_token_accuracy": 0.037165690009715036, + "step": 377 + }, + { + "epoch": 1.3444678726001553, + "grad_norm": 1.1636556386947632, + "learning_rate": 7.532290267315189e-06, + "loss": 0.2139, + "mean_token_accuracy": 0.03949245312833227, + "step": 378 + }, + { + "epoch": 1.3480190877816003, + "grad_norm": 1.179194450378418, + "learning_rate": 7.52019789405217e-06, + "loss": 0.2205, + "mean_token_accuracy": 0.04195636221629684, + "step": 379 + }, + { + "epoch": 1.351570302963045, + "grad_norm": 1.0651406049728394, + "learning_rate": 7.508085727276169e-06, + "loss": 0.2106, + "mean_token_accuracy": 0.035512080634362064, + "step": 380 + }, + { + "epoch": 1.35512151814449, + "grad_norm": 1.2458323240280151, + "learning_rate": 7.495953862115561e-06, + "loss": 0.202, + "mean_token_accuracy": 0.041346150457684416, + "step": 381 + }, + { + "epoch": 1.358672733325935, + "grad_norm": 1.0106011629104614, + "learning_rate": 7.483802393853431e-06, + "loss": 0.2243, + "mean_token_accuracy": 0.03703319465057575, + "step": 382 + }, + { + "epoch": 1.36222394850738, + "grad_norm": 0.9817686676979065, + "learning_rate": 7.471631417926826e-06, + "loss": 0.2095, + "mean_token_accuracy": 0.04350194331345847, + "step": 383 + }, + { + "epoch": 1.365775163688825, + "grad_norm": 0.8153394460678101, + "learning_rate": 7.459441029926006e-06, + "loss": 0.2142, + "mean_token_accuracy": 0.036771614526514895, + "step": 384 + }, + { + "epoch": 1.3693263788702696, + "grad_norm": 1.0360249280929565, + "learning_rate": 7.447231325593689e-06, + "loss": 0.2179, + "mean_token_accuracy": 0.04122182544961106, + "step": 385 + }, + { + "epoch": 1.3728775940517146, + "grad_norm": 1.5009338855743408, + "learning_rate": 7.435002400824309e-06, + "loss": 0.2055, + "mean_token_accuracy": 0.03624642355134711, + "step": 386 + }, + { + "epoch": 1.3764288092331594, + "grad_norm": 1.2901028394699097, + "learning_rate": 7.422754351663252e-06, + "loss": 0.2141, + "mean_token_accuracy": 0.03373757504050445, + "step": 387 + }, + { + "epoch": 1.3799800244146043, + "grad_norm": 0.9240864515304565, + "learning_rate": 7.410487274306104e-06, + "loss": 0.2137, + "mean_token_accuracy": 0.037080507419887, + "step": 388 + }, + { + "epoch": 1.3835312395960493, + "grad_norm": 0.9881526827812195, + "learning_rate": 7.398201265097902e-06, + "loss": 0.2093, + "mean_token_accuracy": 0.038321658816130366, + "step": 389 + }, + { + "epoch": 1.3870824547774943, + "grad_norm": 1.0277929306030273, + "learning_rate": 7.385896420532372e-06, + "loss": 0.2107, + "mean_token_accuracy": 0.03264967034010624, + "step": 390 + }, + { + "epoch": 1.390633669958939, + "grad_norm": 1.092461347579956, + "learning_rate": 7.37357283725117e-06, + "loss": 0.2179, + "mean_token_accuracy": 0.041954617630835855, + "step": 391 + }, + { + "epoch": 1.394184885140384, + "grad_norm": 0.9751242399215698, + "learning_rate": 7.361230612043125e-06, + "loss": 0.212, + "mean_token_accuracy": 0.03894812406542769, + "step": 392 + }, + { + "epoch": 1.397736100321829, + "grad_norm": 1.890830159187317, + "learning_rate": 7.3488698418434824e-06, + "loss": 0.2172, + "mean_token_accuracy": 0.03717777330894023, + "step": 393 + }, + { + "epoch": 1.4012873155032737, + "grad_norm": 0.8804880380630493, + "learning_rate": 7.3364906237331345e-06, + "loss": 0.2204, + "mean_token_accuracy": 0.04008191364482627, + "step": 394 + }, + { + "epoch": 1.4048385306847186, + "grad_norm": 0.8936333060264587, + "learning_rate": 7.324093054937864e-06, + "loss": 0.2257, + "mean_token_accuracy": 0.03301443805685267, + "step": 395 + }, + { + "epoch": 1.4083897458661636, + "grad_norm": 1.1294957399368286, + "learning_rate": 7.311677232827583e-06, + "loss": 0.2233, + "mean_token_accuracy": 0.04345220190225518, + "step": 396 + }, + { + "epoch": 1.4119409610476086, + "grad_norm": 0.895491898059845, + "learning_rate": 7.299243254915558e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.04362610646057874, + "step": 397 + }, + { + "epoch": 1.4154921762290533, + "grad_norm": 0.995071291923523, + "learning_rate": 7.286791218857654e-06, + "loss": 0.2061, + "mean_token_accuracy": 0.04004020981301437, + "step": 398 + }, + { + "epoch": 1.4190433914104983, + "grad_norm": 1.0868768692016602, + "learning_rate": 7.274321222451561e-06, + "loss": 0.2202, + "mean_token_accuracy": 0.03820264393652906, + "step": 399 + }, + { + "epoch": 1.4225946065919433, + "grad_norm": 0.8654125928878784, + "learning_rate": 7.261833363636036e-06, + "loss": 0.212, + "mean_token_accuracy": 0.03712031566828955, + "step": 400 + }, + { + "epoch": 1.426145821773388, + "grad_norm": 0.8807328939437866, + "learning_rate": 7.249327740490114e-06, + "loss": 0.2177, + "mean_token_accuracy": 0.03679542864483665, + "step": 401 + }, + { + "epoch": 1.429697036954833, + "grad_norm": 1.5379787683486938, + "learning_rate": 7.236804451232364e-06, + "loss": 0.2246, + "mean_token_accuracy": 0.03669572240141861, + "step": 402 + }, + { + "epoch": 1.433248252136278, + "grad_norm": 1.0445082187652588, + "learning_rate": 7.224263594220093e-06, + "loss": 0.2067, + "mean_token_accuracy": 0.03605292076827027, + "step": 403 + }, + { + "epoch": 1.436799467317723, + "grad_norm": 1.1058682203292847, + "learning_rate": 7.211705267948592e-06, + "loss": 0.2292, + "mean_token_accuracy": 0.036163140674034366, + "step": 404 + }, + { + "epoch": 1.4403506824991676, + "grad_norm": 0.9819757342338562, + "learning_rate": 7.199129571050345e-06, + "loss": 0.2037, + "mean_token_accuracy": 0.0389055151536013, + "step": 405 + }, + { + "epoch": 1.4439018976806126, + "grad_norm": 1.1496120691299438, + "learning_rate": 7.186536602294278e-06, + "loss": 0.2129, + "mean_token_accuracy": 0.03939257287129294, + "step": 406 + }, + { + "epoch": 1.4474531128620574, + "grad_norm": 0.9700692892074585, + "learning_rate": 7.173926460584956e-06, + "loss": 0.2229, + "mean_token_accuracy": 0.034446998419298325, + "step": 407 + }, + { + "epoch": 1.4510043280435023, + "grad_norm": 1.0145217180252075, + "learning_rate": 7.161299244961828e-06, + "loss": 0.1973, + "mean_token_accuracy": 0.03743944912821462, + "step": 408 + }, + { + "epoch": 1.4545555432249473, + "grad_norm": 4.707770347595215, + "learning_rate": 7.148655054598436e-06, + "loss": 0.2118, + "mean_token_accuracy": 0.03483109064109158, + "step": 409 + }, + { + "epoch": 1.4581067584063923, + "grad_norm": 1.4786440134048462, + "learning_rate": 7.135993988801644e-06, + "loss": 0.2141, + "mean_token_accuracy": 0.038917855665204115, + "step": 410 + }, + { + "epoch": 1.4616579735878372, + "grad_norm": 1.013339638710022, + "learning_rate": 7.1233161470108525e-06, + "loss": 0.2192, + "mean_token_accuracy": 0.033355345083691645, + "step": 411 + }, + { + "epoch": 1.465209188769282, + "grad_norm": 1.132614016532898, + "learning_rate": 7.110621628797222e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.03544226154917851, + "step": 412 + }, + { + "epoch": 1.468760403950727, + "grad_norm": 0.8791604042053223, + "learning_rate": 7.097910533862886e-06, + "loss": 0.2229, + "mean_token_accuracy": 0.038792295250459574, + "step": 413 + }, + { + "epoch": 1.4723116191321717, + "grad_norm": 0.8531237840652466, + "learning_rate": 7.085182962040173e-06, + "loss": 0.2097, + "mean_token_accuracy": 0.04108742300013546, + "step": 414 + }, + { + "epoch": 1.4758628343136166, + "grad_norm": 0.8082413673400879, + "learning_rate": 7.072439013290824e-06, + "loss": 0.2204, + "mean_token_accuracy": 0.03489942911255639, + "step": 415 + }, + { + "epoch": 1.4794140494950616, + "grad_norm": 0.8210154175758362, + "learning_rate": 7.059678787705191e-06, + "loss": 0.2165, + "mean_token_accuracy": 0.03975592031565611, + "step": 416 + }, + { + "epoch": 1.4829652646765066, + "grad_norm": 0.9179903268814087, + "learning_rate": 7.046902385501477e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.03542604565154761, + "step": 417 + }, + { + "epoch": 1.4865164798579513, + "grad_norm": 0.9118829965591431, + "learning_rate": 7.03410990702493e-06, + "loss": 0.2156, + "mean_token_accuracy": 0.036973950522224186, + "step": 418 + }, + { + "epoch": 1.4900676950393963, + "grad_norm": 0.9617339372634888, + "learning_rate": 7.02130145274706e-06, + "loss": 0.2196, + "mean_token_accuracy": 0.03498687346291263, + "step": 419 + }, + { + "epoch": 1.4936189102208413, + "grad_norm": 1.2348254919052124, + "learning_rate": 7.008477123264849e-06, + "loss": 0.2227, + "mean_token_accuracy": 0.035096731524390634, + "step": 420 + }, + { + "epoch": 1.497170125402286, + "grad_norm": 1.189163327217102, + "learning_rate": 6.995637019299963e-06, + "loss": 0.219, + "mean_token_accuracy": 0.039271164114325074, + "step": 421 + }, + { + "epoch": 1.500721340583731, + "grad_norm": 1.7504018545150757, + "learning_rate": 6.982781241697963e-06, + "loss": 0.2074, + "mean_token_accuracy": 0.03581648114777636, + "step": 422 + }, + { + "epoch": 1.504272555765176, + "grad_norm": 0.9267095327377319, + "learning_rate": 6.969909891427509e-06, + "loss": 0.2035, + "mean_token_accuracy": 0.04157684100209735, + "step": 423 + }, + { + "epoch": 1.507823770946621, + "grad_norm": 0.8455734848976135, + "learning_rate": 6.957023069579561e-06, + "loss": 0.2127, + "mean_token_accuracy": 0.04015160134440521, + "step": 424 + }, + { + "epoch": 1.5113749861280656, + "grad_norm": 0.744219958782196, + "learning_rate": 6.944120877366605e-06, + "loss": 0.1999, + "mean_token_accuracy": 0.040050789906672435, + "step": 425 + }, + { + "epoch": 1.5149262013095106, + "grad_norm": 1.2124446630477905, + "learning_rate": 6.931203416121831e-06, + "loss": 0.2172, + "mean_token_accuracy": 0.03541620170290116, + "step": 426 + }, + { + "epoch": 1.5184774164909554, + "grad_norm": 0.9840066432952881, + "learning_rate": 6.918270787298361e-06, + "loss": 0.2214, + "mean_token_accuracy": 0.03850063406935078, + "step": 427 + }, + { + "epoch": 1.5220286316724003, + "grad_norm": 0.9824987649917603, + "learning_rate": 6.90532309246844e-06, + "loss": 0.1999, + "mean_token_accuracy": 0.04028791991004255, + "step": 428 + }, + { + "epoch": 1.5255798468538453, + "grad_norm": 0.895193874835968, + "learning_rate": 6.89236043332264e-06, + "loss": 0.2085, + "mean_token_accuracy": 0.03992676907000714, + "step": 429 + }, + { + "epoch": 1.5291310620352903, + "grad_norm": 0.7865180969238281, + "learning_rate": 6.87938291166906e-06, + "loss": 0.2103, + "mean_token_accuracy": 0.04152852020342834, + "step": 430 + }, + { + "epoch": 1.5326822772167352, + "grad_norm": 1.021116018295288, + "learning_rate": 6.866390629432533e-06, + "loss": 0.227, + "mean_token_accuracy": 0.035348424105904996, + "step": 431 + }, + { + "epoch": 1.53623349239818, + "grad_norm": 1.3936994075775146, + "learning_rate": 6.8533836886538175e-06, + "loss": 0.2196, + "mean_token_accuracy": 0.038212617593671894, + "step": 432 + }, + { + "epoch": 1.539784707579625, + "grad_norm": 0.9515417814254761, + "learning_rate": 6.840362191488801e-06, + "loss": 0.2171, + "mean_token_accuracy": 0.037786570228490746, + "step": 433 + }, + { + "epoch": 1.5433359227610697, + "grad_norm": 1.0766609907150269, + "learning_rate": 6.8273262402076935e-06, + "loss": 0.2167, + "mean_token_accuracy": 0.033810261673352215, + "step": 434 + }, + { + "epoch": 1.5468871379425146, + "grad_norm": 0.9607550501823425, + "learning_rate": 6.814275937194233e-06, + "loss": 0.2182, + "mean_token_accuracy": 0.03374743430686067, + "step": 435 + }, + { + "epoch": 1.5504383531239596, + "grad_norm": 0.8036266565322876, + "learning_rate": 6.801211384944867e-06, + "loss": 0.2058, + "mean_token_accuracy": 0.04227837919461308, + "step": 436 + }, + { + "epoch": 1.5539895683054046, + "grad_norm": 1.0911484956741333, + "learning_rate": 6.788132686067963e-06, + "loss": 0.2087, + "mean_token_accuracy": 0.03773209520295495, + "step": 437 + }, + { + "epoch": 1.5575407834868495, + "grad_norm": 0.9918379187583923, + "learning_rate": 6.77503994328299e-06, + "loss": 0.2111, + "mean_token_accuracy": 0.03738878494914388, + "step": 438 + }, + { + "epoch": 1.5610919986682943, + "grad_norm": 0.8218954205513, + "learning_rate": 6.761933259419725e-06, + "loss": 0.2112, + "mean_token_accuracy": 0.03775092940122704, + "step": 439 + }, + { + "epoch": 1.5646432138497393, + "grad_norm": 0.9518681168556213, + "learning_rate": 6.748812737417428e-06, + "loss": 0.2242, + "mean_token_accuracy": 0.03522239228550461, + "step": 440 + }, + { + "epoch": 1.568194429031184, + "grad_norm": 1.0347586870193481, + "learning_rate": 6.7356784803240464e-06, + "loss": 0.2118, + "mean_token_accuracy": 0.03939944650483085, + "step": 441 + }, + { + "epoch": 1.571745644212629, + "grad_norm": 0.9376158714294434, + "learning_rate": 6.722530591295406e-06, + "loss": 0.2172, + "mean_token_accuracy": 0.03799405520840082, + "step": 442 + }, + { + "epoch": 1.575296859394074, + "grad_norm": 0.8542515635490417, + "learning_rate": 6.709369173594396e-06, + "loss": 0.2127, + "mean_token_accuracy": 0.04009221462183632, + "step": 443 + }, + { + "epoch": 1.578848074575519, + "grad_norm": 0.9119886159896851, + "learning_rate": 6.6961943305901515e-06, + "loss": 0.2111, + "mean_token_accuracy": 0.035265431808511494, + "step": 444 + }, + { + "epoch": 1.5823992897569639, + "grad_norm": 1.0846859216690063, + "learning_rate": 6.683006165757262e-06, + "loss": 0.2119, + "mean_token_accuracy": 0.03483350280839659, + "step": 445 + }, + { + "epoch": 1.5859505049384086, + "grad_norm": 0.9828306436538696, + "learning_rate": 6.669804782674937e-06, + "loss": 0.209, + "mean_token_accuracy": 0.039095990352507215, + "step": 446 + }, + { + "epoch": 1.5895017201198534, + "grad_norm": 0.9361124634742737, + "learning_rate": 6.656590285026203e-06, + "loss": 0.23, + "mean_token_accuracy": 0.03466227107855957, + "step": 447 + }, + { + "epoch": 1.5930529353012983, + "grad_norm": 1.067156434059143, + "learning_rate": 6.643362776597089e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.03962546743787243, + "step": 448 + }, + { + "epoch": 1.5966041504827433, + "grad_norm": 0.9628080725669861, + "learning_rate": 6.630122361275811e-06, + "loss": 0.2206, + "mean_token_accuracy": 0.031071114382939413, + "step": 449 + }, + { + "epoch": 1.6001553656641883, + "grad_norm": 0.9671528935432434, + "learning_rate": 6.6168691430519524e-06, + "loss": 0.2121, + "mean_token_accuracy": 0.04189558584039332, + "step": 450 + }, + { + "epoch": 1.6037065808456332, + "grad_norm": 1.0035713911056519, + "learning_rate": 6.6036032260156526e-06, + "loss": 0.205, + "mean_token_accuracy": 0.03662525209074374, + "step": 451 + }, + { + "epoch": 1.607257796027078, + "grad_norm": 1.223681092262268, + "learning_rate": 6.590324714356784e-06, + "loss": 0.2028, + "mean_token_accuracy": 0.03729518407271826, + "step": 452 + }, + { + "epoch": 1.610809011208523, + "grad_norm": 1.0575910806655884, + "learning_rate": 6.5770337123641405e-06, + "loss": 0.2196, + "mean_token_accuracy": 0.035925558919188916, + "step": 453 + }, + { + "epoch": 1.6143602263899677, + "grad_norm": 0.9104030132293701, + "learning_rate": 6.563730324424609e-06, + "loss": 0.2034, + "mean_token_accuracy": 0.03999634759748005, + "step": 454 + }, + { + "epoch": 1.6179114415714126, + "grad_norm": 0.9227558970451355, + "learning_rate": 6.55041465502236e-06, + "loss": 0.2169, + "mean_token_accuracy": 0.04184209147206275, + "step": 455 + }, + { + "epoch": 1.6214626567528576, + "grad_norm": 1.1232300996780396, + "learning_rate": 6.53708680873802e-06, + "loss": 0.2108, + "mean_token_accuracy": 0.03624327100624214, + "step": 456 + }, + { + "epoch": 1.6250138719343026, + "grad_norm": 1.1040129661560059, + "learning_rate": 6.523746890247853e-06, + "loss": 0.2165, + "mean_token_accuracy": 0.033005929173668846, + "step": 457 + }, + { + "epoch": 1.6285650871157475, + "grad_norm": 1.0858967304229736, + "learning_rate": 6.510395004322937e-06, + "loss": 0.2156, + "mean_token_accuracy": 0.03949285501221311, + "step": 458 + }, + { + "epoch": 1.6321163022971923, + "grad_norm": 1.2183141708374023, + "learning_rate": 6.49703125582834e-06, + "loss": 0.2179, + "mean_token_accuracy": 0.03773052282485878, + "step": 459 + }, + { + "epoch": 1.6356675174786373, + "grad_norm": 0.9809972643852234, + "learning_rate": 6.4836557497222995e-06, + "loss": 0.2185, + "mean_token_accuracy": 0.03755065660698165, + "step": 460 + }, + { + "epoch": 1.639218732660082, + "grad_norm": 0.8669877648353577, + "learning_rate": 6.470268591055398e-06, + "loss": 0.1969, + "mean_token_accuracy": 0.03949119582102867, + "step": 461 + }, + { + "epoch": 1.642769947841527, + "grad_norm": 0.9765094518661499, + "learning_rate": 6.456869884969738e-06, + "loss": 0.2146, + "mean_token_accuracy": 0.03167624427806004, + "step": 462 + }, + { + "epoch": 1.646321163022972, + "grad_norm": 1.4784200191497803, + "learning_rate": 6.443459736698106e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.03601810210602707, + "step": 463 + }, + { + "epoch": 1.649872378204417, + "grad_norm": 1.0272235870361328, + "learning_rate": 6.430038251563166e-06, + "loss": 0.2069, + "mean_token_accuracy": 0.039396893360390095, + "step": 464 + }, + { + "epoch": 1.6534235933858619, + "grad_norm": 0.9422529339790344, + "learning_rate": 6.416605534976614e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.03725221472996054, + "step": 465 + }, + { + "epoch": 1.6569748085673066, + "grad_norm": 1.115491271018982, + "learning_rate": 6.403161692438364e-06, + "loss": 0.2054, + "mean_token_accuracy": 0.03913160233787494, + "step": 466 + }, + { + "epoch": 1.6605260237487516, + "grad_norm": 0.8288925886154175, + "learning_rate": 6.3897068295357e-06, + "loss": 0.2092, + "mean_token_accuracy": 0.03602888582099695, + "step": 467 + }, + { + "epoch": 1.6640772389301963, + "grad_norm": 1.186328649520874, + "learning_rate": 6.376241051942477e-06, + "loss": 0.2133, + "mean_token_accuracy": 0.037299348470696714, + "step": 468 + }, + { + "epoch": 1.6676284541116413, + "grad_norm": 0.938909113407135, + "learning_rate": 6.362764465418258e-06, + "loss": 0.2099, + "mean_token_accuracy": 0.03910366345371585, + "step": 469 + }, + { + "epoch": 1.6711796692930863, + "grad_norm": 1.1985607147216797, + "learning_rate": 6.349277175807506e-06, + "loss": 0.2213, + "mean_token_accuracy": 0.029635498765856028, + "step": 470 + }, + { + "epoch": 1.6747308844745312, + "grad_norm": 0.8005155324935913, + "learning_rate": 6.3357792890387485e-06, + "loss": 0.2098, + "mean_token_accuracy": 0.03648436323783244, + "step": 471 + }, + { + "epoch": 1.6782820996559762, + "grad_norm": 1.2143996953964233, + "learning_rate": 6.322270911123734e-06, + "loss": 0.2019, + "mean_token_accuracy": 0.033510430232126964, + "step": 472 + }, + { + "epoch": 1.681833314837421, + "grad_norm": 1.0368294715881348, + "learning_rate": 6.308752148156614e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.03938718199060531, + "step": 473 + }, + { + "epoch": 1.6853845300188657, + "grad_norm": 1.0544377565383911, + "learning_rate": 6.295223106313104e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.03419090985698858, + "step": 474 + }, + { + "epoch": 1.6889357452003106, + "grad_norm": 0.8890665173530579, + "learning_rate": 6.281683891849645e-06, + "loss": 0.2069, + "mean_token_accuracy": 0.041956931356253335, + "step": 475 + }, + { + "epoch": 1.6924869603817556, + "grad_norm": 0.9658064842224121, + "learning_rate": 6.268134611102578e-06, + "loss": 0.2068, + "mean_token_accuracy": 0.03853438159421785, + "step": 476 + }, + { + "epoch": 1.6960381755632006, + "grad_norm": 0.9596953392028809, + "learning_rate": 6.254575370487299e-06, + "loss": 0.2093, + "mean_token_accuracy": 0.03829575631971238, + "step": 477 + }, + { + "epoch": 1.6995893907446455, + "grad_norm": 1.0915920734405518, + "learning_rate": 6.2410062764974366e-06, + "loss": 0.2098, + "mean_token_accuracy": 0.03425979395979084, + "step": 478 + }, + { + "epoch": 1.7031406059260903, + "grad_norm": 1.2812224626541138, + "learning_rate": 6.227427435703997e-06, + "loss": 0.218, + "mean_token_accuracy": 0.04161245372961275, + "step": 479 + }, + { + "epoch": 1.7066918211075353, + "grad_norm": 1.0697816610336304, + "learning_rate": 6.213838954754543e-06, + "loss": 0.2111, + "mean_token_accuracy": 0.037880031057284214, + "step": 480 + }, + { + "epoch": 1.71024303628898, + "grad_norm": 1.0685772895812988, + "learning_rate": 6.2002409403723525e-06, + "loss": 0.2126, + "mean_token_accuracy": 0.03928147720944253, + "step": 481 + }, + { + "epoch": 1.713794251470425, + "grad_norm": 1.0521827936172485, + "learning_rate": 6.186633499355576e-06, + "loss": 0.2215, + "mean_token_accuracy": 0.03902989218295261, + "step": 482 + }, + { + "epoch": 1.71734546665187, + "grad_norm": 1.095388650894165, + "learning_rate": 6.173016738576396e-06, + "loss": 0.2178, + "mean_token_accuracy": 0.03534844119349145, + "step": 483 + }, + { + "epoch": 1.720896681833315, + "grad_norm": 1.2036548852920532, + "learning_rate": 6.159390764980202e-06, + "loss": 0.2042, + "mean_token_accuracy": 0.039945425454789074, + "step": 484 + }, + { + "epoch": 1.7244478970147599, + "grad_norm": 1.3349320888519287, + "learning_rate": 6.145755685584731e-06, + "loss": 0.2095, + "mean_token_accuracy": 0.040973992294311756, + "step": 485 + }, + { + "epoch": 1.7279991121962046, + "grad_norm": 0.9357543587684631, + "learning_rate": 6.132111607479243e-06, + "loss": 0.209, + "mean_token_accuracy": 0.035786267388175474, + "step": 486 + }, + { + "epoch": 1.7315503273776496, + "grad_norm": 1.2930482625961304, + "learning_rate": 6.118458637823669e-06, + "loss": 0.213, + "mean_token_accuracy": 0.03360192391482997, + "step": 487 + }, + { + "epoch": 1.7351015425590943, + "grad_norm": 1.242834210395813, + "learning_rate": 6.104796883847777e-06, + "loss": 0.1924, + "mean_token_accuracy": 0.04279121259241947, + "step": 488 + }, + { + "epoch": 1.7386527577405393, + "grad_norm": 0.919654130935669, + "learning_rate": 6.091126452850324e-06, + "loss": 0.2134, + "mean_token_accuracy": 0.03625394479422539, + "step": 489 + }, + { + "epoch": 1.7422039729219843, + "grad_norm": 1.0187784433364868, + "learning_rate": 6.077447452198219e-06, + "loss": 0.204, + "mean_token_accuracy": 0.046423808718827786, + "step": 490 + }, + { + "epoch": 1.7457551881034292, + "grad_norm": 1.249862790107727, + "learning_rate": 6.063759989325673e-06, + "loss": 0.2159, + "mean_token_accuracy": 0.03750733687047614, + "step": 491 + }, + { + "epoch": 1.7493064032848742, + "grad_norm": 1.0927352905273438, + "learning_rate": 6.050064171733362e-06, + "loss": 0.2136, + "mean_token_accuracy": 0.040005916329391766, + "step": 492 + }, + { + "epoch": 1.752857618466319, + "grad_norm": 0.8594483137130737, + "learning_rate": 6.0363601069875755e-06, + "loss": 0.2241, + "mean_token_accuracy": 0.035941219015512615, + "step": 493 + }, + { + "epoch": 1.7564088336477637, + "grad_norm": 0.9354445934295654, + "learning_rate": 6.022647902719384e-06, + "loss": 0.2037, + "mean_token_accuracy": 0.037646496271918295, + "step": 494 + }, + { + "epoch": 1.7599600488292086, + "grad_norm": 0.9080971479415894, + "learning_rate": 6.008927666623775e-06, + "loss": 0.2043, + "mean_token_accuracy": 0.04433829447225435, + "step": 495 + }, + { + "epoch": 1.7635112640106536, + "grad_norm": 1.0300090312957764, + "learning_rate": 5.9951995064588245e-06, + "loss": 0.1977, + "mean_token_accuracy": 0.0387411278388754, + "step": 496 + }, + { + "epoch": 1.7670624791920986, + "grad_norm": 1.3653299808502197, + "learning_rate": 5.981463530044841e-06, + "loss": 0.2134, + "mean_token_accuracy": 0.03764768938708585, + "step": 497 + }, + { + "epoch": 1.7706136943735435, + "grad_norm": 1.3612983226776123, + "learning_rate": 5.967719845263524e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.03451220032366109, + "step": 498 + }, + { + "epoch": 1.7741649095549885, + "grad_norm": 0.9325419068336487, + "learning_rate": 5.953968560057112e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.0384930808286299, + "step": 499 + }, + { + "epoch": 1.7777161247364333, + "grad_norm": 0.928591787815094, + "learning_rate": 5.940209782427535e-06, + "loss": 0.214, + "mean_token_accuracy": 0.03583947871811688, + "step": 500 + }, + { + "epoch": 1.781267339917878, + "grad_norm": 0.897335946559906, + "learning_rate": 5.926443620435572e-06, + "loss": 0.2157, + "mean_token_accuracy": 0.038455418245575856, + "step": 501 + }, + { + "epoch": 1.784818555099323, + "grad_norm": 0.91599440574646, + "learning_rate": 5.912670182199998e-06, + "loss": 0.2073, + "mean_token_accuracy": 0.03296034881896048, + "step": 502 + }, + { + "epoch": 1.788369770280768, + "grad_norm": 1.1746525764465332, + "learning_rate": 5.898889575896731e-06, + "loss": 0.2048, + "mean_token_accuracy": 0.03975271817398607, + "step": 503 + }, + { + "epoch": 1.791920985462213, + "grad_norm": 0.8332899808883667, + "learning_rate": 5.8851019097579935e-06, + "loss": 0.2103, + "mean_token_accuracy": 0.037684340750274714, + "step": 504 + }, + { + "epoch": 1.7954722006436579, + "grad_norm": 0.9919717907905579, + "learning_rate": 5.871307292071449e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.03749480012265849, + "step": 505 + }, + { + "epoch": 1.7990234158251026, + "grad_norm": 0.9827485680580139, + "learning_rate": 5.857505831179361e-06, + "loss": 0.2189, + "mean_token_accuracy": 0.03539751814241754, + "step": 506 + }, + { + "epoch": 1.8025746310065476, + "grad_norm": 1.1644009351730347, + "learning_rate": 5.843697635477742e-06, + "loss": 0.2006, + "mean_token_accuracy": 0.03862465297243034, + "step": 507 + }, + { + "epoch": 1.8061258461879923, + "grad_norm": 0.9732457399368286, + "learning_rate": 5.8298828134154935e-06, + "loss": 0.2134, + "mean_token_accuracy": 0.037525917465245584, + "step": 508 + }, + { + "epoch": 1.8096770613694373, + "grad_norm": 0.8709985017776489, + "learning_rate": 5.816061473493565e-06, + "loss": 0.2217, + "mean_token_accuracy": 0.034015703642580775, + "step": 509 + }, + { + "epoch": 1.8132282765508823, + "grad_norm": 1.3914642333984375, + "learning_rate": 5.802233724264094e-06, + "loss": 0.2185, + "mean_token_accuracy": 0.03763900963895139, + "step": 510 + }, + { + "epoch": 1.8167794917323272, + "grad_norm": 0.8272927403450012, + "learning_rate": 5.788399674329559e-06, + "loss": 0.2144, + "mean_token_accuracy": 0.03993615026047337, + "step": 511 + }, + { + "epoch": 1.8203307069137722, + "grad_norm": 0.843360185623169, + "learning_rate": 5.774559432341918e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.04127829002754879, + "step": 512 + }, + { + "epoch": 1.823881922095217, + "grad_norm": 0.801755964756012, + "learning_rate": 5.760713107001773e-06, + "loss": 0.2045, + "mean_token_accuracy": 0.03806999635708053, + "step": 513 + }, + { + "epoch": 1.827433137276662, + "grad_norm": 1.2582050561904907, + "learning_rate": 5.746860807057491e-06, + "loss": 0.2072, + "mean_token_accuracy": 0.04005840528043336, + "step": 514 + }, + { + "epoch": 1.8309843524581066, + "grad_norm": 0.9559704661369324, + "learning_rate": 5.7330026413043726e-06, + "loss": 0.2228, + "mean_token_accuracy": 0.034368609467492206, + "step": 515 + }, + { + "epoch": 1.8345355676395516, + "grad_norm": 0.9817132353782654, + "learning_rate": 5.719138718583781e-06, + "loss": 0.2036, + "mean_token_accuracy": 0.03506710561487125, + "step": 516 + }, + { + "epoch": 1.8380867828209966, + "grad_norm": 0.9805539846420288, + "learning_rate": 5.705269147782303e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.03908530145417899, + "step": 517 + }, + { + "epoch": 1.8416379980024415, + "grad_norm": 1.0621707439422607, + "learning_rate": 5.6913940378308755e-06, + "loss": 0.2108, + "mean_token_accuracy": 0.04440686923044268, + "step": 518 + }, + { + "epoch": 1.8451892131838865, + "grad_norm": 1.127663254737854, + "learning_rate": 5.677513497703947e-06, + "loss": 0.2088, + "mean_token_accuracy": 0.03553861051477725, + "step": 519 + }, + { + "epoch": 1.8487404283653313, + "grad_norm": 0.9272313117980957, + "learning_rate": 5.663627636418611e-06, + "loss": 0.2182, + "mean_token_accuracy": 0.03748767764045624, + "step": 520 + }, + { + "epoch": 1.852291643546776, + "grad_norm": 0.8673235177993774, + "learning_rate": 5.649736563033754e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.03570136218331754, + "step": 521 + }, + { + "epoch": 1.855842858728221, + "grad_norm": 1.1679906845092773, + "learning_rate": 5.635840386649197e-06, + "loss": 0.2268, + "mean_token_accuracy": 0.03190056296989496, + "step": 522 + }, + { + "epoch": 1.859394073909666, + "grad_norm": 1.715111494064331, + "learning_rate": 5.621939216404842e-06, + "loss": 0.1987, + "mean_token_accuracy": 0.04647469030533102, + "step": 523 + }, + { + "epoch": 1.862945289091111, + "grad_norm": 0.9395443201065063, + "learning_rate": 5.608033161479811e-06, + "loss": 0.2078, + "mean_token_accuracy": 0.037822478254383896, + "step": 524 + }, + { + "epoch": 1.8664965042725559, + "grad_norm": 1.0644373893737793, + "learning_rate": 5.594122331091591e-06, + "loss": 0.2138, + "mean_token_accuracy": 0.03309350729978178, + "step": 525 + }, + { + "epoch": 1.8700477194540006, + "grad_norm": 1.1506586074829102, + "learning_rate": 5.580206834495169e-06, + "loss": 0.2106, + "mean_token_accuracy": 0.03856055447977269, + "step": 526 + }, + { + "epoch": 1.8735989346354456, + "grad_norm": 1.2600033283233643, + "learning_rate": 5.566286780982193e-06, + "loss": 0.2145, + "mean_token_accuracy": 0.03731459183472907, + "step": 527 + }, + { + "epoch": 1.8771501498168903, + "grad_norm": 0.9154390692710876, + "learning_rate": 5.552362279880091e-06, + "loss": 0.209, + "mean_token_accuracy": 0.04003632850435679, + "step": 528 + }, + { + "epoch": 1.8807013649983353, + "grad_norm": 1.1231424808502197, + "learning_rate": 5.538433440551221e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.03613886435414315, + "step": 529 + }, + { + "epoch": 1.8842525801797803, + "grad_norm": 0.8639524579048157, + "learning_rate": 5.524500372392021e-06, + "loss": 0.2216, + "mean_token_accuracy": 0.04050558650124003, + "step": 530 + }, + { + "epoch": 1.8878037953612252, + "grad_norm": 0.93828946352005, + "learning_rate": 5.5105631848321375e-06, + "loss": 0.2131, + "mean_token_accuracy": 0.03937426700940705, + "step": 531 + }, + { + "epoch": 1.8913550105426702, + "grad_norm": 0.7636436223983765, + "learning_rate": 5.496621987333567e-06, + "loss": 0.2097, + "mean_token_accuracy": 0.0382932660941151, + "step": 532 + }, + { + "epoch": 1.894906225724115, + "grad_norm": 0.9772413969039917, + "learning_rate": 5.482676889389808e-06, + "loss": 0.2035, + "mean_token_accuracy": 0.0444465134969505, + "step": 533 + }, + { + "epoch": 1.89845744090556, + "grad_norm": 0.761626660823822, + "learning_rate": 5.468728000524987e-06, + "loss": 0.2022, + "mean_token_accuracy": 0.03659985137346666, + "step": 534 + }, + { + "epoch": 1.9020086560870046, + "grad_norm": 0.8718194961547852, + "learning_rate": 5.454775430293008e-06, + "loss": 0.2072, + "mean_token_accuracy": 0.04068213729078707, + "step": 535 + }, + { + "epoch": 1.9055598712684496, + "grad_norm": 1.0313729047775269, + "learning_rate": 5.440819288276683e-06, + "loss": 0.2206, + "mean_token_accuracy": 0.035872990021744044, + "step": 536 + }, + { + "epoch": 1.9091110864498946, + "grad_norm": 1.0423539876937866, + "learning_rate": 5.426859684086881e-06, + "loss": 0.2074, + "mean_token_accuracy": 0.041051392698136624, + "step": 537 + }, + { + "epoch": 1.9126623016313395, + "grad_norm": 0.9633597135543823, + "learning_rate": 5.412896727361663e-06, + "loss": 0.2048, + "mean_token_accuracy": 0.03679951139201876, + "step": 538 + }, + { + "epoch": 1.9162135168127845, + "grad_norm": 0.912843644618988, + "learning_rate": 5.398930527765416e-06, + "loss": 0.2101, + "mean_token_accuracy": 0.03617726853190106, + "step": 539 + }, + { + "epoch": 1.9197647319942293, + "grad_norm": 1.0215671062469482, + "learning_rate": 5.384961194988002e-06, + "loss": 0.2072, + "mean_token_accuracy": 0.03606809282791801, + "step": 540 + }, + { + "epoch": 1.9233159471756742, + "grad_norm": 1.0828865766525269, + "learning_rate": 5.370988838743889e-06, + "loss": 0.2036, + "mean_token_accuracy": 0.033105893973697675, + "step": 541 + }, + { + "epoch": 1.926867162357119, + "grad_norm": 0.932388961315155, + "learning_rate": 5.357013568771288e-06, + "loss": 0.1939, + "mean_token_accuracy": 0.04302502845530398, + "step": 542 + }, + { + "epoch": 1.930418377538564, + "grad_norm": 0.7949482798576355, + "learning_rate": 5.343035494831298e-06, + "loss": 0.2002, + "mean_token_accuracy": 0.03803440502088051, + "step": 543 + }, + { + "epoch": 1.933969592720009, + "grad_norm": 0.931699275970459, + "learning_rate": 5.32905472670704e-06, + "loss": 0.2212, + "mean_token_accuracy": 0.03603428568749223, + "step": 544 + }, + { + "epoch": 1.9375208079014539, + "grad_norm": 0.9046686887741089, + "learning_rate": 5.315071374202792e-06, + "loss": 0.2025, + "mean_token_accuracy": 0.0395622910837119, + "step": 545 + }, + { + "epoch": 1.9410720230828988, + "grad_norm": 0.9733835458755493, + "learning_rate": 5.301085547143135e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.035136927999701584, + "step": 546 + }, + { + "epoch": 1.9446232382643436, + "grad_norm": 1.6521155834197998, + "learning_rate": 5.287097355372079e-06, + "loss": 0.1976, + "mean_token_accuracy": 0.03871955324575538, + "step": 547 + }, + { + "epoch": 1.9481744534457883, + "grad_norm": 0.8703343272209167, + "learning_rate": 5.273106908752211e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.03640766240459925, + "step": 548 + }, + { + "epoch": 1.9517256686272333, + "grad_norm": 1.1232815980911255, + "learning_rate": 5.259114317163822e-06, + "loss": 0.217, + "mean_token_accuracy": 0.036017874652316095, + "step": 549 + }, + { + "epoch": 1.9552768838086783, + "grad_norm": 1.1956578493118286, + "learning_rate": 5.245119690504056e-06, + "loss": 0.2009, + "mean_token_accuracy": 0.04000877890211996, + "step": 550 + }, + { + "epoch": 1.9588280989901232, + "grad_norm": 0.9923151135444641, + "learning_rate": 5.231123138686036e-06, + "loss": 0.2153, + "mean_token_accuracy": 0.039021609194605844, + "step": 551 + }, + { + "epoch": 1.9623793141715682, + "grad_norm": 0.8768529295921326, + "learning_rate": 5.217124771638008e-06, + "loss": 0.2066, + "mean_token_accuracy": 0.03712507517411723, + "step": 552 + }, + { + "epoch": 1.965930529353013, + "grad_norm": 1.1248713731765747, + "learning_rate": 5.2031246993024705e-06, + "loss": 0.2071, + "mean_token_accuracy": 0.039892343469546176, + "step": 553 + }, + { + "epoch": 1.969481744534458, + "grad_norm": 0.9119243025779724, + "learning_rate": 5.1891230316353215e-06, + "loss": 0.1952, + "mean_token_accuracy": 0.04049671125903842, + "step": 554 + }, + { + "epoch": 1.9730329597159026, + "grad_norm": 0.8841214179992676, + "learning_rate": 5.1751198786049815e-06, + "loss": 0.2062, + "mean_token_accuracy": 0.037975409573846264, + "step": 555 + }, + { + "epoch": 1.9765841748973476, + "grad_norm": 0.9124864935874939, + "learning_rate": 5.161115350191543e-06, + "loss": 0.198, + "mean_token_accuracy": 0.038486146746436134, + "step": 556 + }, + { + "epoch": 1.9801353900787926, + "grad_norm": 1.0492109060287476, + "learning_rate": 5.147109556385898e-06, + "loss": 0.2181, + "mean_token_accuracy": 0.03872556518763304, + "step": 557 + }, + { + "epoch": 1.9836866052602375, + "grad_norm": 1.163588047027588, + "learning_rate": 5.133102607188875e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.04012502250043326, + "step": 558 + }, + { + "epoch": 1.9872378204416825, + "grad_norm": 0.9493401646614075, + "learning_rate": 5.119094612610381e-06, + "loss": 0.2074, + "mean_token_accuracy": 0.03857941404930898, + "step": 559 + }, + { + "epoch": 1.9907890356231273, + "grad_norm": 1.357474446296692, + "learning_rate": 5.10508568266853e-06, + "loss": 0.2164, + "mean_token_accuracy": 0.037847537991183344, + "step": 560 + }, + { + "epoch": 1.9943402508045722, + "grad_norm": 1.5208334922790527, + "learning_rate": 5.091075927388785e-06, + "loss": 0.2208, + "mean_token_accuracy": 0.03787937617380521, + "step": 561 + }, + { + "epoch": 1.997891465986017, + "grad_norm": 1.336180329322815, + "learning_rate": 5.077065456803089e-06, + "loss": 0.2072, + "mean_token_accuracy": 0.036571884866134496, + "step": 562 + }, + { + "epoch": 2.003551215181445, + "grad_norm": 1.1412030458450317, + "learning_rate": 5.063054380949003e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.04019247937043581, + "step": 563 + }, + { + "epoch": 2.00710243036289, + "grad_norm": 1.1244559288024902, + "learning_rate": 5.049042809868845e-06, + "loss": 0.1987, + "mean_token_accuracy": 0.042167539832007606, + "step": 564 + }, + { + "epoch": 2.010653645544335, + "grad_norm": 0.9170485138893127, + "learning_rate": 5.035030853608817e-06, + "loss": 0.2067, + "mean_token_accuracy": 0.034519848279160215, + "step": 565 + }, + { + "epoch": 2.0142048607257794, + "grad_norm": 1.1951957941055298, + "learning_rate": 5.0210186222181515e-06, + "loss": 0.2132, + "mean_token_accuracy": 0.03651535582685028, + "step": 566 + }, + { + "epoch": 2.0177560759072244, + "grad_norm": 1.1706410646438599, + "learning_rate": 5.007006225748238e-06, + "loss": 0.198, + "mean_token_accuracy": 0.040820111647917656, + "step": 567 + }, + { + "epoch": 2.0213072910886694, + "grad_norm": 1.603208065032959, + "learning_rate": 4.992993774251764e-06, + "loss": 0.201, + "mean_token_accuracy": 0.040759597257419955, + "step": 568 + }, + { + "epoch": 2.0248585062701143, + "grad_norm": 1.4895318746566772, + "learning_rate": 4.97898137778185e-06, + "loss": 0.2021, + "mean_token_accuracy": 0.040484553053829586, + "step": 569 + }, + { + "epoch": 2.0284097214515593, + "grad_norm": 0.868621826171875, + "learning_rate": 4.964969146391184e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.0362443107587751, + "step": 570 + }, + { + "epoch": 2.0319609366330043, + "grad_norm": 0.8804165124893188, + "learning_rate": 4.950957190131157e-06, + "loss": 0.2073, + "mean_token_accuracy": 0.034275940739462385, + "step": 571 + }, + { + "epoch": 2.0355121518144492, + "grad_norm": 0.9676554203033447, + "learning_rate": 4.936945619050998e-06, + "loss": 0.2061, + "mean_token_accuracy": 0.03710194981977111, + "step": 572 + }, + { + "epoch": 2.0390633669958937, + "grad_norm": 0.9396111369132996, + "learning_rate": 4.922934543196912e-06, + "loss": 0.2082, + "mean_token_accuracy": 0.03654846304198145, + "step": 573 + }, + { + "epoch": 2.0426145821773387, + "grad_norm": 0.9149210453033447, + "learning_rate": 4.908924072611218e-06, + "loss": 0.2101, + "mean_token_accuracy": 0.03367238876671763, + "step": 574 + }, + { + "epoch": 2.0461657973587837, + "grad_norm": 1.1090168952941895, + "learning_rate": 4.894914317331471e-06, + "loss": 0.213, + "mean_token_accuracy": 0.0373724405435496, + "step": 575 + }, + { + "epoch": 2.0497170125402286, + "grad_norm": 1.4507415294647217, + "learning_rate": 4.88090538738962e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.037985340273735346, + "step": 576 + }, + { + "epoch": 2.0532682277216736, + "grad_norm": 1.0356301069259644, + "learning_rate": 4.866897392811127e-06, + "loss": 0.2111, + "mean_token_accuracy": 0.03993125998385949, + "step": 577 + }, + { + "epoch": 2.0568194429031186, + "grad_norm": 0.9829997420310974, + "learning_rate": 4.852890443614105e-06, + "loss": 0.2215, + "mean_token_accuracy": 0.035097517848043935, + "step": 578 + }, + { + "epoch": 2.060370658084563, + "grad_norm": 1.0447118282318115, + "learning_rate": 4.838884649808458e-06, + "loss": 0.2126, + "mean_token_accuracy": 0.03923964609930408, + "step": 579 + }, + { + "epoch": 2.063921873266008, + "grad_norm": 0.8506307601928711, + "learning_rate": 4.82488012139502e-06, + "loss": 0.2073, + "mean_token_accuracy": 0.03836632040110999, + "step": 580 + }, + { + "epoch": 2.067473088447453, + "grad_norm": 0.7885401844978333, + "learning_rate": 4.810876968364679e-06, + "loss": 0.2026, + "mean_token_accuracy": 0.036981942706916016, + "step": 581 + }, + { + "epoch": 2.071024303628898, + "grad_norm": 1.110027551651001, + "learning_rate": 4.796875300697532e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.03656144966953434, + "step": 582 + }, + { + "epoch": 2.074575518810343, + "grad_norm": 1.0753891468048096, + "learning_rate": 4.782875228361994e-06, + "loss": 0.201, + "mean_token_accuracy": 0.03901127418066608, + "step": 583 + }, + { + "epoch": 2.078126733991788, + "grad_norm": 0.9426742792129517, + "learning_rate": 4.7688768613139655e-06, + "loss": 0.2071, + "mean_token_accuracy": 0.038467124963062815, + "step": 584 + }, + { + "epoch": 2.081677949173233, + "grad_norm": 0.9639796614646912, + "learning_rate": 4.754880309495946e-06, + "loss": 0.204, + "mean_token_accuracy": 0.03624466506698809, + "step": 585 + }, + { + "epoch": 2.0852291643546774, + "grad_norm": 1.00525963306427, + "learning_rate": 4.74088568283618e-06, + "loss": 0.2041, + "mean_token_accuracy": 0.0362434946000576, + "step": 586 + }, + { + "epoch": 2.0887803795361224, + "grad_norm": 0.9281271696090698, + "learning_rate": 4.726893091247792e-06, + "loss": 0.192, + "mean_token_accuracy": 0.03966748606035253, + "step": 587 + }, + { + "epoch": 2.0923315947175674, + "grad_norm": 1.0103939771652222, + "learning_rate": 4.712902644627923e-06, + "loss": 0.1977, + "mean_token_accuracy": 0.04393619080656208, + "step": 588 + }, + { + "epoch": 2.0958828098990123, + "grad_norm": 0.937879204750061, + "learning_rate": 4.698914452856866e-06, + "loss": 0.201, + "mean_token_accuracy": 0.03757725107789156, + "step": 589 + }, + { + "epoch": 2.0994340250804573, + "grad_norm": 2.00667667388916, + "learning_rate": 4.684928625797208e-06, + "loss": 0.2183, + "mean_token_accuracy": 0.04129169932093646, + "step": 590 + }, + { + "epoch": 2.1029852402619023, + "grad_norm": 1.2946336269378662, + "learning_rate": 4.6709452732929614e-06, + "loss": 0.2017, + "mean_token_accuracy": 0.0390139233568334, + "step": 591 + }, + { + "epoch": 2.1065364554433472, + "grad_norm": 1.0413742065429688, + "learning_rate": 4.656964505168703e-06, + "loss": 0.2057, + "mean_token_accuracy": 0.03321857753326185, + "step": 592 + }, + { + "epoch": 2.1100876706247917, + "grad_norm": 1.1079062223434448, + "learning_rate": 4.642986431228713e-06, + "loss": 0.2084, + "mean_token_accuracy": 0.039933988635311835, + "step": 593 + }, + { + "epoch": 2.1136388858062367, + "grad_norm": 1.2978788614273071, + "learning_rate": 4.629011161256114e-06, + "loss": 0.2116, + "mean_token_accuracy": 0.03421423670442891, + "step": 594 + }, + { + "epoch": 2.1171901009876817, + "grad_norm": 0.9687265753746033, + "learning_rate": 4.615038805011999e-06, + "loss": 0.1959, + "mean_token_accuracy": 0.041332302451337455, + "step": 595 + }, + { + "epoch": 2.1207413161691266, + "grad_norm": 0.7898393273353577, + "learning_rate": 4.601069472234584e-06, + "loss": 0.2053, + "mean_token_accuracy": 0.036602654870875995, + "step": 596 + }, + { + "epoch": 2.1242925313505716, + "grad_norm": 0.8681390285491943, + "learning_rate": 4.587103272638339e-06, + "loss": 0.2209, + "mean_token_accuracy": 0.03851122210471658, + "step": 597 + }, + { + "epoch": 2.1278437465320166, + "grad_norm": 0.8869109153747559, + "learning_rate": 4.57314031591312e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.0369587852510449, + "step": 598 + }, + { + "epoch": 2.131394961713461, + "grad_norm": 0.8230946063995361, + "learning_rate": 4.559180711723318e-06, + "loss": 0.1999, + "mean_token_accuracy": 0.03601818616698438, + "step": 599 + }, + { + "epoch": 2.134946176894906, + "grad_norm": 0.8362439870834351, + "learning_rate": 4.545224569706994e-06, + "loss": 0.2004, + "mean_token_accuracy": 0.04065158122466528, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 1124, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.297489199477934e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}