{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 6430, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015552099533437013, "grad_norm": 4.662841910859059, "learning_rate": 9.999999403215137e-06, "loss": 0.5705, "step": 1 }, { "epoch": 0.00031104199066874026, "grad_norm": 3.2101601568771754, "learning_rate": 9.999997612860688e-06, "loss": 0.5483, "step": 2 }, { "epoch": 0.00046656298600311044, "grad_norm": 4.156927485367742, "learning_rate": 9.999994628937082e-06, "loss": 0.5274, "step": 3 }, { "epoch": 0.0006220839813374805, "grad_norm": 3.3680995992825533, "learning_rate": 9.99999045144503e-06, "loss": 0.5289, "step": 4 }, { "epoch": 0.0007776049766718507, "grad_norm": 3.3625564866317608, "learning_rate": 9.99998508038553e-06, "loss": 0.379, "step": 5 }, { "epoch": 0.0009331259720062209, "grad_norm": 2.7876669390361055, "learning_rate": 9.999978515759865e-06, "loss": 0.4019, "step": 6 }, { "epoch": 0.001088646967340591, "grad_norm": 2.8774460632164955, "learning_rate": 9.999970757569602e-06, "loss": 0.4947, "step": 7 }, { "epoch": 0.001244167962674961, "grad_norm": 2.126555417939935, "learning_rate": 9.999961805816589e-06, "loss": 0.3555, "step": 8 }, { "epoch": 0.0013996889580093312, "grad_norm": 1.9764001175562893, "learning_rate": 9.999951660502969e-06, "loss": 0.3102, "step": 9 }, { "epoch": 0.0015552099533437014, "grad_norm": 1.122915497849003, "learning_rate": 9.999940321631158e-06, "loss": 0.2802, "step": 10 }, { "epoch": 0.0017107309486780716, "grad_norm": 2.6849893357934436, "learning_rate": 9.99992778920387e-06, "loss": 0.3883, "step": 11 }, { "epoch": 0.0018662519440124418, "grad_norm": 1.5146624591903277, "learning_rate": 9.999914063224088e-06, "loss": 0.2749, "step": 12 }, { "epoch": 0.002021772939346812, "grad_norm": 1.4042567809467454, "learning_rate": 9.999899143695095e-06, "loss": 0.296, "step": 13 }, { "epoch": 0.002177293934681182, "grad_norm": 1.817010561320059, "learning_rate": 9.99988303062045e-06, "loss": 0.3278, "step": 14 }, { "epoch": 0.0023328149300155523, "grad_norm": 1.795433870197738, "learning_rate": 9.999865724003998e-06, "loss": 0.3146, "step": 15 }, { "epoch": 0.002488335925349922, "grad_norm": 1.7826807436565577, "learning_rate": 9.999847223849875e-06, "loss": 0.3233, "step": 16 }, { "epoch": 0.0026438569206842922, "grad_norm": 1.7206284351475924, "learning_rate": 9.999827530162493e-06, "loss": 0.3246, "step": 17 }, { "epoch": 0.0027993779160186624, "grad_norm": 3.8760819362380867, "learning_rate": 9.999806642946554e-06, "loss": 0.2648, "step": 18 }, { "epoch": 0.0029548989113530326, "grad_norm": 1.5644293516985985, "learning_rate": 9.999784562207046e-06, "loss": 0.3096, "step": 19 }, { "epoch": 0.003110419906687403, "grad_norm": 2.0190853150190877, "learning_rate": 9.999761287949237e-06, "loss": 0.307, "step": 20 }, { "epoch": 0.003265940902021773, "grad_norm": 2.763319388592032, "learning_rate": 9.999736820178686e-06, "loss": 0.4327, "step": 21 }, { "epoch": 0.003421461897356143, "grad_norm": 1.6605096033172442, "learning_rate": 9.999711158901231e-06, "loss": 0.3918, "step": 22 }, { "epoch": 0.0035769828926905133, "grad_norm": 1.7508401571856476, "learning_rate": 9.999684304123e-06, "loss": 0.3852, "step": 23 }, { "epoch": 0.0037325038880248835, "grad_norm": 2.0163360210179335, "learning_rate": 9.999656255850401e-06, "loss": 0.3567, "step": 24 }, { "epoch": 0.0038880248833592537, "grad_norm": 1.5224484473221345, "learning_rate": 9.999627014090133e-06, "loss": 0.3185, "step": 25 }, { "epoch": 0.004043545878693624, "grad_norm": 1.5651644136387708, "learning_rate": 9.999596578849173e-06, "loss": 0.2548, "step": 26 }, { "epoch": 0.004199066874027994, "grad_norm": 1.506984699015577, "learning_rate": 9.999564950134788e-06, "loss": 0.2719, "step": 27 }, { "epoch": 0.004354587869362364, "grad_norm": 1.5707101400798584, "learning_rate": 9.99953212795453e-06, "loss": 0.2585, "step": 28 }, { "epoch": 0.004510108864696734, "grad_norm": 1.6678601949561362, "learning_rate": 9.999498112316231e-06, "loss": 0.2642, "step": 29 }, { "epoch": 0.004665629860031105, "grad_norm": 1.1937228959267376, "learning_rate": 9.99946290322801e-06, "loss": 0.3348, "step": 30 }, { "epoch": 0.004821150855365474, "grad_norm": 1.474398491556367, "learning_rate": 9.999426500698277e-06, "loss": 0.2936, "step": 31 }, { "epoch": 0.004976671850699844, "grad_norm": 1.4230321858584387, "learning_rate": 9.999388904735718e-06, "loss": 0.316, "step": 32 }, { "epoch": 0.005132192846034215, "grad_norm": 2.5118600752998645, "learning_rate": 9.999350115349309e-06, "loss": 0.3135, "step": 33 }, { "epoch": 0.0052877138413685845, "grad_norm": 1.7910755988881728, "learning_rate": 9.999310132548308e-06, "loss": 0.249, "step": 34 }, { "epoch": 0.005443234836702955, "grad_norm": 1.4981333944055653, "learning_rate": 9.999268956342261e-06, "loss": 0.2594, "step": 35 }, { "epoch": 0.005598755832037325, "grad_norm": 0.9261919071743852, "learning_rate": 9.999226586740995e-06, "loss": 0.2333, "step": 36 }, { "epoch": 0.0057542768273716955, "grad_norm": 1.26246346078558, "learning_rate": 9.999183023754628e-06, "loss": 0.1787, "step": 37 }, { "epoch": 0.005909797822706065, "grad_norm": 1.9545697787374448, "learning_rate": 9.999138267393557e-06, "loss": 0.3246, "step": 38 }, { "epoch": 0.006065318818040436, "grad_norm": 1.4285410822616305, "learning_rate": 9.999092317668467e-06, "loss": 0.223, "step": 39 }, { "epoch": 0.006220839813374806, "grad_norm": 1.4526856529113084, "learning_rate": 9.999045174590324e-06, "loss": 0.182, "step": 40 }, { "epoch": 0.006376360808709175, "grad_norm": 2.4846217662340995, "learning_rate": 9.998996838170387e-06, "loss": 0.36, "step": 41 }, { "epoch": 0.006531881804043546, "grad_norm": 1.2772759621800358, "learning_rate": 9.998947308420189e-06, "loss": 0.241, "step": 42 }, { "epoch": 0.006687402799377916, "grad_norm": 2.7720889102611945, "learning_rate": 9.998896585351557e-06, "loss": 0.3213, "step": 43 }, { "epoch": 0.006842923794712286, "grad_norm": 1.7490095603308047, "learning_rate": 9.998844668976595e-06, "loss": 0.3155, "step": 44 }, { "epoch": 0.006998444790046656, "grad_norm": 1.3823301922226903, "learning_rate": 9.998791559307702e-06, "loss": 0.2149, "step": 45 }, { "epoch": 0.007153965785381027, "grad_norm": 1.288871141891326, "learning_rate": 9.998737256357551e-06, "loss": 0.2887, "step": 46 }, { "epoch": 0.007309486780715396, "grad_norm": 3.483009451782568, "learning_rate": 9.99868176013911e-06, "loss": 0.263, "step": 47 }, { "epoch": 0.007465007776049767, "grad_norm": 1.652490483156804, "learning_rate": 9.998625070665622e-06, "loss": 0.2664, "step": 48 }, { "epoch": 0.007620528771384137, "grad_norm": 1.8206039592741312, "learning_rate": 9.99856718795062e-06, "loss": 0.224, "step": 49 }, { "epoch": 0.007776049766718507, "grad_norm": 3.2471818448644743, "learning_rate": 9.998508112007925e-06, "loss": 0.293, "step": 50 }, { "epoch": 0.007931570762052876, "grad_norm": 2.4630640416023, "learning_rate": 9.998447842851638e-06, "loss": 0.2958, "step": 51 }, { "epoch": 0.008087091757387248, "grad_norm": 2.1952255314920817, "learning_rate": 9.998386380496144e-06, "loss": 0.2841, "step": 52 }, { "epoch": 0.008242612752721618, "grad_norm": 1.7440998263562653, "learning_rate": 9.998323724956114e-06, "loss": 0.2392, "step": 53 }, { "epoch": 0.008398133748055987, "grad_norm": 1.7713538170023606, "learning_rate": 9.998259876246509e-06, "loss": 0.2148, "step": 54 }, { "epoch": 0.008553654743390357, "grad_norm": 2.196248357816803, "learning_rate": 9.998194834382567e-06, "loss": 0.2314, "step": 55 }, { "epoch": 0.008709175738724729, "grad_norm": 1.5241920091736059, "learning_rate": 9.998128599379817e-06, "loss": 0.3538, "step": 56 }, { "epoch": 0.008864696734059098, "grad_norm": 1.084932443566165, "learning_rate": 9.998061171254068e-06, "loss": 0.2061, "step": 57 }, { "epoch": 0.009020217729393468, "grad_norm": 1.7028355052947441, "learning_rate": 9.997992550021418e-06, "loss": 0.2286, "step": 58 }, { "epoch": 0.009175738724727838, "grad_norm": 1.7850241306158636, "learning_rate": 9.997922735698247e-06, "loss": 0.1935, "step": 59 }, { "epoch": 0.00933125972006221, "grad_norm": 2.7780720350287287, "learning_rate": 9.997851728301219e-06, "loss": 0.2658, "step": 60 }, { "epoch": 0.009486780715396579, "grad_norm": 1.8811033125325856, "learning_rate": 9.997779527847287e-06, "loss": 0.1963, "step": 61 }, { "epoch": 0.009642301710730949, "grad_norm": 1.3758579938738247, "learning_rate": 9.997706134353687e-06, "loss": 0.2529, "step": 62 }, { "epoch": 0.009797822706065318, "grad_norm": 1.9634000227706385, "learning_rate": 9.997631547837934e-06, "loss": 0.2544, "step": 63 }, { "epoch": 0.009953343701399688, "grad_norm": 1.594710372018227, "learning_rate": 9.997555768317838e-06, "loss": 0.3528, "step": 64 }, { "epoch": 0.01010886469673406, "grad_norm": 1.8005547220704254, "learning_rate": 9.997478795811486e-06, "loss": 0.2165, "step": 65 }, { "epoch": 0.01026438569206843, "grad_norm": 2.290269323202059, "learning_rate": 9.997400630337254e-06, "loss": 0.2786, "step": 66 }, { "epoch": 0.0104199066874028, "grad_norm": 1.5486051696063095, "learning_rate": 9.997321271913801e-06, "loss": 0.2188, "step": 67 }, { "epoch": 0.010575427682737169, "grad_norm": 0.9684733219759649, "learning_rate": 9.997240720560068e-06, "loss": 0.2043, "step": 68 }, { "epoch": 0.01073094867807154, "grad_norm": 2.1081587478577437, "learning_rate": 9.997158976295288e-06, "loss": 0.2908, "step": 69 }, { "epoch": 0.01088646967340591, "grad_norm": 3.6233628477076736, "learning_rate": 9.99707603913897e-06, "loss": 0.2579, "step": 70 }, { "epoch": 0.01104199066874028, "grad_norm": 1.090209411261846, "learning_rate": 9.996991909110918e-06, "loss": 0.2864, "step": 71 }, { "epoch": 0.01119751166407465, "grad_norm": 1.3430452010098815, "learning_rate": 9.99690658623121e-06, "loss": 0.2217, "step": 72 }, { "epoch": 0.01135303265940902, "grad_norm": 2.3549515267664005, "learning_rate": 9.996820070520216e-06, "loss": 0.2822, "step": 73 }, { "epoch": 0.011508553654743391, "grad_norm": 1.5602820881890913, "learning_rate": 9.996732361998588e-06, "loss": 0.2456, "step": 74 }, { "epoch": 0.01166407465007776, "grad_norm": 1.5856862134183374, "learning_rate": 9.996643460687264e-06, "loss": 0.3056, "step": 75 }, { "epoch": 0.01181959564541213, "grad_norm": 1.6134033436501471, "learning_rate": 9.996553366607464e-06, "loss": 0.2141, "step": 76 }, { "epoch": 0.0119751166407465, "grad_norm": 1.3597955630988308, "learning_rate": 9.996462079780696e-06, "loss": 0.2295, "step": 77 }, { "epoch": 0.012130637636080872, "grad_norm": 1.1374281802105086, "learning_rate": 9.996369600228753e-06, "loss": 0.2487, "step": 78 }, { "epoch": 0.012286158631415241, "grad_norm": 1.4298077500438133, "learning_rate": 9.99627592797371e-06, "loss": 0.2446, "step": 79 }, { "epoch": 0.012441679626749611, "grad_norm": 1.3975983522660094, "learning_rate": 9.996181063037924e-06, "loss": 0.2611, "step": 80 }, { "epoch": 0.012597200622083981, "grad_norm": 1.5544782250742402, "learning_rate": 9.996085005444046e-06, "loss": 0.2311, "step": 81 }, { "epoch": 0.01275272161741835, "grad_norm": 1.3603452791878323, "learning_rate": 9.995987755215006e-06, "loss": 0.2003, "step": 82 }, { "epoch": 0.012908242612752722, "grad_norm": 1.3071118505273163, "learning_rate": 9.995889312374016e-06, "loss": 0.2338, "step": 83 }, { "epoch": 0.013063763608087092, "grad_norm": 1.7380116089178919, "learning_rate": 9.995789676944576e-06, "loss": 0.2645, "step": 84 }, { "epoch": 0.013219284603421462, "grad_norm": 1.263086313797395, "learning_rate": 9.995688848950473e-06, "loss": 0.2215, "step": 85 }, { "epoch": 0.013374805598755831, "grad_norm": 1.5100086739523095, "learning_rate": 9.995586828415774e-06, "loss": 0.2444, "step": 86 }, { "epoch": 0.013530326594090203, "grad_norm": 1.0847005275250092, "learning_rate": 9.995483615364833e-06, "loss": 0.2129, "step": 87 }, { "epoch": 0.013685847589424573, "grad_norm": 1.3155329082198164, "learning_rate": 9.995379209822289e-06, "loss": 0.2788, "step": 88 }, { "epoch": 0.013841368584758942, "grad_norm": 1.8214452427995387, "learning_rate": 9.995273611813065e-06, "loss": 0.3027, "step": 89 }, { "epoch": 0.013996889580093312, "grad_norm": 0.8312908694387112, "learning_rate": 9.995166821362368e-06, "loss": 0.226, "step": 90 }, { "epoch": 0.014152410575427682, "grad_norm": 1.6627232520442479, "learning_rate": 9.995058838495689e-06, "loss": 0.2742, "step": 91 }, { "epoch": 0.014307931570762053, "grad_norm": 0.9378761990044046, "learning_rate": 9.994949663238809e-06, "loss": 0.267, "step": 92 }, { "epoch": 0.014463452566096423, "grad_norm": 2.122534441012584, "learning_rate": 9.994839295617786e-06, "loss": 0.2438, "step": 93 }, { "epoch": 0.014618973561430793, "grad_norm": 1.577456726662404, "learning_rate": 9.994727735658968e-06, "loss": 0.2659, "step": 94 }, { "epoch": 0.014774494556765163, "grad_norm": 1.6054087269070507, "learning_rate": 9.994614983388986e-06, "loss": 0.2404, "step": 95 }, { "epoch": 0.014930015552099534, "grad_norm": 1.5558443339273214, "learning_rate": 9.994501038834755e-06, "loss": 0.2703, "step": 96 }, { "epoch": 0.015085536547433904, "grad_norm": 1.7101494645663162, "learning_rate": 9.994385902023474e-06, "loss": 0.2148, "step": 97 }, { "epoch": 0.015241057542768274, "grad_norm": 1.5422168422798725, "learning_rate": 9.99426957298263e-06, "loss": 0.2045, "step": 98 }, { "epoch": 0.015396578538102643, "grad_norm": 1.3177834617215995, "learning_rate": 9.994152051739991e-06, "loss": 0.2097, "step": 99 }, { "epoch": 0.015552099533437015, "grad_norm": 1.7250496500116883, "learning_rate": 9.994033338323612e-06, "loss": 0.2309, "step": 100 }, { "epoch": 0.015552099533437015, "eval_loss": 0.255514532327652, "eval_runtime": 9.4404, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 100 }, { "epoch": 0.015707620528771383, "grad_norm": 1.2242797466034228, "learning_rate": 9.993913432761831e-06, "loss": 0.2309, "step": 101 }, { "epoch": 0.015863141524105753, "grad_norm": 1.2091727169379591, "learning_rate": 9.993792335083272e-06, "loss": 0.215, "step": 102 }, { "epoch": 0.016018662519440126, "grad_norm": 1.6991288183534923, "learning_rate": 9.99367004531684e-06, "loss": 0.2716, "step": 103 }, { "epoch": 0.016174183514774496, "grad_norm": 1.8626540300463013, "learning_rate": 9.99354656349173e-06, "loss": 0.287, "step": 104 }, { "epoch": 0.016329704510108865, "grad_norm": 1.2626220604624867, "learning_rate": 9.993421889637418e-06, "loss": 0.1737, "step": 105 }, { "epoch": 0.016485225505443235, "grad_norm": 0.8807151838598477, "learning_rate": 9.993296023783664e-06, "loss": 0.227, "step": 106 }, { "epoch": 0.016640746500777605, "grad_norm": 1.4662006360846318, "learning_rate": 9.993168965960515e-06, "loss": 0.2698, "step": 107 }, { "epoch": 0.016796267496111975, "grad_norm": 2.5676508719496383, "learning_rate": 9.993040716198304e-06, "loss": 0.2231, "step": 108 }, { "epoch": 0.016951788491446344, "grad_norm": 2.144008184181988, "learning_rate": 9.992911274527641e-06, "loss": 0.2729, "step": 109 }, { "epoch": 0.017107309486780714, "grad_norm": 1.3871826576036752, "learning_rate": 9.99278064097943e-06, "loss": 0.2078, "step": 110 }, { "epoch": 0.017262830482115084, "grad_norm": 1.9299054218636398, "learning_rate": 9.992648815584853e-06, "loss": 0.2543, "step": 111 }, { "epoch": 0.017418351477449457, "grad_norm": 6.182669074382352, "learning_rate": 9.992515798375379e-06, "loss": 0.2442, "step": 112 }, { "epoch": 0.017573872472783827, "grad_norm": 1.9218049477099652, "learning_rate": 9.992381589382761e-06, "loss": 0.2909, "step": 113 }, { "epoch": 0.017729393468118197, "grad_norm": 1.7558505152868706, "learning_rate": 9.992246188639035e-06, "loss": 0.2182, "step": 114 }, { "epoch": 0.017884914463452566, "grad_norm": 1.3145893008937046, "learning_rate": 9.992109596176525e-06, "loss": 0.2445, "step": 115 }, { "epoch": 0.018040435458786936, "grad_norm": 2.3756692802265094, "learning_rate": 9.991971812027836e-06, "loss": 0.2961, "step": 116 }, { "epoch": 0.018195956454121306, "grad_norm": 2.027933705938777, "learning_rate": 9.991832836225863e-06, "loss": 0.2459, "step": 117 }, { "epoch": 0.018351477449455676, "grad_norm": 1.9997556478308784, "learning_rate": 9.991692668803775e-06, "loss": 0.2108, "step": 118 }, { "epoch": 0.018506998444790045, "grad_norm": 1.39831187226532, "learning_rate": 9.991551309795038e-06, "loss": 0.1902, "step": 119 }, { "epoch": 0.01866251944012442, "grad_norm": 1.6377700259822823, "learning_rate": 9.991408759233394e-06, "loss": 0.2491, "step": 120 }, { "epoch": 0.018818040435458788, "grad_norm": 2.09576564356888, "learning_rate": 9.991265017152869e-06, "loss": 0.2526, "step": 121 }, { "epoch": 0.018973561430793158, "grad_norm": 2.031216743667695, "learning_rate": 9.991120083587779e-06, "loss": 0.2418, "step": 122 }, { "epoch": 0.019129082426127528, "grad_norm": 1.9897151692182136, "learning_rate": 9.990973958572723e-06, "loss": 0.2786, "step": 123 }, { "epoch": 0.019284603421461897, "grad_norm": 1.7503968375792016, "learning_rate": 9.990826642142581e-06, "loss": 0.3231, "step": 124 }, { "epoch": 0.019440124416796267, "grad_norm": 0.8307156104752434, "learning_rate": 9.990678134332521e-06, "loss": 0.2058, "step": 125 }, { "epoch": 0.019595645412130637, "grad_norm": 2.105265419067902, "learning_rate": 9.990528435177992e-06, "loss": 0.2665, "step": 126 }, { "epoch": 0.019751166407465007, "grad_norm": 0.845573052530141, "learning_rate": 9.99037754471473e-06, "loss": 0.1706, "step": 127 }, { "epoch": 0.019906687402799376, "grad_norm": 1.3561288374051286, "learning_rate": 9.990225462978756e-06, "loss": 0.2834, "step": 128 }, { "epoch": 0.02006220839813375, "grad_norm": 1.4639985615099256, "learning_rate": 9.990072190006371e-06, "loss": 0.2775, "step": 129 }, { "epoch": 0.02021772939346812, "grad_norm": 1.424715750901468, "learning_rate": 9.989917725834166e-06, "loss": 0.2331, "step": 130 }, { "epoch": 0.02037325038880249, "grad_norm": 1.4908712495988423, "learning_rate": 9.989762070499015e-06, "loss": 0.2326, "step": 131 }, { "epoch": 0.02052877138413686, "grad_norm": 1.9371986234951772, "learning_rate": 9.98960522403807e-06, "loss": 0.248, "step": 132 }, { "epoch": 0.02068429237947123, "grad_norm": 1.7802271420639102, "learning_rate": 9.989447186488777e-06, "loss": 0.2881, "step": 133 }, { "epoch": 0.0208398133748056, "grad_norm": 1.1250396512690675, "learning_rate": 9.98928795788886e-06, "loss": 0.2309, "step": 134 }, { "epoch": 0.020995334370139968, "grad_norm": 1.6801724252117862, "learning_rate": 9.989127538276329e-06, "loss": 0.2292, "step": 135 }, { "epoch": 0.021150855365474338, "grad_norm": 1.1771299351260398, "learning_rate": 9.98896592768948e-06, "loss": 0.1553, "step": 136 }, { "epoch": 0.021306376360808708, "grad_norm": 2.1842202518230645, "learning_rate": 9.988803126166889e-06, "loss": 0.3029, "step": 137 }, { "epoch": 0.02146189735614308, "grad_norm": 1.3745547142156036, "learning_rate": 9.988639133747422e-06, "loss": 0.1702, "step": 138 }, { "epoch": 0.02161741835147745, "grad_norm": 1.8504088238591443, "learning_rate": 9.988473950470223e-06, "loss": 0.2318, "step": 139 }, { "epoch": 0.02177293934681182, "grad_norm": 1.7870069125473158, "learning_rate": 9.988307576374727e-06, "loss": 0.2008, "step": 140 }, { "epoch": 0.02192846034214619, "grad_norm": 2.3953898044564883, "learning_rate": 9.988140011500647e-06, "loss": 0.2007, "step": 141 }, { "epoch": 0.02208398133748056, "grad_norm": 1.1845465157973594, "learning_rate": 9.987971255887985e-06, "loss": 0.2334, "step": 142 }, { "epoch": 0.02223950233281493, "grad_norm": 1.747163885973197, "learning_rate": 9.987801309577026e-06, "loss": 0.2559, "step": 143 }, { "epoch": 0.0223950233281493, "grad_norm": 1.6909380164686145, "learning_rate": 9.987630172608333e-06, "loss": 0.2819, "step": 144 }, { "epoch": 0.02255054432348367, "grad_norm": 1.6459040836915735, "learning_rate": 9.987457845022767e-06, "loss": 0.2283, "step": 145 }, { "epoch": 0.02270606531881804, "grad_norm": 1.0639213494130906, "learning_rate": 9.987284326861459e-06, "loss": 0.2947, "step": 146 }, { "epoch": 0.022861586314152412, "grad_norm": 1.423659630662775, "learning_rate": 9.987109618165832e-06, "loss": 0.1895, "step": 147 }, { "epoch": 0.023017107309486782, "grad_norm": 2.1171729246911966, "learning_rate": 9.986933718977591e-06, "loss": 0.1967, "step": 148 }, { "epoch": 0.02317262830482115, "grad_norm": 1.4659656443481106, "learning_rate": 9.986756629338728e-06, "loss": 0.1553, "step": 149 }, { "epoch": 0.02332814930015552, "grad_norm": 3.3524464413937762, "learning_rate": 9.986578349291514e-06, "loss": 0.2472, "step": 150 }, { "epoch": 0.02348367029548989, "grad_norm": 1.4421209559287633, "learning_rate": 9.986398878878507e-06, "loss": 0.1791, "step": 151 }, { "epoch": 0.02363919129082426, "grad_norm": 1.7313564339261944, "learning_rate": 9.98621821814255e-06, "loss": 0.2238, "step": 152 }, { "epoch": 0.02379471228615863, "grad_norm": 1.7017996756379121, "learning_rate": 9.986036367126769e-06, "loss": 0.2007, "step": 153 }, { "epoch": 0.023950233281493, "grad_norm": 1.515471002124247, "learning_rate": 9.985853325874575e-06, "loss": 0.2688, "step": 154 }, { "epoch": 0.02410575427682737, "grad_norm": 0.8049651881516254, "learning_rate": 9.985669094429662e-06, "loss": 0.1865, "step": 155 }, { "epoch": 0.024261275272161743, "grad_norm": 1.2861650933813724, "learning_rate": 9.985483672836007e-06, "loss": 0.2403, "step": 156 }, { "epoch": 0.024416796267496113, "grad_norm": 2.173379700965189, "learning_rate": 9.985297061137877e-06, "loss": 0.2045, "step": 157 }, { "epoch": 0.024572317262830483, "grad_norm": 1.5915407935889336, "learning_rate": 9.985109259379813e-06, "loss": 0.2063, "step": 158 }, { "epoch": 0.024727838258164853, "grad_norm": 1.877271886192633, "learning_rate": 9.98492026760665e-06, "loss": 0.226, "step": 159 }, { "epoch": 0.024883359253499222, "grad_norm": 1.590999803444347, "learning_rate": 9.984730085863504e-06, "loss": 0.2243, "step": 160 }, { "epoch": 0.025038880248833592, "grad_norm": 2.2602490405621016, "learning_rate": 9.98453871419577e-06, "loss": 0.2599, "step": 161 }, { "epoch": 0.025194401244167962, "grad_norm": 1.8247502790432317, "learning_rate": 9.984346152649135e-06, "loss": 0.2575, "step": 162 }, { "epoch": 0.02534992223950233, "grad_norm": 1.6317702406563646, "learning_rate": 9.984152401269562e-06, "loss": 0.2513, "step": 163 }, { "epoch": 0.0255054432348367, "grad_norm": 1.479820350518653, "learning_rate": 9.983957460103307e-06, "loss": 0.2134, "step": 164 }, { "epoch": 0.025660964230171075, "grad_norm": 2.2204278110409716, "learning_rate": 9.9837613291969e-06, "loss": 0.2288, "step": 165 }, { "epoch": 0.025816485225505444, "grad_norm": 1.8249773963334357, "learning_rate": 9.983564008597164e-06, "loss": 0.2342, "step": 166 }, { "epoch": 0.025972006220839814, "grad_norm": 1.892476010698033, "learning_rate": 9.9833654983512e-06, "loss": 0.2263, "step": 167 }, { "epoch": 0.026127527216174184, "grad_norm": 1.593847254715758, "learning_rate": 9.983165798506398e-06, "loss": 0.2163, "step": 168 }, { "epoch": 0.026283048211508554, "grad_norm": 1.7653992228114257, "learning_rate": 9.982964909110426e-06, "loss": 0.2938, "step": 169 }, { "epoch": 0.026438569206842923, "grad_norm": 1.3352350617943483, "learning_rate": 9.982762830211239e-06, "loss": 0.2069, "step": 170 }, { "epoch": 0.026594090202177293, "grad_norm": 1.6623662216358996, "learning_rate": 9.982559561857079e-06, "loss": 0.213, "step": 171 }, { "epoch": 0.026749611197511663, "grad_norm": 1.1923151136153478, "learning_rate": 9.982355104096468e-06, "loss": 0.2068, "step": 172 }, { "epoch": 0.026905132192846033, "grad_norm": 1.5009321240819553, "learning_rate": 9.98214945697821e-06, "loss": 0.3292, "step": 173 }, { "epoch": 0.027060653188180406, "grad_norm": 1.6168504596283289, "learning_rate": 9.981942620551399e-06, "loss": 0.2001, "step": 174 }, { "epoch": 0.027216174183514776, "grad_norm": 1.0410735731938325, "learning_rate": 9.98173459486541e-06, "loss": 0.2697, "step": 175 }, { "epoch": 0.027371695178849145, "grad_norm": 1.477725722611291, "learning_rate": 9.9815253799699e-06, "loss": 0.1796, "step": 176 }, { "epoch": 0.027527216174183515, "grad_norm": 1.5159741098115525, "learning_rate": 9.981314975914811e-06, "loss": 0.2203, "step": 177 }, { "epoch": 0.027682737169517885, "grad_norm": 0.8954975243967727, "learning_rate": 9.981103382750372e-06, "loss": 0.2662, "step": 178 }, { "epoch": 0.027838258164852255, "grad_norm": 1.418625218985406, "learning_rate": 9.980890600527092e-06, "loss": 0.2484, "step": 179 }, { "epoch": 0.027993779160186624, "grad_norm": 1.4411516373436362, "learning_rate": 9.980676629295763e-06, "loss": 0.302, "step": 180 }, { "epoch": 0.028149300155520994, "grad_norm": 0.9480510792156464, "learning_rate": 9.980461469107463e-06, "loss": 0.2075, "step": 181 }, { "epoch": 0.028304821150855364, "grad_norm": 2.081864475923441, "learning_rate": 9.980245120013558e-06, "loss": 0.2942, "step": 182 }, { "epoch": 0.028460342146189737, "grad_norm": 1.2615838373847896, "learning_rate": 9.980027582065691e-06, "loss": 0.2018, "step": 183 }, { "epoch": 0.028615863141524107, "grad_norm": 1.2086223544731691, "learning_rate": 9.979808855315792e-06, "loss": 0.2743, "step": 184 }, { "epoch": 0.028771384136858476, "grad_norm": 0.9412206342605678, "learning_rate": 9.979588939816071e-06, "loss": 0.2318, "step": 185 }, { "epoch": 0.028926905132192846, "grad_norm": 1.365479987499767, "learning_rate": 9.979367835619029e-06, "loss": 0.2813, "step": 186 }, { "epoch": 0.029082426127527216, "grad_norm": 1.1385427599520912, "learning_rate": 9.979145542777444e-06, "loss": 0.2627, "step": 187 }, { "epoch": 0.029237947122861586, "grad_norm": 1.560448582637042, "learning_rate": 9.97892206134438e-06, "loss": 0.2042, "step": 188 }, { "epoch": 0.029393468118195955, "grad_norm": 1.9585068672638826, "learning_rate": 9.97869739137319e-06, "loss": 0.2647, "step": 189 }, { "epoch": 0.029548989113530325, "grad_norm": 1.612253014357388, "learning_rate": 9.9784715329175e-06, "loss": 0.2573, "step": 190 }, { "epoch": 0.0297045101088647, "grad_norm": 1.2782552177366555, "learning_rate": 9.978244486031228e-06, "loss": 0.1914, "step": 191 }, { "epoch": 0.029860031104199068, "grad_norm": 2.1188620010348163, "learning_rate": 9.978016250768573e-06, "loss": 0.245, "step": 192 }, { "epoch": 0.030015552099533438, "grad_norm": 1.9777647488169638, "learning_rate": 9.977786827184019e-06, "loss": 0.2774, "step": 193 }, { "epoch": 0.030171073094867808, "grad_norm": 2.0157801185629407, "learning_rate": 9.977556215332332e-06, "loss": 0.297, "step": 194 }, { "epoch": 0.030326594090202177, "grad_norm": 0.8845310906810993, "learning_rate": 9.97732441526856e-06, "loss": 0.1756, "step": 195 }, { "epoch": 0.030482115085536547, "grad_norm": 1.2647941053184737, "learning_rate": 9.97709142704804e-06, "loss": 0.1773, "step": 196 }, { "epoch": 0.030637636080870917, "grad_norm": 1.1823797462719756, "learning_rate": 9.976857250726389e-06, "loss": 0.2501, "step": 197 }, { "epoch": 0.030793157076205287, "grad_norm": 1.643272741263538, "learning_rate": 9.976621886359506e-06, "loss": 0.2794, "step": 198 }, { "epoch": 0.030948678071539656, "grad_norm": 1.6415813649465196, "learning_rate": 9.976385334003577e-06, "loss": 0.2562, "step": 199 }, { "epoch": 0.03110419906687403, "grad_norm": 1.4019518238717095, "learning_rate": 9.976147593715074e-06, "loss": 0.2066, "step": 200 }, { "epoch": 0.03110419906687403, "eval_loss": 0.2431146204471588, "eval_runtime": 9.4441, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 200 }, { "epoch": 0.031259720062208396, "grad_norm": 1.5261705881041825, "learning_rate": 9.975908665550742e-06, "loss": 0.168, "step": 201 }, { "epoch": 0.031415241057542766, "grad_norm": 1.3552305394454693, "learning_rate": 9.975668549567623e-06, "loss": 0.2513, "step": 202 }, { "epoch": 0.031570762052877135, "grad_norm": 1.09704983539552, "learning_rate": 9.97542724582303e-06, "loss": 0.1877, "step": 203 }, { "epoch": 0.031726283048211505, "grad_norm": 1.8452203060592092, "learning_rate": 9.975184754374572e-06, "loss": 0.3442, "step": 204 }, { "epoch": 0.03188180404354588, "grad_norm": 1.4512649025391702, "learning_rate": 9.974941075280128e-06, "loss": 0.2172, "step": 205 }, { "epoch": 0.03203732503888025, "grad_norm": 1.5376722263850107, "learning_rate": 9.974696208597874e-06, "loss": 0.2206, "step": 206 }, { "epoch": 0.03219284603421462, "grad_norm": 1.6097488768932668, "learning_rate": 9.97445015438626e-06, "loss": 0.2134, "step": 207 }, { "epoch": 0.03234836702954899, "grad_norm": 1.2381378734127797, "learning_rate": 9.974202912704022e-06, "loss": 0.2026, "step": 208 }, { "epoch": 0.03250388802488336, "grad_norm": 2.0110329862327663, "learning_rate": 9.973954483610184e-06, "loss": 0.2117, "step": 209 }, { "epoch": 0.03265940902021773, "grad_norm": 4.938465463538487, "learning_rate": 9.973704867164044e-06, "loss": 0.2787, "step": 210 }, { "epoch": 0.0328149300155521, "grad_norm": 1.9318587506840115, "learning_rate": 9.973454063425191e-06, "loss": 0.2901, "step": 211 }, { "epoch": 0.03297045101088647, "grad_norm": 1.5730776773238022, "learning_rate": 9.973202072453498e-06, "loss": 0.3557, "step": 212 }, { "epoch": 0.03312597200622084, "grad_norm": 2.333406801079277, "learning_rate": 9.972948894309116e-06, "loss": 0.2553, "step": 213 }, { "epoch": 0.03328149300155521, "grad_norm": 1.2613725609366824, "learning_rate": 9.972694529052482e-06, "loss": 0.2721, "step": 214 }, { "epoch": 0.03343701399688958, "grad_norm": 1.233807021429561, "learning_rate": 9.972438976744317e-06, "loss": 0.194, "step": 215 }, { "epoch": 0.03359253499222395, "grad_norm": 1.0922019141763, "learning_rate": 9.972182237445624e-06, "loss": 0.2625, "step": 216 }, { "epoch": 0.03374805598755832, "grad_norm": 1.5332376003824164, "learning_rate": 9.971924311217693e-06, "loss": 0.2369, "step": 217 }, { "epoch": 0.03390357698289269, "grad_norm": 2.1386234582292856, "learning_rate": 9.971665198122093e-06, "loss": 0.2691, "step": 218 }, { "epoch": 0.03405909797822706, "grad_norm": 1.4374027394103162, "learning_rate": 9.97140489822068e-06, "loss": 0.2217, "step": 219 }, { "epoch": 0.03421461897356143, "grad_norm": 1.7261766376116665, "learning_rate": 9.971143411575585e-06, "loss": 0.3063, "step": 220 }, { "epoch": 0.0343701399688958, "grad_norm": 1.5632670578977363, "learning_rate": 9.970880738249236e-06, "loss": 0.2333, "step": 221 }, { "epoch": 0.03452566096423017, "grad_norm": 1.6709935682257062, "learning_rate": 9.97061687830433e-06, "loss": 0.2808, "step": 222 }, { "epoch": 0.034681181959564544, "grad_norm": 1.7747486994884278, "learning_rate": 9.970351831803862e-06, "loss": 0.3182, "step": 223 }, { "epoch": 0.034836702954898914, "grad_norm": 1.2079739996818415, "learning_rate": 9.970085598811094e-06, "loss": 0.2426, "step": 224 }, { "epoch": 0.034992223950233284, "grad_norm": 2.269795435480081, "learning_rate": 9.969818179389586e-06, "loss": 0.1933, "step": 225 }, { "epoch": 0.035147744945567654, "grad_norm": 1.28324330975912, "learning_rate": 9.96954957360317e-06, "loss": 0.2078, "step": 226 }, { "epoch": 0.03530326594090202, "grad_norm": 3.0240429569891147, "learning_rate": 9.969279781515967e-06, "loss": 0.2865, "step": 227 }, { "epoch": 0.03545878693623639, "grad_norm": 1.4022531253860526, "learning_rate": 9.969008803192385e-06, "loss": 0.189, "step": 228 }, { "epoch": 0.03561430793157076, "grad_norm": 1.4481645110880101, "learning_rate": 9.968736638697105e-06, "loss": 0.2038, "step": 229 }, { "epoch": 0.03576982892690513, "grad_norm": 1.2439638440320844, "learning_rate": 9.968463288095096e-06, "loss": 0.1962, "step": 230 }, { "epoch": 0.0359253499222395, "grad_norm": 1.550618674775446, "learning_rate": 9.968188751451613e-06, "loss": 0.2461, "step": 231 }, { "epoch": 0.03608087091757387, "grad_norm": 1.2590441656933422, "learning_rate": 9.967913028832192e-06, "loss": 0.28, "step": 232 }, { "epoch": 0.03623639191290824, "grad_norm": 15.743047596573488, "learning_rate": 9.96763612030265e-06, "loss": 0.2272, "step": 233 }, { "epoch": 0.03639191290824261, "grad_norm": 1.0832646660805165, "learning_rate": 9.967358025929092e-06, "loss": 0.2766, "step": 234 }, { "epoch": 0.03654743390357698, "grad_norm": 1.496152606461021, "learning_rate": 9.9670787457779e-06, "loss": 0.1928, "step": 235 }, { "epoch": 0.03670295489891135, "grad_norm": 1.5049076518304147, "learning_rate": 9.966798279915744e-06, "loss": 0.2023, "step": 236 }, { "epoch": 0.03685847589424572, "grad_norm": 0.9377725167524534, "learning_rate": 9.966516628409573e-06, "loss": 0.1657, "step": 237 }, { "epoch": 0.03701399688958009, "grad_norm": 1.5646202349920761, "learning_rate": 9.96623379132662e-06, "loss": 0.2157, "step": 238 }, { "epoch": 0.03716951788491446, "grad_norm": 1.14277577769819, "learning_rate": 9.965949768734409e-06, "loss": 0.2163, "step": 239 }, { "epoch": 0.03732503888024884, "grad_norm": 2.158716016882222, "learning_rate": 9.965664560700734e-06, "loss": 0.2041, "step": 240 }, { "epoch": 0.03748055987558321, "grad_norm": 1.8568349342429766, "learning_rate": 9.965378167293679e-06, "loss": 0.2266, "step": 241 }, { "epoch": 0.037636080870917576, "grad_norm": 2.035673543431871, "learning_rate": 9.965090588581609e-06, "loss": 0.2893, "step": 242 }, { "epoch": 0.037791601866251946, "grad_norm": 1.2421527558787024, "learning_rate": 9.964801824633177e-06, "loss": 0.166, "step": 243 }, { "epoch": 0.037947122861586316, "grad_norm": 1.7368625294642988, "learning_rate": 9.964511875517313e-06, "loss": 0.2593, "step": 244 }, { "epoch": 0.038102643856920686, "grad_norm": 1.274064232837515, "learning_rate": 9.964220741303232e-06, "loss": 0.1676, "step": 245 }, { "epoch": 0.038258164852255055, "grad_norm": 1.3271094523398685, "learning_rate": 9.963928422060432e-06, "loss": 0.2048, "step": 246 }, { "epoch": 0.038413685847589425, "grad_norm": 1.441894820882409, "learning_rate": 9.963634917858692e-06, "loss": 0.2102, "step": 247 }, { "epoch": 0.038569206842923795, "grad_norm": 1.3882607946902543, "learning_rate": 9.963340228768077e-06, "loss": 0.1862, "step": 248 }, { "epoch": 0.038724727838258165, "grad_norm": 1.1529068772443192, "learning_rate": 9.963044354858934e-06, "loss": 0.2519, "step": 249 }, { "epoch": 0.038880248833592534, "grad_norm": 2.236043321099024, "learning_rate": 9.962747296201891e-06, "loss": 0.1635, "step": 250 }, { "epoch": 0.039035769828926904, "grad_norm": 1.8503487939836718, "learning_rate": 9.96244905286786e-06, "loss": 0.181, "step": 251 }, { "epoch": 0.039191290824261274, "grad_norm": 1.4083157880171735, "learning_rate": 9.962149624928037e-06, "loss": 0.1781, "step": 252 }, { "epoch": 0.039346811819595644, "grad_norm": 1.6536407646222175, "learning_rate": 9.961849012453899e-06, "loss": 0.2699, "step": 253 }, { "epoch": 0.039502332814930013, "grad_norm": 1.3154495432198843, "learning_rate": 9.961547215517206e-06, "loss": 0.2096, "step": 254 }, { "epoch": 0.03965785381026438, "grad_norm": 1.222944730470649, "learning_rate": 9.961244234190001e-06, "loss": 0.209, "step": 255 }, { "epoch": 0.03981337480559875, "grad_norm": 1.3903861430735245, "learning_rate": 9.96094006854461e-06, "loss": 0.177, "step": 256 }, { "epoch": 0.03996889580093312, "grad_norm": 1.8733569984170189, "learning_rate": 9.960634718653644e-06, "loss": 0.4051, "step": 257 }, { "epoch": 0.0401244167962675, "grad_norm": 1.3013086938531622, "learning_rate": 9.96032818458999e-06, "loss": 0.2215, "step": 258 }, { "epoch": 0.04027993779160187, "grad_norm": 1.9062067810307814, "learning_rate": 9.960020466426825e-06, "loss": 0.2131, "step": 259 }, { "epoch": 0.04043545878693624, "grad_norm": 1.240725461727028, "learning_rate": 9.959711564237603e-06, "loss": 0.2376, "step": 260 }, { "epoch": 0.04059097978227061, "grad_norm": 1.504578258989953, "learning_rate": 9.95940147809607e-06, "loss": 0.2238, "step": 261 }, { "epoch": 0.04074650077760498, "grad_norm": 1.112441665378311, "learning_rate": 9.959090208076239e-06, "loss": 0.175, "step": 262 }, { "epoch": 0.04090202177293935, "grad_norm": 1.492328645699945, "learning_rate": 9.958777754252418e-06, "loss": 0.2332, "step": 263 }, { "epoch": 0.04105754276827372, "grad_norm": 1.4626777112927891, "learning_rate": 9.958464116699196e-06, "loss": 0.2093, "step": 264 }, { "epoch": 0.04121306376360809, "grad_norm": 2.4304182018626266, "learning_rate": 9.958149295491441e-06, "loss": 0.2495, "step": 265 }, { "epoch": 0.04136858475894246, "grad_norm": 2.1830670676642256, "learning_rate": 9.957833290704305e-06, "loss": 0.2151, "step": 266 }, { "epoch": 0.04152410575427683, "grad_norm": 0.9776646131405466, "learning_rate": 9.957516102413223e-06, "loss": 0.2215, "step": 267 }, { "epoch": 0.0416796267496112, "grad_norm": 0.9811824757237497, "learning_rate": 9.957197730693912e-06, "loss": 0.2671, "step": 268 }, { "epoch": 0.04183514774494557, "grad_norm": 1.025030756788744, "learning_rate": 9.956878175622372e-06, "loss": 0.1935, "step": 269 }, { "epoch": 0.041990668740279936, "grad_norm": 1.715248799705313, "learning_rate": 9.956557437274887e-06, "loss": 0.2639, "step": 270 }, { "epoch": 0.042146189735614306, "grad_norm": 1.4715136542514509, "learning_rate": 9.95623551572802e-06, "loss": 0.1863, "step": 271 }, { "epoch": 0.042301710730948676, "grad_norm": 2.0941396313348766, "learning_rate": 9.955912411058616e-06, "loss": 0.1764, "step": 272 }, { "epoch": 0.042457231726283046, "grad_norm": 1.4113410003708207, "learning_rate": 9.955588123343808e-06, "loss": 0.2635, "step": 273 }, { "epoch": 0.042612752721617415, "grad_norm": 1.0999635349018924, "learning_rate": 9.955262652661009e-06, "loss": 0.2424, "step": 274 }, { "epoch": 0.042768273716951785, "grad_norm": 1.0847541480257452, "learning_rate": 9.954935999087908e-06, "loss": 0.276, "step": 275 }, { "epoch": 0.04292379471228616, "grad_norm": 1.695906274664277, "learning_rate": 9.954608162702488e-06, "loss": 0.2316, "step": 276 }, { "epoch": 0.04307931570762053, "grad_norm": 1.428650374776818, "learning_rate": 9.954279143583003e-06, "loss": 0.234, "step": 277 }, { "epoch": 0.0432348367029549, "grad_norm": 1.261831528775643, "learning_rate": 9.953948941807998e-06, "loss": 0.2331, "step": 278 }, { "epoch": 0.04339035769828927, "grad_norm": 1.1389240235405695, "learning_rate": 9.953617557456295e-06, "loss": 0.1813, "step": 279 }, { "epoch": 0.04354587869362364, "grad_norm": 2.1356821017337264, "learning_rate": 9.953284990607e-06, "loss": 0.2716, "step": 280 }, { "epoch": 0.04370139968895801, "grad_norm": 1.256196669200449, "learning_rate": 9.952951241339501e-06, "loss": 0.2586, "step": 281 }, { "epoch": 0.04385692068429238, "grad_norm": 1.6264279435141102, "learning_rate": 9.952616309733471e-06, "loss": 0.2138, "step": 282 }, { "epoch": 0.04401244167962675, "grad_norm": 1.0771562874552736, "learning_rate": 9.952280195868859e-06, "loss": 0.2798, "step": 283 }, { "epoch": 0.04416796267496112, "grad_norm": 1.6634031368562676, "learning_rate": 9.951942899825906e-06, "loss": 0.3159, "step": 284 }, { "epoch": 0.04432348367029549, "grad_norm": 1.5379741925800816, "learning_rate": 9.951604421685121e-06, "loss": 0.3275, "step": 285 }, { "epoch": 0.04447900466562986, "grad_norm": 1.4489954817264272, "learning_rate": 9.951264761527311e-06, "loss": 0.1989, "step": 286 }, { "epoch": 0.04463452566096423, "grad_norm": 1.6369744606712289, "learning_rate": 9.950923919433555e-06, "loss": 0.2068, "step": 287 }, { "epoch": 0.0447900466562986, "grad_norm": 1.8400125131547473, "learning_rate": 9.950581895485214e-06, "loss": 0.1977, "step": 288 }, { "epoch": 0.04494556765163297, "grad_norm": 2.1448208174547743, "learning_rate": 9.950238689763937e-06, "loss": 0.1882, "step": 289 }, { "epoch": 0.04510108864696734, "grad_norm": 1.1002755110550755, "learning_rate": 9.949894302351653e-06, "loss": 0.2422, "step": 290 }, { "epoch": 0.04525660964230171, "grad_norm": 0.8557887132764603, "learning_rate": 9.94954873333057e-06, "loss": 0.2249, "step": 291 }, { "epoch": 0.04541213063763608, "grad_norm": 1.800548229871832, "learning_rate": 9.94920198278318e-06, "loss": 0.2462, "step": 292 }, { "epoch": 0.04556765163297045, "grad_norm": 1.077848623865367, "learning_rate": 9.948854050792256e-06, "loss": 0.1693, "step": 293 }, { "epoch": 0.045723172628304824, "grad_norm": 1.3420617788641933, "learning_rate": 9.948504937440857e-06, "loss": 0.2632, "step": 294 }, { "epoch": 0.045878693623639194, "grad_norm": 1.786889545891979, "learning_rate": 9.948154642812321e-06, "loss": 0.1812, "step": 295 }, { "epoch": 0.046034214618973564, "grad_norm": 1.6608331504976344, "learning_rate": 9.947803166990267e-06, "loss": 0.2781, "step": 296 }, { "epoch": 0.046189735614307934, "grad_norm": 1.479079510539959, "learning_rate": 9.947450510058596e-06, "loss": 0.2176, "step": 297 }, { "epoch": 0.0463452566096423, "grad_norm": 1.1205653962227666, "learning_rate": 9.947096672101496e-06, "loss": 0.2189, "step": 298 }, { "epoch": 0.04650077760497667, "grad_norm": 1.6903970393534788, "learning_rate": 9.94674165320343e-06, "loss": 0.1715, "step": 299 }, { "epoch": 0.04665629860031104, "grad_norm": 3.020535469766265, "learning_rate": 9.946385453449145e-06, "loss": 0.2334, "step": 300 }, { "epoch": 0.04665629860031104, "eval_loss": 0.23520340025424957, "eval_runtime": 9.4655, "eval_samples_per_second": 2.747, "eval_steps_per_second": 0.74, "step": 300 }, { "epoch": 0.04681181959564541, "grad_norm": 1.2625213750296742, "learning_rate": 9.946028072923675e-06, "loss": 0.2153, "step": 301 }, { "epoch": 0.04696734059097978, "grad_norm": 1.326552639234392, "learning_rate": 9.945669511712328e-06, "loss": 0.1378, "step": 302 }, { "epoch": 0.04712286158631415, "grad_norm": 1.1353660480206176, "learning_rate": 9.945309769900698e-06, "loss": 0.2505, "step": 303 }, { "epoch": 0.04727838258164852, "grad_norm": 1.2591178630665596, "learning_rate": 9.944948847574662e-06, "loss": 0.1704, "step": 304 }, { "epoch": 0.04743390357698289, "grad_norm": 1.3520689396483014, "learning_rate": 9.944586744820377e-06, "loss": 0.2324, "step": 305 }, { "epoch": 0.04758942457231726, "grad_norm": 1.0116417439713241, "learning_rate": 9.94422346172428e-06, "loss": 0.1512, "step": 306 }, { "epoch": 0.04774494556765163, "grad_norm": 1.479626380132595, "learning_rate": 9.943858998373093e-06, "loss": 0.2121, "step": 307 }, { "epoch": 0.047900466562986, "grad_norm": 1.4227055232441543, "learning_rate": 9.94349335485382e-06, "loss": 0.2667, "step": 308 }, { "epoch": 0.04805598755832037, "grad_norm": 1.583200032514501, "learning_rate": 9.943126531253744e-06, "loss": 0.289, "step": 309 }, { "epoch": 0.04821150855365474, "grad_norm": 1.8189938486203978, "learning_rate": 9.942758527660429e-06, "loss": 0.3084, "step": 310 }, { "epoch": 0.04836702954898912, "grad_norm": 1.146189412882889, "learning_rate": 9.942389344161724e-06, "loss": 0.1669, "step": 311 }, { "epoch": 0.04852255054432349, "grad_norm": 1.547896984860253, "learning_rate": 9.94201898084576e-06, "loss": 0.2064, "step": 312 }, { "epoch": 0.048678071539657856, "grad_norm": 1.5949794296702688, "learning_rate": 9.941647437800946e-06, "loss": 0.1929, "step": 313 }, { "epoch": 0.048833592534992226, "grad_norm": 1.803377063241175, "learning_rate": 9.941274715115976e-06, "loss": 0.2791, "step": 314 }, { "epoch": 0.048989113530326596, "grad_norm": 1.3837921692775779, "learning_rate": 9.940900812879822e-06, "loss": 0.1767, "step": 315 }, { "epoch": 0.049144634525660966, "grad_norm": 1.3433932609509933, "learning_rate": 9.940525731181741e-06, "loss": 0.2084, "step": 316 }, { "epoch": 0.049300155520995335, "grad_norm": 1.357062528683942, "learning_rate": 9.940149470111269e-06, "loss": 0.2047, "step": 317 }, { "epoch": 0.049455676516329705, "grad_norm": 1.6539883727473814, "learning_rate": 9.939772029758225e-06, "loss": 0.2925, "step": 318 }, { "epoch": 0.049611197511664075, "grad_norm": 1.2278880982790155, "learning_rate": 9.939393410212713e-06, "loss": 0.2649, "step": 319 }, { "epoch": 0.049766718506998445, "grad_norm": 1.6247947056783312, "learning_rate": 9.93901361156511e-06, "loss": 0.3355, "step": 320 }, { "epoch": 0.049922239502332814, "grad_norm": 1.1732603342184649, "learning_rate": 9.93863263390608e-06, "loss": 0.2603, "step": 321 }, { "epoch": 0.050077760497667184, "grad_norm": 1.4022468720638315, "learning_rate": 9.93825047732657e-06, "loss": 0.3171, "step": 322 }, { "epoch": 0.050233281493001554, "grad_norm": 1.3668475608164796, "learning_rate": 9.937867141917804e-06, "loss": 0.2952, "step": 323 }, { "epoch": 0.050388802488335924, "grad_norm": 1.4553813573539522, "learning_rate": 9.93748262777129e-06, "loss": 0.1581, "step": 324 }, { "epoch": 0.05054432348367029, "grad_norm": 1.9871080316775154, "learning_rate": 9.937096934978819e-06, "loss": 0.2368, "step": 325 }, { "epoch": 0.05069984447900466, "grad_norm": 1.2900065629907207, "learning_rate": 9.936710063632457e-06, "loss": 0.2831, "step": 326 }, { "epoch": 0.05085536547433903, "grad_norm": 0.9263549089146618, "learning_rate": 9.93632201382456e-06, "loss": 0.2086, "step": 327 }, { "epoch": 0.0510108864696734, "grad_norm": 1.9892589335821493, "learning_rate": 9.935932785647756e-06, "loss": 0.2717, "step": 328 }, { "epoch": 0.05116640746500778, "grad_norm": 1.1155547773179386, "learning_rate": 9.935542379194965e-06, "loss": 0.2731, "step": 329 }, { "epoch": 0.05132192846034215, "grad_norm": 1.0330106857849222, "learning_rate": 9.935150794559379e-06, "loss": 0.1841, "step": 330 }, { "epoch": 0.05147744945567652, "grad_norm": 1.52093348670823, "learning_rate": 9.934758031834475e-06, "loss": 0.2061, "step": 331 }, { "epoch": 0.05163297045101089, "grad_norm": 1.1824055834479263, "learning_rate": 9.93436409111401e-06, "loss": 0.2613, "step": 332 }, { "epoch": 0.05178849144634526, "grad_norm": 1.5329142188470473, "learning_rate": 9.933968972492026e-06, "loss": 0.2541, "step": 333 }, { "epoch": 0.05194401244167963, "grad_norm": 1.0304282737168275, "learning_rate": 9.933572676062841e-06, "loss": 0.2024, "step": 334 }, { "epoch": 0.052099533437014, "grad_norm": 1.1252175849664872, "learning_rate": 9.933175201921057e-06, "loss": 0.201, "step": 335 }, { "epoch": 0.05225505443234837, "grad_norm": 1.6828294804696526, "learning_rate": 9.932776550161559e-06, "loss": 0.2298, "step": 336 }, { "epoch": 0.05241057542768274, "grad_norm": 1.2831001226274117, "learning_rate": 9.932376720879503e-06, "loss": 0.2352, "step": 337 }, { "epoch": 0.05256609642301711, "grad_norm": 2.152789286567263, "learning_rate": 9.931975714170345e-06, "loss": 0.3382, "step": 338 }, { "epoch": 0.05272161741835148, "grad_norm": 1.702657664273862, "learning_rate": 9.931573530129803e-06, "loss": 0.2368, "step": 339 }, { "epoch": 0.05287713841368585, "grad_norm": 2.05056832602719, "learning_rate": 9.931170168853886e-06, "loss": 0.2992, "step": 340 }, { "epoch": 0.053032659409020216, "grad_norm": 1.5775290622934088, "learning_rate": 9.930765630438882e-06, "loss": 0.212, "step": 341 }, { "epoch": 0.053188180404354586, "grad_norm": 1.166034186090071, "learning_rate": 9.93035991498136e-06, "loss": 0.2081, "step": 342 }, { "epoch": 0.053343701399688956, "grad_norm": 1.4555896083998001, "learning_rate": 9.929953022578171e-06, "loss": 0.1857, "step": 343 }, { "epoch": 0.053499222395023326, "grad_norm": 1.343927833342108, "learning_rate": 9.929544953326445e-06, "loss": 0.2691, "step": 344 }, { "epoch": 0.053654743390357695, "grad_norm": 1.8890642830307378, "learning_rate": 9.929135707323592e-06, "loss": 0.1967, "step": 345 }, { "epoch": 0.053810264385692065, "grad_norm": 1.4990308791372666, "learning_rate": 9.928725284667308e-06, "loss": 0.1774, "step": 346 }, { "epoch": 0.05396578538102644, "grad_norm": 1.615806257387967, "learning_rate": 9.928313685455565e-06, "loss": 0.2234, "step": 347 }, { "epoch": 0.05412130637636081, "grad_norm": 1.3758078431089233, "learning_rate": 9.927900909786617e-06, "loss": 0.259, "step": 348 }, { "epoch": 0.05427682737169518, "grad_norm": 0.855435278326685, "learning_rate": 9.927486957759001e-06, "loss": 0.2068, "step": 349 }, { "epoch": 0.05443234836702955, "grad_norm": 1.5217482862634222, "learning_rate": 9.927071829471531e-06, "loss": 0.1551, "step": 350 }, { "epoch": 0.05458786936236392, "grad_norm": 1.5111503264835533, "learning_rate": 9.926655525023304e-06, "loss": 0.2599, "step": 351 }, { "epoch": 0.05474339035769829, "grad_norm": 0.8967930843733002, "learning_rate": 9.9262380445137e-06, "loss": 0.169, "step": 352 }, { "epoch": 0.05489891135303266, "grad_norm": 1.9464375941159884, "learning_rate": 9.925819388042374e-06, "loss": 0.2983, "step": 353 }, { "epoch": 0.05505443234836703, "grad_norm": 1.574189824318599, "learning_rate": 9.925399555709269e-06, "loss": 0.1937, "step": 354 }, { "epoch": 0.0552099533437014, "grad_norm": 3.1438752373638232, "learning_rate": 9.924978547614604e-06, "loss": 0.2181, "step": 355 }, { "epoch": 0.05536547433903577, "grad_norm": 1.6348127637741856, "learning_rate": 9.924556363858877e-06, "loss": 0.1847, "step": 356 }, { "epoch": 0.05552099533437014, "grad_norm": 1.724455721347507, "learning_rate": 9.92413300454287e-06, "loss": 0.1924, "step": 357 }, { "epoch": 0.05567651632970451, "grad_norm": 0.9215074637606898, "learning_rate": 9.923708469767645e-06, "loss": 0.1484, "step": 358 }, { "epoch": 0.05583203732503888, "grad_norm": 1.0048144642733263, "learning_rate": 9.923282759634547e-06, "loss": 0.139, "step": 359 }, { "epoch": 0.05598755832037325, "grad_norm": 1.6563473574979655, "learning_rate": 9.922855874245197e-06, "loss": 0.2462, "step": 360 }, { "epoch": 0.05614307931570762, "grad_norm": 1.0753481257964308, "learning_rate": 9.922427813701495e-06, "loss": 0.2543, "step": 361 }, { "epoch": 0.05629860031104199, "grad_norm": 1.1607722120362791, "learning_rate": 9.92199857810563e-06, "loss": 0.1919, "step": 362 }, { "epoch": 0.05645412130637636, "grad_norm": 1.0235707105593828, "learning_rate": 9.921568167560065e-06, "loss": 0.1851, "step": 363 }, { "epoch": 0.05660964230171073, "grad_norm": 1.443489161948352, "learning_rate": 9.921136582167545e-06, "loss": 0.2566, "step": 364 }, { "epoch": 0.056765163297045104, "grad_norm": 1.1047251832726421, "learning_rate": 9.920703822031094e-06, "loss": 0.2268, "step": 365 }, { "epoch": 0.056920684292379474, "grad_norm": 1.8071891113724519, "learning_rate": 9.92026988725402e-06, "loss": 0.286, "step": 366 }, { "epoch": 0.057076205287713844, "grad_norm": 1.127534519608966, "learning_rate": 9.919834777939908e-06, "loss": 0.2078, "step": 367 }, { "epoch": 0.05723172628304821, "grad_norm": 1.3537981754957027, "learning_rate": 9.919398494192625e-06, "loss": 0.2574, "step": 368 }, { "epoch": 0.05738724727838258, "grad_norm": 1.5740289483284484, "learning_rate": 9.918961036116317e-06, "loss": 0.2168, "step": 369 }, { "epoch": 0.05754276827371695, "grad_norm": 2.1521943324617854, "learning_rate": 9.918522403815414e-06, "loss": 0.5388, "step": 370 }, { "epoch": 0.05769828926905132, "grad_norm": 0.9621156840694527, "learning_rate": 9.918082597394621e-06, "loss": 0.2206, "step": 371 }, { "epoch": 0.05785381026438569, "grad_norm": 0.8374473543740336, "learning_rate": 9.91764161695893e-06, "loss": 0.1931, "step": 372 }, { "epoch": 0.05800933125972006, "grad_norm": 1.594565893913882, "learning_rate": 9.917199462613601e-06, "loss": 0.2664, "step": 373 }, { "epoch": 0.05816485225505443, "grad_norm": 2.539276249800021, "learning_rate": 9.916756134464191e-06, "loss": 0.3158, "step": 374 }, { "epoch": 0.0583203732503888, "grad_norm": 1.0461962066836652, "learning_rate": 9.916311632616525e-06, "loss": 0.2489, "step": 375 }, { "epoch": 0.05847589424572317, "grad_norm": 1.1340444520472663, "learning_rate": 9.915865957176709e-06, "loss": 0.2718, "step": 376 }, { "epoch": 0.05863141524105754, "grad_norm": 1.467480205738983, "learning_rate": 9.915419108251138e-06, "loss": 0.1753, "step": 377 }, { "epoch": 0.05878693623639191, "grad_norm": 1.4394725259816188, "learning_rate": 9.914971085946476e-06, "loss": 0.1973, "step": 378 }, { "epoch": 0.05894245723172628, "grad_norm": 1.2534669496284443, "learning_rate": 9.914521890369676e-06, "loss": 0.2127, "step": 379 }, { "epoch": 0.05909797822706065, "grad_norm": 1.282361137311585, "learning_rate": 9.914071521627964e-06, "loss": 0.1881, "step": 380 }, { "epoch": 0.05925349922239502, "grad_norm": 1.7744186005576332, "learning_rate": 9.913619979828851e-06, "loss": 0.1875, "step": 381 }, { "epoch": 0.0594090202177294, "grad_norm": 1.5020250209663002, "learning_rate": 9.913167265080126e-06, "loss": 0.1684, "step": 382 }, { "epoch": 0.05956454121306377, "grad_norm": 1.259074929221576, "learning_rate": 9.912713377489858e-06, "loss": 0.2268, "step": 383 }, { "epoch": 0.059720062208398136, "grad_norm": 1.7761373693512776, "learning_rate": 9.912258317166398e-06, "loss": 0.223, "step": 384 }, { "epoch": 0.059875583203732506, "grad_norm": 2.38865888975245, "learning_rate": 9.911802084218374e-06, "loss": 0.2401, "step": 385 }, { "epoch": 0.060031104199066876, "grad_norm": 0.8949382740792282, "learning_rate": 9.911344678754694e-06, "loss": 0.1922, "step": 386 }, { "epoch": 0.060186625194401246, "grad_norm": 1.5889982876131292, "learning_rate": 9.910886100884547e-06, "loss": 0.1943, "step": 387 }, { "epoch": 0.060342146189735615, "grad_norm": 1.4147870380604834, "learning_rate": 9.910426350717404e-06, "loss": 0.1812, "step": 388 }, { "epoch": 0.060497667185069985, "grad_norm": 1.8231195124047115, "learning_rate": 9.909965428363012e-06, "loss": 0.2312, "step": 389 }, { "epoch": 0.060653188180404355, "grad_norm": 1.8874621933930384, "learning_rate": 9.909503333931402e-06, "loss": 0.287, "step": 390 }, { "epoch": 0.060808709175738725, "grad_norm": 1.7665216636429069, "learning_rate": 9.90904006753288e-06, "loss": 0.2185, "step": 391 }, { "epoch": 0.060964230171073094, "grad_norm": 1.256357590139898, "learning_rate": 9.908575629278034e-06, "loss": 0.1919, "step": 392 }, { "epoch": 0.061119751166407464, "grad_norm": 4.375967721306914, "learning_rate": 9.908110019277735e-06, "loss": 0.1781, "step": 393 }, { "epoch": 0.061275272161741834, "grad_norm": 1.4286735960699084, "learning_rate": 9.907643237643127e-06, "loss": 0.253, "step": 394 }, { "epoch": 0.061430793157076204, "grad_norm": 1.6229980414007696, "learning_rate": 9.90717528448564e-06, "loss": 0.2598, "step": 395 }, { "epoch": 0.06158631415241057, "grad_norm": 1.654127403226531, "learning_rate": 9.906706159916977e-06, "loss": 0.2677, "step": 396 }, { "epoch": 0.06174183514774494, "grad_norm": 0.7489317566220969, "learning_rate": 9.90623586404913e-06, "loss": 0.1595, "step": 397 }, { "epoch": 0.06189735614307931, "grad_norm": 1.0243584995437751, "learning_rate": 9.90576439699436e-06, "loss": 0.2089, "step": 398 }, { "epoch": 0.06205287713841368, "grad_norm": 1.2843274122650117, "learning_rate": 9.905291758865217e-06, "loss": 0.2458, "step": 399 }, { "epoch": 0.06220839813374806, "grad_norm": 1.482986812845832, "learning_rate": 9.904817949774524e-06, "loss": 0.2611, "step": 400 }, { "epoch": 0.06220839813374806, "eval_loss": 0.23184187710285187, "eval_runtime": 9.4466, "eval_samples_per_second": 2.752, "eval_steps_per_second": 0.741, "step": 400 }, { "epoch": 0.06236391912908243, "grad_norm": 2.01899839511783, "learning_rate": 9.904342969835385e-06, "loss": 0.2178, "step": 401 }, { "epoch": 0.06251944012441679, "grad_norm": 1.4244669635257896, "learning_rate": 9.903866819161188e-06, "loss": 0.2321, "step": 402 }, { "epoch": 0.06267496111975117, "grad_norm": 1.7090867256976423, "learning_rate": 9.903389497865593e-06, "loss": 0.2071, "step": 403 }, { "epoch": 0.06283048211508553, "grad_norm": 1.305136754505658, "learning_rate": 9.902911006062543e-06, "loss": 0.1899, "step": 404 }, { "epoch": 0.06298600311041991, "grad_norm": 1.0188677304744835, "learning_rate": 9.902431343866266e-06, "loss": 0.2457, "step": 405 }, { "epoch": 0.06314152410575427, "grad_norm": 1.6042710170666996, "learning_rate": 9.901950511391259e-06, "loss": 0.1894, "step": 406 }, { "epoch": 0.06329704510108865, "grad_norm": 1.3017493690494788, "learning_rate": 9.901468508752304e-06, "loss": 0.2908, "step": 407 }, { "epoch": 0.06345256609642301, "grad_norm": 1.3230633029674432, "learning_rate": 9.900985336064463e-06, "loss": 0.2786, "step": 408 }, { "epoch": 0.06360808709175739, "grad_norm": 1.5120257860737862, "learning_rate": 9.900500993443076e-06, "loss": 0.2516, "step": 409 }, { "epoch": 0.06376360808709176, "grad_norm": 1.004582433223966, "learning_rate": 9.900015481003762e-06, "loss": 0.2232, "step": 410 }, { "epoch": 0.06391912908242613, "grad_norm": 1.399115724283105, "learning_rate": 9.89952879886242e-06, "loss": 0.2763, "step": 411 }, { "epoch": 0.0640746500777605, "grad_norm": 1.816764777159624, "learning_rate": 9.899040947135225e-06, "loss": 0.2913, "step": 412 }, { "epoch": 0.06423017107309487, "grad_norm": 1.1949304261760583, "learning_rate": 9.898551925938638e-06, "loss": 0.191, "step": 413 }, { "epoch": 0.06438569206842924, "grad_norm": 1.6899096837752585, "learning_rate": 9.898061735389395e-06, "loss": 0.2314, "step": 414 }, { "epoch": 0.0645412130637636, "grad_norm": 1.6400875402483213, "learning_rate": 9.897570375604508e-06, "loss": 0.1985, "step": 415 }, { "epoch": 0.06469673405909798, "grad_norm": 1.1700291435704913, "learning_rate": 9.897077846701274e-06, "loss": 0.2178, "step": 416 }, { "epoch": 0.06485225505443235, "grad_norm": 1.6396026705753728, "learning_rate": 9.896584148797265e-06, "loss": 0.2443, "step": 417 }, { "epoch": 0.06500777604976672, "grad_norm": 0.8511496035113331, "learning_rate": 9.896089282010338e-06, "loss": 0.1619, "step": 418 }, { "epoch": 0.06516329704510108, "grad_norm": 1.3924064844406538, "learning_rate": 9.895593246458617e-06, "loss": 0.2021, "step": 419 }, { "epoch": 0.06531881804043546, "grad_norm": 0.8605197503722029, "learning_rate": 9.895096042260517e-06, "loss": 0.1628, "step": 420 }, { "epoch": 0.06547433903576982, "grad_norm": 1.3908417494412908, "learning_rate": 9.894597669534729e-06, "loss": 0.2054, "step": 421 }, { "epoch": 0.0656298600311042, "grad_norm": 1.445540354985538, "learning_rate": 9.894098128400219e-06, "loss": 0.2197, "step": 422 }, { "epoch": 0.06578538102643856, "grad_norm": 1.3103752658839474, "learning_rate": 9.893597418976234e-06, "loss": 0.2297, "step": 423 }, { "epoch": 0.06594090202177294, "grad_norm": 1.0497805770986521, "learning_rate": 9.893095541382304e-06, "loss": 0.1747, "step": 424 }, { "epoch": 0.0660964230171073, "grad_norm": 1.513640843523071, "learning_rate": 9.892592495738229e-06, "loss": 0.1754, "step": 425 }, { "epoch": 0.06625194401244168, "grad_norm": 1.0493517604475748, "learning_rate": 9.892088282164098e-06, "loss": 0.2586, "step": 426 }, { "epoch": 0.06640746500777606, "grad_norm": 1.4678962231044086, "learning_rate": 9.89158290078027e-06, "loss": 0.2932, "step": 427 }, { "epoch": 0.06656298600311042, "grad_norm": 1.6765991678498569, "learning_rate": 9.891076351707389e-06, "loss": 0.2116, "step": 428 }, { "epoch": 0.0667185069984448, "grad_norm": 1.4655721822686016, "learning_rate": 9.890568635066373e-06, "loss": 0.1543, "step": 429 }, { "epoch": 0.06687402799377916, "grad_norm": 1.6313534003780414, "learning_rate": 9.890059750978425e-06, "loss": 0.1571, "step": 430 }, { "epoch": 0.06702954898911354, "grad_norm": 1.0261848775525118, "learning_rate": 9.889549699565017e-06, "loss": 0.2865, "step": 431 }, { "epoch": 0.0671850699844479, "grad_norm": 1.5225780156038968, "learning_rate": 9.88903848094791e-06, "loss": 0.1914, "step": 432 }, { "epoch": 0.06734059097978227, "grad_norm": 1.3350387169313882, "learning_rate": 9.888526095249138e-06, "loss": 0.2754, "step": 433 }, { "epoch": 0.06749611197511664, "grad_norm": 1.192180411270206, "learning_rate": 9.888012542591014e-06, "loss": 0.1974, "step": 434 }, { "epoch": 0.06765163297045101, "grad_norm": 1.3005497242232493, "learning_rate": 9.88749782309613e-06, "loss": 0.1903, "step": 435 }, { "epoch": 0.06780715396578538, "grad_norm": 1.1288456938448086, "learning_rate": 9.88698193688736e-06, "loss": 0.2333, "step": 436 }, { "epoch": 0.06796267496111975, "grad_norm": 1.130396483559975, "learning_rate": 9.886464884087846e-06, "loss": 0.2674, "step": 437 }, { "epoch": 0.06811819595645412, "grad_norm": 0.9035948769600225, "learning_rate": 9.885946664821021e-06, "loss": 0.1864, "step": 438 }, { "epoch": 0.0682737169517885, "grad_norm": 1.1233476167867031, "learning_rate": 9.885427279210592e-06, "loss": 0.1787, "step": 439 }, { "epoch": 0.06842923794712286, "grad_norm": 1.2410015017602511, "learning_rate": 9.88490672738054e-06, "loss": 0.2509, "step": 440 }, { "epoch": 0.06858475894245723, "grad_norm": 1.3429869818046247, "learning_rate": 9.884385009455131e-06, "loss": 0.2811, "step": 441 }, { "epoch": 0.0687402799377916, "grad_norm": 0.7587532198438675, "learning_rate": 9.883862125558904e-06, "loss": 0.1781, "step": 442 }, { "epoch": 0.06889580093312597, "grad_norm": 0.9782244567957874, "learning_rate": 9.88333807581668e-06, "loss": 0.1891, "step": 443 }, { "epoch": 0.06905132192846034, "grad_norm": 1.8354472673215871, "learning_rate": 9.882812860353558e-06, "loss": 0.2372, "step": 444 }, { "epoch": 0.06920684292379471, "grad_norm": 1.0210293095436775, "learning_rate": 9.882286479294911e-06, "loss": 0.1988, "step": 445 }, { "epoch": 0.06936236391912909, "grad_norm": 2.117567357062213, "learning_rate": 9.881758932766398e-06, "loss": 0.1992, "step": 446 }, { "epoch": 0.06951788491446345, "grad_norm": 1.1644685693150085, "learning_rate": 9.881230220893948e-06, "loss": 0.18, "step": 447 }, { "epoch": 0.06967340590979783, "grad_norm": 1.1209275418337545, "learning_rate": 9.880700343803773e-06, "loss": 0.3069, "step": 448 }, { "epoch": 0.06982892690513219, "grad_norm": 1.155686416296927, "learning_rate": 9.880169301622362e-06, "loss": 0.1744, "step": 449 }, { "epoch": 0.06998444790046657, "grad_norm": 0.9709514091501408, "learning_rate": 9.879637094476482e-06, "loss": 0.1871, "step": 450 }, { "epoch": 0.07013996889580093, "grad_norm": 1.1219093494884402, "learning_rate": 9.87910372249318e-06, "loss": 0.1932, "step": 451 }, { "epoch": 0.07029548989113531, "grad_norm": 1.9094748023939434, "learning_rate": 9.878569185799778e-06, "loss": 0.2339, "step": 452 }, { "epoch": 0.07045101088646967, "grad_norm": 1.3264334862739553, "learning_rate": 9.878033484523876e-06, "loss": 0.1407, "step": 453 }, { "epoch": 0.07060653188180405, "grad_norm": 1.667180383137504, "learning_rate": 9.877496618793356e-06, "loss": 0.1867, "step": 454 }, { "epoch": 0.07076205287713841, "grad_norm": 1.0486860196671894, "learning_rate": 9.876958588736371e-06, "loss": 0.1683, "step": 455 }, { "epoch": 0.07091757387247279, "grad_norm": 1.2507603637095628, "learning_rate": 9.876419394481363e-06, "loss": 0.1958, "step": 456 }, { "epoch": 0.07107309486780715, "grad_norm": 1.7806763122908775, "learning_rate": 9.87587903615704e-06, "loss": 0.2466, "step": 457 }, { "epoch": 0.07122861586314153, "grad_norm": 1.0570385231053188, "learning_rate": 9.875337513892395e-06, "loss": 0.1336, "step": 458 }, { "epoch": 0.07138413685847589, "grad_norm": 1.8093621923009064, "learning_rate": 9.874794827816696e-06, "loss": 0.245, "step": 459 }, { "epoch": 0.07153965785381027, "grad_norm": 1.6343174119313473, "learning_rate": 9.874250978059489e-06, "loss": 0.1878, "step": 460 }, { "epoch": 0.07169517884914463, "grad_norm": 1.2474757406216732, "learning_rate": 9.873705964750603e-06, "loss": 0.201, "step": 461 }, { "epoch": 0.071850699844479, "grad_norm": 0.9854370189019162, "learning_rate": 9.873159788020135e-06, "loss": 0.1572, "step": 462 }, { "epoch": 0.07200622083981338, "grad_norm": 1.2046716423202313, "learning_rate": 9.872612447998466e-06, "loss": 0.1644, "step": 463 }, { "epoch": 0.07216174183514774, "grad_norm": 1.6657683984708445, "learning_rate": 9.872063944816257e-06, "loss": 0.2026, "step": 464 }, { "epoch": 0.07231726283048212, "grad_norm": 1.6319780353610651, "learning_rate": 9.871514278604439e-06, "loss": 0.2361, "step": 465 }, { "epoch": 0.07247278382581648, "grad_norm": 0.930626270347552, "learning_rate": 9.870963449494228e-06, "loss": 0.2334, "step": 466 }, { "epoch": 0.07262830482115086, "grad_norm": 1.7347785771237878, "learning_rate": 9.870411457617115e-06, "loss": 0.3121, "step": 467 }, { "epoch": 0.07278382581648522, "grad_norm": 1.6861297399111428, "learning_rate": 9.869858303104864e-06, "loss": 0.2234, "step": 468 }, { "epoch": 0.0729393468118196, "grad_norm": 2.2175613812233856, "learning_rate": 9.869303986089525e-06, "loss": 0.215, "step": 469 }, { "epoch": 0.07309486780715396, "grad_norm": 1.2151103786584494, "learning_rate": 9.86874850670342e-06, "loss": 0.143, "step": 470 }, { "epoch": 0.07325038880248834, "grad_norm": 1.8347498082665927, "learning_rate": 9.868191865079149e-06, "loss": 0.1847, "step": 471 }, { "epoch": 0.0734059097978227, "grad_norm": 0.7662001443118179, "learning_rate": 9.867634061349592e-06, "loss": 0.2132, "step": 472 }, { "epoch": 0.07356143079315708, "grad_norm": 1.127229878211817, "learning_rate": 9.8670750956479e-06, "loss": 0.2405, "step": 473 }, { "epoch": 0.07371695178849144, "grad_norm": 0.8919765028163983, "learning_rate": 9.866514968107511e-06, "loss": 0.2187, "step": 474 }, { "epoch": 0.07387247278382582, "grad_norm": 0.8318099634868261, "learning_rate": 9.865953678862133e-06, "loss": 0.149, "step": 475 }, { "epoch": 0.07402799377916018, "grad_norm": 1.577340616348031, "learning_rate": 9.865391228045753e-06, "loss": 0.2319, "step": 476 }, { "epoch": 0.07418351477449456, "grad_norm": 1.116181816359047, "learning_rate": 9.864827615792637e-06, "loss": 0.1901, "step": 477 }, { "epoch": 0.07433903576982892, "grad_norm": 1.105109643192386, "learning_rate": 9.864262842237327e-06, "loss": 0.2011, "step": 478 }, { "epoch": 0.0744945567651633, "grad_norm": 1.9701207318396636, "learning_rate": 9.863696907514641e-06, "loss": 0.2409, "step": 479 }, { "epoch": 0.07465007776049767, "grad_norm": 2.2498632028053507, "learning_rate": 9.863129811759678e-06, "loss": 0.3829, "step": 480 }, { "epoch": 0.07480559875583204, "grad_norm": 1.1224194434111838, "learning_rate": 9.86256155510781e-06, "loss": 0.2114, "step": 481 }, { "epoch": 0.07496111975116641, "grad_norm": 1.5539407325523458, "learning_rate": 9.861992137694687e-06, "loss": 0.1976, "step": 482 }, { "epoch": 0.07511664074650078, "grad_norm": 1.962092802549792, "learning_rate": 9.86142155965624e-06, "loss": 0.2725, "step": 483 }, { "epoch": 0.07527216174183515, "grad_norm": 0.8983695148666645, "learning_rate": 9.860849821128668e-06, "loss": 0.154, "step": 484 }, { "epoch": 0.07542768273716952, "grad_norm": 1.398592267234838, "learning_rate": 9.86027692224846e-06, "loss": 0.1497, "step": 485 }, { "epoch": 0.07558320373250389, "grad_norm": 1.0403186420901969, "learning_rate": 9.859702863152372e-06, "loss": 0.1936, "step": 486 }, { "epoch": 0.07573872472783826, "grad_norm": 0.7470818354767621, "learning_rate": 9.859127643977438e-06, "loss": 0.1523, "step": 487 }, { "epoch": 0.07589424572317263, "grad_norm": 1.2067693893481815, "learning_rate": 9.858551264860972e-06, "loss": 0.3168, "step": 488 }, { "epoch": 0.076049766718507, "grad_norm": 1.5295551443098423, "learning_rate": 9.857973725940565e-06, "loss": 0.2194, "step": 489 }, { "epoch": 0.07620528771384137, "grad_norm": 1.618418958541224, "learning_rate": 9.857395027354085e-06, "loss": 0.2209, "step": 490 }, { "epoch": 0.07636080870917573, "grad_norm": 1.1696631104347366, "learning_rate": 9.856815169239671e-06, "loss": 0.1993, "step": 491 }, { "epoch": 0.07651632970451011, "grad_norm": 1.4918786793556023, "learning_rate": 9.856234151735744e-06, "loss": 0.2657, "step": 492 }, { "epoch": 0.07667185069984447, "grad_norm": 1.3100404095494855, "learning_rate": 9.855651974981005e-06, "loss": 0.2832, "step": 493 }, { "epoch": 0.07682737169517885, "grad_norm": 13.98784357990924, "learning_rate": 9.855068639114425e-06, "loss": 0.2488, "step": 494 }, { "epoch": 0.07698289269051321, "grad_norm": 1.3956332181045448, "learning_rate": 9.854484144275254e-06, "loss": 0.225, "step": 495 }, { "epoch": 0.07713841368584759, "grad_norm": 1.1858198275947147, "learning_rate": 9.853898490603018e-06, "loss": 0.2041, "step": 496 }, { "epoch": 0.07729393468118195, "grad_norm": 0.765411378364051, "learning_rate": 9.853311678237524e-06, "loss": 0.1492, "step": 497 }, { "epoch": 0.07744945567651633, "grad_norm": 1.2288325537770441, "learning_rate": 9.85272370731885e-06, "loss": 0.1773, "step": 498 }, { "epoch": 0.0776049766718507, "grad_norm": 1.3901203640607709, "learning_rate": 9.852134577987353e-06, "loss": 0.2091, "step": 499 }, { "epoch": 0.07776049766718507, "grad_norm": 1.5991626946866644, "learning_rate": 9.85154429038367e-06, "loss": 0.2485, "step": 500 }, { "epoch": 0.07776049766718507, "eval_loss": 0.22800126671791077, "eval_runtime": 9.4446, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 500 }, { "epoch": 0.07791601866251945, "grad_norm": 0.9946822389547595, "learning_rate": 9.850952844648705e-06, "loss": 0.2324, "step": 501 }, { "epoch": 0.07807153965785381, "grad_norm": 1.088817573789371, "learning_rate": 9.850360240923647e-06, "loss": 0.1813, "step": 502 }, { "epoch": 0.07822706065318819, "grad_norm": 5.945777339639669, "learning_rate": 9.849766479349959e-06, "loss": 0.1976, "step": 503 }, { "epoch": 0.07838258164852255, "grad_norm": 0.8593394406117729, "learning_rate": 9.84917156006938e-06, "loss": 0.2474, "step": 504 }, { "epoch": 0.07853810264385692, "grad_norm": 1.3930666133589364, "learning_rate": 9.848575483223925e-06, "loss": 0.215, "step": 505 }, { "epoch": 0.07869362363919129, "grad_norm": 1.6493288101835173, "learning_rate": 9.84797824895589e-06, "loss": 0.303, "step": 506 }, { "epoch": 0.07884914463452566, "grad_norm": 1.1106903817577367, "learning_rate": 9.847379857407835e-06, "loss": 0.1654, "step": 507 }, { "epoch": 0.07900466562986003, "grad_norm": 1.166896696404847, "learning_rate": 9.846780308722612e-06, "loss": 0.2046, "step": 508 }, { "epoch": 0.0791601866251944, "grad_norm": 1.7221901123414272, "learning_rate": 9.846179603043338e-06, "loss": 0.2543, "step": 509 }, { "epoch": 0.07931570762052877, "grad_norm": 1.0398664154595585, "learning_rate": 9.845577740513409e-06, "loss": 0.2616, "step": 510 }, { "epoch": 0.07947122861586314, "grad_norm": 1.2062182369254026, "learning_rate": 9.8449747212765e-06, "loss": 0.1641, "step": 511 }, { "epoch": 0.0796267496111975, "grad_norm": 1.3859318575453086, "learning_rate": 9.84437054547656e-06, "loss": 0.193, "step": 512 }, { "epoch": 0.07978227060653188, "grad_norm": 3.5056235741823523, "learning_rate": 9.843765213257814e-06, "loss": 0.2399, "step": 513 }, { "epoch": 0.07993779160186625, "grad_norm": 1.2578551416521373, "learning_rate": 9.843158724764762e-06, "loss": 0.2177, "step": 514 }, { "epoch": 0.08009331259720062, "grad_norm": 1.4118043035204642, "learning_rate": 9.842551080142182e-06, "loss": 0.21, "step": 515 }, { "epoch": 0.080248833592535, "grad_norm": 1.1155160124053434, "learning_rate": 9.841942279535128e-06, "loss": 0.2128, "step": 516 }, { "epoch": 0.08040435458786936, "grad_norm": 1.0287833439256027, "learning_rate": 9.84133232308893e-06, "loss": 0.1846, "step": 517 }, { "epoch": 0.08055987558320374, "grad_norm": 2.2894965228305377, "learning_rate": 9.84072121094919e-06, "loss": 0.1814, "step": 518 }, { "epoch": 0.0807153965785381, "grad_norm": 1.345886098139959, "learning_rate": 9.84010894326179e-06, "loss": 0.1912, "step": 519 }, { "epoch": 0.08087091757387248, "grad_norm": 1.9234609876851483, "learning_rate": 9.83949552017289e-06, "loss": 0.2982, "step": 520 }, { "epoch": 0.08102643856920684, "grad_norm": 1.2452886345823744, "learning_rate": 9.83888094182892e-06, "loss": 0.2144, "step": 521 }, { "epoch": 0.08118195956454122, "grad_norm": 1.2711995935698062, "learning_rate": 9.838265208376584e-06, "loss": 0.1799, "step": 522 }, { "epoch": 0.08133748055987558, "grad_norm": 1.0755729955519457, "learning_rate": 9.837648319962876e-06, "loss": 0.3311, "step": 523 }, { "epoch": 0.08149300155520996, "grad_norm": 1.5025152130217085, "learning_rate": 9.837030276735049e-06, "loss": 0.203, "step": 524 }, { "epoch": 0.08164852255054432, "grad_norm": 1.4271542860149822, "learning_rate": 9.83641107884064e-06, "loss": 0.2055, "step": 525 }, { "epoch": 0.0818040435458787, "grad_norm": 1.1896665999932865, "learning_rate": 9.83579072642746e-06, "loss": 0.2191, "step": 526 }, { "epoch": 0.08195956454121306, "grad_norm": 1.6391797544527267, "learning_rate": 9.835169219643597e-06, "loss": 0.2164, "step": 527 }, { "epoch": 0.08211508553654744, "grad_norm": 1.3905263994766632, "learning_rate": 9.834546558637412e-06, "loss": 0.2188, "step": 528 }, { "epoch": 0.0822706065318818, "grad_norm": 1.1325886941547982, "learning_rate": 9.833922743557545e-06, "loss": 0.3596, "step": 529 }, { "epoch": 0.08242612752721618, "grad_norm": 1.58458236573862, "learning_rate": 9.833297774552905e-06, "loss": 0.2725, "step": 530 }, { "epoch": 0.08258164852255054, "grad_norm": 1.2630288499628133, "learning_rate": 9.832671651772685e-06, "loss": 0.3327, "step": 531 }, { "epoch": 0.08273716951788491, "grad_norm": 1.1472998381036559, "learning_rate": 9.832044375366347e-06, "loss": 0.1758, "step": 532 }, { "epoch": 0.08289269051321929, "grad_norm": 1.148175513948538, "learning_rate": 9.831415945483634e-06, "loss": 0.189, "step": 533 }, { "epoch": 0.08304821150855365, "grad_norm": 1.2010115460022994, "learning_rate": 9.830786362274556e-06, "loss": 0.2065, "step": 534 }, { "epoch": 0.08320373250388803, "grad_norm": 1.357353814240526, "learning_rate": 9.830155625889406e-06, "loss": 0.1505, "step": 535 }, { "epoch": 0.0833592534992224, "grad_norm": 1.2541527971168078, "learning_rate": 9.829523736478748e-06, "loss": 0.2309, "step": 536 }, { "epoch": 0.08351477449455677, "grad_norm": 1.0453169347517781, "learning_rate": 9.828890694193425e-06, "loss": 0.1593, "step": 537 }, { "epoch": 0.08367029548989113, "grad_norm": 1.256435896986176, "learning_rate": 9.828256499184553e-06, "loss": 0.2081, "step": 538 }, { "epoch": 0.08382581648522551, "grad_norm": 1.4617851677608784, "learning_rate": 9.827621151603522e-06, "loss": 0.2181, "step": 539 }, { "epoch": 0.08398133748055987, "grad_norm": 2.512069587946666, "learning_rate": 9.826984651601998e-06, "loss": 0.4003, "step": 540 }, { "epoch": 0.08413685847589425, "grad_norm": 1.1168842922612399, "learning_rate": 9.826346999331923e-06, "loss": 0.2823, "step": 541 }, { "epoch": 0.08429237947122861, "grad_norm": 1.0417831453973136, "learning_rate": 9.825708194945514e-06, "loss": 0.1889, "step": 542 }, { "epoch": 0.08444790046656299, "grad_norm": 0.9096481087872343, "learning_rate": 9.82506823859526e-06, "loss": 0.2351, "step": 543 }, { "epoch": 0.08460342146189735, "grad_norm": 1.0430082881953087, "learning_rate": 9.824427130433932e-06, "loss": 0.1953, "step": 544 }, { "epoch": 0.08475894245723173, "grad_norm": 0.6608850743713712, "learning_rate": 9.823784870614568e-06, "loss": 0.1854, "step": 545 }, { "epoch": 0.08491446345256609, "grad_norm": 0.9535990803944258, "learning_rate": 9.823141459290486e-06, "loss": 0.3623, "step": 546 }, { "epoch": 0.08506998444790047, "grad_norm": 1.2084813471627978, "learning_rate": 9.822496896615276e-06, "loss": 0.2088, "step": 547 }, { "epoch": 0.08522550544323483, "grad_norm": 1.751880921507202, "learning_rate": 9.821851182742806e-06, "loss": 0.2367, "step": 548 }, { "epoch": 0.08538102643856921, "grad_norm": 0.859776642879622, "learning_rate": 9.821204317827214e-06, "loss": 0.249, "step": 549 }, { "epoch": 0.08553654743390357, "grad_norm": 1.127529266910784, "learning_rate": 9.820556302022916e-06, "loss": 0.2038, "step": 550 }, { "epoch": 0.08569206842923795, "grad_norm": 1.1762380712487397, "learning_rate": 9.819907135484607e-06, "loss": 0.1408, "step": 551 }, { "epoch": 0.08584758942457232, "grad_norm": 1.1841316789710945, "learning_rate": 9.819256818367247e-06, "loss": 0.1971, "step": 552 }, { "epoch": 0.08600311041990669, "grad_norm": 0.9978225930526609, "learning_rate": 9.818605350826078e-06, "loss": 0.2221, "step": 553 }, { "epoch": 0.08615863141524106, "grad_norm": 1.6694424755142652, "learning_rate": 9.817952733016614e-06, "loss": 0.1549, "step": 554 }, { "epoch": 0.08631415241057543, "grad_norm": 0.9346983450738274, "learning_rate": 9.817298965094644e-06, "loss": 0.1579, "step": 555 }, { "epoch": 0.0864696734059098, "grad_norm": 1.147526345911482, "learning_rate": 9.816644047216231e-06, "loss": 0.1873, "step": 556 }, { "epoch": 0.08662519440124417, "grad_norm": 1.1886850012764587, "learning_rate": 9.815987979537713e-06, "loss": 0.2347, "step": 557 }, { "epoch": 0.08678071539657854, "grad_norm": 1.6793433753087175, "learning_rate": 9.815330762215704e-06, "loss": 0.2773, "step": 558 }, { "epoch": 0.0869362363919129, "grad_norm": 0.7389091927152867, "learning_rate": 9.81467239540709e-06, "loss": 0.2376, "step": 559 }, { "epoch": 0.08709175738724728, "grad_norm": 1.5501383478894555, "learning_rate": 9.814012879269031e-06, "loss": 0.249, "step": 560 }, { "epoch": 0.08724727838258164, "grad_norm": 1.985092307546573, "learning_rate": 9.813352213958966e-06, "loss": 0.2293, "step": 561 }, { "epoch": 0.08740279937791602, "grad_norm": 1.1408911673993625, "learning_rate": 9.812690399634601e-06, "loss": 0.29, "step": 562 }, { "epoch": 0.08755832037325038, "grad_norm": 1.2461126532920535, "learning_rate": 9.812027436453924e-06, "loss": 0.2783, "step": 563 }, { "epoch": 0.08771384136858476, "grad_norm": 1.764223151926025, "learning_rate": 9.81136332457519e-06, "loss": 0.2528, "step": 564 }, { "epoch": 0.08786936236391912, "grad_norm": 1.0618642840366128, "learning_rate": 9.810698064156935e-06, "loss": 0.1723, "step": 565 }, { "epoch": 0.0880248833592535, "grad_norm": 0.8569330765683667, "learning_rate": 9.810031655357964e-06, "loss": 0.2241, "step": 566 }, { "epoch": 0.08818040435458786, "grad_norm": 1.0553303848822568, "learning_rate": 9.80936409833736e-06, "loss": 0.2312, "step": 567 }, { "epoch": 0.08833592534992224, "grad_norm": 1.8702866312005988, "learning_rate": 9.808695393254474e-06, "loss": 0.1949, "step": 568 }, { "epoch": 0.08849144634525662, "grad_norm": 0.9476538253542002, "learning_rate": 9.808025540268939e-06, "loss": 0.1783, "step": 569 }, { "epoch": 0.08864696734059098, "grad_norm": 1.4661601306937122, "learning_rate": 9.80735453954066e-06, "loss": 0.2941, "step": 570 }, { "epoch": 0.08880248833592536, "grad_norm": 1.1865752816456114, "learning_rate": 9.80668239122981e-06, "loss": 0.2196, "step": 571 }, { "epoch": 0.08895800933125972, "grad_norm": 0.9682721759722641, "learning_rate": 9.80600909549684e-06, "loss": 0.2453, "step": 572 }, { "epoch": 0.0891135303265941, "grad_norm": 1.0402552655035497, "learning_rate": 9.805334652502478e-06, "loss": 0.2528, "step": 573 }, { "epoch": 0.08926905132192846, "grad_norm": 1.1058208608284787, "learning_rate": 9.804659062407721e-06, "loss": 0.1704, "step": 574 }, { "epoch": 0.08942457231726283, "grad_norm": 0.9300562072054855, "learning_rate": 9.803982325373843e-06, "loss": 0.241, "step": 575 }, { "epoch": 0.0895800933125972, "grad_norm": 1.3452145435832572, "learning_rate": 9.803304441562391e-06, "loss": 0.179, "step": 576 }, { "epoch": 0.08973561430793157, "grad_norm": 0.934714522466104, "learning_rate": 9.802625411135183e-06, "loss": 0.2131, "step": 577 }, { "epoch": 0.08989113530326594, "grad_norm": 1.2723518042915498, "learning_rate": 9.801945234254315e-06, "loss": 0.2342, "step": 578 }, { "epoch": 0.09004665629860031, "grad_norm": 2.11692073632197, "learning_rate": 9.801263911082154e-06, "loss": 0.2148, "step": 579 }, { "epoch": 0.09020217729393468, "grad_norm": 2.6365326907523396, "learning_rate": 9.800581441781342e-06, "loss": 0.2787, "step": 580 }, { "epoch": 0.09035769828926905, "grad_norm": 1.3369047254369875, "learning_rate": 9.799897826514793e-06, "loss": 0.2365, "step": 581 }, { "epoch": 0.09051321928460342, "grad_norm": 0.9493060685693816, "learning_rate": 9.799213065445696e-06, "loss": 0.1656, "step": 582 }, { "epoch": 0.09066874027993779, "grad_norm": 1.0470819909783555, "learning_rate": 9.798527158737512e-06, "loss": 0.1578, "step": 583 }, { "epoch": 0.09082426127527216, "grad_norm": 1.0969444747176942, "learning_rate": 9.797840106553977e-06, "loss": 0.2095, "step": 584 }, { "epoch": 0.09097978227060653, "grad_norm": 1.6035875172395766, "learning_rate": 9.797151909059102e-06, "loss": 0.2682, "step": 585 }, { "epoch": 0.0911353032659409, "grad_norm": 1.3049640527657593, "learning_rate": 9.796462566417169e-06, "loss": 0.2537, "step": 586 }, { "epoch": 0.09129082426127527, "grad_norm": 1.365745492042764, "learning_rate": 9.79577207879273e-06, "loss": 0.2065, "step": 587 }, { "epoch": 0.09144634525660965, "grad_norm": 0.9500261347653985, "learning_rate": 9.795080446350616e-06, "loss": 0.1885, "step": 588 }, { "epoch": 0.09160186625194401, "grad_norm": 1.5405453397493063, "learning_rate": 9.79438766925593e-06, "loss": 0.2507, "step": 589 }, { "epoch": 0.09175738724727839, "grad_norm": 0.9919977440587929, "learning_rate": 9.79369374767405e-06, "loss": 0.1607, "step": 590 }, { "epoch": 0.09191290824261275, "grad_norm": 1.2052697190695243, "learning_rate": 9.79299868177062e-06, "loss": 0.2247, "step": 591 }, { "epoch": 0.09206842923794713, "grad_norm": 1.5911347684916193, "learning_rate": 9.792302471711564e-06, "loss": 0.1812, "step": 592 }, { "epoch": 0.09222395023328149, "grad_norm": 1.3772469912987155, "learning_rate": 9.791605117663076e-06, "loss": 0.1567, "step": 593 }, { "epoch": 0.09237947122861587, "grad_norm": 1.456752513640415, "learning_rate": 9.790906619791627e-06, "loss": 0.2009, "step": 594 }, { "epoch": 0.09253499222395023, "grad_norm": 0.9824754188966437, "learning_rate": 9.790206978263955e-06, "loss": 0.2041, "step": 595 }, { "epoch": 0.0926905132192846, "grad_norm": 1.1576177882724517, "learning_rate": 9.789506193247075e-06, "loss": 0.2304, "step": 596 }, { "epoch": 0.09284603421461897, "grad_norm": 1.3814578099918997, "learning_rate": 9.788804264908276e-06, "loss": 0.1935, "step": 597 }, { "epoch": 0.09300155520995335, "grad_norm": 0.8483069008778095, "learning_rate": 9.788101193415116e-06, "loss": 0.2148, "step": 598 }, { "epoch": 0.09315707620528771, "grad_norm": 1.3477202886979611, "learning_rate": 9.787396978935431e-06, "loss": 0.23, "step": 599 }, { "epoch": 0.09331259720062209, "grad_norm": 1.4372703771133322, "learning_rate": 9.786691621637322e-06, "loss": 0.2496, "step": 600 }, { "epoch": 0.09331259720062209, "eval_loss": 0.22426502406597137, "eval_runtime": 9.4405, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 600 }, { "epoch": 0.09346811819595645, "grad_norm": 1.9416761367047068, "learning_rate": 9.785985121689171e-06, "loss": 0.6927, "step": 601 }, { "epoch": 0.09362363919129083, "grad_norm": 1.268764907148312, "learning_rate": 9.785277479259629e-06, "loss": 0.2501, "step": 602 }, { "epoch": 0.09377916018662519, "grad_norm": 2.3273439454641514, "learning_rate": 9.784568694517618e-06, "loss": 0.2469, "step": 603 }, { "epoch": 0.09393468118195956, "grad_norm": 1.8747313801721335, "learning_rate": 9.783858767632338e-06, "loss": 0.2289, "step": 604 }, { "epoch": 0.09409020217729394, "grad_norm": 1.2586569667037595, "learning_rate": 9.783147698773257e-06, "loss": 0.1962, "step": 605 }, { "epoch": 0.0942457231726283, "grad_norm": 1.496316694651238, "learning_rate": 9.782435488110116e-06, "loss": 0.298, "step": 606 }, { "epoch": 0.09440124416796268, "grad_norm": 1.1801510466185432, "learning_rate": 9.781722135812932e-06, "loss": 0.2189, "step": 607 }, { "epoch": 0.09455676516329704, "grad_norm": 1.2565748248573585, "learning_rate": 9.78100764205199e-06, "loss": 0.2186, "step": 608 }, { "epoch": 0.09471228615863142, "grad_norm": 0.9391168040034623, "learning_rate": 9.780292006997849e-06, "loss": 0.2144, "step": 609 }, { "epoch": 0.09486780715396578, "grad_norm": 1.1387381134081225, "learning_rate": 9.779575230821344e-06, "loss": 0.1718, "step": 610 }, { "epoch": 0.09502332814930016, "grad_norm": 1.0177855034745955, "learning_rate": 9.778857313693578e-06, "loss": 0.1586, "step": 611 }, { "epoch": 0.09517884914463452, "grad_norm": 1.4624255805438011, "learning_rate": 9.778138255785928e-06, "loss": 0.2697, "step": 612 }, { "epoch": 0.0953343701399689, "grad_norm": 1.0796167649791846, "learning_rate": 9.77741805727004e-06, "loss": 0.2668, "step": 613 }, { "epoch": 0.09548989113530326, "grad_norm": 2.1747859377128806, "learning_rate": 9.776696718317842e-06, "loss": 0.2117, "step": 614 }, { "epoch": 0.09564541213063764, "grad_norm": 1.6173977205310859, "learning_rate": 9.775974239101522e-06, "loss": 0.2048, "step": 615 }, { "epoch": 0.095800933125972, "grad_norm": 1.281075534048029, "learning_rate": 9.775250619793548e-06, "loss": 0.2218, "step": 616 }, { "epoch": 0.09595645412130638, "grad_norm": 1.5623409338338163, "learning_rate": 9.77452586056666e-06, "loss": 0.2843, "step": 617 }, { "epoch": 0.09611197511664074, "grad_norm": 0.920135780872905, "learning_rate": 9.773799961593862e-06, "loss": 0.218, "step": 618 }, { "epoch": 0.09626749611197512, "grad_norm": 1.6644765009913491, "learning_rate": 9.773072923048443e-06, "loss": 0.277, "step": 619 }, { "epoch": 0.09642301710730948, "grad_norm": 1.0758387537045102, "learning_rate": 9.772344745103955e-06, "loss": 0.2405, "step": 620 }, { "epoch": 0.09657853810264386, "grad_norm": 1.1751354263981124, "learning_rate": 9.77161542793422e-06, "loss": 0.2362, "step": 621 }, { "epoch": 0.09673405909797823, "grad_norm": 2.7957127911749655, "learning_rate": 9.770884971713344e-06, "loss": 0.178, "step": 622 }, { "epoch": 0.0968895800933126, "grad_norm": 5.021758252286217, "learning_rate": 9.770153376615692e-06, "loss": 0.2095, "step": 623 }, { "epoch": 0.09704510108864697, "grad_norm": 0.8518883317455118, "learning_rate": 9.769420642815905e-06, "loss": 0.2174, "step": 624 }, { "epoch": 0.09720062208398134, "grad_norm": 1.0603512343033086, "learning_rate": 9.7686867704889e-06, "loss": 0.2437, "step": 625 }, { "epoch": 0.09735614307931571, "grad_norm": 2.7767054670419067, "learning_rate": 9.767951759809861e-06, "loss": 0.3072, "step": 626 }, { "epoch": 0.09751166407465008, "grad_norm": 0.875830402681162, "learning_rate": 9.767215610954246e-06, "loss": 0.1865, "step": 627 }, { "epoch": 0.09766718506998445, "grad_norm": 1.1746324049289305, "learning_rate": 9.766478324097784e-06, "loss": 0.1775, "step": 628 }, { "epoch": 0.09782270606531882, "grad_norm": 1.3198405804921558, "learning_rate": 9.765739899416474e-06, "loss": 0.2202, "step": 629 }, { "epoch": 0.09797822706065319, "grad_norm": 0.9040537149469751, "learning_rate": 9.76500033708659e-06, "loss": 0.134, "step": 630 }, { "epoch": 0.09813374805598755, "grad_norm": 1.1116680855923542, "learning_rate": 9.764259637284674e-06, "loss": 0.2413, "step": 631 }, { "epoch": 0.09828926905132193, "grad_norm": 1.816511140625042, "learning_rate": 9.763517800187543e-06, "loss": 0.1881, "step": 632 }, { "epoch": 0.0984447900466563, "grad_norm": 1.1808179637924803, "learning_rate": 9.762774825972284e-06, "loss": 0.1797, "step": 633 }, { "epoch": 0.09860031104199067, "grad_norm": 0.9260180174403776, "learning_rate": 9.762030714816255e-06, "loss": 0.1692, "step": 634 }, { "epoch": 0.09875583203732503, "grad_norm": 0.9809663827224766, "learning_rate": 9.761285466897086e-06, "loss": 0.1971, "step": 635 }, { "epoch": 0.09891135303265941, "grad_norm": 1.1818951833176021, "learning_rate": 9.760539082392678e-06, "loss": 0.3061, "step": 636 }, { "epoch": 0.09906687402799377, "grad_norm": 1.5126562950843534, "learning_rate": 9.759791561481201e-06, "loss": 0.2214, "step": 637 }, { "epoch": 0.09922239502332815, "grad_norm": 1.1563368410762391, "learning_rate": 9.759042904341103e-06, "loss": 0.1879, "step": 638 }, { "epoch": 0.09937791601866251, "grad_norm": 1.7465834025848672, "learning_rate": 9.758293111151094e-06, "loss": 0.2936, "step": 639 }, { "epoch": 0.09953343701399689, "grad_norm": 1.4420901394687415, "learning_rate": 9.757542182090165e-06, "loss": 0.1977, "step": 640 }, { "epoch": 0.09968895800933127, "grad_norm": 1.4320029014579423, "learning_rate": 9.756790117337569e-06, "loss": 0.235, "step": 641 }, { "epoch": 0.09984447900466563, "grad_norm": 1.0178157213981396, "learning_rate": 9.756036917072837e-06, "loss": 0.228, "step": 642 }, { "epoch": 0.1, "grad_norm": 1.634337451034447, "learning_rate": 9.755282581475769e-06, "loss": 0.174, "step": 643 }, { "epoch": 0.10015552099533437, "grad_norm": 1.3123622467109133, "learning_rate": 9.754527110726432e-06, "loss": 0.1854, "step": 644 }, { "epoch": 0.10031104199066875, "grad_norm": 1.3700959071130703, "learning_rate": 9.753770505005171e-06, "loss": 0.271, "step": 645 }, { "epoch": 0.10046656298600311, "grad_norm": 1.5589446061903662, "learning_rate": 9.753012764492596e-06, "loss": 0.1669, "step": 646 }, { "epoch": 0.10062208398133748, "grad_norm": 1.3813884723817376, "learning_rate": 9.752253889369592e-06, "loss": 0.1525, "step": 647 }, { "epoch": 0.10077760497667185, "grad_norm": 1.3858844961504873, "learning_rate": 9.75149387981731e-06, "loss": 0.2673, "step": 648 }, { "epoch": 0.10093312597200622, "grad_norm": 0.9436000404569762, "learning_rate": 9.75073273601718e-06, "loss": 0.2058, "step": 649 }, { "epoch": 0.10108864696734059, "grad_norm": 1.4599521330072638, "learning_rate": 9.749970458150893e-06, "loss": 0.2145, "step": 650 }, { "epoch": 0.10124416796267496, "grad_norm": 1.3455835009343615, "learning_rate": 9.749207046400415e-06, "loss": 0.2353, "step": 651 }, { "epoch": 0.10139968895800933, "grad_norm": 1.6299219848605395, "learning_rate": 9.748442500947988e-06, "loss": 0.2582, "step": 652 }, { "epoch": 0.1015552099533437, "grad_norm": 2.1538893724554966, "learning_rate": 9.747676821976116e-06, "loss": 0.2128, "step": 653 }, { "epoch": 0.10171073094867807, "grad_norm": 1.1642628978054306, "learning_rate": 9.746910009667577e-06, "loss": 0.2092, "step": 654 }, { "epoch": 0.10186625194401244, "grad_norm": 0.9776673463806724, "learning_rate": 9.746142064205422e-06, "loss": 0.176, "step": 655 }, { "epoch": 0.1020217729393468, "grad_norm": 1.350687490540933, "learning_rate": 9.745372985772968e-06, "loss": 0.2426, "step": 656 }, { "epoch": 0.10217729393468118, "grad_norm": 1.7681295289484116, "learning_rate": 9.744602774553807e-06, "loss": 0.2204, "step": 657 }, { "epoch": 0.10233281493001556, "grad_norm": 0.9199423619051535, "learning_rate": 9.743831430731796e-06, "loss": 0.1647, "step": 658 }, { "epoch": 0.10248833592534992, "grad_norm": 5.138426947168042, "learning_rate": 9.743058954491067e-06, "loss": 0.2107, "step": 659 }, { "epoch": 0.1026438569206843, "grad_norm": 1.446510693113484, "learning_rate": 9.742285346016024e-06, "loss": 0.2379, "step": 660 }, { "epoch": 0.10279937791601866, "grad_norm": 1.4833539837619547, "learning_rate": 9.741510605491335e-06, "loss": 0.1714, "step": 661 }, { "epoch": 0.10295489891135304, "grad_norm": 1.3228899574182327, "learning_rate": 9.74073473310194e-06, "loss": 0.2388, "step": 662 }, { "epoch": 0.1031104199066874, "grad_norm": 1.0712502633957945, "learning_rate": 9.739957729033054e-06, "loss": 0.2289, "step": 663 }, { "epoch": 0.10326594090202178, "grad_norm": 1.1587775220461487, "learning_rate": 9.739179593470156e-06, "loss": 0.1741, "step": 664 }, { "epoch": 0.10342146189735614, "grad_norm": 1.0260279383302884, "learning_rate": 9.738400326599e-06, "loss": 0.2412, "step": 665 }, { "epoch": 0.10357698289269052, "grad_norm": 1.491042707966078, "learning_rate": 9.737619928605605e-06, "loss": 0.1833, "step": 666 }, { "epoch": 0.10373250388802488, "grad_norm": 1.6710832262506907, "learning_rate": 9.736838399676266e-06, "loss": 0.1712, "step": 667 }, { "epoch": 0.10388802488335926, "grad_norm": 1.4001413138925893, "learning_rate": 9.736055739997543e-06, "loss": 0.2739, "step": 668 }, { "epoch": 0.10404354587869362, "grad_norm": 1.0413982567358797, "learning_rate": 9.735271949756269e-06, "loss": 0.1655, "step": 669 }, { "epoch": 0.104199066874028, "grad_norm": 2.062452927969995, "learning_rate": 9.734487029139544e-06, "loss": 0.2384, "step": 670 }, { "epoch": 0.10435458786936236, "grad_norm": 1.1419346714711909, "learning_rate": 9.733700978334741e-06, "loss": 0.2176, "step": 671 }, { "epoch": 0.10451010886469674, "grad_norm": 1.4704145498498906, "learning_rate": 9.7329137975295e-06, "loss": 0.2281, "step": 672 }, { "epoch": 0.1046656298600311, "grad_norm": 1.7257595787120843, "learning_rate": 9.732125486911733e-06, "loss": 0.1964, "step": 673 }, { "epoch": 0.10482115085536547, "grad_norm": 1.596182048450316, "learning_rate": 9.731336046669621e-06, "loss": 0.1863, "step": 674 }, { "epoch": 0.10497667185069985, "grad_norm": 1.741565962255971, "learning_rate": 9.730545476991613e-06, "loss": 0.1358, "step": 675 }, { "epoch": 0.10513219284603421, "grad_norm": 1.2105023861624677, "learning_rate": 9.729753778066431e-06, "loss": 0.2757, "step": 676 }, { "epoch": 0.10528771384136859, "grad_norm": 1.1483441998296096, "learning_rate": 9.728960950083062e-06, "loss": 0.2327, "step": 677 }, { "epoch": 0.10544323483670295, "grad_norm": 2.6827889453865255, "learning_rate": 9.728166993230768e-06, "loss": 0.2841, "step": 678 }, { "epoch": 0.10559875583203733, "grad_norm": 1.3531013447523792, "learning_rate": 9.727371907699075e-06, "loss": 0.2742, "step": 679 }, { "epoch": 0.1057542768273717, "grad_norm": 1.4165422039945663, "learning_rate": 9.726575693677782e-06, "loss": 0.1733, "step": 680 }, { "epoch": 0.10590979782270607, "grad_norm": 1.1633994693280907, "learning_rate": 9.725778351356958e-06, "loss": 0.1752, "step": 681 }, { "epoch": 0.10606531881804043, "grad_norm": 1.4801298044861129, "learning_rate": 9.724979880926937e-06, "loss": 0.1654, "step": 682 }, { "epoch": 0.10622083981337481, "grad_norm": 1.038476254792903, "learning_rate": 9.724180282578327e-06, "loss": 0.1796, "step": 683 }, { "epoch": 0.10637636080870917, "grad_norm": 1.1715546692057253, "learning_rate": 9.723379556502002e-06, "loss": 0.2615, "step": 684 }, { "epoch": 0.10653188180404355, "grad_norm": 0.9669903775949065, "learning_rate": 9.722577702889106e-06, "loss": 0.2217, "step": 685 }, { "epoch": 0.10668740279937791, "grad_norm": 0.9554324370526551, "learning_rate": 9.721774721931056e-06, "loss": 0.2067, "step": 686 }, { "epoch": 0.10684292379471229, "grad_norm": 1.5055382554521828, "learning_rate": 9.720970613819532e-06, "loss": 0.2886, "step": 687 }, { "epoch": 0.10699844479004665, "grad_norm": 1.4701983051316598, "learning_rate": 9.720165378746486e-06, "loss": 0.2461, "step": 688 }, { "epoch": 0.10715396578538103, "grad_norm": 0.8955915121278603, "learning_rate": 9.719359016904137e-06, "loss": 0.1296, "step": 689 }, { "epoch": 0.10730948678071539, "grad_norm": 1.1365940197104127, "learning_rate": 9.718551528484979e-06, "loss": 0.1756, "step": 690 }, { "epoch": 0.10746500777604977, "grad_norm": 1.1309854500820393, "learning_rate": 9.717742913681769e-06, "loss": 0.1685, "step": 691 }, { "epoch": 0.10762052877138413, "grad_norm": 1.228647590848163, "learning_rate": 9.716933172687533e-06, "loss": 0.1988, "step": 692 }, { "epoch": 0.1077760497667185, "grad_norm": 1.8437087557242553, "learning_rate": 9.71612230569557e-06, "loss": 0.2259, "step": 693 }, { "epoch": 0.10793157076205288, "grad_norm": 2.190128145243616, "learning_rate": 9.715310312899445e-06, "loss": 0.1593, "step": 694 }, { "epoch": 0.10808709175738725, "grad_norm": 1.9542747095305757, "learning_rate": 9.714497194492988e-06, "loss": 0.1942, "step": 695 }, { "epoch": 0.10824261275272162, "grad_norm": 1.190017072453523, "learning_rate": 9.713682950670305e-06, "loss": 0.184, "step": 696 }, { "epoch": 0.10839813374805599, "grad_norm": 1.3702585397170965, "learning_rate": 9.712867581625769e-06, "loss": 0.2747, "step": 697 }, { "epoch": 0.10855365474339036, "grad_norm": 1.1224607857205071, "learning_rate": 9.712051087554017e-06, "loss": 0.1851, "step": 698 }, { "epoch": 0.10870917573872473, "grad_norm": 1.1610995749820388, "learning_rate": 9.711233468649958e-06, "loss": 0.1651, "step": 699 }, { "epoch": 0.1088646967340591, "grad_norm": 1.0713548580433974, "learning_rate": 9.710414725108771e-06, "loss": 0.2798, "step": 700 }, { "epoch": 0.1088646967340591, "eval_loss": 0.2192843109369278, "eval_runtime": 9.4454, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 700 }, { "epoch": 0.10902021772939346, "grad_norm": 1.086974338576193, "learning_rate": 9.709594857125898e-06, "loss": 0.3235, "step": 701 }, { "epoch": 0.10917573872472784, "grad_norm": 3.455927159294357, "learning_rate": 9.708773864897059e-06, "loss": 0.1502, "step": 702 }, { "epoch": 0.1093312597200622, "grad_norm": 1.6070730415734276, "learning_rate": 9.707951748618229e-06, "loss": 0.2652, "step": 703 }, { "epoch": 0.10948678071539658, "grad_norm": 1.0297377958380671, "learning_rate": 9.707128508485663e-06, "loss": 0.2352, "step": 704 }, { "epoch": 0.10964230171073094, "grad_norm": 1.07292209906991, "learning_rate": 9.706304144695877e-06, "loss": 0.1471, "step": 705 }, { "epoch": 0.10979782270606532, "grad_norm": 1.2095547877752455, "learning_rate": 9.705478657445661e-06, "loss": 0.2107, "step": 706 }, { "epoch": 0.10995334370139968, "grad_norm": 1.307669146215221, "learning_rate": 9.70465204693207e-06, "loss": 0.2337, "step": 707 }, { "epoch": 0.11010886469673406, "grad_norm": 0.8004125116368356, "learning_rate": 9.703824313352428e-06, "loss": 0.2042, "step": 708 }, { "epoch": 0.11026438569206842, "grad_norm": 1.5202724190274493, "learning_rate": 9.702995456904323e-06, "loss": 0.2446, "step": 709 }, { "epoch": 0.1104199066874028, "grad_norm": 1.3109419274601464, "learning_rate": 9.702165477785618e-06, "loss": 0.2791, "step": 710 }, { "epoch": 0.11057542768273718, "grad_norm": 1.2175779348655416, "learning_rate": 9.70133437619444e-06, "loss": 0.2787, "step": 711 }, { "epoch": 0.11073094867807154, "grad_norm": 2.4619987863193824, "learning_rate": 9.700502152329182e-06, "loss": 0.2184, "step": 712 }, { "epoch": 0.11088646967340592, "grad_norm": 1.1204962171981678, "learning_rate": 9.69966880638851e-06, "loss": 0.1796, "step": 713 }, { "epoch": 0.11104199066874028, "grad_norm": 1.3460375672771012, "learning_rate": 9.698834338571355e-06, "loss": 0.1536, "step": 714 }, { "epoch": 0.11119751166407466, "grad_norm": 1.4551247859245915, "learning_rate": 9.697998749076916e-06, "loss": 0.1775, "step": 715 }, { "epoch": 0.11135303265940902, "grad_norm": 1.64865769787968, "learning_rate": 9.69716203810466e-06, "loss": 0.2341, "step": 716 }, { "epoch": 0.1115085536547434, "grad_norm": 1.8250018792840808, "learning_rate": 9.696324205854322e-06, "loss": 0.2058, "step": 717 }, { "epoch": 0.11166407465007776, "grad_norm": 1.067050937242904, "learning_rate": 9.695485252525902e-06, "loss": 0.1463, "step": 718 }, { "epoch": 0.11181959564541213, "grad_norm": 2.2821274396758127, "learning_rate": 9.694645178319673e-06, "loss": 0.2508, "step": 719 }, { "epoch": 0.1119751166407465, "grad_norm": 1.388014808020173, "learning_rate": 9.69380398343617e-06, "loss": 0.1977, "step": 720 }, { "epoch": 0.11213063763608087, "grad_norm": 1.5658859493501038, "learning_rate": 9.692961668076197e-06, "loss": 0.2291, "step": 721 }, { "epoch": 0.11228615863141524, "grad_norm": 1.0853791710998715, "learning_rate": 9.69211823244083e-06, "loss": 0.2763, "step": 722 }, { "epoch": 0.11244167962674961, "grad_norm": 1.27256020581809, "learning_rate": 9.691273676731408e-06, "loss": 0.195, "step": 723 }, { "epoch": 0.11259720062208398, "grad_norm": 0.6768405188507002, "learning_rate": 9.690428001149537e-06, "loss": 0.1839, "step": 724 }, { "epoch": 0.11275272161741835, "grad_norm": 3.309861478677342, "learning_rate": 9.68958120589709e-06, "loss": 0.1446, "step": 725 }, { "epoch": 0.11290824261275272, "grad_norm": 1.3577561463931358, "learning_rate": 9.688733291176211e-06, "loss": 0.174, "step": 726 }, { "epoch": 0.11306376360808709, "grad_norm": 0.7899130738957459, "learning_rate": 9.68788425718931e-06, "loss": 0.1819, "step": 727 }, { "epoch": 0.11321928460342146, "grad_norm": 1.9374468863177388, "learning_rate": 9.68703410413906e-06, "loss": 0.2148, "step": 728 }, { "epoch": 0.11337480559875583, "grad_norm": 0.9790173123360771, "learning_rate": 9.686182832228408e-06, "loss": 0.1842, "step": 729 }, { "epoch": 0.11353032659409021, "grad_norm": 1.8838507925348544, "learning_rate": 9.685330441660564e-06, "loss": 0.2482, "step": 730 }, { "epoch": 0.11368584758942457, "grad_norm": 1.7209011423931209, "learning_rate": 9.684476932639002e-06, "loss": 0.1938, "step": 731 }, { "epoch": 0.11384136858475895, "grad_norm": 1.3133247484457822, "learning_rate": 9.68362230536747e-06, "loss": 0.1629, "step": 732 }, { "epoch": 0.11399688958009331, "grad_norm": 1.4346328630835792, "learning_rate": 9.682766560049979e-06, "loss": 0.2393, "step": 733 }, { "epoch": 0.11415241057542769, "grad_norm": 1.416880965769396, "learning_rate": 9.681909696890805e-06, "loss": 0.2149, "step": 734 }, { "epoch": 0.11430793157076205, "grad_norm": 1.3604331981225013, "learning_rate": 9.681051716094497e-06, "loss": 0.2116, "step": 735 }, { "epoch": 0.11446345256609643, "grad_norm": 1.370682231566524, "learning_rate": 9.680192617865862e-06, "loss": 0.1574, "step": 736 }, { "epoch": 0.11461897356143079, "grad_norm": 3.11697026931608, "learning_rate": 9.679332402409983e-06, "loss": 0.1659, "step": 737 }, { "epoch": 0.11477449455676517, "grad_norm": 1.0795485204091093, "learning_rate": 9.678471069932205e-06, "loss": 0.1843, "step": 738 }, { "epoch": 0.11493001555209953, "grad_norm": 1.089003737321956, "learning_rate": 9.677608620638138e-06, "loss": 0.1289, "step": 739 }, { "epoch": 0.1150855365474339, "grad_norm": 1.9816825572482675, "learning_rate": 9.676745054733661e-06, "loss": 0.183, "step": 740 }, { "epoch": 0.11524105754276827, "grad_norm": 4.608323882578619, "learning_rate": 9.675880372424922e-06, "loss": 0.1797, "step": 741 }, { "epoch": 0.11539657853810265, "grad_norm": 0.9751878331403108, "learning_rate": 9.675014573918328e-06, "loss": 0.2649, "step": 742 }, { "epoch": 0.11555209953343701, "grad_norm": 0.913137520804308, "learning_rate": 9.67414765942056e-06, "loss": 0.1229, "step": 743 }, { "epoch": 0.11570762052877138, "grad_norm": 1.1182409613228717, "learning_rate": 9.673279629138565e-06, "loss": 0.1554, "step": 744 }, { "epoch": 0.11586314152410575, "grad_norm": 2.425925853364065, "learning_rate": 9.67241048327955e-06, "loss": 0.2414, "step": 745 }, { "epoch": 0.11601866251944012, "grad_norm": 2.1643434151507024, "learning_rate": 9.671540222050995e-06, "loss": 0.2402, "step": 746 }, { "epoch": 0.1161741835147745, "grad_norm": 1.1869224601016288, "learning_rate": 9.67066884566064e-06, "loss": 0.225, "step": 747 }, { "epoch": 0.11632970451010886, "grad_norm": 1.1850496858694712, "learning_rate": 9.669796354316497e-06, "loss": 0.1732, "step": 748 }, { "epoch": 0.11648522550544324, "grad_norm": 1.083880428656249, "learning_rate": 9.668922748226842e-06, "loss": 0.2256, "step": 749 }, { "epoch": 0.1166407465007776, "grad_norm": 0.9290306352610638, "learning_rate": 9.668048027600217e-06, "loss": 0.1814, "step": 750 }, { "epoch": 0.11679626749611198, "grad_norm": 1.1985316233321583, "learning_rate": 9.66717219264543e-06, "loss": 0.2646, "step": 751 }, { "epoch": 0.11695178849144634, "grad_norm": 1.5752976014862634, "learning_rate": 9.666295243571553e-06, "loss": 0.2212, "step": 752 }, { "epoch": 0.11710730948678072, "grad_norm": 1.554593030529623, "learning_rate": 9.665417180587928e-06, "loss": 0.2008, "step": 753 }, { "epoch": 0.11726283048211508, "grad_norm": 1.802147426905897, "learning_rate": 9.664538003904162e-06, "loss": 0.1694, "step": 754 }, { "epoch": 0.11741835147744946, "grad_norm": 1.117253074112765, "learning_rate": 9.663657713730123e-06, "loss": 0.1769, "step": 755 }, { "epoch": 0.11757387247278382, "grad_norm": 1.2713208371120763, "learning_rate": 9.662776310275954e-06, "loss": 0.3356, "step": 756 }, { "epoch": 0.1177293934681182, "grad_norm": 1.5049877808240208, "learning_rate": 9.661893793752053e-06, "loss": 0.2156, "step": 757 }, { "epoch": 0.11788491446345256, "grad_norm": 1.3646831264890733, "learning_rate": 9.661010164369092e-06, "loss": 0.2077, "step": 758 }, { "epoch": 0.11804043545878694, "grad_norm": 1.2057674637964264, "learning_rate": 9.660125422338003e-06, "loss": 0.234, "step": 759 }, { "epoch": 0.1181959564541213, "grad_norm": 1.7059599899477969, "learning_rate": 9.659239567869989e-06, "loss": 0.2019, "step": 760 }, { "epoch": 0.11835147744945568, "grad_norm": 1.359054263386884, "learning_rate": 9.658352601176514e-06, "loss": 0.2263, "step": 761 }, { "epoch": 0.11850699844479004, "grad_norm": 1.4779502971821263, "learning_rate": 9.65746452246931e-06, "loss": 0.229, "step": 762 }, { "epoch": 0.11866251944012442, "grad_norm": 1.2106031530437371, "learning_rate": 9.656575331960376e-06, "loss": 0.2075, "step": 763 }, { "epoch": 0.1188180404354588, "grad_norm": 1.5750869920441555, "learning_rate": 9.655685029861969e-06, "loss": 0.2103, "step": 764 }, { "epoch": 0.11897356143079316, "grad_norm": 1.328300416339256, "learning_rate": 9.654793616386621e-06, "loss": 0.1822, "step": 765 }, { "epoch": 0.11912908242612753, "grad_norm": 2.218866258760128, "learning_rate": 9.653901091747124e-06, "loss": 0.1909, "step": 766 }, { "epoch": 0.1192846034214619, "grad_norm": 1.8622051312400103, "learning_rate": 9.653007456156536e-06, "loss": 0.2241, "step": 767 }, { "epoch": 0.11944012441679627, "grad_norm": 1.3832228672336278, "learning_rate": 9.652112709828179e-06, "loss": 0.2256, "step": 768 }, { "epoch": 0.11959564541213064, "grad_norm": 1.0673171707909481, "learning_rate": 9.651216852975643e-06, "loss": 0.1959, "step": 769 }, { "epoch": 0.11975116640746501, "grad_norm": 1.3393619429463375, "learning_rate": 9.650319885812777e-06, "loss": 0.2727, "step": 770 }, { "epoch": 0.11990668740279938, "grad_norm": 1.0882111784771522, "learning_rate": 9.649421808553708e-06, "loss": 0.2259, "step": 771 }, { "epoch": 0.12006220839813375, "grad_norm": 4.447919742603164, "learning_rate": 9.648522621412812e-06, "loss": 0.231, "step": 772 }, { "epoch": 0.12021772939346811, "grad_norm": 1.5176403638597071, "learning_rate": 9.647622324604742e-06, "loss": 0.2824, "step": 773 }, { "epoch": 0.12037325038880249, "grad_norm": 1.7576074795768224, "learning_rate": 9.646720918344409e-06, "loss": 0.2034, "step": 774 }, { "epoch": 0.12052877138413685, "grad_norm": 1.5792838723378395, "learning_rate": 9.645818402846992e-06, "loss": 0.1677, "step": 775 }, { "epoch": 0.12068429237947123, "grad_norm": 1.0405000433648128, "learning_rate": 9.644914778327935e-06, "loss": 0.1742, "step": 776 }, { "epoch": 0.1208398133748056, "grad_norm": 1.545200668981177, "learning_rate": 9.644010045002942e-06, "loss": 0.215, "step": 777 }, { "epoch": 0.12099533437013997, "grad_norm": 1.203039484308954, "learning_rate": 9.64310420308799e-06, "loss": 0.1997, "step": 778 }, { "epoch": 0.12115085536547433, "grad_norm": 1.038062251460105, "learning_rate": 9.642197252799315e-06, "loss": 0.2001, "step": 779 }, { "epoch": 0.12130637636080871, "grad_norm": 1.3963430783849184, "learning_rate": 9.641289194353418e-06, "loss": 0.2034, "step": 780 }, { "epoch": 0.12146189735614307, "grad_norm": 1.7069918759015217, "learning_rate": 9.640380027967065e-06, "loss": 0.1763, "step": 781 }, { "epoch": 0.12161741835147745, "grad_norm": 1.1485309219449071, "learning_rate": 9.639469753857287e-06, "loss": 0.1946, "step": 782 }, { "epoch": 0.12177293934681183, "grad_norm": 0.9976269624811838, "learning_rate": 9.63855837224138e-06, "loss": 0.1797, "step": 783 }, { "epoch": 0.12192846034214619, "grad_norm": 1.413148682632424, "learning_rate": 9.6376458833369e-06, "loss": 0.1873, "step": 784 }, { "epoch": 0.12208398133748057, "grad_norm": 1.287068701523726, "learning_rate": 9.636732287361675e-06, "loss": 0.1964, "step": 785 }, { "epoch": 0.12223950233281493, "grad_norm": 1.338092957612231, "learning_rate": 9.635817584533791e-06, "loss": 0.2353, "step": 786 }, { "epoch": 0.1223950233281493, "grad_norm": 1.018985176065171, "learning_rate": 9.6349017750716e-06, "loss": 0.243, "step": 787 }, { "epoch": 0.12255054432348367, "grad_norm": 1.434405666961768, "learning_rate": 9.633984859193722e-06, "loss": 0.1622, "step": 788 }, { "epoch": 0.12270606531881804, "grad_norm": 1.2392900109261706, "learning_rate": 9.633066837119034e-06, "loss": 0.2223, "step": 789 }, { "epoch": 0.12286158631415241, "grad_norm": 0.9045673894396051, "learning_rate": 9.632147709066682e-06, "loss": 0.2079, "step": 790 }, { "epoch": 0.12301710730948678, "grad_norm": 1.14443309047443, "learning_rate": 9.631227475256072e-06, "loss": 0.1611, "step": 791 }, { "epoch": 0.12317262830482115, "grad_norm": 1.1564291271253233, "learning_rate": 9.630306135906882e-06, "loss": 0.1918, "step": 792 }, { "epoch": 0.12332814930015552, "grad_norm": 2.1831582412646138, "learning_rate": 9.629383691239043e-06, "loss": 0.3687, "step": 793 }, { "epoch": 0.12348367029548989, "grad_norm": 1.0115623861000755, "learning_rate": 9.628460141472759e-06, "loss": 0.1589, "step": 794 }, { "epoch": 0.12363919129082426, "grad_norm": 0.8936049036056027, "learning_rate": 9.627535486828491e-06, "loss": 0.1775, "step": 795 }, { "epoch": 0.12379471228615863, "grad_norm": 1.3757750926899586, "learning_rate": 9.626609727526973e-06, "loss": 0.2, "step": 796 }, { "epoch": 0.123950233281493, "grad_norm": 1.3462049704057701, "learning_rate": 9.62568286378919e-06, "loss": 0.2079, "step": 797 }, { "epoch": 0.12410575427682737, "grad_norm": 2.793319589376331, "learning_rate": 9.624754895836401e-06, "loss": 0.2297, "step": 798 }, { "epoch": 0.12426127527216174, "grad_norm": 2.1016347336310357, "learning_rate": 9.623825823890123e-06, "loss": 0.3106, "step": 799 }, { "epoch": 0.12441679626749612, "grad_norm": 1.003756031018623, "learning_rate": 9.622895648172141e-06, "loss": 0.2143, "step": 800 }, { "epoch": 0.12441679626749612, "eval_loss": 0.2170763909816742, "eval_runtime": 9.4305, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 800 }, { "epoch": 0.12457231726283048, "grad_norm": 0.897563337381756, "learning_rate": 9.621964368904497e-06, "loss": 0.1512, "step": 801 }, { "epoch": 0.12472783825816486, "grad_norm": 1.4190659163727315, "learning_rate": 9.621031986309504e-06, "loss": 0.1372, "step": 802 }, { "epoch": 0.12488335925349922, "grad_norm": 1.4031206175030444, "learning_rate": 9.620098500609734e-06, "loss": 0.1871, "step": 803 }, { "epoch": 0.12503888024883358, "grad_norm": 1.387547575925909, "learning_rate": 9.61916391202802e-06, "loss": 0.2899, "step": 804 }, { "epoch": 0.12519440124416797, "grad_norm": 1.3476031364192975, "learning_rate": 9.618228220787466e-06, "loss": 0.1693, "step": 805 }, { "epoch": 0.12534992223950234, "grad_norm": 2.5401419561208787, "learning_rate": 9.617291427111431e-06, "loss": 0.141, "step": 806 }, { "epoch": 0.1255054432348367, "grad_norm": 1.918003643731122, "learning_rate": 9.616353531223543e-06, "loss": 0.2531, "step": 807 }, { "epoch": 0.12566096423017106, "grad_norm": 0.8824574964250353, "learning_rate": 9.61541453334769e-06, "loss": 0.2257, "step": 808 }, { "epoch": 0.12581648522550545, "grad_norm": 1.2069677012195894, "learning_rate": 9.614474433708021e-06, "loss": 0.2012, "step": 809 }, { "epoch": 0.12597200622083982, "grad_norm": 0.8806254573901449, "learning_rate": 9.613533232528956e-06, "loss": 0.2312, "step": 810 }, { "epoch": 0.12612752721617418, "grad_norm": 0.9758926813848963, "learning_rate": 9.61259093003517e-06, "loss": 0.1623, "step": 811 }, { "epoch": 0.12628304821150854, "grad_norm": 1.601541464183247, "learning_rate": 9.611647526451603e-06, "loss": 0.2448, "step": 812 }, { "epoch": 0.12643856920684293, "grad_norm": 0.987236561765066, "learning_rate": 9.610703022003462e-06, "loss": 0.1833, "step": 813 }, { "epoch": 0.1265940902021773, "grad_norm": 1.1685078861500846, "learning_rate": 9.60975741691621e-06, "loss": 0.2708, "step": 814 }, { "epoch": 0.12674961119751166, "grad_norm": 1.2818789908746795, "learning_rate": 9.608810711415577e-06, "loss": 0.2132, "step": 815 }, { "epoch": 0.12690513219284602, "grad_norm": 1.7355503765107922, "learning_rate": 9.607862905727556e-06, "loss": 0.2316, "step": 816 }, { "epoch": 0.1270606531881804, "grad_norm": 2.4291900998321614, "learning_rate": 9.6069140000784e-06, "loss": 0.2607, "step": 817 }, { "epoch": 0.12721617418351477, "grad_norm": 1.2126882446943306, "learning_rate": 9.605963994694625e-06, "loss": 0.2374, "step": 818 }, { "epoch": 0.12737169517884914, "grad_norm": 1.402793253608196, "learning_rate": 9.605012889803013e-06, "loss": 0.1854, "step": 819 }, { "epoch": 0.12752721617418353, "grad_norm": 1.1350096409875572, "learning_rate": 9.604060685630608e-06, "loss": 0.2353, "step": 820 }, { "epoch": 0.1276827371695179, "grad_norm": 0.8605397955086846, "learning_rate": 9.603107382404708e-06, "loss": 0.1725, "step": 821 }, { "epoch": 0.12783825816485225, "grad_norm": 1.8193213761501528, "learning_rate": 9.602152980352884e-06, "loss": 0.191, "step": 822 }, { "epoch": 0.12799377916018662, "grad_norm": 1.0560479092155457, "learning_rate": 9.601197479702963e-06, "loss": 0.2129, "step": 823 }, { "epoch": 0.128149300155521, "grad_norm": 0.9886146739779551, "learning_rate": 9.60024088068304e-06, "loss": 0.1349, "step": 824 }, { "epoch": 0.12830482115085537, "grad_norm": 1.044208330213169, "learning_rate": 9.599283183521467e-06, "loss": 0.1611, "step": 825 }, { "epoch": 0.12846034214618973, "grad_norm": 1.105951942629371, "learning_rate": 9.598324388446856e-06, "loss": 0.25, "step": 826 }, { "epoch": 0.1286158631415241, "grad_norm": 1.2794645483672162, "learning_rate": 9.59736449568809e-06, "loss": 0.2132, "step": 827 }, { "epoch": 0.12877138413685849, "grad_norm": 1.3758053785309152, "learning_rate": 9.596403505474304e-06, "loss": 0.2149, "step": 828 }, { "epoch": 0.12892690513219285, "grad_norm": 3.355818230170184, "learning_rate": 9.595441418034903e-06, "loss": 0.3682, "step": 829 }, { "epoch": 0.1290824261275272, "grad_norm": 1.837073128336488, "learning_rate": 9.594478233599551e-06, "loss": 0.2032, "step": 830 }, { "epoch": 0.12923794712286157, "grad_norm": 1.5066969144898332, "learning_rate": 9.593513952398172e-06, "loss": 0.2378, "step": 831 }, { "epoch": 0.12939346811819596, "grad_norm": 2.2384679831338614, "learning_rate": 9.592548574660954e-06, "loss": 0.3073, "step": 832 }, { "epoch": 0.12954898911353033, "grad_norm": 0.9921790422628257, "learning_rate": 9.591582100618345e-06, "loss": 0.1937, "step": 833 }, { "epoch": 0.1297045101088647, "grad_norm": 1.198440591432804, "learning_rate": 9.590614530501057e-06, "loss": 0.1925, "step": 834 }, { "epoch": 0.12986003110419908, "grad_norm": 1.3748463927035848, "learning_rate": 9.589645864540061e-06, "loss": 0.1941, "step": 835 }, { "epoch": 0.13001555209953344, "grad_norm": 1.3610943196332044, "learning_rate": 9.588676102966593e-06, "loss": 0.166, "step": 836 }, { "epoch": 0.1301710730948678, "grad_norm": 0.8955532583487235, "learning_rate": 9.58770524601215e-06, "loss": 0.1495, "step": 837 }, { "epoch": 0.13032659409020217, "grad_norm": 1.285038495994977, "learning_rate": 9.586733293908486e-06, "loss": 0.2182, "step": 838 }, { "epoch": 0.13048211508553656, "grad_norm": 1.3128144306673817, "learning_rate": 9.585760246887618e-06, "loss": 0.2371, "step": 839 }, { "epoch": 0.13063763608087092, "grad_norm": 0.9827038114137296, "learning_rate": 9.584786105181831e-06, "loss": 0.2151, "step": 840 }, { "epoch": 0.13079315707620529, "grad_norm": 1.0846767572687748, "learning_rate": 9.583810869023663e-06, "loss": 0.2757, "step": 841 }, { "epoch": 0.13094867807153965, "grad_norm": 2.4064875629004265, "learning_rate": 9.582834538645917e-06, "loss": 0.2357, "step": 842 }, { "epoch": 0.13110419906687404, "grad_norm": 1.2894114641673238, "learning_rate": 9.581857114281656e-06, "loss": 0.1877, "step": 843 }, { "epoch": 0.1312597200622084, "grad_norm": 1.5574730662344252, "learning_rate": 9.580878596164207e-06, "loss": 0.1623, "step": 844 }, { "epoch": 0.13141524105754276, "grad_norm": 1.2710340775794473, "learning_rate": 9.579898984527154e-06, "loss": 0.187, "step": 845 }, { "epoch": 0.13157076205287713, "grad_norm": 1.4508001676942102, "learning_rate": 9.578918279604346e-06, "loss": 0.1372, "step": 846 }, { "epoch": 0.13172628304821152, "grad_norm": 1.0012618056091263, "learning_rate": 9.577936481629887e-06, "loss": 0.2201, "step": 847 }, { "epoch": 0.13188180404354588, "grad_norm": 0.8624233281967797, "learning_rate": 9.576953590838149e-06, "loss": 0.1979, "step": 848 }, { "epoch": 0.13203732503888024, "grad_norm": 1.083965076436999, "learning_rate": 9.57596960746376e-06, "loss": 0.2404, "step": 849 }, { "epoch": 0.1321928460342146, "grad_norm": 2.717551231092263, "learning_rate": 9.574984531741613e-06, "loss": 0.2745, "step": 850 }, { "epoch": 0.132348367029549, "grad_norm": 1.1408480485083061, "learning_rate": 9.573998363906858e-06, "loss": 0.207, "step": 851 }, { "epoch": 0.13250388802488336, "grad_norm": 1.6013848917828304, "learning_rate": 9.573011104194907e-06, "loss": 0.1826, "step": 852 }, { "epoch": 0.13265940902021772, "grad_norm": 1.1178529036140945, "learning_rate": 9.572022752841433e-06, "loss": 0.1676, "step": 853 }, { "epoch": 0.1328149300155521, "grad_norm": 1.4964605327939924, "learning_rate": 9.571033310082367e-06, "loss": 0.1929, "step": 854 }, { "epoch": 0.13297045101088648, "grad_norm": 1.1404147062516024, "learning_rate": 9.570042776153904e-06, "loss": 0.2274, "step": 855 }, { "epoch": 0.13312597200622084, "grad_norm": 1.037410347500119, "learning_rate": 9.5690511512925e-06, "loss": 0.1577, "step": 856 }, { "epoch": 0.1332814930015552, "grad_norm": 0.8366673014473697, "learning_rate": 9.56805843573487e-06, "loss": 0.1689, "step": 857 }, { "epoch": 0.1334370139968896, "grad_norm": 1.1452085152848681, "learning_rate": 9.567064629717986e-06, "loss": 0.1882, "step": 858 }, { "epoch": 0.13359253499222395, "grad_norm": 1.574854487100182, "learning_rate": 9.566069733479087e-06, "loss": 0.31, "step": 859 }, { "epoch": 0.13374805598755832, "grad_norm": 1.593208427145828, "learning_rate": 9.565073747255665e-06, "loss": 0.2198, "step": 860 }, { "epoch": 0.13390357698289268, "grad_norm": 1.1177720055491567, "learning_rate": 9.564076671285477e-06, "loss": 0.2164, "step": 861 }, { "epoch": 0.13405909797822707, "grad_norm": 0.8165999821951461, "learning_rate": 9.56307850580654e-06, "loss": 0.1506, "step": 862 }, { "epoch": 0.13421461897356143, "grad_norm": 1.6750367279986849, "learning_rate": 9.562079251057129e-06, "loss": 0.1732, "step": 863 }, { "epoch": 0.1343701399688958, "grad_norm": 0.8044448243559967, "learning_rate": 9.561078907275781e-06, "loss": 0.1922, "step": 864 }, { "epoch": 0.13452566096423016, "grad_norm": 1.271960150991974, "learning_rate": 9.56007747470129e-06, "loss": 0.2229, "step": 865 }, { "epoch": 0.13468118195956455, "grad_norm": 1.0004490456147865, "learning_rate": 9.559074953572713e-06, "loss": 0.171, "step": 866 }, { "epoch": 0.1348367029548989, "grad_norm": 1.312217862895249, "learning_rate": 9.558071344129368e-06, "loss": 0.1783, "step": 867 }, { "epoch": 0.13499222395023328, "grad_norm": 0.9356844106701133, "learning_rate": 9.557066646610826e-06, "loss": 0.1279, "step": 868 }, { "epoch": 0.13514774494556764, "grad_norm": 1.4966712904656105, "learning_rate": 9.556060861256928e-06, "loss": 0.1971, "step": 869 }, { "epoch": 0.13530326594090203, "grad_norm": 0.9157016732315058, "learning_rate": 9.555053988307764e-06, "loss": 0.1739, "step": 870 }, { "epoch": 0.1354587869362364, "grad_norm": 1.6187813697357434, "learning_rate": 9.554046028003691e-06, "loss": 0.2326, "step": 871 }, { "epoch": 0.13561430793157075, "grad_norm": 1.649258041134042, "learning_rate": 9.553036980585323e-06, "loss": 0.2775, "step": 872 }, { "epoch": 0.13576982892690515, "grad_norm": 0.8386386166459481, "learning_rate": 9.552026846293532e-06, "loss": 0.2225, "step": 873 }, { "epoch": 0.1359253499222395, "grad_norm": 0.96771492040488, "learning_rate": 9.551015625369455e-06, "loss": 0.1999, "step": 874 }, { "epoch": 0.13608087091757387, "grad_norm": 1.4939182411934322, "learning_rate": 9.550003318054482e-06, "loss": 0.2427, "step": 875 }, { "epoch": 0.13623639191290823, "grad_norm": 1.1599555983572944, "learning_rate": 9.548989924590263e-06, "loss": 0.2038, "step": 876 }, { "epoch": 0.13639191290824262, "grad_norm": 1.094972018927162, "learning_rate": 9.547975445218712e-06, "loss": 0.1477, "step": 877 }, { "epoch": 0.136547433903577, "grad_norm": 1.5378516224601575, "learning_rate": 9.546959880181998e-06, "loss": 0.2411, "step": 878 }, { "epoch": 0.13670295489891135, "grad_norm": 0.8702765312556789, "learning_rate": 9.545943229722553e-06, "loss": 0.1646, "step": 879 }, { "epoch": 0.1368584758942457, "grad_norm": 1.3664019719395564, "learning_rate": 9.544925494083062e-06, "loss": 0.1688, "step": 880 }, { "epoch": 0.1370139968895801, "grad_norm": 1.3206104649159593, "learning_rate": 9.543906673506474e-06, "loss": 0.1623, "step": 881 }, { "epoch": 0.13716951788491447, "grad_norm": 1.3156230503659714, "learning_rate": 9.542886768235996e-06, "loss": 0.2297, "step": 882 }, { "epoch": 0.13732503888024883, "grad_norm": 1.727680640232904, "learning_rate": 9.541865778515094e-06, "loss": 0.2824, "step": 883 }, { "epoch": 0.1374805598755832, "grad_norm": 1.3346266664784416, "learning_rate": 9.540843704587492e-06, "loss": 0.2533, "step": 884 }, { "epoch": 0.13763608087091758, "grad_norm": 1.663603312691407, "learning_rate": 9.539820546697175e-06, "loss": 0.1889, "step": 885 }, { "epoch": 0.13779160186625194, "grad_norm": 1.3931002570801638, "learning_rate": 9.53879630508838e-06, "loss": 0.2125, "step": 886 }, { "epoch": 0.1379471228615863, "grad_norm": 1.0312695868953268, "learning_rate": 9.537770980005616e-06, "loss": 0.157, "step": 887 }, { "epoch": 0.13810264385692067, "grad_norm": 1.291055270497525, "learning_rate": 9.536744571693634e-06, "loss": 0.1542, "step": 888 }, { "epoch": 0.13825816485225506, "grad_norm": 1.0586309772197517, "learning_rate": 9.535717080397458e-06, "loss": 0.1413, "step": 889 }, { "epoch": 0.13841368584758942, "grad_norm": 1.9142459890481243, "learning_rate": 9.53468850636236e-06, "loss": 0.2132, "step": 890 }, { "epoch": 0.1385692068429238, "grad_norm": 1.57785159694773, "learning_rate": 9.533658849833879e-06, "loss": 0.2704, "step": 891 }, { "epoch": 0.13872472783825818, "grad_norm": 0.6767899331815482, "learning_rate": 9.532628111057804e-06, "loss": 0.1994, "step": 892 }, { "epoch": 0.13888024883359254, "grad_norm": 0.7786068585931847, "learning_rate": 9.531596290280191e-06, "loss": 0.2215, "step": 893 }, { "epoch": 0.1390357698289269, "grad_norm": 1.1907351307303637, "learning_rate": 9.530563387747348e-06, "loss": 0.1597, "step": 894 }, { "epoch": 0.13919129082426127, "grad_norm": 0.994862972128769, "learning_rate": 9.529529403705844e-06, "loss": 0.2586, "step": 895 }, { "epoch": 0.13934681181959566, "grad_norm": 0.9549652766512168, "learning_rate": 9.528494338402502e-06, "loss": 0.1332, "step": 896 }, { "epoch": 0.13950233281493002, "grad_norm": 1.1799329518454007, "learning_rate": 9.527458192084413e-06, "loss": 0.1884, "step": 897 }, { "epoch": 0.13965785381026438, "grad_norm": 0.7863314952979764, "learning_rate": 9.526420964998915e-06, "loss": 0.1679, "step": 898 }, { "epoch": 0.13981337480559874, "grad_norm": 0.937917950726602, "learning_rate": 9.52538265739361e-06, "loss": 0.2024, "step": 899 }, { "epoch": 0.13996889580093314, "grad_norm": 1.7160775693106616, "learning_rate": 9.524343269516354e-06, "loss": 0.2127, "step": 900 }, { "epoch": 0.13996889580093314, "eval_loss": 0.21867091953754425, "eval_runtime": 9.4128, "eval_samples_per_second": 2.762, "eval_steps_per_second": 0.744, "step": 900 }, { "epoch": 0.1401244167962675, "grad_norm": 1.4496209630087886, "learning_rate": 9.523302801615266e-06, "loss": 0.2026, "step": 901 }, { "epoch": 0.14027993779160186, "grad_norm": 0.9035504049737524, "learning_rate": 9.522261253938721e-06, "loss": 0.237, "step": 902 }, { "epoch": 0.14043545878693622, "grad_norm": 1.0344016899215176, "learning_rate": 9.521218626735347e-06, "loss": 0.2079, "step": 903 }, { "epoch": 0.14059097978227061, "grad_norm": 0.8764502702407341, "learning_rate": 9.52017492025404e-06, "loss": 0.1512, "step": 904 }, { "epoch": 0.14074650077760498, "grad_norm": 0.78362955023232, "learning_rate": 9.519130134743938e-06, "loss": 0.1544, "step": 905 }, { "epoch": 0.14090202177293934, "grad_norm": 1.331879071297993, "learning_rate": 9.518084270454456e-06, "loss": 0.208, "step": 906 }, { "epoch": 0.14105754276827373, "grad_norm": 1.0576721252655992, "learning_rate": 9.51703732763525e-06, "loss": 0.1777, "step": 907 }, { "epoch": 0.1412130637636081, "grad_norm": 0.9777650095779323, "learning_rate": 9.515989306536241e-06, "loss": 0.2431, "step": 908 }, { "epoch": 0.14136858475894246, "grad_norm": 1.2351460184737522, "learning_rate": 9.514940207407608e-06, "loss": 0.164, "step": 909 }, { "epoch": 0.14152410575427682, "grad_norm": 1.0466682687606328, "learning_rate": 9.513890030499786e-06, "loss": 0.1862, "step": 910 }, { "epoch": 0.1416796267496112, "grad_norm": 1.667573553968496, "learning_rate": 9.512838776063464e-06, "loss": 0.1881, "step": 911 }, { "epoch": 0.14183514774494557, "grad_norm": 1.0309274313381354, "learning_rate": 9.51178644434959e-06, "loss": 0.1894, "step": 912 }, { "epoch": 0.14199066874027994, "grad_norm": 1.1516030880613233, "learning_rate": 9.510733035609376e-06, "loss": 0.1906, "step": 913 }, { "epoch": 0.1421461897356143, "grad_norm": 1.1964374362259393, "learning_rate": 9.509678550094282e-06, "loss": 0.2193, "step": 914 }, { "epoch": 0.1423017107309487, "grad_norm": 1.018131456622998, "learning_rate": 9.508622988056026e-06, "loss": 0.18, "step": 915 }, { "epoch": 0.14245723172628305, "grad_norm": 0.9878879365994556, "learning_rate": 9.50756634974659e-06, "loss": 0.2303, "step": 916 }, { "epoch": 0.14261275272161741, "grad_norm": 0.9092163587106824, "learning_rate": 9.506508635418203e-06, "loss": 0.1565, "step": 917 }, { "epoch": 0.14276827371695178, "grad_norm": 1.2600000274625656, "learning_rate": 9.505449845323362e-06, "loss": 0.2203, "step": 918 }, { "epoch": 0.14292379471228617, "grad_norm": 1.0177653430547444, "learning_rate": 9.504389979714812e-06, "loss": 0.1708, "step": 919 }, { "epoch": 0.14307931570762053, "grad_norm": 1.323141251202386, "learning_rate": 9.503329038845556e-06, "loss": 0.2041, "step": 920 }, { "epoch": 0.1432348367029549, "grad_norm": 0.8666613786933973, "learning_rate": 9.50226702296886e-06, "loss": 0.1709, "step": 921 }, { "epoch": 0.14339035769828926, "grad_norm": 1.4717207003269144, "learning_rate": 9.501203932338238e-06, "loss": 0.1531, "step": 922 }, { "epoch": 0.14354587869362365, "grad_norm": 0.9850527774643847, "learning_rate": 9.500139767207465e-06, "loss": 0.2673, "step": 923 }, { "epoch": 0.143701399688958, "grad_norm": 0.795383661376322, "learning_rate": 9.499074527830576e-06, "loss": 0.1514, "step": 924 }, { "epoch": 0.14385692068429237, "grad_norm": 1.5926732733378721, "learning_rate": 9.498008214461854e-06, "loss": 0.1919, "step": 925 }, { "epoch": 0.14401244167962676, "grad_norm": 1.0577956165619293, "learning_rate": 9.496940827355843e-06, "loss": 0.2541, "step": 926 }, { "epoch": 0.14416796267496113, "grad_norm": 1.0853608193427453, "learning_rate": 9.495872366767345e-06, "loss": 0.3026, "step": 927 }, { "epoch": 0.1443234836702955, "grad_norm": 1.5841584604687593, "learning_rate": 9.494802832951416e-06, "loss": 0.237, "step": 928 }, { "epoch": 0.14447900466562985, "grad_norm": 1.2668912692543315, "learning_rate": 9.493732226163368e-06, "loss": 0.1962, "step": 929 }, { "epoch": 0.14463452566096424, "grad_norm": 1.1865934879383473, "learning_rate": 9.492660546658771e-06, "loss": 0.205, "step": 930 }, { "epoch": 0.1447900466562986, "grad_norm": 1.16907334182334, "learning_rate": 9.491587794693448e-06, "loss": 0.1649, "step": 931 }, { "epoch": 0.14494556765163297, "grad_norm": 2.6694118671679035, "learning_rate": 9.490513970523482e-06, "loss": 0.1716, "step": 932 }, { "epoch": 0.14510108864696733, "grad_norm": 1.2693916754547256, "learning_rate": 9.489439074405211e-06, "loss": 0.2102, "step": 933 }, { "epoch": 0.14525660964230172, "grad_norm": 1.4815910522621762, "learning_rate": 9.488363106595223e-06, "loss": 0.2146, "step": 934 }, { "epoch": 0.14541213063763608, "grad_norm": 1.5330200808441012, "learning_rate": 9.48728606735037e-06, "loss": 0.1767, "step": 935 }, { "epoch": 0.14556765163297045, "grad_norm": 1.123567228978502, "learning_rate": 9.486207956927756e-06, "loss": 0.1864, "step": 936 }, { "epoch": 0.1457231726283048, "grad_norm": 0.9960966752159592, "learning_rate": 9.485128775584737e-06, "loss": 0.2118, "step": 937 }, { "epoch": 0.1458786936236392, "grad_norm": 1.2303193618017887, "learning_rate": 9.484048523578934e-06, "loss": 0.2106, "step": 938 }, { "epoch": 0.14603421461897356, "grad_norm": 1.2867421133114936, "learning_rate": 9.482967201168218e-06, "loss": 0.2252, "step": 939 }, { "epoch": 0.14618973561430793, "grad_norm": 1.3372951799730566, "learning_rate": 9.481884808610712e-06, "loss": 0.2662, "step": 940 }, { "epoch": 0.1463452566096423, "grad_norm": 0.6808561025624517, "learning_rate": 9.4808013461648e-06, "loss": 0.1613, "step": 941 }, { "epoch": 0.14650077760497668, "grad_norm": 1.0617639952793092, "learning_rate": 9.479716814089119e-06, "loss": 0.22, "step": 942 }, { "epoch": 0.14665629860031104, "grad_norm": 1.2088515247514138, "learning_rate": 9.478631212642565e-06, "loss": 0.2027, "step": 943 }, { "epoch": 0.1468118195956454, "grad_norm": 0.9673478217504623, "learning_rate": 9.477544542084283e-06, "loss": 0.2291, "step": 944 }, { "epoch": 0.1469673405909798, "grad_norm": 1.3295783157520016, "learning_rate": 9.476456802673677e-06, "loss": 0.2153, "step": 945 }, { "epoch": 0.14712286158631416, "grad_norm": 1.1001160858062626, "learning_rate": 9.475367994670406e-06, "loss": 0.2195, "step": 946 }, { "epoch": 0.14727838258164852, "grad_norm": 1.291866801296516, "learning_rate": 9.474278118334382e-06, "loss": 0.2213, "step": 947 }, { "epoch": 0.14743390357698288, "grad_norm": 1.2674302718543788, "learning_rate": 9.473187173925777e-06, "loss": 0.1371, "step": 948 }, { "epoch": 0.14758942457231727, "grad_norm": 1.4168689609608738, "learning_rate": 9.472095161705014e-06, "loss": 0.1902, "step": 949 }, { "epoch": 0.14774494556765164, "grad_norm": 1.0439332293475743, "learning_rate": 9.471002081932767e-06, "loss": 0.2069, "step": 950 }, { "epoch": 0.147900466562986, "grad_norm": 1.346490441102045, "learning_rate": 9.469907934869974e-06, "loss": 0.1982, "step": 951 }, { "epoch": 0.14805598755832036, "grad_norm": 1.1817129831636979, "learning_rate": 9.468812720777822e-06, "loss": 0.1626, "step": 952 }, { "epoch": 0.14821150855365475, "grad_norm": 0.846186520557803, "learning_rate": 9.467716439917753e-06, "loss": 0.1659, "step": 953 }, { "epoch": 0.14836702954898912, "grad_norm": 1.77057726290962, "learning_rate": 9.466619092551467e-06, "loss": 0.1571, "step": 954 }, { "epoch": 0.14852255054432348, "grad_norm": 1.503606666530362, "learning_rate": 9.465520678940913e-06, "loss": 0.2317, "step": 955 }, { "epoch": 0.14867807153965784, "grad_norm": 1.2988561500793663, "learning_rate": 9.4644211993483e-06, "loss": 0.184, "step": 956 }, { "epoch": 0.14883359253499223, "grad_norm": 0.9494708116205622, "learning_rate": 9.463320654036088e-06, "loss": 0.2061, "step": 957 }, { "epoch": 0.1489891135303266, "grad_norm": 1.1960711999747602, "learning_rate": 9.462219043266993e-06, "loss": 0.1595, "step": 958 }, { "epoch": 0.14914463452566096, "grad_norm": 1.456286481771, "learning_rate": 9.461116367303985e-06, "loss": 0.1803, "step": 959 }, { "epoch": 0.14930015552099535, "grad_norm": 2.193608162058263, "learning_rate": 9.460012626410286e-06, "loss": 0.2372, "step": 960 }, { "epoch": 0.1494556765163297, "grad_norm": 1.1257027932111565, "learning_rate": 9.458907820849378e-06, "loss": 0.2183, "step": 961 }, { "epoch": 0.14961119751166407, "grad_norm": 1.2699403552308035, "learning_rate": 9.457801950884991e-06, "loss": 0.2112, "step": 962 }, { "epoch": 0.14976671850699844, "grad_norm": 2.0211225561288986, "learning_rate": 9.456695016781112e-06, "loss": 0.3771, "step": 963 }, { "epoch": 0.14992223950233283, "grad_norm": 1.6233952494139523, "learning_rate": 9.455587018801979e-06, "loss": 0.1654, "step": 964 }, { "epoch": 0.1500777604976672, "grad_norm": 0.9536635356305013, "learning_rate": 9.454477957212092e-06, "loss": 0.1971, "step": 965 }, { "epoch": 0.15023328149300155, "grad_norm": 1.2024688455270478, "learning_rate": 9.453367832276196e-06, "loss": 0.2073, "step": 966 }, { "epoch": 0.15038880248833592, "grad_norm": 1.0163258023024337, "learning_rate": 9.452256644259296e-06, "loss": 0.1622, "step": 967 }, { "epoch": 0.1505443234836703, "grad_norm": 1.4838973791587633, "learning_rate": 9.451144393426643e-06, "loss": 0.2058, "step": 968 }, { "epoch": 0.15069984447900467, "grad_norm": 1.0443777554962437, "learning_rate": 9.450031080043752e-06, "loss": 0.165, "step": 969 }, { "epoch": 0.15085536547433903, "grad_norm": 1.1175170370729908, "learning_rate": 9.448916704376384e-06, "loss": 0.1419, "step": 970 }, { "epoch": 0.1510108864696734, "grad_norm": 1.2857861611804626, "learning_rate": 9.447801266690557e-06, "loss": 0.2171, "step": 971 }, { "epoch": 0.15116640746500778, "grad_norm": 0.7407729973632995, "learning_rate": 9.446684767252539e-06, "loss": 0.1714, "step": 972 }, { "epoch": 0.15132192846034215, "grad_norm": 2.195989894115042, "learning_rate": 9.445567206328857e-06, "loss": 0.1989, "step": 973 }, { "epoch": 0.1514774494556765, "grad_norm": 0.989971668490221, "learning_rate": 9.444448584186288e-06, "loss": 0.1664, "step": 974 }, { "epoch": 0.15163297045101087, "grad_norm": 1.081538706581427, "learning_rate": 9.44332890109186e-06, "loss": 0.2066, "step": 975 }, { "epoch": 0.15178849144634526, "grad_norm": 1.4377035491264887, "learning_rate": 9.442208157312859e-06, "loss": 0.2057, "step": 976 }, { "epoch": 0.15194401244167963, "grad_norm": 1.5898783963503191, "learning_rate": 9.441086353116825e-06, "loss": 0.1665, "step": 977 }, { "epoch": 0.152099533437014, "grad_norm": 0.899579074969373, "learning_rate": 9.439963488771543e-06, "loss": 0.2091, "step": 978 }, { "epoch": 0.15225505443234838, "grad_norm": 1.4218933674345213, "learning_rate": 9.438839564545059e-06, "loss": 0.2344, "step": 979 }, { "epoch": 0.15241057542768274, "grad_norm": 1.2490316562718224, "learning_rate": 9.437714580705671e-06, "loss": 0.1771, "step": 980 }, { "epoch": 0.1525660964230171, "grad_norm": 1.3535600594171835, "learning_rate": 9.436588537521925e-06, "loss": 0.2402, "step": 981 }, { "epoch": 0.15272161741835147, "grad_norm": 1.2653882449622933, "learning_rate": 9.435461435262623e-06, "loss": 0.2368, "step": 982 }, { "epoch": 0.15287713841368586, "grad_norm": 1.4171554003791706, "learning_rate": 9.434333274196822e-06, "loss": 0.16, "step": 983 }, { "epoch": 0.15303265940902022, "grad_norm": 0.9372171947174371, "learning_rate": 9.433204054593832e-06, "loss": 0.1464, "step": 984 }, { "epoch": 0.15318818040435458, "grad_norm": 0.9807519101904891, "learning_rate": 9.43207377672321e-06, "loss": 0.1743, "step": 985 }, { "epoch": 0.15334370139968895, "grad_norm": 1.9830197584350164, "learning_rate": 9.430942440854772e-06, "loss": 0.2979, "step": 986 }, { "epoch": 0.15349922239502334, "grad_norm": 1.013327149062581, "learning_rate": 9.429810047258578e-06, "loss": 0.2257, "step": 987 }, { "epoch": 0.1536547433903577, "grad_norm": 1.3644569563063227, "learning_rate": 9.428676596204953e-06, "loss": 0.227, "step": 988 }, { "epoch": 0.15381026438569206, "grad_norm": 1.2971192291816034, "learning_rate": 9.427542087964462e-06, "loss": 0.2012, "step": 989 }, { "epoch": 0.15396578538102643, "grad_norm": 1.063681975107411, "learning_rate": 9.426406522807932e-06, "loss": 0.2299, "step": 990 }, { "epoch": 0.15412130637636082, "grad_norm": 1.0390353297783406, "learning_rate": 9.425269901006435e-06, "loss": 0.1438, "step": 991 }, { "epoch": 0.15427682737169518, "grad_norm": 1.821321152512482, "learning_rate": 9.424132222831301e-06, "loss": 0.1797, "step": 992 }, { "epoch": 0.15443234836702954, "grad_norm": 1.0266940584964872, "learning_rate": 9.422993488554108e-06, "loss": 0.1524, "step": 993 }, { "epoch": 0.1545878693623639, "grad_norm": 1.2357982408354415, "learning_rate": 9.42185369844669e-06, "loss": 0.1765, "step": 994 }, { "epoch": 0.1547433903576983, "grad_norm": 1.3007180654461126, "learning_rate": 9.420712852781129e-06, "loss": 0.2278, "step": 995 }, { "epoch": 0.15489891135303266, "grad_norm": 1.3519816843089092, "learning_rate": 9.419570951829761e-06, "loss": 0.2261, "step": 996 }, { "epoch": 0.15505443234836702, "grad_norm": 0.814621189176537, "learning_rate": 9.418427995865174e-06, "loss": 0.2172, "step": 997 }, { "epoch": 0.1552099533437014, "grad_norm": 1.7543842879443927, "learning_rate": 9.417283985160206e-06, "loss": 0.2164, "step": 998 }, { "epoch": 0.15536547433903578, "grad_norm": 0.8276231350286671, "learning_rate": 9.41613891998795e-06, "loss": 0.1975, "step": 999 }, { "epoch": 0.15552099533437014, "grad_norm": 1.1550898822511304, "learning_rate": 9.414992800621749e-06, "loss": 0.1501, "step": 1000 }, { "epoch": 0.15552099533437014, "eval_loss": 0.21367190778255463, "eval_runtime": 9.4284, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.742, "step": 1000 }, { "epoch": 0.1556765163297045, "grad_norm": 1.6764153048318766, "learning_rate": 9.413845627335197e-06, "loss": 0.2071, "step": 1001 }, { "epoch": 0.1558320373250389, "grad_norm": 1.1886246410449919, "learning_rate": 9.41269740040214e-06, "loss": 0.1956, "step": 1002 }, { "epoch": 0.15598755832037325, "grad_norm": 1.0793500722611682, "learning_rate": 9.411548120096676e-06, "loss": 0.144, "step": 1003 }, { "epoch": 0.15614307931570762, "grad_norm": 1.2449924636096124, "learning_rate": 9.410397786693157e-06, "loss": 0.2734, "step": 1004 }, { "epoch": 0.15629860031104198, "grad_norm": 0.8611732851449306, "learning_rate": 9.409246400466178e-06, "loss": 0.1923, "step": 1005 }, { "epoch": 0.15645412130637637, "grad_norm": 6.74577569453225, "learning_rate": 9.408093961690596e-06, "loss": 0.1956, "step": 1006 }, { "epoch": 0.15660964230171073, "grad_norm": 1.2060004741533563, "learning_rate": 9.406940470641512e-06, "loss": 0.2739, "step": 1007 }, { "epoch": 0.1567651632970451, "grad_norm": 1.6202727992084955, "learning_rate": 9.405785927594281e-06, "loss": 0.3171, "step": 1008 }, { "epoch": 0.15692068429237946, "grad_norm": 2.0124632761977534, "learning_rate": 9.404630332824509e-06, "loss": 0.2104, "step": 1009 }, { "epoch": 0.15707620528771385, "grad_norm": 2.0142886633624286, "learning_rate": 9.40347368660805e-06, "loss": 0.2548, "step": 1010 }, { "epoch": 0.1572317262830482, "grad_norm": 1.3434989581281018, "learning_rate": 9.402315989221013e-06, "loss": 0.2411, "step": 1011 }, { "epoch": 0.15738724727838257, "grad_norm": 1.3315974814677487, "learning_rate": 9.40115724093976e-06, "loss": 0.2839, "step": 1012 }, { "epoch": 0.15754276827371697, "grad_norm": 1.1186058721777734, "learning_rate": 9.399997442040894e-06, "loss": 0.167, "step": 1013 }, { "epoch": 0.15769828926905133, "grad_norm": 1.4492217703231243, "learning_rate": 9.39883659280128e-06, "loss": 0.1268, "step": 1014 }, { "epoch": 0.1578538102643857, "grad_norm": 1.257425749091041, "learning_rate": 9.39767469349803e-06, "loss": 0.1433, "step": 1015 }, { "epoch": 0.15800933125972005, "grad_norm": 1.7996939549666984, "learning_rate": 9.396511744408498e-06, "loss": 0.2012, "step": 1016 }, { "epoch": 0.15816485225505444, "grad_norm": 0.8429015986655448, "learning_rate": 9.395347745810304e-06, "loss": 0.1935, "step": 1017 }, { "epoch": 0.1583203732503888, "grad_norm": 1.370521795316769, "learning_rate": 9.394182697981306e-06, "loss": 0.2183, "step": 1018 }, { "epoch": 0.15847589424572317, "grad_norm": 1.622770939923456, "learning_rate": 9.393016601199622e-06, "loss": 0.1593, "step": 1019 }, { "epoch": 0.15863141524105753, "grad_norm": 1.011909638401176, "learning_rate": 9.39184945574361e-06, "loss": 0.2053, "step": 1020 }, { "epoch": 0.15878693623639192, "grad_norm": 1.6110438711648936, "learning_rate": 9.390681261891887e-06, "loss": 0.222, "step": 1021 }, { "epoch": 0.1589424572317263, "grad_norm": 1.4859951673056488, "learning_rate": 9.389512019923318e-06, "loss": 0.231, "step": 1022 }, { "epoch": 0.15909797822706065, "grad_norm": 1.166598629738374, "learning_rate": 9.388341730117015e-06, "loss": 0.1917, "step": 1023 }, { "epoch": 0.159253499222395, "grad_norm": 1.0987845208229972, "learning_rate": 9.387170392752342e-06, "loss": 0.184, "step": 1024 }, { "epoch": 0.1594090202177294, "grad_norm": 1.5795930559063704, "learning_rate": 9.385998008108917e-06, "loss": 0.2097, "step": 1025 }, { "epoch": 0.15956454121306377, "grad_norm": 1.4302193933514027, "learning_rate": 9.384824576466601e-06, "loss": 0.2194, "step": 1026 }, { "epoch": 0.15972006220839813, "grad_norm": 0.9372034033824603, "learning_rate": 9.383650098105512e-06, "loss": 0.243, "step": 1027 }, { "epoch": 0.1598755832037325, "grad_norm": 1.0038945695499553, "learning_rate": 9.382474573306011e-06, "loss": 0.1861, "step": 1028 }, { "epoch": 0.16003110419906688, "grad_norm": 0.9989868346004813, "learning_rate": 9.381298002348713e-06, "loss": 0.2324, "step": 1029 }, { "epoch": 0.16018662519440124, "grad_norm": 1.4240189031581216, "learning_rate": 9.380120385514484e-06, "loss": 0.1974, "step": 1030 }, { "epoch": 0.1603421461897356, "grad_norm": 1.378754367931683, "learning_rate": 9.378941723084436e-06, "loss": 0.245, "step": 1031 }, { "epoch": 0.16049766718507, "grad_norm": 1.8715129600892846, "learning_rate": 9.37776201533993e-06, "loss": 0.3174, "step": 1032 }, { "epoch": 0.16065318818040436, "grad_norm": 1.1921962243878195, "learning_rate": 9.376581262562584e-06, "loss": 0.1917, "step": 1033 }, { "epoch": 0.16080870917573872, "grad_norm": 1.2635206395103649, "learning_rate": 9.375399465034257e-06, "loss": 0.1878, "step": 1034 }, { "epoch": 0.16096423017107309, "grad_norm": 1.2398545424205532, "learning_rate": 9.374216623037057e-06, "loss": 0.2344, "step": 1035 }, { "epoch": 0.16111975116640748, "grad_norm": 0.9462934166321078, "learning_rate": 9.373032736853352e-06, "loss": 0.187, "step": 1036 }, { "epoch": 0.16127527216174184, "grad_norm": 1.5590735847268282, "learning_rate": 9.371847806765749e-06, "loss": 0.2097, "step": 1037 }, { "epoch": 0.1614307931570762, "grad_norm": 1.160888284446341, "learning_rate": 9.370661833057103e-06, "loss": 0.1506, "step": 1038 }, { "epoch": 0.16158631415241057, "grad_norm": 1.1778543046473768, "learning_rate": 9.36947481601053e-06, "loss": 0.1716, "step": 1039 }, { "epoch": 0.16174183514774496, "grad_norm": 1.4532605779910739, "learning_rate": 9.368286755909383e-06, "loss": 0.182, "step": 1040 }, { "epoch": 0.16189735614307932, "grad_norm": 0.9502972420425978, "learning_rate": 9.36709765303727e-06, "loss": 0.2161, "step": 1041 }, { "epoch": 0.16205287713841368, "grad_norm": 1.4588748874097772, "learning_rate": 9.365907507678045e-06, "loss": 0.2338, "step": 1042 }, { "epoch": 0.16220839813374804, "grad_norm": 1.4225573142040282, "learning_rate": 9.364716320115813e-06, "loss": 0.1781, "step": 1043 }, { "epoch": 0.16236391912908243, "grad_norm": 1.029996429205044, "learning_rate": 9.363524090634928e-06, "loss": 0.2257, "step": 1044 }, { "epoch": 0.1625194401244168, "grad_norm": 1.379085736135871, "learning_rate": 9.362330819519991e-06, "loss": 0.2186, "step": 1045 }, { "epoch": 0.16267496111975116, "grad_norm": 1.2962827183429935, "learning_rate": 9.361136507055853e-06, "loss": 0.1916, "step": 1046 }, { "epoch": 0.16283048211508552, "grad_norm": 0.9451500150098339, "learning_rate": 9.359941153527612e-06, "loss": 0.1859, "step": 1047 }, { "epoch": 0.1629860031104199, "grad_norm": 1.0944328685975881, "learning_rate": 9.358744759220614e-06, "loss": 0.2225, "step": 1048 }, { "epoch": 0.16314152410575428, "grad_norm": 1.1266179070522002, "learning_rate": 9.357547324420461e-06, "loss": 0.2039, "step": 1049 }, { "epoch": 0.16329704510108864, "grad_norm": 1.26823288307141, "learning_rate": 9.356348849412991e-06, "loss": 0.2686, "step": 1050 }, { "epoch": 0.16345256609642303, "grad_norm": 1.3783372129870655, "learning_rate": 9.355149334484302e-06, "loss": 0.2715, "step": 1051 }, { "epoch": 0.1636080870917574, "grad_norm": 0.950454440753535, "learning_rate": 9.35394877992073e-06, "loss": 0.1697, "step": 1052 }, { "epoch": 0.16376360808709176, "grad_norm": 2.4437577046740895, "learning_rate": 9.352747186008865e-06, "loss": 0.2087, "step": 1053 }, { "epoch": 0.16391912908242612, "grad_norm": 1.4140943006046114, "learning_rate": 9.351544553035547e-06, "loss": 0.2063, "step": 1054 }, { "epoch": 0.1640746500777605, "grad_norm": 0.967217619359645, "learning_rate": 9.350340881287861e-06, "loss": 0.2008, "step": 1055 }, { "epoch": 0.16423017107309487, "grad_norm": 1.4590565286071695, "learning_rate": 9.349136171053139e-06, "loss": 0.1897, "step": 1056 }, { "epoch": 0.16438569206842923, "grad_norm": 1.0794053199949247, "learning_rate": 9.34793042261896e-06, "loss": 0.1037, "step": 1057 }, { "epoch": 0.1645412130637636, "grad_norm": 1.15272662266887, "learning_rate": 9.346723636273157e-06, "loss": 0.239, "step": 1058 }, { "epoch": 0.164696734059098, "grad_norm": 1.3755496055051248, "learning_rate": 9.345515812303802e-06, "loss": 0.2655, "step": 1059 }, { "epoch": 0.16485225505443235, "grad_norm": 1.1623669619389423, "learning_rate": 9.344306950999226e-06, "loss": 0.2254, "step": 1060 }, { "epoch": 0.1650077760497667, "grad_norm": 1.1373510201117636, "learning_rate": 9.343097052647996e-06, "loss": 0.2515, "step": 1061 }, { "epoch": 0.16516329704510108, "grad_norm": 1.349812652007435, "learning_rate": 9.341886117538931e-06, "loss": 0.2367, "step": 1062 }, { "epoch": 0.16531881804043547, "grad_norm": 1.0436524504014346, "learning_rate": 9.340674145961101e-06, "loss": 0.1552, "step": 1063 }, { "epoch": 0.16547433903576983, "grad_norm": 1.3297059840324263, "learning_rate": 9.339461138203821e-06, "loss": 0.2201, "step": 1064 }, { "epoch": 0.1656298600311042, "grad_norm": 1.7541537167845238, "learning_rate": 9.338247094556651e-06, "loss": 0.2076, "step": 1065 }, { "epoch": 0.16578538102643858, "grad_norm": 1.442252163275357, "learning_rate": 9.3370320153094e-06, "loss": 0.1753, "step": 1066 }, { "epoch": 0.16594090202177295, "grad_norm": 1.143025605577321, "learning_rate": 9.335815900752125e-06, "loss": 0.2217, "step": 1067 }, { "epoch": 0.1660964230171073, "grad_norm": 1.178025675869792, "learning_rate": 9.33459875117513e-06, "loss": 0.1621, "step": 1068 }, { "epoch": 0.16625194401244167, "grad_norm": 0.8859479026343935, "learning_rate": 9.333380566868963e-06, "loss": 0.2214, "step": 1069 }, { "epoch": 0.16640746500777606, "grad_norm": 1.1580516447127225, "learning_rate": 9.332161348124426e-06, "loss": 0.2104, "step": 1070 }, { "epoch": 0.16656298600311042, "grad_norm": 0.9322363288405592, "learning_rate": 9.33094109523256e-06, "loss": 0.1524, "step": 1071 }, { "epoch": 0.1667185069984448, "grad_norm": 1.2071920671355123, "learning_rate": 9.32971980848466e-06, "loss": 0.2204, "step": 1072 }, { "epoch": 0.16687402799377915, "grad_norm": 1.4321090820471434, "learning_rate": 9.328497488172256e-06, "loss": 0.2185, "step": 1073 }, { "epoch": 0.16702954898911354, "grad_norm": 1.5323210185604608, "learning_rate": 9.327274134587144e-06, "loss": 0.1967, "step": 1074 }, { "epoch": 0.1671850699844479, "grad_norm": 1.2827697157454871, "learning_rate": 9.326049748021348e-06, "loss": 0.1835, "step": 1075 }, { "epoch": 0.16734059097978227, "grad_norm": 0.9598851088099357, "learning_rate": 9.324824328767148e-06, "loss": 0.1524, "step": 1076 }, { "epoch": 0.16749611197511663, "grad_norm": 1.1012363230038584, "learning_rate": 9.323597877117069e-06, "loss": 0.1934, "step": 1077 }, { "epoch": 0.16765163297045102, "grad_norm": 1.7979943018863753, "learning_rate": 9.322370393363881e-06, "loss": 0.2809, "step": 1078 }, { "epoch": 0.16780715396578538, "grad_norm": 0.9525483556320685, "learning_rate": 9.321141877800604e-06, "loss": 0.1544, "step": 1079 }, { "epoch": 0.16796267496111975, "grad_norm": 1.1079754408286966, "learning_rate": 9.319912330720502e-06, "loss": 0.1939, "step": 1080 }, { "epoch": 0.1681181959564541, "grad_norm": 1.4615045454023567, "learning_rate": 9.31868175241708e-06, "loss": 0.1879, "step": 1081 }, { "epoch": 0.1682737169517885, "grad_norm": 0.9677318917431114, "learning_rate": 9.3174501431841e-06, "loss": 0.1572, "step": 1082 }, { "epoch": 0.16842923794712286, "grad_norm": 1.1156223371393144, "learning_rate": 9.316217503315562e-06, "loss": 0.2477, "step": 1083 }, { "epoch": 0.16858475894245722, "grad_norm": 0.9283556985369971, "learning_rate": 9.314983833105713e-06, "loss": 0.1855, "step": 1084 }, { "epoch": 0.16874027993779162, "grad_norm": 0.9107625137180413, "learning_rate": 9.313749132849048e-06, "loss": 0.1941, "step": 1085 }, { "epoch": 0.16889580093312598, "grad_norm": 1.1200752990922627, "learning_rate": 9.312513402840308e-06, "loss": 0.1714, "step": 1086 }, { "epoch": 0.16905132192846034, "grad_norm": 1.5919484746453285, "learning_rate": 9.311276643374478e-06, "loss": 0.1907, "step": 1087 }, { "epoch": 0.1692068429237947, "grad_norm": 1.6737891841333687, "learning_rate": 9.310038854746793e-06, "loss": 0.3096, "step": 1088 }, { "epoch": 0.1693623639191291, "grad_norm": 0.9356610939198378, "learning_rate": 9.308800037252726e-06, "loss": 0.215, "step": 1089 }, { "epoch": 0.16951788491446346, "grad_norm": 0.9978911792591384, "learning_rate": 9.307560191188e-06, "loss": 0.2023, "step": 1090 }, { "epoch": 0.16967340590979782, "grad_norm": 0.8618605808228078, "learning_rate": 9.30631931684859e-06, "loss": 0.1835, "step": 1091 }, { "epoch": 0.16982892690513218, "grad_norm": 1.073899023320524, "learning_rate": 9.305077414530701e-06, "loss": 0.2856, "step": 1092 }, { "epoch": 0.16998444790046657, "grad_norm": 1.390799646940327, "learning_rate": 9.303834484530798e-06, "loss": 0.1768, "step": 1093 }, { "epoch": 0.17013996889580094, "grad_norm": 1.1517992631531213, "learning_rate": 9.302590527145585e-06, "loss": 0.1661, "step": 1094 }, { "epoch": 0.1702954898911353, "grad_norm": 1.0942354595322217, "learning_rate": 9.301345542672012e-06, "loss": 0.2161, "step": 1095 }, { "epoch": 0.17045101088646966, "grad_norm": 0.8079291053355052, "learning_rate": 9.300099531407273e-06, "loss": 0.1768, "step": 1096 }, { "epoch": 0.17060653188180405, "grad_norm": 0.8090971826904667, "learning_rate": 9.298852493648808e-06, "loss": 0.1761, "step": 1097 }, { "epoch": 0.17076205287713841, "grad_norm": 1.2570428694136606, "learning_rate": 9.297604429694305e-06, "loss": 0.1742, "step": 1098 }, { "epoch": 0.17091757387247278, "grad_norm": 1.4714283316352859, "learning_rate": 9.296355339841692e-06, "loss": 0.2716, "step": 1099 }, { "epoch": 0.17107309486780714, "grad_norm": 1.07865700806752, "learning_rate": 9.295105224389144e-06, "loss": 0.1507, "step": 1100 }, { "epoch": 0.17107309486780714, "eval_loss": 0.21004652976989746, "eval_runtime": 9.4236, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 1100 }, { "epoch": 0.17122861586314153, "grad_norm": 0.8784655316390252, "learning_rate": 9.293854083635081e-06, "loss": 0.1673, "step": 1101 }, { "epoch": 0.1713841368584759, "grad_norm": 1.025281186756548, "learning_rate": 9.292601917878169e-06, "loss": 0.1715, "step": 1102 }, { "epoch": 0.17153965785381026, "grad_norm": 1.409333718683306, "learning_rate": 9.291348727417318e-06, "loss": 0.2155, "step": 1103 }, { "epoch": 0.17169517884914465, "grad_norm": 1.0469534251307742, "learning_rate": 9.290094512551679e-06, "loss": 0.1918, "step": 1104 }, { "epoch": 0.171850699844479, "grad_norm": 1.275008024365504, "learning_rate": 9.288839273580652e-06, "loss": 0.1264, "step": 1105 }, { "epoch": 0.17200622083981337, "grad_norm": 1.2168876399929267, "learning_rate": 9.287583010803882e-06, "loss": 0.2855, "step": 1106 }, { "epoch": 0.17216174183514774, "grad_norm": 1.2066762279123466, "learning_rate": 9.286325724521254e-06, "loss": 0.2242, "step": 1107 }, { "epoch": 0.17231726283048213, "grad_norm": 2.4948253959447144, "learning_rate": 9.285067415032902e-06, "loss": 0.2875, "step": 1108 }, { "epoch": 0.1724727838258165, "grad_norm": 1.8284540511597713, "learning_rate": 9.283808082639198e-06, "loss": 0.2049, "step": 1109 }, { "epoch": 0.17262830482115085, "grad_norm": 1.3355119525104016, "learning_rate": 9.282547727640767e-06, "loss": 0.1717, "step": 1110 }, { "epoch": 0.17278382581648521, "grad_norm": 1.0266534905254066, "learning_rate": 9.281286350338472e-06, "loss": 0.2066, "step": 1111 }, { "epoch": 0.1729393468118196, "grad_norm": 1.2099083780797275, "learning_rate": 9.280023951033418e-06, "loss": 0.2807, "step": 1112 }, { "epoch": 0.17309486780715397, "grad_norm": 0.949550488293792, "learning_rate": 9.278760530026963e-06, "loss": 0.1992, "step": 1113 }, { "epoch": 0.17325038880248833, "grad_norm": 1.0598653084819885, "learning_rate": 9.277496087620696e-06, "loss": 0.2358, "step": 1114 }, { "epoch": 0.1734059097978227, "grad_norm": 1.4050304182051088, "learning_rate": 9.276230624116464e-06, "loss": 0.2222, "step": 1115 }, { "epoch": 0.17356143079315708, "grad_norm": 0.9817712530234229, "learning_rate": 9.274964139816347e-06, "loss": 0.1931, "step": 1116 }, { "epoch": 0.17371695178849145, "grad_norm": 1.7060543693066812, "learning_rate": 9.273696635022674e-06, "loss": 0.2343, "step": 1117 }, { "epoch": 0.1738724727838258, "grad_norm": 1.2527360379181598, "learning_rate": 9.272428110038016e-06, "loss": 0.1717, "step": 1118 }, { "epoch": 0.17402799377916017, "grad_norm": 1.0592648467758805, "learning_rate": 9.271158565165186e-06, "loss": 0.1338, "step": 1119 }, { "epoch": 0.17418351477449456, "grad_norm": 1.1697431614729739, "learning_rate": 9.269888000707243e-06, "loss": 0.0937, "step": 1120 }, { "epoch": 0.17433903576982893, "grad_norm": 1.3666630215902802, "learning_rate": 9.26861641696749e-06, "loss": 0.195, "step": 1121 }, { "epoch": 0.1744945567651633, "grad_norm": 0.9618565647030869, "learning_rate": 9.267343814249468e-06, "loss": 0.175, "step": 1122 }, { "epoch": 0.17465007776049768, "grad_norm": 1.4220832361635052, "learning_rate": 9.266070192856968e-06, "loss": 0.1593, "step": 1123 }, { "epoch": 0.17480559875583204, "grad_norm": 0.776257033559064, "learning_rate": 9.264795553094022e-06, "loss": 0.2249, "step": 1124 }, { "epoch": 0.1749611197511664, "grad_norm": 1.2113799530837854, "learning_rate": 9.263519895264901e-06, "loss": 0.1907, "step": 1125 }, { "epoch": 0.17511664074650077, "grad_norm": 1.3082437362032786, "learning_rate": 9.262243219674126e-06, "loss": 0.2666, "step": 1126 }, { "epoch": 0.17527216174183516, "grad_norm": 1.872862944531211, "learning_rate": 9.260965526626452e-06, "loss": 0.1784, "step": 1127 }, { "epoch": 0.17542768273716952, "grad_norm": 1.3432522813757912, "learning_rate": 9.25968681642689e-06, "loss": 0.1451, "step": 1128 }, { "epoch": 0.17558320373250388, "grad_norm": 0.9703679937198076, "learning_rate": 9.258407089380679e-06, "loss": 0.1297, "step": 1129 }, { "epoch": 0.17573872472783825, "grad_norm": 1.0365436632456377, "learning_rate": 9.25712634579331e-06, "loss": 0.1761, "step": 1130 }, { "epoch": 0.17589424572317264, "grad_norm": 2.1522303469420994, "learning_rate": 9.255844585970516e-06, "loss": 0.1296, "step": 1131 }, { "epoch": 0.176049766718507, "grad_norm": 1.291217930882477, "learning_rate": 9.254561810218269e-06, "loss": 0.2044, "step": 1132 }, { "epoch": 0.17620528771384136, "grad_norm": 0.9937462574500329, "learning_rate": 9.253278018842786e-06, "loss": 0.1997, "step": 1133 }, { "epoch": 0.17636080870917573, "grad_norm": 0.9450489875743622, "learning_rate": 9.251993212150525e-06, "loss": 0.1747, "step": 1134 }, { "epoch": 0.17651632970451012, "grad_norm": 1.4735357191672043, "learning_rate": 9.250707390448187e-06, "loss": 0.2377, "step": 1135 }, { "epoch": 0.17667185069984448, "grad_norm": 0.957023692443933, "learning_rate": 9.24942055404272e-06, "loss": 0.1319, "step": 1136 }, { "epoch": 0.17682737169517884, "grad_norm": 0.9533362941250507, "learning_rate": 9.248132703241306e-06, "loss": 0.142, "step": 1137 }, { "epoch": 0.17698289269051323, "grad_norm": 1.1321821260027138, "learning_rate": 9.246843838351371e-06, "loss": 0.185, "step": 1138 }, { "epoch": 0.1771384136858476, "grad_norm": 0.6564569809439412, "learning_rate": 9.24555395968059e-06, "loss": 0.1511, "step": 1139 }, { "epoch": 0.17729393468118196, "grad_norm": 0.8235534803965409, "learning_rate": 9.244263067536872e-06, "loss": 0.1851, "step": 1140 }, { "epoch": 0.17744945567651632, "grad_norm": 0.97851675810554, "learning_rate": 9.24297116222837e-06, "loss": 0.2184, "step": 1141 }, { "epoch": 0.1776049766718507, "grad_norm": 1.1485004351012151, "learning_rate": 9.241678244063482e-06, "loss": 0.2106, "step": 1142 }, { "epoch": 0.17776049766718507, "grad_norm": 1.081146125371241, "learning_rate": 9.240384313350845e-06, "loss": 0.1844, "step": 1143 }, { "epoch": 0.17791601866251944, "grad_norm": 1.4013409835542678, "learning_rate": 9.239089370399338e-06, "loss": 0.2538, "step": 1144 }, { "epoch": 0.1780715396578538, "grad_norm": 6.587281038828778, "learning_rate": 9.237793415518083e-06, "loss": 0.2319, "step": 1145 }, { "epoch": 0.1782270606531882, "grad_norm": 1.463087775034242, "learning_rate": 9.23649644901644e-06, "loss": 0.1833, "step": 1146 }, { "epoch": 0.17838258164852255, "grad_norm": 0.8603221586452274, "learning_rate": 9.235198471204017e-06, "loss": 0.1652, "step": 1147 }, { "epoch": 0.17853810264385692, "grad_norm": 1.243900965186844, "learning_rate": 9.233899482390654e-06, "loss": 0.1688, "step": 1148 }, { "epoch": 0.17869362363919128, "grad_norm": 2.2219504182745964, "learning_rate": 9.232599482886444e-06, "loss": 0.2472, "step": 1149 }, { "epoch": 0.17884914463452567, "grad_norm": 0.8152250444616337, "learning_rate": 9.23129847300171e-06, "loss": 0.1542, "step": 1150 }, { "epoch": 0.17900466562986003, "grad_norm": 0.8972000242254355, "learning_rate": 9.229996453047022e-06, "loss": 0.1914, "step": 1151 }, { "epoch": 0.1791601866251944, "grad_norm": 1.3946215944007783, "learning_rate": 9.228693423333192e-06, "loss": 0.2517, "step": 1152 }, { "epoch": 0.17931570762052876, "grad_norm": 1.7211813642698215, "learning_rate": 9.227389384171272e-06, "loss": 0.1639, "step": 1153 }, { "epoch": 0.17947122861586315, "grad_norm": 1.045567391255685, "learning_rate": 9.22608433587255e-06, "loss": 0.1269, "step": 1154 }, { "epoch": 0.1796267496111975, "grad_norm": 1.6046875031988923, "learning_rate": 9.224778278748567e-06, "loss": 0.279, "step": 1155 }, { "epoch": 0.17978227060653187, "grad_norm": 1.204453994991899, "learning_rate": 9.223471213111089e-06, "loss": 0.1925, "step": 1156 }, { "epoch": 0.17993779160186626, "grad_norm": 1.3023690662744187, "learning_rate": 9.222163139272134e-06, "loss": 0.1788, "step": 1157 }, { "epoch": 0.18009331259720063, "grad_norm": 1.1433449264456945, "learning_rate": 9.220854057543958e-06, "loss": 0.2228, "step": 1158 }, { "epoch": 0.180248833592535, "grad_norm": 1.2657407961939997, "learning_rate": 9.219543968239057e-06, "loss": 0.1985, "step": 1159 }, { "epoch": 0.18040435458786935, "grad_norm": 1.0010295228905417, "learning_rate": 9.218232871670168e-06, "loss": 0.1976, "step": 1160 }, { "epoch": 0.18055987558320374, "grad_norm": 1.0125003198196167, "learning_rate": 9.216920768150266e-06, "loss": 0.1886, "step": 1161 }, { "epoch": 0.1807153965785381, "grad_norm": 1.2238980097949077, "learning_rate": 9.215607657992569e-06, "loss": 0.2848, "step": 1162 }, { "epoch": 0.18087091757387247, "grad_norm": 1.6192762802858285, "learning_rate": 9.214293541510537e-06, "loss": 0.1714, "step": 1163 }, { "epoch": 0.18102643856920683, "grad_norm": 1.0273533259054548, "learning_rate": 9.212978419017864e-06, "loss": 0.2001, "step": 1164 }, { "epoch": 0.18118195956454122, "grad_norm": 1.1061300881511378, "learning_rate": 9.211662290828493e-06, "loss": 0.2214, "step": 1165 }, { "epoch": 0.18133748055987559, "grad_norm": 1.061080909625091, "learning_rate": 9.210345157256597e-06, "loss": 0.1914, "step": 1166 }, { "epoch": 0.18149300155520995, "grad_norm": 0.6997117059310394, "learning_rate": 9.209027018616598e-06, "loss": 0.1434, "step": 1167 }, { "epoch": 0.1816485225505443, "grad_norm": 2.4894170703666125, "learning_rate": 9.207707875223153e-06, "loss": 0.154, "step": 1168 }, { "epoch": 0.1818040435458787, "grad_norm": 1.294402841120763, "learning_rate": 9.20638772739116e-06, "loss": 0.1398, "step": 1169 }, { "epoch": 0.18195956454121306, "grad_norm": 1.4691556974020672, "learning_rate": 9.205066575435754e-06, "loss": 0.2599, "step": 1170 }, { "epoch": 0.18211508553654743, "grad_norm": 1.5109698106731952, "learning_rate": 9.203744419672318e-06, "loss": 0.2715, "step": 1171 }, { "epoch": 0.1822706065318818, "grad_norm": 0.7824851605920647, "learning_rate": 9.202421260416464e-06, "loss": 0.155, "step": 1172 }, { "epoch": 0.18242612752721618, "grad_norm": 1.229104135640711, "learning_rate": 9.20109709798405e-06, "loss": 0.173, "step": 1173 }, { "epoch": 0.18258164852255054, "grad_norm": 1.493187696337834, "learning_rate": 9.199771932691172e-06, "loss": 0.1874, "step": 1174 }, { "epoch": 0.1827371695178849, "grad_norm": 1.3355865457774434, "learning_rate": 9.198445764854166e-06, "loss": 0.1868, "step": 1175 }, { "epoch": 0.1828926905132193, "grad_norm": 1.4822915990950787, "learning_rate": 9.19711859478961e-06, "loss": 0.1936, "step": 1176 }, { "epoch": 0.18304821150855366, "grad_norm": 1.3568523945836255, "learning_rate": 9.19579042281431e-06, "loss": 0.2351, "step": 1177 }, { "epoch": 0.18320373250388802, "grad_norm": 1.1221237537622042, "learning_rate": 9.194461249245326e-06, "loss": 0.1651, "step": 1178 }, { "epoch": 0.18335925349922239, "grad_norm": 1.0427220049147299, "learning_rate": 9.193131074399949e-06, "loss": 0.2095, "step": 1179 }, { "epoch": 0.18351477449455678, "grad_norm": 1.1443234808493088, "learning_rate": 9.191799898595706e-06, "loss": 0.1987, "step": 1180 }, { "epoch": 0.18367029548989114, "grad_norm": 0.8812799774315752, "learning_rate": 9.190467722150373e-06, "loss": 0.2529, "step": 1181 }, { "epoch": 0.1838258164852255, "grad_norm": 0.9190808713383141, "learning_rate": 9.189134545381954e-06, "loss": 0.2043, "step": 1182 }, { "epoch": 0.18398133748055986, "grad_norm": 1.1496814316391453, "learning_rate": 9.187800368608703e-06, "loss": 0.2166, "step": 1183 }, { "epoch": 0.18413685847589426, "grad_norm": 1.3800541644049227, "learning_rate": 9.1864651921491e-06, "loss": 0.2258, "step": 1184 }, { "epoch": 0.18429237947122862, "grad_norm": 0.91743359427612, "learning_rate": 9.185129016321877e-06, "loss": 0.1383, "step": 1185 }, { "epoch": 0.18444790046656298, "grad_norm": 1.4610869068656602, "learning_rate": 9.18379184144599e-06, "loss": 0.1508, "step": 1186 }, { "epoch": 0.18460342146189734, "grad_norm": 1.675711445184492, "learning_rate": 9.18245366784065e-06, "loss": 0.303, "step": 1187 }, { "epoch": 0.18475894245723173, "grad_norm": 0.7182617914658281, "learning_rate": 9.18111449582529e-06, "loss": 0.1663, "step": 1188 }, { "epoch": 0.1849144634525661, "grad_norm": 2.5919566299762105, "learning_rate": 9.179774325719593e-06, "loss": 0.1913, "step": 1189 }, { "epoch": 0.18506998444790046, "grad_norm": 1.5246187638405735, "learning_rate": 9.178433157843474e-06, "loss": 0.1974, "step": 1190 }, { "epoch": 0.18522550544323485, "grad_norm": 1.1870049850604156, "learning_rate": 9.17709099251709e-06, "loss": 0.1889, "step": 1191 }, { "epoch": 0.1853810264385692, "grad_norm": 1.2780528349437963, "learning_rate": 9.175747830060837e-06, "loss": 0.1682, "step": 1192 }, { "epoch": 0.18553654743390358, "grad_norm": 0.852064776201917, "learning_rate": 9.174403670795342e-06, "loss": 0.1786, "step": 1193 }, { "epoch": 0.18569206842923794, "grad_norm": 0.982736851978155, "learning_rate": 9.173058515041477e-06, "loss": 0.1759, "step": 1194 }, { "epoch": 0.18584758942457233, "grad_norm": 5.383045313258924, "learning_rate": 9.171712363120351e-06, "loss": 0.3862, "step": 1195 }, { "epoch": 0.1860031104199067, "grad_norm": 0.9849374073337689, "learning_rate": 9.170365215353306e-06, "loss": 0.1981, "step": 1196 }, { "epoch": 0.18615863141524105, "grad_norm": 1.1001803535527055, "learning_rate": 9.169017072061926e-06, "loss": 0.1989, "step": 1197 }, { "epoch": 0.18631415241057542, "grad_norm": 1.1570335250140034, "learning_rate": 9.167667933568032e-06, "loss": 0.1822, "step": 1198 }, { "epoch": 0.1864696734059098, "grad_norm": 1.6984581879530103, "learning_rate": 9.166317800193683e-06, "loss": 0.2171, "step": 1199 }, { "epoch": 0.18662519440124417, "grad_norm": 1.650860536979747, "learning_rate": 9.164966672261171e-06, "loss": 0.3055, "step": 1200 }, { "epoch": 0.18662519440124417, "eval_loss": 0.210090771317482, "eval_runtime": 9.4293, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 1200 }, { "epoch": 0.18678071539657853, "grad_norm": 1.416615143797259, "learning_rate": 9.163614550093035e-06, "loss": 0.1347, "step": 1201 }, { "epoch": 0.1869362363919129, "grad_norm": 1.3794733777830905, "learning_rate": 9.16226143401204e-06, "loss": 0.2041, "step": 1202 }, { "epoch": 0.1870917573872473, "grad_norm": 0.9282701621282511, "learning_rate": 9.160907324341199e-06, "loss": 0.1589, "step": 1203 }, { "epoch": 0.18724727838258165, "grad_norm": 1.4894253244171338, "learning_rate": 9.159552221403752e-06, "loss": 0.174, "step": 1204 }, { "epoch": 0.187402799377916, "grad_norm": 1.1504157025776975, "learning_rate": 9.158196125523182e-06, "loss": 0.1942, "step": 1205 }, { "epoch": 0.18755832037325038, "grad_norm": 0.7255523870962133, "learning_rate": 9.156839037023209e-06, "loss": 0.1925, "step": 1206 }, { "epoch": 0.18771384136858477, "grad_norm": 1.3297160614851913, "learning_rate": 9.155480956227789e-06, "loss": 0.2448, "step": 1207 }, { "epoch": 0.18786936236391913, "grad_norm": 1.2394203928257357, "learning_rate": 9.154121883461115e-06, "loss": 0.1644, "step": 1208 }, { "epoch": 0.1880248833592535, "grad_norm": 1.110942304313815, "learning_rate": 9.152761819047617e-06, "loss": 0.158, "step": 1209 }, { "epoch": 0.18818040435458788, "grad_norm": 0.8597754146450871, "learning_rate": 9.151400763311958e-06, "loss": 0.1765, "step": 1210 }, { "epoch": 0.18833592534992225, "grad_norm": 1.1244255534137637, "learning_rate": 9.150038716579046e-06, "loss": 0.14, "step": 1211 }, { "epoch": 0.1884914463452566, "grad_norm": 0.9441808017939254, "learning_rate": 9.148675679174017e-06, "loss": 0.1685, "step": 1212 }, { "epoch": 0.18864696734059097, "grad_norm": 1.49569762403274, "learning_rate": 9.147311651422248e-06, "loss": 0.1637, "step": 1213 }, { "epoch": 0.18880248833592536, "grad_norm": 1.0568658204953814, "learning_rate": 9.145946633649352e-06, "loss": 0.1713, "step": 1214 }, { "epoch": 0.18895800933125972, "grad_norm": 1.2127109888393217, "learning_rate": 9.144580626181176e-06, "loss": 0.161, "step": 1215 }, { "epoch": 0.1891135303265941, "grad_norm": 0.8503234486008238, "learning_rate": 9.143213629343807e-06, "loss": 0.1489, "step": 1216 }, { "epoch": 0.18926905132192845, "grad_norm": 0.9120088478974758, "learning_rate": 9.141845643463565e-06, "loss": 0.1939, "step": 1217 }, { "epoch": 0.18942457231726284, "grad_norm": 1.0121267789823751, "learning_rate": 9.140476668867008e-06, "loss": 0.15, "step": 1218 }, { "epoch": 0.1895800933125972, "grad_norm": 1.3638566134338714, "learning_rate": 9.13910670588093e-06, "loss": 0.2105, "step": 1219 }, { "epoch": 0.18973561430793157, "grad_norm": 1.6276021550806605, "learning_rate": 9.13773575483236e-06, "loss": 0.2869, "step": 1220 }, { "epoch": 0.18989113530326593, "grad_norm": 1.6764188720931026, "learning_rate": 9.136363816048562e-06, "loss": 0.1458, "step": 1221 }, { "epoch": 0.19004665629860032, "grad_norm": 0.6701780576831128, "learning_rate": 9.134990889857036e-06, "loss": 0.1842, "step": 1222 }, { "epoch": 0.19020217729393468, "grad_norm": 1.1322931167082202, "learning_rate": 9.133616976585522e-06, "loss": 0.2556, "step": 1223 }, { "epoch": 0.19035769828926905, "grad_norm": 1.2524154763717683, "learning_rate": 9.13224207656199e-06, "loss": 0.2104, "step": 1224 }, { "epoch": 0.1905132192846034, "grad_norm": 0.9592897430767787, "learning_rate": 9.130866190114649e-06, "loss": 0.2833, "step": 1225 }, { "epoch": 0.1906687402799378, "grad_norm": 1.7651472837705433, "learning_rate": 9.12948931757194e-06, "loss": 0.2524, "step": 1226 }, { "epoch": 0.19082426127527216, "grad_norm": 0.9879072001537496, "learning_rate": 9.128111459262543e-06, "loss": 0.1624, "step": 1227 }, { "epoch": 0.19097978227060652, "grad_norm": 1.320308534660155, "learning_rate": 9.126732615515373e-06, "loss": 0.2937, "step": 1228 }, { "epoch": 0.19113530326594091, "grad_norm": 1.6528470759003213, "learning_rate": 9.125352786659577e-06, "loss": 0.1824, "step": 1229 }, { "epoch": 0.19129082426127528, "grad_norm": 1.099113810582022, "learning_rate": 9.123971973024543e-06, "loss": 0.2282, "step": 1230 }, { "epoch": 0.19144634525660964, "grad_norm": 0.9906932002367946, "learning_rate": 9.122590174939887e-06, "loss": 0.1908, "step": 1231 }, { "epoch": 0.191601866251944, "grad_norm": 1.3700619269813867, "learning_rate": 9.121207392735465e-06, "loss": 0.1736, "step": 1232 }, { "epoch": 0.1917573872472784, "grad_norm": 0.9132669255091096, "learning_rate": 9.119823626741367e-06, "loss": 0.2559, "step": 1233 }, { "epoch": 0.19191290824261276, "grad_norm": 1.0158832597362466, "learning_rate": 9.118438877287913e-06, "loss": 0.218, "step": 1234 }, { "epoch": 0.19206842923794712, "grad_norm": 0.9172450560816615, "learning_rate": 9.11705314470567e-06, "loss": 0.2038, "step": 1235 }, { "epoch": 0.19222395023328148, "grad_norm": 1.0457809289045787, "learning_rate": 9.115666429325424e-06, "loss": 0.2383, "step": 1236 }, { "epoch": 0.19237947122861587, "grad_norm": 0.6123808194220389, "learning_rate": 9.114278731478207e-06, "loss": 0.1059, "step": 1237 }, { "epoch": 0.19253499222395024, "grad_norm": 0.8957445923668392, "learning_rate": 9.112890051495281e-06, "loss": 0.1753, "step": 1238 }, { "epoch": 0.1926905132192846, "grad_norm": 1.010302756648279, "learning_rate": 9.111500389708144e-06, "loss": 0.2162, "step": 1239 }, { "epoch": 0.19284603421461896, "grad_norm": 1.26307408847368, "learning_rate": 9.110109746448527e-06, "loss": 0.1901, "step": 1240 }, { "epoch": 0.19300155520995335, "grad_norm": 1.237621554432501, "learning_rate": 9.108718122048395e-06, "loss": 0.1746, "step": 1241 }, { "epoch": 0.19315707620528771, "grad_norm": 0.9172927280641415, "learning_rate": 9.107325516839952e-06, "loss": 0.1556, "step": 1242 }, { "epoch": 0.19331259720062208, "grad_norm": 1.7268710214147918, "learning_rate": 9.105931931155626e-06, "loss": 0.2808, "step": 1243 }, { "epoch": 0.19346811819595647, "grad_norm": 0.8932022562830918, "learning_rate": 9.10453736532809e-06, "loss": 0.1527, "step": 1244 }, { "epoch": 0.19362363919129083, "grad_norm": 1.2202712676463288, "learning_rate": 9.103141819690246e-06, "loss": 0.1376, "step": 1245 }, { "epoch": 0.1937791601866252, "grad_norm": 1.0826681500025592, "learning_rate": 9.101745294575227e-06, "loss": 0.1449, "step": 1246 }, { "epoch": 0.19393468118195956, "grad_norm": 1.1807575757930213, "learning_rate": 9.100347790316409e-06, "loss": 0.2126, "step": 1247 }, { "epoch": 0.19409020217729395, "grad_norm": 0.941763687751761, "learning_rate": 9.098949307247391e-06, "loss": 0.1632, "step": 1248 }, { "epoch": 0.1942457231726283, "grad_norm": 1.378441641768549, "learning_rate": 9.097549845702009e-06, "loss": 0.1906, "step": 1249 }, { "epoch": 0.19440124416796267, "grad_norm": 1.2339116886059447, "learning_rate": 9.09614940601434e-06, "loss": 0.2006, "step": 1250 }, { "epoch": 0.19455676516329704, "grad_norm": 1.1239344680494445, "learning_rate": 9.094747988518683e-06, "loss": 0.2336, "step": 1251 }, { "epoch": 0.19471228615863143, "grad_norm": 0.927588276459713, "learning_rate": 9.093345593549579e-06, "loss": 0.1449, "step": 1252 }, { "epoch": 0.1948678071539658, "grad_norm": 1.13724282637735, "learning_rate": 9.091942221441797e-06, "loss": 0.2126, "step": 1253 }, { "epoch": 0.19502332814930015, "grad_norm": 1.0365698182525573, "learning_rate": 9.090537872530343e-06, "loss": 0.1867, "step": 1254 }, { "epoch": 0.19517884914463451, "grad_norm": 0.9371814591941575, "learning_rate": 9.089132547150453e-06, "loss": 0.1618, "step": 1255 }, { "epoch": 0.1953343701399689, "grad_norm": 1.0697225550230685, "learning_rate": 9.0877262456376e-06, "loss": 0.1849, "step": 1256 }, { "epoch": 0.19548989113530327, "grad_norm": 1.5583498729530745, "learning_rate": 9.086318968327488e-06, "loss": 0.2014, "step": 1257 }, { "epoch": 0.19564541213063763, "grad_norm": 1.2271229677253923, "learning_rate": 9.084910715556052e-06, "loss": 0.2017, "step": 1258 }, { "epoch": 0.195800933125972, "grad_norm": 1.0026506309270833, "learning_rate": 9.083501487659461e-06, "loss": 0.1646, "step": 1259 }, { "epoch": 0.19595645412130638, "grad_norm": 1.2598951391108157, "learning_rate": 9.08209128497412e-06, "loss": 0.1851, "step": 1260 }, { "epoch": 0.19611197511664075, "grad_norm": 1.5838356552966606, "learning_rate": 9.080680107836662e-06, "loss": 0.1948, "step": 1261 }, { "epoch": 0.1962674961119751, "grad_norm": 1.1087104243969894, "learning_rate": 9.079267956583953e-06, "loss": 0.1687, "step": 1262 }, { "epoch": 0.1964230171073095, "grad_norm": 1.6020412697904411, "learning_rate": 9.077854831553097e-06, "loss": 0.1854, "step": 1263 }, { "epoch": 0.19657853810264386, "grad_norm": 1.0315547992066338, "learning_rate": 9.076440733081426e-06, "loss": 0.2211, "step": 1264 }, { "epoch": 0.19673405909797823, "grad_norm": 1.0349194289967332, "learning_rate": 9.075025661506505e-06, "loss": 0.182, "step": 1265 }, { "epoch": 0.1968895800933126, "grad_norm": 0.8148640872234216, "learning_rate": 9.073609617166129e-06, "loss": 0.2319, "step": 1266 }, { "epoch": 0.19704510108864698, "grad_norm": 0.8956967698145264, "learning_rate": 9.072192600398328e-06, "loss": 0.2318, "step": 1267 }, { "epoch": 0.19720062208398134, "grad_norm": 1.512397062737358, "learning_rate": 9.070774611541366e-06, "loss": 0.1279, "step": 1268 }, { "epoch": 0.1973561430793157, "grad_norm": 1.089155641459757, "learning_rate": 9.069355650933732e-06, "loss": 0.132, "step": 1269 }, { "epoch": 0.19751166407465007, "grad_norm": 1.15341700389814, "learning_rate": 9.06793571891416e-06, "loss": 0.1416, "step": 1270 }, { "epoch": 0.19766718506998446, "grad_norm": 1.2188604321419376, "learning_rate": 9.0665148158216e-06, "loss": 0.1635, "step": 1271 }, { "epoch": 0.19782270606531882, "grad_norm": 1.6133883720632236, "learning_rate": 9.065092941995245e-06, "loss": 0.185, "step": 1272 }, { "epoch": 0.19797822706065318, "grad_norm": 1.4486872766212289, "learning_rate": 9.063670097774513e-06, "loss": 0.2325, "step": 1273 }, { "epoch": 0.19813374805598755, "grad_norm": 1.557263365124596, "learning_rate": 9.062246283499058e-06, "loss": 0.1712, "step": 1274 }, { "epoch": 0.19828926905132194, "grad_norm": 1.9875754585690109, "learning_rate": 9.060821499508769e-06, "loss": 0.1843, "step": 1275 }, { "epoch": 0.1984447900466563, "grad_norm": 1.1418131416263584, "learning_rate": 9.059395746143756e-06, "loss": 0.1777, "step": 1276 }, { "epoch": 0.19860031104199066, "grad_norm": 1.0395361627239141, "learning_rate": 9.057969023744367e-06, "loss": 0.2194, "step": 1277 }, { "epoch": 0.19875583203732503, "grad_norm": 1.305159234748547, "learning_rate": 9.056541332651183e-06, "loss": 0.2141, "step": 1278 }, { "epoch": 0.19891135303265942, "grad_norm": 1.0849932011185046, "learning_rate": 9.055112673205014e-06, "loss": 0.1821, "step": 1279 }, { "epoch": 0.19906687402799378, "grad_norm": 0.979089764226756, "learning_rate": 9.053683045746897e-06, "loss": 0.269, "step": 1280 }, { "epoch": 0.19922239502332814, "grad_norm": 1.078405593629792, "learning_rate": 9.052252450618106e-06, "loss": 0.1413, "step": 1281 }, { "epoch": 0.19937791601866253, "grad_norm": 1.2031448135959215, "learning_rate": 9.050820888160145e-06, "loss": 0.2268, "step": 1282 }, { "epoch": 0.1995334370139969, "grad_norm": 0.9432997632179643, "learning_rate": 9.049388358714747e-06, "loss": 0.0856, "step": 1283 }, { "epoch": 0.19968895800933126, "grad_norm": 1.1798467376681538, "learning_rate": 9.04795486262388e-06, "loss": 0.1487, "step": 1284 }, { "epoch": 0.19984447900466562, "grad_norm": 0.9959594825238516, "learning_rate": 9.046520400229734e-06, "loss": 0.1363, "step": 1285 }, { "epoch": 0.2, "grad_norm": 1.2777597650080654, "learning_rate": 9.045084971874738e-06, "loss": 0.2053, "step": 1286 }, { "epoch": 0.20015552099533437, "grad_norm": 1.3807813898572032, "learning_rate": 9.04364857790155e-06, "loss": 0.1608, "step": 1287 }, { "epoch": 0.20031104199066874, "grad_norm": 1.213101350130223, "learning_rate": 9.042211218653054e-06, "loss": 0.1783, "step": 1288 }, { "epoch": 0.2004665629860031, "grad_norm": 1.270497799974636, "learning_rate": 9.040772894472369e-06, "loss": 0.1335, "step": 1289 }, { "epoch": 0.2006220839813375, "grad_norm": 1.143678584624158, "learning_rate": 9.039333605702844e-06, "loss": 0.2566, "step": 1290 }, { "epoch": 0.20077760497667185, "grad_norm": 0.9321591383595857, "learning_rate": 9.03789335268806e-06, "loss": 0.1517, "step": 1291 }, { "epoch": 0.20093312597200622, "grad_norm": 0.8482625172580437, "learning_rate": 9.036452135771818e-06, "loss": 0.2284, "step": 1292 }, { "epoch": 0.20108864696734058, "grad_norm": 1.5799008472731184, "learning_rate": 9.035009955298163e-06, "loss": 0.2491, "step": 1293 }, { "epoch": 0.20124416796267497, "grad_norm": 1.5021594414320747, "learning_rate": 9.03356681161136e-06, "loss": 0.1623, "step": 1294 }, { "epoch": 0.20139968895800933, "grad_norm": 1.1207507593154515, "learning_rate": 9.032122705055912e-06, "loss": 0.1996, "step": 1295 }, { "epoch": 0.2015552099533437, "grad_norm": 1.1753346897113919, "learning_rate": 9.030677635976542e-06, "loss": 0.156, "step": 1296 }, { "epoch": 0.20171073094867809, "grad_norm": 1.582912014985177, "learning_rate": 9.02923160471821e-06, "loss": 0.2852, "step": 1297 }, { "epoch": 0.20186625194401245, "grad_norm": 4.24419003235004, "learning_rate": 9.027784611626108e-06, "loss": 0.1857, "step": 1298 }, { "epoch": 0.2020217729393468, "grad_norm": 3.465507316165179, "learning_rate": 9.026336657045646e-06, "loss": 0.1331, "step": 1299 }, { "epoch": 0.20217729393468117, "grad_norm": 0.8992554022243577, "learning_rate": 9.024887741322475e-06, "loss": 0.1649, "step": 1300 }, { "epoch": 0.20217729393468117, "eval_loss": 0.20873166620731354, "eval_runtime": 9.4107, "eval_samples_per_second": 2.763, "eval_steps_per_second": 0.744, "step": 1300 }, { "epoch": 0.20233281493001556, "grad_norm": 1.2089278079623347, "learning_rate": 9.023437864802472e-06, "loss": 0.2705, "step": 1301 }, { "epoch": 0.20248833592534993, "grad_norm": 1.2901991665649666, "learning_rate": 9.021987027831743e-06, "loss": 0.1672, "step": 1302 }, { "epoch": 0.2026438569206843, "grad_norm": 1.5354719963652408, "learning_rate": 9.02053523075662e-06, "loss": 0.239, "step": 1303 }, { "epoch": 0.20279937791601865, "grad_norm": 1.214882523492219, "learning_rate": 9.01908247392367e-06, "loss": 0.1566, "step": 1304 }, { "epoch": 0.20295489891135304, "grad_norm": 1.473765899129253, "learning_rate": 9.017628757679685e-06, "loss": 0.1931, "step": 1305 }, { "epoch": 0.2031104199066874, "grad_norm": 2.6517165969707683, "learning_rate": 9.01617408237169e-06, "loss": 0.1307, "step": 1306 }, { "epoch": 0.20326594090202177, "grad_norm": 1.4993932954062734, "learning_rate": 9.01471844834693e-06, "loss": 0.2079, "step": 1307 }, { "epoch": 0.20342146189735613, "grad_norm": 1.0866992812991043, "learning_rate": 9.013261855952893e-06, "loss": 0.2361, "step": 1308 }, { "epoch": 0.20357698289269052, "grad_norm": 1.4691858213747517, "learning_rate": 9.011804305537281e-06, "loss": 0.2062, "step": 1309 }, { "epoch": 0.20373250388802489, "grad_norm": 1.218397331201916, "learning_rate": 9.010345797448037e-06, "loss": 0.1295, "step": 1310 }, { "epoch": 0.20388802488335925, "grad_norm": 1.317910015288317, "learning_rate": 9.008886332033323e-06, "loss": 0.221, "step": 1311 }, { "epoch": 0.2040435458786936, "grad_norm": 1.4368413534493716, "learning_rate": 9.007425909641538e-06, "loss": 0.3292, "step": 1312 }, { "epoch": 0.204199066874028, "grad_norm": 1.24467623609956, "learning_rate": 9.005964530621301e-06, "loss": 0.2276, "step": 1313 }, { "epoch": 0.20435458786936236, "grad_norm": 0.9849662601801316, "learning_rate": 9.004502195321468e-06, "loss": 0.1825, "step": 1314 }, { "epoch": 0.20451010886469673, "grad_norm": 3.783152250453029, "learning_rate": 9.003038904091113e-06, "loss": 0.1834, "step": 1315 }, { "epoch": 0.20466562986003112, "grad_norm": 1.0234608190416166, "learning_rate": 9.001574657279548e-06, "loss": 0.2172, "step": 1316 }, { "epoch": 0.20482115085536548, "grad_norm": 1.3240981295825394, "learning_rate": 9.00010945523631e-06, "loss": 0.1857, "step": 1317 }, { "epoch": 0.20497667185069984, "grad_norm": 1.1823107793426477, "learning_rate": 8.99864329831116e-06, "loss": 0.2747, "step": 1318 }, { "epoch": 0.2051321928460342, "grad_norm": 1.183188676477308, "learning_rate": 8.997176186854091e-06, "loss": 0.2091, "step": 1319 }, { "epoch": 0.2052877138413686, "grad_norm": 1.1306812200844953, "learning_rate": 8.995708121215325e-06, "loss": 0.1789, "step": 1320 }, { "epoch": 0.20544323483670296, "grad_norm": 1.3914844708441778, "learning_rate": 8.994239101745309e-06, "loss": 0.1626, "step": 1321 }, { "epoch": 0.20559875583203732, "grad_norm": 1.3328736681097808, "learning_rate": 8.992769128794717e-06, "loss": 0.1699, "step": 1322 }, { "epoch": 0.20575427682737168, "grad_norm": 1.3262550452320387, "learning_rate": 8.991298202714453e-06, "loss": 0.1985, "step": 1323 }, { "epoch": 0.20590979782270608, "grad_norm": 1.5863201904107513, "learning_rate": 8.989826323855647e-06, "loss": 0.2729, "step": 1324 }, { "epoch": 0.20606531881804044, "grad_norm": 1.0484153422588192, "learning_rate": 8.988353492569657e-06, "loss": 0.2243, "step": 1325 }, { "epoch": 0.2062208398133748, "grad_norm": 0.9724310873787251, "learning_rate": 8.986879709208069e-06, "loss": 0.2349, "step": 1326 }, { "epoch": 0.20637636080870916, "grad_norm": 1.319839764006134, "learning_rate": 8.985404974122699e-06, "loss": 0.1796, "step": 1327 }, { "epoch": 0.20653188180404355, "grad_norm": 1.7134943634197457, "learning_rate": 8.983929287665579e-06, "loss": 0.2289, "step": 1328 }, { "epoch": 0.20668740279937792, "grad_norm": 1.1812406274342315, "learning_rate": 8.98245265018898e-06, "loss": 0.2123, "step": 1329 }, { "epoch": 0.20684292379471228, "grad_norm": 1.4771839041530355, "learning_rate": 8.980975062045398e-06, "loss": 0.2228, "step": 1330 }, { "epoch": 0.20699844479004664, "grad_norm": 1.0073337669892177, "learning_rate": 8.979496523587552e-06, "loss": 0.1455, "step": 1331 }, { "epoch": 0.20715396578538103, "grad_norm": 0.8665969448850475, "learning_rate": 8.978017035168389e-06, "loss": 0.1689, "step": 1332 }, { "epoch": 0.2073094867807154, "grad_norm": 1.0555827692971853, "learning_rate": 8.976536597141085e-06, "loss": 0.1708, "step": 1333 }, { "epoch": 0.20746500777604976, "grad_norm": 0.8842215270037568, "learning_rate": 8.97505520985904e-06, "loss": 0.1751, "step": 1334 }, { "epoch": 0.20762052877138415, "grad_norm": 1.6924145041248846, "learning_rate": 8.973572873675882e-06, "loss": 0.1697, "step": 1335 }, { "epoch": 0.2077760497667185, "grad_norm": 1.1225115788471978, "learning_rate": 8.972089588945467e-06, "loss": 0.22, "step": 1336 }, { "epoch": 0.20793157076205288, "grad_norm": 1.1964311921620439, "learning_rate": 8.970605356021873e-06, "loss": 0.1953, "step": 1337 }, { "epoch": 0.20808709175738724, "grad_norm": 1.1874827397504135, "learning_rate": 8.96912017525941e-06, "loss": 0.1541, "step": 1338 }, { "epoch": 0.20824261275272163, "grad_norm": 1.2996586003784654, "learning_rate": 8.967634047012607e-06, "loss": 0.2543, "step": 1339 }, { "epoch": 0.208398133748056, "grad_norm": 1.9568915465615424, "learning_rate": 8.96614697163623e-06, "loss": 0.1742, "step": 1340 }, { "epoch": 0.20855365474339035, "grad_norm": 1.327702070183964, "learning_rate": 8.96465894948526e-06, "loss": 0.1688, "step": 1341 }, { "epoch": 0.20870917573872472, "grad_norm": 0.998729186682604, "learning_rate": 8.963169980914908e-06, "loss": 0.2165, "step": 1342 }, { "epoch": 0.2088646967340591, "grad_norm": 0.9250328323650552, "learning_rate": 8.961680066280614e-06, "loss": 0.1978, "step": 1343 }, { "epoch": 0.20902021772939347, "grad_norm": 0.685484375204563, "learning_rate": 8.96018920593804e-06, "loss": 0.1521, "step": 1344 }, { "epoch": 0.20917573872472783, "grad_norm": 1.2194077898180222, "learning_rate": 8.958697400243077e-06, "loss": 0.129, "step": 1345 }, { "epoch": 0.2093312597200622, "grad_norm": 1.3390006867631312, "learning_rate": 8.957204649551838e-06, "loss": 0.2295, "step": 1346 }, { "epoch": 0.2094867807153966, "grad_norm": 1.0791715779616644, "learning_rate": 8.955710954220664e-06, "loss": 0.1922, "step": 1347 }, { "epoch": 0.20964230171073095, "grad_norm": 1.0448818497216468, "learning_rate": 8.954216314606123e-06, "loss": 0.2074, "step": 1348 }, { "epoch": 0.2097978227060653, "grad_norm": 1.0968024521734823, "learning_rate": 8.952720731065e-06, "loss": 0.1956, "step": 1349 }, { "epoch": 0.2099533437013997, "grad_norm": 1.1729159260054676, "learning_rate": 8.95122420395432e-06, "loss": 0.1032, "step": 1350 }, { "epoch": 0.21010886469673407, "grad_norm": 0.7605452577854958, "learning_rate": 8.949726733631319e-06, "loss": 0.2173, "step": 1351 }, { "epoch": 0.21026438569206843, "grad_norm": 0.7896405561018206, "learning_rate": 8.948228320453465e-06, "loss": 0.1411, "step": 1352 }, { "epoch": 0.2104199066874028, "grad_norm": 1.3664851820052848, "learning_rate": 8.946728964778452e-06, "loss": 0.2043, "step": 1353 }, { "epoch": 0.21057542768273718, "grad_norm": 1.0930532560076165, "learning_rate": 8.945228666964197e-06, "loss": 0.2112, "step": 1354 }, { "epoch": 0.21073094867807154, "grad_norm": 1.3370376996193614, "learning_rate": 8.94372742736884e-06, "loss": 0.2763, "step": 1355 }, { "epoch": 0.2108864696734059, "grad_norm": 1.1733695403983486, "learning_rate": 8.942225246350748e-06, "loss": 0.1383, "step": 1356 }, { "epoch": 0.21104199066874027, "grad_norm": 1.518123240050466, "learning_rate": 8.940722124268515e-06, "loss": 0.2035, "step": 1357 }, { "epoch": 0.21119751166407466, "grad_norm": 0.7154774393150748, "learning_rate": 8.939218061480955e-06, "loss": 0.1513, "step": 1358 }, { "epoch": 0.21135303265940902, "grad_norm": 1.7277749667928948, "learning_rate": 8.937713058347109e-06, "loss": 0.1852, "step": 1359 }, { "epoch": 0.2115085536547434, "grad_norm": 0.8101754008908368, "learning_rate": 8.936207115226242e-06, "loss": 0.1755, "step": 1360 }, { "epoch": 0.21166407465007775, "grad_norm": 2.154263107894285, "learning_rate": 8.934700232477845e-06, "loss": 0.2284, "step": 1361 }, { "epoch": 0.21181959564541214, "grad_norm": 2.9946702775104552, "learning_rate": 8.933192410461632e-06, "loss": 0.1571, "step": 1362 }, { "epoch": 0.2119751166407465, "grad_norm": 1.3293853025848206, "learning_rate": 8.931683649537539e-06, "loss": 0.1818, "step": 1363 }, { "epoch": 0.21213063763608087, "grad_norm": 1.069623910831374, "learning_rate": 8.93017395006573e-06, "loss": 0.2389, "step": 1364 }, { "epoch": 0.21228615863141523, "grad_norm": 1.2692486168753456, "learning_rate": 8.928663312406593e-06, "loss": 0.1725, "step": 1365 }, { "epoch": 0.21244167962674962, "grad_norm": 2.31269662319102, "learning_rate": 8.927151736920733e-06, "loss": 0.3472, "step": 1366 }, { "epoch": 0.21259720062208398, "grad_norm": 1.3024374295612378, "learning_rate": 8.925639223968989e-06, "loss": 0.1601, "step": 1367 }, { "epoch": 0.21275272161741834, "grad_norm": 1.475662600105692, "learning_rate": 8.924125773912418e-06, "loss": 0.1652, "step": 1368 }, { "epoch": 0.21290824261275273, "grad_norm": 0.8719883727219597, "learning_rate": 8.9226113871123e-06, "loss": 0.2406, "step": 1369 }, { "epoch": 0.2130637636080871, "grad_norm": 1.355947295843189, "learning_rate": 8.921096063930141e-06, "loss": 0.2387, "step": 1370 }, { "epoch": 0.21321928460342146, "grad_norm": 1.462171782992857, "learning_rate": 8.919579804727671e-06, "loss": 0.2075, "step": 1371 }, { "epoch": 0.21337480559875582, "grad_norm": 1.4186556891621878, "learning_rate": 8.91806260986684e-06, "loss": 0.1906, "step": 1372 }, { "epoch": 0.21353032659409021, "grad_norm": 1.0297515081183366, "learning_rate": 8.916544479709826e-06, "loss": 0.1813, "step": 1373 }, { "epoch": 0.21368584758942458, "grad_norm": 0.8517207332254344, "learning_rate": 8.915025414619025e-06, "loss": 0.2314, "step": 1374 }, { "epoch": 0.21384136858475894, "grad_norm": 1.4500725099182117, "learning_rate": 8.91350541495706e-06, "loss": 0.2702, "step": 1375 }, { "epoch": 0.2139968895800933, "grad_norm": 1.4840249529134437, "learning_rate": 8.911984481086779e-06, "loss": 0.1957, "step": 1376 }, { "epoch": 0.2141524105754277, "grad_norm": 1.0812621557572404, "learning_rate": 8.910462613371246e-06, "loss": 0.1773, "step": 1377 }, { "epoch": 0.21430793157076206, "grad_norm": 0.8285771638848516, "learning_rate": 8.908939812173756e-06, "loss": 0.1879, "step": 1378 }, { "epoch": 0.21446345256609642, "grad_norm": 1.5413069191948623, "learning_rate": 8.907416077857818e-06, "loss": 0.2024, "step": 1379 }, { "epoch": 0.21461897356143078, "grad_norm": 1.5546998088262725, "learning_rate": 8.905891410787174e-06, "loss": 0.1297, "step": 1380 }, { "epoch": 0.21477449455676517, "grad_norm": 1.0276705986435684, "learning_rate": 8.904365811325779e-06, "loss": 0.1777, "step": 1381 }, { "epoch": 0.21493001555209953, "grad_norm": 2.186178551364591, "learning_rate": 8.902839279837818e-06, "loss": 0.1936, "step": 1382 }, { "epoch": 0.2150855365474339, "grad_norm": 1.409142378067793, "learning_rate": 8.901311816687693e-06, "loss": 0.2347, "step": 1383 }, { "epoch": 0.21524105754276826, "grad_norm": 0.909249039104448, "learning_rate": 8.899783422240031e-06, "loss": 0.1858, "step": 1384 }, { "epoch": 0.21539657853810265, "grad_norm": 1.389710830109919, "learning_rate": 8.898254096859681e-06, "loss": 0.2546, "step": 1385 }, { "epoch": 0.215552099533437, "grad_norm": 1.1722812780197163, "learning_rate": 8.896723840911718e-06, "loss": 0.2451, "step": 1386 }, { "epoch": 0.21570762052877138, "grad_norm": 1.0186256750739588, "learning_rate": 8.89519265476143e-06, "loss": 0.1423, "step": 1387 }, { "epoch": 0.21586314152410577, "grad_norm": 1.463755060922718, "learning_rate": 8.893660538774335e-06, "loss": 0.678, "step": 1388 }, { "epoch": 0.21601866251944013, "grad_norm": 1.2144290461428764, "learning_rate": 8.892127493316172e-06, "loss": 0.1289, "step": 1389 }, { "epoch": 0.2161741835147745, "grad_norm": 1.2754281076641276, "learning_rate": 8.8905935187529e-06, "loss": 0.1775, "step": 1390 }, { "epoch": 0.21632970451010886, "grad_norm": 0.8239843617970345, "learning_rate": 8.889058615450695e-06, "loss": 0.1379, "step": 1391 }, { "epoch": 0.21648522550544325, "grad_norm": 0.8183516543340216, "learning_rate": 8.887522783775965e-06, "loss": 0.4396, "step": 1392 }, { "epoch": 0.2166407465007776, "grad_norm": 1.163898200737944, "learning_rate": 8.885986024095334e-06, "loss": 0.1788, "step": 1393 }, { "epoch": 0.21679626749611197, "grad_norm": 1.0398663598746642, "learning_rate": 8.884448336775647e-06, "loss": 0.2058, "step": 1394 }, { "epoch": 0.21695178849144633, "grad_norm": 1.1038527572141106, "learning_rate": 8.882909722183973e-06, "loss": 0.1603, "step": 1395 }, { "epoch": 0.21710730948678073, "grad_norm": 1.1407224011212185, "learning_rate": 8.881370180687597e-06, "loss": 0.212, "step": 1396 }, { "epoch": 0.2172628304821151, "grad_norm": 1.171491183176733, "learning_rate": 8.879829712654032e-06, "loss": 0.156, "step": 1397 }, { "epoch": 0.21741835147744945, "grad_norm": 1.0914587320494888, "learning_rate": 8.878288318451006e-06, "loss": 0.0999, "step": 1398 }, { "epoch": 0.2175738724727838, "grad_norm": 1.0719935831541472, "learning_rate": 8.876745998446477e-06, "loss": 0.2026, "step": 1399 }, { "epoch": 0.2177293934681182, "grad_norm": 0.8893812774700685, "learning_rate": 8.875202753008614e-06, "loss": 0.1152, "step": 1400 }, { "epoch": 0.2177293934681182, "eval_loss": 0.20550738275051117, "eval_runtime": 9.4165, "eval_samples_per_second": 2.761, "eval_steps_per_second": 0.743, "step": 1400 }, { "epoch": 0.21788491446345257, "grad_norm": 1.1732595194107243, "learning_rate": 8.873658582505813e-06, "loss": 0.184, "step": 1401 }, { "epoch": 0.21804043545878693, "grad_norm": 1.3681566501491238, "learning_rate": 8.872113487306686e-06, "loss": 0.1787, "step": 1402 }, { "epoch": 0.2181959564541213, "grad_norm": 0.9384518321736989, "learning_rate": 8.870567467780073e-06, "loss": 0.1907, "step": 1403 }, { "epoch": 0.21835147744945568, "grad_norm": 1.6918321800893066, "learning_rate": 8.86902052429503e-06, "loss": 0.1814, "step": 1404 }, { "epoch": 0.21850699844479005, "grad_norm": 1.0615675392544648, "learning_rate": 8.867472657220829e-06, "loss": 0.1807, "step": 1405 }, { "epoch": 0.2186625194401244, "grad_norm": 1.2104557155019795, "learning_rate": 8.865923866926973e-06, "loss": 0.2046, "step": 1406 }, { "epoch": 0.2188180404354588, "grad_norm": 1.409015102478802, "learning_rate": 8.864374153783177e-06, "loss": 0.2415, "step": 1407 }, { "epoch": 0.21897356143079316, "grad_norm": 1.2088161026937052, "learning_rate": 8.86282351815938e-06, "loss": 0.1573, "step": 1408 }, { "epoch": 0.21912908242612752, "grad_norm": 1.0288225427805875, "learning_rate": 8.861271960425741e-06, "loss": 0.1812, "step": 1409 }, { "epoch": 0.2192846034214619, "grad_norm": 1.1067487085965078, "learning_rate": 8.859719480952637e-06, "loss": 0.1955, "step": 1410 }, { "epoch": 0.21944012441679628, "grad_norm": 1.331527983707418, "learning_rate": 8.858166080110666e-06, "loss": 0.2153, "step": 1411 }, { "epoch": 0.21959564541213064, "grad_norm": 1.3966673201995545, "learning_rate": 8.85661175827065e-06, "loss": 0.1861, "step": 1412 }, { "epoch": 0.219751166407465, "grad_norm": 1.7346922539447693, "learning_rate": 8.855056515803624e-06, "loss": 0.2217, "step": 1413 }, { "epoch": 0.21990668740279937, "grad_norm": 1.0429561703393233, "learning_rate": 8.853500353080848e-06, "loss": 0.137, "step": 1414 }, { "epoch": 0.22006220839813376, "grad_norm": 1.099146007367247, "learning_rate": 8.851943270473797e-06, "loss": 0.1888, "step": 1415 }, { "epoch": 0.22021772939346812, "grad_norm": 1.0622173162674204, "learning_rate": 8.850385268354171e-06, "loss": 0.2054, "step": 1416 }, { "epoch": 0.22037325038880248, "grad_norm": 1.7275165681110787, "learning_rate": 8.848826347093887e-06, "loss": 0.1839, "step": 1417 }, { "epoch": 0.22052877138413685, "grad_norm": 1.4049206778214125, "learning_rate": 8.84726650706508e-06, "loss": 0.2719, "step": 1418 }, { "epoch": 0.22068429237947124, "grad_norm": 0.984134518775913, "learning_rate": 8.845705748640104e-06, "loss": 0.2118, "step": 1419 }, { "epoch": 0.2208398133748056, "grad_norm": 0.8575267757080008, "learning_rate": 8.844144072191537e-06, "loss": 0.1633, "step": 1420 }, { "epoch": 0.22099533437013996, "grad_norm": 1.2572159208716647, "learning_rate": 8.842581478092172e-06, "loss": 0.2397, "step": 1421 }, { "epoch": 0.22115085536547435, "grad_norm": 1.2016606507273602, "learning_rate": 8.841017966715019e-06, "loss": 0.2033, "step": 1422 }, { "epoch": 0.22130637636080872, "grad_norm": 1.3276461025791215, "learning_rate": 8.839453538433314e-06, "loss": 0.1925, "step": 1423 }, { "epoch": 0.22146189735614308, "grad_norm": 0.8224092915871075, "learning_rate": 8.837888193620506e-06, "loss": 0.1358, "step": 1424 }, { "epoch": 0.22161741835147744, "grad_norm": 1.4495835386689406, "learning_rate": 8.836321932650266e-06, "loss": 0.2432, "step": 1425 }, { "epoch": 0.22177293934681183, "grad_norm": 1.4755944744177818, "learning_rate": 8.83475475589648e-06, "loss": 0.1231, "step": 1426 }, { "epoch": 0.2219284603421462, "grad_norm": 0.8119316049057401, "learning_rate": 8.833186663733258e-06, "loss": 0.2097, "step": 1427 }, { "epoch": 0.22208398133748056, "grad_norm": 1.0060829041279713, "learning_rate": 8.83161765653492e-06, "loss": 0.1738, "step": 1428 }, { "epoch": 0.22223950233281492, "grad_norm": 2.4145754711073733, "learning_rate": 8.830047734676018e-06, "loss": 0.2858, "step": 1429 }, { "epoch": 0.2223950233281493, "grad_norm": 1.1242173153619541, "learning_rate": 8.828476898531308e-06, "loss": 0.2166, "step": 1430 }, { "epoch": 0.22255054432348367, "grad_norm": 0.9324040289076934, "learning_rate": 8.826905148475772e-06, "loss": 0.1157, "step": 1431 }, { "epoch": 0.22270606531881804, "grad_norm": 1.1091005510043248, "learning_rate": 8.82533248488461e-06, "loss": 0.2387, "step": 1432 }, { "epoch": 0.2228615863141524, "grad_norm": 1.5660091935097067, "learning_rate": 8.823758908133237e-06, "loss": 0.1783, "step": 1433 }, { "epoch": 0.2230171073094868, "grad_norm": 1.7595194847301099, "learning_rate": 8.822184418597289e-06, "loss": 0.1971, "step": 1434 }, { "epoch": 0.22317262830482115, "grad_norm": 1.1991294408769844, "learning_rate": 8.820609016652616e-06, "loss": 0.1993, "step": 1435 }, { "epoch": 0.22332814930015552, "grad_norm": 1.16155323748872, "learning_rate": 8.819032702675293e-06, "loss": 0.1663, "step": 1436 }, { "epoch": 0.22348367029548988, "grad_norm": 1.144471577400653, "learning_rate": 8.817455477041605e-06, "loss": 0.1553, "step": 1437 }, { "epoch": 0.22363919129082427, "grad_norm": 1.1758756635872867, "learning_rate": 8.815877340128059e-06, "loss": 0.1997, "step": 1438 }, { "epoch": 0.22379471228615863, "grad_norm": 1.0774573442962538, "learning_rate": 8.814298292311376e-06, "loss": 0.224, "step": 1439 }, { "epoch": 0.223950233281493, "grad_norm": 1.018897824496983, "learning_rate": 8.812718333968498e-06, "loss": 0.1969, "step": 1440 }, { "epoch": 0.22410575427682738, "grad_norm": 0.7464671714955523, "learning_rate": 8.811137465476584e-06, "loss": 0.1704, "step": 1441 }, { "epoch": 0.22426127527216175, "grad_norm": 1.120267062163412, "learning_rate": 8.80955568721301e-06, "loss": 0.1941, "step": 1442 }, { "epoch": 0.2244167962674961, "grad_norm": 1.57559360058438, "learning_rate": 8.807972999555368e-06, "loss": 0.2603, "step": 1443 }, { "epoch": 0.22457231726283047, "grad_norm": 0.939702806688543, "learning_rate": 8.806389402881466e-06, "loss": 0.2116, "step": 1444 }, { "epoch": 0.22472783825816486, "grad_norm": 1.2188628504615986, "learning_rate": 8.80480489756933e-06, "loss": 0.2424, "step": 1445 }, { "epoch": 0.22488335925349923, "grad_norm": 1.0944670304974327, "learning_rate": 8.803219483997205e-06, "loss": 0.1828, "step": 1446 }, { "epoch": 0.2250388802488336, "grad_norm": 1.6182633129569433, "learning_rate": 8.801633162543555e-06, "loss": 0.1964, "step": 1447 }, { "epoch": 0.22519440124416795, "grad_norm": 0.8387542620173406, "learning_rate": 8.800045933587052e-06, "loss": 0.1585, "step": 1448 }, { "epoch": 0.22534992223950234, "grad_norm": 1.2464384825217707, "learning_rate": 8.798457797506588e-06, "loss": 0.1134, "step": 1449 }, { "epoch": 0.2255054432348367, "grad_norm": 0.9893884401535724, "learning_rate": 8.79686875468128e-06, "loss": 0.2421, "step": 1450 }, { "epoch": 0.22566096423017107, "grad_norm": 1.0480572189617101, "learning_rate": 8.79527880549045e-06, "loss": 0.1921, "step": 1451 }, { "epoch": 0.22581648522550543, "grad_norm": 1.3152458887234093, "learning_rate": 8.793687950313643e-06, "loss": 0.1597, "step": 1452 }, { "epoch": 0.22597200622083982, "grad_norm": 1.0970403207876425, "learning_rate": 8.792096189530614e-06, "loss": 0.1854, "step": 1453 }, { "epoch": 0.22612752721617418, "grad_norm": 1.0705518033654797, "learning_rate": 8.790503523521346e-06, "loss": 0.1839, "step": 1454 }, { "epoch": 0.22628304821150855, "grad_norm": 1.132932961220967, "learning_rate": 8.788909952666024e-06, "loss": 0.1871, "step": 1455 }, { "epoch": 0.2264385692068429, "grad_norm": 1.4797221000535143, "learning_rate": 8.787315477345059e-06, "loss": 0.2295, "step": 1456 }, { "epoch": 0.2265940902021773, "grad_norm": 1.0944162670416104, "learning_rate": 8.785720097939075e-06, "loss": 0.1745, "step": 1457 }, { "epoch": 0.22674961119751166, "grad_norm": 1.6430830107526284, "learning_rate": 8.784123814828908e-06, "loss": 0.3592, "step": 1458 }, { "epoch": 0.22690513219284603, "grad_norm": 1.1438907408683774, "learning_rate": 8.782526628395616e-06, "loss": 0.1613, "step": 1459 }, { "epoch": 0.22706065318818042, "grad_norm": 2.655708868757693, "learning_rate": 8.780928539020467e-06, "loss": 0.1821, "step": 1460 }, { "epoch": 0.22721617418351478, "grad_norm": 0.9605535718803637, "learning_rate": 8.779329547084949e-06, "loss": 0.1707, "step": 1461 }, { "epoch": 0.22737169517884914, "grad_norm": 2.2075086894366036, "learning_rate": 8.777729652970765e-06, "loss": 0.1383, "step": 1462 }, { "epoch": 0.2275272161741835, "grad_norm": 1.1974721511606266, "learning_rate": 8.77612885705983e-06, "loss": 0.2615, "step": 1463 }, { "epoch": 0.2276827371695179, "grad_norm": 1.076273551290465, "learning_rate": 8.774527159734277e-06, "loss": 0.2094, "step": 1464 }, { "epoch": 0.22783825816485226, "grad_norm": 1.3601919661341624, "learning_rate": 8.772924561376454e-06, "loss": 0.2324, "step": 1465 }, { "epoch": 0.22799377916018662, "grad_norm": 1.4328079078867457, "learning_rate": 8.771321062368922e-06, "loss": 0.1763, "step": 1466 }, { "epoch": 0.22814930015552098, "grad_norm": 1.1869126356200645, "learning_rate": 8.76971666309446e-06, "loss": 0.1093, "step": 1467 }, { "epoch": 0.22830482115085537, "grad_norm": 0.8016043523305539, "learning_rate": 8.768111363936058e-06, "loss": 0.1716, "step": 1468 }, { "epoch": 0.22846034214618974, "grad_norm": 1.1279000832737547, "learning_rate": 8.766505165276928e-06, "loss": 0.1415, "step": 1469 }, { "epoch": 0.2286158631415241, "grad_norm": 1.4632653437041683, "learning_rate": 8.764898067500488e-06, "loss": 0.1682, "step": 1470 }, { "epoch": 0.22877138413685846, "grad_norm": 1.427331448842405, "learning_rate": 8.763290070990377e-06, "loss": 0.261, "step": 1471 }, { "epoch": 0.22892690513219285, "grad_norm": 0.9926126679211552, "learning_rate": 8.761681176130443e-06, "loss": 0.1625, "step": 1472 }, { "epoch": 0.22908242612752722, "grad_norm": 1.690385156533882, "learning_rate": 8.760071383304755e-06, "loss": 0.2803, "step": 1473 }, { "epoch": 0.22923794712286158, "grad_norm": 1.0976612977720204, "learning_rate": 8.758460692897593e-06, "loss": 0.1802, "step": 1474 }, { "epoch": 0.22939346811819597, "grad_norm": 1.2314757179900722, "learning_rate": 8.756849105293447e-06, "loss": 0.1768, "step": 1475 }, { "epoch": 0.22954898911353033, "grad_norm": 1.1327643054428198, "learning_rate": 8.755236620877033e-06, "loss": 0.1865, "step": 1476 }, { "epoch": 0.2297045101088647, "grad_norm": 1.1639229615649782, "learning_rate": 8.753623240033265e-06, "loss": 0.1524, "step": 1477 }, { "epoch": 0.22986003110419906, "grad_norm": 0.9603164098229106, "learning_rate": 8.752008963147285e-06, "loss": 0.1721, "step": 1478 }, { "epoch": 0.23001555209953345, "grad_norm": 1.38792631561096, "learning_rate": 8.750393790604442e-06, "loss": 0.2342, "step": 1479 }, { "epoch": 0.2301710730948678, "grad_norm": 1.2479053691859538, "learning_rate": 8.7487777227903e-06, "loss": 0.1938, "step": 1480 }, { "epoch": 0.23032659409020217, "grad_norm": 1.2509939431760002, "learning_rate": 8.747160760090637e-06, "loss": 0.1844, "step": 1481 }, { "epoch": 0.23048211508553654, "grad_norm": 1.465934150389407, "learning_rate": 8.745542902891444e-06, "loss": 0.205, "step": 1482 }, { "epoch": 0.23063763608087093, "grad_norm": 1.0510694170069674, "learning_rate": 8.743924151578928e-06, "loss": 0.1759, "step": 1483 }, { "epoch": 0.2307931570762053, "grad_norm": 1.2869382169156265, "learning_rate": 8.742304506539506e-06, "loss": 0.1634, "step": 1484 }, { "epoch": 0.23094867807153965, "grad_norm": 2.0849533877813067, "learning_rate": 8.740683968159808e-06, "loss": 0.1834, "step": 1485 }, { "epoch": 0.23110419906687402, "grad_norm": 0.5391088701503829, "learning_rate": 8.739062536826683e-06, "loss": 0.1062, "step": 1486 }, { "epoch": 0.2312597200622084, "grad_norm": 1.339043790882886, "learning_rate": 8.737440212927188e-06, "loss": 0.154, "step": 1487 }, { "epoch": 0.23141524105754277, "grad_norm": 1.2239049109865379, "learning_rate": 8.735816996848592e-06, "loss": 0.1694, "step": 1488 }, { "epoch": 0.23157076205287713, "grad_norm": 0.8785721668205927, "learning_rate": 8.734192888978381e-06, "loss": 0.1501, "step": 1489 }, { "epoch": 0.2317262830482115, "grad_norm": 1.1018359589714184, "learning_rate": 8.732567889704253e-06, "loss": 0.2004, "step": 1490 }, { "epoch": 0.23188180404354589, "grad_norm": 1.2782960384351885, "learning_rate": 8.730941999414117e-06, "loss": 0.1514, "step": 1491 }, { "epoch": 0.23203732503888025, "grad_norm": 0.7470536578634075, "learning_rate": 8.729315218496097e-06, "loss": 0.1828, "step": 1492 }, { "epoch": 0.2321928460342146, "grad_norm": 1.0314729949458916, "learning_rate": 8.727687547338527e-06, "loss": 0.1766, "step": 1493 }, { "epoch": 0.232348367029549, "grad_norm": 1.435780946058732, "learning_rate": 8.726058986329954e-06, "loss": 0.2574, "step": 1494 }, { "epoch": 0.23250388802488337, "grad_norm": 1.3013711909380183, "learning_rate": 8.72442953585914e-06, "loss": 0.2304, "step": 1495 }, { "epoch": 0.23265940902021773, "grad_norm": 1.3258835525000316, "learning_rate": 8.722799196315057e-06, "loss": 0.1649, "step": 1496 }, { "epoch": 0.2328149300155521, "grad_norm": 1.4810824648278473, "learning_rate": 8.721167968086888e-06, "loss": 0.2786, "step": 1497 }, { "epoch": 0.23297045101088648, "grad_norm": 0.8879588001193606, "learning_rate": 8.719535851564034e-06, "loss": 0.1662, "step": 1498 }, { "epoch": 0.23312597200622084, "grad_norm": 1.0006636128134747, "learning_rate": 8.7179028471361e-06, "loss": 0.144, "step": 1499 }, { "epoch": 0.2332814930015552, "grad_norm": 1.0732426035660707, "learning_rate": 8.716268955192908e-06, "loss": 0.1799, "step": 1500 }, { "epoch": 0.2332814930015552, "eval_loss": 0.20381511747837067, "eval_runtime": 9.4315, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 1500 }, { "epoch": 0.23343701399688957, "grad_norm": 1.1848798776210054, "learning_rate": 8.714634176124492e-06, "loss": 0.2192, "step": 1501 }, { "epoch": 0.23359253499222396, "grad_norm": 1.1734962627193575, "learning_rate": 8.712998510321095e-06, "loss": 0.2218, "step": 1502 }, { "epoch": 0.23374805598755832, "grad_norm": 1.0346380522248477, "learning_rate": 8.711361958173175e-06, "loss": 0.1561, "step": 1503 }, { "epoch": 0.23390357698289269, "grad_norm": 0.8380236750022618, "learning_rate": 8.709724520071399e-06, "loss": 0.1238, "step": 1504 }, { "epoch": 0.23405909797822705, "grad_norm": 0.8234400155679666, "learning_rate": 8.708086196406646e-06, "loss": 0.1887, "step": 1505 }, { "epoch": 0.23421461897356144, "grad_norm": 1.3627952832885772, "learning_rate": 8.706446987570005e-06, "loss": 0.1739, "step": 1506 }, { "epoch": 0.2343701399688958, "grad_norm": 2.486707766460104, "learning_rate": 8.704806893952782e-06, "loss": 0.1462, "step": 1507 }, { "epoch": 0.23452566096423016, "grad_norm": 1.041812062354574, "learning_rate": 8.703165915946488e-06, "loss": 0.2247, "step": 1508 }, { "epoch": 0.23468118195956453, "grad_norm": 1.2090827115985525, "learning_rate": 8.701524053942846e-06, "loss": 0.1931, "step": 1509 }, { "epoch": 0.23483670295489892, "grad_norm": 0.7956311279751848, "learning_rate": 8.699881308333794e-06, "loss": 0.1801, "step": 1510 }, { "epoch": 0.23499222395023328, "grad_norm": 2.3005427634248017, "learning_rate": 8.698237679511476e-06, "loss": 0.2116, "step": 1511 }, { "epoch": 0.23514774494556764, "grad_norm": 1.1297158899245439, "learning_rate": 8.696593167868252e-06, "loss": 0.2319, "step": 1512 }, { "epoch": 0.23530326594090203, "grad_norm": 0.960775125545338, "learning_rate": 8.694947773796685e-06, "loss": 0.1543, "step": 1513 }, { "epoch": 0.2354587869362364, "grad_norm": 1.213893040863673, "learning_rate": 8.69330149768956e-06, "loss": 0.2041, "step": 1514 }, { "epoch": 0.23561430793157076, "grad_norm": 0.8074468351762752, "learning_rate": 8.69165433993986e-06, "loss": 0.1965, "step": 1515 }, { "epoch": 0.23576982892690512, "grad_norm": 1.1267774919804718, "learning_rate": 8.690006300940789e-06, "loss": 0.1823, "step": 1516 }, { "epoch": 0.2359253499222395, "grad_norm": 1.4711843699980223, "learning_rate": 8.688357381085753e-06, "loss": 0.1753, "step": 1517 }, { "epoch": 0.23608087091757388, "grad_norm": 1.0215570051060534, "learning_rate": 8.686707580768376e-06, "loss": 0.214, "step": 1518 }, { "epoch": 0.23623639191290824, "grad_norm": 1.4485746749390973, "learning_rate": 8.685056900382486e-06, "loss": 0.1742, "step": 1519 }, { "epoch": 0.2363919129082426, "grad_norm": 1.6525523323599767, "learning_rate": 8.683405340322123e-06, "loss": 0.3261, "step": 1520 }, { "epoch": 0.236547433903577, "grad_norm": 1.411135121552525, "learning_rate": 8.681752900981539e-06, "loss": 0.1753, "step": 1521 }, { "epoch": 0.23670295489891136, "grad_norm": 1.4707330597490842, "learning_rate": 8.680099582755196e-06, "loss": 0.1668, "step": 1522 }, { "epoch": 0.23685847589424572, "grad_norm": 1.0942391175538886, "learning_rate": 8.678445386037759e-06, "loss": 0.1601, "step": 1523 }, { "epoch": 0.23701399688958008, "grad_norm": 1.470588177448403, "learning_rate": 8.67679031122411e-06, "loss": 0.246, "step": 1524 }, { "epoch": 0.23716951788491447, "grad_norm": 0.9581346042453303, "learning_rate": 8.675134358709341e-06, "loss": 0.1574, "step": 1525 }, { "epoch": 0.23732503888024883, "grad_norm": 1.4763786660245666, "learning_rate": 8.67347752888875e-06, "loss": 0.1907, "step": 1526 }, { "epoch": 0.2374805598755832, "grad_norm": 1.0363167034974192, "learning_rate": 8.671819822157842e-06, "loss": 0.1531, "step": 1527 }, { "epoch": 0.2376360808709176, "grad_norm": 1.1924345869848432, "learning_rate": 8.670161238912338e-06, "loss": 0.1347, "step": 1528 }, { "epoch": 0.23779160186625195, "grad_norm": 1.3358065512422586, "learning_rate": 8.668501779548165e-06, "loss": 0.1827, "step": 1529 }, { "epoch": 0.2379471228615863, "grad_norm": 2.021234266844145, "learning_rate": 8.666841444461456e-06, "loss": 0.1368, "step": 1530 }, { "epoch": 0.23810264385692068, "grad_norm": 1.4808660901110622, "learning_rate": 8.665180234048561e-06, "loss": 0.2527, "step": 1531 }, { "epoch": 0.23825816485225507, "grad_norm": 1.065494309629267, "learning_rate": 8.66351814870603e-06, "loss": 0.1645, "step": 1532 }, { "epoch": 0.23841368584758943, "grad_norm": 1.154174016882306, "learning_rate": 8.661855188830626e-06, "loss": 0.2328, "step": 1533 }, { "epoch": 0.2385692068429238, "grad_norm": 1.1447203609781391, "learning_rate": 8.660191354819324e-06, "loss": 0.1794, "step": 1534 }, { "epoch": 0.23872472783825816, "grad_norm": 0.9991428522588004, "learning_rate": 8.658526647069303e-06, "loss": 0.1233, "step": 1535 }, { "epoch": 0.23888024883359255, "grad_norm": 0.7670014014044277, "learning_rate": 8.65686106597795e-06, "loss": 0.1834, "step": 1536 }, { "epoch": 0.2390357698289269, "grad_norm": 1.5945089662017708, "learning_rate": 8.655194611942863e-06, "loss": 0.1921, "step": 1537 }, { "epoch": 0.23919129082426127, "grad_norm": 1.2997434550841578, "learning_rate": 8.65352728536185e-06, "loss": 0.1873, "step": 1538 }, { "epoch": 0.23934681181959563, "grad_norm": 0.7625665208100638, "learning_rate": 8.651859086632924e-06, "loss": 0.1049, "step": 1539 }, { "epoch": 0.23950233281493002, "grad_norm": 2.315830524891549, "learning_rate": 8.650190016154307e-06, "loss": 0.2199, "step": 1540 }, { "epoch": 0.2396578538102644, "grad_norm": 0.893513036921711, "learning_rate": 8.648520074324429e-06, "loss": 0.1486, "step": 1541 }, { "epoch": 0.23981337480559875, "grad_norm": 1.0954057776977126, "learning_rate": 8.64684926154193e-06, "loss": 0.143, "step": 1542 }, { "epoch": 0.2399688958009331, "grad_norm": 1.1636396222045602, "learning_rate": 8.645177578205654e-06, "loss": 0.1386, "step": 1543 }, { "epoch": 0.2401244167962675, "grad_norm": 1.6636278556595083, "learning_rate": 8.643505024714656e-06, "loss": 0.2057, "step": 1544 }, { "epoch": 0.24027993779160187, "grad_norm": 0.847583750776468, "learning_rate": 8.641831601468198e-06, "loss": 0.1272, "step": 1545 }, { "epoch": 0.24043545878693623, "grad_norm": 1.1676164916999088, "learning_rate": 8.640157308865751e-06, "loss": 0.2057, "step": 1546 }, { "epoch": 0.24059097978227062, "grad_norm": 1.1944835161358125, "learning_rate": 8.63848214730699e-06, "loss": 0.2237, "step": 1547 }, { "epoch": 0.24074650077760498, "grad_norm": 1.3051952058816747, "learning_rate": 8.6368061171918e-06, "loss": 0.1398, "step": 1548 }, { "epoch": 0.24090202177293935, "grad_norm": 1.2433159998532273, "learning_rate": 8.635129218920272e-06, "loss": 0.1514, "step": 1549 }, { "epoch": 0.2410575427682737, "grad_norm": 1.6469350149721569, "learning_rate": 8.633451452892707e-06, "loss": 0.2141, "step": 1550 }, { "epoch": 0.2412130637636081, "grad_norm": 1.0473985194623197, "learning_rate": 8.631772819509609e-06, "loss": 0.1629, "step": 1551 }, { "epoch": 0.24136858475894246, "grad_norm": 1.081030634052537, "learning_rate": 8.630093319171692e-06, "loss": 0.1647, "step": 1552 }, { "epoch": 0.24152410575427682, "grad_norm": 1.0002048515938975, "learning_rate": 8.628412952279879e-06, "loss": 0.1636, "step": 1553 }, { "epoch": 0.2416796267496112, "grad_norm": 1.2635804994332953, "learning_rate": 8.62673171923529e-06, "loss": 0.1922, "step": 1554 }, { "epoch": 0.24183514774494558, "grad_norm": 1.0841589283406547, "learning_rate": 8.625049620439266e-06, "loss": 0.1796, "step": 1555 }, { "epoch": 0.24199066874027994, "grad_norm": 1.2588626615586416, "learning_rate": 8.623366656293345e-06, "loss": 0.2045, "step": 1556 }, { "epoch": 0.2421461897356143, "grad_norm": 1.114070429674418, "learning_rate": 8.621682827199271e-06, "loss": 0.2155, "step": 1557 }, { "epoch": 0.24230171073094867, "grad_norm": 1.122877032526039, "learning_rate": 8.619998133559001e-06, "loss": 0.1647, "step": 1558 }, { "epoch": 0.24245723172628306, "grad_norm": 2.039494379737774, "learning_rate": 8.618312575774696e-06, "loss": 0.2327, "step": 1559 }, { "epoch": 0.24261275272161742, "grad_norm": 1.1450723191422727, "learning_rate": 8.616626154248717e-06, "loss": 0.1879, "step": 1560 }, { "epoch": 0.24276827371695178, "grad_norm": 1.1035439479736404, "learning_rate": 8.614938869383643e-06, "loss": 0.1987, "step": 1561 }, { "epoch": 0.24292379471228615, "grad_norm": 9.183796995970361, "learning_rate": 8.613250721582244e-06, "loss": 0.1657, "step": 1562 }, { "epoch": 0.24307931570762054, "grad_norm": 10.346790090579951, "learning_rate": 8.611561711247512e-06, "loss": 0.1277, "step": 1563 }, { "epoch": 0.2432348367029549, "grad_norm": 1.0950378522648088, "learning_rate": 8.609871838782636e-06, "loss": 0.1792, "step": 1564 }, { "epoch": 0.24339035769828926, "grad_norm": 1.2442899837619454, "learning_rate": 8.608181104591008e-06, "loss": 0.2481, "step": 1565 }, { "epoch": 0.24354587869362365, "grad_norm": 0.9579587283389649, "learning_rate": 8.606489509076232e-06, "loss": 0.1464, "step": 1566 }, { "epoch": 0.24370139968895801, "grad_norm": 1.3434609920952423, "learning_rate": 8.604797052642118e-06, "loss": 0.167, "step": 1567 }, { "epoch": 0.24385692068429238, "grad_norm": 1.3932778191886934, "learning_rate": 8.603103735692678e-06, "loss": 0.222, "step": 1568 }, { "epoch": 0.24401244167962674, "grad_norm": 1.2606515150004263, "learning_rate": 8.601409558632125e-06, "loss": 0.1734, "step": 1569 }, { "epoch": 0.24416796267496113, "grad_norm": 0.7524170445152542, "learning_rate": 8.59971452186489e-06, "loss": 0.1377, "step": 1570 }, { "epoch": 0.2443234836702955, "grad_norm": 1.8039225543958133, "learning_rate": 8.5980186257956e-06, "loss": 0.1645, "step": 1571 }, { "epoch": 0.24447900466562986, "grad_norm": 1.2660119379119157, "learning_rate": 8.596321870829084e-06, "loss": 0.1297, "step": 1572 }, { "epoch": 0.24463452566096422, "grad_norm": 0.9837487875887194, "learning_rate": 8.594624257370388e-06, "loss": 0.2292, "step": 1573 }, { "epoch": 0.2447900466562986, "grad_norm": 1.4946436207685003, "learning_rate": 8.592925785824753e-06, "loss": 0.171, "step": 1574 }, { "epoch": 0.24494556765163297, "grad_norm": 1.0654266730537136, "learning_rate": 8.591226456597626e-06, "loss": 0.1375, "step": 1575 }, { "epoch": 0.24510108864696734, "grad_norm": 0.971876018180366, "learning_rate": 8.589526270094664e-06, "loss": 0.1924, "step": 1576 }, { "epoch": 0.2452566096423017, "grad_norm": 1.0087644300116139, "learning_rate": 8.587825226721722e-06, "loss": 0.1687, "step": 1577 }, { "epoch": 0.2454121306376361, "grad_norm": 1.1652659496533695, "learning_rate": 8.586123326884865e-06, "loss": 0.186, "step": 1578 }, { "epoch": 0.24556765163297045, "grad_norm": 1.4775732365533967, "learning_rate": 8.584420570990361e-06, "loss": 0.1889, "step": 1579 }, { "epoch": 0.24572317262830481, "grad_norm": 1.0459439420285532, "learning_rate": 8.582716959444679e-06, "loss": 0.1928, "step": 1580 }, { "epoch": 0.2458786936236392, "grad_norm": 1.5372117734449058, "learning_rate": 8.581012492654495e-06, "loss": 0.1877, "step": 1581 }, { "epoch": 0.24603421461897357, "grad_norm": 1.9347395817267816, "learning_rate": 8.579307171026693e-06, "loss": 0.2777, "step": 1582 }, { "epoch": 0.24618973561430793, "grad_norm": 0.9029125279631515, "learning_rate": 8.577600994968352e-06, "loss": 0.1297, "step": 1583 }, { "epoch": 0.2463452566096423, "grad_norm": 0.8355029037365392, "learning_rate": 8.575893964886763e-06, "loss": 0.2099, "step": 1584 }, { "epoch": 0.24650077760497668, "grad_norm": 1.6899413873191795, "learning_rate": 8.574186081189416e-06, "loss": 0.2022, "step": 1585 }, { "epoch": 0.24665629860031105, "grad_norm": 1.087509710593699, "learning_rate": 8.572477344284009e-06, "loss": 0.1751, "step": 1586 }, { "epoch": 0.2468118195956454, "grad_norm": 1.0292806428751466, "learning_rate": 8.570767754578438e-06, "loss": 0.1593, "step": 1587 }, { "epoch": 0.24696734059097977, "grad_norm": 1.188609591991913, "learning_rate": 8.56905731248081e-06, "loss": 0.1491, "step": 1588 }, { "epoch": 0.24712286158631416, "grad_norm": 1.2300883239133906, "learning_rate": 8.567346018399427e-06, "loss": 0.165, "step": 1589 }, { "epoch": 0.24727838258164853, "grad_norm": 1.2064414577216789, "learning_rate": 8.565633872742803e-06, "loss": 0.2524, "step": 1590 }, { "epoch": 0.2474339035769829, "grad_norm": 0.8406003864640567, "learning_rate": 8.56392087591965e-06, "loss": 0.1658, "step": 1591 }, { "epoch": 0.24758942457231725, "grad_norm": 2.634699334807654, "learning_rate": 8.56220702833888e-06, "loss": 0.1692, "step": 1592 }, { "epoch": 0.24774494556765164, "grad_norm": 0.9815581638651881, "learning_rate": 8.560492330409618e-06, "loss": 0.1678, "step": 1593 }, { "epoch": 0.247900466562986, "grad_norm": 1.3909573488426212, "learning_rate": 8.558776782541183e-06, "loss": 0.2397, "step": 1594 }, { "epoch": 0.24805598755832037, "grad_norm": 1.2613818557792364, "learning_rate": 8.557060385143102e-06, "loss": 0.2273, "step": 1595 }, { "epoch": 0.24821150855365473, "grad_norm": 0.9777010646149178, "learning_rate": 8.5553431386251e-06, "loss": 0.1713, "step": 1596 }, { "epoch": 0.24836702954898912, "grad_norm": 1.2012423072130696, "learning_rate": 8.553625043397112e-06, "loss": 0.2192, "step": 1597 }, { "epoch": 0.24852255054432348, "grad_norm": 1.0747389022970961, "learning_rate": 8.551906099869269e-06, "loss": 0.1555, "step": 1598 }, { "epoch": 0.24867807153965785, "grad_norm": 0.9987345212261577, "learning_rate": 8.550186308451906e-06, "loss": 0.2117, "step": 1599 }, { "epoch": 0.24883359253499224, "grad_norm": 1.1743809541983374, "learning_rate": 8.548465669555564e-06, "loss": 0.1547, "step": 1600 }, { "epoch": 0.24883359253499224, "eval_loss": 0.2037108987569809, "eval_runtime": 9.4238, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 1600 }, { "epoch": 0.2489891135303266, "grad_norm": 1.0755504197866683, "learning_rate": 8.546744183590979e-06, "loss": 0.1448, "step": 1601 }, { "epoch": 0.24914463452566096, "grad_norm": 1.293645268455303, "learning_rate": 8.545021850969097e-06, "loss": 0.2045, "step": 1602 }, { "epoch": 0.24930015552099533, "grad_norm": 1.644496498579518, "learning_rate": 8.543298672101063e-06, "loss": 0.1745, "step": 1603 }, { "epoch": 0.24945567651632972, "grad_norm": 1.8853737644375217, "learning_rate": 8.541574647398224e-06, "loss": 0.1785, "step": 1604 }, { "epoch": 0.24961119751166408, "grad_norm": 0.8348472318309339, "learning_rate": 8.539849777272125e-06, "loss": 0.1976, "step": 1605 }, { "epoch": 0.24976671850699844, "grad_norm": 1.6007239985640846, "learning_rate": 8.538124062134521e-06, "loss": 0.1766, "step": 1606 }, { "epoch": 0.2499222395023328, "grad_norm": 2.1944156006209194, "learning_rate": 8.53639750239736e-06, "loss": 0.2715, "step": 1607 }, { "epoch": 0.25007776049766717, "grad_norm": 1.105749977206952, "learning_rate": 8.534670098472802e-06, "loss": 0.1564, "step": 1608 }, { "epoch": 0.25023328149300156, "grad_norm": 0.8083237797522677, "learning_rate": 8.532941850773195e-06, "loss": 0.1668, "step": 1609 }, { "epoch": 0.25038880248833595, "grad_norm": 1.172486307255137, "learning_rate": 8.531212759711103e-06, "loss": 0.2302, "step": 1610 }, { "epoch": 0.2505443234836703, "grad_norm": 1.268322758173216, "learning_rate": 8.52948282569928e-06, "loss": 0.1789, "step": 1611 }, { "epoch": 0.2506998444790047, "grad_norm": 0.9091823227567202, "learning_rate": 8.527752049150685e-06, "loss": 0.0784, "step": 1612 }, { "epoch": 0.250855365474339, "grad_norm": 1.3902158634610304, "learning_rate": 8.52602043047848e-06, "loss": 0.1681, "step": 1613 }, { "epoch": 0.2510108864696734, "grad_norm": 1.4942303280111533, "learning_rate": 8.524287970096026e-06, "loss": 0.217, "step": 1614 }, { "epoch": 0.2511664074650078, "grad_norm": 0.8627158934582907, "learning_rate": 8.522554668416887e-06, "loss": 0.2181, "step": 1615 }, { "epoch": 0.2513219284603421, "grad_norm": 1.0390290867530942, "learning_rate": 8.520820525854824e-06, "loss": 0.1764, "step": 1616 }, { "epoch": 0.2514774494556765, "grad_norm": 1.4108685539031005, "learning_rate": 8.519085542823802e-06, "loss": 0.2164, "step": 1617 }, { "epoch": 0.2516329704510109, "grad_norm": 1.371077345528009, "learning_rate": 8.517349719737984e-06, "loss": 0.1561, "step": 1618 }, { "epoch": 0.25178849144634524, "grad_norm": 1.2763042021188964, "learning_rate": 8.51561305701174e-06, "loss": 0.1526, "step": 1619 }, { "epoch": 0.25194401244167963, "grad_norm": 1.077695325158449, "learning_rate": 8.51387555505963e-06, "loss": 0.1876, "step": 1620 }, { "epoch": 0.252099533437014, "grad_norm": 1.3164226998591637, "learning_rate": 8.512137214296422e-06, "loss": 0.2131, "step": 1621 }, { "epoch": 0.25225505443234836, "grad_norm": 1.7522341912294899, "learning_rate": 8.510398035137083e-06, "loss": 0.133, "step": 1622 }, { "epoch": 0.25241057542768275, "grad_norm": 4.615604310333582, "learning_rate": 8.50865801799678e-06, "loss": 0.1955, "step": 1623 }, { "epoch": 0.2525660964230171, "grad_norm": 2.3506074867763536, "learning_rate": 8.506917163290877e-06, "loss": 0.3199, "step": 1624 }, { "epoch": 0.2527216174183515, "grad_norm": 0.7483739763165084, "learning_rate": 8.505175471434943e-06, "loss": 0.2213, "step": 1625 }, { "epoch": 0.25287713841368586, "grad_norm": 2.0095572169442333, "learning_rate": 8.50343294284474e-06, "loss": 0.2356, "step": 1626 }, { "epoch": 0.2530326594090202, "grad_norm": 0.9367298995041891, "learning_rate": 8.501689577936238e-06, "loss": 0.1567, "step": 1627 }, { "epoch": 0.2531881804043546, "grad_norm": 1.2746896918156698, "learning_rate": 8.499945377125602e-06, "loss": 0.1465, "step": 1628 }, { "epoch": 0.253343701399689, "grad_norm": 0.7971645300115215, "learning_rate": 8.498200340829195e-06, "loss": 0.1419, "step": 1629 }, { "epoch": 0.2534992223950233, "grad_norm": 1.7131432725110083, "learning_rate": 8.496454469463583e-06, "loss": 0.1437, "step": 1630 }, { "epoch": 0.2536547433903577, "grad_norm": 1.3945635968284718, "learning_rate": 8.494707763445526e-06, "loss": 0.2116, "step": 1631 }, { "epoch": 0.25381026438569204, "grad_norm": 1.130700720901677, "learning_rate": 8.492960223191994e-06, "loss": 0.1783, "step": 1632 }, { "epoch": 0.25396578538102643, "grad_norm": 0.9910207975897489, "learning_rate": 8.491211849120146e-06, "loss": 0.1275, "step": 1633 }, { "epoch": 0.2541213063763608, "grad_norm": 1.6819299813099522, "learning_rate": 8.48946264164734e-06, "loss": 0.2092, "step": 1634 }, { "epoch": 0.25427682737169516, "grad_norm": 0.8070165110990363, "learning_rate": 8.487712601191143e-06, "loss": 0.2104, "step": 1635 }, { "epoch": 0.25443234836702955, "grad_norm": 0.7832453865024183, "learning_rate": 8.485961728169308e-06, "loss": 0.1491, "step": 1636 }, { "epoch": 0.25458786936236394, "grad_norm": 1.570863259158348, "learning_rate": 8.484210022999795e-06, "loss": 0.1337, "step": 1637 }, { "epoch": 0.2547433903576983, "grad_norm": 2.094162070797788, "learning_rate": 8.482457486100761e-06, "loss": 0.1732, "step": 1638 }, { "epoch": 0.25489891135303266, "grad_norm": 1.3293274255208316, "learning_rate": 8.48070411789056e-06, "loss": 0.1587, "step": 1639 }, { "epoch": 0.25505443234836706, "grad_norm": 0.9704592907631973, "learning_rate": 8.478949918787746e-06, "loss": 0.167, "step": 1640 }, { "epoch": 0.2552099533437014, "grad_norm": 2.2927511192581935, "learning_rate": 8.47719488921107e-06, "loss": 0.1731, "step": 1641 }, { "epoch": 0.2553654743390358, "grad_norm": 1.2113969832398468, "learning_rate": 8.475439029579487e-06, "loss": 0.1636, "step": 1642 }, { "epoch": 0.2555209953343701, "grad_norm": 1.2700840486141427, "learning_rate": 8.473682340312136e-06, "loss": 0.2251, "step": 1643 }, { "epoch": 0.2556765163297045, "grad_norm": 0.8692629936958125, "learning_rate": 8.47192482182837e-06, "loss": 0.1944, "step": 1644 }, { "epoch": 0.2558320373250389, "grad_norm": 1.0546031026829716, "learning_rate": 8.470166474547731e-06, "loss": 0.1963, "step": 1645 }, { "epoch": 0.25598755832037323, "grad_norm": 1.8035421603246344, "learning_rate": 8.468407298889962e-06, "loss": 0.1678, "step": 1646 }, { "epoch": 0.2561430793157076, "grad_norm": 0.8593243264529278, "learning_rate": 8.466647295275002e-06, "loss": 0.1272, "step": 1647 }, { "epoch": 0.256298600311042, "grad_norm": 1.5174530612382813, "learning_rate": 8.464886464122988e-06, "loss": 0.2685, "step": 1648 }, { "epoch": 0.25645412130637635, "grad_norm": 1.5250972376290421, "learning_rate": 8.463124805854257e-06, "loss": 0.1674, "step": 1649 }, { "epoch": 0.25660964230171074, "grad_norm": 1.1663575092987046, "learning_rate": 8.461362320889338e-06, "loss": 0.1577, "step": 1650 }, { "epoch": 0.2567651632970451, "grad_norm": 1.474673013106268, "learning_rate": 8.459599009648964e-06, "loss": 0.1769, "step": 1651 }, { "epoch": 0.25692068429237946, "grad_norm": 1.1672631965692757, "learning_rate": 8.45783487255406e-06, "loss": 0.2249, "step": 1652 }, { "epoch": 0.25707620528771385, "grad_norm": 1.1953181883355133, "learning_rate": 8.456069910025751e-06, "loss": 0.2018, "step": 1653 }, { "epoch": 0.2572317262830482, "grad_norm": 1.1089828464331577, "learning_rate": 8.454304122485358e-06, "loss": 0.1419, "step": 1654 }, { "epoch": 0.2573872472783826, "grad_norm": 1.2716710060074294, "learning_rate": 8.452537510354397e-06, "loss": 0.1966, "step": 1655 }, { "epoch": 0.25754276827371697, "grad_norm": 1.952579937166782, "learning_rate": 8.450770074054586e-06, "loss": 0.2699, "step": 1656 }, { "epoch": 0.2576982892690513, "grad_norm": 0.7319931402583304, "learning_rate": 8.449001814007838e-06, "loss": 0.1401, "step": 1657 }, { "epoch": 0.2578538102643857, "grad_norm": 1.627013708512288, "learning_rate": 8.447232730636257e-06, "loss": 0.2617, "step": 1658 }, { "epoch": 0.2580093312597201, "grad_norm": 1.0492953509552387, "learning_rate": 8.44546282436215e-06, "loss": 0.1922, "step": 1659 }, { "epoch": 0.2581648522550544, "grad_norm": 0.9166534435780459, "learning_rate": 8.443692095608019e-06, "loss": 0.2099, "step": 1660 }, { "epoch": 0.2583203732503888, "grad_norm": 1.1458120209760718, "learning_rate": 8.441920544796558e-06, "loss": 0.1724, "step": 1661 }, { "epoch": 0.25847589424572315, "grad_norm": 1.071395804244241, "learning_rate": 8.440148172350666e-06, "loss": 0.1728, "step": 1662 }, { "epoch": 0.25863141524105754, "grad_norm": 1.2413704662622753, "learning_rate": 8.43837497869343e-06, "loss": 0.2031, "step": 1663 }, { "epoch": 0.25878693623639193, "grad_norm": 1.1068242296182698, "learning_rate": 8.436600964248138e-06, "loss": 0.1951, "step": 1664 }, { "epoch": 0.25894245723172626, "grad_norm": 0.8699381605693407, "learning_rate": 8.43482612943827e-06, "loss": 0.1764, "step": 1665 }, { "epoch": 0.25909797822706065, "grad_norm": 1.2048052321069596, "learning_rate": 8.433050474687505e-06, "loss": 0.2311, "step": 1666 }, { "epoch": 0.25925349922239505, "grad_norm": 1.315498269766704, "learning_rate": 8.431274000419716e-06, "loss": 0.2412, "step": 1667 }, { "epoch": 0.2594090202177294, "grad_norm": 0.6128855898398873, "learning_rate": 8.42949670705897e-06, "loss": 0.1068, "step": 1668 }, { "epoch": 0.25956454121306377, "grad_norm": 0.9552988172621262, "learning_rate": 8.427718595029537e-06, "loss": 0.1458, "step": 1669 }, { "epoch": 0.25972006220839816, "grad_norm": 1.411892967173632, "learning_rate": 8.425939664755874e-06, "loss": 0.2327, "step": 1670 }, { "epoch": 0.2598755832037325, "grad_norm": 1.066036249369497, "learning_rate": 8.424159916662636e-06, "loss": 0.1845, "step": 1671 }, { "epoch": 0.2600311041990669, "grad_norm": 1.0078601069914832, "learning_rate": 8.422379351174673e-06, "loss": 0.129, "step": 1672 }, { "epoch": 0.2601866251944012, "grad_norm": 0.9627418389301211, "learning_rate": 8.420597968717033e-06, "loss": 0.2346, "step": 1673 }, { "epoch": 0.2603421461897356, "grad_norm": 1.0190302705099263, "learning_rate": 8.418815769714956e-06, "loss": 0.1291, "step": 1674 }, { "epoch": 0.26049766718507, "grad_norm": 0.8536213147159897, "learning_rate": 8.417032754593879e-06, "loss": 0.1759, "step": 1675 }, { "epoch": 0.26065318818040434, "grad_norm": 0.9477728405361937, "learning_rate": 8.415248923779431e-06, "loss": 0.1708, "step": 1676 }, { "epoch": 0.26080870917573873, "grad_norm": 1.0305276755799404, "learning_rate": 8.413464277697436e-06, "loss": 0.3205, "step": 1677 }, { "epoch": 0.2609642301710731, "grad_norm": 1.324545893865915, "learning_rate": 8.411678816773916e-06, "loss": 0.2936, "step": 1678 }, { "epoch": 0.26111975116640745, "grad_norm": 1.3383489149505705, "learning_rate": 8.409892541435085e-06, "loss": 0.2406, "step": 1679 }, { "epoch": 0.26127527216174184, "grad_norm": 0.9651270377598534, "learning_rate": 8.408105452107353e-06, "loss": 0.1511, "step": 1680 }, { "epoch": 0.2614307931570762, "grad_norm": 0.6783781205233194, "learning_rate": 8.40631754921732e-06, "loss": 0.1567, "step": 1681 }, { "epoch": 0.26158631415241057, "grad_norm": 1.198981860000486, "learning_rate": 8.404528833191786e-06, "loss": 0.2125, "step": 1682 }, { "epoch": 0.26174183514774496, "grad_norm": 0.7449630196962097, "learning_rate": 8.402739304457743e-06, "loss": 0.179, "step": 1683 }, { "epoch": 0.2618973561430793, "grad_norm": 1.3499907032342544, "learning_rate": 8.400948963442373e-06, "loss": 0.1492, "step": 1684 }, { "epoch": 0.2620528771384137, "grad_norm": 1.2324653573954145, "learning_rate": 8.39915781057306e-06, "loss": 0.1442, "step": 1685 }, { "epoch": 0.2622083981337481, "grad_norm": 1.5240761421711815, "learning_rate": 8.397365846277371e-06, "loss": 0.3141, "step": 1686 }, { "epoch": 0.2623639191290824, "grad_norm": 0.9242701212113029, "learning_rate": 8.39557307098308e-06, "loss": 0.175, "step": 1687 }, { "epoch": 0.2625194401244168, "grad_norm": 1.0215723172112428, "learning_rate": 8.393779485118142e-06, "loss": 0.1572, "step": 1688 }, { "epoch": 0.2626749611197512, "grad_norm": 1.4272441271545482, "learning_rate": 8.391985089110715e-06, "loss": 0.2086, "step": 1689 }, { "epoch": 0.26283048211508553, "grad_norm": 0.98493015131112, "learning_rate": 8.390189883389143e-06, "loss": 0.1758, "step": 1690 }, { "epoch": 0.2629860031104199, "grad_norm": 1.412962012368002, "learning_rate": 8.388393868381967e-06, "loss": 0.137, "step": 1691 }, { "epoch": 0.26314152410575425, "grad_norm": 0.8439849086089997, "learning_rate": 8.386597044517923e-06, "loss": 0.1794, "step": 1692 }, { "epoch": 0.26329704510108864, "grad_norm": 0.9027272166442722, "learning_rate": 8.384799412225936e-06, "loss": 0.1827, "step": 1693 }, { "epoch": 0.26345256609642304, "grad_norm": 1.0861962602589315, "learning_rate": 8.383000971935129e-06, "loss": 0.1736, "step": 1694 }, { "epoch": 0.26360808709175737, "grad_norm": 1.4467531133479765, "learning_rate": 8.38120172407481e-06, "loss": 0.2872, "step": 1695 }, { "epoch": 0.26376360808709176, "grad_norm": 0.7243899321635017, "learning_rate": 8.379401669074489e-06, "loss": 0.1568, "step": 1696 }, { "epoch": 0.26391912908242615, "grad_norm": 0.8947544881090379, "learning_rate": 8.37760080736386e-06, "loss": 0.1516, "step": 1697 }, { "epoch": 0.2640746500777605, "grad_norm": 1.1759725115418023, "learning_rate": 8.375799139372818e-06, "loss": 0.1384, "step": 1698 }, { "epoch": 0.2642301710730949, "grad_norm": 0.8519187195565056, "learning_rate": 8.373996665531443e-06, "loss": 0.2027, "step": 1699 }, { "epoch": 0.2643856920684292, "grad_norm": 1.4756118825078526, "learning_rate": 8.37219338627001e-06, "loss": 0.2323, "step": 1700 }, { "epoch": 0.2643856920684292, "eval_loss": 0.19943906366825104, "eval_runtime": 9.4244, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 1700 }, { "epoch": 0.2645412130637636, "grad_norm": 1.1415194682343677, "learning_rate": 8.370389302018993e-06, "loss": 0.1627, "step": 1701 }, { "epoch": 0.264696734059098, "grad_norm": 0.9887030475180681, "learning_rate": 8.368584413209044e-06, "loss": 0.1913, "step": 1702 }, { "epoch": 0.26485225505443233, "grad_norm": 1.579433234849522, "learning_rate": 8.366778720271022e-06, "loss": 0.2494, "step": 1703 }, { "epoch": 0.2650077760497667, "grad_norm": 1.1581416599961576, "learning_rate": 8.364972223635967e-06, "loss": 0.1984, "step": 1704 }, { "epoch": 0.2651632970451011, "grad_norm": 1.4481396852315895, "learning_rate": 8.363164923735116e-06, "loss": 0.1772, "step": 1705 }, { "epoch": 0.26531881804043544, "grad_norm": 2.2248131911902918, "learning_rate": 8.361356820999897e-06, "loss": 0.2035, "step": 1706 }, { "epoch": 0.26547433903576984, "grad_norm": 1.296906679431483, "learning_rate": 8.359547915861927e-06, "loss": 0.1906, "step": 1707 }, { "epoch": 0.2656298600311042, "grad_norm": 1.4510599043288837, "learning_rate": 8.357738208753022e-06, "loss": 0.215, "step": 1708 }, { "epoch": 0.26578538102643856, "grad_norm": 1.3812180344156422, "learning_rate": 8.35592770010518e-06, "loss": 0.2366, "step": 1709 }, { "epoch": 0.26594090202177295, "grad_norm": 0.7624028953564842, "learning_rate": 8.354116390350594e-06, "loss": 0.1337, "step": 1710 }, { "epoch": 0.2660964230171073, "grad_norm": 1.0938571817018024, "learning_rate": 8.352304279921655e-06, "loss": 0.1739, "step": 1711 }, { "epoch": 0.2662519440124417, "grad_norm": 1.3112579396126312, "learning_rate": 8.350491369250933e-06, "loss": 0.2866, "step": 1712 }, { "epoch": 0.26640746500777607, "grad_norm": 1.4175431035953647, "learning_rate": 8.348677658771197e-06, "loss": 0.1308, "step": 1713 }, { "epoch": 0.2665629860031104, "grad_norm": 2.1014926949253327, "learning_rate": 8.346863148915402e-06, "loss": 0.1549, "step": 1714 }, { "epoch": 0.2667185069984448, "grad_norm": 1.132911689146343, "learning_rate": 8.345047840116704e-06, "loss": 0.2182, "step": 1715 }, { "epoch": 0.2668740279937792, "grad_norm": 0.6535130581015213, "learning_rate": 8.343231732808435e-06, "loss": 0.1748, "step": 1716 }, { "epoch": 0.2670295489891135, "grad_norm": 0.9808104365320156, "learning_rate": 8.34141482742413e-06, "loss": 0.1512, "step": 1717 }, { "epoch": 0.2671850699844479, "grad_norm": 1.2630125658621263, "learning_rate": 8.339597124397509e-06, "loss": 0.1698, "step": 1718 }, { "epoch": 0.26734059097978224, "grad_norm": 1.279259047820582, "learning_rate": 8.33777862416248e-06, "loss": 0.1769, "step": 1719 }, { "epoch": 0.26749611197511663, "grad_norm": 1.1242790219258612, "learning_rate": 8.335959327153148e-06, "loss": 0.2224, "step": 1720 }, { "epoch": 0.267651632970451, "grad_norm": 1.0035835372337707, "learning_rate": 8.334139233803801e-06, "loss": 0.1697, "step": 1721 }, { "epoch": 0.26780715396578536, "grad_norm": 1.9776796243145607, "learning_rate": 8.332318344548926e-06, "loss": 0.2033, "step": 1722 }, { "epoch": 0.26796267496111975, "grad_norm": 1.1521258085682824, "learning_rate": 8.330496659823189e-06, "loss": 0.1729, "step": 1723 }, { "epoch": 0.26811819595645414, "grad_norm": 1.0253842887133877, "learning_rate": 8.328674180061453e-06, "loss": 0.2185, "step": 1724 }, { "epoch": 0.2682737169517885, "grad_norm": 0.871091469827773, "learning_rate": 8.326850905698774e-06, "loss": 0.1359, "step": 1725 }, { "epoch": 0.26842923794712287, "grad_norm": 1.7009594103702224, "learning_rate": 8.325026837170386e-06, "loss": 0.2348, "step": 1726 }, { "epoch": 0.26858475894245726, "grad_norm": 1.367926551681483, "learning_rate": 8.323201974911723e-06, "loss": 0.1842, "step": 1727 }, { "epoch": 0.2687402799377916, "grad_norm": 1.148927442910907, "learning_rate": 8.321376319358407e-06, "loss": 0.1096, "step": 1728 }, { "epoch": 0.268895800933126, "grad_norm": 1.3075658909675654, "learning_rate": 8.319549870946244e-06, "loss": 0.1543, "step": 1729 }, { "epoch": 0.2690513219284603, "grad_norm": 0.8291270774545968, "learning_rate": 8.317722630111233e-06, "loss": 0.1093, "step": 1730 }, { "epoch": 0.2692068429237947, "grad_norm": 2.2622896049282706, "learning_rate": 8.315894597289565e-06, "loss": 0.2042, "step": 1731 }, { "epoch": 0.2693623639191291, "grad_norm": 0.7046996138148661, "learning_rate": 8.314065772917612e-06, "loss": 0.1303, "step": 1732 }, { "epoch": 0.26951788491446343, "grad_norm": 0.9333196367153322, "learning_rate": 8.312236157431946e-06, "loss": 0.169, "step": 1733 }, { "epoch": 0.2696734059097978, "grad_norm": 1.1869718333049797, "learning_rate": 8.310405751269318e-06, "loss": 0.2494, "step": 1734 }, { "epoch": 0.2698289269051322, "grad_norm": 0.9186255111712875, "learning_rate": 8.30857455486667e-06, "loss": 0.1449, "step": 1735 }, { "epoch": 0.26998444790046655, "grad_norm": 1.7158457711756847, "learning_rate": 8.306742568661137e-06, "loss": 0.2472, "step": 1736 }, { "epoch": 0.27013996889580094, "grad_norm": 0.9091734067747751, "learning_rate": 8.304909793090039e-06, "loss": 0.1517, "step": 1737 }, { "epoch": 0.2702954898911353, "grad_norm": 0.9472038650945157, "learning_rate": 8.303076228590885e-06, "loss": 0.1293, "step": 1738 }, { "epoch": 0.27045101088646967, "grad_norm": 1.359961162735269, "learning_rate": 8.301241875601371e-06, "loss": 0.1687, "step": 1739 }, { "epoch": 0.27060653188180406, "grad_norm": 1.3706412614563859, "learning_rate": 8.299406734559385e-06, "loss": 0.1151, "step": 1740 }, { "epoch": 0.2707620528771384, "grad_norm": 1.4633698039358347, "learning_rate": 8.297570805903e-06, "loss": 0.1834, "step": 1741 }, { "epoch": 0.2709175738724728, "grad_norm": 1.2706325476878815, "learning_rate": 8.295734090070477e-06, "loss": 0.1889, "step": 1742 }, { "epoch": 0.2710730948678072, "grad_norm": 1.40063937560449, "learning_rate": 8.293896587500266e-06, "loss": 0.1644, "step": 1743 }, { "epoch": 0.2712286158631415, "grad_norm": 1.756399176307069, "learning_rate": 8.292058298631003e-06, "loss": 0.2121, "step": 1744 }, { "epoch": 0.2713841368584759, "grad_norm": 1.3118943702099763, "learning_rate": 8.290219223901517e-06, "loss": 0.1657, "step": 1745 }, { "epoch": 0.2715396578538103, "grad_norm": 1.221070247479925, "learning_rate": 8.288379363750818e-06, "loss": 0.1799, "step": 1746 }, { "epoch": 0.2716951788491446, "grad_norm": 1.30049039400021, "learning_rate": 8.286538718618107e-06, "loss": 0.1659, "step": 1747 }, { "epoch": 0.271850699844479, "grad_norm": 0.8218052779463395, "learning_rate": 8.28469728894277e-06, "loss": 0.1417, "step": 1748 }, { "epoch": 0.27200622083981335, "grad_norm": 1.318881683721639, "learning_rate": 8.282855075164386e-06, "loss": 0.2086, "step": 1749 }, { "epoch": 0.27216174183514774, "grad_norm": 1.168225071909074, "learning_rate": 8.281012077722712e-06, "loss": 0.1481, "step": 1750 }, { "epoch": 0.27231726283048213, "grad_norm": 1.387527553498744, "learning_rate": 8.2791682970577e-06, "loss": 0.224, "step": 1751 }, { "epoch": 0.27247278382581647, "grad_norm": 0.9455523699522945, "learning_rate": 8.277323733609488e-06, "loss": 0.1689, "step": 1752 }, { "epoch": 0.27262830482115086, "grad_norm": 1.301993231412919, "learning_rate": 8.275478387818394e-06, "loss": 0.17, "step": 1753 }, { "epoch": 0.27278382581648525, "grad_norm": 1.1753804485169133, "learning_rate": 8.273632260124934e-06, "loss": 0.2231, "step": 1754 }, { "epoch": 0.2729393468118196, "grad_norm": 1.080698611275427, "learning_rate": 8.271785350969799e-06, "loss": 0.1796, "step": 1755 }, { "epoch": 0.273094867807154, "grad_norm": 1.290015540604507, "learning_rate": 8.269937660793875e-06, "loss": 0.1941, "step": 1756 }, { "epoch": 0.2732503888024883, "grad_norm": 1.070538218943679, "learning_rate": 8.268089190038228e-06, "loss": 0.1909, "step": 1757 }, { "epoch": 0.2734059097978227, "grad_norm": 1.2252798699112468, "learning_rate": 8.266239939144118e-06, "loss": 0.1569, "step": 1758 }, { "epoch": 0.2735614307931571, "grad_norm": 1.2346475130597931, "learning_rate": 8.264389908552987e-06, "loss": 0.1881, "step": 1759 }, { "epoch": 0.2737169517884914, "grad_norm": 0.8909529676508143, "learning_rate": 8.26253909870646e-06, "loss": 0.1635, "step": 1760 }, { "epoch": 0.2738724727838258, "grad_norm": 1.3801819199807877, "learning_rate": 8.260687510046352e-06, "loss": 0.1957, "step": 1761 }, { "epoch": 0.2740279937791602, "grad_norm": 0.9098604615543268, "learning_rate": 8.258835143014663e-06, "loss": 0.1556, "step": 1762 }, { "epoch": 0.27418351477449454, "grad_norm": 1.479953181946323, "learning_rate": 8.25698199805358e-06, "loss": 0.1673, "step": 1763 }, { "epoch": 0.27433903576982893, "grad_norm": 1.0391961011580078, "learning_rate": 8.255128075605475e-06, "loss": 0.1678, "step": 1764 }, { "epoch": 0.2744945567651633, "grad_norm": 1.1674213515628957, "learning_rate": 8.253273376112902e-06, "loss": 0.1575, "step": 1765 }, { "epoch": 0.27465007776049766, "grad_norm": 0.776827674790433, "learning_rate": 8.251417900018606e-06, "loss": 0.2087, "step": 1766 }, { "epoch": 0.27480559875583205, "grad_norm": 1.0737505366105782, "learning_rate": 8.249561647765515e-06, "loss": 0.202, "step": 1767 }, { "epoch": 0.2749611197511664, "grad_norm": 1.0278179070478979, "learning_rate": 8.247704619796743e-06, "loss": 0.2246, "step": 1768 }, { "epoch": 0.2751166407465008, "grad_norm": 1.3308057309065462, "learning_rate": 8.245846816555588e-06, "loss": 0.1781, "step": 1769 }, { "epoch": 0.27527216174183516, "grad_norm": 1.171891225152092, "learning_rate": 8.24398823848553e-06, "loss": 0.2838, "step": 1770 }, { "epoch": 0.2754276827371695, "grad_norm": 0.9162549134019579, "learning_rate": 8.242128886030243e-06, "loss": 0.153, "step": 1771 }, { "epoch": 0.2755832037325039, "grad_norm": 1.7094368421056838, "learning_rate": 8.240268759633576e-06, "loss": 0.1769, "step": 1772 }, { "epoch": 0.2757387247278383, "grad_norm": 1.088761334959302, "learning_rate": 8.23840785973957e-06, "loss": 0.1872, "step": 1773 }, { "epoch": 0.2758942457231726, "grad_norm": 1.0467068106039534, "learning_rate": 8.236546186792446e-06, "loss": 0.1941, "step": 1774 }, { "epoch": 0.276049766718507, "grad_norm": 1.469925204114295, "learning_rate": 8.234683741236612e-06, "loss": 0.2439, "step": 1775 }, { "epoch": 0.27620528771384134, "grad_norm": 1.286843667284798, "learning_rate": 8.23282052351666e-06, "loss": 0.1825, "step": 1776 }, { "epoch": 0.27636080870917573, "grad_norm": 1.5684518100667084, "learning_rate": 8.230956534077366e-06, "loss": 0.2088, "step": 1777 }, { "epoch": 0.2765163297045101, "grad_norm": 1.3158757876867857, "learning_rate": 8.22909177336369e-06, "loss": 0.1965, "step": 1778 }, { "epoch": 0.27667185069984446, "grad_norm": 0.7862541895693009, "learning_rate": 8.227226241820779e-06, "loss": 0.1388, "step": 1779 }, { "epoch": 0.27682737169517885, "grad_norm": 0.9288123715441376, "learning_rate": 8.225359939893954e-06, "loss": 0.243, "step": 1780 }, { "epoch": 0.27698289269051324, "grad_norm": 1.491008802108701, "learning_rate": 8.223492868028736e-06, "loss": 0.2521, "step": 1781 }, { "epoch": 0.2771384136858476, "grad_norm": 1.1202886550853388, "learning_rate": 8.221625026670814e-06, "loss": 0.1688, "step": 1782 }, { "epoch": 0.27729393468118196, "grad_norm": 1.1962734383960754, "learning_rate": 8.219756416266073e-06, "loss": 0.1294, "step": 1783 }, { "epoch": 0.27744945567651635, "grad_norm": 0.6740427840476089, "learning_rate": 8.217887037260575e-06, "loss": 0.1501, "step": 1784 }, { "epoch": 0.2776049766718507, "grad_norm": 1.8752578154372959, "learning_rate": 8.216016890100564e-06, "loss": 0.2524, "step": 1785 }, { "epoch": 0.2777604976671851, "grad_norm": 1.3276982202120067, "learning_rate": 8.214145975232474e-06, "loss": 0.1611, "step": 1786 }, { "epoch": 0.2779160186625194, "grad_norm": 0.9180331686214024, "learning_rate": 8.212274293102917e-06, "loss": 0.2069, "step": 1787 }, { "epoch": 0.2780715396578538, "grad_norm": 1.1644000920434754, "learning_rate": 8.210401844158688e-06, "loss": 0.2113, "step": 1788 }, { "epoch": 0.2782270606531882, "grad_norm": 1.6247680870264813, "learning_rate": 8.20852862884677e-06, "loss": 0.2167, "step": 1789 }, { "epoch": 0.27838258164852253, "grad_norm": 2.465352962757943, "learning_rate": 8.206654647614323e-06, "loss": 0.2917, "step": 1790 }, { "epoch": 0.2785381026438569, "grad_norm": 0.9826147561106185, "learning_rate": 8.204779900908694e-06, "loss": 0.1513, "step": 1791 }, { "epoch": 0.2786936236391913, "grad_norm": 1.1924827625995933, "learning_rate": 8.202904389177409e-06, "loss": 0.2069, "step": 1792 }, { "epoch": 0.27884914463452565, "grad_norm": 1.2507233550051102, "learning_rate": 8.201028112868182e-06, "loss": 0.1713, "step": 1793 }, { "epoch": 0.27900466562986004, "grad_norm": 1.056564405898492, "learning_rate": 8.199151072428903e-06, "loss": 0.152, "step": 1794 }, { "epoch": 0.27916018662519443, "grad_norm": 1.0582767182694146, "learning_rate": 8.19727326830765e-06, "loss": 0.1313, "step": 1795 }, { "epoch": 0.27931570762052876, "grad_norm": 0.9960646193169612, "learning_rate": 8.195394700952681e-06, "loss": 0.1663, "step": 1796 }, { "epoch": 0.27947122861586315, "grad_norm": 0.8580536351756373, "learning_rate": 8.193515370812433e-06, "loss": 0.1595, "step": 1797 }, { "epoch": 0.2796267496111975, "grad_norm": 1.0831765474333348, "learning_rate": 8.191635278335533e-06, "loss": 0.1646, "step": 1798 }, { "epoch": 0.2797822706065319, "grad_norm": 1.0292790758688968, "learning_rate": 8.189754423970783e-06, "loss": 0.1294, "step": 1799 }, { "epoch": 0.27993779160186627, "grad_norm": 0.6900206697382273, "learning_rate": 8.18787280816717e-06, "loss": 0.1962, "step": 1800 }, { "epoch": 0.27993779160186627, "eval_loss": 0.1942623406648636, "eval_runtime": 9.4402, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.742, "step": 1800 }, { "epoch": 0.2800933125972006, "grad_norm": 1.225359903420926, "learning_rate": 8.18599043137386e-06, "loss": 0.1613, "step": 1801 }, { "epoch": 0.280248833592535, "grad_norm": 1.6844618005986576, "learning_rate": 8.184107294040204e-06, "loss": 0.2253, "step": 1802 }, { "epoch": 0.2804043545878694, "grad_norm": 1.0175001190789204, "learning_rate": 8.182223396615733e-06, "loss": 0.1912, "step": 1803 }, { "epoch": 0.2805598755832037, "grad_norm": 1.050408004024866, "learning_rate": 8.18033873955016e-06, "loss": 0.2061, "step": 1804 }, { "epoch": 0.2807153965785381, "grad_norm": 1.4763046668239692, "learning_rate": 8.178453323293378e-06, "loss": 0.2781, "step": 1805 }, { "epoch": 0.28087091757387245, "grad_norm": 0.8219546561822222, "learning_rate": 8.176567148295462e-06, "loss": 0.2129, "step": 1806 }, { "epoch": 0.28102643856920684, "grad_norm": 0.9534941567105831, "learning_rate": 8.174680215006671e-06, "loss": 0.1653, "step": 1807 }, { "epoch": 0.28118195956454123, "grad_norm": 1.0531235123680651, "learning_rate": 8.172792523877439e-06, "loss": 0.1384, "step": 1808 }, { "epoch": 0.28133748055987556, "grad_norm": 1.3227244850484494, "learning_rate": 8.170904075358386e-06, "loss": 0.1878, "step": 1809 }, { "epoch": 0.28149300155520995, "grad_norm": 0.8199812475506189, "learning_rate": 8.169014869900308e-06, "loss": 0.1583, "step": 1810 }, { "epoch": 0.28164852255054434, "grad_norm": 1.1873233496157647, "learning_rate": 8.167124907954188e-06, "loss": 0.1689, "step": 1811 }, { "epoch": 0.2818040435458787, "grad_norm": 1.332689389458692, "learning_rate": 8.165234189971188e-06, "loss": 0.1509, "step": 1812 }, { "epoch": 0.28195956454121307, "grad_norm": 1.4288659016319332, "learning_rate": 8.163342716402645e-06, "loss": 0.1862, "step": 1813 }, { "epoch": 0.28211508553654746, "grad_norm": 1.314918590717926, "learning_rate": 8.16145048770008e-06, "loss": 0.226, "step": 1814 }, { "epoch": 0.2822706065318818, "grad_norm": 0.9155638179955898, "learning_rate": 8.159557504315197e-06, "loss": 0.1929, "step": 1815 }, { "epoch": 0.2824261275272162, "grad_norm": 1.0431139463881003, "learning_rate": 8.157663766699875e-06, "loss": 0.1443, "step": 1816 }, { "epoch": 0.2825816485225505, "grad_norm": 1.3294250069242237, "learning_rate": 8.155769275306178e-06, "loss": 0.193, "step": 1817 }, { "epoch": 0.2827371695178849, "grad_norm": 0.9943106694297035, "learning_rate": 8.153874030586343e-06, "loss": 0.1421, "step": 1818 }, { "epoch": 0.2828926905132193, "grad_norm": 1.165982265832558, "learning_rate": 8.151978032992798e-06, "loss": 0.1739, "step": 1819 }, { "epoch": 0.28304821150855364, "grad_norm": 0.7428727580266941, "learning_rate": 8.150081282978139e-06, "loss": 0.1572, "step": 1820 }, { "epoch": 0.28320373250388803, "grad_norm": 1.3026564844558632, "learning_rate": 8.14818378099515e-06, "loss": 0.1805, "step": 1821 }, { "epoch": 0.2833592534992224, "grad_norm": 1.2645554368075294, "learning_rate": 8.146285527496789e-06, "loss": 0.1798, "step": 1822 }, { "epoch": 0.28351477449455675, "grad_norm": 1.456481044065325, "learning_rate": 8.144386522936195e-06, "loss": 0.1598, "step": 1823 }, { "epoch": 0.28367029548989114, "grad_norm": 1.1300906728090474, "learning_rate": 8.142486767766688e-06, "loss": 0.1648, "step": 1824 }, { "epoch": 0.2838258164852255, "grad_norm": 1.1388001178297193, "learning_rate": 8.140586262441767e-06, "loss": 0.2733, "step": 1825 }, { "epoch": 0.28398133748055987, "grad_norm": 0.7532300919484063, "learning_rate": 8.138685007415109e-06, "loss": 0.1213, "step": 1826 }, { "epoch": 0.28413685847589426, "grad_norm": 1.0796067807349936, "learning_rate": 8.136783003140568e-06, "loss": 0.2189, "step": 1827 }, { "epoch": 0.2842923794712286, "grad_norm": 1.331438012696905, "learning_rate": 8.134880250072179e-06, "loss": 0.1804, "step": 1828 }, { "epoch": 0.284447900466563, "grad_norm": 1.2091930191659346, "learning_rate": 8.13297674866416e-06, "loss": 0.2194, "step": 1829 }, { "epoch": 0.2846034214618974, "grad_norm": 1.0049073814467957, "learning_rate": 8.131072499370897e-06, "loss": 0.1333, "step": 1830 }, { "epoch": 0.2847589424572317, "grad_norm": 1.0223717163539678, "learning_rate": 8.129167502646966e-06, "loss": 0.1988, "step": 1831 }, { "epoch": 0.2849144634525661, "grad_norm": 1.4867747212307119, "learning_rate": 8.127261758947114e-06, "loss": 0.1467, "step": 1832 }, { "epoch": 0.2850699844479005, "grad_norm": 0.8173079980321136, "learning_rate": 8.125355268726266e-06, "loss": 0.1058, "step": 1833 }, { "epoch": 0.28522550544323483, "grad_norm": 1.570586484505542, "learning_rate": 8.123448032439534e-06, "loss": 0.2065, "step": 1834 }, { "epoch": 0.2853810264385692, "grad_norm": 1.5595299992669573, "learning_rate": 8.121540050542198e-06, "loss": 0.2193, "step": 1835 }, { "epoch": 0.28553654743390355, "grad_norm": 1.007755342730857, "learning_rate": 8.119631323489722e-06, "loss": 0.1371, "step": 1836 }, { "epoch": 0.28569206842923794, "grad_norm": 1.301433540358406, "learning_rate": 8.117721851737744e-06, "loss": 0.176, "step": 1837 }, { "epoch": 0.28584758942457233, "grad_norm": 0.8910831403501445, "learning_rate": 8.115811635742079e-06, "loss": 0.1626, "step": 1838 }, { "epoch": 0.28600311041990667, "grad_norm": 0.7467945070581918, "learning_rate": 8.113900675958728e-06, "loss": 0.1821, "step": 1839 }, { "epoch": 0.28615863141524106, "grad_norm": 1.1530448700016815, "learning_rate": 8.111988972843859e-06, "loss": 0.1923, "step": 1840 }, { "epoch": 0.28631415241057545, "grad_norm": 2.088923036862537, "learning_rate": 8.110076526853824e-06, "loss": 0.1206, "step": 1841 }, { "epoch": 0.2864696734059098, "grad_norm": 1.2835755029352423, "learning_rate": 8.108163338445152e-06, "loss": 0.2546, "step": 1842 }, { "epoch": 0.2866251944012442, "grad_norm": 0.8829913186503389, "learning_rate": 8.106249408074544e-06, "loss": 0.1445, "step": 1843 }, { "epoch": 0.2867807153965785, "grad_norm": 1.7629923458811358, "learning_rate": 8.104334736198887e-06, "loss": 0.1544, "step": 1844 }, { "epoch": 0.2869362363919129, "grad_norm": 0.9657174681831697, "learning_rate": 8.102419323275234e-06, "loss": 0.2351, "step": 1845 }, { "epoch": 0.2870917573872473, "grad_norm": 1.492589192357126, "learning_rate": 8.100503169760827e-06, "loss": 0.186, "step": 1846 }, { "epoch": 0.28724727838258163, "grad_norm": 1.1233971084394738, "learning_rate": 8.098586276113073e-06, "loss": 0.1946, "step": 1847 }, { "epoch": 0.287402799377916, "grad_norm": 0.8577653456028049, "learning_rate": 8.096668642789565e-06, "loss": 0.1633, "step": 1848 }, { "epoch": 0.2875583203732504, "grad_norm": 1.1536545920544707, "learning_rate": 8.094750270248065e-06, "loss": 0.1603, "step": 1849 }, { "epoch": 0.28771384136858474, "grad_norm": 0.9838814306399297, "learning_rate": 8.09283115894652e-06, "loss": 0.1623, "step": 1850 }, { "epoch": 0.28786936236391913, "grad_norm": 0.9601384951644616, "learning_rate": 8.090911309343045e-06, "loss": 0.1252, "step": 1851 }, { "epoch": 0.2880248833592535, "grad_norm": 0.9976176427201153, "learning_rate": 8.088990721895938e-06, "loss": 0.1815, "step": 1852 }, { "epoch": 0.28818040435458786, "grad_norm": 0.7583399424827217, "learning_rate": 8.087069397063666e-06, "loss": 0.141, "step": 1853 }, { "epoch": 0.28833592534992225, "grad_norm": 1.5185081715928586, "learning_rate": 8.085147335304879e-06, "loss": 0.1887, "step": 1854 }, { "epoch": 0.2884914463452566, "grad_norm": 1.3061310770751247, "learning_rate": 8.083224537078401e-06, "loss": 0.1451, "step": 1855 }, { "epoch": 0.288646967340591, "grad_norm": 1.7351614485797129, "learning_rate": 8.081301002843226e-06, "loss": 0.1264, "step": 1856 }, { "epoch": 0.28880248833592537, "grad_norm": 0.9943204442132273, "learning_rate": 8.079376733058532e-06, "loss": 0.1743, "step": 1857 }, { "epoch": 0.2889580093312597, "grad_norm": 1.1594102001358773, "learning_rate": 8.07745172818367e-06, "loss": 0.1607, "step": 1858 }, { "epoch": 0.2891135303265941, "grad_norm": 1.6350163238448654, "learning_rate": 8.075525988678163e-06, "loss": 0.1813, "step": 1859 }, { "epoch": 0.2892690513219285, "grad_norm": 1.083878957563236, "learning_rate": 8.073599515001713e-06, "loss": 0.1194, "step": 1860 }, { "epoch": 0.2894245723172628, "grad_norm": 0.8178116527073355, "learning_rate": 8.071672307614195e-06, "loss": 0.228, "step": 1861 }, { "epoch": 0.2895800933125972, "grad_norm": 1.1118324651261078, "learning_rate": 8.069744366975664e-06, "loss": 0.197, "step": 1862 }, { "epoch": 0.28973561430793154, "grad_norm": 1.149751561349185, "learning_rate": 8.06781569354634e-06, "loss": 0.269, "step": 1863 }, { "epoch": 0.28989113530326593, "grad_norm": 1.1618468357632399, "learning_rate": 8.06588628778663e-06, "loss": 0.1846, "step": 1864 }, { "epoch": 0.2900466562986003, "grad_norm": 1.3277875865938236, "learning_rate": 8.063956150157107e-06, "loss": 0.1273, "step": 1865 }, { "epoch": 0.29020217729393466, "grad_norm": 2.436613602568436, "learning_rate": 8.062025281118524e-06, "loss": 0.2442, "step": 1866 }, { "epoch": 0.29035769828926905, "grad_norm": 1.1958552059286012, "learning_rate": 8.060093681131804e-06, "loss": 0.1874, "step": 1867 }, { "epoch": 0.29051321928460344, "grad_norm": 0.8401669076143116, "learning_rate": 8.058161350658047e-06, "loss": 0.1901, "step": 1868 }, { "epoch": 0.2906687402799378, "grad_norm": 0.9487811357677395, "learning_rate": 8.056228290158528e-06, "loss": 0.1346, "step": 1869 }, { "epoch": 0.29082426127527217, "grad_norm": 0.957901732333534, "learning_rate": 8.054294500094697e-06, "loss": 0.1411, "step": 1870 }, { "epoch": 0.29097978227060656, "grad_norm": 1.292131532239805, "learning_rate": 8.052359980928172e-06, "loss": 0.1827, "step": 1871 }, { "epoch": 0.2911353032659409, "grad_norm": 0.9675272253773427, "learning_rate": 8.050424733120757e-06, "loss": 0.1738, "step": 1872 }, { "epoch": 0.2912908242612753, "grad_norm": 1.367184606419033, "learning_rate": 8.048488757134416e-06, "loss": 0.1787, "step": 1873 }, { "epoch": 0.2914463452566096, "grad_norm": 1.2673549853684765, "learning_rate": 8.046552053431298e-06, "loss": 0.2333, "step": 1874 }, { "epoch": 0.291601866251944, "grad_norm": 1.9351495105907597, "learning_rate": 8.044614622473717e-06, "loss": 0.1987, "step": 1875 }, { "epoch": 0.2917573872472784, "grad_norm": 0.8606527150680897, "learning_rate": 8.042676464724169e-06, "loss": 0.172, "step": 1876 }, { "epoch": 0.29191290824261273, "grad_norm": 1.4901933699318817, "learning_rate": 8.040737580645316e-06, "loss": 0.1735, "step": 1877 }, { "epoch": 0.2920684292379471, "grad_norm": 1.1691647071434712, "learning_rate": 8.038797970699998e-06, "loss": 0.2316, "step": 1878 }, { "epoch": 0.2922239502332815, "grad_norm": 1.240770117738676, "learning_rate": 8.036857635351226e-06, "loss": 0.1667, "step": 1879 }, { "epoch": 0.29237947122861585, "grad_norm": 1.0351622011955766, "learning_rate": 8.034916575062188e-06, "loss": 0.1405, "step": 1880 }, { "epoch": 0.29253499222395024, "grad_norm": 0.9153491401389935, "learning_rate": 8.032974790296239e-06, "loss": 0.1726, "step": 1881 }, { "epoch": 0.2926905132192846, "grad_norm": 1.4362209764019978, "learning_rate": 8.031032281516913e-06, "loss": 0.1827, "step": 1882 }, { "epoch": 0.29284603421461897, "grad_norm": 1.183605507206791, "learning_rate": 8.029089049187909e-06, "loss": 0.1883, "step": 1883 }, { "epoch": 0.29300155520995336, "grad_norm": 1.0539400115284923, "learning_rate": 8.02714509377311e-06, "loss": 0.1208, "step": 1884 }, { "epoch": 0.2931570762052877, "grad_norm": 1.0217425114195149, "learning_rate": 8.02520041573656e-06, "loss": 0.174, "step": 1885 }, { "epoch": 0.2933125972006221, "grad_norm": 1.0405110359742253, "learning_rate": 8.023255015542482e-06, "loss": 0.249, "step": 1886 }, { "epoch": 0.2934681181959565, "grad_norm": 0.9949747841829932, "learning_rate": 8.021308893655273e-06, "loss": 0.1861, "step": 1887 }, { "epoch": 0.2936236391912908, "grad_norm": 0.9631918396707634, "learning_rate": 8.019362050539497e-06, "loss": 0.22, "step": 1888 }, { "epoch": 0.2937791601866252, "grad_norm": 1.471400212660711, "learning_rate": 8.017414486659894e-06, "loss": 0.2831, "step": 1889 }, { "epoch": 0.2939346811819596, "grad_norm": 1.6502542476240603, "learning_rate": 8.015466202481371e-06, "loss": 0.1856, "step": 1890 }, { "epoch": 0.2940902021772939, "grad_norm": 1.0678255046461738, "learning_rate": 8.013517198469017e-06, "loss": 0.2714, "step": 1891 }, { "epoch": 0.2942457231726283, "grad_norm": 1.5419672646129527, "learning_rate": 8.01156747508808e-06, "loss": 0.2432, "step": 1892 }, { "epoch": 0.29440124416796265, "grad_norm": 1.691620262630438, "learning_rate": 8.009617032803989e-06, "loss": 0.2494, "step": 1893 }, { "epoch": 0.29455676516329704, "grad_norm": 1.0149866152436102, "learning_rate": 8.007665872082343e-06, "loss": 0.1446, "step": 1894 }, { "epoch": 0.29471228615863143, "grad_norm": 1.2593397067130077, "learning_rate": 8.005713993388908e-06, "loss": 0.1813, "step": 1895 }, { "epoch": 0.29486780715396577, "grad_norm": 1.751259190433369, "learning_rate": 8.003761397189629e-06, "loss": 0.3067, "step": 1896 }, { "epoch": 0.29502332814930016, "grad_norm": 1.0592944557403567, "learning_rate": 8.001808083950615e-06, "loss": 0.1774, "step": 1897 }, { "epoch": 0.29517884914463455, "grad_norm": 0.7601316574689209, "learning_rate": 7.999854054138148e-06, "loss": 0.1986, "step": 1898 }, { "epoch": 0.2953343701399689, "grad_norm": 1.0763633141744329, "learning_rate": 7.997899308218687e-06, "loss": 0.1693, "step": 1899 }, { "epoch": 0.2954898911353033, "grad_norm": 0.848192935949934, "learning_rate": 7.995943846658852e-06, "loss": 0.1785, "step": 1900 }, { "epoch": 0.2954898911353033, "eval_loss": 0.19579939544200897, "eval_runtime": 9.4258, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.743, "step": 1900 }, { "epoch": 0.29564541213063766, "grad_norm": 1.1366949640186217, "learning_rate": 7.99398766992544e-06, "loss": 0.3427, "step": 1901 }, { "epoch": 0.295800933125972, "grad_norm": 1.3011369731626548, "learning_rate": 7.99203077848542e-06, "loss": 0.128, "step": 1902 }, { "epoch": 0.2959564541213064, "grad_norm": 1.6239083693901217, "learning_rate": 7.990073172805927e-06, "loss": 0.2033, "step": 1903 }, { "epoch": 0.2961119751166407, "grad_norm": 2.136757506768007, "learning_rate": 7.98811485335427e-06, "loss": 0.8244, "step": 1904 }, { "epoch": 0.2962674961119751, "grad_norm": 1.4156103108687226, "learning_rate": 7.986155820597927e-06, "loss": 0.2266, "step": 1905 }, { "epoch": 0.2964230171073095, "grad_norm": 1.3059948518525273, "learning_rate": 7.984196075004547e-06, "loss": 0.1772, "step": 1906 }, { "epoch": 0.29657853810264384, "grad_norm": 1.1897397554067446, "learning_rate": 7.982235617041947e-06, "loss": 0.2153, "step": 1907 }, { "epoch": 0.29673405909797823, "grad_norm": 1.8814984942898336, "learning_rate": 7.980274447178116e-06, "loss": 0.163, "step": 1908 }, { "epoch": 0.2968895800933126, "grad_norm": 0.8490191091642275, "learning_rate": 7.978312565881212e-06, "loss": 0.1929, "step": 1909 }, { "epoch": 0.29704510108864696, "grad_norm": 1.0730207253151238, "learning_rate": 7.976349973619567e-06, "loss": 0.152, "step": 1910 }, { "epoch": 0.29720062208398135, "grad_norm": 1.0988494794101311, "learning_rate": 7.974386670861676e-06, "loss": 0.1796, "step": 1911 }, { "epoch": 0.2973561430793157, "grad_norm": 0.8890702707468837, "learning_rate": 7.972422658076206e-06, "loss": 0.1658, "step": 1912 }, { "epoch": 0.2975116640746501, "grad_norm": 1.5485447290305507, "learning_rate": 7.970457935731996e-06, "loss": 0.219, "step": 1913 }, { "epoch": 0.29766718506998446, "grad_norm": 1.1870158533528972, "learning_rate": 7.968492504298053e-06, "loss": 0.1678, "step": 1914 }, { "epoch": 0.2978227060653188, "grad_norm": 0.8791513734953905, "learning_rate": 7.966526364243553e-06, "loss": 0.1379, "step": 1915 }, { "epoch": 0.2979782270606532, "grad_norm": 1.1547532699065137, "learning_rate": 7.96455951603784e-06, "loss": 0.1578, "step": 1916 }, { "epoch": 0.2981337480559876, "grad_norm": 1.2343036137247707, "learning_rate": 7.962591960150426e-06, "loss": 0.167, "step": 1917 }, { "epoch": 0.2982892690513219, "grad_norm": 1.199679900214714, "learning_rate": 7.960623697051e-06, "loss": 0.2216, "step": 1918 }, { "epoch": 0.2984447900466563, "grad_norm": 0.8701547919093023, "learning_rate": 7.958654727209406e-06, "loss": 0.1334, "step": 1919 }, { "epoch": 0.2986003110419907, "grad_norm": 1.0186941275746395, "learning_rate": 7.956685051095672e-06, "loss": 0.1992, "step": 1920 }, { "epoch": 0.29875583203732503, "grad_norm": 1.677907044209659, "learning_rate": 7.954714669179981e-06, "loss": 0.2557, "step": 1921 }, { "epoch": 0.2989113530326594, "grad_norm": 1.0741276350489937, "learning_rate": 7.952743581932696e-06, "loss": 0.2228, "step": 1922 }, { "epoch": 0.29906687402799376, "grad_norm": 1.1370483443720154, "learning_rate": 7.950771789824341e-06, "loss": 0.1822, "step": 1923 }, { "epoch": 0.29922239502332815, "grad_norm": 1.4805485099895457, "learning_rate": 7.948799293325607e-06, "loss": 0.2066, "step": 1924 }, { "epoch": 0.29937791601866254, "grad_norm": 1.0841471732459598, "learning_rate": 7.946826092907362e-06, "loss": 0.2086, "step": 1925 }, { "epoch": 0.2995334370139969, "grad_norm": 0.9923801848699839, "learning_rate": 7.944852189040633e-06, "loss": 0.1457, "step": 1926 }, { "epoch": 0.29968895800933126, "grad_norm": 1.1826489754247185, "learning_rate": 7.942877582196618e-06, "loss": 0.1335, "step": 1927 }, { "epoch": 0.29984447900466565, "grad_norm": 1.0374422374980892, "learning_rate": 7.940902272846684e-06, "loss": 0.1747, "step": 1928 }, { "epoch": 0.3, "grad_norm": 0.9355242713051211, "learning_rate": 7.938926261462366e-06, "loss": 0.2035, "step": 1929 }, { "epoch": 0.3001555209953344, "grad_norm": 1.3452996964657524, "learning_rate": 7.936949548515364e-06, "loss": 0.2284, "step": 1930 }, { "epoch": 0.3003110419906687, "grad_norm": 0.7948433007606517, "learning_rate": 7.93497213447755e-06, "loss": 0.2051, "step": 1931 }, { "epoch": 0.3004665629860031, "grad_norm": 1.130699049423352, "learning_rate": 7.932994019820956e-06, "loss": 0.174, "step": 1932 }, { "epoch": 0.3006220839813375, "grad_norm": 4.331642107991714, "learning_rate": 7.931015205017788e-06, "loss": 0.2259, "step": 1933 }, { "epoch": 0.30077760497667183, "grad_norm": 1.5306684316210843, "learning_rate": 7.929035690540414e-06, "loss": 0.1917, "step": 1934 }, { "epoch": 0.3009331259720062, "grad_norm": 0.8871970028065491, "learning_rate": 7.927055476861376e-06, "loss": 0.1765, "step": 1935 }, { "epoch": 0.3010886469673406, "grad_norm": 0.9400711133682595, "learning_rate": 7.925074564453376e-06, "loss": 0.1824, "step": 1936 }, { "epoch": 0.30124416796267495, "grad_norm": 0.9734009328190283, "learning_rate": 7.923092953789287e-06, "loss": 0.1575, "step": 1937 }, { "epoch": 0.30139968895800934, "grad_norm": 1.1309704131602631, "learning_rate": 7.921110645342144e-06, "loss": 0.2438, "step": 1938 }, { "epoch": 0.30155520995334373, "grad_norm": 1.2491112218817273, "learning_rate": 7.919127639585153e-06, "loss": 0.2252, "step": 1939 }, { "epoch": 0.30171073094867806, "grad_norm": 0.9626959898568382, "learning_rate": 7.917143936991688e-06, "loss": 0.1416, "step": 1940 }, { "epoch": 0.30186625194401245, "grad_norm": 0.933932349728071, "learning_rate": 7.915159538035284e-06, "loss": 0.1924, "step": 1941 }, { "epoch": 0.3020217729393468, "grad_norm": 1.198922066826054, "learning_rate": 7.913174443189645e-06, "loss": 0.1918, "step": 1942 }, { "epoch": 0.3021772939346812, "grad_norm": 0.711619672728743, "learning_rate": 7.911188652928639e-06, "loss": 0.1322, "step": 1943 }, { "epoch": 0.30233281493001557, "grad_norm": 0.9224372120486194, "learning_rate": 7.909202167726306e-06, "loss": 0.1775, "step": 1944 }, { "epoch": 0.3024883359253499, "grad_norm": 1.3276511094955517, "learning_rate": 7.907214988056844e-06, "loss": 0.2187, "step": 1945 }, { "epoch": 0.3026438569206843, "grad_norm": 0.8655219464600901, "learning_rate": 7.905227114394623e-06, "loss": 0.1465, "step": 1946 }, { "epoch": 0.3027993779160187, "grad_norm": 0.995295145761775, "learning_rate": 7.903238547214173e-06, "loss": 0.2004, "step": 1947 }, { "epoch": 0.302954898911353, "grad_norm": 1.1948454763354273, "learning_rate": 7.901249286990196e-06, "loss": 0.1755, "step": 1948 }, { "epoch": 0.3031104199066874, "grad_norm": 0.8682961110627464, "learning_rate": 7.899259334197554e-06, "loss": 0.1999, "step": 1949 }, { "epoch": 0.30326594090202175, "grad_norm": 1.0906703142458485, "learning_rate": 7.897268689311278e-06, "loss": 0.1014, "step": 1950 }, { "epoch": 0.30342146189735614, "grad_norm": 1.2664526681944839, "learning_rate": 7.895277352806562e-06, "loss": 0.2251, "step": 1951 }, { "epoch": 0.30357698289269053, "grad_norm": 0.9627019771781781, "learning_rate": 7.893285325158766e-06, "loss": 0.1591, "step": 1952 }, { "epoch": 0.30373250388802486, "grad_norm": 1.9216322578695895, "learning_rate": 7.891292606843414e-06, "loss": 0.2066, "step": 1953 }, { "epoch": 0.30388802488335925, "grad_norm": 0.9086586156841527, "learning_rate": 7.889299198336197e-06, "loss": 0.2196, "step": 1954 }, { "epoch": 0.30404354587869364, "grad_norm": 1.4203649142405548, "learning_rate": 7.887305100112967e-06, "loss": 0.1804, "step": 1955 }, { "epoch": 0.304199066874028, "grad_norm": 1.2381428600296667, "learning_rate": 7.885310312649747e-06, "loss": 0.1434, "step": 1956 }, { "epoch": 0.30435458786936237, "grad_norm": 0.7952770821447226, "learning_rate": 7.883314836422717e-06, "loss": 0.1955, "step": 1957 }, { "epoch": 0.30451010886469676, "grad_norm": 0.998488522800322, "learning_rate": 7.881318671908228e-06, "loss": 0.2239, "step": 1958 }, { "epoch": 0.3046656298600311, "grad_norm": 1.0829580403987296, "learning_rate": 7.879321819582788e-06, "loss": 0.2401, "step": 1959 }, { "epoch": 0.3048211508553655, "grad_norm": 1.043363355464928, "learning_rate": 7.877324279923078e-06, "loss": 0.1821, "step": 1960 }, { "epoch": 0.3049766718506998, "grad_norm": 1.7533270649933215, "learning_rate": 7.875326053405936e-06, "loss": 0.2513, "step": 1961 }, { "epoch": 0.3051321928460342, "grad_norm": 1.3436274607263432, "learning_rate": 7.873327140508367e-06, "loss": 0.2352, "step": 1962 }, { "epoch": 0.3052877138413686, "grad_norm": 2.0633364771352274, "learning_rate": 7.87132754170754e-06, "loss": 0.2125, "step": 1963 }, { "epoch": 0.30544323483670294, "grad_norm": 0.9097966158633792, "learning_rate": 7.869327257480787e-06, "loss": 0.1627, "step": 1964 }, { "epoch": 0.3055987558320373, "grad_norm": 1.8317277834761483, "learning_rate": 7.867326288305603e-06, "loss": 0.211, "step": 1965 }, { "epoch": 0.3057542768273717, "grad_norm": 1.1448361049962872, "learning_rate": 7.865324634659647e-06, "loss": 0.1683, "step": 1966 }, { "epoch": 0.30590979782270605, "grad_norm": 1.14865744697956, "learning_rate": 7.863322297020743e-06, "loss": 0.2238, "step": 1967 }, { "epoch": 0.30606531881804044, "grad_norm": 1.0967845312311937, "learning_rate": 7.861319275866877e-06, "loss": 0.1889, "step": 1968 }, { "epoch": 0.3062208398133748, "grad_norm": 1.2461473684464468, "learning_rate": 7.859315571676198e-06, "loss": 0.2138, "step": 1969 }, { "epoch": 0.30637636080870917, "grad_norm": 1.1992165952645324, "learning_rate": 7.857311184927015e-06, "loss": 0.2289, "step": 1970 }, { "epoch": 0.30653188180404356, "grad_norm": 0.9734656478980178, "learning_rate": 7.855306116097807e-06, "loss": 0.1798, "step": 1971 }, { "epoch": 0.3066874027993779, "grad_norm": 0.8576094794110676, "learning_rate": 7.853300365667211e-06, "loss": 0.1849, "step": 1972 }, { "epoch": 0.3068429237947123, "grad_norm": 0.9320489557446329, "learning_rate": 7.851293934114026e-06, "loss": 0.1663, "step": 1973 }, { "epoch": 0.3069984447900467, "grad_norm": 1.5628965027384294, "learning_rate": 7.849286821917217e-06, "loss": 0.2741, "step": 1974 }, { "epoch": 0.307153965785381, "grad_norm": 1.1064029390023975, "learning_rate": 7.847279029555908e-06, "loss": 0.1655, "step": 1975 }, { "epoch": 0.3073094867807154, "grad_norm": 1.1272492512254035, "learning_rate": 7.845270557509389e-06, "loss": 0.1473, "step": 1976 }, { "epoch": 0.3074650077760498, "grad_norm": 0.8321910160414181, "learning_rate": 7.843261406257108e-06, "loss": 0.1571, "step": 1977 }, { "epoch": 0.3076205287713841, "grad_norm": 0.9606345664210296, "learning_rate": 7.841251576278681e-06, "loss": 0.227, "step": 1978 }, { "epoch": 0.3077760497667185, "grad_norm": 1.0695344914586096, "learning_rate": 7.839241068053878e-06, "loss": 0.1616, "step": 1979 }, { "epoch": 0.30793157076205285, "grad_norm": 2.415757365845339, "learning_rate": 7.837229882062638e-06, "loss": 0.2091, "step": 1980 }, { "epoch": 0.30808709175738724, "grad_norm": 0.8945861519999385, "learning_rate": 7.83521801878506e-06, "loss": 0.1769, "step": 1981 }, { "epoch": 0.30824261275272163, "grad_norm": 1.1779736396630833, "learning_rate": 7.8332054787014e-06, "loss": 0.2311, "step": 1982 }, { "epoch": 0.30839813374805597, "grad_norm": 1.372493149836755, "learning_rate": 7.831192262292082e-06, "loss": 0.172, "step": 1983 }, { "epoch": 0.30855365474339036, "grad_norm": 2.4487535069237407, "learning_rate": 7.82917837003769e-06, "loss": 0.1395, "step": 1984 }, { "epoch": 0.30870917573872475, "grad_norm": 0.871516091971395, "learning_rate": 7.827163802418967e-06, "loss": 0.1437, "step": 1985 }, { "epoch": 0.3088646967340591, "grad_norm": 1.2731269701284036, "learning_rate": 7.825148559916817e-06, "loss": 0.1857, "step": 1986 }, { "epoch": 0.3090202177293935, "grad_norm": 0.9768725926218434, "learning_rate": 7.823132643012308e-06, "loss": 0.195, "step": 1987 }, { "epoch": 0.3091757387247278, "grad_norm": 0.9369720572131188, "learning_rate": 7.821116052186668e-06, "loss": 0.2034, "step": 1988 }, { "epoch": 0.3093312597200622, "grad_norm": 1.1315495162839369, "learning_rate": 7.819098787921283e-06, "loss": 0.1755, "step": 1989 }, { "epoch": 0.3094867807153966, "grad_norm": 1.0958800584806985, "learning_rate": 7.817080850697705e-06, "loss": 0.2575, "step": 1990 }, { "epoch": 0.3096423017107309, "grad_norm": 0.9336006846477578, "learning_rate": 7.815062240997642e-06, "loss": 0.1376, "step": 1991 }, { "epoch": 0.3097978227060653, "grad_norm": 0.9121065280126879, "learning_rate": 7.813042959302963e-06, "loss": 0.1212, "step": 1992 }, { "epoch": 0.3099533437013997, "grad_norm": 0.6936258475052076, "learning_rate": 7.811023006095703e-06, "loss": 0.13, "step": 1993 }, { "epoch": 0.31010886469673404, "grad_norm": 1.278051470184625, "learning_rate": 7.809002381858048e-06, "loss": 0.1686, "step": 1994 }, { "epoch": 0.31026438569206843, "grad_norm": 1.2807241898257353, "learning_rate": 7.806981087072354e-06, "loss": 0.2569, "step": 1995 }, { "epoch": 0.3104199066874028, "grad_norm": 1.6449581085415006, "learning_rate": 7.804959122221127e-06, "loss": 0.3075, "step": 1996 }, { "epoch": 0.31057542768273716, "grad_norm": 0.9051580549498448, "learning_rate": 7.802936487787045e-06, "loss": 0.1603, "step": 1997 }, { "epoch": 0.31073094867807155, "grad_norm": 1.5451818475345835, "learning_rate": 7.800913184252931e-06, "loss": 0.2057, "step": 1998 }, { "epoch": 0.3108864696734059, "grad_norm": 1.0935081143315897, "learning_rate": 7.79888921210178e-06, "loss": 0.2238, "step": 1999 }, { "epoch": 0.3110419906687403, "grad_norm": 1.262020237993972, "learning_rate": 7.796864571816745e-06, "loss": 0.1977, "step": 2000 }, { "epoch": 0.3110419906687403, "eval_loss": 0.19129334390163422, "eval_runtime": 9.4405, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 2000 }, { "epoch": 0.31119751166407467, "grad_norm": 1.3134456921918323, "learning_rate": 7.794839263881131e-06, "loss": 0.143, "step": 2001 }, { "epoch": 0.311353032659409, "grad_norm": 1.2295835408014146, "learning_rate": 7.79281328877841e-06, "loss": 0.1953, "step": 2002 }, { "epoch": 0.3115085536547434, "grad_norm": 1.1393217633484007, "learning_rate": 7.79078664699221e-06, "loss": 0.1441, "step": 2003 }, { "epoch": 0.3116640746500778, "grad_norm": 5.137337755359654, "learning_rate": 7.78875933900632e-06, "loss": 0.2155, "step": 2004 }, { "epoch": 0.3118195956454121, "grad_norm": 0.7672729071094779, "learning_rate": 7.786731365304682e-06, "loss": 0.1688, "step": 2005 }, { "epoch": 0.3119751166407465, "grad_norm": 1.11844522114181, "learning_rate": 7.784702726371407e-06, "loss": 0.1845, "step": 2006 }, { "epoch": 0.31213063763608084, "grad_norm": 1.1400197526152083, "learning_rate": 7.782673422690756e-06, "loss": 0.2674, "step": 2007 }, { "epoch": 0.31228615863141523, "grad_norm": 0.9932326761105157, "learning_rate": 7.780643454747155e-06, "loss": 0.1223, "step": 2008 }, { "epoch": 0.3124416796267496, "grad_norm": 0.7783882471190926, "learning_rate": 7.77861282302518e-06, "loss": 0.2237, "step": 2009 }, { "epoch": 0.31259720062208396, "grad_norm": 0.9845419734444856, "learning_rate": 7.77658152800958e-06, "loss": 0.1564, "step": 2010 }, { "epoch": 0.31275272161741835, "grad_norm": 1.3175867234826637, "learning_rate": 7.774549570185245e-06, "loss": 0.2137, "step": 2011 }, { "epoch": 0.31290824261275274, "grad_norm": 0.9913627268827314, "learning_rate": 7.772516950037237e-06, "loss": 0.2103, "step": 2012 }, { "epoch": 0.3130637636080871, "grad_norm": 0.9937133972212141, "learning_rate": 7.770483668050769e-06, "loss": 0.1813, "step": 2013 }, { "epoch": 0.31321928460342147, "grad_norm": 1.0381063132239563, "learning_rate": 7.76844972471121e-06, "loss": 0.1605, "step": 2014 }, { "epoch": 0.31337480559875586, "grad_norm": 1.487117370348742, "learning_rate": 7.766415120504098e-06, "loss": 0.2061, "step": 2015 }, { "epoch": 0.3135303265940902, "grad_norm": 1.4338120488261108, "learning_rate": 7.764379855915118e-06, "loss": 0.2516, "step": 2016 }, { "epoch": 0.3136858475894246, "grad_norm": 1.3445400702119068, "learning_rate": 7.762343931430114e-06, "loss": 0.2473, "step": 2017 }, { "epoch": 0.3138413685847589, "grad_norm": 1.0901091093318087, "learning_rate": 7.760307347535092e-06, "loss": 0.1862, "step": 2018 }, { "epoch": 0.3139968895800933, "grad_norm": 0.680181073536634, "learning_rate": 7.758270104716213e-06, "loss": 0.1371, "step": 2019 }, { "epoch": 0.3141524105754277, "grad_norm": 1.3891649724133557, "learning_rate": 7.756232203459794e-06, "loss": 0.1634, "step": 2020 }, { "epoch": 0.31430793157076203, "grad_norm": 1.0718773684948228, "learning_rate": 7.754193644252311e-06, "loss": 0.1783, "step": 2021 }, { "epoch": 0.3144634525660964, "grad_norm": 0.8573281023796301, "learning_rate": 7.752154427580396e-06, "loss": 0.1675, "step": 2022 }, { "epoch": 0.3146189735614308, "grad_norm": 0.9599002670732969, "learning_rate": 7.75011455393084e-06, "loss": 0.1311, "step": 2023 }, { "epoch": 0.31477449455676515, "grad_norm": 0.9885849056405935, "learning_rate": 7.748074023790589e-06, "loss": 0.1803, "step": 2024 }, { "epoch": 0.31493001555209954, "grad_norm": 1.131479751488905, "learning_rate": 7.746032837646742e-06, "loss": 0.1598, "step": 2025 }, { "epoch": 0.31508553654743393, "grad_norm": 1.542582763177572, "learning_rate": 7.743990995986566e-06, "loss": 0.2195, "step": 2026 }, { "epoch": 0.31524105754276827, "grad_norm": 1.0386776896260557, "learning_rate": 7.741948499297471e-06, "loss": 0.1844, "step": 2027 }, { "epoch": 0.31539657853810266, "grad_norm": 1.382852691050598, "learning_rate": 7.739905348067031e-06, "loss": 0.1892, "step": 2028 }, { "epoch": 0.315552099533437, "grad_norm": 1.034998336837451, "learning_rate": 7.737861542782976e-06, "loss": 0.096, "step": 2029 }, { "epoch": 0.3157076205287714, "grad_norm": 1.4309092181034129, "learning_rate": 7.735817083933189e-06, "loss": 0.173, "step": 2030 }, { "epoch": 0.3158631415241058, "grad_norm": 1.3318886616939016, "learning_rate": 7.733771972005712e-06, "loss": 0.164, "step": 2031 }, { "epoch": 0.3160186625194401, "grad_norm": 1.193059681608616, "learning_rate": 7.73172620748874e-06, "loss": 0.2412, "step": 2032 }, { "epoch": 0.3161741835147745, "grad_norm": 1.1304778822102943, "learning_rate": 7.72967979087063e-06, "loss": 0.1746, "step": 2033 }, { "epoch": 0.3163297045101089, "grad_norm": 0.9875583497111531, "learning_rate": 7.727632722639885e-06, "loss": 0.2178, "step": 2034 }, { "epoch": 0.3164852255054432, "grad_norm": 1.0893402135397499, "learning_rate": 7.725585003285175e-06, "loss": 0.1363, "step": 2035 }, { "epoch": 0.3166407465007776, "grad_norm": 0.9888506226054471, "learning_rate": 7.72353663329531e-06, "loss": 0.1584, "step": 2036 }, { "epoch": 0.31679626749611195, "grad_norm": 0.8763849994189918, "learning_rate": 7.721487613159273e-06, "loss": 0.1581, "step": 2037 }, { "epoch": 0.31695178849144634, "grad_norm": 0.8846053442056078, "learning_rate": 7.719437943366188e-06, "loss": 0.2065, "step": 2038 }, { "epoch": 0.31710730948678073, "grad_norm": 1.138445525985918, "learning_rate": 7.717387624405343e-06, "loss": 0.1173, "step": 2039 }, { "epoch": 0.31726283048211507, "grad_norm": 1.1351835752922899, "learning_rate": 7.715336656766176e-06, "loss": 0.212, "step": 2040 }, { "epoch": 0.31741835147744946, "grad_norm": 0.8950041732228967, "learning_rate": 7.713285040938283e-06, "loss": 0.1558, "step": 2041 }, { "epoch": 0.31757387247278385, "grad_norm": 0.828983941342685, "learning_rate": 7.711232777411412e-06, "loss": 0.2135, "step": 2042 }, { "epoch": 0.3177293934681182, "grad_norm": 1.363496489885953, "learning_rate": 7.709179866675468e-06, "loss": 0.2062, "step": 2043 }, { "epoch": 0.3178849144634526, "grad_norm": 0.9630039501090878, "learning_rate": 7.70712630922051e-06, "loss": 0.1726, "step": 2044 }, { "epoch": 0.31804043545878696, "grad_norm": 1.3734792623339027, "learning_rate": 7.705072105536748e-06, "loss": 0.1521, "step": 2045 }, { "epoch": 0.3181959564541213, "grad_norm": 1.0343426454154478, "learning_rate": 7.703017256114554e-06, "loss": 0.1486, "step": 2046 }, { "epoch": 0.3183514774494557, "grad_norm": 1.7457716792384999, "learning_rate": 7.700961761444443e-06, "loss": 0.1373, "step": 2047 }, { "epoch": 0.31850699844479, "grad_norm": 1.0641887213884955, "learning_rate": 7.698905622017095e-06, "loss": 0.1962, "step": 2048 }, { "epoch": 0.3186625194401244, "grad_norm": 1.0256448516382406, "learning_rate": 7.696848838323335e-06, "loss": 0.2133, "step": 2049 }, { "epoch": 0.3188180404354588, "grad_norm": 0.9554447787453604, "learning_rate": 7.69479141085415e-06, "loss": 0.1449, "step": 2050 }, { "epoch": 0.31897356143079314, "grad_norm": 0.9133607174662264, "learning_rate": 7.692733340100676e-06, "loss": 0.1709, "step": 2051 }, { "epoch": 0.31912908242612753, "grad_norm": 0.958921056182501, "learning_rate": 7.690674626554203e-06, "loss": 0.1738, "step": 2052 }, { "epoch": 0.3192846034214619, "grad_norm": 1.0497602520910723, "learning_rate": 7.68861527070617e-06, "loss": 0.1577, "step": 2053 }, { "epoch": 0.31944012441679626, "grad_norm": 0.8515430338148767, "learning_rate": 7.686555273048181e-06, "loss": 0.1839, "step": 2054 }, { "epoch": 0.31959564541213065, "grad_norm": 0.9444440620676442, "learning_rate": 7.684494634071982e-06, "loss": 0.2211, "step": 2055 }, { "epoch": 0.319751166407465, "grad_norm": 1.083714885338235, "learning_rate": 7.682433354269478e-06, "loss": 0.1715, "step": 2056 }, { "epoch": 0.3199066874027994, "grad_norm": 1.3253706780913534, "learning_rate": 7.680371434132723e-06, "loss": 0.1738, "step": 2057 }, { "epoch": 0.32006220839813376, "grad_norm": 1.0817009895803502, "learning_rate": 7.678308874153928e-06, "loss": 0.1986, "step": 2058 }, { "epoch": 0.3202177293934681, "grad_norm": 0.8727007616705321, "learning_rate": 7.676245674825456e-06, "loss": 0.2261, "step": 2059 }, { "epoch": 0.3203732503888025, "grad_norm": 0.8876827719130287, "learning_rate": 7.674181836639819e-06, "loss": 0.1324, "step": 2060 }, { "epoch": 0.3205287713841369, "grad_norm": 1.7833075283132949, "learning_rate": 7.672117360089683e-06, "loss": 0.2328, "step": 2061 }, { "epoch": 0.3206842923794712, "grad_norm": 1.3867630131733413, "learning_rate": 7.670052245667871e-06, "loss": 0.1557, "step": 2062 }, { "epoch": 0.3208398133748056, "grad_norm": 1.17207131523591, "learning_rate": 7.667986493867354e-06, "loss": 0.2388, "step": 2063 }, { "epoch": 0.32099533437014, "grad_norm": 0.9924976057775378, "learning_rate": 7.665920105181253e-06, "loss": 0.1801, "step": 2064 }, { "epoch": 0.32115085536547433, "grad_norm": 1.3125939259187236, "learning_rate": 7.663853080102845e-06, "loss": 0.1918, "step": 2065 }, { "epoch": 0.3213063763608087, "grad_norm": 1.1144289297016323, "learning_rate": 7.661785419125559e-06, "loss": 0.1812, "step": 2066 }, { "epoch": 0.32146189735614306, "grad_norm": 0.8877711872758638, "learning_rate": 7.659717122742974e-06, "loss": 0.1331, "step": 2067 }, { "epoch": 0.32161741835147745, "grad_norm": 0.8697887146941115, "learning_rate": 7.657648191448818e-06, "loss": 0.1477, "step": 2068 }, { "epoch": 0.32177293934681184, "grad_norm": 1.3945025846070924, "learning_rate": 7.655578625736979e-06, "loss": 0.2481, "step": 2069 }, { "epoch": 0.32192846034214617, "grad_norm": 0.960963808874566, "learning_rate": 7.653508426101488e-06, "loss": 0.2027, "step": 2070 }, { "epoch": 0.32208398133748056, "grad_norm": 0.9298160918953824, "learning_rate": 7.65143759303653e-06, "loss": 0.1154, "step": 2071 }, { "epoch": 0.32223950233281495, "grad_norm": 1.3020366553838805, "learning_rate": 7.649366127036445e-06, "loss": 0.2141, "step": 2072 }, { "epoch": 0.3223950233281493, "grad_norm": 1.2722625411547916, "learning_rate": 7.647294028595718e-06, "loss": 0.1969, "step": 2073 }, { "epoch": 0.3225505443234837, "grad_norm": 1.1411275225074438, "learning_rate": 7.64522129820899e-06, "loss": 0.2009, "step": 2074 }, { "epoch": 0.322706065318818, "grad_norm": 0.8581369791922125, "learning_rate": 7.643147936371047e-06, "loss": 0.1385, "step": 2075 }, { "epoch": 0.3228615863141524, "grad_norm": 0.9293347565265316, "learning_rate": 7.641073943576832e-06, "loss": 0.1218, "step": 2076 }, { "epoch": 0.3230171073094868, "grad_norm": 0.7190161056722457, "learning_rate": 7.638999320321436e-06, "loss": 0.156, "step": 2077 }, { "epoch": 0.32317262830482113, "grad_norm": 1.2700953858554853, "learning_rate": 7.6369240671001e-06, "loss": 0.2708, "step": 2078 }, { "epoch": 0.3233281493001555, "grad_norm": 1.0223126056778697, "learning_rate": 7.634848184408215e-06, "loss": 0.2271, "step": 2079 }, { "epoch": 0.3234836702954899, "grad_norm": 1.1258562524361366, "learning_rate": 7.632771672741326e-06, "loss": 0.1824, "step": 2080 }, { "epoch": 0.32363919129082425, "grad_norm": 1.0963356442256762, "learning_rate": 7.630694532595122e-06, "loss": 0.1303, "step": 2081 }, { "epoch": 0.32379471228615864, "grad_norm": 1.2021316878758006, "learning_rate": 7.6286167644654475e-06, "loss": 0.209, "step": 2082 }, { "epoch": 0.323950233281493, "grad_norm": 0.910371720521649, "learning_rate": 7.626538368848294e-06, "loss": 0.1469, "step": 2083 }, { "epoch": 0.32410575427682736, "grad_norm": 0.7362934077155303, "learning_rate": 7.6244593462398045e-06, "loss": 0.1639, "step": 2084 }, { "epoch": 0.32426127527216175, "grad_norm": 0.7771748937783678, "learning_rate": 7.6223796971362685e-06, "loss": 0.1339, "step": 2085 }, { "epoch": 0.3244167962674961, "grad_norm": 0.8667288190298778, "learning_rate": 7.620299422034127e-06, "loss": 0.1462, "step": 2086 }, { "epoch": 0.3245723172628305, "grad_norm": 1.3684227258619721, "learning_rate": 7.618218521429974e-06, "loss": 0.1927, "step": 2087 }, { "epoch": 0.32472783825816487, "grad_norm": 1.3207143934999601, "learning_rate": 7.6161369958205465e-06, "loss": 0.238, "step": 2088 }, { "epoch": 0.3248833592534992, "grad_norm": 1.0453470094718333, "learning_rate": 7.614054845702737e-06, "loss": 0.2253, "step": 2089 }, { "epoch": 0.3250388802488336, "grad_norm": 1.1830610441091318, "learning_rate": 7.611972071573579e-06, "loss": 0.2584, "step": 2090 }, { "epoch": 0.325194401244168, "grad_norm": 0.8421373169861435, "learning_rate": 7.609888673930264e-06, "loss": 0.1986, "step": 2091 }, { "epoch": 0.3253499222395023, "grad_norm": 0.8394651153042954, "learning_rate": 7.607804653270126e-06, "loss": 0.1544, "step": 2092 }, { "epoch": 0.3255054432348367, "grad_norm": 1.3872743106387866, "learning_rate": 7.605720010090649e-06, "loss": 0.1896, "step": 2093 }, { "epoch": 0.32566096423017105, "grad_norm": 0.7036222346548416, "learning_rate": 7.6036347448894695e-06, "loss": 0.1614, "step": 2094 }, { "epoch": 0.32581648522550544, "grad_norm": 1.302813252635676, "learning_rate": 7.601548858164366e-06, "loss": 0.2329, "step": 2095 }, { "epoch": 0.3259720062208398, "grad_norm": 1.3519215074415778, "learning_rate": 7.599462350413271e-06, "loss": 0.189, "step": 2096 }, { "epoch": 0.32612752721617416, "grad_norm": 1.0894473178878574, "learning_rate": 7.597375222134261e-06, "loss": 0.1794, "step": 2097 }, { "epoch": 0.32628304821150855, "grad_norm": 0.9113443512184399, "learning_rate": 7.595287473825565e-06, "loss": 0.2056, "step": 2098 }, { "epoch": 0.32643856920684294, "grad_norm": 1.0974779516974789, "learning_rate": 7.593199105985556e-06, "loss": 0.1518, "step": 2099 }, { "epoch": 0.3265940902021773, "grad_norm": 1.6493643365933612, "learning_rate": 7.591110119112757e-06, "loss": 0.1919, "step": 2100 }, { "epoch": 0.3265940902021773, "eval_loss": 0.1888900250196457, "eval_runtime": 9.4298, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 2100 }, { "epoch": 0.32674961119751167, "grad_norm": 0.9817443292018961, "learning_rate": 7.589020513705838e-06, "loss": 0.1204, "step": 2101 }, { "epoch": 0.32690513219284606, "grad_norm": 1.1406523620504585, "learning_rate": 7.586930290263617e-06, "loss": 0.1493, "step": 2102 }, { "epoch": 0.3270606531881804, "grad_norm": 1.1209394447287175, "learning_rate": 7.5848394492850605e-06, "loss": 0.1664, "step": 2103 }, { "epoch": 0.3272161741835148, "grad_norm": 0.9723114609827509, "learning_rate": 7.5827479912692815e-06, "loss": 0.195, "step": 2104 }, { "epoch": 0.3273716951788491, "grad_norm": 1.7309766831211526, "learning_rate": 7.580655916715537e-06, "loss": 0.2514, "step": 2105 }, { "epoch": 0.3275272161741835, "grad_norm": 1.184256886960677, "learning_rate": 7.578563226123238e-06, "loss": 0.1884, "step": 2106 }, { "epoch": 0.3276827371695179, "grad_norm": 1.0677571779260098, "learning_rate": 7.5764699199919375e-06, "loss": 0.1612, "step": 2107 }, { "epoch": 0.32783825816485224, "grad_norm": 1.0289147127220388, "learning_rate": 7.574375998821338e-06, "loss": 0.1294, "step": 2108 }, { "epoch": 0.3279937791601866, "grad_norm": 1.6824502489722701, "learning_rate": 7.572281463111284e-06, "loss": 0.2543, "step": 2109 }, { "epoch": 0.328149300155521, "grad_norm": 0.8827695694214756, "learning_rate": 7.5701863133617735e-06, "loss": 0.152, "step": 2110 }, { "epoch": 0.32830482115085535, "grad_norm": 0.9162295653798136, "learning_rate": 7.568090550072951e-06, "loss": 0.1585, "step": 2111 }, { "epoch": 0.32846034214618974, "grad_norm": 1.151519309498341, "learning_rate": 7.565994173745097e-06, "loss": 0.1942, "step": 2112 }, { "epoch": 0.3286158631415241, "grad_norm": 0.747495249604077, "learning_rate": 7.563897184878652e-06, "loss": 0.1778, "step": 2113 }, { "epoch": 0.32877138413685847, "grad_norm": 1.3710150078388301, "learning_rate": 7.561799583974193e-06, "loss": 0.2153, "step": 2114 }, { "epoch": 0.32892690513219286, "grad_norm": 1.436590959329486, "learning_rate": 7.559701371532449e-06, "loss": 0.1693, "step": 2115 }, { "epoch": 0.3290824261275272, "grad_norm": 1.1037998612389397, "learning_rate": 7.55760254805429e-06, "loss": 0.142, "step": 2116 }, { "epoch": 0.3292379471228616, "grad_norm": 1.1388616177937485, "learning_rate": 7.555503114040736e-06, "loss": 0.177, "step": 2117 }, { "epoch": 0.329393468118196, "grad_norm": 1.0085851146476574, "learning_rate": 7.553403069992951e-06, "loss": 0.167, "step": 2118 }, { "epoch": 0.3295489891135303, "grad_norm": 0.7640506337727971, "learning_rate": 7.551302416412245e-06, "loss": 0.1813, "step": 2119 }, { "epoch": 0.3297045101088647, "grad_norm": 1.3516550652642587, "learning_rate": 7.549201153800073e-06, "loss": 0.2007, "step": 2120 }, { "epoch": 0.3298600311041991, "grad_norm": 1.2554174354825145, "learning_rate": 7.547099282658036e-06, "loss": 0.2242, "step": 2121 }, { "epoch": 0.3300155520995334, "grad_norm": 0.989817572044348, "learning_rate": 7.544996803487878e-06, "loss": 0.1452, "step": 2122 }, { "epoch": 0.3301710730948678, "grad_norm": 7.083500120073432, "learning_rate": 7.542893716791494e-06, "loss": 0.3039, "step": 2123 }, { "epoch": 0.33032659409020215, "grad_norm": 1.1145564068392033, "learning_rate": 7.5407900230709185e-06, "loss": 0.2036, "step": 2124 }, { "epoch": 0.33048211508553654, "grad_norm": 1.1339214195029401, "learning_rate": 7.53868572282833e-06, "loss": 0.176, "step": 2125 }, { "epoch": 0.33063763608087093, "grad_norm": 0.9462077327312849, "learning_rate": 7.536580816566055e-06, "loss": 0.1964, "step": 2126 }, { "epoch": 0.33079315707620527, "grad_norm": 1.546296219545882, "learning_rate": 7.534475304786568e-06, "loss": 0.2047, "step": 2127 }, { "epoch": 0.33094867807153966, "grad_norm": 0.8906205369453303, "learning_rate": 7.532369187992481e-06, "loss": 0.1672, "step": 2128 }, { "epoch": 0.33110419906687405, "grad_norm": 1.1383551903848197, "learning_rate": 7.530262466686553e-06, "loss": 0.2268, "step": 2129 }, { "epoch": 0.3312597200622084, "grad_norm": 1.1935715251242796, "learning_rate": 7.528155141371688e-06, "loss": 0.1522, "step": 2130 }, { "epoch": 0.3314152410575428, "grad_norm": 0.8221199515559443, "learning_rate": 7.526047212550934e-06, "loss": 0.2212, "step": 2131 }, { "epoch": 0.33157076205287717, "grad_norm": 1.6760059705752035, "learning_rate": 7.523938680727485e-06, "loss": 0.2116, "step": 2132 }, { "epoch": 0.3317262830482115, "grad_norm": 1.339374321069871, "learning_rate": 7.521829546404675e-06, "loss": 0.2915, "step": 2133 }, { "epoch": 0.3318818040435459, "grad_norm": 1.390671507615236, "learning_rate": 7.519719810085982e-06, "loss": 0.2087, "step": 2134 }, { "epoch": 0.3320373250388802, "grad_norm": 0.9055268963428281, "learning_rate": 7.5176094722750344e-06, "loss": 0.1473, "step": 2135 }, { "epoch": 0.3321928460342146, "grad_norm": 1.1251785698614165, "learning_rate": 7.515498533475596e-06, "loss": 0.191, "step": 2136 }, { "epoch": 0.332348367029549, "grad_norm": 1.0015543560697442, "learning_rate": 7.513386994191577e-06, "loss": 0.2046, "step": 2137 }, { "epoch": 0.33250388802488334, "grad_norm": 1.201818868982661, "learning_rate": 7.511274854927032e-06, "loss": 0.1973, "step": 2138 }, { "epoch": 0.33265940902021773, "grad_norm": 1.2671593073838994, "learning_rate": 7.509162116186159e-06, "loss": 0.2002, "step": 2139 }, { "epoch": 0.3328149300155521, "grad_norm": 1.4384454615337903, "learning_rate": 7.507048778473296e-06, "loss": 0.1719, "step": 2140 }, { "epoch": 0.33297045101088646, "grad_norm": 1.0871674451856448, "learning_rate": 7.50493484229293e-06, "loss": 0.1371, "step": 2141 }, { "epoch": 0.33312597200622085, "grad_norm": 1.0548882795218661, "learning_rate": 7.502820308149681e-06, "loss": 0.1871, "step": 2142 }, { "epoch": 0.3332814930015552, "grad_norm": 0.8035702822717011, "learning_rate": 7.500705176548324e-06, "loss": 0.2055, "step": 2143 }, { "epoch": 0.3334370139968896, "grad_norm": 1.0934386709394424, "learning_rate": 7.498589447993768e-06, "loss": 0.2272, "step": 2144 }, { "epoch": 0.33359253499222397, "grad_norm": 0.8158439300117843, "learning_rate": 7.496473122991066e-06, "loss": 0.1698, "step": 2145 }, { "epoch": 0.3337480559875583, "grad_norm": 1.1378254150964608, "learning_rate": 7.4943562020454144e-06, "loss": 0.1742, "step": 2146 }, { "epoch": 0.3339035769828927, "grad_norm": 1.0011260204739405, "learning_rate": 7.492238685662153e-06, "loss": 0.1581, "step": 2147 }, { "epoch": 0.3340590979782271, "grad_norm": 0.8345222085755947, "learning_rate": 7.490120574346762e-06, "loss": 0.1399, "step": 2148 }, { "epoch": 0.3342146189735614, "grad_norm": 1.465104109096865, "learning_rate": 7.488001868604864e-06, "loss": 0.1706, "step": 2149 }, { "epoch": 0.3343701399688958, "grad_norm": 1.4289499557430132, "learning_rate": 7.485882568942222e-06, "loss": 0.1952, "step": 2150 }, { "epoch": 0.3345256609642302, "grad_norm": 0.9038596478228801, "learning_rate": 7.483762675864745e-06, "loss": 0.1029, "step": 2151 }, { "epoch": 0.33468118195956453, "grad_norm": 1.2181710928273417, "learning_rate": 7.481642189878482e-06, "loss": 0.1781, "step": 2152 }, { "epoch": 0.3348367029548989, "grad_norm": 1.061979844318888, "learning_rate": 7.479521111489618e-06, "loss": 0.2034, "step": 2153 }, { "epoch": 0.33499222395023326, "grad_norm": 1.9725764981408074, "learning_rate": 7.477399441204488e-06, "loss": 0.146, "step": 2154 }, { "epoch": 0.33514774494556765, "grad_norm": 1.582921073931499, "learning_rate": 7.475277179529562e-06, "loss": 0.1516, "step": 2155 }, { "epoch": 0.33530326594090204, "grad_norm": 1.1175526990109041, "learning_rate": 7.473154326971455e-06, "loss": 0.1931, "step": 2156 }, { "epoch": 0.3354587869362364, "grad_norm": 1.1572380165113694, "learning_rate": 7.47103088403692e-06, "loss": 0.1868, "step": 2157 }, { "epoch": 0.33561430793157077, "grad_norm": 1.133159191478057, "learning_rate": 7.468906851232853e-06, "loss": 0.192, "step": 2158 }, { "epoch": 0.33576982892690516, "grad_norm": 0.966866218688593, "learning_rate": 7.466782229066291e-06, "loss": 0.2209, "step": 2159 }, { "epoch": 0.3359253499222395, "grad_norm": 1.3029225787456367, "learning_rate": 7.464657018044411e-06, "loss": 0.1732, "step": 2160 }, { "epoch": 0.3360808709175739, "grad_norm": 1.1491820214273687, "learning_rate": 7.462531218674529e-06, "loss": 0.154, "step": 2161 }, { "epoch": 0.3362363919129082, "grad_norm": 0.9342812714836881, "learning_rate": 7.4604048314641055e-06, "loss": 0.2198, "step": 2162 }, { "epoch": 0.3363919129082426, "grad_norm": 0.9768685673519966, "learning_rate": 7.458277856920736e-06, "loss": 0.1391, "step": 2163 }, { "epoch": 0.336547433903577, "grad_norm": 0.8526960478386801, "learning_rate": 7.45615029555216e-06, "loss": 0.1955, "step": 2164 }, { "epoch": 0.33670295489891133, "grad_norm": 1.1121130947201663, "learning_rate": 7.4540221478662565e-06, "loss": 0.189, "step": 2165 }, { "epoch": 0.3368584758942457, "grad_norm": 1.227722351989062, "learning_rate": 7.451893414371043e-06, "loss": 0.181, "step": 2166 }, { "epoch": 0.3370139968895801, "grad_norm": 1.1862346053920063, "learning_rate": 7.44976409557468e-06, "loss": 0.1474, "step": 2167 }, { "epoch": 0.33716951788491445, "grad_norm": 1.205692844961232, "learning_rate": 7.447634191985464e-06, "loss": 0.2136, "step": 2168 }, { "epoch": 0.33732503888024884, "grad_norm": 1.2204831206262905, "learning_rate": 7.445503704111833e-06, "loss": 0.1686, "step": 2169 }, { "epoch": 0.33748055987558323, "grad_norm": 0.8193763346873587, "learning_rate": 7.443372632462363e-06, "loss": 0.2293, "step": 2170 }, { "epoch": 0.33763608087091757, "grad_norm": 1.5150428590299205, "learning_rate": 7.441240977545772e-06, "loss": 0.1565, "step": 2171 }, { "epoch": 0.33779160186625196, "grad_norm": 1.3059404291569194, "learning_rate": 7.439108739870915e-06, "loss": 0.205, "step": 2172 }, { "epoch": 0.3379471228615863, "grad_norm": 1.0129410062874789, "learning_rate": 7.436975919946789e-06, "loss": 0.1953, "step": 2173 }, { "epoch": 0.3381026438569207, "grad_norm": 1.1904992654316553, "learning_rate": 7.434842518282524e-06, "loss": 0.1589, "step": 2174 }, { "epoch": 0.33825816485225507, "grad_norm": 1.1188487757322199, "learning_rate": 7.432708535387397e-06, "loss": 0.1526, "step": 2175 }, { "epoch": 0.3384136858475894, "grad_norm": 0.9788849364121561, "learning_rate": 7.430573971770816e-06, "loss": 0.1567, "step": 2176 }, { "epoch": 0.3385692068429238, "grad_norm": 1.142505661873861, "learning_rate": 7.428438827942333e-06, "loss": 0.3209, "step": 2177 }, { "epoch": 0.3387247278382582, "grad_norm": 1.2975253752387061, "learning_rate": 7.426303104411634e-06, "loss": 0.1774, "step": 2178 }, { "epoch": 0.3388802488335925, "grad_norm": 2.0020744696644455, "learning_rate": 7.424166801688551e-06, "loss": 0.2189, "step": 2179 }, { "epoch": 0.3390357698289269, "grad_norm": 0.9221829536825615, "learning_rate": 7.422029920283044e-06, "loss": 0.1578, "step": 2180 }, { "epoch": 0.33919129082426125, "grad_norm": 0.7105042097068801, "learning_rate": 7.41989246070522e-06, "loss": 0.1269, "step": 2181 }, { "epoch": 0.33934681181959564, "grad_norm": 0.8173199029760152, "learning_rate": 7.4177544234653174e-06, "loss": 0.1738, "step": 2182 }, { "epoch": 0.33950233281493003, "grad_norm": 1.3275531627930977, "learning_rate": 7.415615809073717e-06, "loss": 0.1462, "step": 2183 }, { "epoch": 0.33965785381026437, "grad_norm": 1.1047205284745352, "learning_rate": 7.413476618040939e-06, "loss": 0.1498, "step": 2184 }, { "epoch": 0.33981337480559876, "grad_norm": 1.5091229115800868, "learning_rate": 7.411336850877633e-06, "loss": 0.277, "step": 2185 }, { "epoch": 0.33996889580093315, "grad_norm": 0.998754492621575, "learning_rate": 7.409196508094593e-06, "loss": 0.1964, "step": 2186 }, { "epoch": 0.3401244167962675, "grad_norm": 1.112067772627418, "learning_rate": 7.407055590202751e-06, "loss": 0.2369, "step": 2187 }, { "epoch": 0.34027993779160187, "grad_norm": 0.912922463473531, "learning_rate": 7.40491409771317e-06, "loss": 0.2261, "step": 2188 }, { "epoch": 0.34043545878693626, "grad_norm": 1.4528011961296432, "learning_rate": 7.402772031137058e-06, "loss": 0.2757, "step": 2189 }, { "epoch": 0.3405909797822706, "grad_norm": 1.183289721220463, "learning_rate": 7.400629390985753e-06, "loss": 0.1322, "step": 2190 }, { "epoch": 0.340746500777605, "grad_norm": 0.6244923769308766, "learning_rate": 7.398486177770735e-06, "loss": 0.171, "step": 2191 }, { "epoch": 0.3409020217729393, "grad_norm": 1.1444393476930488, "learning_rate": 7.39634239200362e-06, "loss": 0.231, "step": 2192 }, { "epoch": 0.3410575427682737, "grad_norm": 1.140886812503682, "learning_rate": 7.394198034196155e-06, "loss": 0.1926, "step": 2193 }, { "epoch": 0.3412130637636081, "grad_norm": 0.8953446398110485, "learning_rate": 7.392053104860231e-06, "loss": 0.2539, "step": 2194 }, { "epoch": 0.34136858475894244, "grad_norm": 0.6700009630737177, "learning_rate": 7.389907604507874e-06, "loss": 0.1429, "step": 2195 }, { "epoch": 0.34152410575427683, "grad_norm": 2.069452133990495, "learning_rate": 7.387761533651243e-06, "loss": 0.1679, "step": 2196 }, { "epoch": 0.3416796267496112, "grad_norm": 1.3126004575841903, "learning_rate": 7.385614892802635e-06, "loss": 0.1404, "step": 2197 }, { "epoch": 0.34183514774494556, "grad_norm": 1.3410068450249955, "learning_rate": 7.383467682474484e-06, "loss": 0.186, "step": 2198 }, { "epoch": 0.34199066874027995, "grad_norm": 1.1055105583839044, "learning_rate": 7.381319903179358e-06, "loss": 0.1702, "step": 2199 }, { "epoch": 0.3421461897356143, "grad_norm": 0.9016971526492875, "learning_rate": 7.379171555429965e-06, "loss": 0.1463, "step": 2200 }, { "epoch": 0.3421461897356143, "eval_loss": 0.18944497406482697, "eval_runtime": 9.4234, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 2200 }, { "epoch": 0.34230171073094867, "grad_norm": 0.9037781872771364, "learning_rate": 7.37702263973914e-06, "loss": 0.1541, "step": 2201 }, { "epoch": 0.34245723172628306, "grad_norm": 1.0167768901535814, "learning_rate": 7.374873156619862e-06, "loss": 0.0897, "step": 2202 }, { "epoch": 0.3426127527216174, "grad_norm": 1.0744384436880956, "learning_rate": 7.372723106585244e-06, "loss": 0.2071, "step": 2203 }, { "epoch": 0.3427682737169518, "grad_norm": 0.9231570140690939, "learning_rate": 7.370572490148533e-06, "loss": 0.2124, "step": 2204 }, { "epoch": 0.3429237947122862, "grad_norm": 0.7595128291837981, "learning_rate": 7.3684213078231084e-06, "loss": 0.1819, "step": 2205 }, { "epoch": 0.3430793157076205, "grad_norm": 1.2523296522978697, "learning_rate": 7.3662695601224875e-06, "loss": 0.1493, "step": 2206 }, { "epoch": 0.3432348367029549, "grad_norm": 0.9617816938148513, "learning_rate": 7.364117247560325e-06, "loss": 0.153, "step": 2207 }, { "epoch": 0.3433903576982893, "grad_norm": 1.0712931530764909, "learning_rate": 7.361964370650407e-06, "loss": 0.1934, "step": 2208 }, { "epoch": 0.34354587869362363, "grad_norm": 0.9237057456039339, "learning_rate": 7.359810929906657e-06, "loss": 0.1578, "step": 2209 }, { "epoch": 0.343701399688958, "grad_norm": 1.4482546523140547, "learning_rate": 7.357656925843125e-06, "loss": 0.1458, "step": 2210 }, { "epoch": 0.34385692068429236, "grad_norm": 1.0597417518980254, "learning_rate": 7.3555023589740095e-06, "loss": 0.138, "step": 2211 }, { "epoch": 0.34401244167962675, "grad_norm": 0.9274797074652902, "learning_rate": 7.353347229813631e-06, "loss": 0.1355, "step": 2212 }, { "epoch": 0.34416796267496114, "grad_norm": 0.8742449202287172, "learning_rate": 7.35119153887645e-06, "loss": 0.179, "step": 2213 }, { "epoch": 0.34432348367029547, "grad_norm": 1.1402403370503393, "learning_rate": 7.3490352866770594e-06, "loss": 0.1399, "step": 2214 }, { "epoch": 0.34447900466562986, "grad_norm": 1.0801376571258645, "learning_rate": 7.346878473730189e-06, "loss": 0.1675, "step": 2215 }, { "epoch": 0.34463452566096425, "grad_norm": 1.1670657039239176, "learning_rate": 7.344721100550698e-06, "loss": 0.2189, "step": 2216 }, { "epoch": 0.3447900466562986, "grad_norm": 0.8470631893731005, "learning_rate": 7.342563167653582e-06, "loss": 0.1392, "step": 2217 }, { "epoch": 0.344945567651633, "grad_norm": 1.033014417793829, "learning_rate": 7.340404675553969e-06, "loss": 0.1712, "step": 2218 }, { "epoch": 0.3451010886469673, "grad_norm": 0.9923990700451135, "learning_rate": 7.3382456247671245e-06, "loss": 0.2144, "step": 2219 }, { "epoch": 0.3452566096423017, "grad_norm": 1.1456743303567751, "learning_rate": 7.336086015808439e-06, "loss": 0.1391, "step": 2220 }, { "epoch": 0.3454121306376361, "grad_norm": 1.2177291031485395, "learning_rate": 7.333925849193444e-06, "loss": 0.2848, "step": 2221 }, { "epoch": 0.34556765163297043, "grad_norm": 1.553753208121964, "learning_rate": 7.331765125437801e-06, "loss": 0.1833, "step": 2222 }, { "epoch": 0.3457231726283048, "grad_norm": 1.3876469032792997, "learning_rate": 7.329603845057305e-06, "loss": 0.1596, "step": 2223 }, { "epoch": 0.3458786936236392, "grad_norm": 1.1590315918181497, "learning_rate": 7.327442008567884e-06, "loss": 0.2211, "step": 2224 }, { "epoch": 0.34603421461897355, "grad_norm": 1.2253872626188305, "learning_rate": 7.325279616485599e-06, "loss": 0.2046, "step": 2225 }, { "epoch": 0.34618973561430794, "grad_norm": 0.7936100140192845, "learning_rate": 7.32311666932664e-06, "loss": 0.1265, "step": 2226 }, { "epoch": 0.3463452566096423, "grad_norm": 1.163862295570761, "learning_rate": 7.320953167607336e-06, "loss": 0.1919, "step": 2227 }, { "epoch": 0.34650077760497666, "grad_norm": 0.9743641652001349, "learning_rate": 7.318789111844146e-06, "loss": 0.2091, "step": 2228 }, { "epoch": 0.34665629860031105, "grad_norm": 0.9836462918624215, "learning_rate": 7.316624502553658e-06, "loss": 0.2001, "step": 2229 }, { "epoch": 0.3468118195956454, "grad_norm": 1.2768689875160077, "learning_rate": 7.314459340252593e-06, "loss": 0.221, "step": 2230 }, { "epoch": 0.3469673405909798, "grad_norm": 1.0537288189759726, "learning_rate": 7.312293625457807e-06, "loss": 0.1845, "step": 2231 }, { "epoch": 0.34712286158631417, "grad_norm": 1.4719253280420965, "learning_rate": 7.310127358686287e-06, "loss": 0.2366, "step": 2232 }, { "epoch": 0.3472783825816485, "grad_norm": 1.477148999050551, "learning_rate": 7.307960540455152e-06, "loss": 0.1895, "step": 2233 }, { "epoch": 0.3474339035769829, "grad_norm": 1.1755476650010885, "learning_rate": 7.30579317128165e-06, "loss": 0.1773, "step": 2234 }, { "epoch": 0.3475894245723173, "grad_norm": 1.1652362153151328, "learning_rate": 7.303625251683162e-06, "loss": 0.1846, "step": 2235 }, { "epoch": 0.3477449455676516, "grad_norm": 1.07852417516138, "learning_rate": 7.301456782177202e-06, "loss": 0.1544, "step": 2236 }, { "epoch": 0.347900466562986, "grad_norm": 1.056249347688394, "learning_rate": 7.299287763281412e-06, "loss": 0.2084, "step": 2237 }, { "epoch": 0.34805598755832035, "grad_norm": 0.9328495453037987, "learning_rate": 7.297118195513568e-06, "loss": 0.1729, "step": 2238 }, { "epoch": 0.34821150855365474, "grad_norm": 0.8137804262781729, "learning_rate": 7.294948079391577e-06, "loss": 0.1472, "step": 2239 }, { "epoch": 0.3483670295489891, "grad_norm": 1.0432696988235302, "learning_rate": 7.2927774154334765e-06, "loss": 0.2655, "step": 2240 }, { "epoch": 0.34852255054432346, "grad_norm": 0.927023837868047, "learning_rate": 7.290606204157432e-06, "loss": 0.1918, "step": 2241 }, { "epoch": 0.34867807153965785, "grad_norm": 1.0954857263423547, "learning_rate": 7.288434446081742e-06, "loss": 0.2333, "step": 2242 }, { "epoch": 0.34883359253499224, "grad_norm": 1.0887307204063708, "learning_rate": 7.286262141724837e-06, "loss": 0.2086, "step": 2243 }, { "epoch": 0.3489891135303266, "grad_norm": 1.262427582708812, "learning_rate": 7.284089291605277e-06, "loss": 0.1203, "step": 2244 }, { "epoch": 0.34914463452566097, "grad_norm": 0.7560961164836532, "learning_rate": 7.281915896241749e-06, "loss": 0.1701, "step": 2245 }, { "epoch": 0.34930015552099536, "grad_norm": 0.8699627300397481, "learning_rate": 7.279741956153075e-06, "loss": 0.1804, "step": 2246 }, { "epoch": 0.3494556765163297, "grad_norm": 1.2719819028255959, "learning_rate": 7.277567471858203e-06, "loss": 0.1305, "step": 2247 }, { "epoch": 0.3496111975116641, "grad_norm": 1.0835684438995092, "learning_rate": 7.275392443876214e-06, "loss": 0.1665, "step": 2248 }, { "epoch": 0.3497667185069984, "grad_norm": 0.9945124239358721, "learning_rate": 7.2732168727263175e-06, "loss": 0.1641, "step": 2249 }, { "epoch": 0.3499222395023328, "grad_norm": 0.8082169499875355, "learning_rate": 7.271040758927852e-06, "loss": 0.195, "step": 2250 }, { "epoch": 0.3500777604976672, "grad_norm": 4.355490023631713, "learning_rate": 7.268864103000286e-06, "loss": 0.1114, "step": 2251 }, { "epoch": 0.35023328149300154, "grad_norm": 0.9071032559338355, "learning_rate": 7.266686905463219e-06, "loss": 0.1608, "step": 2252 }, { "epoch": 0.3503888024883359, "grad_norm": 1.1194860712826606, "learning_rate": 7.264509166836377e-06, "loss": 0.1485, "step": 2253 }, { "epoch": 0.3505443234836703, "grad_norm": 1.5127510147587406, "learning_rate": 7.262330887639619e-06, "loss": 0.2047, "step": 2254 }, { "epoch": 0.35069984447900465, "grad_norm": 1.1063379473935793, "learning_rate": 7.260152068392927e-06, "loss": 0.1759, "step": 2255 }, { "epoch": 0.35085536547433904, "grad_norm": 1.2586720767235173, "learning_rate": 7.257972709616418e-06, "loss": 0.1897, "step": 2256 }, { "epoch": 0.35101088646967343, "grad_norm": 1.5169576685972481, "learning_rate": 7.255792811830335e-06, "loss": 0.3277, "step": 2257 }, { "epoch": 0.35116640746500777, "grad_norm": 0.9343359450336397, "learning_rate": 7.253612375555049e-06, "loss": 0.1032, "step": 2258 }, { "epoch": 0.35132192846034216, "grad_norm": 0.8942548818999339, "learning_rate": 7.251431401311061e-06, "loss": 0.2021, "step": 2259 }, { "epoch": 0.3514774494556765, "grad_norm": 1.582203707452359, "learning_rate": 7.2492498896190015e-06, "loss": 0.2559, "step": 2260 }, { "epoch": 0.3516329704510109, "grad_norm": 1.1605831379198273, "learning_rate": 7.247067840999625e-06, "loss": 0.238, "step": 2261 }, { "epoch": 0.3517884914463453, "grad_norm": 1.5644988205909856, "learning_rate": 7.244885255973819e-06, "loss": 0.2074, "step": 2262 }, { "epoch": 0.3519440124416796, "grad_norm": 0.8064501753169878, "learning_rate": 7.242702135062597e-06, "loss": 0.1652, "step": 2263 }, { "epoch": 0.352099533437014, "grad_norm": 0.9679857166635403, "learning_rate": 7.2405184787870985e-06, "loss": 0.2333, "step": 2264 }, { "epoch": 0.3522550544323484, "grad_norm": 0.7314945276765478, "learning_rate": 7.238334287668595e-06, "loss": 0.1753, "step": 2265 }, { "epoch": 0.3524105754276827, "grad_norm": 0.8027493373554788, "learning_rate": 7.236149562228483e-06, "loss": 0.1303, "step": 2266 }, { "epoch": 0.3525660964230171, "grad_norm": 1.33862586498083, "learning_rate": 7.233964302988284e-06, "loss": 0.2083, "step": 2267 }, { "epoch": 0.35272161741835145, "grad_norm": 0.9107212533165417, "learning_rate": 7.2317785104696545e-06, "loss": 0.1673, "step": 2268 }, { "epoch": 0.35287713841368584, "grad_norm": 0.9321902829073512, "learning_rate": 7.2295921851943705e-06, "loss": 0.1788, "step": 2269 }, { "epoch": 0.35303265940902023, "grad_norm": 1.522960820890859, "learning_rate": 7.227405327684339e-06, "loss": 0.1625, "step": 2270 }, { "epoch": 0.35318818040435457, "grad_norm": 1.3068891779143499, "learning_rate": 7.225217938461593e-06, "loss": 0.165, "step": 2271 }, { "epoch": 0.35334370139968896, "grad_norm": 1.0299586101374207, "learning_rate": 7.223030018048294e-06, "loss": 0.1825, "step": 2272 }, { "epoch": 0.35349922239502335, "grad_norm": 1.115797306310303, "learning_rate": 7.220841566966729e-06, "loss": 0.1682, "step": 2273 }, { "epoch": 0.3536547433903577, "grad_norm": 0.9737802311290622, "learning_rate": 7.218652585739311e-06, "loss": 0.2234, "step": 2274 }, { "epoch": 0.3538102643856921, "grad_norm": 0.9451595068865823, "learning_rate": 7.216463074888579e-06, "loss": 0.1545, "step": 2275 }, { "epoch": 0.35396578538102647, "grad_norm": 0.7548167278503619, "learning_rate": 7.214273034937203e-06, "loss": 0.1772, "step": 2276 }, { "epoch": 0.3541213063763608, "grad_norm": 0.9881080181319524, "learning_rate": 7.212082466407975e-06, "loss": 0.1526, "step": 2277 }, { "epoch": 0.3542768273716952, "grad_norm": 1.1043054306122095, "learning_rate": 7.209891369823811e-06, "loss": 0.1573, "step": 2278 }, { "epoch": 0.3544323483670295, "grad_norm": 0.9109524584999198, "learning_rate": 7.207699745707761e-06, "loss": 0.1103, "step": 2279 }, { "epoch": 0.3545878693623639, "grad_norm": 1.5785869425357018, "learning_rate": 7.205507594582994e-06, "loss": 0.149, "step": 2280 }, { "epoch": 0.3547433903576983, "grad_norm": 1.405963573178661, "learning_rate": 7.203314916972808e-06, "loss": 0.2765, "step": 2281 }, { "epoch": 0.35489891135303264, "grad_norm": 0.8668811088072333, "learning_rate": 7.201121713400625e-06, "loss": 0.1382, "step": 2282 }, { "epoch": 0.35505443234836703, "grad_norm": 1.2160763994643455, "learning_rate": 7.198927984389991e-06, "loss": 0.1399, "step": 2283 }, { "epoch": 0.3552099533437014, "grad_norm": 1.0906445289443447, "learning_rate": 7.196733730464583e-06, "loss": 0.1829, "step": 2284 }, { "epoch": 0.35536547433903576, "grad_norm": 1.0001471385756653, "learning_rate": 7.194538952148201e-06, "loss": 0.2361, "step": 2285 }, { "epoch": 0.35552099533437015, "grad_norm": 1.4187662150180531, "learning_rate": 7.192343649964766e-06, "loss": 0.2701, "step": 2286 }, { "epoch": 0.3556765163297045, "grad_norm": 1.2526282472921804, "learning_rate": 7.1901478244383275e-06, "loss": 0.2104, "step": 2287 }, { "epoch": 0.3558320373250389, "grad_norm": 1.1154075241968762, "learning_rate": 7.187951476093061e-06, "loss": 0.1431, "step": 2288 }, { "epoch": 0.35598755832037327, "grad_norm": 1.2251456501955684, "learning_rate": 7.185754605453265e-06, "loss": 0.2471, "step": 2289 }, { "epoch": 0.3561430793157076, "grad_norm": 1.0411610852893285, "learning_rate": 7.183557213043365e-06, "loss": 0.202, "step": 2290 }, { "epoch": 0.356298600311042, "grad_norm": 1.5532140043891516, "learning_rate": 7.181359299387906e-06, "loss": 0.2151, "step": 2291 }, { "epoch": 0.3564541213063764, "grad_norm": 1.0417994464641769, "learning_rate": 7.179160865011562e-06, "loss": 0.1761, "step": 2292 }, { "epoch": 0.3566096423017107, "grad_norm": 0.7976309702114018, "learning_rate": 7.176961910439131e-06, "loss": 0.1007, "step": 2293 }, { "epoch": 0.3567651632970451, "grad_norm": 1.0863249943337554, "learning_rate": 7.1747624361955325e-06, "loss": 0.1835, "step": 2294 }, { "epoch": 0.3569206842923795, "grad_norm": 1.1565616772152922, "learning_rate": 7.172562442805812e-06, "loss": 0.1627, "step": 2295 }, { "epoch": 0.35707620528771383, "grad_norm": 0.9479507825317183, "learning_rate": 7.17036193079514e-06, "loss": 0.1925, "step": 2296 }, { "epoch": 0.3572317262830482, "grad_norm": 1.043363629556522, "learning_rate": 7.1681609006888075e-06, "loss": 0.1611, "step": 2297 }, { "epoch": 0.35738724727838256, "grad_norm": 1.4399957139277806, "learning_rate": 7.165959353012233e-06, "loss": 0.2088, "step": 2298 }, { "epoch": 0.35754276827371695, "grad_norm": 1.2991009814345642, "learning_rate": 7.163757288290953e-06, "loss": 0.214, "step": 2299 }, { "epoch": 0.35769828926905134, "grad_norm": 1.9478887285157858, "learning_rate": 7.161554707050637e-06, "loss": 0.1946, "step": 2300 }, { "epoch": 0.35769828926905134, "eval_loss": 0.18921887874603271, "eval_runtime": 9.4087, "eval_samples_per_second": 2.763, "eval_steps_per_second": 0.744, "step": 2300 }, { "epoch": 0.3578538102643857, "grad_norm": 0.9428570146616179, "learning_rate": 7.159351609817067e-06, "loss": 0.1503, "step": 2301 }, { "epoch": 0.35800933125972006, "grad_norm": 1.1048808779315675, "learning_rate": 7.157147997116154e-06, "loss": 0.2356, "step": 2302 }, { "epoch": 0.35816485225505446, "grad_norm": 1.006633850639556, "learning_rate": 7.1549438694739315e-06, "loss": 0.2197, "step": 2303 }, { "epoch": 0.3583203732503888, "grad_norm": 1.2798004090008583, "learning_rate": 7.152739227416557e-06, "loss": 0.2492, "step": 2304 }, { "epoch": 0.3584758942457232, "grad_norm": 1.1331689100319342, "learning_rate": 7.1505340714703074e-06, "loss": 0.1343, "step": 2305 }, { "epoch": 0.3586314152410575, "grad_norm": 1.0395499103161603, "learning_rate": 7.148328402161585e-06, "loss": 0.1136, "step": 2306 }, { "epoch": 0.3587869362363919, "grad_norm": 0.9102078725995751, "learning_rate": 7.146122220016911e-06, "loss": 0.182, "step": 2307 }, { "epoch": 0.3589424572317263, "grad_norm": 0.8643449115033852, "learning_rate": 7.143915525562937e-06, "loss": 0.1514, "step": 2308 }, { "epoch": 0.35909797822706063, "grad_norm": 0.8372495276918588, "learning_rate": 7.1417083193264295e-06, "loss": 0.1537, "step": 2309 }, { "epoch": 0.359253499222395, "grad_norm": 0.8810038544039921, "learning_rate": 7.1395006018342774e-06, "loss": 0.1929, "step": 2310 }, { "epoch": 0.3594090202177294, "grad_norm": 1.276857093195218, "learning_rate": 7.1372923736134945e-06, "loss": 0.1837, "step": 2311 }, { "epoch": 0.35956454121306375, "grad_norm": 1.4509395248383896, "learning_rate": 7.135083635191219e-06, "loss": 0.1722, "step": 2312 }, { "epoch": 0.35972006220839814, "grad_norm": 1.0687269790413854, "learning_rate": 7.132874387094704e-06, "loss": 0.2111, "step": 2313 }, { "epoch": 0.35987558320373253, "grad_norm": 0.9627452182505646, "learning_rate": 7.130664629851327e-06, "loss": 0.1767, "step": 2314 }, { "epoch": 0.36003110419906686, "grad_norm": 1.5551679046263391, "learning_rate": 7.128454363988592e-06, "loss": 0.2389, "step": 2315 }, { "epoch": 0.36018662519440126, "grad_norm": 1.2093971714044265, "learning_rate": 7.126243590034116e-06, "loss": 0.2611, "step": 2316 }, { "epoch": 0.3603421461897356, "grad_norm": 1.4565354322942279, "learning_rate": 7.124032308515647e-06, "loss": 0.1925, "step": 2317 }, { "epoch": 0.36049766718507, "grad_norm": 1.1661596300826522, "learning_rate": 7.1218205199610425e-06, "loss": 0.2004, "step": 2318 }, { "epoch": 0.36065318818040437, "grad_norm": 1.1584339216434665, "learning_rate": 7.11960822489829e-06, "loss": 0.1793, "step": 2319 }, { "epoch": 0.3608087091757387, "grad_norm": 0.8607122447968496, "learning_rate": 7.117395423855496e-06, "loss": 0.1751, "step": 2320 }, { "epoch": 0.3609642301710731, "grad_norm": 1.091050473397846, "learning_rate": 7.115182117360886e-06, "loss": 0.2502, "step": 2321 }, { "epoch": 0.3611197511664075, "grad_norm": 0.8711785688336565, "learning_rate": 7.112968305942808e-06, "loss": 0.1058, "step": 2322 }, { "epoch": 0.3612752721617418, "grad_norm": 1.1552594994960588, "learning_rate": 7.110753990129728e-06, "loss": 0.1151, "step": 2323 }, { "epoch": 0.3614307931570762, "grad_norm": 1.3836718059030237, "learning_rate": 7.108539170450235e-06, "loss": 0.2018, "step": 2324 }, { "epoch": 0.36158631415241055, "grad_norm": 1.0124246120132934, "learning_rate": 7.106323847433039e-06, "loss": 0.163, "step": 2325 }, { "epoch": 0.36174183514774494, "grad_norm": 0.9333383598312576, "learning_rate": 7.104108021606966e-06, "loss": 0.2181, "step": 2326 }, { "epoch": 0.36189735614307933, "grad_norm": 1.2601007150200607, "learning_rate": 7.101891693500964e-06, "loss": 0.2585, "step": 2327 }, { "epoch": 0.36205287713841366, "grad_norm": 1.1292583425837437, "learning_rate": 7.099674863644105e-06, "loss": 0.1223, "step": 2328 }, { "epoch": 0.36220839813374806, "grad_norm": 1.593214182401497, "learning_rate": 7.097457532565575e-06, "loss": 0.1995, "step": 2329 }, { "epoch": 0.36236391912908245, "grad_norm": 0.7240505284643941, "learning_rate": 7.09523970079468e-06, "loss": 0.1143, "step": 2330 }, { "epoch": 0.3625194401244168, "grad_norm": 1.3173091068449545, "learning_rate": 7.093021368860851e-06, "loss": 0.1293, "step": 2331 }, { "epoch": 0.36267496111975117, "grad_norm": 1.9330725465559453, "learning_rate": 7.090802537293632e-06, "loss": 0.222, "step": 2332 }, { "epoch": 0.36283048211508556, "grad_norm": 1.0206696029321662, "learning_rate": 7.08858320662269e-06, "loss": 0.1524, "step": 2333 }, { "epoch": 0.3629860031104199, "grad_norm": 1.1547632205218992, "learning_rate": 7.0863633773778115e-06, "loss": 0.2109, "step": 2334 }, { "epoch": 0.3631415241057543, "grad_norm": 1.091488563826298, "learning_rate": 7.084143050088898e-06, "loss": 0.2162, "step": 2335 }, { "epoch": 0.3632970451010886, "grad_norm": 1.4114119480538185, "learning_rate": 7.081922225285976e-06, "loss": 0.2112, "step": 2336 }, { "epoch": 0.363452566096423, "grad_norm": 1.542634464786214, "learning_rate": 7.079700903499183e-06, "loss": 0.2389, "step": 2337 }, { "epoch": 0.3636080870917574, "grad_norm": 1.6663038598481195, "learning_rate": 7.077479085258784e-06, "loss": 0.2255, "step": 2338 }, { "epoch": 0.36376360808709174, "grad_norm": 1.0429631905201653, "learning_rate": 7.075256771095155e-06, "loss": 0.1493, "step": 2339 }, { "epoch": 0.36391912908242613, "grad_norm": 1.1114274658581276, "learning_rate": 7.073033961538793e-06, "loss": 0.1007, "step": 2340 }, { "epoch": 0.3640746500777605, "grad_norm": 1.0216240779694274, "learning_rate": 7.070810657120317e-06, "loss": 0.1775, "step": 2341 }, { "epoch": 0.36423017107309485, "grad_norm": 1.5208783377763164, "learning_rate": 7.068586858370458e-06, "loss": 0.1676, "step": 2342 }, { "epoch": 0.36438569206842925, "grad_norm": 1.194665201212614, "learning_rate": 7.066362565820067e-06, "loss": 0.16, "step": 2343 }, { "epoch": 0.3645412130637636, "grad_norm": 1.4748288923281365, "learning_rate": 7.064137780000118e-06, "loss": 0.1795, "step": 2344 }, { "epoch": 0.36469673405909797, "grad_norm": 0.9283284960054102, "learning_rate": 7.061912501441694e-06, "loss": 0.2119, "step": 2345 }, { "epoch": 0.36485225505443236, "grad_norm": 1.0726218835557328, "learning_rate": 7.059686730676001e-06, "loss": 0.1868, "step": 2346 }, { "epoch": 0.3650077760497667, "grad_norm": 1.143320110733354, "learning_rate": 7.057460468234363e-06, "loss": 0.2076, "step": 2347 }, { "epoch": 0.3651632970451011, "grad_norm": 1.3463778386934702, "learning_rate": 7.055233714648217e-06, "loss": 0.2071, "step": 2348 }, { "epoch": 0.3653188180404355, "grad_norm": 0.7091020207501477, "learning_rate": 7.053006470449124e-06, "loss": 0.1479, "step": 2349 }, { "epoch": 0.3654743390357698, "grad_norm": 1.0202546024896801, "learning_rate": 7.050778736168757e-06, "loss": 0.2454, "step": 2350 }, { "epoch": 0.3656298600311042, "grad_norm": 1.076061191362471, "learning_rate": 7.0485505123389044e-06, "loss": 0.166, "step": 2351 }, { "epoch": 0.3657853810264386, "grad_norm": 1.3287680646010458, "learning_rate": 7.046321799491478e-06, "loss": 0.2175, "step": 2352 }, { "epoch": 0.36594090202177293, "grad_norm": 1.0906061228214774, "learning_rate": 7.044092598158501e-06, "loss": 0.1732, "step": 2353 }, { "epoch": 0.3660964230171073, "grad_norm": 1.5305176229815427, "learning_rate": 7.0418629088721144e-06, "loss": 0.1612, "step": 2354 }, { "epoch": 0.36625194401244165, "grad_norm": 1.005472350514282, "learning_rate": 7.039632732164576e-06, "loss": 0.2195, "step": 2355 }, { "epoch": 0.36640746500777605, "grad_norm": 0.9842880088873223, "learning_rate": 7.037402068568262e-06, "loss": 0.1575, "step": 2356 }, { "epoch": 0.36656298600311044, "grad_norm": 1.3664857079128792, "learning_rate": 7.035170918615661e-06, "loss": 0.1692, "step": 2357 }, { "epoch": 0.36671850699844477, "grad_norm": 0.9185085935226209, "learning_rate": 7.032939282839382e-06, "loss": 0.1634, "step": 2358 }, { "epoch": 0.36687402799377916, "grad_norm": 3.608017689586461, "learning_rate": 7.030707161772144e-06, "loss": 0.2445, "step": 2359 }, { "epoch": 0.36702954898911355, "grad_norm": 1.1952612508416054, "learning_rate": 7.028474555946787e-06, "loss": 0.2449, "step": 2360 }, { "epoch": 0.3671850699844479, "grad_norm": 2.672074128737491, "learning_rate": 7.0262414658962664e-06, "loss": 0.243, "step": 2361 }, { "epoch": 0.3673405909797823, "grad_norm": 1.0751314382874948, "learning_rate": 7.02400789215365e-06, "loss": 0.2035, "step": 2362 }, { "epoch": 0.36749611197511667, "grad_norm": 1.6408460217628706, "learning_rate": 7.021773835252126e-06, "loss": 0.1474, "step": 2363 }, { "epoch": 0.367651632970451, "grad_norm": 1.0288566964453982, "learning_rate": 7.019539295724991e-06, "loss": 0.177, "step": 2364 }, { "epoch": 0.3678071539657854, "grad_norm": 0.9667479424453713, "learning_rate": 7.017304274105663e-06, "loss": 0.2053, "step": 2365 }, { "epoch": 0.36796267496111973, "grad_norm": 0.792769552176139, "learning_rate": 7.015068770927673e-06, "loss": 0.2044, "step": 2366 }, { "epoch": 0.3681181959564541, "grad_norm": 1.109201151346079, "learning_rate": 7.012832786724666e-06, "loss": 0.1992, "step": 2367 }, { "epoch": 0.3682737169517885, "grad_norm": 1.0799477552682741, "learning_rate": 7.010596322030402e-06, "loss": 0.1451, "step": 2368 }, { "epoch": 0.36842923794712285, "grad_norm": 0.979615789141077, "learning_rate": 7.008359377378759e-06, "loss": 0.1699, "step": 2369 }, { "epoch": 0.36858475894245724, "grad_norm": 1.353723136460497, "learning_rate": 7.006121953303724e-06, "loss": 0.235, "step": 2370 }, { "epoch": 0.3687402799377916, "grad_norm": 0.9905333697227681, "learning_rate": 7.003884050339402e-06, "loss": 0.1475, "step": 2371 }, { "epoch": 0.36889580093312596, "grad_norm": 3.041187288228009, "learning_rate": 7.001645669020013e-06, "loss": 0.1688, "step": 2372 }, { "epoch": 0.36905132192846035, "grad_norm": 2.0765975441044993, "learning_rate": 6.999406809879888e-06, "loss": 0.3695, "step": 2373 }, { "epoch": 0.3692068429237947, "grad_norm": 1.158228564800467, "learning_rate": 6.997167473453477e-06, "loss": 0.1507, "step": 2374 }, { "epoch": 0.3693623639191291, "grad_norm": 1.1219378585438622, "learning_rate": 6.9949276602753345e-06, "loss": 0.2191, "step": 2375 }, { "epoch": 0.36951788491446347, "grad_norm": 1.297976727938507, "learning_rate": 6.992687370880142e-06, "loss": 0.1518, "step": 2376 }, { "epoch": 0.3696734059097978, "grad_norm": 0.8045314657419504, "learning_rate": 6.990446605802685e-06, "loss": 0.1968, "step": 2377 }, { "epoch": 0.3698289269051322, "grad_norm": 0.9115680958396285, "learning_rate": 6.9882053655778655e-06, "loss": 0.1802, "step": 2378 }, { "epoch": 0.3699844479004666, "grad_norm": 0.9563446251409226, "learning_rate": 6.985963650740698e-06, "loss": 0.1923, "step": 2379 }, { "epoch": 0.3701399688958009, "grad_norm": 1.315517151736937, "learning_rate": 6.983721461826312e-06, "loss": 0.1805, "step": 2380 }, { "epoch": 0.3702954898911353, "grad_norm": 0.9732554468122933, "learning_rate": 6.98147879936995e-06, "loss": 0.2192, "step": 2381 }, { "epoch": 0.3704510108864697, "grad_norm": 1.1081210160073807, "learning_rate": 6.979235663906965e-06, "loss": 0.1317, "step": 2382 }, { "epoch": 0.37060653188180404, "grad_norm": 1.1889206916002855, "learning_rate": 6.976992055972826e-06, "loss": 0.1866, "step": 2383 }, { "epoch": 0.3707620528771384, "grad_norm": 11.135787030920733, "learning_rate": 6.9747479761031126e-06, "loss": 0.1505, "step": 2384 }, { "epoch": 0.37091757387247276, "grad_norm": 1.3454658025160413, "learning_rate": 6.972503424833519e-06, "loss": 0.1996, "step": 2385 }, { "epoch": 0.37107309486780715, "grad_norm": 1.3190010631410074, "learning_rate": 6.97025840269985e-06, "loss": 0.1258, "step": 2386 }, { "epoch": 0.37122861586314154, "grad_norm": 0.8893058874083165, "learning_rate": 6.968012910238024e-06, "loss": 0.1811, "step": 2387 }, { "epoch": 0.3713841368584759, "grad_norm": 0.9329759550645588, "learning_rate": 6.965766947984072e-06, "loss": 0.182, "step": 2388 }, { "epoch": 0.37153965785381027, "grad_norm": 4.10257063822203, "learning_rate": 6.963520516474136e-06, "loss": 0.1941, "step": 2389 }, { "epoch": 0.37169517884914466, "grad_norm": 0.8899616705795278, "learning_rate": 6.9612736162444695e-06, "loss": 0.2562, "step": 2390 }, { "epoch": 0.371850699844479, "grad_norm": 1.4049984392577397, "learning_rate": 6.9590262478314406e-06, "loss": 0.2079, "step": 2391 }, { "epoch": 0.3720062208398134, "grad_norm": 1.603810693877871, "learning_rate": 6.9567784117715265e-06, "loss": 0.1368, "step": 2392 }, { "epoch": 0.3721617418351477, "grad_norm": 0.9514503338716984, "learning_rate": 6.9545301086013185e-06, "loss": 0.2073, "step": 2393 }, { "epoch": 0.3723172628304821, "grad_norm": 1.0799365732141513, "learning_rate": 6.952281338857515e-06, "loss": 0.1835, "step": 2394 }, { "epoch": 0.3724727838258165, "grad_norm": 1.2626826035246546, "learning_rate": 6.950032103076931e-06, "loss": 0.1335, "step": 2395 }, { "epoch": 0.37262830482115084, "grad_norm": 0.9803757947383319, "learning_rate": 6.94778240179649e-06, "loss": 0.3108, "step": 2396 }, { "epoch": 0.3727838258164852, "grad_norm": 0.671144218564636, "learning_rate": 6.945532235553226e-06, "loss": 0.1344, "step": 2397 }, { "epoch": 0.3729393468118196, "grad_norm": 1.4059418072184426, "learning_rate": 6.943281604884287e-06, "loss": 0.253, "step": 2398 }, { "epoch": 0.37309486780715395, "grad_norm": 0.930830156352673, "learning_rate": 6.941030510326929e-06, "loss": 0.1902, "step": 2399 }, { "epoch": 0.37325038880248834, "grad_norm": 1.4637538753942325, "learning_rate": 6.938778952418519e-06, "loss": 0.1867, "step": 2400 }, { "epoch": 0.37325038880248834, "eval_loss": 0.18691138923168182, "eval_runtime": 9.4263, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.743, "step": 2400 }, { "epoch": 0.37340590979782273, "grad_norm": 1.376668603053833, "learning_rate": 6.9365269316965355e-06, "loss": 0.1441, "step": 2401 }, { "epoch": 0.37356143079315707, "grad_norm": 1.176964614649653, "learning_rate": 6.9342744486985696e-06, "loss": 0.124, "step": 2402 }, { "epoch": 0.37371695178849146, "grad_norm": 1.0020803920976693, "learning_rate": 6.932021503962316e-06, "loss": 0.1367, "step": 2403 }, { "epoch": 0.3738724727838258, "grad_norm": 0.9062784193368545, "learning_rate": 6.929768098025587e-06, "loss": 0.1711, "step": 2404 }, { "epoch": 0.3740279937791602, "grad_norm": 1.1520679415765451, "learning_rate": 6.927514231426302e-06, "loss": 0.148, "step": 2405 }, { "epoch": 0.3741835147744946, "grad_norm": 1.1645414451706537, "learning_rate": 6.925259904702491e-06, "loss": 0.148, "step": 2406 }, { "epoch": 0.3743390357698289, "grad_norm": 1.153328163680638, "learning_rate": 6.9230051183922895e-06, "loss": 0.2166, "step": 2407 }, { "epoch": 0.3744945567651633, "grad_norm": 2.233481664092907, "learning_rate": 6.92074987303395e-06, "loss": 0.1399, "step": 2408 }, { "epoch": 0.3746500777604977, "grad_norm": 1.0687893634840073, "learning_rate": 6.918494169165831e-06, "loss": 0.137, "step": 2409 }, { "epoch": 0.374805598755832, "grad_norm": 0.9041087146454834, "learning_rate": 6.916238007326399e-06, "loss": 0.2334, "step": 2410 }, { "epoch": 0.3749611197511664, "grad_norm": 0.8102135810375276, "learning_rate": 6.913981388054231e-06, "loss": 0.1083, "step": 2411 }, { "epoch": 0.37511664074650075, "grad_norm": 1.4172850770009626, "learning_rate": 6.911724311888015e-06, "loss": 0.233, "step": 2412 }, { "epoch": 0.37527216174183514, "grad_norm": 1.2024379388632052, "learning_rate": 6.909466779366546e-06, "loss": 0.1257, "step": 2413 }, { "epoch": 0.37542768273716953, "grad_norm": 1.337922851302582, "learning_rate": 6.907208791028728e-06, "loss": 0.2466, "step": 2414 }, { "epoch": 0.37558320373250387, "grad_norm": 1.8058372626532604, "learning_rate": 6.904950347413575e-06, "loss": 0.214, "step": 2415 }, { "epoch": 0.37573872472783826, "grad_norm": 1.0987667197419122, "learning_rate": 6.902691449060207e-06, "loss": 0.1897, "step": 2416 }, { "epoch": 0.37589424572317265, "grad_norm": 1.6984912982683522, "learning_rate": 6.900432096507858e-06, "loss": 0.2006, "step": 2417 }, { "epoch": 0.376049766718507, "grad_norm": 1.0710604367613714, "learning_rate": 6.898172290295865e-06, "loss": 0.1745, "step": 2418 }, { "epoch": 0.3762052877138414, "grad_norm": 1.276584802917871, "learning_rate": 6.895912030963674e-06, "loss": 0.2203, "step": 2419 }, { "epoch": 0.37636080870917576, "grad_norm": 0.9232690446413373, "learning_rate": 6.893651319050842e-06, "loss": 0.2074, "step": 2420 }, { "epoch": 0.3765163297045101, "grad_norm": 1.3598306850215713, "learning_rate": 6.891390155097034e-06, "loss": 0.1576, "step": 2421 }, { "epoch": 0.3766718506998445, "grad_norm": 0.9536124715878034, "learning_rate": 6.889128539642018e-06, "loss": 0.2588, "step": 2422 }, { "epoch": 0.3768273716951788, "grad_norm": 0.8588846269148263, "learning_rate": 6.886866473225675e-06, "loss": 0.2085, "step": 2423 }, { "epoch": 0.3769828926905132, "grad_norm": 0.9861452491369436, "learning_rate": 6.8846039563879916e-06, "loss": 0.1993, "step": 2424 }, { "epoch": 0.3771384136858476, "grad_norm": 0.8320288261996919, "learning_rate": 6.882340989669063e-06, "loss": 0.1147, "step": 2425 }, { "epoch": 0.37729393468118194, "grad_norm": 1.073796206681467, "learning_rate": 6.88007757360909e-06, "loss": 0.2175, "step": 2426 }, { "epoch": 0.37744945567651633, "grad_norm": 1.2030279884880695, "learning_rate": 6.877813708748381e-06, "loss": 0.2499, "step": 2427 }, { "epoch": 0.3776049766718507, "grad_norm": 1.1568304904607627, "learning_rate": 6.875549395627351e-06, "loss": 0.2288, "step": 2428 }, { "epoch": 0.37776049766718506, "grad_norm": 1.1059557610586614, "learning_rate": 6.8732846347865275e-06, "loss": 0.145, "step": 2429 }, { "epoch": 0.37791601866251945, "grad_norm": 1.5101029858830746, "learning_rate": 6.871019426766537e-06, "loss": 0.2392, "step": 2430 }, { "epoch": 0.3780715396578538, "grad_norm": 0.7663081636579712, "learning_rate": 6.868753772108117e-06, "loss": 0.1621, "step": 2431 }, { "epoch": 0.3782270606531882, "grad_norm": 1.0643777897930133, "learning_rate": 6.86648767135211e-06, "loss": 0.2628, "step": 2432 }, { "epoch": 0.37838258164852256, "grad_norm": 1.1276215468571031, "learning_rate": 6.864221125039467e-06, "loss": 0.1393, "step": 2433 }, { "epoch": 0.3785381026438569, "grad_norm": 0.9483944644463491, "learning_rate": 6.861954133711246e-06, "loss": 0.2078, "step": 2434 }, { "epoch": 0.3786936236391913, "grad_norm": 1.143537210655515, "learning_rate": 6.8596866979086055e-06, "loss": 0.2484, "step": 2435 }, { "epoch": 0.3788491446345257, "grad_norm": 1.0158762847745515, "learning_rate": 6.857418818172815e-06, "loss": 0.1377, "step": 2436 }, { "epoch": 0.37900466562986, "grad_norm": 0.7875302187216353, "learning_rate": 6.855150495045252e-06, "loss": 0.1805, "step": 2437 }, { "epoch": 0.3791601866251944, "grad_norm": 0.8770663624076738, "learning_rate": 6.852881729067394e-06, "loss": 0.1369, "step": 2438 }, { "epoch": 0.3793157076205288, "grad_norm": 1.1045098441898251, "learning_rate": 6.850612520780829e-06, "loss": 0.1534, "step": 2439 }, { "epoch": 0.37947122861586313, "grad_norm": 0.6781485590859858, "learning_rate": 6.8483428707272456e-06, "loss": 0.1465, "step": 2440 }, { "epoch": 0.3796267496111975, "grad_norm": 2.03536699146056, "learning_rate": 6.846072779448444e-06, "loss": 0.2241, "step": 2441 }, { "epoch": 0.37978227060653186, "grad_norm": 1.1720489471489604, "learning_rate": 6.843802247486326e-06, "loss": 0.1022, "step": 2442 }, { "epoch": 0.37993779160186625, "grad_norm": 1.3092721690937488, "learning_rate": 6.841531275382899e-06, "loss": 0.2588, "step": 2443 }, { "epoch": 0.38009331259720064, "grad_norm": 0.8039255611681045, "learning_rate": 6.839259863680275e-06, "loss": 0.1417, "step": 2444 }, { "epoch": 0.380248833592535, "grad_norm": 0.9378444103999392, "learning_rate": 6.8369880129206715e-06, "loss": 0.1903, "step": 2445 }, { "epoch": 0.38040435458786936, "grad_norm": 0.7550162494178648, "learning_rate": 6.834715723646413e-06, "loss": 0.135, "step": 2446 }, { "epoch": 0.38055987558320375, "grad_norm": 0.885248758342545, "learning_rate": 6.832442996399924e-06, "loss": 0.2259, "step": 2447 }, { "epoch": 0.3807153965785381, "grad_norm": 0.8332798100532923, "learning_rate": 6.830169831723738e-06, "loss": 0.1481, "step": 2448 }, { "epoch": 0.3808709175738725, "grad_norm": 1.1842923552395075, "learning_rate": 6.82789623016049e-06, "loss": 0.1399, "step": 2449 }, { "epoch": 0.3810264385692068, "grad_norm": 1.3757370959390027, "learning_rate": 6.825622192252922e-06, "loss": 0.1438, "step": 2450 }, { "epoch": 0.3811819595645412, "grad_norm": 2.0897240403641857, "learning_rate": 6.8233477185438765e-06, "loss": 0.2637, "step": 2451 }, { "epoch": 0.3813374805598756, "grad_norm": 0.9248134923456466, "learning_rate": 6.821072809576303e-06, "loss": 0.1615, "step": 2452 }, { "epoch": 0.38149300155520993, "grad_norm": 1.700273388755456, "learning_rate": 6.818797465893256e-06, "loss": 0.2249, "step": 2453 }, { "epoch": 0.3816485225505443, "grad_norm": 0.9016608237583374, "learning_rate": 6.816521688037888e-06, "loss": 0.1379, "step": 2454 }, { "epoch": 0.3818040435458787, "grad_norm": 0.8005966305673343, "learning_rate": 6.814245476553462e-06, "loss": 0.1315, "step": 2455 }, { "epoch": 0.38195956454121305, "grad_norm": 1.408025175131532, "learning_rate": 6.811968831983339e-06, "loss": 0.2578, "step": 2456 }, { "epoch": 0.38211508553654744, "grad_norm": 0.9143571801468223, "learning_rate": 6.809691754870988e-06, "loss": 0.2131, "step": 2457 }, { "epoch": 0.38227060653188183, "grad_norm": 0.8362783551802667, "learning_rate": 6.807414245759977e-06, "loss": 0.1649, "step": 2458 }, { "epoch": 0.38242612752721616, "grad_norm": 1.2556737596290073, "learning_rate": 6.805136305193981e-06, "loss": 0.1228, "step": 2459 }, { "epoch": 0.38258164852255055, "grad_norm": 0.6349585408217063, "learning_rate": 6.802857933716774e-06, "loss": 0.1853, "step": 2460 }, { "epoch": 0.3827371695178849, "grad_norm": 0.680456202876358, "learning_rate": 6.800579131872239e-06, "loss": 0.1461, "step": 2461 }, { "epoch": 0.3828926905132193, "grad_norm": 0.9305532506458315, "learning_rate": 6.798299900204355e-06, "loss": 0.1648, "step": 2462 }, { "epoch": 0.38304821150855367, "grad_norm": 1.2284510138826186, "learning_rate": 6.796020239257205e-06, "loss": 0.2326, "step": 2463 }, { "epoch": 0.383203732503888, "grad_norm": 0.926786685916868, "learning_rate": 6.793740149574979e-06, "loss": 0.193, "step": 2464 }, { "epoch": 0.3833592534992224, "grad_norm": 1.3978321445270028, "learning_rate": 6.791459631701963e-06, "loss": 0.1746, "step": 2465 }, { "epoch": 0.3835147744945568, "grad_norm": 1.038492222035446, "learning_rate": 6.789178686182552e-06, "loss": 0.1335, "step": 2466 }, { "epoch": 0.3836702954898911, "grad_norm": 1.1143222294688881, "learning_rate": 6.786897313561237e-06, "loss": 0.1722, "step": 2467 }, { "epoch": 0.3838258164852255, "grad_norm": 0.8463624463774098, "learning_rate": 6.784615514382613e-06, "loss": 0.1454, "step": 2468 }, { "epoch": 0.3839813374805599, "grad_norm": 1.0404936695642062, "learning_rate": 6.782333289191379e-06, "loss": 0.1484, "step": 2469 }, { "epoch": 0.38413685847589424, "grad_norm": 1.364985863839375, "learning_rate": 6.7800506385323335e-06, "loss": 0.2078, "step": 2470 }, { "epoch": 0.38429237947122863, "grad_norm": 0.8421076682657236, "learning_rate": 6.777767562950378e-06, "loss": 0.1038, "step": 2471 }, { "epoch": 0.38444790046656296, "grad_norm": 0.9549120799804582, "learning_rate": 6.775484062990512e-06, "loss": 0.23, "step": 2472 }, { "epoch": 0.38460342146189735, "grad_norm": 1.029470255441356, "learning_rate": 6.773200139197841e-06, "loss": 0.2276, "step": 2473 }, { "epoch": 0.38475894245723175, "grad_norm": 1.6792776587314855, "learning_rate": 6.770915792117567e-06, "loss": 0.2731, "step": 2474 }, { "epoch": 0.3849144634525661, "grad_norm": 0.7318425515352489, "learning_rate": 6.768631022295e-06, "loss": 0.1469, "step": 2475 }, { "epoch": 0.38506998444790047, "grad_norm": 1.2105013154394777, "learning_rate": 6.76634583027554e-06, "loss": 0.2563, "step": 2476 }, { "epoch": 0.38522550544323486, "grad_norm": 1.3911670036391348, "learning_rate": 6.7640602166047e-06, "loss": 0.2096, "step": 2477 }, { "epoch": 0.3853810264385692, "grad_norm": 0.8946243870336498, "learning_rate": 6.761774181828087e-06, "loss": 0.1679, "step": 2478 }, { "epoch": 0.3855365474339036, "grad_norm": 1.4635211181163266, "learning_rate": 6.759487726491406e-06, "loss": 0.2103, "step": 2479 }, { "epoch": 0.3856920684292379, "grad_norm": 0.8507830922186865, "learning_rate": 6.757200851140468e-06, "loss": 0.2464, "step": 2480 }, { "epoch": 0.3858475894245723, "grad_norm": 1.1518824623115165, "learning_rate": 6.754913556321181e-06, "loss": 0.199, "step": 2481 }, { "epoch": 0.3860031104199067, "grad_norm": 0.7580533454819722, "learning_rate": 6.752625842579557e-06, "loss": 0.1638, "step": 2482 }, { "epoch": 0.38615863141524104, "grad_norm": 1.3857158874499211, "learning_rate": 6.750337710461702e-06, "loss": 0.272, "step": 2483 }, { "epoch": 0.38631415241057543, "grad_norm": 1.0427533654702674, "learning_rate": 6.7480491605138255e-06, "loss": 0.1823, "step": 2484 }, { "epoch": 0.3864696734059098, "grad_norm": 1.3753758935908103, "learning_rate": 6.745760193282238e-06, "loss": 0.1845, "step": 2485 }, { "epoch": 0.38662519440124415, "grad_norm": 0.9737491829236448, "learning_rate": 6.743470809313347e-06, "loss": 0.1545, "step": 2486 }, { "epoch": 0.38678071539657854, "grad_norm": 0.9836561982240439, "learning_rate": 6.74118100915366e-06, "loss": 0.1666, "step": 2487 }, { "epoch": 0.38693623639191294, "grad_norm": 1.2822808823251517, "learning_rate": 6.738890793349784e-06, "loss": 0.1784, "step": 2488 }, { "epoch": 0.38709175738724727, "grad_norm": 1.1568671135204884, "learning_rate": 6.7366001624484256e-06, "loss": 0.1481, "step": 2489 }, { "epoch": 0.38724727838258166, "grad_norm": 0.7907764994245496, "learning_rate": 6.734309116996392e-06, "loss": 0.152, "step": 2490 }, { "epoch": 0.387402799377916, "grad_norm": 0.7362136144474631, "learning_rate": 6.732017657540586e-06, "loss": 0.1845, "step": 2491 }, { "epoch": 0.3875583203732504, "grad_norm": 1.1794652000791737, "learning_rate": 6.729725784628011e-06, "loss": 0.1549, "step": 2492 }, { "epoch": 0.3877138413685848, "grad_norm": 0.9052715048695029, "learning_rate": 6.727433498805768e-06, "loss": 0.2484, "step": 2493 }, { "epoch": 0.3878693623639191, "grad_norm": 0.87038691711346, "learning_rate": 6.7251408006210615e-06, "loss": 0.1672, "step": 2494 }, { "epoch": 0.3880248833592535, "grad_norm": 0.787438840145094, "learning_rate": 6.722847690621188e-06, "loss": 0.1488, "step": 2495 }, { "epoch": 0.3881804043545879, "grad_norm": 0.8036573319821115, "learning_rate": 6.720554169353544e-06, "loss": 0.1909, "step": 2496 }, { "epoch": 0.38833592534992223, "grad_norm": 1.0459585577622728, "learning_rate": 6.7182602373656245e-06, "loss": 0.192, "step": 2497 }, { "epoch": 0.3884914463452566, "grad_norm": 1.4434500609817846, "learning_rate": 6.715965895205025e-06, "loss": 0.208, "step": 2498 }, { "epoch": 0.38864696734059095, "grad_norm": 0.8886354404441944, "learning_rate": 6.713671143419438e-06, "loss": 0.1686, "step": 2499 }, { "epoch": 0.38880248833592534, "grad_norm": 1.0441152070555744, "learning_rate": 6.711375982556648e-06, "loss": 0.1452, "step": 2500 }, { "epoch": 0.38880248833592534, "eval_loss": 0.18550948798656464, "eval_runtime": 9.412, "eval_samples_per_second": 2.762, "eval_steps_per_second": 0.744, "step": 2500 }, { "epoch": 0.38895800933125974, "grad_norm": 0.9990467571065071, "learning_rate": 6.709080413164547e-06, "loss": 0.1436, "step": 2501 }, { "epoch": 0.38911353032659407, "grad_norm": 0.9860297858570273, "learning_rate": 6.706784435791118e-06, "loss": 0.1867, "step": 2502 }, { "epoch": 0.38926905132192846, "grad_norm": 1.5675453361251264, "learning_rate": 6.704488050984442e-06, "loss": 0.1578, "step": 2503 }, { "epoch": 0.38942457231726285, "grad_norm": 1.6856171907471373, "learning_rate": 6.702191259292696e-06, "loss": 0.1585, "step": 2504 }, { "epoch": 0.3895800933125972, "grad_norm": 1.5479321558493317, "learning_rate": 6.69989406126416e-06, "loss": 0.2077, "step": 2505 }, { "epoch": 0.3897356143079316, "grad_norm": 1.1356553084894583, "learning_rate": 6.6975964574472065e-06, "loss": 0.1921, "step": 2506 }, { "epoch": 0.38989113530326597, "grad_norm": 0.9735689832390695, "learning_rate": 6.695298448390304e-06, "loss": 0.1132, "step": 2507 }, { "epoch": 0.3900466562986003, "grad_norm": 1.0106664740902342, "learning_rate": 6.693000034642021e-06, "loss": 0.2769, "step": 2508 }, { "epoch": 0.3902021772939347, "grad_norm": 1.3854882589502213, "learning_rate": 6.690701216751019e-06, "loss": 0.0958, "step": 2509 }, { "epoch": 0.39035769828926903, "grad_norm": 1.0092649527281206, "learning_rate": 6.688401995266061e-06, "loss": 0.2515, "step": 2510 }, { "epoch": 0.3905132192846034, "grad_norm": 0.831811555825957, "learning_rate": 6.686102370735998e-06, "loss": 0.1482, "step": 2511 }, { "epoch": 0.3906687402799378, "grad_norm": 0.703226079660379, "learning_rate": 6.683802343709787e-06, "loss": 0.1134, "step": 2512 }, { "epoch": 0.39082426127527214, "grad_norm": 1.4651862257166302, "learning_rate": 6.681501914736476e-06, "loss": 0.2147, "step": 2513 }, { "epoch": 0.39097978227060654, "grad_norm": 0.8776085756879949, "learning_rate": 6.679201084365208e-06, "loss": 0.1651, "step": 2514 }, { "epoch": 0.3911353032659409, "grad_norm": 1.132023872865195, "learning_rate": 6.676899853145222e-06, "loss": 0.1692, "step": 2515 }, { "epoch": 0.39129082426127526, "grad_norm": 1.2505265952362783, "learning_rate": 6.674598221625859e-06, "loss": 0.1451, "step": 2516 }, { "epoch": 0.39144634525660965, "grad_norm": 1.1032347671190128, "learning_rate": 6.672296190356545e-06, "loss": 0.1426, "step": 2517 }, { "epoch": 0.391601866251944, "grad_norm": 1.3733223503003242, "learning_rate": 6.669993759886812e-06, "loss": 0.1629, "step": 2518 }, { "epoch": 0.3917573872472784, "grad_norm": 1.0330547139967006, "learning_rate": 6.667690930766277e-06, "loss": 0.1607, "step": 2519 }, { "epoch": 0.39191290824261277, "grad_norm": 0.8571881185562782, "learning_rate": 6.665387703544661e-06, "loss": 0.2146, "step": 2520 }, { "epoch": 0.3920684292379471, "grad_norm": 1.0641537545163189, "learning_rate": 6.663084078771776e-06, "loss": 0.131, "step": 2521 }, { "epoch": 0.3922239502332815, "grad_norm": 1.135785796686817, "learning_rate": 6.660780056997528e-06, "loss": 0.1236, "step": 2522 }, { "epoch": 0.3923794712286159, "grad_norm": 1.151796277425874, "learning_rate": 6.6584756387719196e-06, "loss": 0.1863, "step": 2523 }, { "epoch": 0.3925349922239502, "grad_norm": 1.0894859561226162, "learning_rate": 6.6561708246450486e-06, "loss": 0.2826, "step": 2524 }, { "epoch": 0.3926905132192846, "grad_norm": 1.1273022058274078, "learning_rate": 6.6538656151671055e-06, "loss": 0.2095, "step": 2525 }, { "epoch": 0.392846034214619, "grad_norm": 0.9958228872301811, "learning_rate": 6.651560010888376e-06, "loss": 0.1744, "step": 2526 }, { "epoch": 0.39300155520995333, "grad_norm": 0.9190591012037924, "learning_rate": 6.6492540123592416e-06, "loss": 0.1711, "step": 2527 }, { "epoch": 0.3931570762052877, "grad_norm": 1.4248251944479098, "learning_rate": 6.646947620130174e-06, "loss": 0.2668, "step": 2528 }, { "epoch": 0.39331259720062206, "grad_norm": 1.4189007741704207, "learning_rate": 6.644640834751741e-06, "loss": 0.196, "step": 2529 }, { "epoch": 0.39346811819595645, "grad_norm": 1.5761727662706542, "learning_rate": 6.642333656774607e-06, "loss": 0.191, "step": 2530 }, { "epoch": 0.39362363919129084, "grad_norm": 1.046089701169227, "learning_rate": 6.640026086749525e-06, "loss": 0.2153, "step": 2531 }, { "epoch": 0.3937791601866252, "grad_norm": 1.3824607375826918, "learning_rate": 6.637718125227345e-06, "loss": 0.1491, "step": 2532 }, { "epoch": 0.39393468118195957, "grad_norm": 1.3750367616165267, "learning_rate": 6.63540977275901e-06, "loss": 0.1914, "step": 2533 }, { "epoch": 0.39409020217729396, "grad_norm": 0.69638905428689, "learning_rate": 6.6331010298955555e-06, "loss": 0.1694, "step": 2534 }, { "epoch": 0.3942457231726283, "grad_norm": 1.1535229946544527, "learning_rate": 6.6307918971881115e-06, "loss": 0.2225, "step": 2535 }, { "epoch": 0.3944012441679627, "grad_norm": 0.7069936805516683, "learning_rate": 6.628482375187899e-06, "loss": 0.1351, "step": 2536 }, { "epoch": 0.394556765163297, "grad_norm": 2.2165766613680833, "learning_rate": 6.626172464446233e-06, "loss": 0.2179, "step": 2537 }, { "epoch": 0.3947122861586314, "grad_norm": 1.1965809528538873, "learning_rate": 6.623862165514523e-06, "loss": 0.2103, "step": 2538 }, { "epoch": 0.3948678071539658, "grad_norm": 0.8844978481215461, "learning_rate": 6.621551478944267e-06, "loss": 0.1874, "step": 2539 }, { "epoch": 0.39502332814930013, "grad_norm": 1.2808112194841403, "learning_rate": 6.61924040528706e-06, "loss": 0.2183, "step": 2540 }, { "epoch": 0.3951788491446345, "grad_norm": 1.6710345531801987, "learning_rate": 6.616928945094589e-06, "loss": 0.2676, "step": 2541 }, { "epoch": 0.3953343701399689, "grad_norm": 1.3969732923389626, "learning_rate": 6.614617098918628e-06, "loss": 0.1292, "step": 2542 }, { "epoch": 0.39548989113530325, "grad_norm": 1.093741880242213, "learning_rate": 6.61230486731105e-06, "loss": 0.1619, "step": 2543 }, { "epoch": 0.39564541213063764, "grad_norm": 1.10174030321484, "learning_rate": 6.609992250823816e-06, "loss": 0.122, "step": 2544 }, { "epoch": 0.39580093312597203, "grad_norm": 0.8873597599913727, "learning_rate": 6.607679250008977e-06, "loss": 0.2044, "step": 2545 }, { "epoch": 0.39595645412130637, "grad_norm": 1.2069196683573005, "learning_rate": 6.605365865418685e-06, "loss": 0.1871, "step": 2546 }, { "epoch": 0.39611197511664076, "grad_norm": 0.9561341594919964, "learning_rate": 6.60305209760517e-06, "loss": 0.1432, "step": 2547 }, { "epoch": 0.3962674961119751, "grad_norm": 1.1253490298879367, "learning_rate": 6.600737947120766e-06, "loss": 0.143, "step": 2548 }, { "epoch": 0.3964230171073095, "grad_norm": 0.9659254372478461, "learning_rate": 6.59842341451789e-06, "loss": 0.1451, "step": 2549 }, { "epoch": 0.3965785381026439, "grad_norm": 1.1995873023353782, "learning_rate": 6.596108500349054e-06, "loss": 0.2183, "step": 2550 }, { "epoch": 0.3967340590979782, "grad_norm": 0.9996271887925069, "learning_rate": 6.593793205166863e-06, "loss": 0.14, "step": 2551 }, { "epoch": 0.3968895800933126, "grad_norm": 1.0657433191093166, "learning_rate": 6.5914775295240055e-06, "loss": 0.1795, "step": 2552 }, { "epoch": 0.397045101088647, "grad_norm": 0.9252514521856418, "learning_rate": 6.589161473973267e-06, "loss": 0.1631, "step": 2553 }, { "epoch": 0.3972006220839813, "grad_norm": 1.0094538507841677, "learning_rate": 6.586845039067524e-06, "loss": 0.2324, "step": 2554 }, { "epoch": 0.3973561430793157, "grad_norm": 1.2027400622245066, "learning_rate": 6.584528225359741e-06, "loss": 0.2006, "step": 2555 }, { "epoch": 0.39751166407465005, "grad_norm": 0.8178915256144502, "learning_rate": 6.582211033402973e-06, "loss": 0.1227, "step": 2556 }, { "epoch": 0.39766718506998444, "grad_norm": 1.1667701053353001, "learning_rate": 6.579893463750368e-06, "loss": 0.2173, "step": 2557 }, { "epoch": 0.39782270606531883, "grad_norm": 0.687233458313678, "learning_rate": 6.577575516955159e-06, "loss": 0.1673, "step": 2558 }, { "epoch": 0.39797822706065317, "grad_norm": 0.915520126183653, "learning_rate": 6.575257193570675e-06, "loss": 0.2475, "step": 2559 }, { "epoch": 0.39813374805598756, "grad_norm": 1.3543601407286745, "learning_rate": 6.572938494150332e-06, "loss": 0.1642, "step": 2560 }, { "epoch": 0.39828926905132195, "grad_norm": 1.1710161709539633, "learning_rate": 6.570619419247632e-06, "loss": 0.1932, "step": 2561 }, { "epoch": 0.3984447900466563, "grad_norm": 0.8522325579505854, "learning_rate": 6.568299969416177e-06, "loss": 0.1561, "step": 2562 }, { "epoch": 0.3986003110419907, "grad_norm": 0.8617306286599958, "learning_rate": 6.565980145209647e-06, "loss": 0.1641, "step": 2563 }, { "epoch": 0.39875583203732506, "grad_norm": 1.1274722811981948, "learning_rate": 6.563659947181818e-06, "loss": 0.1711, "step": 2564 }, { "epoch": 0.3989113530326594, "grad_norm": 3.3021048144517473, "learning_rate": 6.561339375886554e-06, "loss": 0.1655, "step": 2565 }, { "epoch": 0.3990668740279938, "grad_norm": 1.3341003956327966, "learning_rate": 6.559018431877807e-06, "loss": 0.2471, "step": 2566 }, { "epoch": 0.3992223950233281, "grad_norm": 1.1996450092092934, "learning_rate": 6.5566971157096206e-06, "loss": 0.1449, "step": 2567 }, { "epoch": 0.3993779160186625, "grad_norm": 0.982569264444697, "learning_rate": 6.554375427936122e-06, "loss": 0.1215, "step": 2568 }, { "epoch": 0.3995334370139969, "grad_norm": 1.4792560919540625, "learning_rate": 6.5520533691115314e-06, "loss": 0.1593, "step": 2569 }, { "epoch": 0.39968895800933124, "grad_norm": 0.835825186388195, "learning_rate": 6.54973093979016e-06, "loss": 0.1793, "step": 2570 }, { "epoch": 0.39984447900466563, "grad_norm": 1.0347358509727098, "learning_rate": 6.5474081405264e-06, "loss": 0.122, "step": 2571 }, { "epoch": 0.4, "grad_norm": 1.3082966545418786, "learning_rate": 6.545084971874738e-06, "loss": 0.2343, "step": 2572 }, { "epoch": 0.40015552099533436, "grad_norm": 1.1609096266201222, "learning_rate": 6.542761434389746e-06, "loss": 0.1987, "step": 2573 }, { "epoch": 0.40031104199066875, "grad_norm": 1.2901002578091536, "learning_rate": 6.540437528626084e-06, "loss": 0.247, "step": 2574 }, { "epoch": 0.4004665629860031, "grad_norm": 1.338428131646427, "learning_rate": 6.538113255138504e-06, "loss": 0.1758, "step": 2575 }, { "epoch": 0.4006220839813375, "grad_norm": 0.9203586556095333, "learning_rate": 6.5357886144818395e-06, "loss": 0.1967, "step": 2576 }, { "epoch": 0.40077760497667186, "grad_norm": 0.8861777595132381, "learning_rate": 6.533463607211014e-06, "loss": 0.1505, "step": 2577 }, { "epoch": 0.4009331259720062, "grad_norm": 1.0863253727544036, "learning_rate": 6.531138233881042e-06, "loss": 0.1279, "step": 2578 }, { "epoch": 0.4010886469673406, "grad_norm": 1.9508982713892475, "learning_rate": 6.528812495047021e-06, "loss": 0.1701, "step": 2579 }, { "epoch": 0.401244167962675, "grad_norm": 1.22849548259053, "learning_rate": 6.526486391264137e-06, "loss": 0.2266, "step": 2580 }, { "epoch": 0.4013996889580093, "grad_norm": 1.139795350440526, "learning_rate": 6.5241599230876625e-06, "loss": 0.1594, "step": 2581 }, { "epoch": 0.4015552099533437, "grad_norm": 0.9950317958220553, "learning_rate": 6.521833091072961e-06, "loss": 0.1872, "step": 2582 }, { "epoch": 0.4017107309486781, "grad_norm": 1.0932112377073522, "learning_rate": 6.519505895775477e-06, "loss": 0.1559, "step": 2583 }, { "epoch": 0.40186625194401243, "grad_norm": 1.2070662595798771, "learning_rate": 6.517178337750747e-06, "loss": 0.1881, "step": 2584 }, { "epoch": 0.4020217729393468, "grad_norm": 1.0151555254771387, "learning_rate": 6.514850417554388e-06, "loss": 0.1283, "step": 2585 }, { "epoch": 0.40217729393468116, "grad_norm": 1.4916575114548716, "learning_rate": 6.512522135742111e-06, "loss": 0.2287, "step": 2586 }, { "epoch": 0.40233281493001555, "grad_norm": 0.9835611563866982, "learning_rate": 6.510193492869706e-06, "loss": 0.1682, "step": 2587 }, { "epoch": 0.40248833592534994, "grad_norm": 1.0095171292114156, "learning_rate": 6.507864489493054e-06, "loss": 0.1531, "step": 2588 }, { "epoch": 0.4026438569206843, "grad_norm": 0.7192920567975817, "learning_rate": 6.505535126168121e-06, "loss": 0.1222, "step": 2589 }, { "epoch": 0.40279937791601866, "grad_norm": 0.9567575568459159, "learning_rate": 6.503205403450957e-06, "loss": 0.099, "step": 2590 }, { "epoch": 0.40295489891135305, "grad_norm": 0.8403750129951797, "learning_rate": 6.5008753218977e-06, "loss": 0.2145, "step": 2591 }, { "epoch": 0.4031104199066874, "grad_norm": 0.9515307385148366, "learning_rate": 6.498544882064576e-06, "loss": 0.2046, "step": 2592 }, { "epoch": 0.4032659409020218, "grad_norm": 0.9826148084354455, "learning_rate": 6.496214084507888e-06, "loss": 0.1735, "step": 2593 }, { "epoch": 0.40342146189735617, "grad_norm": 1.2292330812081729, "learning_rate": 6.493882929784032e-06, "loss": 0.1547, "step": 2594 }, { "epoch": 0.4035769828926905, "grad_norm": 1.1498595408012762, "learning_rate": 6.491551418449491e-06, "loss": 0.2069, "step": 2595 }, { "epoch": 0.4037325038880249, "grad_norm": 0.8880969386784783, "learning_rate": 6.489219551060824e-06, "loss": 0.1732, "step": 2596 }, { "epoch": 0.40388802488335923, "grad_norm": 1.5756279660116221, "learning_rate": 6.486887328174682e-06, "loss": 0.2083, "step": 2597 }, { "epoch": 0.4040435458786936, "grad_norm": 1.1086323584904356, "learning_rate": 6.4845547503478e-06, "loss": 0.1775, "step": 2598 }, { "epoch": 0.404199066874028, "grad_norm": 12.047463947408607, "learning_rate": 6.482221818136995e-06, "loss": 0.1474, "step": 2599 }, { "epoch": 0.40435458786936235, "grad_norm": 1.1122619424069295, "learning_rate": 6.479888532099175e-06, "loss": 0.1442, "step": 2600 }, { "epoch": 0.40435458786936235, "eval_loss": 0.18392392992973328, "eval_runtime": 9.4278, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.742, "step": 2600 }, { "epoch": 0.40451010886469674, "grad_norm": 1.6012241880498883, "learning_rate": 6.47755489279132e-06, "loss": 0.231, "step": 2601 }, { "epoch": 0.40466562986003113, "grad_norm": 1.0471135008496655, "learning_rate": 6.475220900770509e-06, "loss": 0.152, "step": 2602 }, { "epoch": 0.40482115085536546, "grad_norm": 1.4663020626238195, "learning_rate": 6.472886556593898e-06, "loss": 0.241, "step": 2603 }, { "epoch": 0.40497667185069985, "grad_norm": 1.7404520971287787, "learning_rate": 6.470551860818725e-06, "loss": 0.1969, "step": 2604 }, { "epoch": 0.4051321928460342, "grad_norm": 1.0267819614483535, "learning_rate": 6.468216814002314e-06, "loss": 0.2262, "step": 2605 }, { "epoch": 0.4052877138413686, "grad_norm": 1.2063361670716368, "learning_rate": 6.465881416702075e-06, "loss": 0.1653, "step": 2606 }, { "epoch": 0.40544323483670297, "grad_norm": 1.4889741705264727, "learning_rate": 6.463545669475499e-06, "loss": 0.1706, "step": 2607 }, { "epoch": 0.4055987558320373, "grad_norm": 1.0200237015393088, "learning_rate": 6.461209572880163e-06, "loss": 0.2231, "step": 2608 }, { "epoch": 0.4057542768273717, "grad_norm": 1.0699383355086862, "learning_rate": 6.458873127473724e-06, "loss": 0.1446, "step": 2609 }, { "epoch": 0.4059097978227061, "grad_norm": 1.1610852999195076, "learning_rate": 6.4565363338139245e-06, "loss": 0.0969, "step": 2610 }, { "epoch": 0.4060653188180404, "grad_norm": 0.8571185506880802, "learning_rate": 6.45419919245859e-06, "loss": 0.1822, "step": 2611 }, { "epoch": 0.4062208398133748, "grad_norm": 1.0582602538344552, "learning_rate": 6.451861703965629e-06, "loss": 0.1854, "step": 2612 }, { "epoch": 0.4063763608087092, "grad_norm": 1.1914856822676623, "learning_rate": 6.449523868893033e-06, "loss": 0.1716, "step": 2613 }, { "epoch": 0.40653188180404354, "grad_norm": 1.1166961129590682, "learning_rate": 6.447185687798873e-06, "loss": 0.1871, "step": 2614 }, { "epoch": 0.40668740279937793, "grad_norm": 1.1480598691897306, "learning_rate": 6.44484716124131e-06, "loss": 0.2112, "step": 2615 }, { "epoch": 0.40684292379471226, "grad_norm": 0.6585214221885924, "learning_rate": 6.4425082897785804e-06, "loss": 0.1236, "step": 2616 }, { "epoch": 0.40699844479004665, "grad_norm": 1.2656751084038902, "learning_rate": 6.4401690739690045e-06, "loss": 0.1576, "step": 2617 }, { "epoch": 0.40715396578538104, "grad_norm": 1.1455803018284632, "learning_rate": 6.437829514370987e-06, "loss": 0.1705, "step": 2618 }, { "epoch": 0.4073094867807154, "grad_norm": 0.9252820891880718, "learning_rate": 6.435489611543014e-06, "loss": 0.125, "step": 2619 }, { "epoch": 0.40746500777604977, "grad_norm": 1.3534425411284317, "learning_rate": 6.433149366043652e-06, "loss": 0.215, "step": 2620 }, { "epoch": 0.40762052877138416, "grad_norm": 0.8965574795162109, "learning_rate": 6.4308087784315495e-06, "loss": 0.1626, "step": 2621 }, { "epoch": 0.4077760497667185, "grad_norm": 1.393104946172338, "learning_rate": 6.428467849265438e-06, "loss": 0.197, "step": 2622 }, { "epoch": 0.4079315707620529, "grad_norm": 0.9992776445754116, "learning_rate": 6.42612657910413e-06, "loss": 0.22, "step": 2623 }, { "epoch": 0.4080870917573872, "grad_norm": 1.3972994757368875, "learning_rate": 6.423784968506522e-06, "loss": 0.1379, "step": 2624 }, { "epoch": 0.4082426127527216, "grad_norm": 1.1290940218231371, "learning_rate": 6.421443018031583e-06, "loss": 0.2309, "step": 2625 }, { "epoch": 0.408398133748056, "grad_norm": 1.1975551415540504, "learning_rate": 6.419100728238376e-06, "loss": 0.1785, "step": 2626 }, { "epoch": 0.40855365474339034, "grad_norm": 0.9490423553872639, "learning_rate": 6.416758099686035e-06, "loss": 0.1642, "step": 2627 }, { "epoch": 0.40870917573872473, "grad_norm": 1.3032944880216926, "learning_rate": 6.414415132933777e-06, "loss": 0.1268, "step": 2628 }, { "epoch": 0.4088646967340591, "grad_norm": 1.4969984220940926, "learning_rate": 6.412071828540902e-06, "loss": 0.2249, "step": 2629 }, { "epoch": 0.40902021772939345, "grad_norm": 1.2114591316600913, "learning_rate": 6.409728187066789e-06, "loss": 0.1469, "step": 2630 }, { "epoch": 0.40917573872472784, "grad_norm": 0.8952060394614973, "learning_rate": 6.4073842090709e-06, "loss": 0.1942, "step": 2631 }, { "epoch": 0.40933125972006223, "grad_norm": 1.1019489445397095, "learning_rate": 6.405039895112772e-06, "loss": 0.1943, "step": 2632 }, { "epoch": 0.40948678071539657, "grad_norm": 1.1532203737322906, "learning_rate": 6.402695245752027e-06, "loss": 0.2128, "step": 2633 }, { "epoch": 0.40964230171073096, "grad_norm": 0.9643307533316556, "learning_rate": 6.400350261548367e-06, "loss": 0.4534, "step": 2634 }, { "epoch": 0.4097978227060653, "grad_norm": 1.1582055252802126, "learning_rate": 6.398004943061571e-06, "loss": 0.1636, "step": 2635 }, { "epoch": 0.4099533437013997, "grad_norm": 1.0473207477604118, "learning_rate": 6.395659290851499e-06, "loss": 0.1744, "step": 2636 }, { "epoch": 0.4101088646967341, "grad_norm": 0.8248634354547286, "learning_rate": 6.3933133054780905e-06, "loss": 0.1963, "step": 2637 }, { "epoch": 0.4102643856920684, "grad_norm": 1.124966003012794, "learning_rate": 6.390966987501366e-06, "loss": 0.1182, "step": 2638 }, { "epoch": 0.4104199066874028, "grad_norm": 1.339745129550256, "learning_rate": 6.3886203374814236e-06, "loss": 0.1681, "step": 2639 }, { "epoch": 0.4105754276827372, "grad_norm": 1.0232675408147445, "learning_rate": 6.386273355978442e-06, "loss": 0.1534, "step": 2640 }, { "epoch": 0.41073094867807153, "grad_norm": 1.327329347563451, "learning_rate": 6.383926043552678e-06, "loss": 0.2196, "step": 2641 }, { "epoch": 0.4108864696734059, "grad_norm": 1.840741335646371, "learning_rate": 6.381578400764466e-06, "loss": 0.2126, "step": 2642 }, { "epoch": 0.41104199066874025, "grad_norm": 1.4028723630845288, "learning_rate": 6.379230428174227e-06, "loss": 0.2583, "step": 2643 }, { "epoch": 0.41119751166407464, "grad_norm": 0.9947583910777306, "learning_rate": 6.3768821263424474e-06, "loss": 0.1866, "step": 2644 }, { "epoch": 0.41135303265940903, "grad_norm": 0.962856557087839, "learning_rate": 6.374533495829704e-06, "loss": 0.1334, "step": 2645 }, { "epoch": 0.41150855365474337, "grad_norm": 1.137871710735757, "learning_rate": 6.372184537196646e-06, "loss": 0.1547, "step": 2646 }, { "epoch": 0.41166407465007776, "grad_norm": 0.969129304871265, "learning_rate": 6.369835251004004e-06, "loss": 0.1269, "step": 2647 }, { "epoch": 0.41181959564541215, "grad_norm": 1.3406940840006811, "learning_rate": 6.367485637812585e-06, "loss": 0.1806, "step": 2648 }, { "epoch": 0.4119751166407465, "grad_norm": 1.4927411439793692, "learning_rate": 6.3651356981832725e-06, "loss": 0.2062, "step": 2649 }, { "epoch": 0.4121306376360809, "grad_norm": 0.9309800839960175, "learning_rate": 6.3627854326770326e-06, "loss": 0.1581, "step": 2650 }, { "epoch": 0.41228615863141527, "grad_norm": 1.629558036391231, "learning_rate": 6.360434841854904e-06, "loss": 0.1197, "step": 2651 }, { "epoch": 0.4124416796267496, "grad_norm": 0.9407864882469291, "learning_rate": 6.3580839262780094e-06, "loss": 0.2111, "step": 2652 }, { "epoch": 0.412597200622084, "grad_norm": 1.1756867926270818, "learning_rate": 6.35573268650754e-06, "loss": 0.169, "step": 2653 }, { "epoch": 0.41275272161741833, "grad_norm": 1.4873178902086037, "learning_rate": 6.353381123104772e-06, "loss": 0.1024, "step": 2654 }, { "epoch": 0.4129082426127527, "grad_norm": 1.0372923373757021, "learning_rate": 6.351029236631057e-06, "loss": 0.2047, "step": 2655 }, { "epoch": 0.4130637636080871, "grad_norm": 0.9963095003866385, "learning_rate": 6.348677027647822e-06, "loss": 0.1642, "step": 2656 }, { "epoch": 0.41321928460342144, "grad_norm": 1.3496282742580388, "learning_rate": 6.346324496716573e-06, "loss": 0.1507, "step": 2657 }, { "epoch": 0.41337480559875583, "grad_norm": 0.844902273134157, "learning_rate": 6.343971644398892e-06, "loss": 0.1087, "step": 2658 }, { "epoch": 0.4135303265940902, "grad_norm": 0.662739748926624, "learning_rate": 6.341618471256437e-06, "loss": 0.1361, "step": 2659 }, { "epoch": 0.41368584758942456, "grad_norm": 1.3071753158171826, "learning_rate": 6.339264977850943e-06, "loss": 0.154, "step": 2660 }, { "epoch": 0.41384136858475895, "grad_norm": 0.9539346223760845, "learning_rate": 6.3369111647442215e-06, "loss": 0.2557, "step": 2661 }, { "epoch": 0.4139968895800933, "grad_norm": 1.2551329247175391, "learning_rate": 6.334557032498162e-06, "loss": 0.1509, "step": 2662 }, { "epoch": 0.4141524105754277, "grad_norm": 1.3397656967537157, "learning_rate": 6.332202581674727e-06, "loss": 0.25, "step": 2663 }, { "epoch": 0.41430793157076207, "grad_norm": 1.0622216161203566, "learning_rate": 6.329847812835959e-06, "loss": 0.1219, "step": 2664 }, { "epoch": 0.4144634525660964, "grad_norm": 1.0728195170375892, "learning_rate": 6.327492726543971e-06, "loss": 0.1471, "step": 2665 }, { "epoch": 0.4146189735614308, "grad_norm": 0.8756118992580546, "learning_rate": 6.325137323360957e-06, "loss": 0.2169, "step": 2666 }, { "epoch": 0.4147744945567652, "grad_norm": 0.8408713878001115, "learning_rate": 6.322781603849184e-06, "loss": 0.2046, "step": 2667 }, { "epoch": 0.4149300155520995, "grad_norm": 0.9213640910578913, "learning_rate": 6.320425568570997e-06, "loss": 0.1469, "step": 2668 }, { "epoch": 0.4150855365474339, "grad_norm": 1.248378336688698, "learning_rate": 6.318069218088812e-06, "loss": 0.1649, "step": 2669 }, { "epoch": 0.4152410575427683, "grad_norm": 1.245514553943111, "learning_rate": 6.3157125529651205e-06, "loss": 0.1603, "step": 2670 }, { "epoch": 0.41539657853810263, "grad_norm": 1.0259536671755478, "learning_rate": 6.313355573762498e-06, "loss": 0.2525, "step": 2671 }, { "epoch": 0.415552099533437, "grad_norm": 0.6633081615629601, "learning_rate": 6.310998281043584e-06, "loss": 0.1332, "step": 2672 }, { "epoch": 0.41570762052877136, "grad_norm": 1.1235919728297674, "learning_rate": 6.308640675371098e-06, "loss": 0.2105, "step": 2673 }, { "epoch": 0.41586314152410575, "grad_norm": 0.6922528126225543, "learning_rate": 6.306282757307832e-06, "loss": 0.1865, "step": 2674 }, { "epoch": 0.41601866251944014, "grad_norm": 1.28929243572396, "learning_rate": 6.303924527416656e-06, "loss": 0.2295, "step": 2675 }, { "epoch": 0.4161741835147745, "grad_norm": 1.3530022261500312, "learning_rate": 6.301565986260512e-06, "loss": 0.1754, "step": 2676 }, { "epoch": 0.41632970451010887, "grad_norm": 1.287119137660476, "learning_rate": 6.299207134402416e-06, "loss": 0.1263, "step": 2677 }, { "epoch": 0.41648522550544326, "grad_norm": 1.4930391891253567, "learning_rate": 6.296847972405456e-06, "loss": 0.1616, "step": 2678 }, { "epoch": 0.4166407465007776, "grad_norm": 0.6449132277982559, "learning_rate": 6.2944885008328035e-06, "loss": 0.1217, "step": 2679 }, { "epoch": 0.416796267496112, "grad_norm": 0.900032571548147, "learning_rate": 6.292128720247692e-06, "loss": 0.1071, "step": 2680 }, { "epoch": 0.4169517884914463, "grad_norm": 1.144811166476071, "learning_rate": 6.289768631213435e-06, "loss": 0.2634, "step": 2681 }, { "epoch": 0.4171073094867807, "grad_norm": 0.7470519319077137, "learning_rate": 6.287408234293421e-06, "loss": 0.1582, "step": 2682 }, { "epoch": 0.4172628304821151, "grad_norm": 0.7399178538227238, "learning_rate": 6.285047530051107e-06, "loss": 0.1413, "step": 2683 }, { "epoch": 0.41741835147744943, "grad_norm": 0.8328246511470986, "learning_rate": 6.282686519050027e-06, "loss": 0.1041, "step": 2684 }, { "epoch": 0.4175738724727838, "grad_norm": 1.0152417317695583, "learning_rate": 6.280325201853787e-06, "loss": 0.1717, "step": 2685 }, { "epoch": 0.4177293934681182, "grad_norm": 1.1382879711120875, "learning_rate": 6.277963579026067e-06, "loss": 0.1407, "step": 2686 }, { "epoch": 0.41788491446345255, "grad_norm": 0.8129669486475961, "learning_rate": 6.275601651130618e-06, "loss": 0.1896, "step": 2687 }, { "epoch": 0.41804043545878694, "grad_norm": 1.0565990613592156, "learning_rate": 6.273239418731267e-06, "loss": 0.1192, "step": 2688 }, { "epoch": 0.41819595645412133, "grad_norm": 0.7901517490083826, "learning_rate": 6.27087688239191e-06, "loss": 0.1216, "step": 2689 }, { "epoch": 0.41835147744945567, "grad_norm": 1.119159307172197, "learning_rate": 6.268514042676519e-06, "loss": 0.114, "step": 2690 }, { "epoch": 0.41850699844479006, "grad_norm": 1.5849822239001683, "learning_rate": 6.266150900149135e-06, "loss": 0.1871, "step": 2691 }, { "epoch": 0.4186625194401244, "grad_norm": 0.8903869628133892, "learning_rate": 6.263787455373875e-06, "loss": 0.1611, "step": 2692 }, { "epoch": 0.4188180404354588, "grad_norm": 0.9702277926833924, "learning_rate": 6.261423708914925e-06, "loss": 0.1834, "step": 2693 }, { "epoch": 0.4189735614307932, "grad_norm": 1.3679898423790677, "learning_rate": 6.259059661336543e-06, "loss": 0.1524, "step": 2694 }, { "epoch": 0.4191290824261275, "grad_norm": 1.5442427900376117, "learning_rate": 6.256695313203064e-06, "loss": 0.2535, "step": 2695 }, { "epoch": 0.4192846034214619, "grad_norm": 0.9596280367748563, "learning_rate": 6.254330665078887e-06, "loss": 0.1349, "step": 2696 }, { "epoch": 0.4194401244167963, "grad_norm": 1.3924475715171074, "learning_rate": 6.251965717528489e-06, "loss": 0.1868, "step": 2697 }, { "epoch": 0.4195956454121306, "grad_norm": 0.9376631637186745, "learning_rate": 6.249600471116414e-06, "loss": 0.1845, "step": 2698 }, { "epoch": 0.419751166407465, "grad_norm": 0.9297826280170743, "learning_rate": 6.2472349264072805e-06, "loss": 0.1173, "step": 2699 }, { "epoch": 0.4199066874027994, "grad_norm": 1.1021757886576273, "learning_rate": 6.244869083965777e-06, "loss": 0.1449, "step": 2700 }, { "epoch": 0.4199066874027994, "eval_loss": 0.18403930962085724, "eval_runtime": 9.4456, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 2700 }, { "epoch": 0.42006220839813374, "grad_norm": 1.3839131174734114, "learning_rate": 6.242502944356664e-06, "loss": 0.2283, "step": 2701 }, { "epoch": 0.42021772939346813, "grad_norm": 1.2103695953597375, "learning_rate": 6.240136508144769e-06, "loss": 0.1979, "step": 2702 }, { "epoch": 0.42037325038880247, "grad_norm": 1.0325445375434532, "learning_rate": 6.237769775894996e-06, "loss": 0.169, "step": 2703 }, { "epoch": 0.42052877138413686, "grad_norm": 1.0209707078615324, "learning_rate": 6.235402748172317e-06, "loss": 0.1391, "step": 2704 }, { "epoch": 0.42068429237947125, "grad_norm": 1.131172047813453, "learning_rate": 6.233035425541771e-06, "loss": 0.0976, "step": 2705 }, { "epoch": 0.4208398133748056, "grad_norm": 1.0592885096611644, "learning_rate": 6.230667808568476e-06, "loss": 0.1221, "step": 2706 }, { "epoch": 0.42099533437014, "grad_norm": 0.8993285570457389, "learning_rate": 6.228299897817612e-06, "loss": 0.107, "step": 2707 }, { "epoch": 0.42115085536547436, "grad_norm": 1.3316754384126217, "learning_rate": 6.225931693854433e-06, "loss": 0.1692, "step": 2708 }, { "epoch": 0.4213063763608087, "grad_norm": 1.409893657230433, "learning_rate": 6.223563197244264e-06, "loss": 0.1962, "step": 2709 }, { "epoch": 0.4214618973561431, "grad_norm": 0.85137288967356, "learning_rate": 6.221194408552494e-06, "loss": 0.1318, "step": 2710 }, { "epoch": 0.4216174183514774, "grad_norm": 1.8567028612535978, "learning_rate": 6.218825328344592e-06, "loss": 0.1696, "step": 2711 }, { "epoch": 0.4217729393468118, "grad_norm": 0.9681702365732746, "learning_rate": 6.2164559571860846e-06, "loss": 0.133, "step": 2712 }, { "epoch": 0.4219284603421462, "grad_norm": 1.214936517892813, "learning_rate": 6.2140862956425764e-06, "loss": 0.1648, "step": 2713 }, { "epoch": 0.42208398133748054, "grad_norm": 1.3318961231484716, "learning_rate": 6.211716344279739e-06, "loss": 0.1711, "step": 2714 }, { "epoch": 0.42223950233281493, "grad_norm": 0.7485709469445069, "learning_rate": 6.2093461036633116e-06, "loss": 0.1114, "step": 2715 }, { "epoch": 0.4223950233281493, "grad_norm": 1.0680634365618416, "learning_rate": 6.2069755743591044e-06, "loss": 0.1379, "step": 2716 }, { "epoch": 0.42255054432348366, "grad_norm": 1.4696828459658164, "learning_rate": 6.204604756932997e-06, "loss": 0.2144, "step": 2717 }, { "epoch": 0.42270606531881805, "grad_norm": 1.1670456572923806, "learning_rate": 6.202233651950933e-06, "loss": 0.1702, "step": 2718 }, { "epoch": 0.42286158631415244, "grad_norm": 1.073338214494645, "learning_rate": 6.199862259978931e-06, "loss": 0.1904, "step": 2719 }, { "epoch": 0.4230171073094868, "grad_norm": 0.7818406597452295, "learning_rate": 6.197490581583078e-06, "loss": 0.1233, "step": 2720 }, { "epoch": 0.42317262830482116, "grad_norm": 0.7767090098621725, "learning_rate": 6.195118617329521e-06, "loss": 0.2202, "step": 2721 }, { "epoch": 0.4233281493001555, "grad_norm": 1.2087028632227785, "learning_rate": 6.192746367784483e-06, "loss": 0.2522, "step": 2722 }, { "epoch": 0.4234836702954899, "grad_norm": 1.101934876622112, "learning_rate": 6.1903738335142535e-06, "loss": 0.2551, "step": 2723 }, { "epoch": 0.4236391912908243, "grad_norm": 1.0688244779812055, "learning_rate": 6.188001015085191e-06, "loss": 0.2584, "step": 2724 }, { "epoch": 0.4237947122861586, "grad_norm": 1.0952006908866212, "learning_rate": 6.185627913063719e-06, "loss": 0.0999, "step": 2725 }, { "epoch": 0.423950233281493, "grad_norm": 0.9687490932628856, "learning_rate": 6.183254528016329e-06, "loss": 0.1735, "step": 2726 }, { "epoch": 0.4241057542768274, "grad_norm": 1.021966827250519, "learning_rate": 6.180880860509582e-06, "loss": 0.164, "step": 2727 }, { "epoch": 0.42426127527216173, "grad_norm": 1.0404367024477605, "learning_rate": 6.178506911110107e-06, "loss": 0.1331, "step": 2728 }, { "epoch": 0.4244167962674961, "grad_norm": 1.2256681309262265, "learning_rate": 6.176132680384597e-06, "loss": 0.2051, "step": 2729 }, { "epoch": 0.42457231726283046, "grad_norm": 0.9536222110996312, "learning_rate": 6.173758168899814e-06, "loss": 0.1285, "step": 2730 }, { "epoch": 0.42472783825816485, "grad_norm": 2.3675460804772936, "learning_rate": 6.171383377222588e-06, "loss": 0.1318, "step": 2731 }, { "epoch": 0.42488335925349924, "grad_norm": 0.8201832475630726, "learning_rate": 6.169008305919812e-06, "loss": 0.1497, "step": 2732 }, { "epoch": 0.4250388802488336, "grad_norm": 0.8366398235116912, "learning_rate": 6.1666329555584545e-06, "loss": 0.1869, "step": 2733 }, { "epoch": 0.42519440124416796, "grad_norm": 0.804692759986444, "learning_rate": 6.164257326705539e-06, "loss": 0.1881, "step": 2734 }, { "epoch": 0.42534992223950235, "grad_norm": 0.9860949820595488, "learning_rate": 6.161881419928164e-06, "loss": 0.1821, "step": 2735 }, { "epoch": 0.4255054432348367, "grad_norm": 1.0219815132629755, "learning_rate": 6.159505235793492e-06, "loss": 0.1581, "step": 2736 }, { "epoch": 0.4256609642301711, "grad_norm": 1.1430977608140263, "learning_rate": 6.1571287748687495e-06, "loss": 0.2025, "step": 2737 }, { "epoch": 0.42581648522550547, "grad_norm": 1.946710285709528, "learning_rate": 6.154752037721231e-06, "loss": 0.2121, "step": 2738 }, { "epoch": 0.4259720062208398, "grad_norm": 0.7470352704437541, "learning_rate": 6.152375024918298e-06, "loss": 0.1541, "step": 2739 }, { "epoch": 0.4261275272161742, "grad_norm": 1.0415401285149013, "learning_rate": 6.149997737027377e-06, "loss": 0.179, "step": 2740 }, { "epoch": 0.42628304821150853, "grad_norm": 1.0741900992661568, "learning_rate": 6.147620174615959e-06, "loss": 0.1686, "step": 2741 }, { "epoch": 0.4264385692068429, "grad_norm": 0.8112296386381275, "learning_rate": 6.145242338251599e-06, "loss": 0.1508, "step": 2742 }, { "epoch": 0.4265940902021773, "grad_norm": 1.1582796783536475, "learning_rate": 6.142864228501923e-06, "loss": 0.1963, "step": 2743 }, { "epoch": 0.42674961119751165, "grad_norm": 1.203418154190312, "learning_rate": 6.14048584593462e-06, "loss": 0.2358, "step": 2744 }, { "epoch": 0.42690513219284604, "grad_norm": 0.738032741036954, "learning_rate": 6.138107191117439e-06, "loss": 0.0915, "step": 2745 }, { "epoch": 0.42706065318818043, "grad_norm": 1.1428068991107414, "learning_rate": 6.135728264618199e-06, "loss": 0.2341, "step": 2746 }, { "epoch": 0.42721617418351476, "grad_norm": 1.3963243468594224, "learning_rate": 6.133349067004785e-06, "loss": 0.1242, "step": 2747 }, { "epoch": 0.42737169517884915, "grad_norm": 1.0624959012919213, "learning_rate": 6.130969598845144e-06, "loss": 0.2516, "step": 2748 }, { "epoch": 0.4275272161741835, "grad_norm": 1.0824886763772477, "learning_rate": 6.128589860707288e-06, "loss": 0.1925, "step": 2749 }, { "epoch": 0.4276827371695179, "grad_norm": 0.814417028298076, "learning_rate": 6.126209853159293e-06, "loss": 0.2005, "step": 2750 }, { "epoch": 0.42783825816485227, "grad_norm": 1.1937991072742926, "learning_rate": 6.1238295767693e-06, "loss": 0.1803, "step": 2751 }, { "epoch": 0.4279937791601866, "grad_norm": 1.401886691527961, "learning_rate": 6.121449032105516e-06, "loss": 0.201, "step": 2752 }, { "epoch": 0.428149300155521, "grad_norm": 0.5940624211299631, "learning_rate": 6.1190682197362084e-06, "loss": 0.1291, "step": 2753 }, { "epoch": 0.4283048211508554, "grad_norm": 0.8861897913302674, "learning_rate": 6.11668714022971e-06, "loss": 0.1322, "step": 2754 }, { "epoch": 0.4284603421461897, "grad_norm": 0.7095702863546661, "learning_rate": 6.114305794154419e-06, "loss": 0.1304, "step": 2755 }, { "epoch": 0.4286158631415241, "grad_norm": 1.2612396921430442, "learning_rate": 6.111924182078796e-06, "loss": 0.1682, "step": 2756 }, { "epoch": 0.4287713841368585, "grad_norm": 1.0035168666602698, "learning_rate": 6.109542304571363e-06, "loss": 0.1752, "step": 2757 }, { "epoch": 0.42892690513219284, "grad_norm": 0.7121608308196761, "learning_rate": 6.107160162200709e-06, "loss": 0.1326, "step": 2758 }, { "epoch": 0.42908242612752723, "grad_norm": 1.1619661661760665, "learning_rate": 6.104777755535485e-06, "loss": 0.1345, "step": 2759 }, { "epoch": 0.42923794712286156, "grad_norm": 1.2996017300650153, "learning_rate": 6.102395085144406e-06, "loss": 0.2134, "step": 2760 }, { "epoch": 0.42939346811819595, "grad_norm": 1.332109498205559, "learning_rate": 6.100012151596244e-06, "loss": 0.2343, "step": 2761 }, { "epoch": 0.42954898911353034, "grad_norm": 1.8347254708432401, "learning_rate": 6.097628955459842e-06, "loss": 0.1633, "step": 2762 }, { "epoch": 0.4297045101088647, "grad_norm": 0.9693547880747295, "learning_rate": 6.095245497304101e-06, "loss": 0.1711, "step": 2763 }, { "epoch": 0.42986003110419907, "grad_norm": 3.3104239938993336, "learning_rate": 6.092861777697987e-06, "loss": 0.1519, "step": 2764 }, { "epoch": 0.43001555209953346, "grad_norm": 1.1901935273384043, "learning_rate": 6.090477797210525e-06, "loss": 0.1497, "step": 2765 }, { "epoch": 0.4301710730948678, "grad_norm": 0.662484568038456, "learning_rate": 6.088093556410806e-06, "loss": 0.1129, "step": 2766 }, { "epoch": 0.4303265940902022, "grad_norm": 1.078409033667744, "learning_rate": 6.085709055867981e-06, "loss": 0.1643, "step": 2767 }, { "epoch": 0.4304821150855365, "grad_norm": 1.1065575083377894, "learning_rate": 6.083324296151265e-06, "loss": 0.1925, "step": 2768 }, { "epoch": 0.4306376360808709, "grad_norm": 1.3350606272565388, "learning_rate": 6.080939277829931e-06, "loss": 0.2053, "step": 2769 }, { "epoch": 0.4307931570762053, "grad_norm": 1.2508941660307586, "learning_rate": 6.078554001473317e-06, "loss": 0.2212, "step": 2770 }, { "epoch": 0.43094867807153964, "grad_norm": 1.6892182108658822, "learning_rate": 6.0761684676508224e-06, "loss": 0.1292, "step": 2771 }, { "epoch": 0.431104199066874, "grad_norm": 0.8668974133139533, "learning_rate": 6.073782676931906e-06, "loss": 0.1963, "step": 2772 }, { "epoch": 0.4312597200622084, "grad_norm": 1.2827046902591641, "learning_rate": 6.071396629886091e-06, "loss": 0.2194, "step": 2773 }, { "epoch": 0.43141524105754275, "grad_norm": 1.2223434254101395, "learning_rate": 6.069010327082958e-06, "loss": 0.1729, "step": 2774 }, { "epoch": 0.43157076205287714, "grad_norm": 1.3258450693702206, "learning_rate": 6.066623769092154e-06, "loss": 0.2045, "step": 2775 }, { "epoch": 0.43172628304821153, "grad_norm": 1.3927551196659436, "learning_rate": 6.064236956483381e-06, "loss": 0.1167, "step": 2776 }, { "epoch": 0.43188180404354587, "grad_norm": 1.3432278378740052, "learning_rate": 6.0618498898264045e-06, "loss": 0.211, "step": 2777 }, { "epoch": 0.43203732503888026, "grad_norm": 1.4656125379900269, "learning_rate": 6.059462569691053e-06, "loss": 0.1121, "step": 2778 }, { "epoch": 0.4321928460342146, "grad_norm": 1.038270771546754, "learning_rate": 6.057074996647209e-06, "loss": 0.1434, "step": 2779 }, { "epoch": 0.432348367029549, "grad_norm": 1.0215187659015073, "learning_rate": 6.054687171264822e-06, "loss": 0.1616, "step": 2780 }, { "epoch": 0.4325038880248834, "grad_norm": 1.0389056062360256, "learning_rate": 6.0522990941139005e-06, "loss": 0.1482, "step": 2781 }, { "epoch": 0.4326594090202177, "grad_norm": 0.7056092432298983, "learning_rate": 6.049910765764511e-06, "loss": 0.1943, "step": 2782 }, { "epoch": 0.4328149300155521, "grad_norm": 1.1564230243764642, "learning_rate": 6.047522186786778e-06, "loss": 0.2145, "step": 2783 }, { "epoch": 0.4329704510108865, "grad_norm": 1.078138461613302, "learning_rate": 6.045133357750892e-06, "loss": 0.2301, "step": 2784 }, { "epoch": 0.4331259720062208, "grad_norm": 1.1786643649889719, "learning_rate": 6.0427442792271e-06, "loss": 0.1623, "step": 2785 }, { "epoch": 0.4332814930015552, "grad_norm": 1.270300645361853, "learning_rate": 6.040354951785706e-06, "loss": 0.2662, "step": 2786 }, { "epoch": 0.43343701399688955, "grad_norm": 0.873659296238628, "learning_rate": 6.037965375997075e-06, "loss": 0.274, "step": 2787 }, { "epoch": 0.43359253499222394, "grad_norm": 0.9246410387047762, "learning_rate": 6.035575552431635e-06, "loss": 0.1773, "step": 2788 }, { "epoch": 0.43374805598755833, "grad_norm": 1.375329824980958, "learning_rate": 6.033185481659869e-06, "loss": 0.2068, "step": 2789 }, { "epoch": 0.43390357698289267, "grad_norm": 1.1499596797540992, "learning_rate": 6.030795164252321e-06, "loss": 0.2106, "step": 2790 }, { "epoch": 0.43405909797822706, "grad_norm": 0.9408599058803538, "learning_rate": 6.028404600779592e-06, "loss": 0.2393, "step": 2791 }, { "epoch": 0.43421461897356145, "grad_norm": 1.0778697973398839, "learning_rate": 6.026013791812342e-06, "loss": 0.1432, "step": 2792 }, { "epoch": 0.4343701399688958, "grad_norm": 1.0100235432164175, "learning_rate": 6.023622737921294e-06, "loss": 0.1608, "step": 2793 }, { "epoch": 0.4345256609642302, "grad_norm": 0.7044933532545072, "learning_rate": 6.021231439677222e-06, "loss": 0.1694, "step": 2794 }, { "epoch": 0.43468118195956457, "grad_norm": 1.2375989299016086, "learning_rate": 6.018839897650962e-06, "loss": 0.1602, "step": 2795 }, { "epoch": 0.4348367029548989, "grad_norm": 0.913444855853312, "learning_rate": 6.016448112413414e-06, "loss": 0.1826, "step": 2796 }, { "epoch": 0.4349922239502333, "grad_norm": 1.0558776997873436, "learning_rate": 6.014056084535525e-06, "loss": 0.19, "step": 2797 }, { "epoch": 0.4351477449455676, "grad_norm": 1.0898931184038367, "learning_rate": 6.0116638145883065e-06, "loss": 0.2241, "step": 2798 }, { "epoch": 0.435303265940902, "grad_norm": 1.1187765772901797, "learning_rate": 6.0092713031428275e-06, "loss": 0.2315, "step": 2799 }, { "epoch": 0.4354587869362364, "grad_norm": 0.8882182163976724, "learning_rate": 6.006878550770213e-06, "loss": 0.109, "step": 2800 }, { "epoch": 0.4354587869362364, "eval_loss": 0.18156467378139496, "eval_runtime": 9.4195, "eval_samples_per_second": 2.76, "eval_steps_per_second": 0.743, "step": 2800 }, { "epoch": 0.43561430793157074, "grad_norm": 1.3698138557530846, "learning_rate": 6.00448555804165e-06, "loss": 0.1803, "step": 2801 }, { "epoch": 0.43576982892690513, "grad_norm": 1.1371537095075348, "learning_rate": 6.002092325528374e-06, "loss": 0.1617, "step": 2802 }, { "epoch": 0.4359253499222395, "grad_norm": 1.2999274774606788, "learning_rate": 5.999698853801684e-06, "loss": 0.1301, "step": 2803 }, { "epoch": 0.43608087091757386, "grad_norm": 1.1801258272053377, "learning_rate": 5.997305143432939e-06, "loss": 0.1887, "step": 2804 }, { "epoch": 0.43623639191290825, "grad_norm": 1.0052560251446043, "learning_rate": 5.9949111949935465e-06, "loss": 0.0917, "step": 2805 }, { "epoch": 0.4363919129082426, "grad_norm": 0.9205278931166823, "learning_rate": 5.9925170090549775e-06, "loss": 0.124, "step": 2806 }, { "epoch": 0.436547433903577, "grad_norm": 1.6149890987288589, "learning_rate": 5.990122586188758e-06, "loss": 0.1452, "step": 2807 }, { "epoch": 0.43670295489891137, "grad_norm": 1.1638581037427569, "learning_rate": 5.987727926966469e-06, "loss": 0.1756, "step": 2808 }, { "epoch": 0.4368584758942457, "grad_norm": 1.159653336170906, "learning_rate": 5.98533303195975e-06, "loss": 0.1735, "step": 2809 }, { "epoch": 0.4370139968895801, "grad_norm": 1.0187740469748812, "learning_rate": 5.982937901740296e-06, "loss": 0.1984, "step": 2810 }, { "epoch": 0.4371695178849145, "grad_norm": 1.7261246913965869, "learning_rate": 5.9805425368798545e-06, "loss": 0.2245, "step": 2811 }, { "epoch": 0.4373250388802488, "grad_norm": 1.2633614584696002, "learning_rate": 5.978146937950238e-06, "loss": 0.1525, "step": 2812 }, { "epoch": 0.4374805598755832, "grad_norm": 1.2005135064303358, "learning_rate": 5.975751105523305e-06, "loss": 0.1708, "step": 2813 }, { "epoch": 0.4376360808709176, "grad_norm": 1.0019243250356757, "learning_rate": 5.973355040170976e-06, "loss": 0.212, "step": 2814 }, { "epoch": 0.43779160186625193, "grad_norm": 1.0365968268686525, "learning_rate": 5.970958742465226e-06, "loss": 0.1433, "step": 2815 }, { "epoch": 0.4379471228615863, "grad_norm": 0.8337550824820793, "learning_rate": 5.968562212978083e-06, "loss": 0.1788, "step": 2816 }, { "epoch": 0.43810264385692066, "grad_norm": 0.9158959182447146, "learning_rate": 5.9661654522816336e-06, "loss": 0.1942, "step": 2817 }, { "epoch": 0.43825816485225505, "grad_norm": 0.8463021401446613, "learning_rate": 5.963768460948016e-06, "loss": 0.1539, "step": 2818 }, { "epoch": 0.43841368584758944, "grad_norm": 1.0053387750516638, "learning_rate": 5.961371239549426e-06, "loss": 0.15, "step": 2819 }, { "epoch": 0.4385692068429238, "grad_norm": 1.4657656947185458, "learning_rate": 5.958973788658115e-06, "loss": 0.2239, "step": 2820 }, { "epoch": 0.43872472783825817, "grad_norm": 1.4385164579884262, "learning_rate": 5.956576108846388e-06, "loss": 0.1633, "step": 2821 }, { "epoch": 0.43888024883359256, "grad_norm": 0.9514126484043769, "learning_rate": 5.9541782006866034e-06, "loss": 0.172, "step": 2822 }, { "epoch": 0.4390357698289269, "grad_norm": 1.087631545229999, "learning_rate": 5.951780064751176e-06, "loss": 0.2118, "step": 2823 }, { "epoch": 0.4391912908242613, "grad_norm": 0.8111579988056196, "learning_rate": 5.949381701612574e-06, "loss": 0.1569, "step": 2824 }, { "epoch": 0.4393468118195957, "grad_norm": 1.7587959894945275, "learning_rate": 5.94698311184332e-06, "loss": 0.2385, "step": 2825 }, { "epoch": 0.43950233281493, "grad_norm": 1.0995869935196272, "learning_rate": 5.9445842960159916e-06, "loss": 0.2054, "step": 2826 }, { "epoch": 0.4396578538102644, "grad_norm": 0.7690937530894449, "learning_rate": 5.9421852547032175e-06, "loss": 0.0863, "step": 2827 }, { "epoch": 0.43981337480559873, "grad_norm": 1.2034338007232304, "learning_rate": 5.939785988477687e-06, "loss": 0.2277, "step": 2828 }, { "epoch": 0.4399688958009331, "grad_norm": 1.1223705956089585, "learning_rate": 5.937386497912132e-06, "loss": 0.1229, "step": 2829 }, { "epoch": 0.4401244167962675, "grad_norm": 0.9231194238850893, "learning_rate": 5.934986783579349e-06, "loss": 0.1843, "step": 2830 }, { "epoch": 0.44027993779160185, "grad_norm": 0.9680209596730575, "learning_rate": 5.932586846052182e-06, "loss": 0.1657, "step": 2831 }, { "epoch": 0.44043545878693624, "grad_norm": 1.3505200241024857, "learning_rate": 5.9301866859035295e-06, "loss": 0.2072, "step": 2832 }, { "epoch": 0.44059097978227063, "grad_norm": 0.9158453803564267, "learning_rate": 5.9277863037063435e-06, "loss": 0.1115, "step": 2833 }, { "epoch": 0.44074650077760497, "grad_norm": 0.9833163426052633, "learning_rate": 5.9253857000336275e-06, "loss": 0.1705, "step": 2834 }, { "epoch": 0.44090202177293936, "grad_norm": 1.19901082598529, "learning_rate": 5.92298487545844e-06, "loss": 0.1909, "step": 2835 }, { "epoch": 0.4410575427682737, "grad_norm": 1.1841048329534065, "learning_rate": 5.920583830553892e-06, "loss": 0.2159, "step": 2836 }, { "epoch": 0.4412130637636081, "grad_norm": 1.4726201243068984, "learning_rate": 5.918182565893146e-06, "loss": 0.1949, "step": 2837 }, { "epoch": 0.4413685847589425, "grad_norm": 1.5314428382903325, "learning_rate": 5.915781082049416e-06, "loss": 0.2016, "step": 2838 }, { "epoch": 0.4415241057542768, "grad_norm": 1.4225939057762684, "learning_rate": 5.9133793795959705e-06, "loss": 0.1556, "step": 2839 }, { "epoch": 0.4416796267496112, "grad_norm": 0.9245470256508254, "learning_rate": 5.91097745910613e-06, "loss": 0.2044, "step": 2840 }, { "epoch": 0.4418351477449456, "grad_norm": 1.168875442636591, "learning_rate": 5.908575321153266e-06, "loss": 0.1594, "step": 2841 }, { "epoch": 0.4419906687402799, "grad_norm": 0.9284268233359886, "learning_rate": 5.906172966310803e-06, "loss": 0.1577, "step": 2842 }, { "epoch": 0.4421461897356143, "grad_norm": 1.0601948197007813, "learning_rate": 5.903770395152214e-06, "loss": 0.1676, "step": 2843 }, { "epoch": 0.4423017107309487, "grad_norm": 0.9279991132696253, "learning_rate": 5.901367608251029e-06, "loss": 0.1692, "step": 2844 }, { "epoch": 0.44245723172628304, "grad_norm": 1.1845993112138098, "learning_rate": 5.898964606180828e-06, "loss": 0.1461, "step": 2845 }, { "epoch": 0.44261275272161743, "grad_norm": 1.2661810409848273, "learning_rate": 5.8965613895152375e-06, "loss": 0.1623, "step": 2846 }, { "epoch": 0.44276827371695177, "grad_norm": 1.3581051151634238, "learning_rate": 5.8941579588279396e-06, "loss": 0.1712, "step": 2847 }, { "epoch": 0.44292379471228616, "grad_norm": 1.121505892855375, "learning_rate": 5.891754314692668e-06, "loss": 0.1684, "step": 2848 }, { "epoch": 0.44307931570762055, "grad_norm": 1.1343682518218325, "learning_rate": 5.889350457683205e-06, "loss": 0.1297, "step": 2849 }, { "epoch": 0.4432348367029549, "grad_norm": 0.7705397587261061, "learning_rate": 5.886946388373387e-06, "loss": 0.1072, "step": 2850 }, { "epoch": 0.4433903576982893, "grad_norm": 0.7722747993303517, "learning_rate": 5.884542107337094e-06, "loss": 0.1607, "step": 2851 }, { "epoch": 0.44354587869362366, "grad_norm": 1.1114474551423301, "learning_rate": 5.882137615148268e-06, "loss": 0.1718, "step": 2852 }, { "epoch": 0.443701399688958, "grad_norm": 1.0739072249337023, "learning_rate": 5.879732912380891e-06, "loss": 0.1269, "step": 2853 }, { "epoch": 0.4438569206842924, "grad_norm": 0.8874395689977277, "learning_rate": 5.877327999608998e-06, "loss": 0.2047, "step": 2854 }, { "epoch": 0.4440124416796267, "grad_norm": 0.6280417746051472, "learning_rate": 5.874922877406677e-06, "loss": 0.1253, "step": 2855 }, { "epoch": 0.4441679626749611, "grad_norm": 2.321092813281568, "learning_rate": 5.8725175463480645e-06, "loss": 0.1519, "step": 2856 }, { "epoch": 0.4443234836702955, "grad_norm": 0.9466706318616153, "learning_rate": 5.8701120070073454e-06, "loss": 0.1546, "step": 2857 }, { "epoch": 0.44447900466562984, "grad_norm": 1.2280679980834144, "learning_rate": 5.867706259958757e-06, "loss": 0.1894, "step": 2858 }, { "epoch": 0.44463452566096423, "grad_norm": 0.9488437145574975, "learning_rate": 5.865300305776582e-06, "loss": 0.1449, "step": 2859 }, { "epoch": 0.4447900466562986, "grad_norm": 0.963502334929132, "learning_rate": 5.862894145035158e-06, "loss": 0.12, "step": 2860 }, { "epoch": 0.44494556765163296, "grad_norm": 1.5296324317300785, "learning_rate": 5.86048777830887e-06, "loss": 0.1806, "step": 2861 }, { "epoch": 0.44510108864696735, "grad_norm": 1.721423068084887, "learning_rate": 5.858081206172148e-06, "loss": 0.2251, "step": 2862 }, { "epoch": 0.44525660964230174, "grad_norm": 1.2707536054860695, "learning_rate": 5.855674429199474e-06, "loss": 0.2259, "step": 2863 }, { "epoch": 0.4454121306376361, "grad_norm": 1.4582141081987299, "learning_rate": 5.853267447965384e-06, "loss": 0.261, "step": 2864 }, { "epoch": 0.44556765163297046, "grad_norm": 0.7992400817516426, "learning_rate": 5.850860263044454e-06, "loss": 0.183, "step": 2865 }, { "epoch": 0.4457231726283048, "grad_norm": 1.2173696819640933, "learning_rate": 5.848452875011312e-06, "loss": 0.1505, "step": 2866 }, { "epoch": 0.4458786936236392, "grad_norm": 1.1236080420455812, "learning_rate": 5.846045284440637e-06, "loss": 0.2376, "step": 2867 }, { "epoch": 0.4460342146189736, "grad_norm": 0.8540495870121949, "learning_rate": 5.8436374919071545e-06, "loss": 0.1852, "step": 2868 }, { "epoch": 0.4461897356143079, "grad_norm": 0.9767839680883946, "learning_rate": 5.841229497985639e-06, "loss": 0.1828, "step": 2869 }, { "epoch": 0.4463452566096423, "grad_norm": 1.3291770739575728, "learning_rate": 5.83882130325091e-06, "loss": 0.1708, "step": 2870 }, { "epoch": 0.4465007776049767, "grad_norm": 0.8911793698225078, "learning_rate": 5.8364129082778365e-06, "loss": 0.1196, "step": 2871 }, { "epoch": 0.44665629860031103, "grad_norm": 0.7953929696971068, "learning_rate": 5.83400431364134e-06, "loss": 0.1538, "step": 2872 }, { "epoch": 0.4468118195956454, "grad_norm": 1.1559441707967881, "learning_rate": 5.831595519916382e-06, "loss": 0.1204, "step": 2873 }, { "epoch": 0.44696734059097976, "grad_norm": 1.3626083017323063, "learning_rate": 5.829186527677978e-06, "loss": 0.22, "step": 2874 }, { "epoch": 0.44712286158631415, "grad_norm": 1.0926889554111812, "learning_rate": 5.826777337501184e-06, "loss": 0.2067, "step": 2875 }, { "epoch": 0.44727838258164854, "grad_norm": 0.8466657231561131, "learning_rate": 5.824367949961111e-06, "loss": 0.1612, "step": 2876 }, { "epoch": 0.44743390357698287, "grad_norm": 0.7989157607504066, "learning_rate": 5.821958365632912e-06, "loss": 0.1547, "step": 2877 }, { "epoch": 0.44758942457231726, "grad_norm": 0.8094031945333647, "learning_rate": 5.819548585091789e-06, "loss": 0.0945, "step": 2878 }, { "epoch": 0.44774494556765165, "grad_norm": 0.7958207811760195, "learning_rate": 5.817138608912988e-06, "loss": 0.0959, "step": 2879 }, { "epoch": 0.447900466562986, "grad_norm": 0.8266333205646537, "learning_rate": 5.814728437671808e-06, "loss": 0.1754, "step": 2880 }, { "epoch": 0.4480559875583204, "grad_norm": 0.972550276634945, "learning_rate": 5.8123180719435865e-06, "loss": 0.1777, "step": 2881 }, { "epoch": 0.44821150855365477, "grad_norm": 1.4282703560632453, "learning_rate": 5.809907512303714e-06, "loss": 0.2125, "step": 2882 }, { "epoch": 0.4483670295489891, "grad_norm": 1.3961819843956684, "learning_rate": 5.8074967593276235e-06, "loss": 0.1805, "step": 2883 }, { "epoch": 0.4485225505443235, "grad_norm": 1.3967416200849212, "learning_rate": 5.805085813590795e-06, "loss": 0.1684, "step": 2884 }, { "epoch": 0.44867807153965783, "grad_norm": 0.8170828049973748, "learning_rate": 5.802674675668757e-06, "loss": 0.1907, "step": 2885 }, { "epoch": 0.4488335925349922, "grad_norm": 0.9296133501653576, "learning_rate": 5.8002633461370796e-06, "loss": 0.1419, "step": 2886 }, { "epoch": 0.4489891135303266, "grad_norm": 1.8380074268134765, "learning_rate": 5.797851825571379e-06, "loss": 0.1747, "step": 2887 }, { "epoch": 0.44914463452566095, "grad_norm": 1.015617215668892, "learning_rate": 5.795440114547325e-06, "loss": 0.1683, "step": 2888 }, { "epoch": 0.44930015552099534, "grad_norm": 1.084096727796126, "learning_rate": 5.79302821364062e-06, "loss": 0.1034, "step": 2889 }, { "epoch": 0.4494556765163297, "grad_norm": 1.2319166472887824, "learning_rate": 5.7906161234270234e-06, "loss": 0.2228, "step": 2890 }, { "epoch": 0.44961119751166406, "grad_norm": 0.8506838169450476, "learning_rate": 5.788203844482331e-06, "loss": 0.1398, "step": 2891 }, { "epoch": 0.44976671850699845, "grad_norm": 1.1276990081993483, "learning_rate": 5.785791377382389e-06, "loss": 0.1923, "step": 2892 }, { "epoch": 0.4499222395023328, "grad_norm": 0.8536619968041392, "learning_rate": 5.783378722703089e-06, "loss": 0.2165, "step": 2893 }, { "epoch": 0.4500777604976672, "grad_norm": 1.6447718698418483, "learning_rate": 5.780965881020361e-06, "loss": 0.1833, "step": 2894 }, { "epoch": 0.45023328149300157, "grad_norm": 0.9227010065348715, "learning_rate": 5.7785528529101866e-06, "loss": 0.1726, "step": 2895 }, { "epoch": 0.4503888024883359, "grad_norm": 1.0319400682044817, "learning_rate": 5.77613963894859e-06, "loss": 0.1634, "step": 2896 }, { "epoch": 0.4505443234836703, "grad_norm": 0.9624996871000231, "learning_rate": 5.773726239711637e-06, "loss": 0.1566, "step": 2897 }, { "epoch": 0.4506998444790047, "grad_norm": 0.9665375966806482, "learning_rate": 5.771312655775441e-06, "loss": 0.213, "step": 2898 }, { "epoch": 0.450855365474339, "grad_norm": 1.0472633521697343, "learning_rate": 5.768898887716158e-06, "loss": 0.178, "step": 2899 }, { "epoch": 0.4510108864696734, "grad_norm": 0.9701300513855342, "learning_rate": 5.766484936109987e-06, "loss": 0.1445, "step": 2900 }, { "epoch": 0.4510108864696734, "eval_loss": 0.18043436110019684, "eval_runtime": 9.4294, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 2900 }, { "epoch": 0.4511664074650078, "grad_norm": 1.2900305838475343, "learning_rate": 5.764070801533174e-06, "loss": 0.214, "step": 2901 }, { "epoch": 0.45132192846034214, "grad_norm": 0.9643548247197803, "learning_rate": 5.761656484562005e-06, "loss": 0.0993, "step": 2902 }, { "epoch": 0.4514774494556765, "grad_norm": 0.6880266809778685, "learning_rate": 5.75924198577281e-06, "loss": 0.1412, "step": 2903 }, { "epoch": 0.45163297045101086, "grad_norm": 1.5335177976095686, "learning_rate": 5.756827305741967e-06, "loss": 0.2312, "step": 2904 }, { "epoch": 0.45178849144634525, "grad_norm": 0.7370899531181879, "learning_rate": 5.75441244504589e-06, "loss": 0.1374, "step": 2905 }, { "epoch": 0.45194401244167964, "grad_norm": 1.1385722358234098, "learning_rate": 5.7519974042610425e-06, "loss": 0.1962, "step": 2906 }, { "epoch": 0.452099533437014, "grad_norm": 0.934913713048744, "learning_rate": 5.749582183963928e-06, "loss": 0.195, "step": 2907 }, { "epoch": 0.45225505443234837, "grad_norm": 0.7013540764662275, "learning_rate": 5.7471667847310915e-06, "loss": 0.1577, "step": 2908 }, { "epoch": 0.45241057542768276, "grad_norm": 0.9601841108405198, "learning_rate": 5.744751207139125e-06, "loss": 0.1423, "step": 2909 }, { "epoch": 0.4525660964230171, "grad_norm": 0.8147547221027489, "learning_rate": 5.7423354517646616e-06, "loss": 0.1792, "step": 2910 }, { "epoch": 0.4527216174183515, "grad_norm": 1.5310198643371102, "learning_rate": 5.7399195191843695e-06, "loss": 0.1724, "step": 2911 }, { "epoch": 0.4528771384136858, "grad_norm": 1.3005017129305958, "learning_rate": 5.737503409974973e-06, "loss": 0.2131, "step": 2912 }, { "epoch": 0.4530326594090202, "grad_norm": 0.931545980548968, "learning_rate": 5.735087124713225e-06, "loss": 0.1468, "step": 2913 }, { "epoch": 0.4531881804043546, "grad_norm": 1.4067499689812948, "learning_rate": 5.732670663975931e-06, "loss": 0.2067, "step": 2914 }, { "epoch": 0.45334370139968894, "grad_norm": 0.8038471006508118, "learning_rate": 5.730254028339932e-06, "loss": 0.1257, "step": 2915 }, { "epoch": 0.4534992223950233, "grad_norm": 0.8234548360878575, "learning_rate": 5.7278372183821115e-06, "loss": 0.1317, "step": 2916 }, { "epoch": 0.4536547433903577, "grad_norm": 1.3563766756833842, "learning_rate": 5.725420234679397e-06, "loss": 0.1388, "step": 2917 }, { "epoch": 0.45381026438569205, "grad_norm": 1.4107629731691662, "learning_rate": 5.723003077808759e-06, "loss": 0.2219, "step": 2918 }, { "epoch": 0.45396578538102644, "grad_norm": 1.1349929852897689, "learning_rate": 5.720585748347199e-06, "loss": 0.1564, "step": 2919 }, { "epoch": 0.45412130637636083, "grad_norm": 0.8889192942018806, "learning_rate": 5.718168246871775e-06, "loss": 0.1655, "step": 2920 }, { "epoch": 0.45427682737169517, "grad_norm": 1.2975732559146709, "learning_rate": 5.715750573959575e-06, "loss": 0.1793, "step": 2921 }, { "epoch": 0.45443234836702956, "grad_norm": 1.0899522988114856, "learning_rate": 5.713332730187732e-06, "loss": 0.1869, "step": 2922 }, { "epoch": 0.4545878693623639, "grad_norm": 1.4964439511889236, "learning_rate": 5.710914716133418e-06, "loss": 0.1893, "step": 2923 }, { "epoch": 0.4547433903576983, "grad_norm": 0.8390985837031766, "learning_rate": 5.708496532373846e-06, "loss": 0.1272, "step": 2924 }, { "epoch": 0.4548989113530327, "grad_norm": 0.8763457379966617, "learning_rate": 5.706078179486273e-06, "loss": 0.1326, "step": 2925 }, { "epoch": 0.455054432348367, "grad_norm": 1.0296574472025, "learning_rate": 5.703659658047992e-06, "loss": 0.1817, "step": 2926 }, { "epoch": 0.4552099533437014, "grad_norm": 0.9540895399062591, "learning_rate": 5.701240968636335e-06, "loss": 0.176, "step": 2927 }, { "epoch": 0.4553654743390358, "grad_norm": 1.3109789795952564, "learning_rate": 5.698822111828683e-06, "loss": 0.1827, "step": 2928 }, { "epoch": 0.4555209953343701, "grad_norm": 0.9898270193501792, "learning_rate": 5.696403088202447e-06, "loss": 0.1283, "step": 2929 }, { "epoch": 0.4556765163297045, "grad_norm": 0.7970609247877734, "learning_rate": 5.69398389833508e-06, "loss": 0.1554, "step": 2930 }, { "epoch": 0.4558320373250389, "grad_norm": 1.0587572673461583, "learning_rate": 5.69156454280408e-06, "loss": 0.0949, "step": 2931 }, { "epoch": 0.45598755832037324, "grad_norm": 1.243116668405912, "learning_rate": 5.6891450221869795e-06, "loss": 0.1373, "step": 2932 }, { "epoch": 0.45614307931570763, "grad_norm": 0.8127104321653069, "learning_rate": 5.6867253370613515e-06, "loss": 0.1279, "step": 2933 }, { "epoch": 0.45629860031104197, "grad_norm": 1.139577907063286, "learning_rate": 5.68430548800481e-06, "loss": 0.1638, "step": 2934 }, { "epoch": 0.45645412130637636, "grad_norm": 1.1406110297822638, "learning_rate": 5.681885475595002e-06, "loss": 0.1228, "step": 2935 }, { "epoch": 0.45660964230171075, "grad_norm": 0.8655781409420343, "learning_rate": 5.679465300409625e-06, "loss": 0.1104, "step": 2936 }, { "epoch": 0.4567651632970451, "grad_norm": 1.0872213246586482, "learning_rate": 5.677044963026406e-06, "loss": 0.1364, "step": 2937 }, { "epoch": 0.4569206842923795, "grad_norm": 1.104743977261722, "learning_rate": 5.674624464023111e-06, "loss": 0.1376, "step": 2938 }, { "epoch": 0.45707620528771387, "grad_norm": 1.2209737773955285, "learning_rate": 5.672203803977547e-06, "loss": 0.1574, "step": 2939 }, { "epoch": 0.4572317262830482, "grad_norm": 1.385168172013817, "learning_rate": 5.669782983467562e-06, "loss": 0.1942, "step": 2940 }, { "epoch": 0.4573872472783826, "grad_norm": 1.023629213338247, "learning_rate": 5.6673620030710394e-06, "loss": 0.1543, "step": 2941 }, { "epoch": 0.4575427682737169, "grad_norm": 0.9487646843128547, "learning_rate": 5.6649408633659e-06, "loss": 0.1008, "step": 2942 }, { "epoch": 0.4576982892690513, "grad_norm": 0.7279759732800104, "learning_rate": 5.662519564930102e-06, "loss": 0.1079, "step": 2943 }, { "epoch": 0.4578538102643857, "grad_norm": 1.1653906045237807, "learning_rate": 5.660098108341645e-06, "loss": 0.1462, "step": 2944 }, { "epoch": 0.45800933125972004, "grad_norm": 1.3943655512297544, "learning_rate": 5.657676494178565e-06, "loss": 0.2395, "step": 2945 }, { "epoch": 0.45816485225505443, "grad_norm": 0.9992551890042024, "learning_rate": 5.655254723018935e-06, "loss": 0.1785, "step": 2946 }, { "epoch": 0.4583203732503888, "grad_norm": 0.8046790893530553, "learning_rate": 5.652832795440864e-06, "loss": 0.2063, "step": 2947 }, { "epoch": 0.45847589424572316, "grad_norm": 1.0188095368013486, "learning_rate": 5.6504107120225e-06, "loss": 0.175, "step": 2948 }, { "epoch": 0.45863141524105755, "grad_norm": 1.2414221425644305, "learning_rate": 5.6479884733420285e-06, "loss": 0.26, "step": 2949 }, { "epoch": 0.45878693623639194, "grad_norm": 1.956967303938825, "learning_rate": 5.645566079977673e-06, "loss": 0.1932, "step": 2950 }, { "epoch": 0.4589424572317263, "grad_norm": 1.257951692828672, "learning_rate": 5.64314353250769e-06, "loss": 0.1767, "step": 2951 }, { "epoch": 0.45909797822706067, "grad_norm": 1.0706521816521888, "learning_rate": 5.640720831510378e-06, "loss": 0.1031, "step": 2952 }, { "epoch": 0.459253499222395, "grad_norm": 1.3002734940588991, "learning_rate": 5.638297977564069e-06, "loss": 0.2175, "step": 2953 }, { "epoch": 0.4594090202177294, "grad_norm": 1.029093279115583, "learning_rate": 5.635874971247131e-06, "loss": 0.1529, "step": 2954 }, { "epoch": 0.4595645412130638, "grad_norm": 1.3405248207625982, "learning_rate": 5.633451813137969e-06, "loss": 0.1433, "step": 2955 }, { "epoch": 0.4597200622083981, "grad_norm": 0.967859940725995, "learning_rate": 5.631028503815026e-06, "loss": 0.2348, "step": 2956 }, { "epoch": 0.4598755832037325, "grad_norm": 0.8406362620059038, "learning_rate": 5.6286050438567785e-06, "loss": 0.1773, "step": 2957 }, { "epoch": 0.4600311041990669, "grad_norm": 0.9195035136133698, "learning_rate": 5.626181433841741e-06, "loss": 0.163, "step": 2958 }, { "epoch": 0.46018662519440123, "grad_norm": 0.573807239943094, "learning_rate": 5.623757674348462e-06, "loss": 0.09, "step": 2959 }, { "epoch": 0.4603421461897356, "grad_norm": 1.266679315415924, "learning_rate": 5.621333765955529e-06, "loss": 0.2472, "step": 2960 }, { "epoch": 0.46049766718506996, "grad_norm": 0.9431725369294948, "learning_rate": 5.618909709241562e-06, "loss": 0.1312, "step": 2961 }, { "epoch": 0.46065318818040435, "grad_norm": 1.65489951941891, "learning_rate": 5.616485504785215e-06, "loss": 0.1976, "step": 2962 }, { "epoch": 0.46080870917573874, "grad_norm": 0.9375287367835167, "learning_rate": 5.614061153165181e-06, "loss": 0.2043, "step": 2963 }, { "epoch": 0.4609642301710731, "grad_norm": 1.198729950243542, "learning_rate": 5.611636654960186e-06, "loss": 0.2229, "step": 2964 }, { "epoch": 0.46111975116640747, "grad_norm": 0.8192132011238209, "learning_rate": 5.609212010748992e-06, "loss": 0.149, "step": 2965 }, { "epoch": 0.46127527216174186, "grad_norm": 0.915133083121473, "learning_rate": 5.606787221110396e-06, "loss": 0.1414, "step": 2966 }, { "epoch": 0.4614307931570762, "grad_norm": 1.0377567710495836, "learning_rate": 5.60436228662323e-06, "loss": 0.1413, "step": 2967 }, { "epoch": 0.4615863141524106, "grad_norm": 0.8854081300200816, "learning_rate": 5.601937207866353e-06, "loss": 0.1066, "step": 2968 }, { "epoch": 0.461741835147745, "grad_norm": 1.3472121554196015, "learning_rate": 5.599511985418674e-06, "loss": 0.228, "step": 2969 }, { "epoch": 0.4618973561430793, "grad_norm": 1.0161857386245492, "learning_rate": 5.5970866198591235e-06, "loss": 0.2455, "step": 2970 }, { "epoch": 0.4620528771384137, "grad_norm": 1.0283889641715582, "learning_rate": 5.594661111766669e-06, "loss": 0.2147, "step": 2971 }, { "epoch": 0.46220839813374803, "grad_norm": 0.9959046363416548, "learning_rate": 5.592235461720315e-06, "loss": 0.1507, "step": 2972 }, { "epoch": 0.4623639191290824, "grad_norm": 1.0081946931150914, "learning_rate": 5.5898096702990975e-06, "loss": 0.17, "step": 2973 }, { "epoch": 0.4625194401244168, "grad_norm": 1.0888584331197622, "learning_rate": 5.587383738082086e-06, "loss": 0.2032, "step": 2974 }, { "epoch": 0.46267496111975115, "grad_norm": 1.5091818333041744, "learning_rate": 5.584957665648385e-06, "loss": 0.216, "step": 2975 }, { "epoch": 0.46283048211508554, "grad_norm": 1.015232186634148, "learning_rate": 5.582531453577131e-06, "loss": 0.19, "step": 2976 }, { "epoch": 0.46298600311041993, "grad_norm": 1.0831972635065807, "learning_rate": 5.580105102447496e-06, "loss": 0.1784, "step": 2977 }, { "epoch": 0.46314152410575427, "grad_norm": 1.1987715613337797, "learning_rate": 5.577678612838684e-06, "loss": 0.1653, "step": 2978 }, { "epoch": 0.46329704510108866, "grad_norm": 0.8926139395698894, "learning_rate": 5.57525198532993e-06, "loss": 0.1704, "step": 2979 }, { "epoch": 0.463452566096423, "grad_norm": 1.0714779425863559, "learning_rate": 5.572825220500505e-06, "loss": 0.2035, "step": 2980 }, { "epoch": 0.4636080870917574, "grad_norm": 1.1767701167353064, "learning_rate": 5.570398318929712e-06, "loss": 0.173, "step": 2981 }, { "epoch": 0.46376360808709177, "grad_norm": 1.564824922849072, "learning_rate": 5.567971281196885e-06, "loss": 0.1902, "step": 2982 }, { "epoch": 0.4639191290824261, "grad_norm": 1.012053022138034, "learning_rate": 5.565544107881394e-06, "loss": 0.1953, "step": 2983 }, { "epoch": 0.4640746500777605, "grad_norm": 1.2207483571090558, "learning_rate": 5.5631167995626355e-06, "loss": 0.1635, "step": 2984 }, { "epoch": 0.4642301710730949, "grad_norm": 1.3081377386067845, "learning_rate": 5.560689356820045e-06, "loss": 0.2208, "step": 2985 }, { "epoch": 0.4643856920684292, "grad_norm": 1.2893196092092747, "learning_rate": 5.558261780233087e-06, "loss": 0.2713, "step": 2986 }, { "epoch": 0.4645412130637636, "grad_norm": 0.9823202062369777, "learning_rate": 5.555834070381257e-06, "loss": 0.1362, "step": 2987 }, { "epoch": 0.464696734059098, "grad_norm": 0.9603031745756975, "learning_rate": 5.55340622784408e-06, "loss": 0.1686, "step": 2988 }, { "epoch": 0.46485225505443234, "grad_norm": 1.2879671106326651, "learning_rate": 5.5509782532011225e-06, "loss": 0.127, "step": 2989 }, { "epoch": 0.46500777604976673, "grad_norm": 1.0330396681687495, "learning_rate": 5.548550147031971e-06, "loss": 0.246, "step": 2990 }, { "epoch": 0.46516329704510107, "grad_norm": 1.1311776844405803, "learning_rate": 5.546121909916249e-06, "loss": 0.1492, "step": 2991 }, { "epoch": 0.46531881804043546, "grad_norm": 1.111869813103286, "learning_rate": 5.54369354243361e-06, "loss": 0.1548, "step": 2992 }, { "epoch": 0.46547433903576985, "grad_norm": 0.984356772171275, "learning_rate": 5.541265045163743e-06, "loss": 0.2079, "step": 2993 }, { "epoch": 0.4656298600311042, "grad_norm": 0.9182537896646621, "learning_rate": 5.538836418686361e-06, "loss": 0.1496, "step": 2994 }, { "epoch": 0.46578538102643857, "grad_norm": 1.4083305113292814, "learning_rate": 5.53640766358121e-06, "loss": 0.1971, "step": 2995 }, { "epoch": 0.46594090202177296, "grad_norm": 0.867924307820186, "learning_rate": 5.53397878042807e-06, "loss": 0.1591, "step": 2996 }, { "epoch": 0.4660964230171073, "grad_norm": 0.8023849407926169, "learning_rate": 5.531549769806749e-06, "loss": 0.1655, "step": 2997 }, { "epoch": 0.4662519440124417, "grad_norm": 0.9362505959263914, "learning_rate": 5.5291206322970845e-06, "loss": 0.2344, "step": 2998 }, { "epoch": 0.466407465007776, "grad_norm": 1.0871146048308495, "learning_rate": 5.526691368478948e-06, "loss": 0.1712, "step": 2999 }, { "epoch": 0.4665629860031104, "grad_norm": 0.922739833598575, "learning_rate": 5.524261978932234e-06, "loss": 0.1717, "step": 3000 }, { "epoch": 0.4665629860031104, "eval_loss": 0.17966650426387787, "eval_runtime": 9.4046, "eval_samples_per_second": 2.765, "eval_steps_per_second": 0.744, "step": 3000 }, { "epoch": 0.4667185069984448, "grad_norm": 0.9588651866259083, "learning_rate": 5.5218324642368756e-06, "loss": 0.1756, "step": 3001 }, { "epoch": 0.46687402799377914, "grad_norm": 0.9358116544339568, "learning_rate": 5.519402824972833e-06, "loss": 0.1652, "step": 3002 }, { "epoch": 0.46702954898911353, "grad_norm": 1.400816378541592, "learning_rate": 5.51697306172009e-06, "loss": 0.2275, "step": 3003 }, { "epoch": 0.4671850699844479, "grad_norm": 1.1023717827007156, "learning_rate": 5.514543175058668e-06, "loss": 0.1994, "step": 3004 }, { "epoch": 0.46734059097978226, "grad_norm": 0.89410038619993, "learning_rate": 5.512113165568615e-06, "loss": 0.1277, "step": 3005 }, { "epoch": 0.46749611197511665, "grad_norm": 0.9407398942686858, "learning_rate": 5.509683033830009e-06, "loss": 0.1524, "step": 3006 }, { "epoch": 0.46765163297045104, "grad_norm": 1.272696013922867, "learning_rate": 5.507252780422954e-06, "loss": 0.2258, "step": 3007 }, { "epoch": 0.46780715396578537, "grad_norm": 1.3811890378666423, "learning_rate": 5.504822405927586e-06, "loss": 0.2132, "step": 3008 }, { "epoch": 0.46796267496111976, "grad_norm": 0.7797998281642341, "learning_rate": 5.5023919109240694e-06, "loss": 0.2039, "step": 3009 }, { "epoch": 0.4681181959564541, "grad_norm": 1.1737632339070458, "learning_rate": 5.4999612959925995e-06, "loss": 0.2243, "step": 3010 }, { "epoch": 0.4682737169517885, "grad_norm": 1.304873814247508, "learning_rate": 5.4975305617133945e-06, "loss": 0.178, "step": 3011 }, { "epoch": 0.4684292379471229, "grad_norm": 1.040719423771746, "learning_rate": 5.495099708666706e-06, "loss": 0.1632, "step": 3012 }, { "epoch": 0.4685847589424572, "grad_norm": 1.4265888173544476, "learning_rate": 5.492668737432814e-06, "loss": 0.1587, "step": 3013 }, { "epoch": 0.4687402799377916, "grad_norm": 1.2552780049955896, "learning_rate": 5.490237648592022e-06, "loss": 0.2278, "step": 3014 }, { "epoch": 0.468895800933126, "grad_norm": 1.1129672452041854, "learning_rate": 5.487806442724668e-06, "loss": 0.1276, "step": 3015 }, { "epoch": 0.46905132192846033, "grad_norm": 0.9547377056101426, "learning_rate": 5.485375120411113e-06, "loss": 0.202, "step": 3016 }, { "epoch": 0.4692068429237947, "grad_norm": 0.9634875188872221, "learning_rate": 5.4829436822317485e-06, "loss": 0.1232, "step": 3017 }, { "epoch": 0.46936236391912906, "grad_norm": 0.996917640777059, "learning_rate": 5.480512128766992e-06, "loss": 0.1596, "step": 3018 }, { "epoch": 0.46951788491446345, "grad_norm": 1.051241552325428, "learning_rate": 5.4780804605972895e-06, "loss": 0.1676, "step": 3019 }, { "epoch": 0.46967340590979784, "grad_norm": 0.9250818509591997, "learning_rate": 5.475648678303112e-06, "loss": 0.1852, "step": 3020 }, { "epoch": 0.46982892690513217, "grad_norm": 0.8290853444366253, "learning_rate": 5.473216782464964e-06, "loss": 0.1476, "step": 3021 }, { "epoch": 0.46998444790046656, "grad_norm": 1.0788648303676924, "learning_rate": 5.47078477366337e-06, "loss": 0.1863, "step": 3022 }, { "epoch": 0.47013996889580095, "grad_norm": 0.8422972294489973, "learning_rate": 5.468352652478885e-06, "loss": 0.1431, "step": 3023 }, { "epoch": 0.4702954898911353, "grad_norm": 1.0635412912654953, "learning_rate": 5.4659204194920915e-06, "loss": 0.1104, "step": 3024 }, { "epoch": 0.4704510108864697, "grad_norm": 1.017127394487587, "learning_rate": 5.4634880752835954e-06, "loss": 0.1329, "step": 3025 }, { "epoch": 0.47060653188180407, "grad_norm": 0.8816488051501804, "learning_rate": 5.461055620434033e-06, "loss": 0.1664, "step": 3026 }, { "epoch": 0.4707620528771384, "grad_norm": 1.1538446669881461, "learning_rate": 5.4586230555240635e-06, "loss": 0.1729, "step": 3027 }, { "epoch": 0.4709175738724728, "grad_norm": 0.7517992702097397, "learning_rate": 5.456190381134374e-06, "loss": 0.1357, "step": 3028 }, { "epoch": 0.47107309486780713, "grad_norm": 0.8637667094136556, "learning_rate": 5.4537575978456815e-06, "loss": 0.2124, "step": 3029 }, { "epoch": 0.4712286158631415, "grad_norm": 1.3674407697686304, "learning_rate": 5.451324706238721e-06, "loss": 0.1218, "step": 3030 }, { "epoch": 0.4713841368584759, "grad_norm": 1.0226215269793277, "learning_rate": 5.448891706894259e-06, "loss": 0.1852, "step": 3031 }, { "epoch": 0.47153965785381025, "grad_norm": 1.2093792604850722, "learning_rate": 5.446458600393086e-06, "loss": 0.1188, "step": 3032 }, { "epoch": 0.47169517884914464, "grad_norm": 1.1962698764251234, "learning_rate": 5.44402538731602e-06, "loss": 0.1138, "step": 3033 }, { "epoch": 0.471850699844479, "grad_norm": 1.27023131385316, "learning_rate": 5.441592068243902e-06, "loss": 0.1736, "step": 3034 }, { "epoch": 0.47200622083981336, "grad_norm": 1.1205258875043107, "learning_rate": 5.439158643757599e-06, "loss": 0.1737, "step": 3035 }, { "epoch": 0.47216174183514775, "grad_norm": 1.6329772466225074, "learning_rate": 5.436725114438002e-06, "loss": 0.1653, "step": 3036 }, { "epoch": 0.4723172628304821, "grad_norm": 1.410714860078429, "learning_rate": 5.434291480866032e-06, "loss": 0.1647, "step": 3037 }, { "epoch": 0.4724727838258165, "grad_norm": 1.0899529731834223, "learning_rate": 5.431857743622629e-06, "loss": 0.2041, "step": 3038 }, { "epoch": 0.47262830482115087, "grad_norm": 1.4783540667253363, "learning_rate": 5.429423903288759e-06, "loss": 0.1789, "step": 3039 }, { "epoch": 0.4727838258164852, "grad_norm": 1.0193701821951653, "learning_rate": 5.426989960445415e-06, "loss": 0.1252, "step": 3040 }, { "epoch": 0.4729393468118196, "grad_norm": 0.8781121916908616, "learning_rate": 5.424555915673613e-06, "loss": 0.1916, "step": 3041 }, { "epoch": 0.473094867807154, "grad_norm": 1.0555837502542007, "learning_rate": 5.422121769554393e-06, "loss": 0.1442, "step": 3042 }, { "epoch": 0.4732503888024883, "grad_norm": 0.9680563834176071, "learning_rate": 5.419687522668821e-06, "loss": 0.1561, "step": 3043 }, { "epoch": 0.4734059097978227, "grad_norm": 1.1433077659391624, "learning_rate": 5.417253175597981e-06, "loss": 0.1909, "step": 3044 }, { "epoch": 0.4735614307931571, "grad_norm": 1.6226960752427693, "learning_rate": 5.414818728922993e-06, "loss": 0.1812, "step": 3045 }, { "epoch": 0.47371695178849144, "grad_norm": 1.5238472174642002, "learning_rate": 5.412384183224989e-06, "loss": 0.1634, "step": 3046 }, { "epoch": 0.4738724727838258, "grad_norm": 1.3110344558573224, "learning_rate": 5.409949539085128e-06, "loss": 0.1693, "step": 3047 }, { "epoch": 0.47402799377916016, "grad_norm": 0.9414912888855911, "learning_rate": 5.407514797084596e-06, "loss": 0.1687, "step": 3048 }, { "epoch": 0.47418351477449455, "grad_norm": 0.8820619059321233, "learning_rate": 5.4050799578046e-06, "loss": 0.129, "step": 3049 }, { "epoch": 0.47433903576982894, "grad_norm": 1.039084905453534, "learning_rate": 5.402645021826367e-06, "loss": 0.1749, "step": 3050 }, { "epoch": 0.4744945567651633, "grad_norm": 0.8641686904450486, "learning_rate": 5.400209989731155e-06, "loss": 0.132, "step": 3051 }, { "epoch": 0.47465007776049767, "grad_norm": 0.9340746318822944, "learning_rate": 5.3977748621002335e-06, "loss": 0.1497, "step": 3052 }, { "epoch": 0.47480559875583206, "grad_norm": 0.7019027941114268, "learning_rate": 5.395339639514907e-06, "loss": 0.1035, "step": 3053 }, { "epoch": 0.4749611197511664, "grad_norm": 0.9068942757043313, "learning_rate": 5.392904322556497e-06, "loss": 0.1677, "step": 3054 }, { "epoch": 0.4751166407465008, "grad_norm": 0.9896258610220877, "learning_rate": 5.390468911806345e-06, "loss": 0.1054, "step": 3055 }, { "epoch": 0.4752721617418352, "grad_norm": 1.327990427018126, "learning_rate": 5.388033407845817e-06, "loss": 0.1904, "step": 3056 }, { "epoch": 0.4754276827371695, "grad_norm": 1.259660703116942, "learning_rate": 5.385597811256305e-06, "loss": 0.1628, "step": 3057 }, { "epoch": 0.4755832037325039, "grad_norm": 1.5606340274756678, "learning_rate": 5.3831621226192175e-06, "loss": 0.1421, "step": 3058 }, { "epoch": 0.47573872472783824, "grad_norm": 0.8178318938000314, "learning_rate": 5.380726342515988e-06, "loss": 0.0879, "step": 3059 }, { "epoch": 0.4758942457231726, "grad_norm": 1.291335553326262, "learning_rate": 5.3782904715280705e-06, "loss": 0.1293, "step": 3060 }, { "epoch": 0.476049766718507, "grad_norm": 1.3113443349218536, "learning_rate": 5.375854510236942e-06, "loss": 0.1441, "step": 3061 }, { "epoch": 0.47620528771384135, "grad_norm": 1.3521075422350242, "learning_rate": 5.373418459224102e-06, "loss": 0.1805, "step": 3062 }, { "epoch": 0.47636080870917574, "grad_norm": 1.4476591821306468, "learning_rate": 5.370982319071068e-06, "loss": 0.1705, "step": 3063 }, { "epoch": 0.47651632970451013, "grad_norm": 1.1667411083497297, "learning_rate": 5.36854609035938e-06, "loss": 0.1908, "step": 3064 }, { "epoch": 0.47667185069984447, "grad_norm": 1.2963930617293362, "learning_rate": 5.3661097736706e-06, "loss": 0.2251, "step": 3065 }, { "epoch": 0.47682737169517886, "grad_norm": 1.6003909155090952, "learning_rate": 5.363673369586312e-06, "loss": 0.1472, "step": 3066 }, { "epoch": 0.4769828926905132, "grad_norm": 1.4499482563924861, "learning_rate": 5.361236878688121e-06, "loss": 0.2279, "step": 3067 }, { "epoch": 0.4771384136858476, "grad_norm": 0.9024660751198853, "learning_rate": 5.358800301557646e-06, "loss": 0.1276, "step": 3068 }, { "epoch": 0.477293934681182, "grad_norm": 0.7123768414802126, "learning_rate": 5.3563636387765375e-06, "loss": 0.1132, "step": 3069 }, { "epoch": 0.4774494556765163, "grad_norm": 1.2495378811273787, "learning_rate": 5.35392689092646e-06, "loss": 0.1655, "step": 3070 }, { "epoch": 0.4776049766718507, "grad_norm": 0.9300871231548139, "learning_rate": 5.351490058589095e-06, "loss": 0.161, "step": 3071 }, { "epoch": 0.4777604976671851, "grad_norm": 1.3485451387566407, "learning_rate": 5.349053142346153e-06, "loss": 0.1158, "step": 3072 }, { "epoch": 0.4779160186625194, "grad_norm": 0.913590506783622, "learning_rate": 5.346616142779358e-06, "loss": 0.1613, "step": 3073 }, { "epoch": 0.4780715396578538, "grad_norm": 1.2986286578620267, "learning_rate": 5.344179060470456e-06, "loss": 0.1961, "step": 3074 }, { "epoch": 0.4782270606531882, "grad_norm": 1.2407211371253672, "learning_rate": 5.341741896001212e-06, "loss": 0.1935, "step": 3075 }, { "epoch": 0.47838258164852254, "grad_norm": 1.3247600869556069, "learning_rate": 5.339304649953412e-06, "loss": 0.1981, "step": 3076 }, { "epoch": 0.47853810264385693, "grad_norm": 0.9040352832929329, "learning_rate": 5.33686732290886e-06, "loss": 0.1467, "step": 3077 }, { "epoch": 0.47869362363919127, "grad_norm": 1.3320335943694792, "learning_rate": 5.334429915449382e-06, "loss": 0.2098, "step": 3078 }, { "epoch": 0.47884914463452566, "grad_norm": 0.8566374409331093, "learning_rate": 5.331992428156817e-06, "loss": 0.1704, "step": 3079 }, { "epoch": 0.47900466562986005, "grad_norm": 0.9812058143372876, "learning_rate": 5.329554861613031e-06, "loss": 0.2109, "step": 3080 }, { "epoch": 0.4791601866251944, "grad_norm": 1.0228495570753828, "learning_rate": 5.327117216399903e-06, "loss": 0.2011, "step": 3081 }, { "epoch": 0.4793157076205288, "grad_norm": 1.315868632792085, "learning_rate": 5.324679493099334e-06, "loss": 0.1316, "step": 3082 }, { "epoch": 0.47947122861586317, "grad_norm": 2.2740261041755905, "learning_rate": 5.322241692293242e-06, "loss": 0.138, "step": 3083 }, { "epoch": 0.4796267496111975, "grad_norm": 1.4587087373680214, "learning_rate": 5.319803814563565e-06, "loss": 0.1971, "step": 3084 }, { "epoch": 0.4797822706065319, "grad_norm": 2.095518815746736, "learning_rate": 5.317365860492256e-06, "loss": 0.1554, "step": 3085 }, { "epoch": 0.4799377916018662, "grad_norm": 1.204678829911481, "learning_rate": 5.314927830661293e-06, "loss": 0.1665, "step": 3086 }, { "epoch": 0.4800933125972006, "grad_norm": 1.4418550659505385, "learning_rate": 5.312489725652662e-06, "loss": 0.2122, "step": 3087 }, { "epoch": 0.480248833592535, "grad_norm": 0.8136316150913194, "learning_rate": 5.310051546048377e-06, "loss": 0.1424, "step": 3088 }, { "epoch": 0.48040435458786934, "grad_norm": 0.9852417096728027, "learning_rate": 5.307613292430465e-06, "loss": 0.1003, "step": 3089 }, { "epoch": 0.48055987558320373, "grad_norm": 1.1222990527462533, "learning_rate": 5.3051749653809685e-06, "loss": 0.1812, "step": 3090 }, { "epoch": 0.4807153965785381, "grad_norm": 1.0373183756116142, "learning_rate": 5.302736565481953e-06, "loss": 0.1423, "step": 3091 }, { "epoch": 0.48087091757387246, "grad_norm": 1.324759578661333, "learning_rate": 5.3002980933154965e-06, "loss": 0.1376, "step": 3092 }, { "epoch": 0.48102643856920685, "grad_norm": 0.9770183908314843, "learning_rate": 5.297859549463698e-06, "loss": 0.2025, "step": 3093 }, { "epoch": 0.48118195956454124, "grad_norm": 0.9654108767528797, "learning_rate": 5.295420934508671e-06, "loss": 0.1144, "step": 3094 }, { "epoch": 0.4813374805598756, "grad_norm": 1.3796291375829575, "learning_rate": 5.292982249032549e-06, "loss": 0.2264, "step": 3095 }, { "epoch": 0.48149300155520997, "grad_norm": 1.701119558258769, "learning_rate": 5.290543493617477e-06, "loss": 0.1883, "step": 3096 }, { "epoch": 0.4816485225505443, "grad_norm": 1.028181011357914, "learning_rate": 5.288104668845622e-06, "loss": 0.0827, "step": 3097 }, { "epoch": 0.4818040435458787, "grad_norm": 0.8508209433559467, "learning_rate": 5.285665775299164e-06, "loss": 0.1664, "step": 3098 }, { "epoch": 0.4819595645412131, "grad_norm": 1.0901734334162163, "learning_rate": 5.283226813560301e-06, "loss": 0.1333, "step": 3099 }, { "epoch": 0.4821150855365474, "grad_norm": 0.8044564361885318, "learning_rate": 5.2807877842112475e-06, "loss": 0.1591, "step": 3100 }, { "epoch": 0.4821150855365474, "eval_loss": 0.1795499622821808, "eval_runtime": 9.4158, "eval_samples_per_second": 2.761, "eval_steps_per_second": 0.743, "step": 3100 }, { "epoch": 0.4822706065318818, "grad_norm": 1.2041703341026146, "learning_rate": 5.278348687834236e-06, "loss": 0.1987, "step": 3101 }, { "epoch": 0.4824261275272162, "grad_norm": 0.9858057220040491, "learning_rate": 5.27590952501151e-06, "loss": 0.1993, "step": 3102 }, { "epoch": 0.48258164852255053, "grad_norm": 1.4176710741125174, "learning_rate": 5.273470296325334e-06, "loss": 0.2067, "step": 3103 }, { "epoch": 0.4827371695178849, "grad_norm": 0.7262957935135647, "learning_rate": 5.271031002357984e-06, "loss": 0.1331, "step": 3104 }, { "epoch": 0.48289269051321926, "grad_norm": 0.9439591765571638, "learning_rate": 5.2685916436917524e-06, "loss": 0.1817, "step": 3105 }, { "epoch": 0.48304821150855365, "grad_norm": 1.0679037429778249, "learning_rate": 5.2661522209089515e-06, "loss": 0.1494, "step": 3106 }, { "epoch": 0.48320373250388804, "grad_norm": 1.5317776188430958, "learning_rate": 5.263712734591902e-06, "loss": 0.2384, "step": 3107 }, { "epoch": 0.4833592534992224, "grad_norm": 1.1696711463667038, "learning_rate": 5.261273185322947e-06, "loss": 0.2137, "step": 3108 }, { "epoch": 0.48351477449455676, "grad_norm": 1.2466362645206497, "learning_rate": 5.258833573684438e-06, "loss": 0.1399, "step": 3109 }, { "epoch": 0.48367029548989116, "grad_norm": 1.100303932070022, "learning_rate": 5.256393900258747e-06, "loss": 0.1914, "step": 3110 }, { "epoch": 0.4838258164852255, "grad_norm": 1.4327405643242523, "learning_rate": 5.253954165628257e-06, "loss": 0.239, "step": 3111 }, { "epoch": 0.4839813374805599, "grad_norm": 0.7823106947817152, "learning_rate": 5.251514370375363e-06, "loss": 0.1354, "step": 3112 }, { "epoch": 0.48413685847589427, "grad_norm": 0.7007925075849528, "learning_rate": 5.249074515082483e-06, "loss": 0.1326, "step": 3113 }, { "epoch": 0.4842923794712286, "grad_norm": 0.979987424057645, "learning_rate": 5.246634600332044e-06, "loss": 0.1138, "step": 3114 }, { "epoch": 0.484447900466563, "grad_norm": 1.338870897771577, "learning_rate": 5.244194626706485e-06, "loss": 0.2461, "step": 3115 }, { "epoch": 0.48460342146189733, "grad_norm": 1.2338953091877114, "learning_rate": 5.241754594788262e-06, "loss": 0.1591, "step": 3116 }, { "epoch": 0.4847589424572317, "grad_norm": 0.8998324651082069, "learning_rate": 5.239314505159847e-06, "loss": 0.1086, "step": 3117 }, { "epoch": 0.4849144634525661, "grad_norm": 1.262653444818707, "learning_rate": 5.236874358403723e-06, "loss": 0.888, "step": 3118 }, { "epoch": 0.48506998444790045, "grad_norm": 0.9161335290651341, "learning_rate": 5.234434155102387e-06, "loss": 0.1866, "step": 3119 }, { "epoch": 0.48522550544323484, "grad_norm": 0.9435184558531327, "learning_rate": 5.231993895838348e-06, "loss": 0.1173, "step": 3120 }, { "epoch": 0.48538102643856923, "grad_norm": 1.15039793341985, "learning_rate": 5.229553581194129e-06, "loss": 0.1731, "step": 3121 }, { "epoch": 0.48553654743390356, "grad_norm": 1.226210260961498, "learning_rate": 5.22711321175227e-06, "loss": 0.1419, "step": 3122 }, { "epoch": 0.48569206842923796, "grad_norm": 1.3254021317730726, "learning_rate": 5.224672788095322e-06, "loss": 0.1508, "step": 3123 }, { "epoch": 0.4858475894245723, "grad_norm": 1.4124339248798146, "learning_rate": 5.222232310805844e-06, "loss": 0.132, "step": 3124 }, { "epoch": 0.4860031104199067, "grad_norm": 1.1548472668095506, "learning_rate": 5.219791780466416e-06, "loss": 0.2142, "step": 3125 }, { "epoch": 0.48615863141524107, "grad_norm": 0.9220933363833346, "learning_rate": 5.217351197659623e-06, "loss": 0.1383, "step": 3126 }, { "epoch": 0.4863141524105754, "grad_norm": 0.9984561820561947, "learning_rate": 5.2149105629680695e-06, "loss": 0.1648, "step": 3127 }, { "epoch": 0.4864696734059098, "grad_norm": 1.303567543042762, "learning_rate": 5.2124698769743675e-06, "loss": 0.153, "step": 3128 }, { "epoch": 0.4866251944012442, "grad_norm": 1.0543764664057818, "learning_rate": 5.2100291402611414e-06, "loss": 0.1504, "step": 3129 }, { "epoch": 0.4867807153965785, "grad_norm": 0.7152027414459398, "learning_rate": 5.207588353411032e-06, "loss": 0.1126, "step": 3130 }, { "epoch": 0.4869362363919129, "grad_norm": 1.1318415975100828, "learning_rate": 5.205147517006688e-06, "loss": 0.1804, "step": 3131 }, { "epoch": 0.4870917573872473, "grad_norm": 1.5726200714958825, "learning_rate": 5.2027066316307695e-06, "loss": 0.1825, "step": 3132 }, { "epoch": 0.48724727838258164, "grad_norm": 1.2129387055222793, "learning_rate": 5.200265697865953e-06, "loss": 0.2034, "step": 3133 }, { "epoch": 0.48740279937791603, "grad_norm": 0.960792401414764, "learning_rate": 5.19782471629492e-06, "loss": 0.1236, "step": 3134 }, { "epoch": 0.48755832037325036, "grad_norm": 0.804317553822898, "learning_rate": 5.1953836875003695e-06, "loss": 0.1144, "step": 3135 }, { "epoch": 0.48771384136858476, "grad_norm": 0.680724128046669, "learning_rate": 5.192942612065007e-06, "loss": 0.131, "step": 3136 }, { "epoch": 0.48786936236391915, "grad_norm": 0.8744519119198428, "learning_rate": 5.190501490571552e-06, "loss": 0.161, "step": 3137 }, { "epoch": 0.4880248833592535, "grad_norm": 0.9087812346785238, "learning_rate": 5.188060323602737e-06, "loss": 0.2251, "step": 3138 }, { "epoch": 0.48818040435458787, "grad_norm": 1.0142636227165327, "learning_rate": 5.185619111741298e-06, "loss": 0.2089, "step": 3139 }, { "epoch": 0.48833592534992226, "grad_norm": 1.4141944011654735, "learning_rate": 5.183177855569989e-06, "loss": 0.2311, "step": 3140 }, { "epoch": 0.4884914463452566, "grad_norm": 0.9547434094606634, "learning_rate": 5.180736555671572e-06, "loss": 0.1542, "step": 3141 }, { "epoch": 0.488646967340591, "grad_norm": 1.23756941255711, "learning_rate": 5.178295212628818e-06, "loss": 0.2225, "step": 3142 }, { "epoch": 0.4888024883359253, "grad_norm": 1.6996595064785625, "learning_rate": 5.175853827024511e-06, "loss": 0.188, "step": 3143 }, { "epoch": 0.4889580093312597, "grad_norm": 1.1547179870765238, "learning_rate": 5.173412399441442e-06, "loss": 0.2399, "step": 3144 }, { "epoch": 0.4891135303265941, "grad_norm": 0.9962059411422534, "learning_rate": 5.170970930462414e-06, "loss": 0.1331, "step": 3145 }, { "epoch": 0.48926905132192844, "grad_norm": 1.1826712742971357, "learning_rate": 5.168529420670244e-06, "loss": 0.1513, "step": 3146 }, { "epoch": 0.48942457231726283, "grad_norm": 1.287443489167226, "learning_rate": 5.166087870647749e-06, "loss": 0.173, "step": 3147 }, { "epoch": 0.4895800933125972, "grad_norm": 0.9707000581373366, "learning_rate": 5.163646280977763e-06, "loss": 0.1826, "step": 3148 }, { "epoch": 0.48973561430793155, "grad_norm": 1.0456608995428338, "learning_rate": 5.161204652243128e-06, "loss": 0.1755, "step": 3149 }, { "epoch": 0.48989113530326595, "grad_norm": 0.9705156371780639, "learning_rate": 5.158762985026694e-06, "loss": 0.1685, "step": 3150 }, { "epoch": 0.49004665629860034, "grad_norm": 0.9207172840710631, "learning_rate": 5.156321279911323e-06, "loss": 0.0882, "step": 3151 }, { "epoch": 0.49020217729393467, "grad_norm": 1.2605340908274074, "learning_rate": 5.153879537479881e-06, "loss": 0.1617, "step": 3152 }, { "epoch": 0.49035769828926906, "grad_norm": 0.9563216165297458, "learning_rate": 5.151437758315247e-06, "loss": 0.167, "step": 3153 }, { "epoch": 0.4905132192846034, "grad_norm": 0.9289530521297379, "learning_rate": 5.14899594300031e-06, "loss": 0.2001, "step": 3154 }, { "epoch": 0.4906687402799378, "grad_norm": 1.056129917436573, "learning_rate": 5.146554092117963e-06, "loss": 0.1233, "step": 3155 }, { "epoch": 0.4908242612752722, "grad_norm": 0.9190131170059495, "learning_rate": 5.14411220625111e-06, "loss": 0.1851, "step": 3156 }, { "epoch": 0.4909797822706065, "grad_norm": 1.052757150961462, "learning_rate": 5.141670285982663e-06, "loss": 0.2467, "step": 3157 }, { "epoch": 0.4911353032659409, "grad_norm": 0.8793744145541432, "learning_rate": 5.139228331895544e-06, "loss": 0.1962, "step": 3158 }, { "epoch": 0.4912908242612753, "grad_norm": 1.0673896010715038, "learning_rate": 5.13678634457268e-06, "loss": 0.1403, "step": 3159 }, { "epoch": 0.49144634525660963, "grad_norm": 0.9127806263031741, "learning_rate": 5.1343443245970095e-06, "loss": 0.2033, "step": 3160 }, { "epoch": 0.491601866251944, "grad_norm": 1.3124707569347591, "learning_rate": 5.131902272551472e-06, "loss": 0.1514, "step": 3161 }, { "epoch": 0.4917573872472784, "grad_norm": 1.0191630043446744, "learning_rate": 5.129460189019025e-06, "loss": 0.1883, "step": 3162 }, { "epoch": 0.49191290824261275, "grad_norm": 1.0979472112538669, "learning_rate": 5.127018074582626e-06, "loss": 0.1601, "step": 3163 }, { "epoch": 0.49206842923794714, "grad_norm": 1.0217369809773804, "learning_rate": 5.124575929825241e-06, "loss": 0.1349, "step": 3164 }, { "epoch": 0.49222395023328147, "grad_norm": 1.0715022800153613, "learning_rate": 5.122133755329843e-06, "loss": 0.1458, "step": 3165 }, { "epoch": 0.49237947122861586, "grad_norm": 0.8630748377758154, "learning_rate": 5.119691551679415e-06, "loss": 0.1184, "step": 3166 }, { "epoch": 0.49253499222395025, "grad_norm": 0.9298728470385974, "learning_rate": 5.117249319456945e-06, "loss": 0.1065, "step": 3167 }, { "epoch": 0.4926905132192846, "grad_norm": 0.6433300190749907, "learning_rate": 5.114807059245428e-06, "loss": 0.1775, "step": 3168 }, { "epoch": 0.492846034214619, "grad_norm": 1.051441998967172, "learning_rate": 5.112364771627862e-06, "loss": 0.1108, "step": 3169 }, { "epoch": 0.49300155520995337, "grad_norm": 0.8846316230727177, "learning_rate": 5.10992245718726e-06, "loss": 0.1525, "step": 3170 }, { "epoch": 0.4931570762052877, "grad_norm": 1.101033172489026, "learning_rate": 5.107480116506636e-06, "loss": 0.1522, "step": 3171 }, { "epoch": 0.4933125972006221, "grad_norm": 1.0558748919329528, "learning_rate": 5.105037750169007e-06, "loss": 0.1736, "step": 3172 }, { "epoch": 0.49346811819595643, "grad_norm": 0.81951761882563, "learning_rate": 5.102595358757403e-06, "loss": 0.121, "step": 3173 }, { "epoch": 0.4936236391912908, "grad_norm": 0.9253414794544325, "learning_rate": 5.100152942854856e-06, "loss": 0.1448, "step": 3174 }, { "epoch": 0.4937791601866252, "grad_norm": 0.8450740644918073, "learning_rate": 5.097710503044404e-06, "loss": 0.1199, "step": 3175 }, { "epoch": 0.49393468118195955, "grad_norm": 0.9919713289786649, "learning_rate": 5.095268039909095e-06, "loss": 0.0772, "step": 3176 }, { "epoch": 0.49409020217729394, "grad_norm": 1.5312836601586088, "learning_rate": 5.092825554031973e-06, "loss": 0.1893, "step": 3177 }, { "epoch": 0.4942457231726283, "grad_norm": 0.8363942900253392, "learning_rate": 5.090383045996098e-06, "loss": 0.1628, "step": 3178 }, { "epoch": 0.49440124416796266, "grad_norm": 0.6550932568866589, "learning_rate": 5.087940516384531e-06, "loss": 0.1348, "step": 3179 }, { "epoch": 0.49455676516329705, "grad_norm": 1.1912128649905764, "learning_rate": 5.085497965780335e-06, "loss": 0.1841, "step": 3180 }, { "epoch": 0.49471228615863144, "grad_norm": 0.9666152005487191, "learning_rate": 5.083055394766583e-06, "loss": 0.1033, "step": 3181 }, { "epoch": 0.4948678071539658, "grad_norm": 0.7702747577654057, "learning_rate": 5.080612803926349e-06, "loss": 0.2177, "step": 3182 }, { "epoch": 0.49502332814930017, "grad_norm": 1.1477472492552088, "learning_rate": 5.0781701938427145e-06, "loss": 0.138, "step": 3183 }, { "epoch": 0.4951788491446345, "grad_norm": 1.1735670750161997, "learning_rate": 5.075727565098766e-06, "loss": 0.1814, "step": 3184 }, { "epoch": 0.4953343701399689, "grad_norm": 1.111859526917612, "learning_rate": 5.07328491827759e-06, "loss": 0.2151, "step": 3185 }, { "epoch": 0.4954898911353033, "grad_norm": 1.282074516996947, "learning_rate": 5.0708422539622824e-06, "loss": 0.1558, "step": 3186 }, { "epoch": 0.4956454121306376, "grad_norm": 1.0440713207549788, "learning_rate": 5.068399572735941e-06, "loss": 0.1443, "step": 3187 }, { "epoch": 0.495800933125972, "grad_norm": 1.3745622656453431, "learning_rate": 5.065956875181669e-06, "loss": 0.2118, "step": 3188 }, { "epoch": 0.4959564541213064, "grad_norm": 0.7311163972789335, "learning_rate": 5.06351416188257e-06, "loss": 0.1092, "step": 3189 }, { "epoch": 0.49611197511664074, "grad_norm": 1.5065281059789226, "learning_rate": 5.061071433421754e-06, "loss": 0.159, "step": 3190 }, { "epoch": 0.4962674961119751, "grad_norm": 1.0428977325895967, "learning_rate": 5.058628690382337e-06, "loss": 0.1508, "step": 3191 }, { "epoch": 0.49642301710730946, "grad_norm": 0.9710968072297266, "learning_rate": 5.056185933347433e-06, "loss": 0.14, "step": 3192 }, { "epoch": 0.49657853810264385, "grad_norm": 1.1909581387699815, "learning_rate": 5.053743162900162e-06, "loss": 0.1933, "step": 3193 }, { "epoch": 0.49673405909797824, "grad_norm": 1.0098041514600664, "learning_rate": 5.051300379623649e-06, "loss": 0.1809, "step": 3194 }, { "epoch": 0.4968895800933126, "grad_norm": 1.604214412473776, "learning_rate": 5.048857584101023e-06, "loss": 0.1741, "step": 3195 }, { "epoch": 0.49704510108864697, "grad_norm": 1.2530287623619083, "learning_rate": 5.046414776915407e-06, "loss": 0.1189, "step": 3196 }, { "epoch": 0.49720062208398136, "grad_norm": 0.9930535399203282, "learning_rate": 5.043971958649938e-06, "loss": 0.1492, "step": 3197 }, { "epoch": 0.4973561430793157, "grad_norm": 1.389463690380734, "learning_rate": 5.041529129887748e-06, "loss": 0.108, "step": 3198 }, { "epoch": 0.4975116640746501, "grad_norm": 1.0717903236031918, "learning_rate": 5.039086291211977e-06, "loss": 0.1652, "step": 3199 }, { "epoch": 0.4976671850699845, "grad_norm": 0.8094877180712775, "learning_rate": 5.0366434432057624e-06, "loss": 0.1177, "step": 3200 }, { "epoch": 0.4976671850699845, "eval_loss": 0.17928095161914825, "eval_runtime": 9.4395, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.742, "step": 3200 }, { "epoch": 0.4978227060653188, "grad_norm": 1.2847928318456825, "learning_rate": 5.0342005864522455e-06, "loss": 0.1952, "step": 3201 }, { "epoch": 0.4979782270606532, "grad_norm": 0.8905120834410702, "learning_rate": 5.031757721534574e-06, "loss": 0.1568, "step": 3202 }, { "epoch": 0.49813374805598754, "grad_norm": 2.416945596676086, "learning_rate": 5.029314849035892e-06, "loss": 0.1829, "step": 3203 }, { "epoch": 0.4982892690513219, "grad_norm": 1.0760755091062555, "learning_rate": 5.026871969539345e-06, "loss": 0.1333, "step": 3204 }, { "epoch": 0.4984447900466563, "grad_norm": 0.7450074349565903, "learning_rate": 5.024429083628084e-06, "loss": 0.1587, "step": 3205 }, { "epoch": 0.49860031104199065, "grad_norm": 1.050968185165006, "learning_rate": 5.021986191885261e-06, "loss": 0.1239, "step": 3206 }, { "epoch": 0.49875583203732504, "grad_norm": 1.2642838426906973, "learning_rate": 5.019543294894027e-06, "loss": 0.2189, "step": 3207 }, { "epoch": 0.49891135303265943, "grad_norm": 1.3068446782104037, "learning_rate": 5.017100393237535e-06, "loss": 0.197, "step": 3208 }, { "epoch": 0.49906687402799377, "grad_norm": 1.2586585710128786, "learning_rate": 5.0146574874989415e-06, "loss": 0.1681, "step": 3209 }, { "epoch": 0.49922239502332816, "grad_norm": 1.0767385163554102, "learning_rate": 5.012214578261402e-06, "loss": 0.1426, "step": 3210 }, { "epoch": 0.4993779160186625, "grad_norm": 1.2231015538826626, "learning_rate": 5.009771666108072e-06, "loss": 0.1756, "step": 3211 }, { "epoch": 0.4995334370139969, "grad_norm": 1.059121204337931, "learning_rate": 5.007328751622109e-06, "loss": 0.1345, "step": 3212 }, { "epoch": 0.4996889580093313, "grad_norm": 0.9545702232721645, "learning_rate": 5.00488583538667e-06, "loss": 0.1761, "step": 3213 }, { "epoch": 0.4998444790046656, "grad_norm": 0.8040875346346933, "learning_rate": 5.002442917984916e-06, "loss": 0.1596, "step": 3214 }, { "epoch": 0.5, "grad_norm": 0.8988144127892902, "learning_rate": 5e-06, "loss": 0.1292, "step": 3215 }, { "epoch": 0.5001555209953343, "grad_norm": 1.4682569410241058, "learning_rate": 4.9975570820150875e-06, "loss": 0.2042, "step": 3216 }, { "epoch": 0.5003110419906688, "grad_norm": 1.2067322072143818, "learning_rate": 4.9951141646133314e-06, "loss": 0.1874, "step": 3217 }, { "epoch": 0.5004665629860031, "grad_norm": 1.0029755084985386, "learning_rate": 4.992671248377892e-06, "loss": 0.1665, "step": 3218 }, { "epoch": 0.5006220839813375, "grad_norm": 1.502155139160887, "learning_rate": 4.99022833389193e-06, "loss": 0.1523, "step": 3219 }, { "epoch": 0.5007776049766719, "grad_norm": 1.0191626515598071, "learning_rate": 4.987785421738599e-06, "loss": 0.2288, "step": 3220 }, { "epoch": 0.5009331259720062, "grad_norm": 0.9248760373217237, "learning_rate": 4.985342512501059e-06, "loss": 0.176, "step": 3221 }, { "epoch": 0.5010886469673406, "grad_norm": 1.1694736993987427, "learning_rate": 4.982899606762467e-06, "loss": 0.2026, "step": 3222 }, { "epoch": 0.501244167962675, "grad_norm": 0.8724342436119925, "learning_rate": 4.980456705105974e-06, "loss": 0.1382, "step": 3223 }, { "epoch": 0.5013996889580093, "grad_norm": 0.7798338010227632, "learning_rate": 4.9780138081147405e-06, "loss": 0.1122, "step": 3224 }, { "epoch": 0.5015552099533437, "grad_norm": 0.9273450543732058, "learning_rate": 4.975570916371917e-06, "loss": 0.2202, "step": 3225 }, { "epoch": 0.501710730948678, "grad_norm": 0.702593936273559, "learning_rate": 4.973128030460658e-06, "loss": 0.1385, "step": 3226 }, { "epoch": 0.5018662519440125, "grad_norm": 1.1460839871397348, "learning_rate": 4.97068515096411e-06, "loss": 0.1552, "step": 3227 }, { "epoch": 0.5020217729393468, "grad_norm": 1.2427549273963086, "learning_rate": 4.968242278465428e-06, "loss": 0.1237, "step": 3228 }, { "epoch": 0.5021772939346811, "grad_norm": 0.9786168004225111, "learning_rate": 4.965799413547755e-06, "loss": 0.1651, "step": 3229 }, { "epoch": 0.5023328149300156, "grad_norm": 1.916229108908797, "learning_rate": 4.963356556794238e-06, "loss": 0.2554, "step": 3230 }, { "epoch": 0.5024883359253499, "grad_norm": 2.194946035735157, "learning_rate": 4.960913708788025e-06, "loss": 0.1211, "step": 3231 }, { "epoch": 0.5026438569206843, "grad_norm": 1.0092289590774086, "learning_rate": 4.958470870112254e-06, "loss": 0.1859, "step": 3232 }, { "epoch": 0.5027993779160187, "grad_norm": 1.323336821256963, "learning_rate": 4.9560280413500635e-06, "loss": 0.1805, "step": 3233 }, { "epoch": 0.502954898911353, "grad_norm": 1.1372896412734643, "learning_rate": 4.953585223084595e-06, "loss": 0.2608, "step": 3234 }, { "epoch": 0.5031104199066874, "grad_norm": 1.1574539644461856, "learning_rate": 4.95114241589898e-06, "loss": 0.1771, "step": 3235 }, { "epoch": 0.5032659409020218, "grad_norm": 1.0271739140758736, "learning_rate": 4.948699620376351e-06, "loss": 0.2019, "step": 3236 }, { "epoch": 0.5034214618973561, "grad_norm": 1.4835225170427226, "learning_rate": 4.946256837099839e-06, "loss": 0.1514, "step": 3237 }, { "epoch": 0.5035769828926905, "grad_norm": 0.8956801825252562, "learning_rate": 4.943814066652569e-06, "loss": 0.2294, "step": 3238 }, { "epoch": 0.5037325038880249, "grad_norm": 2.0678717709891696, "learning_rate": 4.941371309617664e-06, "loss": 0.1507, "step": 3239 }, { "epoch": 0.5038880248833593, "grad_norm": 1.3603028958190868, "learning_rate": 4.938928566578247e-06, "loss": 0.1958, "step": 3240 }, { "epoch": 0.5040435458786936, "grad_norm": 0.9447371611997889, "learning_rate": 4.9364858381174315e-06, "loss": 0.1133, "step": 3241 }, { "epoch": 0.504199066874028, "grad_norm": 0.8968105637109985, "learning_rate": 4.934043124818332e-06, "loss": 0.1356, "step": 3242 }, { "epoch": 0.5043545878693624, "grad_norm": 1.7653532924251367, "learning_rate": 4.9316004272640594e-06, "loss": 0.1681, "step": 3243 }, { "epoch": 0.5045101088646967, "grad_norm": 1.8560072339178004, "learning_rate": 4.929157746037719e-06, "loss": 0.2308, "step": 3244 }, { "epoch": 0.504665629860031, "grad_norm": 0.9582004170528675, "learning_rate": 4.926715081722412e-06, "loss": 0.14, "step": 3245 }, { "epoch": 0.5048211508553655, "grad_norm": 1.1684118311338092, "learning_rate": 4.924272434901236e-06, "loss": 0.1864, "step": 3246 }, { "epoch": 0.5049766718506998, "grad_norm": 1.247855369122297, "learning_rate": 4.921829806157286e-06, "loss": 0.1741, "step": 3247 }, { "epoch": 0.5051321928460342, "grad_norm": 0.8072453759123964, "learning_rate": 4.919387196073654e-06, "loss": 0.2147, "step": 3248 }, { "epoch": 0.5052877138413686, "grad_norm": 0.8534703921171939, "learning_rate": 4.916944605233419e-06, "loss": 0.1887, "step": 3249 }, { "epoch": 0.505443234836703, "grad_norm": 0.9445253444548173, "learning_rate": 4.914502034219667e-06, "loss": 0.1108, "step": 3250 }, { "epoch": 0.5055987558320373, "grad_norm": 1.3603480607465024, "learning_rate": 4.9120594836154715e-06, "loss": 0.1186, "step": 3251 }, { "epoch": 0.5057542768273717, "grad_norm": 1.6395798920499736, "learning_rate": 4.909616954003902e-06, "loss": 0.1618, "step": 3252 }, { "epoch": 0.5059097978227061, "grad_norm": 0.8220526717580713, "learning_rate": 4.907174445968028e-06, "loss": 0.1509, "step": 3253 }, { "epoch": 0.5060653188180404, "grad_norm": 1.0160214037600337, "learning_rate": 4.9047319600909075e-06, "loss": 0.1488, "step": 3254 }, { "epoch": 0.5062208398133748, "grad_norm": 1.6647744901023496, "learning_rate": 4.902289496955596e-06, "loss": 0.1901, "step": 3255 }, { "epoch": 0.5063763608087092, "grad_norm": 1.2332971799095815, "learning_rate": 4.899847057145146e-06, "loss": 0.2012, "step": 3256 }, { "epoch": 0.5065318818040435, "grad_norm": 1.2499962620305265, "learning_rate": 4.897404641242599e-06, "loss": 0.1699, "step": 3257 }, { "epoch": 0.506687402799378, "grad_norm": 1.0112200942176497, "learning_rate": 4.894962249830995e-06, "loss": 0.1267, "step": 3258 }, { "epoch": 0.5068429237947123, "grad_norm": 1.23731457560021, "learning_rate": 4.892519883493367e-06, "loss": 0.2066, "step": 3259 }, { "epoch": 0.5069984447900466, "grad_norm": 1.0357139346259847, "learning_rate": 4.890077542812742e-06, "loss": 0.1411, "step": 3260 }, { "epoch": 0.5071539657853811, "grad_norm": 1.1143351049795318, "learning_rate": 4.887635228372139e-06, "loss": 0.2024, "step": 3261 }, { "epoch": 0.5073094867807154, "grad_norm": 0.7405795468735877, "learning_rate": 4.885192940754574e-06, "loss": 0.174, "step": 3262 }, { "epoch": 0.5074650077760497, "grad_norm": 1.3584278229096154, "learning_rate": 4.882750680543057e-06, "loss": 0.156, "step": 3263 }, { "epoch": 0.5076205287713841, "grad_norm": 1.1621886043487883, "learning_rate": 4.880308448320587e-06, "loss": 0.1641, "step": 3264 }, { "epoch": 0.5077760497667185, "grad_norm": 1.2685356546431112, "learning_rate": 4.877866244670158e-06, "loss": 0.155, "step": 3265 }, { "epoch": 0.5079315707620529, "grad_norm": 1.184747298645261, "learning_rate": 4.875424070174761e-06, "loss": 0.1176, "step": 3266 }, { "epoch": 0.5080870917573872, "grad_norm": 0.9725606664125502, "learning_rate": 4.872981925417376e-06, "loss": 0.1425, "step": 3267 }, { "epoch": 0.5082426127527216, "grad_norm": 0.9667237170384293, "learning_rate": 4.8705398109809755e-06, "loss": 0.1468, "step": 3268 }, { "epoch": 0.508398133748056, "grad_norm": 1.3100743649561728, "learning_rate": 4.8680977274485284e-06, "loss": 0.1702, "step": 3269 }, { "epoch": 0.5085536547433903, "grad_norm": 1.2477339432778347, "learning_rate": 4.865655675402993e-06, "loss": 0.1589, "step": 3270 }, { "epoch": 0.5087091757387248, "grad_norm": 1.0194213880755998, "learning_rate": 4.86321365542732e-06, "loss": 0.2263, "step": 3271 }, { "epoch": 0.5088646967340591, "grad_norm": 1.0941879802495735, "learning_rate": 4.860771668104457e-06, "loss": 0.1831, "step": 3272 }, { "epoch": 0.5090202177293934, "grad_norm": 1.0424480660464837, "learning_rate": 4.8583297140173385e-06, "loss": 0.2004, "step": 3273 }, { "epoch": 0.5091757387247279, "grad_norm": 1.3829603140283717, "learning_rate": 4.855887793748892e-06, "loss": 0.2545, "step": 3274 }, { "epoch": 0.5093312597200622, "grad_norm": 0.6985808895687627, "learning_rate": 4.853445907882039e-06, "loss": 0.1112, "step": 3275 }, { "epoch": 0.5094867807153965, "grad_norm": 0.7894459978197516, "learning_rate": 4.851004056999692e-06, "loss": 0.1111, "step": 3276 }, { "epoch": 0.509642301710731, "grad_norm": 0.7366845317408287, "learning_rate": 4.8485622416847535e-06, "loss": 0.1288, "step": 3277 }, { "epoch": 0.5097978227060653, "grad_norm": 0.9987229402460874, "learning_rate": 4.84612046252012e-06, "loss": 0.1219, "step": 3278 }, { "epoch": 0.5099533437013997, "grad_norm": 1.0679538877824934, "learning_rate": 4.843678720088679e-06, "loss": 0.1992, "step": 3279 }, { "epoch": 0.5101088646967341, "grad_norm": 1.0409387810965514, "learning_rate": 4.841237014973305e-06, "loss": 0.1714, "step": 3280 }, { "epoch": 0.5102643856920684, "grad_norm": 0.8874086607011485, "learning_rate": 4.838795347756873e-06, "loss": 0.0883, "step": 3281 }, { "epoch": 0.5104199066874028, "grad_norm": 1.0323219564357058, "learning_rate": 4.836353719022238e-06, "loss": 0.0976, "step": 3282 }, { "epoch": 0.5105754276827371, "grad_norm": 0.9161757479772081, "learning_rate": 4.833912129352254e-06, "loss": 0.1608, "step": 3283 }, { "epoch": 0.5107309486780716, "grad_norm": 1.0752663017770174, "learning_rate": 4.831470579329757e-06, "loss": 0.177, "step": 3284 }, { "epoch": 0.5108864696734059, "grad_norm": 0.7106222402700771, "learning_rate": 4.8290290695375866e-06, "loss": 0.1567, "step": 3285 }, { "epoch": 0.5110419906687402, "grad_norm": 1.5129472157827726, "learning_rate": 4.82658760055856e-06, "loss": 0.2075, "step": 3286 }, { "epoch": 0.5111975116640747, "grad_norm": 1.0867624299902485, "learning_rate": 4.82414617297549e-06, "loss": 0.1406, "step": 3287 }, { "epoch": 0.511353032659409, "grad_norm": 1.5248034140590612, "learning_rate": 4.821704787371184e-06, "loss": 0.1823, "step": 3288 }, { "epoch": 0.5115085536547433, "grad_norm": 0.8543020980494003, "learning_rate": 4.81926344432843e-06, "loss": 0.1272, "step": 3289 }, { "epoch": 0.5116640746500778, "grad_norm": 1.3601752087823684, "learning_rate": 4.8168221444300124e-06, "loss": 0.1679, "step": 3290 }, { "epoch": 0.5118195956454121, "grad_norm": 0.9349309493406993, "learning_rate": 4.8143808882587035e-06, "loss": 0.1598, "step": 3291 }, { "epoch": 0.5119751166407465, "grad_norm": 1.5071630905559474, "learning_rate": 4.811939676397266e-06, "loss": 0.2449, "step": 3292 }, { "epoch": 0.5121306376360809, "grad_norm": 0.7814452137186776, "learning_rate": 4.809498509428448e-06, "loss": 0.1547, "step": 3293 }, { "epoch": 0.5122861586314152, "grad_norm": 0.7658570636284814, "learning_rate": 4.807057387934994e-06, "loss": 0.1537, "step": 3294 }, { "epoch": 0.5124416796267496, "grad_norm": 0.9559745552478576, "learning_rate": 4.804616312499633e-06, "loss": 0.1452, "step": 3295 }, { "epoch": 0.512597200622084, "grad_norm": 1.02342257607073, "learning_rate": 4.802175283705081e-06, "loss": 0.1252, "step": 3296 }, { "epoch": 0.5127527216174184, "grad_norm": 0.7198952218771494, "learning_rate": 4.799734302134049e-06, "loss": 0.092, "step": 3297 }, { "epoch": 0.5129082426127527, "grad_norm": 0.8838040465941501, "learning_rate": 4.797293368369231e-06, "loss": 0.1807, "step": 3298 }, { "epoch": 0.5130637636080871, "grad_norm": 0.9276333650131369, "learning_rate": 4.794852482993313e-06, "loss": 0.1478, "step": 3299 }, { "epoch": 0.5132192846034215, "grad_norm": 0.9424872539668999, "learning_rate": 4.7924116465889684e-06, "loss": 0.221, "step": 3300 }, { "epoch": 0.5132192846034215, "eval_loss": 0.17806772887706757, "eval_runtime": 9.4387, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.742, "step": 3300 }, { "epoch": 0.5133748055987558, "grad_norm": 1.1415239937942243, "learning_rate": 4.78997085973886e-06, "loss": 0.1057, "step": 3301 }, { "epoch": 0.5135303265940901, "grad_norm": 1.02254040961082, "learning_rate": 4.787530123025635e-06, "loss": 0.1793, "step": 3302 }, { "epoch": 0.5136858475894246, "grad_norm": 1.2376191995275645, "learning_rate": 4.785089437031931e-06, "loss": 0.1411, "step": 3303 }, { "epoch": 0.5138413685847589, "grad_norm": 0.6177182477496226, "learning_rate": 4.782648802340378e-06, "loss": 0.1358, "step": 3304 }, { "epoch": 0.5139968895800933, "grad_norm": 0.9315434583963113, "learning_rate": 4.7802082195335866e-06, "loss": 0.1487, "step": 3305 }, { "epoch": 0.5141524105754277, "grad_norm": 1.012541999278977, "learning_rate": 4.777767689194157e-06, "loss": 0.1728, "step": 3306 }, { "epoch": 0.514307931570762, "grad_norm": 1.3580450902001222, "learning_rate": 4.77532721190468e-06, "loss": 0.2189, "step": 3307 }, { "epoch": 0.5144634525660964, "grad_norm": 0.9501941542049429, "learning_rate": 4.772886788247731e-06, "loss": 0.1199, "step": 3308 }, { "epoch": 0.5146189735614308, "grad_norm": 1.095787178460077, "learning_rate": 4.770446418805872e-06, "loss": 0.107, "step": 3309 }, { "epoch": 0.5147744945567652, "grad_norm": 0.8559220231178494, "learning_rate": 4.768006104161655e-06, "loss": 0.1554, "step": 3310 }, { "epoch": 0.5149300155520995, "grad_norm": 1.1099221905576766, "learning_rate": 4.765565844897616e-06, "loss": 0.2041, "step": 3311 }, { "epoch": 0.5150855365474339, "grad_norm": 0.8343318509916511, "learning_rate": 4.763125641596278e-06, "loss": 0.14, "step": 3312 }, { "epoch": 0.5152410575427683, "grad_norm": 1.29756812073765, "learning_rate": 4.7606854948401535e-06, "loss": 0.1841, "step": 3313 }, { "epoch": 0.5153965785381026, "grad_norm": 0.9552946581915727, "learning_rate": 4.758245405211739e-06, "loss": 0.1297, "step": 3314 }, { "epoch": 0.5155520995334371, "grad_norm": 0.9036931712029718, "learning_rate": 4.755805373293516e-06, "loss": 0.1671, "step": 3315 }, { "epoch": 0.5157076205287714, "grad_norm": 1.069937253300049, "learning_rate": 4.753365399667958e-06, "loss": 0.1397, "step": 3316 }, { "epoch": 0.5158631415241057, "grad_norm": 0.7971038491948065, "learning_rate": 4.750925484917519e-06, "loss": 0.2186, "step": 3317 }, { "epoch": 0.5160186625194402, "grad_norm": 0.879255229444548, "learning_rate": 4.7484856296246375e-06, "loss": 0.111, "step": 3318 }, { "epoch": 0.5161741835147745, "grad_norm": 1.0104541565480347, "learning_rate": 4.746045834371745e-06, "loss": 0.2033, "step": 3319 }, { "epoch": 0.5163297045101088, "grad_norm": 1.034253398618332, "learning_rate": 4.743606099741255e-06, "loss": 0.1057, "step": 3320 }, { "epoch": 0.5164852255054432, "grad_norm": 0.8066417010373746, "learning_rate": 4.7411664263155625e-06, "loss": 0.1181, "step": 3321 }, { "epoch": 0.5166407465007776, "grad_norm": 0.9125647030278861, "learning_rate": 4.738726814677054e-06, "loss": 0.1774, "step": 3322 }, { "epoch": 0.516796267496112, "grad_norm": 1.105821452290937, "learning_rate": 4.7362872654080985e-06, "loss": 0.1631, "step": 3323 }, { "epoch": 0.5169517884914463, "grad_norm": 0.8879480970163288, "learning_rate": 4.733847779091051e-06, "loss": 0.1329, "step": 3324 }, { "epoch": 0.5171073094867807, "grad_norm": 0.9492307792085869, "learning_rate": 4.731408356308248e-06, "loss": 0.1876, "step": 3325 }, { "epoch": 0.5172628304821151, "grad_norm": 1.2732921170424134, "learning_rate": 4.728968997642018e-06, "loss": 0.2025, "step": 3326 }, { "epoch": 0.5174183514774494, "grad_norm": 0.8363887617162071, "learning_rate": 4.726529703674669e-06, "loss": 0.1075, "step": 3327 }, { "epoch": 0.5175738724727839, "grad_norm": 1.6037878133541064, "learning_rate": 4.72409047498849e-06, "loss": 0.1861, "step": 3328 }, { "epoch": 0.5177293934681182, "grad_norm": 1.0301481960257342, "learning_rate": 4.721651312165766e-06, "loss": 0.1285, "step": 3329 }, { "epoch": 0.5178849144634525, "grad_norm": 1.2378911943217061, "learning_rate": 4.719212215788753e-06, "loss": 0.1736, "step": 3330 }, { "epoch": 0.518040435458787, "grad_norm": 2.2814296418272213, "learning_rate": 4.7167731864397e-06, "loss": 0.2332, "step": 3331 }, { "epoch": 0.5181959564541213, "grad_norm": 1.1921901667772734, "learning_rate": 4.714334224700838e-06, "loss": 0.1499, "step": 3332 }, { "epoch": 0.5183514774494556, "grad_norm": 0.9821019068451058, "learning_rate": 4.711895331154382e-06, "loss": 0.1681, "step": 3333 }, { "epoch": 0.5185069984447901, "grad_norm": 1.0964523068375216, "learning_rate": 4.709456506382524e-06, "loss": 0.1445, "step": 3334 }, { "epoch": 0.5186625194401244, "grad_norm": 0.7226111885181837, "learning_rate": 4.707017750967452e-06, "loss": 0.1973, "step": 3335 }, { "epoch": 0.5188180404354588, "grad_norm": 1.0597173649690848, "learning_rate": 4.7045790654913295e-06, "loss": 0.1709, "step": 3336 }, { "epoch": 0.5189735614307932, "grad_norm": 1.1204079093423216, "learning_rate": 4.702140450536302e-06, "loss": 0.1999, "step": 3337 }, { "epoch": 0.5191290824261275, "grad_norm": 0.9117850085661168, "learning_rate": 4.699701906684504e-06, "loss": 0.2098, "step": 3338 }, { "epoch": 0.5192846034214619, "grad_norm": 0.9821573242065235, "learning_rate": 4.697263434518049e-06, "loss": 0.1081, "step": 3339 }, { "epoch": 0.5194401244167963, "grad_norm": 1.2581430046046482, "learning_rate": 4.6948250346190315e-06, "loss": 0.1589, "step": 3340 }, { "epoch": 0.5195956454121307, "grad_norm": 0.9025635865274356, "learning_rate": 4.692386707569538e-06, "loss": 0.2055, "step": 3341 }, { "epoch": 0.519751166407465, "grad_norm": 0.5299519805183638, "learning_rate": 4.689948453951625e-06, "loss": 0.0805, "step": 3342 }, { "epoch": 0.5199066874027993, "grad_norm": 1.1313552487323788, "learning_rate": 4.68751027434734e-06, "loss": 0.123, "step": 3343 }, { "epoch": 0.5200622083981338, "grad_norm": 1.0308983554953683, "learning_rate": 4.685072169338709e-06, "loss": 0.1037, "step": 3344 }, { "epoch": 0.5202177293934681, "grad_norm": 1.0890789800609288, "learning_rate": 4.6826341395077456e-06, "loss": 0.2967, "step": 3345 }, { "epoch": 0.5203732503888024, "grad_norm": 0.8355657450405783, "learning_rate": 4.680196185436438e-06, "loss": 0.1618, "step": 3346 }, { "epoch": 0.5205287713841369, "grad_norm": 1.0271136360244932, "learning_rate": 4.677758307706758e-06, "loss": 0.2184, "step": 3347 }, { "epoch": 0.5206842923794712, "grad_norm": 1.3662942068392605, "learning_rate": 4.6753205069006675e-06, "loss": 0.2386, "step": 3348 }, { "epoch": 0.5208398133748056, "grad_norm": 0.7869404608722707, "learning_rate": 4.6728827836001e-06, "loss": 0.1553, "step": 3349 }, { "epoch": 0.52099533437014, "grad_norm": 1.1736095338391153, "learning_rate": 4.670445138386971e-06, "loss": 0.2175, "step": 3350 }, { "epoch": 0.5211508553654743, "grad_norm": 1.011986411468133, "learning_rate": 4.668007571843184e-06, "loss": 0.2117, "step": 3351 }, { "epoch": 0.5213063763608087, "grad_norm": 1.2847783059232076, "learning_rate": 4.665570084550621e-06, "loss": 0.1363, "step": 3352 }, { "epoch": 0.5214618973561431, "grad_norm": 2.018356102675957, "learning_rate": 4.663132677091141e-06, "loss": 0.3742, "step": 3353 }, { "epoch": 0.5216174183514775, "grad_norm": 0.9066502307859068, "learning_rate": 4.66069535004659e-06, "loss": 0.1396, "step": 3354 }, { "epoch": 0.5217729393468118, "grad_norm": 1.034673992210646, "learning_rate": 4.658258103998789e-06, "loss": 0.1428, "step": 3355 }, { "epoch": 0.5219284603421462, "grad_norm": 0.9052841568901105, "learning_rate": 4.655820939529545e-06, "loss": 0.1688, "step": 3356 }, { "epoch": 0.5220839813374806, "grad_norm": 0.973896071414925, "learning_rate": 4.653383857220643e-06, "loss": 0.1514, "step": 3357 }, { "epoch": 0.5222395023328149, "grad_norm": 1.1102793227572518, "learning_rate": 4.650946857653848e-06, "loss": 0.1166, "step": 3358 }, { "epoch": 0.5223950233281494, "grad_norm": 1.0309239434983715, "learning_rate": 4.648509941410905e-06, "loss": 0.2064, "step": 3359 }, { "epoch": 0.5225505443234837, "grad_norm": 1.2467162820963213, "learning_rate": 4.646073109073542e-06, "loss": 0.1161, "step": 3360 }, { "epoch": 0.522706065318818, "grad_norm": 0.9124642792053805, "learning_rate": 4.643636361223464e-06, "loss": 0.1546, "step": 3361 }, { "epoch": 0.5228615863141524, "grad_norm": 1.0426176586685174, "learning_rate": 4.6411996984423554e-06, "loss": 0.1171, "step": 3362 }, { "epoch": 0.5230171073094868, "grad_norm": 0.8883609021361996, "learning_rate": 4.638763121311881e-06, "loss": 0.148, "step": 3363 }, { "epoch": 0.5231726283048211, "grad_norm": 2.2362505911143136, "learning_rate": 4.636326630413689e-06, "loss": 0.1487, "step": 3364 }, { "epoch": 0.5233281493001555, "grad_norm": 1.0164486971525217, "learning_rate": 4.633890226329402e-06, "loss": 0.1159, "step": 3365 }, { "epoch": 0.5234836702954899, "grad_norm": 0.8408276596811677, "learning_rate": 4.631453909640622e-06, "loss": 0.159, "step": 3366 }, { "epoch": 0.5236391912908243, "grad_norm": 1.2409717023571485, "learning_rate": 4.629017680928934e-06, "loss": 0.1234, "step": 3367 }, { "epoch": 0.5237947122861586, "grad_norm": 1.2301000444652854, "learning_rate": 4.6265815407759e-06, "loss": 0.1341, "step": 3368 }, { "epoch": 0.523950233281493, "grad_norm": 1.1430398243683795, "learning_rate": 4.624145489763058e-06, "loss": 0.1859, "step": 3369 }, { "epoch": 0.5241057542768274, "grad_norm": 0.8837047295349988, "learning_rate": 4.621709528471931e-06, "loss": 0.1552, "step": 3370 }, { "epoch": 0.5242612752721617, "grad_norm": 1.0250354142495839, "learning_rate": 4.619273657484014e-06, "loss": 0.1423, "step": 3371 }, { "epoch": 0.5244167962674962, "grad_norm": 0.898324581299984, "learning_rate": 4.616837877380783e-06, "loss": 0.1104, "step": 3372 }, { "epoch": 0.5245723172628305, "grad_norm": 1.1168733323213924, "learning_rate": 4.614402188743696e-06, "loss": 0.1545, "step": 3373 }, { "epoch": 0.5247278382581648, "grad_norm": 1.0499412323027713, "learning_rate": 4.611966592154184e-06, "loss": 0.1554, "step": 3374 }, { "epoch": 0.5248833592534993, "grad_norm": 0.8066858478824168, "learning_rate": 4.609531088193655e-06, "loss": 0.129, "step": 3375 }, { "epoch": 0.5250388802488336, "grad_norm": 1.2249151404204228, "learning_rate": 4.607095677443504e-06, "loss": 0.1103, "step": 3376 }, { "epoch": 0.5251944012441679, "grad_norm": 1.097964240311843, "learning_rate": 4.6046603604850936e-06, "loss": 0.1796, "step": 3377 }, { "epoch": 0.5253499222395024, "grad_norm": 1.4249814512168046, "learning_rate": 4.6022251378997664e-06, "loss": 0.3237, "step": 3378 }, { "epoch": 0.5255054432348367, "grad_norm": 1.0883476737468214, "learning_rate": 4.599790010268847e-06, "loss": 0.1405, "step": 3379 }, { "epoch": 0.5256609642301711, "grad_norm": 1.3292925658637438, "learning_rate": 4.5973549781736335e-06, "loss": 0.131, "step": 3380 }, { "epoch": 0.5258164852255054, "grad_norm": 0.7549513768907523, "learning_rate": 4.594920042195403e-06, "loss": 0.1923, "step": 3381 }, { "epoch": 0.5259720062208398, "grad_norm": 0.8153893653628606, "learning_rate": 4.5924852029154046e-06, "loss": 0.2148, "step": 3382 }, { "epoch": 0.5261275272161742, "grad_norm": 0.8571065038149022, "learning_rate": 4.590050460914872e-06, "loss": 0.1369, "step": 3383 }, { "epoch": 0.5262830482115085, "grad_norm": 1.4276105530344536, "learning_rate": 4.587615816775014e-06, "loss": 0.1656, "step": 3384 }, { "epoch": 0.526438569206843, "grad_norm": 1.3448064314773458, "learning_rate": 4.585181271077008e-06, "loss": 0.1527, "step": 3385 }, { "epoch": 0.5265940902021773, "grad_norm": 1.5698240900428235, "learning_rate": 4.582746824402019e-06, "loss": 0.2097, "step": 3386 }, { "epoch": 0.5267496111975116, "grad_norm": 1.1598510462723652, "learning_rate": 4.5803124773311825e-06, "loss": 0.1832, "step": 3387 }, { "epoch": 0.5269051321928461, "grad_norm": 0.8100336671971025, "learning_rate": 4.577878230445608e-06, "loss": 0.1718, "step": 3388 }, { "epoch": 0.5270606531881804, "grad_norm": 2.2265780971278484, "learning_rate": 4.575444084326389e-06, "loss": 0.2266, "step": 3389 }, { "epoch": 0.5272161741835147, "grad_norm": 1.1143649140928342, "learning_rate": 4.573010039554587e-06, "loss": 0.1611, "step": 3390 }, { "epoch": 0.5273716951788492, "grad_norm": 0.867281745656266, "learning_rate": 4.570576096711241e-06, "loss": 0.2048, "step": 3391 }, { "epoch": 0.5275272161741835, "grad_norm": 1.0573452368051457, "learning_rate": 4.568142256377373e-06, "loss": 0.4393, "step": 3392 }, { "epoch": 0.5276827371695179, "grad_norm": 0.9650984498895485, "learning_rate": 4.56570851913397e-06, "loss": 0.1955, "step": 3393 }, { "epoch": 0.5278382581648523, "grad_norm": 0.8964129119583738, "learning_rate": 4.563274885561998e-06, "loss": 0.1158, "step": 3394 }, { "epoch": 0.5279937791601866, "grad_norm": 1.0437137472955509, "learning_rate": 4.560841356242402e-06, "loss": 0.1314, "step": 3395 }, { "epoch": 0.528149300155521, "grad_norm": 0.8763860559051468, "learning_rate": 4.5584079317561e-06, "loss": 0.1462, "step": 3396 }, { "epoch": 0.5283048211508554, "grad_norm": 1.3280808784933509, "learning_rate": 4.55597461268398e-06, "loss": 0.167, "step": 3397 }, { "epoch": 0.5284603421461898, "grad_norm": 0.7116789465633148, "learning_rate": 4.553541399606915e-06, "loss": 0.0926, "step": 3398 }, { "epoch": 0.5286158631415241, "grad_norm": 1.2200586805330922, "learning_rate": 4.551108293105743e-06, "loss": 0.1593, "step": 3399 }, { "epoch": 0.5287713841368584, "grad_norm": 1.1061009050494743, "learning_rate": 4.548675293761281e-06, "loss": 0.148, "step": 3400 }, { "epoch": 0.5287713841368584, "eval_loss": 0.1779935359954834, "eval_runtime": 9.4365, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.742, "step": 3400 }, { "epoch": 0.5289269051321929, "grad_norm": 1.378323178205116, "learning_rate": 4.54624240215432e-06, "loss": 0.2306, "step": 3401 }, { "epoch": 0.5290824261275272, "grad_norm": 1.0321233655398419, "learning_rate": 4.5438096188656266e-06, "loss": 0.1128, "step": 3402 }, { "epoch": 0.5292379471228615, "grad_norm": 0.7430648598504694, "learning_rate": 4.541376944475939e-06, "loss": 0.1664, "step": 3403 }, { "epoch": 0.529393468118196, "grad_norm": 1.184719896364667, "learning_rate": 4.538944379565968e-06, "loss": 0.1395, "step": 3404 }, { "epoch": 0.5295489891135303, "grad_norm": 1.2420002742158291, "learning_rate": 4.536511924716406e-06, "loss": 0.2133, "step": 3405 }, { "epoch": 0.5297045101088647, "grad_norm": 1.1403752777455154, "learning_rate": 4.53407958050791e-06, "loss": 0.2407, "step": 3406 }, { "epoch": 0.5298600311041991, "grad_norm": 1.1320318509051028, "learning_rate": 4.531647347521115e-06, "loss": 0.1217, "step": 3407 }, { "epoch": 0.5300155520995334, "grad_norm": 1.2763710363333394, "learning_rate": 4.529215226336631e-06, "loss": 0.0986, "step": 3408 }, { "epoch": 0.5301710730948678, "grad_norm": 0.7945851750923723, "learning_rate": 4.5267832175350385e-06, "loss": 0.1162, "step": 3409 }, { "epoch": 0.5303265940902022, "grad_norm": 1.0347806375334776, "learning_rate": 4.524351321696889e-06, "loss": 0.1194, "step": 3410 }, { "epoch": 0.5304821150855366, "grad_norm": 0.7381259831531662, "learning_rate": 4.521919539402713e-06, "loss": 0.1412, "step": 3411 }, { "epoch": 0.5306376360808709, "grad_norm": 0.9972513981127702, "learning_rate": 4.5194878712330105e-06, "loss": 0.2248, "step": 3412 }, { "epoch": 0.5307931570762053, "grad_norm": 0.796146429287473, "learning_rate": 4.517056317768252e-06, "loss": 0.1398, "step": 3413 }, { "epoch": 0.5309486780715397, "grad_norm": 1.3310822598399847, "learning_rate": 4.514624879588889e-06, "loss": 0.1632, "step": 3414 }, { "epoch": 0.531104199066874, "grad_norm": 2.018328420816252, "learning_rate": 4.512193557275333e-06, "loss": 0.1879, "step": 3415 }, { "epoch": 0.5312597200622085, "grad_norm": 0.7266043704380887, "learning_rate": 4.509762351407978e-06, "loss": 0.1516, "step": 3416 }, { "epoch": 0.5314152410575428, "grad_norm": 0.7723199825778642, "learning_rate": 4.507331262567188e-06, "loss": 0.1602, "step": 3417 }, { "epoch": 0.5315707620528771, "grad_norm": 1.1042234514597622, "learning_rate": 4.504900291333296e-06, "loss": 0.2567, "step": 3418 }, { "epoch": 0.5317262830482115, "grad_norm": 1.3400592999179133, "learning_rate": 4.502469438286607e-06, "loss": 0.1738, "step": 3419 }, { "epoch": 0.5318818040435459, "grad_norm": 1.0076124447959915, "learning_rate": 4.500038704007402e-06, "loss": 0.1474, "step": 3420 }, { "epoch": 0.5320373250388802, "grad_norm": 0.8537730941550392, "learning_rate": 4.497608089075931e-06, "loss": 0.172, "step": 3421 }, { "epoch": 0.5321928460342146, "grad_norm": 0.9195676831193533, "learning_rate": 4.495177594072416e-06, "loss": 0.1229, "step": 3422 }, { "epoch": 0.532348367029549, "grad_norm": 0.8303970115521765, "learning_rate": 4.492747219577047e-06, "loss": 0.1328, "step": 3423 }, { "epoch": 0.5325038880248834, "grad_norm": 0.9854129151314052, "learning_rate": 4.490316966169993e-06, "loss": 0.1865, "step": 3424 }, { "epoch": 0.5326594090202177, "grad_norm": 1.0527440499407796, "learning_rate": 4.487886834431386e-06, "loss": 0.1602, "step": 3425 }, { "epoch": 0.5328149300155521, "grad_norm": 1.025454567692483, "learning_rate": 4.485456824941333e-06, "loss": 0.1745, "step": 3426 }, { "epoch": 0.5329704510108865, "grad_norm": 0.9023642417838998, "learning_rate": 4.4830269382799116e-06, "loss": 0.2688, "step": 3427 }, { "epoch": 0.5331259720062208, "grad_norm": 1.1228898460242833, "learning_rate": 4.48059717502717e-06, "loss": 0.1488, "step": 3428 }, { "epoch": 0.5332814930015553, "grad_norm": 0.9252846775249467, "learning_rate": 4.478167535763124e-06, "loss": 0.115, "step": 3429 }, { "epoch": 0.5334370139968896, "grad_norm": 1.7139498826734287, "learning_rate": 4.475738021067768e-06, "loss": 0.1945, "step": 3430 }, { "epoch": 0.5335925349922239, "grad_norm": 1.1662719153869556, "learning_rate": 4.473308631521055e-06, "loss": 0.1596, "step": 3431 }, { "epoch": 0.5337480559875584, "grad_norm": 1.125448301686904, "learning_rate": 4.4708793677029154e-06, "loss": 0.1492, "step": 3432 }, { "epoch": 0.5339035769828927, "grad_norm": 0.9867844053278368, "learning_rate": 4.468450230193253e-06, "loss": 0.2259, "step": 3433 }, { "epoch": 0.534059097978227, "grad_norm": 1.2912898065650689, "learning_rate": 4.466021219571932e-06, "loss": 0.2567, "step": 3434 }, { "epoch": 0.5342146189735615, "grad_norm": 1.0136321305641505, "learning_rate": 4.463592336418791e-06, "loss": 0.1519, "step": 3435 }, { "epoch": 0.5343701399688958, "grad_norm": 1.452538609131163, "learning_rate": 4.461163581313641e-06, "loss": 0.2091, "step": 3436 }, { "epoch": 0.5345256609642302, "grad_norm": 0.7725378398414232, "learning_rate": 4.458734954836259e-06, "loss": 0.1341, "step": 3437 }, { "epoch": 0.5346811819595645, "grad_norm": 5.733994798554905, "learning_rate": 4.456306457566391e-06, "loss": 0.2324, "step": 3438 }, { "epoch": 0.5348367029548989, "grad_norm": 0.9331606236841765, "learning_rate": 4.453878090083752e-06, "loss": 0.1294, "step": 3439 }, { "epoch": 0.5349922239502333, "grad_norm": 1.2352807850644625, "learning_rate": 4.451449852968031e-06, "loss": 0.138, "step": 3440 }, { "epoch": 0.5351477449455676, "grad_norm": 0.9289640802160856, "learning_rate": 4.449021746798881e-06, "loss": 0.1338, "step": 3441 }, { "epoch": 0.535303265940902, "grad_norm": 0.8725001947740366, "learning_rate": 4.446593772155921e-06, "loss": 0.1299, "step": 3442 }, { "epoch": 0.5354587869362364, "grad_norm": 0.894487472251251, "learning_rate": 4.444165929618745e-06, "loss": 0.1636, "step": 3443 }, { "epoch": 0.5356143079315707, "grad_norm": 0.8467830310480334, "learning_rate": 4.4417382197669155e-06, "loss": 0.0953, "step": 3444 }, { "epoch": 0.5357698289269052, "grad_norm": 1.4511995556214823, "learning_rate": 4.4393106431799544e-06, "loss": 0.2146, "step": 3445 }, { "epoch": 0.5359253499222395, "grad_norm": 1.055731965104341, "learning_rate": 4.436883200437366e-06, "loss": 0.1766, "step": 3446 }, { "epoch": 0.5360808709175738, "grad_norm": 0.9991849773598579, "learning_rate": 4.4344558921186085e-06, "loss": 0.1577, "step": 3447 }, { "epoch": 0.5362363919129083, "grad_norm": 0.8834371273631433, "learning_rate": 4.4320287188031154e-06, "loss": 0.1341, "step": 3448 }, { "epoch": 0.5363919129082426, "grad_norm": 0.9759124169395754, "learning_rate": 4.42960168107029e-06, "loss": 0.1696, "step": 3449 }, { "epoch": 0.536547433903577, "grad_norm": 1.2228384230609555, "learning_rate": 4.427174779499498e-06, "loss": 0.1435, "step": 3450 }, { "epoch": 0.5367029548989114, "grad_norm": 0.7540997764731188, "learning_rate": 4.424748014670072e-06, "loss": 0.0986, "step": 3451 }, { "epoch": 0.5368584758942457, "grad_norm": 1.010621234085382, "learning_rate": 4.422321387161317e-06, "loss": 0.1105, "step": 3452 }, { "epoch": 0.5370139968895801, "grad_norm": 1.2127222674778764, "learning_rate": 4.4198948975525054e-06, "loss": 0.1684, "step": 3453 }, { "epoch": 0.5371695178849145, "grad_norm": 1.0812513006737807, "learning_rate": 4.41746854642287e-06, "loss": 0.1221, "step": 3454 }, { "epoch": 0.5373250388802489, "grad_norm": 1.4312404256636422, "learning_rate": 4.415042334351616e-06, "loss": 0.2518, "step": 3455 }, { "epoch": 0.5374805598755832, "grad_norm": 1.0054470628126586, "learning_rate": 4.4126162619179155e-06, "loss": 0.1326, "step": 3456 }, { "epoch": 0.5376360808709175, "grad_norm": 0.7137266908425585, "learning_rate": 4.410190329700905e-06, "loss": 0.1845, "step": 3457 }, { "epoch": 0.537791601866252, "grad_norm": 0.7620466513770129, "learning_rate": 4.407764538279686e-06, "loss": 0.1314, "step": 3458 }, { "epoch": 0.5379471228615863, "grad_norm": 1.0838138095103957, "learning_rate": 4.405338888233332e-06, "loss": 0.2018, "step": 3459 }, { "epoch": 0.5381026438569206, "grad_norm": 0.8524889745356093, "learning_rate": 4.402913380140878e-06, "loss": 0.1346, "step": 3460 }, { "epoch": 0.5382581648522551, "grad_norm": 0.7768918750727399, "learning_rate": 4.400488014581326e-06, "loss": 0.1187, "step": 3461 }, { "epoch": 0.5384136858475894, "grad_norm": 1.1979654478347088, "learning_rate": 4.398062792133647e-06, "loss": 0.145, "step": 3462 }, { "epoch": 0.5385692068429238, "grad_norm": 1.4488118537409558, "learning_rate": 4.395637713376774e-06, "loss": 0.1741, "step": 3463 }, { "epoch": 0.5387247278382582, "grad_norm": 1.1744555565614856, "learning_rate": 4.393212778889604e-06, "loss": 0.1873, "step": 3464 }, { "epoch": 0.5388802488335925, "grad_norm": 1.1873858358547666, "learning_rate": 4.390787989251009e-06, "loss": 0.2643, "step": 3465 }, { "epoch": 0.5390357698289269, "grad_norm": 0.7635021886521868, "learning_rate": 4.388363345039816e-06, "loss": 0.0978, "step": 3466 }, { "epoch": 0.5391912908242613, "grad_norm": 0.9188704306293071, "learning_rate": 4.38593884683482e-06, "loss": 0.1174, "step": 3467 }, { "epoch": 0.5393468118195957, "grad_norm": 1.2029084772666945, "learning_rate": 4.383514495214787e-06, "loss": 0.1684, "step": 3468 }, { "epoch": 0.53950233281493, "grad_norm": 1.1522114809113218, "learning_rate": 4.3810902907584405e-06, "loss": 0.1105, "step": 3469 }, { "epoch": 0.5396578538102644, "grad_norm": 0.9984256577225797, "learning_rate": 4.378666234044471e-06, "loss": 0.138, "step": 3470 }, { "epoch": 0.5398133748055988, "grad_norm": 1.362086023127549, "learning_rate": 4.376242325651538e-06, "loss": 0.1896, "step": 3471 }, { "epoch": 0.5399688958009331, "grad_norm": 0.9661702479703259, "learning_rate": 4.373818566158261e-06, "loss": 0.1911, "step": 3472 }, { "epoch": 0.5401244167962675, "grad_norm": 0.7860623298554701, "learning_rate": 4.371394956143222e-06, "loss": 0.152, "step": 3473 }, { "epoch": 0.5402799377916019, "grad_norm": 1.07068893903563, "learning_rate": 4.368971496184976e-06, "loss": 0.13, "step": 3474 }, { "epoch": 0.5404354587869362, "grad_norm": 1.121130918916447, "learning_rate": 4.366548186862033e-06, "loss": 0.0962, "step": 3475 }, { "epoch": 0.5405909797822706, "grad_norm": 1.059199469549698, "learning_rate": 4.364125028752872e-06, "loss": 0.1481, "step": 3476 }, { "epoch": 0.540746500777605, "grad_norm": 0.6616272293504313, "learning_rate": 4.3617020224359315e-06, "loss": 0.1754, "step": 3477 }, { "epoch": 0.5409020217729393, "grad_norm": 1.1575431751069025, "learning_rate": 4.359279168489624e-06, "loss": 0.1412, "step": 3478 }, { "epoch": 0.5410575427682737, "grad_norm": 0.863165899551449, "learning_rate": 4.356856467492311e-06, "loss": 0.139, "step": 3479 }, { "epoch": 0.5412130637636081, "grad_norm": 0.913091205030313, "learning_rate": 4.354433920022328e-06, "loss": 0.1663, "step": 3480 }, { "epoch": 0.5413685847589425, "grad_norm": 1.780270529052474, "learning_rate": 4.352011526657972e-06, "loss": 0.2142, "step": 3481 }, { "epoch": 0.5415241057542768, "grad_norm": 0.8290432168000126, "learning_rate": 4.349589287977503e-06, "loss": 0.1456, "step": 3482 }, { "epoch": 0.5416796267496112, "grad_norm": 1.1092900060847333, "learning_rate": 4.347167204559138e-06, "loss": 0.1455, "step": 3483 }, { "epoch": 0.5418351477449456, "grad_norm": 1.7097703388710936, "learning_rate": 4.344745276981067e-06, "loss": 0.1551, "step": 3484 }, { "epoch": 0.5419906687402799, "grad_norm": 2.0015002979858125, "learning_rate": 4.342323505821436e-06, "loss": 0.2234, "step": 3485 }, { "epoch": 0.5421461897356143, "grad_norm": 1.5427380233126213, "learning_rate": 4.3399018916583554e-06, "loss": 0.1815, "step": 3486 }, { "epoch": 0.5423017107309487, "grad_norm": 0.998387793928521, "learning_rate": 4.3374804350698995e-06, "loss": 0.1432, "step": 3487 }, { "epoch": 0.542457231726283, "grad_norm": 1.4670853755127147, "learning_rate": 4.335059136634102e-06, "loss": 0.1887, "step": 3488 }, { "epoch": 0.5426127527216175, "grad_norm": 0.97701948354993, "learning_rate": 4.332637996928961e-06, "loss": 0.1505, "step": 3489 }, { "epoch": 0.5427682737169518, "grad_norm": 0.8911561423290217, "learning_rate": 4.3302170165324385e-06, "loss": 0.1385, "step": 3490 }, { "epoch": 0.5429237947122861, "grad_norm": 0.9272358672921999, "learning_rate": 4.327796196022454e-06, "loss": 0.2052, "step": 3491 }, { "epoch": 0.5430793157076206, "grad_norm": 1.5130655449837096, "learning_rate": 4.32537553597689e-06, "loss": 0.1109, "step": 3492 }, { "epoch": 0.5432348367029549, "grad_norm": 1.0911926320005798, "learning_rate": 4.3229550369735965e-06, "loss": 0.1636, "step": 3493 }, { "epoch": 0.5433903576982893, "grad_norm": 0.8448925103501249, "learning_rate": 4.320534699590377e-06, "loss": 0.1204, "step": 3494 }, { "epoch": 0.5435458786936236, "grad_norm": 0.9115482859403481, "learning_rate": 4.318114524404999e-06, "loss": 0.1587, "step": 3495 }, { "epoch": 0.543701399688958, "grad_norm": 1.370143689843469, "learning_rate": 4.315694511995192e-06, "loss": 0.1952, "step": 3496 }, { "epoch": 0.5438569206842924, "grad_norm": 1.2223824498945621, "learning_rate": 4.31327466293865e-06, "loss": 0.1765, "step": 3497 }, { "epoch": 0.5440124416796267, "grad_norm": 0.9526278813260499, "learning_rate": 4.310854977813023e-06, "loss": 0.1374, "step": 3498 }, { "epoch": 0.5441679626749611, "grad_norm": 1.134897664889009, "learning_rate": 4.3084354571959216e-06, "loss": 0.1493, "step": 3499 }, { "epoch": 0.5443234836702955, "grad_norm": 2.2627655657798758, "learning_rate": 4.306016101664921e-06, "loss": 0.1365, "step": 3500 }, { "epoch": 0.5443234836702955, "eval_loss": 0.17789840698242188, "eval_runtime": 9.4207, "eval_samples_per_second": 2.76, "eval_steps_per_second": 0.743, "step": 3500 }, { "epoch": 0.5444790046656298, "grad_norm": 0.9235387409285072, "learning_rate": 4.303596911797556e-06, "loss": 0.1152, "step": 3501 }, { "epoch": 0.5446345256609643, "grad_norm": 0.9556104407140926, "learning_rate": 4.301177888171318e-06, "loss": 0.2012, "step": 3502 }, { "epoch": 0.5447900466562986, "grad_norm": 1.269748208708768, "learning_rate": 4.2987590313636655e-06, "loss": 0.0948, "step": 3503 }, { "epoch": 0.5449455676516329, "grad_norm": 0.7634822282795009, "learning_rate": 4.296340341952011e-06, "loss": 0.1276, "step": 3504 }, { "epoch": 0.5451010886469674, "grad_norm": 0.8801433928673349, "learning_rate": 4.293921820513728e-06, "loss": 0.1519, "step": 3505 }, { "epoch": 0.5452566096423017, "grad_norm": 0.9950122128270511, "learning_rate": 4.291503467626155e-06, "loss": 0.1302, "step": 3506 }, { "epoch": 0.545412130637636, "grad_norm": 1.0623136402314086, "learning_rate": 4.289085283866584e-06, "loss": 0.1491, "step": 3507 }, { "epoch": 0.5455676516329705, "grad_norm": 0.9716522329126454, "learning_rate": 4.286667269812269e-06, "loss": 0.1922, "step": 3508 }, { "epoch": 0.5457231726283048, "grad_norm": 0.9931912669922166, "learning_rate": 4.284249426040425e-06, "loss": 0.1599, "step": 3509 }, { "epoch": 0.5458786936236392, "grad_norm": 1.2147706975543748, "learning_rate": 4.281831753128226e-06, "loss": 0.2058, "step": 3510 }, { "epoch": 0.5460342146189736, "grad_norm": 1.0947512978195884, "learning_rate": 4.2794142516528e-06, "loss": 0.1392, "step": 3511 }, { "epoch": 0.546189735614308, "grad_norm": 1.7375417172762018, "learning_rate": 4.276996922191243e-06, "loss": 0.2257, "step": 3512 }, { "epoch": 0.5463452566096423, "grad_norm": 0.8952483021355166, "learning_rate": 4.2745797653206035e-06, "loss": 0.2083, "step": 3513 }, { "epoch": 0.5465007776049766, "grad_norm": 1.0609132446698366, "learning_rate": 4.272162781617891e-06, "loss": 0.154, "step": 3514 }, { "epoch": 0.5466562986003111, "grad_norm": 1.0459193460866134, "learning_rate": 4.26974597166007e-06, "loss": 0.188, "step": 3515 }, { "epoch": 0.5468118195956454, "grad_norm": 0.8023800454093994, "learning_rate": 4.2673293360240705e-06, "loss": 0.1493, "step": 3516 }, { "epoch": 0.5469673405909797, "grad_norm": 0.8150097500096768, "learning_rate": 4.264912875286776e-06, "loss": 0.1517, "step": 3517 }, { "epoch": 0.5471228615863142, "grad_norm": 1.4246409260351822, "learning_rate": 4.262496590025029e-06, "loss": 0.2214, "step": 3518 }, { "epoch": 0.5472783825816485, "grad_norm": 0.8441622779994482, "learning_rate": 4.260080480815631e-06, "loss": 0.1514, "step": 3519 }, { "epoch": 0.5474339035769828, "grad_norm": 0.811539590115903, "learning_rate": 4.257664548235341e-06, "loss": 0.0992, "step": 3520 }, { "epoch": 0.5475894245723173, "grad_norm": 1.0725339926784199, "learning_rate": 4.255248792860875e-06, "loss": 0.1544, "step": 3521 }, { "epoch": 0.5477449455676516, "grad_norm": 0.7985346245451533, "learning_rate": 4.252833215268909e-06, "loss": 0.0924, "step": 3522 }, { "epoch": 0.547900466562986, "grad_norm": 0.7048608064522983, "learning_rate": 4.250417816036074e-06, "loss": 0.1468, "step": 3523 }, { "epoch": 0.5480559875583204, "grad_norm": 1.3705622393216101, "learning_rate": 4.2480025957389575e-06, "loss": 0.1641, "step": 3524 }, { "epoch": 0.5482115085536547, "grad_norm": 0.8454634815940512, "learning_rate": 4.245587554954111e-06, "loss": 0.1866, "step": 3525 }, { "epoch": 0.5483670295489891, "grad_norm": 1.9158060414792306, "learning_rate": 4.243172694258036e-06, "loss": 0.3067, "step": 3526 }, { "epoch": 0.5485225505443235, "grad_norm": 0.7894854142305008, "learning_rate": 4.240758014227191e-06, "loss": 0.1488, "step": 3527 }, { "epoch": 0.5486780715396579, "grad_norm": 0.9316664552923821, "learning_rate": 4.238343515437997e-06, "loss": 0.1338, "step": 3528 }, { "epoch": 0.5488335925349922, "grad_norm": 1.4860071501177696, "learning_rate": 4.2359291984668284e-06, "loss": 0.133, "step": 3529 }, { "epoch": 0.5489891135303266, "grad_norm": 0.8385662193240171, "learning_rate": 4.233515063890013e-06, "loss": 0.1205, "step": 3530 }, { "epoch": 0.549144634525661, "grad_norm": 1.0397063353712808, "learning_rate": 4.231101112283844e-06, "loss": 0.1214, "step": 3531 }, { "epoch": 0.5493001555209953, "grad_norm": 0.9207695137608602, "learning_rate": 4.22868734422456e-06, "loss": 0.1951, "step": 3532 }, { "epoch": 0.5494556765163296, "grad_norm": 0.8642295036316828, "learning_rate": 4.226273760288365e-06, "loss": 0.1473, "step": 3533 }, { "epoch": 0.5496111975116641, "grad_norm": 0.980691601201511, "learning_rate": 4.2238603610514115e-06, "loss": 0.1191, "step": 3534 }, { "epoch": 0.5497667185069984, "grad_norm": 1.1348658831074285, "learning_rate": 4.221447147089815e-06, "loss": 0.1734, "step": 3535 }, { "epoch": 0.5499222395023328, "grad_norm": 1.4181565169196229, "learning_rate": 4.219034118979641e-06, "loss": 0.1469, "step": 3536 }, { "epoch": 0.5500777604976672, "grad_norm": 1.0443843083427304, "learning_rate": 4.2166212772969126e-06, "loss": 0.1848, "step": 3537 }, { "epoch": 0.5502332814930015, "grad_norm": 0.6976371183983389, "learning_rate": 4.214208622617612e-06, "loss": 0.1334, "step": 3538 }, { "epoch": 0.5503888024883359, "grad_norm": 1.1216525590746746, "learning_rate": 4.211796155517671e-06, "loss": 0.1963, "step": 3539 }, { "epoch": 0.5505443234836703, "grad_norm": 1.5468774506773288, "learning_rate": 4.209383876572977e-06, "loss": 0.2326, "step": 3540 }, { "epoch": 0.5506998444790047, "grad_norm": 1.1630574288123443, "learning_rate": 4.2069717863593805e-06, "loss": 0.1217, "step": 3541 }, { "epoch": 0.550855365474339, "grad_norm": 0.9696598730642263, "learning_rate": 4.204559885452678e-06, "loss": 0.243, "step": 3542 }, { "epoch": 0.5510108864696734, "grad_norm": 1.007409523506963, "learning_rate": 4.202148174428621e-06, "loss": 0.1818, "step": 3543 }, { "epoch": 0.5511664074650078, "grad_norm": 0.8860874836866308, "learning_rate": 4.199736653862922e-06, "loss": 0.2201, "step": 3544 }, { "epoch": 0.5513219284603421, "grad_norm": 1.1378071776829635, "learning_rate": 4.197325324331245e-06, "loss": 0.1827, "step": 3545 }, { "epoch": 0.5514774494556766, "grad_norm": 1.4967033976480877, "learning_rate": 4.194914186409205e-06, "loss": 0.2344, "step": 3546 }, { "epoch": 0.5516329704510109, "grad_norm": 0.9064733065679419, "learning_rate": 4.192503240672378e-06, "loss": 0.1315, "step": 3547 }, { "epoch": 0.5517884914463452, "grad_norm": 1.1647224326560612, "learning_rate": 4.190092487696288e-06, "loss": 0.1023, "step": 3548 }, { "epoch": 0.5519440124416797, "grad_norm": 1.1404867352483787, "learning_rate": 4.1876819280564135e-06, "loss": 0.1608, "step": 3549 }, { "epoch": 0.552099533437014, "grad_norm": 1.0300125553551307, "learning_rate": 4.1852715623281934e-06, "loss": 0.1633, "step": 3550 }, { "epoch": 0.5522550544323483, "grad_norm": 1.029134188932107, "learning_rate": 4.182861391087014e-06, "loss": 0.1714, "step": 3551 }, { "epoch": 0.5524105754276827, "grad_norm": 0.6082825364894174, "learning_rate": 4.1804514149082135e-06, "loss": 0.1175, "step": 3552 }, { "epoch": 0.5525660964230171, "grad_norm": 0.9251322103501135, "learning_rate": 4.1780416343670885e-06, "loss": 0.1351, "step": 3553 }, { "epoch": 0.5527216174183515, "grad_norm": 1.235762008864234, "learning_rate": 4.175632050038891e-06, "loss": 0.2528, "step": 3554 }, { "epoch": 0.5528771384136858, "grad_norm": 0.8572954742002363, "learning_rate": 4.173222662498817e-06, "loss": 0.1336, "step": 3555 }, { "epoch": 0.5530326594090202, "grad_norm": 1.0065031839934107, "learning_rate": 4.170813472322024e-06, "loss": 0.194, "step": 3556 }, { "epoch": 0.5531881804043546, "grad_norm": 1.171989365169518, "learning_rate": 4.168404480083619e-06, "loss": 0.1561, "step": 3557 }, { "epoch": 0.5533437013996889, "grad_norm": 0.7397528119411392, "learning_rate": 4.165995686358663e-06, "loss": 0.1568, "step": 3558 }, { "epoch": 0.5534992223950234, "grad_norm": 0.9226577640160962, "learning_rate": 4.163587091722164e-06, "loss": 0.18, "step": 3559 }, { "epoch": 0.5536547433903577, "grad_norm": 0.5343762332589049, "learning_rate": 4.161178696749092e-06, "loss": 0.0887, "step": 3560 }, { "epoch": 0.553810264385692, "grad_norm": 0.8117924496068853, "learning_rate": 4.158770502014364e-06, "loss": 0.1786, "step": 3561 }, { "epoch": 0.5539657853810265, "grad_norm": 0.9603931021643153, "learning_rate": 4.156362508092846e-06, "loss": 0.2124, "step": 3562 }, { "epoch": 0.5541213063763608, "grad_norm": 0.9142172430957958, "learning_rate": 4.153954715559365e-06, "loss": 0.134, "step": 3563 }, { "epoch": 0.5542768273716951, "grad_norm": 1.0627981233872563, "learning_rate": 4.151547124988689e-06, "loss": 0.225, "step": 3564 }, { "epoch": 0.5544323483670296, "grad_norm": 0.7911143907825706, "learning_rate": 4.149139736955547e-06, "loss": 0.1608, "step": 3565 }, { "epoch": 0.5545878693623639, "grad_norm": 1.5283046526974018, "learning_rate": 4.146732552034618e-06, "loss": 0.1345, "step": 3566 }, { "epoch": 0.5547433903576983, "grad_norm": 1.2424171386065743, "learning_rate": 4.144325570800527e-06, "loss": 0.1752, "step": 3567 }, { "epoch": 0.5548989113530327, "grad_norm": 1.2817667838627587, "learning_rate": 4.1419187938278536e-06, "loss": 0.2556, "step": 3568 }, { "epoch": 0.555054432348367, "grad_norm": 1.0297177626455964, "learning_rate": 4.139512221691131e-06, "loss": 0.1432, "step": 3569 }, { "epoch": 0.5552099533437014, "grad_norm": 1.2177112714891503, "learning_rate": 4.1371058549648425e-06, "loss": 0.1673, "step": 3570 }, { "epoch": 0.5553654743390358, "grad_norm": 0.7288978920393502, "learning_rate": 4.134699694223417e-06, "loss": 0.0744, "step": 3571 }, { "epoch": 0.5555209953343702, "grad_norm": 1.5457655238228813, "learning_rate": 4.132293740041244e-06, "loss": 0.1857, "step": 3572 }, { "epoch": 0.5556765163297045, "grad_norm": 1.3266613004678893, "learning_rate": 4.129887992992655e-06, "loss": 0.2255, "step": 3573 }, { "epoch": 0.5558320373250388, "grad_norm": 0.6769893528493952, "learning_rate": 4.127482453651938e-06, "loss": 0.2042, "step": 3574 }, { "epoch": 0.5559875583203733, "grad_norm": 0.9479469533371994, "learning_rate": 4.125077122593324e-06, "loss": 0.1112, "step": 3575 }, { "epoch": 0.5561430793157076, "grad_norm": 0.9245556777292917, "learning_rate": 4.1226720003910035e-06, "loss": 0.1512, "step": 3576 }, { "epoch": 0.556298600311042, "grad_norm": 0.8327936087938094, "learning_rate": 4.120267087619112e-06, "loss": 0.1684, "step": 3577 }, { "epoch": 0.5564541213063764, "grad_norm": 1.0065080492171525, "learning_rate": 4.117862384851733e-06, "loss": 0.0937, "step": 3578 }, { "epoch": 0.5566096423017107, "grad_norm": 1.2129115601868743, "learning_rate": 4.1154578926629066e-06, "loss": 0.1995, "step": 3579 }, { "epoch": 0.5567651632970451, "grad_norm": 1.6250613695880802, "learning_rate": 4.1130536116266155e-06, "loss": 0.2259, "step": 3580 }, { "epoch": 0.5569206842923795, "grad_norm": 0.6845164730720171, "learning_rate": 4.110649542316795e-06, "loss": 0.1585, "step": 3581 }, { "epoch": 0.5570762052877138, "grad_norm": 1.2442646018724761, "learning_rate": 4.1082456853073335e-06, "loss": 0.1912, "step": 3582 }, { "epoch": 0.5572317262830482, "grad_norm": 1.2971512816742006, "learning_rate": 4.105842041172063e-06, "loss": 0.1624, "step": 3583 }, { "epoch": 0.5573872472783826, "grad_norm": 1.1440365227886058, "learning_rate": 4.103438610484764e-06, "loss": 0.1834, "step": 3584 }, { "epoch": 0.557542768273717, "grad_norm": 0.9512772980127675, "learning_rate": 4.101035393819174e-06, "loss": 0.1291, "step": 3585 }, { "epoch": 0.5576982892690513, "grad_norm": 1.0695581345099938, "learning_rate": 4.098632391748971e-06, "loss": 0.1711, "step": 3586 }, { "epoch": 0.5578538102643857, "grad_norm": 1.520547036314969, "learning_rate": 4.096229604847786e-06, "loss": 0.1496, "step": 3587 }, { "epoch": 0.5580093312597201, "grad_norm": 1.1750100614433587, "learning_rate": 4.093827033689198e-06, "loss": 0.1542, "step": 3588 }, { "epoch": 0.5581648522550544, "grad_norm": 1.5645995534828308, "learning_rate": 4.091424678846735e-06, "loss": 0.1539, "step": 3589 }, { "epoch": 0.5583203732503889, "grad_norm": 0.921604219854687, "learning_rate": 4.089022540893871e-06, "loss": 0.1406, "step": 3590 }, { "epoch": 0.5584758942457232, "grad_norm": 3.7701936155356774, "learning_rate": 4.086620620404031e-06, "loss": 0.2009, "step": 3591 }, { "epoch": 0.5586314152410575, "grad_norm": 1.4180247449430712, "learning_rate": 4.084218917950586e-06, "loss": 0.1695, "step": 3592 }, { "epoch": 0.5587869362363919, "grad_norm": 1.4171785508880435, "learning_rate": 4.081817434106857e-06, "loss": 0.2111, "step": 3593 }, { "epoch": 0.5589424572317263, "grad_norm": 1.9079210187671585, "learning_rate": 4.0794161694461085e-06, "loss": 0.1747, "step": 3594 }, { "epoch": 0.5590979782270606, "grad_norm": 1.2739445697704825, "learning_rate": 4.077015124541561e-06, "loss": 0.1233, "step": 3595 }, { "epoch": 0.559253499222395, "grad_norm": 1.1879813897511688, "learning_rate": 4.074614299966374e-06, "loss": 0.139, "step": 3596 }, { "epoch": 0.5594090202177294, "grad_norm": 1.323159128324349, "learning_rate": 4.072213696293657e-06, "loss": 0.185, "step": 3597 }, { "epoch": 0.5595645412130638, "grad_norm": 1.039346869376028, "learning_rate": 4.069813314096472e-06, "loss": 0.1714, "step": 3598 }, { "epoch": 0.5597200622083981, "grad_norm": 1.2408298176930626, "learning_rate": 4.0674131539478205e-06, "loss": 0.2445, "step": 3599 }, { "epoch": 0.5598755832037325, "grad_norm": 0.9267017566474497, "learning_rate": 4.0650132164206515e-06, "loss": 0.2491, "step": 3600 }, { "epoch": 0.5598755832037325, "eval_loss": 0.1727723628282547, "eval_runtime": 9.4415, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 3600 }, { "epoch": 0.5600311041990669, "grad_norm": 1.2697677656765547, "learning_rate": 4.062613502087869e-06, "loss": 0.1621, "step": 3601 }, { "epoch": 0.5601866251944012, "grad_norm": 1.2214004073240445, "learning_rate": 4.060214011522316e-06, "loss": 0.1372, "step": 3602 }, { "epoch": 0.5603421461897357, "grad_norm": 1.3333004497789398, "learning_rate": 4.0578147452967825e-06, "loss": 0.1817, "step": 3603 }, { "epoch": 0.56049766718507, "grad_norm": 1.0896692432139203, "learning_rate": 4.055415703984009e-06, "loss": 0.2541, "step": 3604 }, { "epoch": 0.5606531881804043, "grad_norm": 1.1868751232820338, "learning_rate": 4.053016888156682e-06, "loss": 0.1573, "step": 3605 }, { "epoch": 0.5608087091757388, "grad_norm": 0.9680012136802153, "learning_rate": 4.050618298387427e-06, "loss": 0.1452, "step": 3606 }, { "epoch": 0.5609642301710731, "grad_norm": 1.0662872845433369, "learning_rate": 4.048219935248826e-06, "loss": 0.1189, "step": 3607 }, { "epoch": 0.5611197511664074, "grad_norm": 0.7373117431769676, "learning_rate": 4.045821799313397e-06, "loss": 0.1025, "step": 3608 }, { "epoch": 0.5612752721617419, "grad_norm": 0.9100459508294232, "learning_rate": 4.043423891153612e-06, "loss": 0.1332, "step": 3609 }, { "epoch": 0.5614307931570762, "grad_norm": 1.3935699292363493, "learning_rate": 4.041026211341886e-06, "loss": 0.1943, "step": 3610 }, { "epoch": 0.5615863141524106, "grad_norm": 1.1387023276285604, "learning_rate": 4.038628760450576e-06, "loss": 0.1938, "step": 3611 }, { "epoch": 0.5617418351477449, "grad_norm": 1.0865500103493115, "learning_rate": 4.036231539051986e-06, "loss": 0.1995, "step": 3612 }, { "epoch": 0.5618973561430793, "grad_norm": 1.379608252279679, "learning_rate": 4.033834547718368e-06, "loss": 0.2018, "step": 3613 }, { "epoch": 0.5620528771384137, "grad_norm": 0.8629743788845935, "learning_rate": 4.031437787021919e-06, "loss": 0.078, "step": 3614 }, { "epoch": 0.562208398133748, "grad_norm": 0.8436724702699211, "learning_rate": 4.029041257534777e-06, "loss": 0.1221, "step": 3615 }, { "epoch": 0.5623639191290825, "grad_norm": 1.0304279168312362, "learning_rate": 4.026644959829025e-06, "loss": 0.1304, "step": 3616 }, { "epoch": 0.5625194401244168, "grad_norm": 0.6964350697376791, "learning_rate": 4.024248894476697e-06, "loss": 0.1189, "step": 3617 }, { "epoch": 0.5626749611197511, "grad_norm": 0.9656458874239894, "learning_rate": 4.021853062049765e-06, "loss": 0.1385, "step": 3618 }, { "epoch": 0.5628304821150856, "grad_norm": 0.8018963393884337, "learning_rate": 4.019457463120146e-06, "loss": 0.1307, "step": 3619 }, { "epoch": 0.5629860031104199, "grad_norm": 1.4227779419636695, "learning_rate": 4.017062098259707e-06, "loss": 0.1974, "step": 3620 }, { "epoch": 0.5631415241057542, "grad_norm": 0.8192792032072828, "learning_rate": 4.014666968040252e-06, "loss": 0.1053, "step": 3621 }, { "epoch": 0.5632970451010887, "grad_norm": 0.9041391424092449, "learning_rate": 4.012272073033532e-06, "loss": 0.1433, "step": 3622 }, { "epoch": 0.563452566096423, "grad_norm": 0.7807696973579988, "learning_rate": 4.009877413811244e-06, "loss": 0.0818, "step": 3623 }, { "epoch": 0.5636080870917574, "grad_norm": 0.9061248882439153, "learning_rate": 4.007482990945023e-06, "loss": 0.1556, "step": 3624 }, { "epoch": 0.5637636080870918, "grad_norm": 1.2040353117225882, "learning_rate": 4.005088805006454e-06, "loss": 0.2069, "step": 3625 }, { "epoch": 0.5639191290824261, "grad_norm": 0.8876541152317877, "learning_rate": 4.002694856567063e-06, "loss": 0.2165, "step": 3626 }, { "epoch": 0.5640746500777605, "grad_norm": 1.1845353572534625, "learning_rate": 4.000301146198318e-06, "loss": 0.1811, "step": 3627 }, { "epoch": 0.5642301710730949, "grad_norm": 0.9568946776737602, "learning_rate": 3.997907674471628e-06, "loss": 0.125, "step": 3628 }, { "epoch": 0.5643856920684293, "grad_norm": 1.3632524572936475, "learning_rate": 3.995514441958352e-06, "loss": 0.1965, "step": 3629 }, { "epoch": 0.5645412130637636, "grad_norm": 1.1045208792310555, "learning_rate": 3.9931214492297875e-06, "loss": 0.1371, "step": 3630 }, { "epoch": 0.5646967340590979, "grad_norm": 0.8544301508689304, "learning_rate": 3.990728696857175e-06, "loss": 0.169, "step": 3631 }, { "epoch": 0.5648522550544324, "grad_norm": 0.9974210321307325, "learning_rate": 3.988336185411695e-06, "loss": 0.1383, "step": 3632 }, { "epoch": 0.5650077760497667, "grad_norm": 0.8928515051519395, "learning_rate": 3.985943915464477e-06, "loss": 0.1679, "step": 3633 }, { "epoch": 0.565163297045101, "grad_norm": 1.5423769693801643, "learning_rate": 3.983551887586589e-06, "loss": 0.211, "step": 3634 }, { "epoch": 0.5653188180404355, "grad_norm": 1.2244491841898106, "learning_rate": 3.981160102349037e-06, "loss": 0.1606, "step": 3635 }, { "epoch": 0.5654743390357698, "grad_norm": 0.88801559879881, "learning_rate": 3.97876856032278e-06, "loss": 0.1483, "step": 3636 }, { "epoch": 0.5656298600311042, "grad_norm": 1.088639677153151, "learning_rate": 3.9763772620787085e-06, "loss": 0.1617, "step": 3637 }, { "epoch": 0.5657853810264386, "grad_norm": 0.9145955592139379, "learning_rate": 3.973986208187658e-06, "loss": 0.1457, "step": 3638 }, { "epoch": 0.5659409020217729, "grad_norm": 1.047497392237993, "learning_rate": 3.97159539922041e-06, "loss": 0.1412, "step": 3639 }, { "epoch": 0.5660964230171073, "grad_norm": 1.3725832065680268, "learning_rate": 3.969204835747681e-06, "loss": 0.1853, "step": 3640 }, { "epoch": 0.5662519440124417, "grad_norm": 1.1071348472026845, "learning_rate": 3.9668145183401305e-06, "loss": 0.2129, "step": 3641 }, { "epoch": 0.5664074650077761, "grad_norm": 0.6784660696817704, "learning_rate": 3.964424447568367e-06, "loss": 0.1291, "step": 3642 }, { "epoch": 0.5665629860031104, "grad_norm": 0.8197233181537713, "learning_rate": 3.962034624002927e-06, "loss": 0.1093, "step": 3643 }, { "epoch": 0.5667185069984448, "grad_norm": 1.0660296603951362, "learning_rate": 3.959645048214296e-06, "loss": 0.1177, "step": 3644 }, { "epoch": 0.5668740279937792, "grad_norm": 1.1334251531265485, "learning_rate": 3.957255720772902e-06, "loss": 0.1014, "step": 3645 }, { "epoch": 0.5670295489891135, "grad_norm": 1.461577905710015, "learning_rate": 3.95486664224911e-06, "loss": 0.1277, "step": 3646 }, { "epoch": 0.567185069984448, "grad_norm": 1.140952908286075, "learning_rate": 3.9524778132132225e-06, "loss": 0.2314, "step": 3647 }, { "epoch": 0.5673405909797823, "grad_norm": 1.0601622876629047, "learning_rate": 3.9500892342354915e-06, "loss": 0.1179, "step": 3648 }, { "epoch": 0.5674961119751166, "grad_norm": 1.2952083465810234, "learning_rate": 3.9477009058861e-06, "loss": 0.0989, "step": 3649 }, { "epoch": 0.567651632970451, "grad_norm": 0.896546727451225, "learning_rate": 3.945312828735179e-06, "loss": 0.1741, "step": 3650 }, { "epoch": 0.5678071539657854, "grad_norm": 2.0138233984437695, "learning_rate": 3.942925003352793e-06, "loss": 0.1909, "step": 3651 }, { "epoch": 0.5679626749611197, "grad_norm": 1.0808460377604518, "learning_rate": 3.94053743030895e-06, "loss": 0.1026, "step": 3652 }, { "epoch": 0.5681181959564541, "grad_norm": 1.0038403239186084, "learning_rate": 3.938150110173597e-06, "loss": 0.2052, "step": 3653 }, { "epoch": 0.5682737169517885, "grad_norm": 1.8227114132920494, "learning_rate": 3.93576304351662e-06, "loss": 0.1382, "step": 3654 }, { "epoch": 0.5684292379471229, "grad_norm": 0.8700647544927617, "learning_rate": 3.933376230907848e-06, "loss": 0.0834, "step": 3655 }, { "epoch": 0.5685847589424572, "grad_norm": 0.6929490065447557, "learning_rate": 3.930989672917043e-06, "loss": 0.1444, "step": 3656 }, { "epoch": 0.5687402799377916, "grad_norm": 1.244283029782614, "learning_rate": 3.92860337011391e-06, "loss": 0.1887, "step": 3657 }, { "epoch": 0.568895800933126, "grad_norm": 0.9434053558855834, "learning_rate": 3.9262173230680956e-06, "loss": 0.1428, "step": 3658 }, { "epoch": 0.5690513219284603, "grad_norm": 0.9884348717970942, "learning_rate": 3.92383153234918e-06, "loss": 0.1176, "step": 3659 }, { "epoch": 0.5692068429237948, "grad_norm": 1.043755730687112, "learning_rate": 3.921445998526684e-06, "loss": 0.2048, "step": 3660 }, { "epoch": 0.5693623639191291, "grad_norm": 1.4098267376695979, "learning_rate": 3.919060722170071e-06, "loss": 0.1294, "step": 3661 }, { "epoch": 0.5695178849144634, "grad_norm": 1.0867934657865237, "learning_rate": 3.916675703848737e-06, "loss": 0.1406, "step": 3662 }, { "epoch": 0.5696734059097979, "grad_norm": 1.4632218544084636, "learning_rate": 3.914290944132019e-06, "loss": 0.1656, "step": 3663 }, { "epoch": 0.5698289269051322, "grad_norm": 1.0066234788421988, "learning_rate": 3.911906443589195e-06, "loss": 0.1235, "step": 3664 }, { "epoch": 0.5699844479004665, "grad_norm": 1.4593055856662336, "learning_rate": 3.9095222027894765e-06, "loss": 0.1687, "step": 3665 }, { "epoch": 0.570139968895801, "grad_norm": 1.1595590000277824, "learning_rate": 3.907138222302014e-06, "loss": 0.1216, "step": 3666 }, { "epoch": 0.5702954898911353, "grad_norm": 1.502215454651086, "learning_rate": 3.9047545026959e-06, "loss": 0.1518, "step": 3667 }, { "epoch": 0.5704510108864697, "grad_norm": 0.9424570992144677, "learning_rate": 3.902371044540159e-06, "loss": 0.1308, "step": 3668 }, { "epoch": 0.570606531881804, "grad_norm": 1.1765411856771795, "learning_rate": 3.8999878484037585e-06, "loss": 0.128, "step": 3669 }, { "epoch": 0.5707620528771384, "grad_norm": 0.902612181240845, "learning_rate": 3.897604914855596e-06, "loss": 0.1751, "step": 3670 }, { "epoch": 0.5709175738724728, "grad_norm": 0.8779645677672798, "learning_rate": 3.895222244464516e-06, "loss": 0.2275, "step": 3671 }, { "epoch": 0.5710730948678071, "grad_norm": 1.3279810493140813, "learning_rate": 3.892839837799292e-06, "loss": 0.1242, "step": 3672 }, { "epoch": 0.5712286158631416, "grad_norm": 1.0650678480078895, "learning_rate": 3.8904576954286374e-06, "loss": 0.1398, "step": 3673 }, { "epoch": 0.5713841368584759, "grad_norm": 1.433890163514264, "learning_rate": 3.888075817921207e-06, "loss": 0.209, "step": 3674 }, { "epoch": 0.5715396578538102, "grad_norm": 1.0232174186155516, "learning_rate": 3.885694205845584e-06, "loss": 0.159, "step": 3675 }, { "epoch": 0.5716951788491447, "grad_norm": 1.03986564296833, "learning_rate": 3.883312859770292e-06, "loss": 0.1038, "step": 3676 }, { "epoch": 0.571850699844479, "grad_norm": 1.0116734684191957, "learning_rate": 3.880931780263793e-06, "loss": 0.1665, "step": 3677 }, { "epoch": 0.5720062208398133, "grad_norm": 1.2163089590640237, "learning_rate": 3.878550967894486e-06, "loss": 0.2141, "step": 3678 }, { "epoch": 0.5721617418351478, "grad_norm": 0.9794487238864354, "learning_rate": 3.876170423230701e-06, "loss": 0.1294, "step": 3679 }, { "epoch": 0.5723172628304821, "grad_norm": 1.0908720074965834, "learning_rate": 3.873790146840709e-06, "loss": 0.1546, "step": 3680 }, { "epoch": 0.5724727838258165, "grad_norm": 0.8592082540583624, "learning_rate": 3.871410139292714e-06, "loss": 0.1706, "step": 3681 }, { "epoch": 0.5726283048211509, "grad_norm": 0.9741435424482981, "learning_rate": 3.8690304011548565e-06, "loss": 0.1526, "step": 3682 }, { "epoch": 0.5727838258164852, "grad_norm": 1.4252696296871674, "learning_rate": 3.866650932995216e-06, "loss": 0.1832, "step": 3683 }, { "epoch": 0.5729393468118196, "grad_norm": 1.2744027065699222, "learning_rate": 3.864271735381802e-06, "loss": 0.2915, "step": 3684 }, { "epoch": 0.573094867807154, "grad_norm": 0.8761473066852364, "learning_rate": 3.861892808882563e-06, "loss": 0.1105, "step": 3685 }, { "epoch": 0.5732503888024884, "grad_norm": 1.3010539079196122, "learning_rate": 3.859514154065382e-06, "loss": 0.1621, "step": 3686 }, { "epoch": 0.5734059097978227, "grad_norm": 1.4787117748557248, "learning_rate": 3.857135771498078e-06, "loss": 0.2123, "step": 3687 }, { "epoch": 0.573561430793157, "grad_norm": 1.2633900752208251, "learning_rate": 3.854757661748402e-06, "loss": 0.1022, "step": 3688 }, { "epoch": 0.5737169517884915, "grad_norm": 1.128889252182744, "learning_rate": 3.852379825384043e-06, "loss": 0.2104, "step": 3689 }, { "epoch": 0.5738724727838258, "grad_norm": 1.1114441550462724, "learning_rate": 3.8500022629726246e-06, "loss": 0.2289, "step": 3690 }, { "epoch": 0.5740279937791601, "grad_norm": 0.6862476145127174, "learning_rate": 3.847624975081704e-06, "loss": 0.1694, "step": 3691 }, { "epoch": 0.5741835147744946, "grad_norm": 1.1315128810686426, "learning_rate": 3.845247962278771e-06, "loss": 0.1612, "step": 3692 }, { "epoch": 0.5743390357698289, "grad_norm": 1.1072393976177457, "learning_rate": 3.842871225131252e-06, "loss": 0.1169, "step": 3693 }, { "epoch": 0.5744945567651633, "grad_norm": 1.240204325937349, "learning_rate": 3.840494764206511e-06, "loss": 0.1537, "step": 3694 }, { "epoch": 0.5746500777604977, "grad_norm": 1.0315984377178762, "learning_rate": 3.838118580071837e-06, "loss": 0.1643, "step": 3695 }, { "epoch": 0.574805598755832, "grad_norm": 1.006600938172501, "learning_rate": 3.835742673294462e-06, "loss": 0.1082, "step": 3696 }, { "epoch": 0.5749611197511664, "grad_norm": 1.0066584099436904, "learning_rate": 3.833367044441548e-06, "loss": 0.0898, "step": 3697 }, { "epoch": 0.5751166407465008, "grad_norm": 1.2723167071852366, "learning_rate": 3.830991694080187e-06, "loss": 0.1769, "step": 3698 }, { "epoch": 0.5752721617418352, "grad_norm": 1.4135725483086348, "learning_rate": 3.828616622777414e-06, "loss": 0.1167, "step": 3699 }, { "epoch": 0.5754276827371695, "grad_norm": 1.0498477363176921, "learning_rate": 3.8262418311001884e-06, "loss": 0.108, "step": 3700 }, { "epoch": 0.5754276827371695, "eval_loss": 0.17218641936779022, "eval_runtime": 9.4429, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 3700 }, { "epoch": 0.5755832037325039, "grad_norm": 0.89529998752128, "learning_rate": 3.823867319615405e-06, "loss": 0.1941, "step": 3701 }, { "epoch": 0.5757387247278383, "grad_norm": 1.0205293251176202, "learning_rate": 3.821493088889894e-06, "loss": 0.1795, "step": 3702 }, { "epoch": 0.5758942457231726, "grad_norm": 0.7058577410371626, "learning_rate": 3.819119139490419e-06, "loss": 0.1165, "step": 3703 }, { "epoch": 0.576049766718507, "grad_norm": 0.9464626086939034, "learning_rate": 3.816745471983672e-06, "loss": 0.1558, "step": 3704 }, { "epoch": 0.5762052877138414, "grad_norm": 1.1328333893520184, "learning_rate": 3.8143720869362823e-06, "loss": 0.1312, "step": 3705 }, { "epoch": 0.5763608087091757, "grad_norm": 1.2897942452757976, "learning_rate": 3.81199898491481e-06, "loss": 0.1407, "step": 3706 }, { "epoch": 0.57651632970451, "grad_norm": 1.0555019010201812, "learning_rate": 3.809626166485748e-06, "loss": 0.1892, "step": 3707 }, { "epoch": 0.5766718506998445, "grad_norm": 0.9952867257413818, "learning_rate": 3.8072536322155185e-06, "loss": 0.1493, "step": 3708 }, { "epoch": 0.5768273716951788, "grad_norm": 1.1901467392234883, "learning_rate": 3.804881382670481e-06, "loss": 0.1462, "step": 3709 }, { "epoch": 0.5769828926905132, "grad_norm": 1.173321241776454, "learning_rate": 3.8025094184169254e-06, "loss": 0.1134, "step": 3710 }, { "epoch": 0.5771384136858476, "grad_norm": 1.166059444043603, "learning_rate": 3.8001377400210686e-06, "loss": 0.1262, "step": 3711 }, { "epoch": 0.577293934681182, "grad_norm": 0.9635002060910945, "learning_rate": 3.7977663480490684e-06, "loss": 0.2022, "step": 3712 }, { "epoch": 0.5774494556765163, "grad_norm": 1.1416046999983067, "learning_rate": 3.7953952430670053e-06, "loss": 0.1411, "step": 3713 }, { "epoch": 0.5776049766718507, "grad_norm": 0.9846473168892196, "learning_rate": 3.793024425640896e-06, "loss": 0.2129, "step": 3714 }, { "epoch": 0.5777604976671851, "grad_norm": 1.0605340650118722, "learning_rate": 3.79065389633669e-06, "loss": 0.1114, "step": 3715 }, { "epoch": 0.5779160186625194, "grad_norm": 1.1618808427678367, "learning_rate": 3.788283655720264e-06, "loss": 0.1633, "step": 3716 }, { "epoch": 0.5780715396578539, "grad_norm": 1.2968637850443918, "learning_rate": 3.7859137043574244e-06, "loss": 0.1063, "step": 3717 }, { "epoch": 0.5782270606531882, "grad_norm": 0.8301076121089019, "learning_rate": 3.783544042813917e-06, "loss": 0.1729, "step": 3718 }, { "epoch": 0.5783825816485225, "grad_norm": 0.916613010847276, "learning_rate": 3.781174671655411e-06, "loss": 0.1225, "step": 3719 }, { "epoch": 0.578538102643857, "grad_norm": 0.8482580644059347, "learning_rate": 3.778805591447505e-06, "loss": 0.2144, "step": 3720 }, { "epoch": 0.5786936236391913, "grad_norm": 0.8775783915770369, "learning_rate": 3.776436802755737e-06, "loss": 0.1905, "step": 3721 }, { "epoch": 0.5788491446345256, "grad_norm": 1.3593526284630395, "learning_rate": 3.774068306145568e-06, "loss": 0.1131, "step": 3722 }, { "epoch": 0.5790046656298601, "grad_norm": 0.9647993551900854, "learning_rate": 3.7717001021823878e-06, "loss": 0.1523, "step": 3723 }, { "epoch": 0.5791601866251944, "grad_norm": 0.8110153509689182, "learning_rate": 3.7693321914315252e-06, "loss": 0.1554, "step": 3724 }, { "epoch": 0.5793157076205288, "grad_norm": 0.7192638158338103, "learning_rate": 3.7669645744582293e-06, "loss": 0.1658, "step": 3725 }, { "epoch": 0.5794712286158631, "grad_norm": 0.9270691892814943, "learning_rate": 3.7645972518276865e-06, "loss": 0.1326, "step": 3726 }, { "epoch": 0.5796267496111975, "grad_norm": 1.3376686822621062, "learning_rate": 3.762230224105005e-06, "loss": 0.1349, "step": 3727 }, { "epoch": 0.5797822706065319, "grad_norm": 0.8605250680257289, "learning_rate": 3.759863491855233e-06, "loss": 0.1983, "step": 3728 }, { "epoch": 0.5799377916018662, "grad_norm": 0.9580102725113202, "learning_rate": 3.757497055643339e-06, "loss": 0.1266, "step": 3729 }, { "epoch": 0.5800933125972006, "grad_norm": 1.0070269939227314, "learning_rate": 3.7551309160342233e-06, "loss": 0.1308, "step": 3730 }, { "epoch": 0.580248833592535, "grad_norm": 0.8780957433977615, "learning_rate": 3.7527650735927208e-06, "loss": 0.2142, "step": 3731 }, { "epoch": 0.5804043545878693, "grad_norm": 1.1191980755712647, "learning_rate": 3.750399528883588e-06, "loss": 0.108, "step": 3732 }, { "epoch": 0.5805598755832038, "grad_norm": 1.0044973612036479, "learning_rate": 3.7480342824715123e-06, "loss": 0.1608, "step": 3733 }, { "epoch": 0.5807153965785381, "grad_norm": 1.1290665142101406, "learning_rate": 3.745669334921114e-06, "loss": 0.1213, "step": 3734 }, { "epoch": 0.5808709175738724, "grad_norm": 0.9763119684264105, "learning_rate": 3.7433046867969382e-06, "loss": 0.1866, "step": 3735 }, { "epoch": 0.5810264385692069, "grad_norm": 1.1265347231402856, "learning_rate": 3.740940338663457e-06, "loss": 0.16, "step": 3736 }, { "epoch": 0.5811819595645412, "grad_norm": 1.6863081122377037, "learning_rate": 3.7385762910850766e-06, "loss": 0.1551, "step": 3737 }, { "epoch": 0.5813374805598756, "grad_norm": 0.8910302153746114, "learning_rate": 3.7362125446261273e-06, "loss": 0.1194, "step": 3738 }, { "epoch": 0.58149300155521, "grad_norm": 1.5791997344945523, "learning_rate": 3.7338490998508654e-06, "loss": 0.1912, "step": 3739 }, { "epoch": 0.5816485225505443, "grad_norm": 0.8880027577014762, "learning_rate": 3.731485957323483e-06, "loss": 0.2466, "step": 3740 }, { "epoch": 0.5818040435458787, "grad_norm": 0.7109118644854747, "learning_rate": 3.729123117608091e-06, "loss": 0.1525, "step": 3741 }, { "epoch": 0.5819595645412131, "grad_norm": 1.048208051239899, "learning_rate": 3.7267605812687336e-06, "loss": 0.1797, "step": 3742 }, { "epoch": 0.5821150855365474, "grad_norm": 0.866124143032253, "learning_rate": 3.724398348869383e-06, "loss": 0.1436, "step": 3743 }, { "epoch": 0.5822706065318818, "grad_norm": 0.7455408462159738, "learning_rate": 3.7220364209739355e-06, "loss": 0.1244, "step": 3744 }, { "epoch": 0.5824261275272161, "grad_norm": 1.8018772717591094, "learning_rate": 3.719674798146215e-06, "loss": 0.1415, "step": 3745 }, { "epoch": 0.5825816485225506, "grad_norm": 1.3020128847878882, "learning_rate": 3.7173134809499743e-06, "loss": 0.1407, "step": 3746 }, { "epoch": 0.5827371695178849, "grad_norm": 0.7696202972957449, "learning_rate": 3.714952469948895e-06, "loss": 0.1277, "step": 3747 }, { "epoch": 0.5828926905132192, "grad_norm": 1.2241666419968003, "learning_rate": 3.7125917657065817e-06, "loss": 0.1959, "step": 3748 }, { "epoch": 0.5830482115085537, "grad_norm": 0.896022176620724, "learning_rate": 3.7102313687865653e-06, "loss": 0.1941, "step": 3749 }, { "epoch": 0.583203732503888, "grad_norm": 0.9609263118883452, "learning_rate": 3.707871279752309e-06, "loss": 0.2004, "step": 3750 }, { "epoch": 0.5833592534992224, "grad_norm": 0.9065729431287559, "learning_rate": 3.705511499167199e-06, "loss": 0.1991, "step": 3751 }, { "epoch": 0.5835147744945568, "grad_norm": 1.4061374532231414, "learning_rate": 3.7031520275945436e-06, "loss": 0.1994, "step": 3752 }, { "epoch": 0.5836702954898911, "grad_norm": 1.1396767155128622, "learning_rate": 3.7007928655975856e-06, "loss": 0.1715, "step": 3753 }, { "epoch": 0.5838258164852255, "grad_norm": 1.1816959086687562, "learning_rate": 3.6984340137394903e-06, "loss": 0.1784, "step": 3754 }, { "epoch": 0.5839813374805599, "grad_norm": 0.6646925265800666, "learning_rate": 3.696075472583344e-06, "loss": 0.1066, "step": 3755 }, { "epoch": 0.5841368584758942, "grad_norm": 1.0035432867614429, "learning_rate": 3.6937172426921686e-06, "loss": 0.0986, "step": 3756 }, { "epoch": 0.5842923794712286, "grad_norm": 1.316306472622812, "learning_rate": 3.6913593246289035e-06, "loss": 0.1255, "step": 3757 }, { "epoch": 0.584447900466563, "grad_norm": 0.970675363828058, "learning_rate": 3.6890017189564152e-06, "loss": 0.1916, "step": 3758 }, { "epoch": 0.5846034214618974, "grad_norm": 1.1512380868766385, "learning_rate": 3.686644426237502e-06, "loss": 0.2229, "step": 3759 }, { "epoch": 0.5847589424572317, "grad_norm": 0.6889779590995257, "learning_rate": 3.68428744703488e-06, "loss": 0.144, "step": 3760 }, { "epoch": 0.5849144634525661, "grad_norm": 1.247206155260726, "learning_rate": 3.6819307819111898e-06, "loss": 0.1838, "step": 3761 }, { "epoch": 0.5850699844479005, "grad_norm": 1.0345958127683534, "learning_rate": 3.6795744314290044e-06, "loss": 0.1361, "step": 3762 }, { "epoch": 0.5852255054432348, "grad_norm": 1.3488221998114625, "learning_rate": 3.677218396150817e-06, "loss": 0.2269, "step": 3763 }, { "epoch": 0.5853810264385692, "grad_norm": 1.0974980518437472, "learning_rate": 3.674862676639046e-06, "loss": 0.1142, "step": 3764 }, { "epoch": 0.5855365474339036, "grad_norm": 1.0892033613237653, "learning_rate": 3.6725072734560307e-06, "loss": 0.1393, "step": 3765 }, { "epoch": 0.5856920684292379, "grad_norm": 0.7950066084494961, "learning_rate": 3.6701521871640435e-06, "loss": 0.1246, "step": 3766 }, { "epoch": 0.5858475894245723, "grad_norm": 1.3596991064107258, "learning_rate": 3.667797418325275e-06, "loss": 0.1239, "step": 3767 }, { "epoch": 0.5860031104199067, "grad_norm": 0.9944020401498918, "learning_rate": 3.66544296750184e-06, "loss": 0.1443, "step": 3768 }, { "epoch": 0.586158631415241, "grad_norm": 1.2812682842896386, "learning_rate": 3.6630888352557794e-06, "loss": 0.1396, "step": 3769 }, { "epoch": 0.5863141524105754, "grad_norm": 1.0730615323400485, "learning_rate": 3.6607350221490593e-06, "loss": 0.1603, "step": 3770 }, { "epoch": 0.5864696734059098, "grad_norm": 1.1087902000728853, "learning_rate": 3.6583815287435644e-06, "loss": 0.1363, "step": 3771 }, { "epoch": 0.5866251944012442, "grad_norm": 1.2348037463734765, "learning_rate": 3.65602835560111e-06, "loss": 0.2361, "step": 3772 }, { "epoch": 0.5867807153965785, "grad_norm": 0.9938979990795175, "learning_rate": 3.6536755032834283e-06, "loss": 0.1052, "step": 3773 }, { "epoch": 0.586936236391913, "grad_norm": 1.2013904843929366, "learning_rate": 3.651322972352178e-06, "loss": 0.1645, "step": 3774 }, { "epoch": 0.5870917573872473, "grad_norm": 0.8718842523808328, "learning_rate": 3.6489707633689443e-06, "loss": 0.1443, "step": 3775 }, { "epoch": 0.5872472783825816, "grad_norm": 0.8383831412516867, "learning_rate": 3.6466188768952306e-06, "loss": 0.1588, "step": 3776 }, { "epoch": 0.5874027993779161, "grad_norm": 0.9675430819845282, "learning_rate": 3.644267313492461e-06, "loss": 0.1796, "step": 3777 }, { "epoch": 0.5875583203732504, "grad_norm": 0.7131224200008046, "learning_rate": 3.6419160737219927e-06, "loss": 0.1472, "step": 3778 }, { "epoch": 0.5877138413685847, "grad_norm": 5.535342856439579, "learning_rate": 3.639565158145097e-06, "loss": 0.1918, "step": 3779 }, { "epoch": 0.5878693623639192, "grad_norm": 1.6605171812092434, "learning_rate": 3.6372145673229683e-06, "loss": 0.1735, "step": 3780 }, { "epoch": 0.5880248833592535, "grad_norm": 1.315981761940249, "learning_rate": 3.6348643018167283e-06, "loss": 0.2716, "step": 3781 }, { "epoch": 0.5881804043545878, "grad_norm": 1.0785338118400127, "learning_rate": 3.6325143621874172e-06, "loss": 0.1603, "step": 3782 }, { "epoch": 0.5883359253499222, "grad_norm": 1.4536650919061218, "learning_rate": 3.630164748995998e-06, "loss": 0.2385, "step": 3783 }, { "epoch": 0.5884914463452566, "grad_norm": 0.8860868116705176, "learning_rate": 3.6278154628033546e-06, "loss": 0.1454, "step": 3784 }, { "epoch": 0.588646967340591, "grad_norm": 0.9755211935808326, "learning_rate": 3.6254665041702976e-06, "loss": 0.179, "step": 3785 }, { "epoch": 0.5888024883359253, "grad_norm": 0.9229195616941909, "learning_rate": 3.6231178736575547e-06, "loss": 0.1147, "step": 3786 }, { "epoch": 0.5889580093312597, "grad_norm": 0.862248823475056, "learning_rate": 3.6207695718257754e-06, "loss": 0.1293, "step": 3787 }, { "epoch": 0.5891135303265941, "grad_norm": 1.0661640440120308, "learning_rate": 3.618421599235534e-06, "loss": 0.1318, "step": 3788 }, { "epoch": 0.5892690513219284, "grad_norm": 1.2633194095562068, "learning_rate": 3.6160739564473244e-06, "loss": 0.1638, "step": 3789 }, { "epoch": 0.5894245723172629, "grad_norm": 0.773041432460755, "learning_rate": 3.613726644021559e-06, "loss": 0.0959, "step": 3790 }, { "epoch": 0.5895800933125972, "grad_norm": 1.3162512001263016, "learning_rate": 3.611379662518578e-06, "loss": 0.1794, "step": 3791 }, { "epoch": 0.5897356143079315, "grad_norm": 3.7847525036316165, "learning_rate": 3.609033012498637e-06, "loss": 0.1332, "step": 3792 }, { "epoch": 0.589891135303266, "grad_norm": 1.335339677719655, "learning_rate": 3.6066866945219107e-06, "loss": 0.2397, "step": 3793 }, { "epoch": 0.5900466562986003, "grad_norm": 1.1286950460054914, "learning_rate": 3.604340709148503e-06, "loss": 0.1526, "step": 3794 }, { "epoch": 0.5902021772939346, "grad_norm": 1.316997082993514, "learning_rate": 3.601995056938431e-06, "loss": 0.1993, "step": 3795 }, { "epoch": 0.5903576982892691, "grad_norm": 1.146345483952742, "learning_rate": 3.5996497384516333e-06, "loss": 0.1578, "step": 3796 }, { "epoch": 0.5905132192846034, "grad_norm": 1.1335282585918642, "learning_rate": 3.5973047542479735e-06, "loss": 0.1809, "step": 3797 }, { "epoch": 0.5906687402799378, "grad_norm": 1.247491647317719, "learning_rate": 3.5949601048872297e-06, "loss": 0.1372, "step": 3798 }, { "epoch": 0.5908242612752722, "grad_norm": 1.010400069155978, "learning_rate": 3.592615790929101e-06, "loss": 0.2125, "step": 3799 }, { "epoch": 0.5909797822706065, "grad_norm": 0.7951218370976931, "learning_rate": 3.590271812933212e-06, "loss": 0.1334, "step": 3800 }, { "epoch": 0.5909797822706065, "eval_loss": 0.17280302941799164, "eval_runtime": 9.4273, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.743, "step": 3800 }, { "epoch": 0.5911353032659409, "grad_norm": 0.7368454707250384, "learning_rate": 3.5879281714590997e-06, "loss": 0.0996, "step": 3801 }, { "epoch": 0.5912908242612753, "grad_norm": 1.4919104511383072, "learning_rate": 3.585584867066225e-06, "loss": 0.0919, "step": 3802 }, { "epoch": 0.5914463452566097, "grad_norm": 0.9403942827349958, "learning_rate": 3.5832419003139674e-06, "loss": 0.2145, "step": 3803 }, { "epoch": 0.591601866251944, "grad_norm": 0.9049798302233923, "learning_rate": 3.580899271761626e-06, "loss": 0.1039, "step": 3804 }, { "epoch": 0.5917573872472783, "grad_norm": 1.4298449447530868, "learning_rate": 3.5785569819684175e-06, "loss": 0.3059, "step": 3805 }, { "epoch": 0.5919129082426128, "grad_norm": 0.7951295356727276, "learning_rate": 3.5762150314934794e-06, "loss": 0.0894, "step": 3806 }, { "epoch": 0.5920684292379471, "grad_norm": 0.8473178185953868, "learning_rate": 3.5738734208958703e-06, "loss": 0.1505, "step": 3807 }, { "epoch": 0.5922239502332814, "grad_norm": 1.2933128729338303, "learning_rate": 3.5715321507345647e-06, "loss": 0.1414, "step": 3808 }, { "epoch": 0.5923794712286159, "grad_norm": 0.9861987468804609, "learning_rate": 3.569191221568452e-06, "loss": 0.1464, "step": 3809 }, { "epoch": 0.5925349922239502, "grad_norm": 1.9660994015753464, "learning_rate": 3.5668506339563502e-06, "loss": 0.3009, "step": 3810 }, { "epoch": 0.5926905132192846, "grad_norm": 1.337762597386648, "learning_rate": 3.5645103884569886e-06, "loss": 0.177, "step": 3811 }, { "epoch": 0.592846034214619, "grad_norm": 1.1293471685948822, "learning_rate": 3.5621704856290134e-06, "loss": 0.1475, "step": 3812 }, { "epoch": 0.5930015552099533, "grad_norm": 1.0932968967809717, "learning_rate": 3.559830926030997e-06, "loss": 0.1818, "step": 3813 }, { "epoch": 0.5931570762052877, "grad_norm": 0.8586361628486543, "learning_rate": 3.5574917102214212e-06, "loss": 0.1286, "step": 3814 }, { "epoch": 0.5933125972006221, "grad_norm": 1.481133103279374, "learning_rate": 3.5551528387586896e-06, "loss": 0.1773, "step": 3815 }, { "epoch": 0.5934681181959565, "grad_norm": 0.81621426814378, "learning_rate": 3.552814312201127e-06, "loss": 0.1446, "step": 3816 }, { "epoch": 0.5936236391912908, "grad_norm": 1.293223878434778, "learning_rate": 3.5504761311069685e-06, "loss": 0.2186, "step": 3817 }, { "epoch": 0.5937791601866252, "grad_norm": 1.1418679073234383, "learning_rate": 3.5481382960343717e-06, "loss": 0.1395, "step": 3818 }, { "epoch": 0.5939346811819596, "grad_norm": 0.9986149437624529, "learning_rate": 3.545800807541411e-06, "loss": 0.1221, "step": 3819 }, { "epoch": 0.5940902021772939, "grad_norm": 0.7526185509016786, "learning_rate": 3.5434636661860776e-06, "loss": 0.144, "step": 3820 }, { "epoch": 0.5942457231726284, "grad_norm": 1.3817701286516793, "learning_rate": 3.541126872526277e-06, "loss": 0.168, "step": 3821 }, { "epoch": 0.5944012441679627, "grad_norm": 1.1484536597721462, "learning_rate": 3.5387904271198382e-06, "loss": 0.1373, "step": 3822 }, { "epoch": 0.594556765163297, "grad_norm": 1.4894413348182978, "learning_rate": 3.5364543305245024e-06, "loss": 0.1466, "step": 3823 }, { "epoch": 0.5947122861586314, "grad_norm": 1.2677008762469886, "learning_rate": 3.5341185832979273e-06, "loss": 0.4809, "step": 3824 }, { "epoch": 0.5948678071539658, "grad_norm": 1.2179341650956155, "learning_rate": 3.531783185997688e-06, "loss": 0.1318, "step": 3825 }, { "epoch": 0.5950233281493001, "grad_norm": 1.2097422245876368, "learning_rate": 3.529448139181277e-06, "loss": 0.2116, "step": 3826 }, { "epoch": 0.5951788491446345, "grad_norm": 0.9324097871367046, "learning_rate": 3.5271134434061043e-06, "loss": 0.1177, "step": 3827 }, { "epoch": 0.5953343701399689, "grad_norm": 1.5511781792944341, "learning_rate": 3.5247790992294907e-06, "loss": 0.2345, "step": 3828 }, { "epoch": 0.5954898911353033, "grad_norm": 1.4109039390068117, "learning_rate": 3.5224451072086807e-06, "loss": 0.2117, "step": 3829 }, { "epoch": 0.5956454121306376, "grad_norm": 1.2757771761186631, "learning_rate": 3.5201114679008286e-06, "loss": 0.1522, "step": 3830 }, { "epoch": 0.595800933125972, "grad_norm": 0.9937585391285637, "learning_rate": 3.5177781818630046e-06, "loss": 0.1109, "step": 3831 }, { "epoch": 0.5959564541213064, "grad_norm": 1.3066912913265771, "learning_rate": 3.5154452496522018e-06, "loss": 0.1344, "step": 3832 }, { "epoch": 0.5961119751166407, "grad_norm": 1.0873820874045028, "learning_rate": 3.5131126718253195e-06, "loss": 0.1827, "step": 3833 }, { "epoch": 0.5962674961119752, "grad_norm": 0.903597807600827, "learning_rate": 3.510780448939178e-06, "loss": 0.0967, "step": 3834 }, { "epoch": 0.5964230171073095, "grad_norm": 0.8794269870581702, "learning_rate": 3.508448581550511e-06, "loss": 0.2075, "step": 3835 }, { "epoch": 0.5965785381026438, "grad_norm": 0.7770907218246264, "learning_rate": 3.5061170702159685e-06, "loss": 0.1835, "step": 3836 }, { "epoch": 0.5967340590979783, "grad_norm": 0.9651863060207055, "learning_rate": 3.5037859154921127e-06, "loss": 0.1949, "step": 3837 }, { "epoch": 0.5968895800933126, "grad_norm": 0.9330177009076538, "learning_rate": 3.5014551179354255e-06, "loss": 0.2013, "step": 3838 }, { "epoch": 0.597045101088647, "grad_norm": 0.592575131106736, "learning_rate": 3.4991246781023004e-06, "loss": 0.1432, "step": 3839 }, { "epoch": 0.5972006220839814, "grad_norm": 1.090374451385102, "learning_rate": 3.4967945965490434e-06, "loss": 0.1524, "step": 3840 }, { "epoch": 0.5973561430793157, "grad_norm": 1.0039166808624955, "learning_rate": 3.494464873831881e-06, "loss": 0.2454, "step": 3841 }, { "epoch": 0.5975116640746501, "grad_norm": 1.3923622177078, "learning_rate": 3.492135510506947e-06, "loss": 0.1431, "step": 3842 }, { "epoch": 0.5976671850699844, "grad_norm": 1.338951269527425, "learning_rate": 3.489806507130296e-06, "loss": 0.2234, "step": 3843 }, { "epoch": 0.5978227060653188, "grad_norm": 0.9979685696754274, "learning_rate": 3.48747786425789e-06, "loss": 0.1771, "step": 3844 }, { "epoch": 0.5979782270606532, "grad_norm": 1.5581103677106332, "learning_rate": 3.4851495824456126e-06, "loss": 0.1845, "step": 3845 }, { "epoch": 0.5981337480559875, "grad_norm": 0.8235411934198494, "learning_rate": 3.482821662249255e-06, "loss": 0.1181, "step": 3846 }, { "epoch": 0.598289269051322, "grad_norm": 0.9833138146109598, "learning_rate": 3.4804941042245228e-06, "loss": 0.0831, "step": 3847 }, { "epoch": 0.5984447900466563, "grad_norm": 1.1882134143029064, "learning_rate": 3.4781669089270397e-06, "loss": 0.1321, "step": 3848 }, { "epoch": 0.5986003110419906, "grad_norm": 0.892251921356951, "learning_rate": 3.475840076912338e-06, "loss": 0.1616, "step": 3849 }, { "epoch": 0.5987558320373251, "grad_norm": 1.4232886801190825, "learning_rate": 3.4735136087358646e-06, "loss": 0.0779, "step": 3850 }, { "epoch": 0.5989113530326594, "grad_norm": 1.026792529265304, "learning_rate": 3.471187504952981e-06, "loss": 0.1288, "step": 3851 }, { "epoch": 0.5990668740279937, "grad_norm": 1.0742404499229647, "learning_rate": 3.4688617661189606e-06, "loss": 0.1666, "step": 3852 }, { "epoch": 0.5992223950233282, "grad_norm": 0.8117263190435208, "learning_rate": 3.4665363927889867e-06, "loss": 0.1076, "step": 3853 }, { "epoch": 0.5993779160186625, "grad_norm": 0.7990770439053131, "learning_rate": 3.4642113855181617e-06, "loss": 0.1242, "step": 3854 }, { "epoch": 0.5995334370139969, "grad_norm": 1.2581229114221675, "learning_rate": 3.4618867448614978e-06, "loss": 0.265, "step": 3855 }, { "epoch": 0.5996889580093313, "grad_norm": 0.8411176161872168, "learning_rate": 3.4595624713739157e-06, "loss": 0.0935, "step": 3856 }, { "epoch": 0.5998444790046656, "grad_norm": 0.8582551899181419, "learning_rate": 3.457238565610256e-06, "loss": 0.0876, "step": 3857 }, { "epoch": 0.6, "grad_norm": 1.356247162733943, "learning_rate": 3.4549150281252635e-06, "loss": 0.181, "step": 3858 }, { "epoch": 0.6001555209953344, "grad_norm": 1.42324679213715, "learning_rate": 3.4525918594736003e-06, "loss": 0.1995, "step": 3859 }, { "epoch": 0.6003110419906688, "grad_norm": 1.1954683615011503, "learning_rate": 3.450269060209841e-06, "loss": 0.1713, "step": 3860 }, { "epoch": 0.6004665629860031, "grad_norm": 1.1145383161696076, "learning_rate": 3.4479466308884694e-06, "loss": 0.124, "step": 3861 }, { "epoch": 0.6006220839813374, "grad_norm": 1.0533255079348258, "learning_rate": 3.4456245720638806e-06, "loss": 0.1419, "step": 3862 }, { "epoch": 0.6007776049766719, "grad_norm": 1.0981803613730445, "learning_rate": 3.4433028842903803e-06, "loss": 0.1012, "step": 3863 }, { "epoch": 0.6009331259720062, "grad_norm": 1.4760840368260533, "learning_rate": 3.4409815681221936e-06, "loss": 0.153, "step": 3864 }, { "epoch": 0.6010886469673405, "grad_norm": 0.8866530409187653, "learning_rate": 3.4386606241134466e-06, "loss": 0.1261, "step": 3865 }, { "epoch": 0.601244167962675, "grad_norm": 1.329839448582019, "learning_rate": 3.4363400528181826e-06, "loss": 0.2341, "step": 3866 }, { "epoch": 0.6013996889580093, "grad_norm": 0.8278439647831208, "learning_rate": 3.4340198547903536e-06, "loss": 0.1797, "step": 3867 }, { "epoch": 0.6015552099533437, "grad_norm": 1.0745798498960633, "learning_rate": 3.4317000305838255e-06, "loss": 0.1544, "step": 3868 }, { "epoch": 0.6017107309486781, "grad_norm": 0.9683032684213898, "learning_rate": 3.4293805807523677e-06, "loss": 0.1043, "step": 3869 }, { "epoch": 0.6018662519440124, "grad_norm": 1.0584320834430383, "learning_rate": 3.42706150584967e-06, "loss": 0.1876, "step": 3870 }, { "epoch": 0.6020217729393468, "grad_norm": 1.0720700333291342, "learning_rate": 3.4247428064293263e-06, "loss": 0.1726, "step": 3871 }, { "epoch": 0.6021772939346812, "grad_norm": 1.1604561367742923, "learning_rate": 3.4224244830448415e-06, "loss": 0.2467, "step": 3872 }, { "epoch": 0.6023328149300156, "grad_norm": 1.182089166742427, "learning_rate": 3.4201065362496342e-06, "loss": 0.2033, "step": 3873 }, { "epoch": 0.6024883359253499, "grad_norm": 1.2716972710778613, "learning_rate": 3.4177889665970283e-06, "loss": 0.1972, "step": 3874 }, { "epoch": 0.6026438569206843, "grad_norm": 1.171494234738484, "learning_rate": 3.4154717746402596e-06, "loss": 0.1745, "step": 3875 }, { "epoch": 0.6027993779160187, "grad_norm": 1.404715680552692, "learning_rate": 3.4131549609324773e-06, "loss": 0.1437, "step": 3876 }, { "epoch": 0.602954898911353, "grad_norm": 0.7176696047444673, "learning_rate": 3.410838526026735e-06, "loss": 0.1008, "step": 3877 }, { "epoch": 0.6031104199066875, "grad_norm": 1.4398523903375622, "learning_rate": 3.408522470475996e-06, "loss": 0.1333, "step": 3878 }, { "epoch": 0.6032659409020218, "grad_norm": 1.4383663664625421, "learning_rate": 3.406206794833139e-06, "loss": 0.1458, "step": 3879 }, { "epoch": 0.6034214618973561, "grad_norm": 0.8836311961143194, "learning_rate": 3.4038914996509464e-06, "loss": 0.1108, "step": 3880 }, { "epoch": 0.6035769828926905, "grad_norm": 0.9971014422584805, "learning_rate": 3.4015765854821115e-06, "loss": 0.1816, "step": 3881 }, { "epoch": 0.6037325038880249, "grad_norm": 0.9935506642673112, "learning_rate": 3.3992620528792352e-06, "loss": 0.1651, "step": 3882 }, { "epoch": 0.6038880248833592, "grad_norm": 2.180481437766861, "learning_rate": 3.3969479023948304e-06, "loss": 0.2168, "step": 3883 }, { "epoch": 0.6040435458786936, "grad_norm": 0.8934649920240859, "learning_rate": 3.3946341345813183e-06, "loss": 0.1883, "step": 3884 }, { "epoch": 0.604199066874028, "grad_norm": 1.5412061969281217, "learning_rate": 3.3923207499910225e-06, "loss": 0.1822, "step": 3885 }, { "epoch": 0.6043545878693624, "grad_norm": 1.464723907405094, "learning_rate": 3.3900077491761856e-06, "loss": 0.2274, "step": 3886 }, { "epoch": 0.6045101088646967, "grad_norm": 0.9955353288586886, "learning_rate": 3.387695132688952e-06, "loss": 0.135, "step": 3887 }, { "epoch": 0.6046656298600311, "grad_norm": 1.1748229824637224, "learning_rate": 3.3853829010813723e-06, "loss": 0.2393, "step": 3888 }, { "epoch": 0.6048211508553655, "grad_norm": 1.0285382492859452, "learning_rate": 3.383071054905413e-06, "loss": 0.1762, "step": 3889 }, { "epoch": 0.6049766718506998, "grad_norm": 0.9642357497585049, "learning_rate": 3.3807595947129405e-06, "loss": 0.1108, "step": 3890 }, { "epoch": 0.6051321928460343, "grad_norm": 1.0844845804337204, "learning_rate": 3.3784485210557333e-06, "loss": 0.1329, "step": 3891 }, { "epoch": 0.6052877138413686, "grad_norm": 1.2308523178593893, "learning_rate": 3.376137834485479e-06, "loss": 0.2124, "step": 3892 }, { "epoch": 0.6054432348367029, "grad_norm": 1.0920289008756296, "learning_rate": 3.3738275355537696e-06, "loss": 0.1771, "step": 3893 }, { "epoch": 0.6055987558320374, "grad_norm": 0.9412027455699212, "learning_rate": 3.371517624812103e-06, "loss": 0.1455, "step": 3894 }, { "epoch": 0.6057542768273717, "grad_norm": 1.1948755165535825, "learning_rate": 3.3692081028118894e-06, "loss": 0.1276, "step": 3895 }, { "epoch": 0.605909797822706, "grad_norm": 0.8016757207066428, "learning_rate": 3.366898970104446e-06, "loss": 0.1863, "step": 3896 }, { "epoch": 0.6060653188180405, "grad_norm": 0.8367427294324966, "learning_rate": 3.3645902272409904e-06, "loss": 0.1732, "step": 3897 }, { "epoch": 0.6062208398133748, "grad_norm": 0.8668407173007023, "learning_rate": 3.3622818747726568e-06, "loss": 0.1769, "step": 3898 }, { "epoch": 0.6063763608087092, "grad_norm": 0.7297056766184378, "learning_rate": 3.3599739132504767e-06, "loss": 0.1369, "step": 3899 }, { "epoch": 0.6065318818040435, "grad_norm": 0.9093758156966857, "learning_rate": 3.357666343225396e-06, "loss": 0.1057, "step": 3900 }, { "epoch": 0.6065318818040435, "eval_loss": 0.17144258320331573, "eval_runtime": 9.4209, "eval_samples_per_second": 2.76, "eval_steps_per_second": 0.743, "step": 3900 }, { "epoch": 0.6066874027993779, "grad_norm": 1.0956050222785594, "learning_rate": 3.35535916524826e-06, "loss": 0.1568, "step": 3901 }, { "epoch": 0.6068429237947123, "grad_norm": 1.3191212290725145, "learning_rate": 3.3530523798698273e-06, "loss": 0.1298, "step": 3902 }, { "epoch": 0.6069984447900466, "grad_norm": 1.0751758344162625, "learning_rate": 3.35074598764076e-06, "loss": 0.1883, "step": 3903 }, { "epoch": 0.6071539657853811, "grad_norm": 0.8708683887429591, "learning_rate": 3.348439989111624e-06, "loss": 0.149, "step": 3904 }, { "epoch": 0.6073094867807154, "grad_norm": 1.0694555833802646, "learning_rate": 3.3461343848328954e-06, "loss": 0.1486, "step": 3905 }, { "epoch": 0.6074650077760497, "grad_norm": 1.3819583097777202, "learning_rate": 3.3438291753549523e-06, "loss": 0.1943, "step": 3906 }, { "epoch": 0.6076205287713842, "grad_norm": 0.857731871445432, "learning_rate": 3.341524361228081e-06, "loss": 0.0791, "step": 3907 }, { "epoch": 0.6077760497667185, "grad_norm": 1.301127699589288, "learning_rate": 3.339219943002474e-06, "loss": 0.1449, "step": 3908 }, { "epoch": 0.6079315707620528, "grad_norm": 1.1570864412411916, "learning_rate": 3.3369159212282278e-06, "loss": 0.1477, "step": 3909 }, { "epoch": 0.6080870917573873, "grad_norm": 0.829799950039658, "learning_rate": 3.3346122964553407e-06, "loss": 0.0935, "step": 3910 }, { "epoch": 0.6082426127527216, "grad_norm": 1.1746739289020354, "learning_rate": 3.3323090692337244e-06, "loss": 0.1447, "step": 3911 }, { "epoch": 0.608398133748056, "grad_norm": 1.1008646248675698, "learning_rate": 3.3300062401131916e-06, "loss": 0.1405, "step": 3912 }, { "epoch": 0.6085536547433904, "grad_norm": 1.1542217200558567, "learning_rate": 3.3277038096434554e-06, "loss": 0.123, "step": 3913 }, { "epoch": 0.6087091757387247, "grad_norm": 1.0284607033158246, "learning_rate": 3.325401778374143e-06, "loss": 0.2242, "step": 3914 }, { "epoch": 0.6088646967340591, "grad_norm": 1.1470506781790692, "learning_rate": 3.3231001468547785e-06, "loss": 0.1831, "step": 3915 }, { "epoch": 0.6090202177293935, "grad_norm": 1.1908036881305386, "learning_rate": 3.320798915634793e-06, "loss": 0.149, "step": 3916 }, { "epoch": 0.6091757387247279, "grad_norm": 0.9948661768352608, "learning_rate": 3.3184980852635257e-06, "loss": 0.1238, "step": 3917 }, { "epoch": 0.6093312597200622, "grad_norm": 0.9273803507467088, "learning_rate": 3.316197656290214e-06, "loss": 0.1626, "step": 3918 }, { "epoch": 0.6094867807153965, "grad_norm": 1.1390747287775345, "learning_rate": 3.3138976292640036e-06, "loss": 0.1368, "step": 3919 }, { "epoch": 0.609642301710731, "grad_norm": 0.6951643861348608, "learning_rate": 3.3115980047339415e-06, "loss": 0.159, "step": 3920 }, { "epoch": 0.6097978227060653, "grad_norm": 1.38624618184033, "learning_rate": 3.309298783248982e-06, "loss": 0.1627, "step": 3921 }, { "epoch": 0.6099533437013996, "grad_norm": 1.7962294699791383, "learning_rate": 3.306999965357981e-06, "loss": 0.142, "step": 3922 }, { "epoch": 0.6101088646967341, "grad_norm": 0.7493660005633894, "learning_rate": 3.3047015516096957e-06, "loss": 0.1377, "step": 3923 }, { "epoch": 0.6102643856920684, "grad_norm": 1.1315967183442515, "learning_rate": 3.3024035425527943e-06, "loss": 0.2001, "step": 3924 }, { "epoch": 0.6104199066874028, "grad_norm": 2.373211229651155, "learning_rate": 3.3001059387358414e-06, "loss": 0.3589, "step": 3925 }, { "epoch": 0.6105754276827372, "grad_norm": 1.393921801604906, "learning_rate": 3.2978087407073046e-06, "loss": 0.1308, "step": 3926 }, { "epoch": 0.6107309486780715, "grad_norm": 0.7748430941841263, "learning_rate": 3.29551194901556e-06, "loss": 0.1451, "step": 3927 }, { "epoch": 0.6108864696734059, "grad_norm": 0.8911826417617253, "learning_rate": 3.293215564208885e-06, "loss": 0.1863, "step": 3928 }, { "epoch": 0.6110419906687403, "grad_norm": 1.4910247812701019, "learning_rate": 3.290919586835454e-06, "loss": 0.2358, "step": 3929 }, { "epoch": 0.6111975116640747, "grad_norm": 1.1990341924537629, "learning_rate": 3.288624017443353e-06, "loss": 0.18, "step": 3930 }, { "epoch": 0.611353032659409, "grad_norm": 1.9350710902854056, "learning_rate": 3.286328856580565e-06, "loss": 0.2156, "step": 3931 }, { "epoch": 0.6115085536547434, "grad_norm": 0.94319149864738, "learning_rate": 3.2840341047949754e-06, "loss": 0.1984, "step": 3932 }, { "epoch": 0.6116640746500778, "grad_norm": 0.9362459328386021, "learning_rate": 3.281739762634377e-06, "loss": 0.1909, "step": 3933 }, { "epoch": 0.6118195956454121, "grad_norm": 1.3438639672984762, "learning_rate": 3.2794458306464584e-06, "loss": 0.1885, "step": 3934 }, { "epoch": 0.6119751166407466, "grad_norm": 0.7095509643106033, "learning_rate": 3.277152309378813e-06, "loss": 0.1227, "step": 3935 }, { "epoch": 0.6121306376360809, "grad_norm": 0.7181699178651835, "learning_rate": 3.2748591993789393e-06, "loss": 0.1417, "step": 3936 }, { "epoch": 0.6122861586314152, "grad_norm": 1.132050538091891, "learning_rate": 3.2725665011942324e-06, "loss": 0.1157, "step": 3937 }, { "epoch": 0.6124416796267496, "grad_norm": 1.2872049646241446, "learning_rate": 3.2702742153719906e-06, "loss": 0.1323, "step": 3938 }, { "epoch": 0.612597200622084, "grad_norm": 0.5974239528696816, "learning_rate": 3.267982342459415e-06, "loss": 0.1265, "step": 3939 }, { "epoch": 0.6127527216174183, "grad_norm": 0.837782617541905, "learning_rate": 3.265690883003609e-06, "loss": 0.1188, "step": 3940 }, { "epoch": 0.6129082426127527, "grad_norm": 0.850681942946027, "learning_rate": 3.263399837551576e-06, "loss": 0.1513, "step": 3941 }, { "epoch": 0.6130637636080871, "grad_norm": 1.135405113388794, "learning_rate": 3.2611092066502174e-06, "loss": 0.0966, "step": 3942 }, { "epoch": 0.6132192846034215, "grad_norm": 0.9606246342392277, "learning_rate": 3.258818990846342e-06, "loss": 0.1632, "step": 3943 }, { "epoch": 0.6133748055987558, "grad_norm": 1.3172206639900355, "learning_rate": 3.256529190686656e-06, "loss": 0.2366, "step": 3944 }, { "epoch": 0.6135303265940902, "grad_norm": 1.0886054065712696, "learning_rate": 3.254239806717763e-06, "loss": 0.1001, "step": 3945 }, { "epoch": 0.6136858475894246, "grad_norm": 0.8918475290939591, "learning_rate": 3.2519508394861766e-06, "loss": 0.1652, "step": 3946 }, { "epoch": 0.6138413685847589, "grad_norm": 0.876085139441039, "learning_rate": 3.2496622895383e-06, "loss": 0.1666, "step": 3947 }, { "epoch": 0.6139968895800934, "grad_norm": 3.858358570127307, "learning_rate": 3.247374157420444e-06, "loss": 0.2357, "step": 3948 }, { "epoch": 0.6141524105754277, "grad_norm": 1.2227432252085024, "learning_rate": 3.24508644367882e-06, "loss": 0.1858, "step": 3949 }, { "epoch": 0.614307931570762, "grad_norm": 0.9691615004785167, "learning_rate": 3.2427991488595334e-06, "loss": 0.1199, "step": 3950 }, { "epoch": 0.6144634525660965, "grad_norm": 1.4806068771182554, "learning_rate": 3.2405122735085947e-06, "loss": 0.2309, "step": 3951 }, { "epoch": 0.6146189735614308, "grad_norm": 1.3490451813543325, "learning_rate": 3.238225818171915e-06, "loss": 0.1481, "step": 3952 }, { "epoch": 0.6147744945567651, "grad_norm": 1.010745711646417, "learning_rate": 3.235939783395301e-06, "loss": 0.0802, "step": 3953 }, { "epoch": 0.6149300155520996, "grad_norm": 1.233492021235516, "learning_rate": 3.2336541697244594e-06, "loss": 0.172, "step": 3954 }, { "epoch": 0.6150855365474339, "grad_norm": 1.3901825329446718, "learning_rate": 3.2313689777050017e-06, "loss": 0.1856, "step": 3955 }, { "epoch": 0.6152410575427683, "grad_norm": 1.081608271000454, "learning_rate": 3.2290842078824335e-06, "loss": 0.2304, "step": 3956 }, { "epoch": 0.6153965785381026, "grad_norm": 0.8788823211206985, "learning_rate": 3.2267998608021623e-06, "loss": 0.1074, "step": 3957 }, { "epoch": 0.615552099533437, "grad_norm": 0.903730841660074, "learning_rate": 3.224515937009489e-06, "loss": 0.1531, "step": 3958 }, { "epoch": 0.6157076205287714, "grad_norm": 1.5356716449538264, "learning_rate": 3.222232437049624e-06, "loss": 0.1866, "step": 3959 }, { "epoch": 0.6158631415241057, "grad_norm": 1.4214689927642028, "learning_rate": 3.219949361467668e-06, "loss": 0.1546, "step": 3960 }, { "epoch": 0.6160186625194402, "grad_norm": 0.8698298815003316, "learning_rate": 3.217666710808621e-06, "loss": 0.1293, "step": 3961 }, { "epoch": 0.6161741835147745, "grad_norm": 0.8596106116295954, "learning_rate": 3.215384485617389e-06, "loss": 0.1061, "step": 3962 }, { "epoch": 0.6163297045101088, "grad_norm": 1.1370244119746797, "learning_rate": 3.2131026864387656e-06, "loss": 0.2252, "step": 3963 }, { "epoch": 0.6164852255054433, "grad_norm": 1.085526278974763, "learning_rate": 3.2108213138174494e-06, "loss": 0.1643, "step": 3964 }, { "epoch": 0.6166407465007776, "grad_norm": 1.7895463021317797, "learning_rate": 3.208540368298038e-06, "loss": 0.2212, "step": 3965 }, { "epoch": 0.6167962674961119, "grad_norm": 0.9937309591786468, "learning_rate": 3.206259850425023e-06, "loss": 0.1099, "step": 3966 }, { "epoch": 0.6169517884914464, "grad_norm": 0.9359019238524645, "learning_rate": 3.2039797607427954e-06, "loss": 0.1474, "step": 3967 }, { "epoch": 0.6171073094867807, "grad_norm": 0.9037038924673162, "learning_rate": 3.201700099795647e-06, "loss": 0.1921, "step": 3968 }, { "epoch": 0.617262830482115, "grad_norm": 1.358336130415309, "learning_rate": 3.1994208681277628e-06, "loss": 0.1298, "step": 3969 }, { "epoch": 0.6174183514774495, "grad_norm": 1.339332451209068, "learning_rate": 3.197142066283225e-06, "loss": 0.175, "step": 3970 }, { "epoch": 0.6175738724727838, "grad_norm": 1.3746897865043157, "learning_rate": 3.1948636948060195e-06, "loss": 0.1981, "step": 3971 }, { "epoch": 0.6177293934681182, "grad_norm": 1.01533692795272, "learning_rate": 3.1925857542400253e-06, "loss": 0.1895, "step": 3972 }, { "epoch": 0.6178849144634526, "grad_norm": 0.820141389210646, "learning_rate": 3.1903082451290136e-06, "loss": 0.144, "step": 3973 }, { "epoch": 0.618040435458787, "grad_norm": 1.1840376639778787, "learning_rate": 3.188031168016663e-06, "loss": 0.131, "step": 3974 }, { "epoch": 0.6181959564541213, "grad_norm": 1.069211517315581, "learning_rate": 3.1857545234465405e-06, "loss": 0.2316, "step": 3975 }, { "epoch": 0.6183514774494556, "grad_norm": 0.9301854200767, "learning_rate": 3.183478311962114e-06, "loss": 0.2158, "step": 3976 }, { "epoch": 0.6185069984447901, "grad_norm": 1.0647583316221962, "learning_rate": 3.181202534106746e-06, "loss": 0.1822, "step": 3977 }, { "epoch": 0.6186625194401244, "grad_norm": 1.1578902595053129, "learning_rate": 3.1789271904236985e-06, "loss": 0.1708, "step": 3978 }, { "epoch": 0.6188180404354587, "grad_norm": 1.3245303355553681, "learning_rate": 3.176652281456125e-06, "loss": 0.1311, "step": 3979 }, { "epoch": 0.6189735614307932, "grad_norm": 0.9724123152826942, "learning_rate": 3.174377807747079e-06, "loss": 0.1245, "step": 3980 }, { "epoch": 0.6191290824261275, "grad_norm": 1.1588316995455832, "learning_rate": 3.172103769839512e-06, "loss": 0.1327, "step": 3981 }, { "epoch": 0.6192846034214619, "grad_norm": 1.0782810934414955, "learning_rate": 3.169830168276264e-06, "loss": 0.2074, "step": 3982 }, { "epoch": 0.6194401244167963, "grad_norm": 1.1374497130452252, "learning_rate": 3.167557003600078e-06, "loss": 0.1786, "step": 3983 }, { "epoch": 0.6195956454121306, "grad_norm": 1.9070746275221366, "learning_rate": 3.165284276353589e-06, "loss": 0.2156, "step": 3984 }, { "epoch": 0.619751166407465, "grad_norm": 1.249152287099708, "learning_rate": 3.1630119870793306e-06, "loss": 0.1761, "step": 3985 }, { "epoch": 0.6199066874027994, "grad_norm": 1.3639308796870684, "learning_rate": 3.1607401363197265e-06, "loss": 0.133, "step": 3986 }, { "epoch": 0.6200622083981338, "grad_norm": 1.0679092205280354, "learning_rate": 3.1584687246171024e-06, "loss": 0.1842, "step": 3987 }, { "epoch": 0.6202177293934681, "grad_norm": 1.3197776003638269, "learning_rate": 3.1561977525136754e-06, "loss": 0.1596, "step": 3988 }, { "epoch": 0.6203732503888025, "grad_norm": 0.9178048062369041, "learning_rate": 3.1539272205515557e-06, "loss": 0.1509, "step": 3989 }, { "epoch": 0.6205287713841369, "grad_norm": 0.9589460332091052, "learning_rate": 3.1516571292727553e-06, "loss": 0.1478, "step": 3990 }, { "epoch": 0.6206842923794712, "grad_norm": 1.0685162507870172, "learning_rate": 3.149387479219173e-06, "loss": 0.2137, "step": 3991 }, { "epoch": 0.6208398133748056, "grad_norm": 0.6350792780259459, "learning_rate": 3.1471182709326065e-06, "loss": 0.1398, "step": 3992 }, { "epoch": 0.62099533437014, "grad_norm": 0.7468572874351359, "learning_rate": 3.144849504954749e-06, "loss": 0.142, "step": 3993 }, { "epoch": 0.6211508553654743, "grad_norm": 1.0599986030550972, "learning_rate": 3.1425811818271866e-06, "loss": 0.156, "step": 3994 }, { "epoch": 0.6213063763608087, "grad_norm": 1.4826492670299984, "learning_rate": 3.1403133020913974e-06, "loss": 0.1725, "step": 3995 }, { "epoch": 0.6214618973561431, "grad_norm": 1.0222501631923613, "learning_rate": 3.1380458662887557e-06, "loss": 0.1363, "step": 3996 }, { "epoch": 0.6216174183514774, "grad_norm": 1.0244616330566072, "learning_rate": 3.1357788749605344e-06, "loss": 0.2124, "step": 3997 }, { "epoch": 0.6217729393468118, "grad_norm": 1.0575492108378708, "learning_rate": 3.133512328647892e-06, "loss": 0.1982, "step": 3998 }, { "epoch": 0.6219284603421462, "grad_norm": 1.145404179130331, "learning_rate": 3.1312462278918853e-06, "loss": 0.4379, "step": 3999 }, { "epoch": 0.6220839813374806, "grad_norm": 1.225922132730782, "learning_rate": 3.128980573233465e-06, "loss": 0.1513, "step": 4000 }, { "epoch": 0.6220839813374806, "eval_loss": 0.17017674446105957, "eval_runtime": 9.4357, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.742, "step": 4000 }, { "epoch": 0.6222395023328149, "grad_norm": 1.3094742454113348, "learning_rate": 3.1267153652134754e-06, "loss": 0.1206, "step": 4001 }, { "epoch": 0.6223950233281493, "grad_norm": 1.8288852192744924, "learning_rate": 3.12445060437265e-06, "loss": 0.2547, "step": 4002 }, { "epoch": 0.6225505443234837, "grad_norm": 0.9955004407523551, "learning_rate": 3.122186291251621e-06, "loss": 0.124, "step": 4003 }, { "epoch": 0.622706065318818, "grad_norm": 1.269824986521419, "learning_rate": 3.1199224263909123e-06, "loss": 0.2026, "step": 4004 }, { "epoch": 0.6228615863141524, "grad_norm": 0.9894641498635691, "learning_rate": 3.117659010330938e-06, "loss": 0.1685, "step": 4005 }, { "epoch": 0.6230171073094868, "grad_norm": 1.268274162837014, "learning_rate": 3.1153960436120097e-06, "loss": 0.1759, "step": 4006 }, { "epoch": 0.6231726283048211, "grad_norm": 0.9991110588341253, "learning_rate": 3.1131335267743263e-06, "loss": 0.1485, "step": 4007 }, { "epoch": 0.6233281493001556, "grad_norm": 0.6983462883823367, "learning_rate": 3.1108714603579828e-06, "loss": 0.297, "step": 4008 }, { "epoch": 0.6234836702954899, "grad_norm": 0.8746984783603572, "learning_rate": 3.108609844902968e-06, "loss": 0.1205, "step": 4009 }, { "epoch": 0.6236391912908242, "grad_norm": 1.2714197115836121, "learning_rate": 3.1063486809491595e-06, "loss": 0.1043, "step": 4010 }, { "epoch": 0.6237947122861587, "grad_norm": 0.822371272311479, "learning_rate": 3.1040879690363266e-06, "loss": 0.1366, "step": 4011 }, { "epoch": 0.623950233281493, "grad_norm": 1.994458961151286, "learning_rate": 3.1018277097041365e-06, "loss": 0.2399, "step": 4012 }, { "epoch": 0.6241057542768274, "grad_norm": 0.9253973561493322, "learning_rate": 3.099567903492144e-06, "loss": 0.1628, "step": 4013 }, { "epoch": 0.6242612752721617, "grad_norm": 0.8606052737474799, "learning_rate": 3.097308550939794e-06, "loss": 0.1263, "step": 4014 }, { "epoch": 0.6244167962674961, "grad_norm": 1.643212075090044, "learning_rate": 3.0950496525864273e-06, "loss": 0.2335, "step": 4015 }, { "epoch": 0.6245723172628305, "grad_norm": 1.5239121613016136, "learning_rate": 3.0927912089712738e-06, "loss": 0.1722, "step": 4016 }, { "epoch": 0.6247278382581648, "grad_norm": 0.947629010523022, "learning_rate": 3.0905332206334567e-06, "loss": 0.1296, "step": 4017 }, { "epoch": 0.6248833592534992, "grad_norm": 1.0975821001287158, "learning_rate": 3.088275688111986e-06, "loss": 0.1596, "step": 4018 }, { "epoch": 0.6250388802488336, "grad_norm": 0.9559975096077269, "learning_rate": 3.0860186119457695e-06, "loss": 0.0915, "step": 4019 }, { "epoch": 0.6251944012441679, "grad_norm": 1.1974693485475545, "learning_rate": 3.0837619926736027e-06, "loss": 0.1523, "step": 4020 }, { "epoch": 0.6253499222395024, "grad_norm": 1.335027206369511, "learning_rate": 3.0815058308341693e-06, "loss": 0.1816, "step": 4021 }, { "epoch": 0.6255054432348367, "grad_norm": 1.1898241633372326, "learning_rate": 3.079250126966051e-06, "loss": 0.1827, "step": 4022 }, { "epoch": 0.625660964230171, "grad_norm": 1.1621685929962164, "learning_rate": 3.0769948816077113e-06, "loss": 0.1605, "step": 4023 }, { "epoch": 0.6258164852255055, "grad_norm": 0.7862540452719574, "learning_rate": 3.0747400952975104e-06, "loss": 0.1398, "step": 4024 }, { "epoch": 0.6259720062208398, "grad_norm": 0.885746682888412, "learning_rate": 3.0724857685736987e-06, "loss": 0.1516, "step": 4025 }, { "epoch": 0.6261275272161742, "grad_norm": 1.2829312863975595, "learning_rate": 3.0702319019744143e-06, "loss": 0.1719, "step": 4026 }, { "epoch": 0.6262830482115086, "grad_norm": 0.833440694085121, "learning_rate": 3.0679784960376845e-06, "loss": 0.1394, "step": 4027 }, { "epoch": 0.6264385692068429, "grad_norm": 1.1055406236861067, "learning_rate": 3.0657255513014317e-06, "loss": 0.1636, "step": 4028 }, { "epoch": 0.6265940902021773, "grad_norm": 1.2606893548746232, "learning_rate": 3.0634730683034653e-06, "loss": 0.1578, "step": 4029 }, { "epoch": 0.6267496111975117, "grad_norm": 1.183761981798936, "learning_rate": 3.061221047581482e-06, "loss": 0.1715, "step": 4030 }, { "epoch": 0.626905132192846, "grad_norm": 1.260006171632546, "learning_rate": 3.0589694896730727e-06, "loss": 0.1715, "step": 4031 }, { "epoch": 0.6270606531881804, "grad_norm": 1.125698700666014, "learning_rate": 3.0567183951157142e-06, "loss": 0.1379, "step": 4032 }, { "epoch": 0.6272161741835148, "grad_norm": 0.902044954274944, "learning_rate": 3.0544677644467736e-06, "loss": 0.1315, "step": 4033 }, { "epoch": 0.6273716951788492, "grad_norm": 0.9433004001762546, "learning_rate": 3.052217598203512e-06, "loss": 0.1427, "step": 4034 }, { "epoch": 0.6275272161741835, "grad_norm": 1.2687482524865659, "learning_rate": 3.0499678969230707e-06, "loss": 0.1533, "step": 4035 }, { "epoch": 0.6276827371695178, "grad_norm": 0.7747334549816114, "learning_rate": 3.047718661142487e-06, "loss": 0.0984, "step": 4036 }, { "epoch": 0.6278382581648523, "grad_norm": 0.6545692277139293, "learning_rate": 3.0454698913986836e-06, "loss": 0.1068, "step": 4037 }, { "epoch": 0.6279937791601866, "grad_norm": 1.7327649229494635, "learning_rate": 3.0432215882284756e-06, "loss": 0.1948, "step": 4038 }, { "epoch": 0.628149300155521, "grad_norm": 0.5690781162533233, "learning_rate": 3.040973752168561e-06, "loss": 0.0752, "step": 4039 }, { "epoch": 0.6283048211508554, "grad_norm": 1.3088166063546975, "learning_rate": 3.038726383755531e-06, "loss": 0.1448, "step": 4040 }, { "epoch": 0.6284603421461897, "grad_norm": 0.7472059195265462, "learning_rate": 3.036479483525866e-06, "loss": 0.1853, "step": 4041 }, { "epoch": 0.6286158631415241, "grad_norm": 1.0147181570312263, "learning_rate": 3.0342330520159303e-06, "loss": 0.1462, "step": 4042 }, { "epoch": 0.6287713841368585, "grad_norm": 1.2575988471722104, "learning_rate": 3.031987089761977e-06, "loss": 0.1761, "step": 4043 }, { "epoch": 0.6289269051321928, "grad_norm": 0.6354403559643552, "learning_rate": 3.0297415973001508e-06, "loss": 0.1, "step": 4044 }, { "epoch": 0.6290824261275272, "grad_norm": 1.0284509183182597, "learning_rate": 3.0274965751664833e-06, "loss": 0.0829, "step": 4045 }, { "epoch": 0.6292379471228616, "grad_norm": 0.7988869647539968, "learning_rate": 3.0252520238968887e-06, "loss": 0.1408, "step": 4046 }, { "epoch": 0.629393468118196, "grad_norm": 0.8675423417886627, "learning_rate": 3.0230079440271763e-06, "loss": 0.1187, "step": 4047 }, { "epoch": 0.6295489891135303, "grad_norm": 1.2347033841046904, "learning_rate": 3.0207643360930366e-06, "loss": 0.2158, "step": 4048 }, { "epoch": 0.6297045101088647, "grad_norm": 1.2319703958421027, "learning_rate": 3.018521200630051e-06, "loss": 0.1761, "step": 4049 }, { "epoch": 0.6298600311041991, "grad_norm": 1.2652771178616276, "learning_rate": 3.0162785381736893e-06, "loss": 0.1484, "step": 4050 }, { "epoch": 0.6300155520995334, "grad_norm": 1.0219335035250554, "learning_rate": 3.014036349259303e-06, "loss": 0.1614, "step": 4051 }, { "epoch": 0.6301710730948679, "grad_norm": 1.140983361836353, "learning_rate": 3.011794634422135e-06, "loss": 0.1437, "step": 4052 }, { "epoch": 0.6303265940902022, "grad_norm": 0.8576491904260584, "learning_rate": 3.0095533941973156e-06, "loss": 0.0876, "step": 4053 }, { "epoch": 0.6304821150855365, "grad_norm": 1.233130766574232, "learning_rate": 3.0073126291198594e-06, "loss": 0.1571, "step": 4054 }, { "epoch": 0.6306376360808709, "grad_norm": 1.0496507970299729, "learning_rate": 3.0050723397246663e-06, "loss": 0.1765, "step": 4055 }, { "epoch": 0.6307931570762053, "grad_norm": 1.1076742581573908, "learning_rate": 3.002832526546525e-06, "loss": 0.1513, "step": 4056 }, { "epoch": 0.6309486780715396, "grad_norm": 1.003808869962046, "learning_rate": 3.0005931901201137e-06, "loss": 0.1624, "step": 4057 }, { "epoch": 0.631104199066874, "grad_norm": 0.7684564728351232, "learning_rate": 2.9983543309799897e-06, "loss": 0.1625, "step": 4058 }, { "epoch": 0.6312597200622084, "grad_norm": 6.766327136136721, "learning_rate": 2.9961159496605986e-06, "loss": 0.1134, "step": 4059 }, { "epoch": 0.6314152410575428, "grad_norm": 1.0904533478000944, "learning_rate": 2.9938780466962768e-06, "loss": 0.149, "step": 4060 }, { "epoch": 0.6315707620528771, "grad_norm": 0.995502406998833, "learning_rate": 2.9916406226212437e-06, "loss": 0.1927, "step": 4061 }, { "epoch": 0.6317262830482115, "grad_norm": 1.5778818428301888, "learning_rate": 2.989403677969598e-06, "loss": 0.1885, "step": 4062 }, { "epoch": 0.6318818040435459, "grad_norm": 1.3854807320456868, "learning_rate": 2.987167213275336e-06, "loss": 0.1464, "step": 4063 }, { "epoch": 0.6320373250388802, "grad_norm": 0.9812643088237749, "learning_rate": 2.9849312290723287e-06, "loss": 0.1249, "step": 4064 }, { "epoch": 0.6321928460342147, "grad_norm": 1.2259044479859358, "learning_rate": 2.9826957258943375e-06, "loss": 0.109, "step": 4065 }, { "epoch": 0.632348367029549, "grad_norm": 1.1340200748162108, "learning_rate": 2.9804607042750107e-06, "loss": 0.1194, "step": 4066 }, { "epoch": 0.6325038880248833, "grad_norm": 1.155693168391778, "learning_rate": 2.978226164747876e-06, "loss": 0.1057, "step": 4067 }, { "epoch": 0.6326594090202178, "grad_norm": 0.6996344725356083, "learning_rate": 2.975992107846349e-06, "loss": 0.0944, "step": 4068 }, { "epoch": 0.6328149300155521, "grad_norm": 1.734925043388634, "learning_rate": 2.9737585341037344e-06, "loss": 0.1444, "step": 4069 }, { "epoch": 0.6329704510108864, "grad_norm": 0.9811169815506973, "learning_rate": 2.9715254440532147e-06, "loss": 0.1734, "step": 4070 }, { "epoch": 0.6331259720062209, "grad_norm": 1.1983979083405545, "learning_rate": 2.9692928382278575e-06, "loss": 0.1456, "step": 4071 }, { "epoch": 0.6332814930015552, "grad_norm": 1.3845019583255926, "learning_rate": 2.967060717160619e-06, "loss": 0.1793, "step": 4072 }, { "epoch": 0.6334370139968896, "grad_norm": 1.057976350678729, "learning_rate": 2.96482908138434e-06, "loss": 0.1514, "step": 4073 }, { "epoch": 0.6335925349922239, "grad_norm": 1.5537452615582217, "learning_rate": 2.96259793143174e-06, "loss": 0.1442, "step": 4074 }, { "epoch": 0.6337480559875583, "grad_norm": 1.1058657848647409, "learning_rate": 2.9603672678354244e-06, "loss": 0.1594, "step": 4075 }, { "epoch": 0.6339035769828927, "grad_norm": 1.448445755282528, "learning_rate": 2.9581370911278872e-06, "loss": 0.14, "step": 4076 }, { "epoch": 0.634059097978227, "grad_norm": 1.4905591044721729, "learning_rate": 2.9559074018415014e-06, "loss": 0.1602, "step": 4077 }, { "epoch": 0.6342146189735615, "grad_norm": 0.9060143356274665, "learning_rate": 2.953678200508523e-06, "loss": 0.1531, "step": 4078 }, { "epoch": 0.6343701399688958, "grad_norm": 0.9960882969852086, "learning_rate": 2.951449487661097e-06, "loss": 0.1491, "step": 4079 }, { "epoch": 0.6345256609642301, "grad_norm": 1.0600836939628078, "learning_rate": 2.9492212638312458e-06, "loss": 0.1568, "step": 4080 }, { "epoch": 0.6346811819595646, "grad_norm": 0.885870962344145, "learning_rate": 2.946993529550876e-06, "loss": 0.1127, "step": 4081 }, { "epoch": 0.6348367029548989, "grad_norm": 1.0067984066270625, "learning_rate": 2.9447662853517835e-06, "loss": 0.1872, "step": 4082 }, { "epoch": 0.6349922239502332, "grad_norm": 0.917337088380226, "learning_rate": 2.942539531765639e-06, "loss": 0.1377, "step": 4083 }, { "epoch": 0.6351477449455677, "grad_norm": 1.3491621347761322, "learning_rate": 2.9403132693239988e-06, "loss": 0.1537, "step": 4084 }, { "epoch": 0.635303265940902, "grad_norm": 1.1615971672954988, "learning_rate": 2.9380874985583074e-06, "loss": 0.1727, "step": 4085 }, { "epoch": 0.6354587869362364, "grad_norm": 1.377973502882053, "learning_rate": 2.9358622199998843e-06, "loss": 0.2139, "step": 4086 }, { "epoch": 0.6356143079315708, "grad_norm": 1.2586856815150556, "learning_rate": 2.9336374341799325e-06, "loss": 0.1517, "step": 4087 }, { "epoch": 0.6357698289269051, "grad_norm": 0.824852048594405, "learning_rate": 2.9314131416295434e-06, "loss": 0.114, "step": 4088 }, { "epoch": 0.6359253499222395, "grad_norm": 0.777832418831284, "learning_rate": 2.9291893428796847e-06, "loss": 0.1642, "step": 4089 }, { "epoch": 0.6360808709175739, "grad_norm": 1.4459217189210158, "learning_rate": 2.9269660384612064e-06, "loss": 0.1699, "step": 4090 }, { "epoch": 0.6362363919129083, "grad_norm": 1.0532322222637682, "learning_rate": 2.924743228904847e-06, "loss": 0.1362, "step": 4091 }, { "epoch": 0.6363919129082426, "grad_norm": 1.4537501351034179, "learning_rate": 2.9225209147412176e-06, "loss": 0.1203, "step": 4092 }, { "epoch": 0.6365474339035769, "grad_norm": 0.8739368848987361, "learning_rate": 2.9202990965008187e-06, "loss": 0.1361, "step": 4093 }, { "epoch": 0.6367029548989114, "grad_norm": 1.0146262341792618, "learning_rate": 2.9180777747140256e-06, "loss": 0.2039, "step": 4094 }, { "epoch": 0.6368584758942457, "grad_norm": 1.733268988991781, "learning_rate": 2.9158569499111033e-06, "loss": 0.1361, "step": 4095 }, { "epoch": 0.63701399688958, "grad_norm": 1.0427550003647366, "learning_rate": 2.9136366226221915e-06, "loss": 0.1546, "step": 4096 }, { "epoch": 0.6371695178849145, "grad_norm": 1.2555723621385348, "learning_rate": 2.9114167933773113e-06, "loss": 0.1923, "step": 4097 }, { "epoch": 0.6373250388802488, "grad_norm": 3.7214267186746133, "learning_rate": 2.909197462706369e-06, "loss": 0.1964, "step": 4098 }, { "epoch": 0.6374805598755832, "grad_norm": 0.9807451470672904, "learning_rate": 2.9069786311391514e-06, "loss": 0.1402, "step": 4099 }, { "epoch": 0.6376360808709176, "grad_norm": 0.8874548657259055, "learning_rate": 2.90476029920532e-06, "loss": 0.0988, "step": 4100 }, { "epoch": 0.6376360808709176, "eval_loss": 0.16973473131656647, "eval_runtime": 9.451, "eval_samples_per_second": 2.751, "eval_steps_per_second": 0.741, "step": 4100 }, { "epoch": 0.637791601866252, "grad_norm": 1.6581584832447396, "learning_rate": 2.9025424674344273e-06, "loss": 0.2076, "step": 4101 }, { "epoch": 0.6379471228615863, "grad_norm": 1.2689984658515878, "learning_rate": 2.900325136355897e-06, "loss": 0.1284, "step": 4102 }, { "epoch": 0.6381026438569207, "grad_norm": 0.9426544635851843, "learning_rate": 2.8981083064990363e-06, "loss": 0.1738, "step": 4103 }, { "epoch": 0.6382581648522551, "grad_norm": 0.7827738619753263, "learning_rate": 2.895891978393036e-06, "loss": 0.1319, "step": 4104 }, { "epoch": 0.6384136858475894, "grad_norm": 0.9312903205286884, "learning_rate": 2.893676152566962e-06, "loss": 0.1393, "step": 4105 }, { "epoch": 0.6385692068429238, "grad_norm": 1.3476092533302007, "learning_rate": 2.8914608295497658e-06, "loss": 0.252, "step": 4106 }, { "epoch": 0.6387247278382582, "grad_norm": 0.9222416246629421, "learning_rate": 2.8892460098702723e-06, "loss": 0.1715, "step": 4107 }, { "epoch": 0.6388802488335925, "grad_norm": 1.1664049631750335, "learning_rate": 2.8870316940571934e-06, "loss": 0.1926, "step": 4108 }, { "epoch": 0.639035769828927, "grad_norm": 1.0902442867838833, "learning_rate": 2.8848178826391136e-06, "loss": 0.2274, "step": 4109 }, { "epoch": 0.6391912908242613, "grad_norm": 0.8075708532614905, "learning_rate": 2.882604576144505e-06, "loss": 0.1294, "step": 4110 }, { "epoch": 0.6393468118195956, "grad_norm": 1.5152639902194176, "learning_rate": 2.880391775101712e-06, "loss": 0.1819, "step": 4111 }, { "epoch": 0.63950233281493, "grad_norm": 0.9635240936565476, "learning_rate": 2.8781794800389605e-06, "loss": 0.1779, "step": 4112 }, { "epoch": 0.6396578538102644, "grad_norm": 1.3327497348118817, "learning_rate": 2.8759676914843558e-06, "loss": 0.1801, "step": 4113 }, { "epoch": 0.6398133748055987, "grad_norm": 1.4385419795334646, "learning_rate": 2.873756409965883e-06, "loss": 0.159, "step": 4114 }, { "epoch": 0.6399688958009331, "grad_norm": 1.368752532929196, "learning_rate": 2.871545636011409e-06, "loss": 0.1573, "step": 4115 }, { "epoch": 0.6401244167962675, "grad_norm": 1.2068077524586065, "learning_rate": 2.8693353701486725e-06, "loss": 0.132, "step": 4116 }, { "epoch": 0.6402799377916019, "grad_norm": 1.1476403925156922, "learning_rate": 2.8671256129052984e-06, "loss": 0.1183, "step": 4117 }, { "epoch": 0.6404354587869362, "grad_norm": 0.6604760912791316, "learning_rate": 2.8649163648087834e-06, "loss": 0.1399, "step": 4118 }, { "epoch": 0.6405909797822706, "grad_norm": 0.6502735326543596, "learning_rate": 2.8627076263865063e-06, "loss": 0.0671, "step": 4119 }, { "epoch": 0.640746500777605, "grad_norm": 0.7258266491189761, "learning_rate": 2.8604993981657247e-06, "loss": 0.1252, "step": 4120 }, { "epoch": 0.6409020217729393, "grad_norm": 1.5004676116920115, "learning_rate": 2.858291680673572e-06, "loss": 0.1519, "step": 4121 }, { "epoch": 0.6410575427682738, "grad_norm": 0.9722959675840543, "learning_rate": 2.856084474437064e-06, "loss": 0.1336, "step": 4122 }, { "epoch": 0.6412130637636081, "grad_norm": 1.0497908721751705, "learning_rate": 2.8538777799830888e-06, "loss": 0.1718, "step": 4123 }, { "epoch": 0.6413685847589424, "grad_norm": 1.4254461506570493, "learning_rate": 2.851671597838418e-06, "loss": 0.1822, "step": 4124 }, { "epoch": 0.6415241057542769, "grad_norm": 1.1445570661556794, "learning_rate": 2.8494659285296934e-06, "loss": 0.1153, "step": 4125 }, { "epoch": 0.6416796267496112, "grad_norm": 1.4780244689379938, "learning_rate": 2.8472607725834446e-06, "loss": 0.2061, "step": 4126 }, { "epoch": 0.6418351477449455, "grad_norm": 2.2085942432542356, "learning_rate": 2.8450561305260705e-06, "loss": 0.161, "step": 4127 }, { "epoch": 0.64199066874028, "grad_norm": 0.6467167076500404, "learning_rate": 2.8428520028838468e-06, "loss": 0.1053, "step": 4128 }, { "epoch": 0.6421461897356143, "grad_norm": 1.0916801525784925, "learning_rate": 2.8406483901829353e-06, "loss": 0.1001, "step": 4129 }, { "epoch": 0.6423017107309487, "grad_norm": 0.8059492875553267, "learning_rate": 2.8384452929493645e-06, "loss": 0.1324, "step": 4130 }, { "epoch": 0.642457231726283, "grad_norm": 1.632440011954782, "learning_rate": 2.8362427117090476e-06, "loss": 0.1948, "step": 4131 }, { "epoch": 0.6426127527216174, "grad_norm": 1.2855777034122493, "learning_rate": 2.834040646987768e-06, "loss": 0.117, "step": 4132 }, { "epoch": 0.6427682737169518, "grad_norm": 0.8290401604666349, "learning_rate": 2.8318390993111938e-06, "loss": 0.1145, "step": 4133 }, { "epoch": 0.6429237947122861, "grad_norm": 1.0077512458300586, "learning_rate": 2.8296380692048624e-06, "loss": 0.1819, "step": 4134 }, { "epoch": 0.6430793157076206, "grad_norm": 0.9287888769386844, "learning_rate": 2.827437557194189e-06, "loss": 0.1492, "step": 4135 }, { "epoch": 0.6432348367029549, "grad_norm": 0.8948608969338693, "learning_rate": 2.825237563804469e-06, "loss": 0.1733, "step": 4136 }, { "epoch": 0.6433903576982892, "grad_norm": 0.8052749056069223, "learning_rate": 2.82303808956087e-06, "loss": 0.135, "step": 4137 }, { "epoch": 0.6435458786936237, "grad_norm": 0.9084163221902293, "learning_rate": 2.8208391349884396e-06, "loss": 0.2042, "step": 4138 }, { "epoch": 0.643701399688958, "grad_norm": 1.183407048088661, "learning_rate": 2.8186407006120946e-06, "loss": 0.1385, "step": 4139 }, { "epoch": 0.6438569206842923, "grad_norm": 1.0255346759038606, "learning_rate": 2.8164427869566367e-06, "loss": 0.2076, "step": 4140 }, { "epoch": 0.6440124416796268, "grad_norm": 1.0225942317272754, "learning_rate": 2.8142453945467346e-06, "loss": 0.1781, "step": 4141 }, { "epoch": 0.6441679626749611, "grad_norm": 1.1938625830128315, "learning_rate": 2.81204852390694e-06, "loss": 0.0967, "step": 4142 }, { "epoch": 0.6443234836702955, "grad_norm": 1.1691341185525663, "learning_rate": 2.8098521755616746e-06, "loss": 0.2272, "step": 4143 }, { "epoch": 0.6444790046656299, "grad_norm": 1.1029658173649213, "learning_rate": 2.8076563500352362e-06, "loss": 0.136, "step": 4144 }, { "epoch": 0.6446345256609642, "grad_norm": 1.4076452663222423, "learning_rate": 2.8054610478518017e-06, "loss": 0.158, "step": 4145 }, { "epoch": 0.6447900466562986, "grad_norm": 1.505530817120426, "learning_rate": 2.8032662695354174e-06, "loss": 0.2403, "step": 4146 }, { "epoch": 0.644945567651633, "grad_norm": 1.164172068600399, "learning_rate": 2.8010720156100085e-06, "loss": 0.1648, "step": 4147 }, { "epoch": 0.6451010886469674, "grad_norm": 0.7927300206916682, "learning_rate": 2.798878286599377e-06, "loss": 0.1247, "step": 4148 }, { "epoch": 0.6452566096423017, "grad_norm": 1.2223043420619968, "learning_rate": 2.796685083027194e-06, "loss": 0.128, "step": 4149 }, { "epoch": 0.645412130637636, "grad_norm": 0.9648983618273342, "learning_rate": 2.7944924054170087e-06, "loss": 0.2331, "step": 4150 }, { "epoch": 0.6455676516329705, "grad_norm": 1.0970406020516166, "learning_rate": 2.79230025429224e-06, "loss": 0.1479, "step": 4151 }, { "epoch": 0.6457231726283048, "grad_norm": 1.0837939128709781, "learning_rate": 2.7901086301761904e-06, "loss": 0.162, "step": 4152 }, { "epoch": 0.6458786936236391, "grad_norm": 0.92279687202624, "learning_rate": 2.7879175335920273e-06, "loss": 0.1511, "step": 4153 }, { "epoch": 0.6460342146189736, "grad_norm": 1.376597156498336, "learning_rate": 2.785726965062799e-06, "loss": 0.125, "step": 4154 }, { "epoch": 0.6461897356143079, "grad_norm": 1.01176723681088, "learning_rate": 2.783536925111422e-06, "loss": 0.204, "step": 4155 }, { "epoch": 0.6463452566096423, "grad_norm": 1.2481409600832079, "learning_rate": 2.7813474142606915e-06, "loss": 0.1885, "step": 4156 }, { "epoch": 0.6465007776049767, "grad_norm": 1.0881951493782882, "learning_rate": 2.779158433033272e-06, "loss": 0.1375, "step": 4157 }, { "epoch": 0.646656298600311, "grad_norm": 0.6809718571503973, "learning_rate": 2.7769699819517073e-06, "loss": 0.0973, "step": 4158 }, { "epoch": 0.6468118195956454, "grad_norm": 1.1000500004377887, "learning_rate": 2.774782061538409e-06, "loss": 0.1579, "step": 4159 }, { "epoch": 0.6469673405909798, "grad_norm": 2.135226642347411, "learning_rate": 2.7725946723156626e-06, "loss": 0.2087, "step": 4160 }, { "epoch": 0.6471228615863142, "grad_norm": 1.2260478156377257, "learning_rate": 2.7704078148056316e-06, "loss": 0.1796, "step": 4161 }, { "epoch": 0.6472783825816485, "grad_norm": 1.0085828654961189, "learning_rate": 2.7682214895303468e-06, "loss": 0.0845, "step": 4162 }, { "epoch": 0.6474339035769829, "grad_norm": 0.9173201931477079, "learning_rate": 2.7660356970117154e-06, "loss": 0.0957, "step": 4163 }, { "epoch": 0.6475894245723173, "grad_norm": 1.0673855201464133, "learning_rate": 2.7638504377715183e-06, "loss": 0.1267, "step": 4164 }, { "epoch": 0.6477449455676516, "grad_norm": 0.8665925184031599, "learning_rate": 2.761665712331406e-06, "loss": 0.0941, "step": 4165 }, { "epoch": 0.647900466562986, "grad_norm": 1.4099277467641387, "learning_rate": 2.759481521212901e-06, "loss": 0.1869, "step": 4166 }, { "epoch": 0.6480559875583204, "grad_norm": 1.1635176366079876, "learning_rate": 2.7572978649374047e-06, "loss": 0.1434, "step": 4167 }, { "epoch": 0.6482115085536547, "grad_norm": 0.8769385640702013, "learning_rate": 2.755114744026183e-06, "loss": 0.1204, "step": 4168 }, { "epoch": 0.6483670295489891, "grad_norm": 0.9288608780529806, "learning_rate": 2.7529321590003755e-06, "loss": 0.1962, "step": 4169 }, { "epoch": 0.6485225505443235, "grad_norm": 0.8345487295398479, "learning_rate": 2.750750110381001e-06, "loss": 0.1764, "step": 4170 }, { "epoch": 0.6486780715396578, "grad_norm": 1.011262838338422, "learning_rate": 2.74856859868894e-06, "loss": 0.1445, "step": 4171 }, { "epoch": 0.6488335925349922, "grad_norm": 0.7176363640608321, "learning_rate": 2.746387624444953e-06, "loss": 0.1565, "step": 4172 }, { "epoch": 0.6489891135303266, "grad_norm": 1.3351977687071972, "learning_rate": 2.7442071881696664e-06, "loss": 0.2364, "step": 4173 }, { "epoch": 0.649144634525661, "grad_norm": 1.3213287649884924, "learning_rate": 2.7420272903835834e-06, "loss": 0.2521, "step": 4174 }, { "epoch": 0.6493001555209953, "grad_norm": 0.6732044460529488, "learning_rate": 2.739847931607075e-06, "loss": 0.105, "step": 4175 }, { "epoch": 0.6494556765163297, "grad_norm": 0.7273714326958732, "learning_rate": 2.7376691123603827e-06, "loss": 0.1361, "step": 4176 }, { "epoch": 0.6496111975116641, "grad_norm": 1.0174411316344416, "learning_rate": 2.7354908331636243e-06, "loss": 0.1551, "step": 4177 }, { "epoch": 0.6497667185069984, "grad_norm": 1.110098093601475, "learning_rate": 2.7333130945367813e-06, "loss": 0.1896, "step": 4178 }, { "epoch": 0.6499222395023329, "grad_norm": 1.3121443276966855, "learning_rate": 2.7311358969997138e-06, "loss": 0.1701, "step": 4179 }, { "epoch": 0.6500777604976672, "grad_norm": 1.1495970074253155, "learning_rate": 2.728959241072149e-06, "loss": 0.1612, "step": 4180 }, { "epoch": 0.6502332814930015, "grad_norm": 1.001676179882145, "learning_rate": 2.7267831272736846e-06, "loss": 0.1071, "step": 4181 }, { "epoch": 0.650388802488336, "grad_norm": 1.387052109613188, "learning_rate": 2.7246075561237867e-06, "loss": 0.2117, "step": 4182 }, { "epoch": 0.6505443234836703, "grad_norm": 0.873720280367497, "learning_rate": 2.7224325281417986e-06, "loss": 0.14, "step": 4183 }, { "epoch": 0.6506998444790046, "grad_norm": 1.3207929215363527, "learning_rate": 2.7202580438469283e-06, "loss": 0.1607, "step": 4184 }, { "epoch": 0.6508553654743391, "grad_norm": 1.1654150914874986, "learning_rate": 2.7180841037582524e-06, "loss": 0.1831, "step": 4185 }, { "epoch": 0.6510108864696734, "grad_norm": 0.87180490857782, "learning_rate": 2.7159107083947254e-06, "loss": 0.1625, "step": 4186 }, { "epoch": 0.6511664074650078, "grad_norm": 1.1642781820727433, "learning_rate": 2.713737858275163e-06, "loss": 0.1449, "step": 4187 }, { "epoch": 0.6513219284603421, "grad_norm": 0.9656139072128972, "learning_rate": 2.7115655539182594e-06, "loss": 0.1597, "step": 4188 }, { "epoch": 0.6514774494556765, "grad_norm": 0.8767092652660037, "learning_rate": 2.7093937958425694e-06, "loss": 0.1891, "step": 4189 }, { "epoch": 0.6516329704510109, "grad_norm": 0.9682341612333679, "learning_rate": 2.7072225845665256e-06, "loss": 0.2193, "step": 4190 }, { "epoch": 0.6517884914463452, "grad_norm": 0.9799403454347709, "learning_rate": 2.705051920608425e-06, "loss": 0.1834, "step": 4191 }, { "epoch": 0.6519440124416797, "grad_norm": 1.2401107002379594, "learning_rate": 2.7028818044864324e-06, "loss": 0.175, "step": 4192 }, { "epoch": 0.652099533437014, "grad_norm": 1.114001662258525, "learning_rate": 2.70071223671859e-06, "loss": 0.2033, "step": 4193 }, { "epoch": 0.6522550544323483, "grad_norm": 0.981779699791757, "learning_rate": 2.6985432178228e-06, "loss": 0.2279, "step": 4194 }, { "epoch": 0.6524105754276828, "grad_norm": 1.0872810344095263, "learning_rate": 2.6963747483168378e-06, "loss": 0.1817, "step": 4195 }, { "epoch": 0.6525660964230171, "grad_norm": 0.8608303209152488, "learning_rate": 2.694206828718351e-06, "loss": 0.1108, "step": 4196 }, { "epoch": 0.6527216174183514, "grad_norm": 1.0869030936555109, "learning_rate": 2.6920394595448496e-06, "loss": 0.2044, "step": 4197 }, { "epoch": 0.6528771384136859, "grad_norm": 1.1143733211861335, "learning_rate": 2.6898726413137122e-06, "loss": 0.1131, "step": 4198 }, { "epoch": 0.6530326594090202, "grad_norm": 1.1587978455528876, "learning_rate": 2.687706374542194e-06, "loss": 0.1752, "step": 4199 }, { "epoch": 0.6531881804043546, "grad_norm": 1.211351152007887, "learning_rate": 2.6855406597474098e-06, "loss": 0.2126, "step": 4200 }, { "epoch": 0.6531881804043546, "eval_loss": 0.16808296740055084, "eval_runtime": 9.4375, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.742, "step": 4200 }, { "epoch": 0.653343701399689, "grad_norm": 0.7637766343450602, "learning_rate": 2.6833754974463444e-06, "loss": 0.0845, "step": 4201 }, { "epoch": 0.6534992223950233, "grad_norm": 0.7079307696698743, "learning_rate": 2.681210888155856e-06, "loss": 0.1524, "step": 4202 }, { "epoch": 0.6536547433903577, "grad_norm": 0.8012423454006898, "learning_rate": 2.679046832392664e-06, "loss": 0.1234, "step": 4203 }, { "epoch": 0.6538102643856921, "grad_norm": 0.8061567166861243, "learning_rate": 2.6768833306733595e-06, "loss": 0.125, "step": 4204 }, { "epoch": 0.6539657853810265, "grad_norm": 1.1233706467944355, "learning_rate": 2.6747203835144026e-06, "loss": 0.1222, "step": 4205 }, { "epoch": 0.6541213063763608, "grad_norm": 1.2547190384567741, "learning_rate": 2.6725579914321175e-06, "loss": 0.1571, "step": 4206 }, { "epoch": 0.6542768273716951, "grad_norm": 1.0743842908445158, "learning_rate": 2.670396154942697e-06, "loss": 0.1371, "step": 4207 }, { "epoch": 0.6544323483670296, "grad_norm": 1.5185429060766238, "learning_rate": 2.6682348745622006e-06, "loss": 0.0977, "step": 4208 }, { "epoch": 0.6545878693623639, "grad_norm": 0.9421916422019911, "learning_rate": 2.6660741508065586e-06, "loss": 0.1473, "step": 4209 }, { "epoch": 0.6547433903576982, "grad_norm": 0.8665543317219616, "learning_rate": 2.6639139841915628e-06, "loss": 0.0961, "step": 4210 }, { "epoch": 0.6548989113530327, "grad_norm": 1.234793364289094, "learning_rate": 2.6617543752328767e-06, "loss": 0.1831, "step": 4211 }, { "epoch": 0.655054432348367, "grad_norm": 0.8078391854064625, "learning_rate": 2.659595324446031e-06, "loss": 0.1349, "step": 4212 }, { "epoch": 0.6552099533437014, "grad_norm": 1.4682996850852383, "learning_rate": 2.6574368323464195e-06, "loss": 0.1069, "step": 4213 }, { "epoch": 0.6553654743390358, "grad_norm": 0.9368052497025612, "learning_rate": 2.6552788994493027e-06, "loss": 0.1319, "step": 4214 }, { "epoch": 0.6555209953343701, "grad_norm": 1.181486221334191, "learning_rate": 2.653121526269812e-06, "loss": 0.1024, "step": 4215 }, { "epoch": 0.6556765163297045, "grad_norm": 1.0316201963598242, "learning_rate": 2.6509647133229422e-06, "loss": 0.2075, "step": 4216 }, { "epoch": 0.6558320373250389, "grad_norm": 0.930389931650913, "learning_rate": 2.6488084611235517e-06, "loss": 0.1315, "step": 4217 }, { "epoch": 0.6559875583203733, "grad_norm": 1.1086038737294182, "learning_rate": 2.646652770186372e-06, "loss": 0.154, "step": 4218 }, { "epoch": 0.6561430793157076, "grad_norm": 1.1677299250846405, "learning_rate": 2.644497641025992e-06, "loss": 0.1193, "step": 4219 }, { "epoch": 0.656298600311042, "grad_norm": 0.7798215812899868, "learning_rate": 2.6423430741568746e-06, "loss": 0.1591, "step": 4220 }, { "epoch": 0.6564541213063764, "grad_norm": 1.0913589224282587, "learning_rate": 2.6401890700933452e-06, "loss": 0.1941, "step": 4221 }, { "epoch": 0.6566096423017107, "grad_norm": 1.2105920622271487, "learning_rate": 2.638035629349594e-06, "loss": 0.2678, "step": 4222 }, { "epoch": 0.6567651632970452, "grad_norm": 0.9238508632886339, "learning_rate": 2.6358827524396746e-06, "loss": 0.1051, "step": 4223 }, { "epoch": 0.6569206842923795, "grad_norm": 0.795175137530294, "learning_rate": 2.6337304398775133e-06, "loss": 0.145, "step": 4224 }, { "epoch": 0.6570762052877138, "grad_norm": 1.1318421582687181, "learning_rate": 2.631578692176895e-06, "loss": 0.1911, "step": 4225 }, { "epoch": 0.6572317262830482, "grad_norm": 1.068851489075724, "learning_rate": 2.6294275098514688e-06, "loss": 0.1541, "step": 4226 }, { "epoch": 0.6573872472783826, "grad_norm": 1.0321101453629522, "learning_rate": 2.6272768934147554e-06, "loss": 0.1285, "step": 4227 }, { "epoch": 0.6575427682737169, "grad_norm": 1.113715373029375, "learning_rate": 2.625126843380138e-06, "loss": 0.1695, "step": 4228 }, { "epoch": 0.6576982892690513, "grad_norm": 0.8562991496374537, "learning_rate": 2.622977360260862e-06, "loss": 0.1279, "step": 4229 }, { "epoch": 0.6578538102643857, "grad_norm": 0.742155328676804, "learning_rate": 2.6208284445700373e-06, "loss": 0.1123, "step": 4230 }, { "epoch": 0.65800933125972, "grad_norm": 1.2868744342033713, "learning_rate": 2.6186800968206426e-06, "loss": 0.1848, "step": 4231 }, { "epoch": 0.6581648522550544, "grad_norm": 1.2150477403782265, "learning_rate": 2.616532317525518e-06, "loss": 0.106, "step": 4232 }, { "epoch": 0.6583203732503888, "grad_norm": 1.0955139119768051, "learning_rate": 2.6143851071973657e-06, "loss": 0.1098, "step": 4233 }, { "epoch": 0.6584758942457232, "grad_norm": 1.4608661732282564, "learning_rate": 2.612238466348759e-06, "loss": 0.2142, "step": 4234 }, { "epoch": 0.6586314152410575, "grad_norm": 1.129880722720207, "learning_rate": 2.610092395492127e-06, "loss": 0.1044, "step": 4235 }, { "epoch": 0.658786936236392, "grad_norm": 1.5482166533530106, "learning_rate": 2.6079468951397685e-06, "loss": 0.1586, "step": 4236 }, { "epoch": 0.6589424572317263, "grad_norm": 1.10316454759234, "learning_rate": 2.605801965803847e-06, "loss": 0.1961, "step": 4237 }, { "epoch": 0.6590979782270606, "grad_norm": 1.6109614661065057, "learning_rate": 2.6036576079963837e-06, "loss": 0.2201, "step": 4238 }, { "epoch": 0.6592534992223951, "grad_norm": 1.402329347091309, "learning_rate": 2.601513822229266e-06, "loss": 0.1926, "step": 4239 }, { "epoch": 0.6594090202177294, "grad_norm": 1.1814375276819604, "learning_rate": 2.5993706090142484e-06, "loss": 0.186, "step": 4240 }, { "epoch": 0.6595645412130637, "grad_norm": 0.8580115797483842, "learning_rate": 2.597227968862945e-06, "loss": 0.1437, "step": 4241 }, { "epoch": 0.6597200622083982, "grad_norm": 1.7297221646784189, "learning_rate": 2.5950859022868306e-06, "loss": 0.2416, "step": 4242 }, { "epoch": 0.6598755832037325, "grad_norm": 1.4638136313986456, "learning_rate": 2.59294440979725e-06, "loss": 0.2007, "step": 4243 }, { "epoch": 0.6600311041990669, "grad_norm": 2.0741617567438477, "learning_rate": 2.5908034919054074e-06, "loss": 0.0843, "step": 4244 }, { "epoch": 0.6601866251944012, "grad_norm": 1.1187152312708115, "learning_rate": 2.588663149122369e-06, "loss": 0.1086, "step": 4245 }, { "epoch": 0.6603421461897356, "grad_norm": 1.5041199927764337, "learning_rate": 2.5865233819590625e-06, "loss": 0.1586, "step": 4246 }, { "epoch": 0.66049766718507, "grad_norm": 1.3507062791133562, "learning_rate": 2.5843841909262833e-06, "loss": 0.1374, "step": 4247 }, { "epoch": 0.6606531881804043, "grad_norm": 0.940546831086624, "learning_rate": 2.5822455765346855e-06, "loss": 0.1249, "step": 4248 }, { "epoch": 0.6608087091757388, "grad_norm": 1.2834041159670277, "learning_rate": 2.5801075392947827e-06, "loss": 0.166, "step": 4249 }, { "epoch": 0.6609642301710731, "grad_norm": 1.1259487637720738, "learning_rate": 2.577970079716959e-06, "loss": 0.1767, "step": 4250 }, { "epoch": 0.6611197511664074, "grad_norm": 1.078379639260632, "learning_rate": 2.5758331983114514e-06, "loss": 0.158, "step": 4251 }, { "epoch": 0.6612752721617419, "grad_norm": 1.3078751594853828, "learning_rate": 2.5736968955883655e-06, "loss": 0.1217, "step": 4252 }, { "epoch": 0.6614307931570762, "grad_norm": 1.3704359532308295, "learning_rate": 2.5715611720576692e-06, "loss": 0.1419, "step": 4253 }, { "epoch": 0.6615863141524105, "grad_norm": 1.1538097591896017, "learning_rate": 2.5694260282291867e-06, "loss": 0.0963, "step": 4254 }, { "epoch": 0.661741835147745, "grad_norm": 1.0834314449212799, "learning_rate": 2.5672914646126046e-06, "loss": 0.1189, "step": 4255 }, { "epoch": 0.6618973561430793, "grad_norm": 1.1359592388118103, "learning_rate": 2.5651574817174774e-06, "loss": 0.1317, "step": 4256 }, { "epoch": 0.6620528771384137, "grad_norm": 1.7191390720974242, "learning_rate": 2.5630240800532135e-06, "loss": 0.1723, "step": 4257 }, { "epoch": 0.6622083981337481, "grad_norm": 1.4157089136288825, "learning_rate": 2.5608912601290857e-06, "loss": 0.1093, "step": 4258 }, { "epoch": 0.6623639191290824, "grad_norm": 0.6995986027947196, "learning_rate": 2.558759022454228e-06, "loss": 0.1082, "step": 4259 }, { "epoch": 0.6625194401244168, "grad_norm": 1.1956850152955174, "learning_rate": 2.5566273675376386e-06, "loss": 0.1978, "step": 4260 }, { "epoch": 0.6626749611197512, "grad_norm": 0.918109510161113, "learning_rate": 2.554496295888168e-06, "loss": 0.1675, "step": 4261 }, { "epoch": 0.6628304821150856, "grad_norm": 1.193762007322642, "learning_rate": 2.5523658080145376e-06, "loss": 0.2428, "step": 4262 }, { "epoch": 0.6629860031104199, "grad_norm": 0.9423612776455994, "learning_rate": 2.5502359044253223e-06, "loss": 0.1607, "step": 4263 }, { "epoch": 0.6631415241057543, "grad_norm": 1.4324500823407231, "learning_rate": 2.548106585628957e-06, "loss": 0.1845, "step": 4264 }, { "epoch": 0.6632970451010887, "grad_norm": 0.8498719415282182, "learning_rate": 2.545977852133745e-06, "loss": 0.1256, "step": 4265 }, { "epoch": 0.663452566096423, "grad_norm": 0.9426754553128992, "learning_rate": 2.543849704447843e-06, "loss": 0.1548, "step": 4266 }, { "epoch": 0.6636080870917573, "grad_norm": 1.0869391511038302, "learning_rate": 2.541722143079266e-06, "loss": 0.0917, "step": 4267 }, { "epoch": 0.6637636080870918, "grad_norm": 0.9698445477556623, "learning_rate": 2.5395951685358954e-06, "loss": 0.1683, "step": 4268 }, { "epoch": 0.6639191290824261, "grad_norm": 1.1127992470614672, "learning_rate": 2.5374687813254716e-06, "loss": 0.1936, "step": 4269 }, { "epoch": 0.6640746500777605, "grad_norm": 0.9261703574434432, "learning_rate": 2.535342981955591e-06, "loss": 0.1558, "step": 4270 }, { "epoch": 0.6642301710730949, "grad_norm": 1.268900920620763, "learning_rate": 2.533217770933709e-06, "loss": 0.1141, "step": 4271 }, { "epoch": 0.6643856920684292, "grad_norm": 0.8480260902763055, "learning_rate": 2.5310931487671488e-06, "loss": 0.1525, "step": 4272 }, { "epoch": 0.6645412130637636, "grad_norm": 0.771557879912697, "learning_rate": 2.5289691159630825e-06, "loss": 0.1437, "step": 4273 }, { "epoch": 0.664696734059098, "grad_norm": 0.888610616598135, "learning_rate": 2.526845673028547e-06, "loss": 0.1826, "step": 4274 }, { "epoch": 0.6648522550544324, "grad_norm": 1.1554186085055886, "learning_rate": 2.5247228204704386e-06, "loss": 0.1794, "step": 4275 }, { "epoch": 0.6650077760497667, "grad_norm": 1.0081333347449493, "learning_rate": 2.5226005587955138e-06, "loss": 0.1244, "step": 4276 }, { "epoch": 0.6651632970451011, "grad_norm": 1.0486423773497957, "learning_rate": 2.5204788885103813e-06, "loss": 0.1686, "step": 4277 }, { "epoch": 0.6653188180404355, "grad_norm": 1.2199370097739066, "learning_rate": 2.5183578101215195e-06, "loss": 0.1756, "step": 4278 }, { "epoch": 0.6654743390357698, "grad_norm": 0.8541680878073211, "learning_rate": 2.5162373241352555e-06, "loss": 0.3097, "step": 4279 }, { "epoch": 0.6656298600311042, "grad_norm": 1.352976123065494, "learning_rate": 2.5141174310577774e-06, "loss": 0.2389, "step": 4280 }, { "epoch": 0.6657853810264386, "grad_norm": 1.1596984193480224, "learning_rate": 2.511998131395138e-06, "loss": 0.2284, "step": 4281 }, { "epoch": 0.6659409020217729, "grad_norm": 0.9583234899762272, "learning_rate": 2.5098794256532404e-06, "loss": 0.1035, "step": 4282 }, { "epoch": 0.6660964230171074, "grad_norm": 0.8319688854268386, "learning_rate": 2.507761314337848e-06, "loss": 0.1233, "step": 4283 }, { "epoch": 0.6662519440124417, "grad_norm": 1.3395709780391127, "learning_rate": 2.5056437979545855e-06, "loss": 0.1375, "step": 4284 }, { "epoch": 0.666407465007776, "grad_norm": 1.1434414099999588, "learning_rate": 2.5035268770089356e-06, "loss": 0.1885, "step": 4285 }, { "epoch": 0.6665629860031104, "grad_norm": 1.2057221778128222, "learning_rate": 2.501410552006234e-06, "loss": 0.1631, "step": 4286 }, { "epoch": 0.6667185069984448, "grad_norm": 0.9300629651079486, "learning_rate": 2.499294823451677e-06, "loss": 0.0882, "step": 4287 }, { "epoch": 0.6668740279937792, "grad_norm": 0.9864990816831994, "learning_rate": 2.49717969185032e-06, "loss": 0.1174, "step": 4288 }, { "epoch": 0.6670295489891135, "grad_norm": 1.0805733390561123, "learning_rate": 2.495065157707074e-06, "loss": 0.169, "step": 4289 }, { "epoch": 0.6671850699844479, "grad_norm": 1.0001878545459213, "learning_rate": 2.492951221526705e-06, "loss": 0.1167, "step": 4290 }, { "epoch": 0.6673405909797823, "grad_norm": 0.8374411337549794, "learning_rate": 2.4908378838138418e-06, "loss": 0.1409, "step": 4291 }, { "epoch": 0.6674961119751166, "grad_norm": 1.1371907680295288, "learning_rate": 2.4887251450729694e-06, "loss": 0.1767, "step": 4292 }, { "epoch": 0.667651632970451, "grad_norm": 1.1058946565896044, "learning_rate": 2.4866130058084236e-06, "loss": 0.1435, "step": 4293 }, { "epoch": 0.6678071539657854, "grad_norm": 1.6484406322370544, "learning_rate": 2.4845014665244054e-06, "loss": 0.2069, "step": 4294 }, { "epoch": 0.6679626749611197, "grad_norm": 1.2737132743752153, "learning_rate": 2.4823905277249672e-06, "loss": 0.1955, "step": 4295 }, { "epoch": 0.6681181959564542, "grad_norm": 1.2829224024790427, "learning_rate": 2.480280189914018e-06, "loss": 0.1665, "step": 4296 }, { "epoch": 0.6682737169517885, "grad_norm": 0.8354009946844844, "learning_rate": 2.478170453595327e-06, "loss": 0.227, "step": 4297 }, { "epoch": 0.6684292379471228, "grad_norm": 1.4545525451544448, "learning_rate": 2.4760613192725175e-06, "loss": 0.1437, "step": 4298 }, { "epoch": 0.6685847589424573, "grad_norm": 1.0571670548159589, "learning_rate": 2.473952787449067e-06, "loss": 0.1329, "step": 4299 }, { "epoch": 0.6687402799377916, "grad_norm": 1.2196345730800615, "learning_rate": 2.4718448586283126e-06, "loss": 0.2117, "step": 4300 }, { "epoch": 0.6687402799377916, "eval_loss": 0.1687493622303009, "eval_runtime": 9.4247, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 4300 }, { "epoch": 0.668895800933126, "grad_norm": 1.0576697584808976, "learning_rate": 2.4697375333134487e-06, "loss": 0.2098, "step": 4301 }, { "epoch": 0.6690513219284604, "grad_norm": 1.2958161522350817, "learning_rate": 2.46763081200752e-06, "loss": 0.1542, "step": 4302 }, { "epoch": 0.6692068429237947, "grad_norm": 0.6522939888429491, "learning_rate": 2.465524695213433e-06, "loss": 0.1192, "step": 4303 }, { "epoch": 0.6693623639191291, "grad_norm": 1.5126854182794545, "learning_rate": 2.4634191834339462e-06, "loss": 0.1753, "step": 4304 }, { "epoch": 0.6695178849144634, "grad_norm": 1.1828319595343906, "learning_rate": 2.4613142771716737e-06, "loss": 0.1313, "step": 4305 }, { "epoch": 0.6696734059097978, "grad_norm": 1.0386845248294643, "learning_rate": 2.4592099769290845e-06, "loss": 0.1648, "step": 4306 }, { "epoch": 0.6698289269051322, "grad_norm": 1.0344212551649405, "learning_rate": 2.4571062832085062e-06, "loss": 0.1426, "step": 4307 }, { "epoch": 0.6699844479004665, "grad_norm": 0.8802551039600756, "learning_rate": 2.4550031965121224e-06, "loss": 0.0906, "step": 4308 }, { "epoch": 0.670139968895801, "grad_norm": 1.1294211290732599, "learning_rate": 2.452900717341965e-06, "loss": 0.1039, "step": 4309 }, { "epoch": 0.6702954898911353, "grad_norm": 0.9509920555225324, "learning_rate": 2.4507988461999283e-06, "loss": 0.1783, "step": 4310 }, { "epoch": 0.6704510108864696, "grad_norm": 0.8580274234537807, "learning_rate": 2.4486975835877568e-06, "loss": 0.155, "step": 4311 }, { "epoch": 0.6706065318818041, "grad_norm": 1.1123539529781745, "learning_rate": 2.4465969300070504e-06, "loss": 0.1605, "step": 4312 }, { "epoch": 0.6707620528771384, "grad_norm": 0.9323544866197916, "learning_rate": 2.444496885959266e-06, "loss": 0.1163, "step": 4313 }, { "epoch": 0.6709175738724728, "grad_norm": 1.2498589563805496, "learning_rate": 2.4423974519457134e-06, "loss": 0.2413, "step": 4314 }, { "epoch": 0.6710730948678072, "grad_norm": 1.291825524220148, "learning_rate": 2.4402986284675536e-06, "loss": 0.2097, "step": 4315 }, { "epoch": 0.6712286158631415, "grad_norm": 1.2940060452956998, "learning_rate": 2.438200416025808e-06, "loss": 0.1518, "step": 4316 }, { "epoch": 0.6713841368584759, "grad_norm": 0.9695071270371622, "learning_rate": 2.4361028151213493e-06, "loss": 0.1763, "step": 4317 }, { "epoch": 0.6715396578538103, "grad_norm": 0.93373610044212, "learning_rate": 2.4340058262549028e-06, "loss": 0.1032, "step": 4318 }, { "epoch": 0.6716951788491446, "grad_norm": 0.762918755630333, "learning_rate": 2.431909449927051e-06, "loss": 0.1093, "step": 4319 }, { "epoch": 0.671850699844479, "grad_norm": 0.7948501627602724, "learning_rate": 2.429813686638227e-06, "loss": 0.1133, "step": 4320 }, { "epoch": 0.6720062208398134, "grad_norm": 1.1311394258214351, "learning_rate": 2.4277185368887167e-06, "loss": 0.1509, "step": 4321 }, { "epoch": 0.6721617418351478, "grad_norm": 1.5756905573667703, "learning_rate": 2.4256240011786645e-06, "loss": 0.2584, "step": 4322 }, { "epoch": 0.6723172628304821, "grad_norm": 0.8958501696351835, "learning_rate": 2.4235300800080634e-06, "loss": 0.1251, "step": 4323 }, { "epoch": 0.6724727838258164, "grad_norm": 1.2570616323895953, "learning_rate": 2.4214367738767635e-06, "loss": 0.2163, "step": 4324 }, { "epoch": 0.6726283048211509, "grad_norm": 1.0597155992018512, "learning_rate": 2.4193440832844635e-06, "loss": 0.168, "step": 4325 }, { "epoch": 0.6727838258164852, "grad_norm": 0.8846581439116575, "learning_rate": 2.4172520087307206e-06, "loss": 0.1859, "step": 4326 }, { "epoch": 0.6729393468118195, "grad_norm": 0.7515951666413669, "learning_rate": 2.415160550714941e-06, "loss": 0.1599, "step": 4327 }, { "epoch": 0.673094867807154, "grad_norm": 1.0465122062597179, "learning_rate": 2.4130697097363837e-06, "loss": 0.1159, "step": 4328 }, { "epoch": 0.6732503888024883, "grad_norm": 0.7718765559862649, "learning_rate": 2.4109794862941637e-06, "loss": 0.1416, "step": 4329 }, { "epoch": 0.6734059097978227, "grad_norm": 0.9186660108759457, "learning_rate": 2.408889880887246e-06, "loss": 0.0876, "step": 4330 }, { "epoch": 0.6735614307931571, "grad_norm": 1.244939089903755, "learning_rate": 2.406800894014446e-06, "loss": 0.2224, "step": 4331 }, { "epoch": 0.6737169517884914, "grad_norm": 0.9326825457436969, "learning_rate": 2.404712526174436e-06, "loss": 0.1053, "step": 4332 }, { "epoch": 0.6738724727838258, "grad_norm": 0.8454426742532524, "learning_rate": 2.40262477786574e-06, "loss": 0.1716, "step": 4333 }, { "epoch": 0.6740279937791602, "grad_norm": 1.1544394977270769, "learning_rate": 2.40053764958673e-06, "loss": 0.1821, "step": 4334 }, { "epoch": 0.6741835147744946, "grad_norm": 1.05367334306896, "learning_rate": 2.3984511418356354e-06, "loss": 0.2088, "step": 4335 }, { "epoch": 0.6743390357698289, "grad_norm": 1.3859447377243814, "learning_rate": 2.396365255110533e-06, "loss": 0.1189, "step": 4336 }, { "epoch": 0.6744945567651633, "grad_norm": 1.1634117745516463, "learning_rate": 2.3942799899093515e-06, "loss": 0.0978, "step": 4337 }, { "epoch": 0.6746500777604977, "grad_norm": 0.8668677583531115, "learning_rate": 2.3921953467298764e-06, "loss": 0.1204, "step": 4338 }, { "epoch": 0.674805598755832, "grad_norm": 1.2701334463802947, "learning_rate": 2.3901113260697366e-06, "loss": 0.1053, "step": 4339 }, { "epoch": 0.6749611197511665, "grad_norm": 1.1908526685724006, "learning_rate": 2.38802792842642e-06, "loss": 0.1445, "step": 4340 }, { "epoch": 0.6751166407465008, "grad_norm": 1.2941933894409903, "learning_rate": 2.385945154297264e-06, "loss": 0.2292, "step": 4341 }, { "epoch": 0.6752721617418351, "grad_norm": 1.0704640286885303, "learning_rate": 2.3838630041794535e-06, "loss": 0.1432, "step": 4342 }, { "epoch": 0.6754276827371695, "grad_norm": 0.8439563119488198, "learning_rate": 2.381781478570028e-06, "loss": 0.1439, "step": 4343 }, { "epoch": 0.6755832037325039, "grad_norm": 1.1151758294657133, "learning_rate": 2.379700577965873e-06, "loss": 0.137, "step": 4344 }, { "epoch": 0.6757387247278382, "grad_norm": 1.2932930672079148, "learning_rate": 2.377620302863734e-06, "loss": 0.2191, "step": 4345 }, { "epoch": 0.6758942457231726, "grad_norm": 1.4184849415149057, "learning_rate": 2.3755406537601993e-06, "loss": 0.2177, "step": 4346 }, { "epoch": 0.676049766718507, "grad_norm": 1.02588180103073, "learning_rate": 2.373461631151707e-06, "loss": 0.109, "step": 4347 }, { "epoch": 0.6762052877138414, "grad_norm": 0.8809368117668627, "learning_rate": 2.371383235534553e-06, "loss": 0.1107, "step": 4348 }, { "epoch": 0.6763608087091757, "grad_norm": 0.9778285624956363, "learning_rate": 2.3693054674048787e-06, "loss": 0.1775, "step": 4349 }, { "epoch": 0.6765163297045101, "grad_norm": 0.9027089904414863, "learning_rate": 2.3672283272586745e-06, "loss": 0.1969, "step": 4350 }, { "epoch": 0.6766718506998445, "grad_norm": 1.3630585349645838, "learning_rate": 2.3651518155917855e-06, "loss": 0.2663, "step": 4351 }, { "epoch": 0.6768273716951788, "grad_norm": 1.2511992148172273, "learning_rate": 2.3630759328999024e-06, "loss": 0.1637, "step": 4352 }, { "epoch": 0.6769828926905133, "grad_norm": 1.9650171973471822, "learning_rate": 2.3610006796785657e-06, "loss": 0.1554, "step": 4353 }, { "epoch": 0.6771384136858476, "grad_norm": 0.9978577203851653, "learning_rate": 2.35892605642317e-06, "loss": 0.1304, "step": 4354 }, { "epoch": 0.6772939346811819, "grad_norm": 0.9768519853457535, "learning_rate": 2.3568520636289543e-06, "loss": 0.1422, "step": 4355 }, { "epoch": 0.6774494556765164, "grad_norm": 1.5579392720051175, "learning_rate": 2.354778701791011e-06, "loss": 0.1482, "step": 4356 }, { "epoch": 0.6776049766718507, "grad_norm": 0.8927574375154105, "learning_rate": 2.3527059714042826e-06, "loss": 0.1824, "step": 4357 }, { "epoch": 0.677760497667185, "grad_norm": 0.6769038343856992, "learning_rate": 2.3506338729635563e-06, "loss": 0.1045, "step": 4358 }, { "epoch": 0.6779160186625195, "grad_norm": 1.1969855164479584, "learning_rate": 2.3485624069634695e-06, "loss": 0.1166, "step": 4359 }, { "epoch": 0.6780715396578538, "grad_norm": 1.0980563761298787, "learning_rate": 2.346491573898513e-06, "loss": 0.1039, "step": 4360 }, { "epoch": 0.6782270606531882, "grad_norm": 1.6592188378655734, "learning_rate": 2.3444213742630234e-06, "loss": 0.1921, "step": 4361 }, { "epoch": 0.6783825816485225, "grad_norm": 1.3872885902398995, "learning_rate": 2.342351808551184e-06, "loss": 0.1473, "step": 4362 }, { "epoch": 0.6785381026438569, "grad_norm": 1.420889152023295, "learning_rate": 2.340282877257029e-06, "loss": 0.1622, "step": 4363 }, { "epoch": 0.6786936236391913, "grad_norm": 0.770512474592418, "learning_rate": 2.3382145808744424e-06, "loss": 0.1449, "step": 4364 }, { "epoch": 0.6788491446345256, "grad_norm": 1.1298124786296702, "learning_rate": 2.3361469198971566e-06, "loss": 0.1437, "step": 4365 }, { "epoch": 0.6790046656298601, "grad_norm": 0.8928482230918432, "learning_rate": 2.3340798948187475e-06, "loss": 0.1515, "step": 4366 }, { "epoch": 0.6791601866251944, "grad_norm": 1.3210672049291061, "learning_rate": 2.332013506132648e-06, "loss": 0.2802, "step": 4367 }, { "epoch": 0.6793157076205287, "grad_norm": 0.7889420029967057, "learning_rate": 2.3299477543321302e-06, "loss": 0.1287, "step": 4368 }, { "epoch": 0.6794712286158632, "grad_norm": 1.225286863663267, "learning_rate": 2.3278826399103173e-06, "loss": 0.1841, "step": 4369 }, { "epoch": 0.6796267496111975, "grad_norm": 1.175082754249653, "learning_rate": 2.3258181633601836e-06, "loss": 0.1403, "step": 4370 }, { "epoch": 0.6797822706065318, "grad_norm": 1.1266924042985595, "learning_rate": 2.3237543251745446e-06, "loss": 0.1153, "step": 4371 }, { "epoch": 0.6799377916018663, "grad_norm": 1.027420753044702, "learning_rate": 2.321691125846071e-06, "loss": 0.1666, "step": 4372 }, { "epoch": 0.6800933125972006, "grad_norm": 1.5151165421058401, "learning_rate": 2.3196285658672774e-06, "loss": 0.1249, "step": 4373 }, { "epoch": 0.680248833592535, "grad_norm": 1.1686344409761702, "learning_rate": 2.317566645730524e-06, "loss": 0.2009, "step": 4374 }, { "epoch": 0.6804043545878694, "grad_norm": 1.0378549690677468, "learning_rate": 2.3155053659280186e-06, "loss": 0.1819, "step": 4375 }, { "epoch": 0.6805598755832037, "grad_norm": 0.8427131152969166, "learning_rate": 2.3134447269518202e-06, "loss": 0.1314, "step": 4376 }, { "epoch": 0.6807153965785381, "grad_norm": 1.2502895681530737, "learning_rate": 2.3113847292938315e-06, "loss": 0.1834, "step": 4377 }, { "epoch": 0.6808709175738725, "grad_norm": 0.9476919147338879, "learning_rate": 2.3093253734457995e-06, "loss": 0.1094, "step": 4378 }, { "epoch": 0.6810264385692069, "grad_norm": 1.341145643107782, "learning_rate": 2.307266659899326e-06, "loss": 0.1562, "step": 4379 }, { "epoch": 0.6811819595645412, "grad_norm": 0.7198916810272196, "learning_rate": 2.30520858914585e-06, "loss": 0.1321, "step": 4380 }, { "epoch": 0.6813374805598755, "grad_norm": 0.7509401813228153, "learning_rate": 2.3031511616766654e-06, "loss": 0.1266, "step": 4381 }, { "epoch": 0.68149300155521, "grad_norm": 1.2385267047023865, "learning_rate": 2.3010943779829065e-06, "loss": 0.2669, "step": 4382 }, { "epoch": 0.6816485225505443, "grad_norm": 1.1270144655541017, "learning_rate": 2.2990382385555587e-06, "loss": 0.144, "step": 4383 }, { "epoch": 0.6818040435458786, "grad_norm": 1.0884956998955082, "learning_rate": 2.2969827438854492e-06, "loss": 0.1284, "step": 4384 }, { "epoch": 0.6819595645412131, "grad_norm": 0.8383664314500181, "learning_rate": 2.294927894463252e-06, "loss": 0.1293, "step": 4385 }, { "epoch": 0.6821150855365474, "grad_norm": 1.1705734463594, "learning_rate": 2.292873690779492e-06, "loss": 0.1991, "step": 4386 }, { "epoch": 0.6822706065318818, "grad_norm": 1.043290884918879, "learning_rate": 2.2908201333245324e-06, "loss": 0.1122, "step": 4387 }, { "epoch": 0.6824261275272162, "grad_norm": 1.2318441553106227, "learning_rate": 2.288767222588587e-06, "loss": 0.1029, "step": 4388 }, { "epoch": 0.6825816485225505, "grad_norm": 1.314878442910586, "learning_rate": 2.2867149590617176e-06, "loss": 0.1482, "step": 4389 }, { "epoch": 0.6827371695178849, "grad_norm": 0.9496779180603032, "learning_rate": 2.2846633432338256e-06, "loss": 0.1469, "step": 4390 }, { "epoch": 0.6828926905132193, "grad_norm": 1.4906700489912226, "learning_rate": 2.282612375594658e-06, "loss": 0.142, "step": 4391 }, { "epoch": 0.6830482115085537, "grad_norm": 0.6115773521022383, "learning_rate": 2.280562056633814e-06, "loss": 0.0623, "step": 4392 }, { "epoch": 0.683203732503888, "grad_norm": 0.8432410759000893, "learning_rate": 2.2785123868407307e-06, "loss": 0.157, "step": 4393 }, { "epoch": 0.6833592534992224, "grad_norm": 0.8591688207696261, "learning_rate": 2.2764633667046908e-06, "loss": 0.1594, "step": 4394 }, { "epoch": 0.6835147744945568, "grad_norm": 1.204933832852645, "learning_rate": 2.274414996714829e-06, "loss": 0.1922, "step": 4395 }, { "epoch": 0.6836702954898911, "grad_norm": 0.8848176021235135, "learning_rate": 2.2723672773601146e-06, "loss": 0.1625, "step": 4396 }, { "epoch": 0.6838258164852256, "grad_norm": 1.0556525252903484, "learning_rate": 2.2703202091293695e-06, "loss": 0.103, "step": 4397 }, { "epoch": 0.6839813374805599, "grad_norm": 1.4586076056749313, "learning_rate": 2.268273792511259e-06, "loss": 0.2563, "step": 4398 }, { "epoch": 0.6841368584758942, "grad_norm": 0.9893831715153024, "learning_rate": 2.2662280279942893e-06, "loss": 0.2235, "step": 4399 }, { "epoch": 0.6842923794712286, "grad_norm": 1.477372991759482, "learning_rate": 2.2641829160668137e-06, "loss": 0.2683, "step": 4400 }, { "epoch": 0.6842923794712286, "eval_loss": 0.16709984838962555, "eval_runtime": 9.4432, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 4400 }, { "epoch": 0.684447900466563, "grad_norm": 0.8552626671830938, "learning_rate": 2.262138457217026e-06, "loss": 0.0986, "step": 4401 }, { "epoch": 0.6846034214618973, "grad_norm": 0.9991100534863135, "learning_rate": 2.2600946519329716e-06, "loss": 0.1269, "step": 4402 }, { "epoch": 0.6847589424572317, "grad_norm": 1.2007080014601943, "learning_rate": 2.2580515007025307e-06, "loss": 0.0648, "step": 4403 }, { "epoch": 0.6849144634525661, "grad_norm": 0.9803933264329527, "learning_rate": 2.2560090040134345e-06, "loss": 0.1427, "step": 4404 }, { "epoch": 0.6850699844479005, "grad_norm": 1.5681457954311218, "learning_rate": 2.2539671623532577e-06, "loss": 0.1459, "step": 4405 }, { "epoch": 0.6852255054432348, "grad_norm": 1.1739700468948433, "learning_rate": 2.251925976209414e-06, "loss": 0.1788, "step": 4406 }, { "epoch": 0.6853810264385692, "grad_norm": 0.9469350359962736, "learning_rate": 2.2498854460691604e-06, "loss": 0.1075, "step": 4407 }, { "epoch": 0.6855365474339036, "grad_norm": 1.1517203223960168, "learning_rate": 2.2478455724196045e-06, "loss": 0.1479, "step": 4408 }, { "epoch": 0.6856920684292379, "grad_norm": 1.0387527089196902, "learning_rate": 2.2458063557476913e-06, "loss": 0.1142, "step": 4409 }, { "epoch": 0.6858475894245724, "grad_norm": 1.0408087918841902, "learning_rate": 2.243767796540207e-06, "loss": 0.1605, "step": 4410 }, { "epoch": 0.6860031104199067, "grad_norm": 0.8048342706880139, "learning_rate": 2.241729895283789e-06, "loss": 0.1516, "step": 4411 }, { "epoch": 0.686158631415241, "grad_norm": 1.3488037561203114, "learning_rate": 2.2396926524649084e-06, "loss": 0.1416, "step": 4412 }, { "epoch": 0.6863141524105755, "grad_norm": 1.8355621811369993, "learning_rate": 2.237656068569885e-06, "loss": 0.1382, "step": 4413 }, { "epoch": 0.6864696734059098, "grad_norm": 1.1823158636887334, "learning_rate": 2.235620144084883e-06, "loss": 0.1378, "step": 4414 }, { "epoch": 0.6866251944012441, "grad_norm": 1.3195542796041335, "learning_rate": 2.2335848794959026e-06, "loss": 0.1956, "step": 4415 }, { "epoch": 0.6867807153965786, "grad_norm": 1.4900808659369325, "learning_rate": 2.231550275288789e-06, "loss": 0.2227, "step": 4416 }, { "epoch": 0.6869362363919129, "grad_norm": 0.9534074875288079, "learning_rate": 2.2295163319492335e-06, "loss": 0.1511, "step": 4417 }, { "epoch": 0.6870917573872473, "grad_norm": 1.359324458027944, "learning_rate": 2.2274830499627657e-06, "loss": 0.2035, "step": 4418 }, { "epoch": 0.6872472783825816, "grad_norm": 1.6140721120649346, "learning_rate": 2.225450429814756e-06, "loss": 0.1119, "step": 4419 }, { "epoch": 0.687402799377916, "grad_norm": 1.264321182982655, "learning_rate": 2.223418471990421e-06, "loss": 0.1848, "step": 4420 }, { "epoch": 0.6875583203732504, "grad_norm": 1.2046867515945123, "learning_rate": 2.22138717697482e-06, "loss": 0.2036, "step": 4421 }, { "epoch": 0.6877138413685847, "grad_norm": 1.3368031661891069, "learning_rate": 2.2193565452528483e-06, "loss": 0.1698, "step": 4422 }, { "epoch": 0.6878693623639192, "grad_norm": 1.1052567594483285, "learning_rate": 2.2173265773092446e-06, "loss": 0.1811, "step": 4423 }, { "epoch": 0.6880248833592535, "grad_norm": 0.9398333446777973, "learning_rate": 2.215297273628595e-06, "loss": 0.1499, "step": 4424 }, { "epoch": 0.6881804043545878, "grad_norm": 0.899964482904394, "learning_rate": 2.2132686346953196e-06, "loss": 0.1525, "step": 4425 }, { "epoch": 0.6883359253499223, "grad_norm": 1.2265534586114022, "learning_rate": 2.2112406609936825e-06, "loss": 0.1654, "step": 4426 }, { "epoch": 0.6884914463452566, "grad_norm": 1.102979594234954, "learning_rate": 2.2092133530077914e-06, "loss": 0.109, "step": 4427 }, { "epoch": 0.6886469673405909, "grad_norm": 1.5742270052158056, "learning_rate": 2.20718671122159e-06, "loss": 0.2577, "step": 4428 }, { "epoch": 0.6888024883359254, "grad_norm": 0.7586677810448738, "learning_rate": 2.2051607361188683e-06, "loss": 0.101, "step": 4429 }, { "epoch": 0.6889580093312597, "grad_norm": 1.4188522286436427, "learning_rate": 2.2031354281832555e-06, "loss": 0.1248, "step": 4430 }, { "epoch": 0.6891135303265941, "grad_norm": 0.9688946603229812, "learning_rate": 2.2011107878982203e-06, "loss": 0.159, "step": 4431 }, { "epoch": 0.6892690513219285, "grad_norm": 0.9993249651431735, "learning_rate": 2.1990868157470695e-06, "loss": 0.1565, "step": 4432 }, { "epoch": 0.6894245723172628, "grad_norm": 0.8293986187665019, "learning_rate": 2.197063512212958e-06, "loss": 0.169, "step": 4433 }, { "epoch": 0.6895800933125972, "grad_norm": 0.9138868617636657, "learning_rate": 2.195040877778874e-06, "loss": 0.1155, "step": 4434 }, { "epoch": 0.6897356143079316, "grad_norm": 1.2342823486539498, "learning_rate": 2.1930189129276476e-06, "loss": 0.1404, "step": 4435 }, { "epoch": 0.689891135303266, "grad_norm": 0.7097570233749165, "learning_rate": 2.190997618141951e-06, "loss": 0.1242, "step": 4436 }, { "epoch": 0.6900466562986003, "grad_norm": 0.8517643637006148, "learning_rate": 2.1889769939042983e-06, "loss": 0.1787, "step": 4437 }, { "epoch": 0.6902021772939346, "grad_norm": 1.5895272299225984, "learning_rate": 2.1869570406970382e-06, "loss": 0.2247, "step": 4438 }, { "epoch": 0.6903576982892691, "grad_norm": 1.1795665454151296, "learning_rate": 2.1849377590023597e-06, "loss": 0.1181, "step": 4439 }, { "epoch": 0.6905132192846034, "grad_norm": 1.2704961005536326, "learning_rate": 2.1829191493022974e-06, "loss": 0.1329, "step": 4440 }, { "epoch": 0.6906687402799377, "grad_norm": 1.3237954877398872, "learning_rate": 2.1809012120787193e-06, "loss": 0.1818, "step": 4441 }, { "epoch": 0.6908242612752722, "grad_norm": 1.1740454139171752, "learning_rate": 2.178883947813334e-06, "loss": 0.1212, "step": 4442 }, { "epoch": 0.6909797822706065, "grad_norm": 1.2835083532679046, "learning_rate": 2.1768673569876942e-06, "loss": 0.2049, "step": 4443 }, { "epoch": 0.6911353032659409, "grad_norm": 1.2205767531472023, "learning_rate": 2.1748514400831843e-06, "loss": 0.218, "step": 4444 }, { "epoch": 0.6912908242612753, "grad_norm": 0.7741210267773792, "learning_rate": 2.1728361975810338e-06, "loss": 0.1044, "step": 4445 }, { "epoch": 0.6914463452566096, "grad_norm": 0.6768432548506254, "learning_rate": 2.1708216299623108e-06, "loss": 0.0611, "step": 4446 }, { "epoch": 0.691601866251944, "grad_norm": 1.3372331003313662, "learning_rate": 2.168807737707919e-06, "loss": 0.154, "step": 4447 }, { "epoch": 0.6917573872472784, "grad_norm": 0.6392744375871331, "learning_rate": 2.166794521298601e-06, "loss": 0.1036, "step": 4448 }, { "epoch": 0.6919129082426128, "grad_norm": 0.9722107704000688, "learning_rate": 2.164781981214943e-06, "loss": 0.1593, "step": 4449 }, { "epoch": 0.6920684292379471, "grad_norm": 0.9767414890154252, "learning_rate": 2.1627701179373645e-06, "loss": 0.1111, "step": 4450 }, { "epoch": 0.6922239502332815, "grad_norm": 1.60455073753883, "learning_rate": 2.160758931946123e-06, "loss": 0.1268, "step": 4451 }, { "epoch": 0.6923794712286159, "grad_norm": 1.412735876604593, "learning_rate": 2.1587484237213195e-06, "loss": 0.145, "step": 4452 }, { "epoch": 0.6925349922239502, "grad_norm": 1.0476466448928046, "learning_rate": 2.156738593742892e-06, "loss": 0.1479, "step": 4453 }, { "epoch": 0.6926905132192847, "grad_norm": 0.8878800044475174, "learning_rate": 2.1547294424906105e-06, "loss": 0.1491, "step": 4454 }, { "epoch": 0.692846034214619, "grad_norm": 0.9039420427490245, "learning_rate": 2.152720970444092e-06, "loss": 0.1629, "step": 4455 }, { "epoch": 0.6930015552099533, "grad_norm": 0.9915707253558252, "learning_rate": 2.1507131780827845e-06, "loss": 0.152, "step": 4456 }, { "epoch": 0.6931570762052877, "grad_norm": 0.9476215143757766, "learning_rate": 2.148706065885976e-06, "loss": 0.0874, "step": 4457 }, { "epoch": 0.6933125972006221, "grad_norm": 1.1721967284145411, "learning_rate": 2.1466996343327906e-06, "loss": 0.1372, "step": 4458 }, { "epoch": 0.6934681181959564, "grad_norm": 1.027954358936779, "learning_rate": 2.1446938839021946e-06, "loss": 0.1174, "step": 4459 }, { "epoch": 0.6936236391912908, "grad_norm": 1.4214533551109192, "learning_rate": 2.142688815072986e-06, "loss": 0.1629, "step": 4460 }, { "epoch": 0.6937791601866252, "grad_norm": 1.129514223963418, "learning_rate": 2.1406844283238033e-06, "loss": 0.1624, "step": 4461 }, { "epoch": 0.6939346811819596, "grad_norm": 0.9635263532271225, "learning_rate": 2.138680724133124e-06, "loss": 0.1728, "step": 4462 }, { "epoch": 0.6940902021772939, "grad_norm": 0.836338186880421, "learning_rate": 2.1366777029792578e-06, "loss": 0.1737, "step": 4463 }, { "epoch": 0.6942457231726283, "grad_norm": 1.1637650248284885, "learning_rate": 2.134675365340353e-06, "loss": 0.0925, "step": 4464 }, { "epoch": 0.6944012441679627, "grad_norm": 1.2754640395099677, "learning_rate": 2.1326737116943993e-06, "loss": 0.1507, "step": 4465 }, { "epoch": 0.694556765163297, "grad_norm": 1.8882967954761984, "learning_rate": 2.1306727425192157e-06, "loss": 0.1868, "step": 4466 }, { "epoch": 0.6947122861586315, "grad_norm": 0.9825922216671582, "learning_rate": 2.1286724582924613e-06, "loss": 0.1318, "step": 4467 }, { "epoch": 0.6948678071539658, "grad_norm": 0.9293553524484802, "learning_rate": 2.126672859491635e-06, "loss": 0.1778, "step": 4468 }, { "epoch": 0.6950233281493001, "grad_norm": 0.9468590287751905, "learning_rate": 2.124673946594065e-06, "loss": 0.1098, "step": 4469 }, { "epoch": 0.6951788491446346, "grad_norm": 1.4443575861899332, "learning_rate": 2.1226757200769225e-06, "loss": 0.2195, "step": 4470 }, { "epoch": 0.6953343701399689, "grad_norm": 1.8009952029435587, "learning_rate": 2.1206781804172128e-06, "loss": 0.1343, "step": 4471 }, { "epoch": 0.6954898911353032, "grad_norm": 1.4182985013897773, "learning_rate": 2.118681328091775e-06, "loss": 0.216, "step": 4472 }, { "epoch": 0.6956454121306377, "grad_norm": 0.7763830806484747, "learning_rate": 2.1166851635772835e-06, "loss": 0.1405, "step": 4473 }, { "epoch": 0.695800933125972, "grad_norm": 2.866942821258645, "learning_rate": 2.1146896873502547e-06, "loss": 0.1216, "step": 4474 }, { "epoch": 0.6959564541213064, "grad_norm": 1.0346260263754208, "learning_rate": 2.1126948998870344e-06, "loss": 0.13, "step": 4475 }, { "epoch": 0.6961119751166407, "grad_norm": 0.9580046267984968, "learning_rate": 2.1107008016638047e-06, "loss": 0.1725, "step": 4476 }, { "epoch": 0.6962674961119751, "grad_norm": 1.1359975932682704, "learning_rate": 2.1087073931565866e-06, "loss": 0.0973, "step": 4477 }, { "epoch": 0.6964230171073095, "grad_norm": 0.8228715280060591, "learning_rate": 2.106714674841235e-06, "loss": 0.1508, "step": 4478 }, { "epoch": 0.6965785381026438, "grad_norm": 1.1516648552339062, "learning_rate": 2.10472264719344e-06, "loss": 0.2125, "step": 4479 }, { "epoch": 0.6967340590979783, "grad_norm": 1.030116968186068, "learning_rate": 2.102731310688723e-06, "loss": 0.1573, "step": 4480 }, { "epoch": 0.6968895800933126, "grad_norm": 1.0852404338083017, "learning_rate": 2.1007406658024478e-06, "loss": 0.1458, "step": 4481 }, { "epoch": 0.6970451010886469, "grad_norm": 0.9691612433574605, "learning_rate": 2.0987507130098073e-06, "loss": 0.1876, "step": 4482 }, { "epoch": 0.6972006220839814, "grad_norm": 1.3223268756732727, "learning_rate": 2.096761452785829e-06, "loss": 0.1368, "step": 4483 }, { "epoch": 0.6973561430793157, "grad_norm": 1.3686254061573853, "learning_rate": 2.0947728856053804e-06, "loss": 0.1211, "step": 4484 }, { "epoch": 0.69751166407465, "grad_norm": 1.0288834911423816, "learning_rate": 2.0927850119431577e-06, "loss": 0.1662, "step": 4485 }, { "epoch": 0.6976671850699845, "grad_norm": 1.22030536949885, "learning_rate": 2.0907978322736943e-06, "loss": 0.1734, "step": 4486 }, { "epoch": 0.6978227060653188, "grad_norm": 1.3717687048552567, "learning_rate": 2.0888113470713612e-06, "loss": 0.1557, "step": 4487 }, { "epoch": 0.6979782270606532, "grad_norm": 1.4602209712178653, "learning_rate": 2.0868255568103573e-06, "loss": 0.2123, "step": 4488 }, { "epoch": 0.6981337480559876, "grad_norm": 1.4377452545961726, "learning_rate": 2.084840461964717e-06, "loss": 0.1721, "step": 4489 }, { "epoch": 0.6982892690513219, "grad_norm": 1.0963277587944045, "learning_rate": 2.0828560630083127e-06, "loss": 0.1331, "step": 4490 }, { "epoch": 0.6984447900466563, "grad_norm": 1.2744468241341078, "learning_rate": 2.080872360414848e-06, "loss": 0.1172, "step": 4491 }, { "epoch": 0.6986003110419907, "grad_norm": 0.7924711032289711, "learning_rate": 2.0788893546578577e-06, "loss": 0.1346, "step": 4492 }, { "epoch": 0.698755832037325, "grad_norm": 1.3247479520792687, "learning_rate": 2.076907046210714e-06, "loss": 0.1524, "step": 4493 }, { "epoch": 0.6989113530326594, "grad_norm": 1.2417187435360038, "learning_rate": 2.0749254355466245e-06, "loss": 0.1368, "step": 4494 }, { "epoch": 0.6990668740279938, "grad_norm": 0.8617185491660335, "learning_rate": 2.072944523138624e-06, "loss": 0.1928, "step": 4495 }, { "epoch": 0.6992223950233282, "grad_norm": 0.9892105163964453, "learning_rate": 2.070964309459586e-06, "loss": 0.1347, "step": 4496 }, { "epoch": 0.6993779160186625, "grad_norm": 0.7402392207662021, "learning_rate": 2.0689847949822144e-06, "loss": 0.0704, "step": 4497 }, { "epoch": 0.6995334370139968, "grad_norm": 1.3462545535786699, "learning_rate": 2.067005980179047e-06, "loss": 0.1721, "step": 4498 }, { "epoch": 0.6996889580093313, "grad_norm": 1.681368334265631, "learning_rate": 2.065027865522452e-06, "loss": 0.1641, "step": 4499 }, { "epoch": 0.6998444790046656, "grad_norm": 0.7205605388596559, "learning_rate": 2.0630504514846372e-06, "loss": 0.1124, "step": 4500 }, { "epoch": 0.6998444790046656, "eval_loss": 0.16485631465911865, "eval_runtime": 9.4466, "eval_samples_per_second": 2.752, "eval_steps_per_second": 0.741, "step": 4500 }, { "epoch": 0.7, "grad_norm": 1.0018889885414415, "learning_rate": 2.061073738537635e-06, "loss": 0.1497, "step": 4501 }, { "epoch": 0.7001555209953344, "grad_norm": 0.9360570300011756, "learning_rate": 2.0590977271533157e-06, "loss": 0.1101, "step": 4502 }, { "epoch": 0.7003110419906687, "grad_norm": 1.1508335903728877, "learning_rate": 2.0571224178033833e-06, "loss": 0.1516, "step": 4503 }, { "epoch": 0.7004665629860031, "grad_norm": 0.7061354611092867, "learning_rate": 2.05514781095937e-06, "loss": 0.1373, "step": 4504 }, { "epoch": 0.7006220839813375, "grad_norm": 1.5622611244384894, "learning_rate": 2.053173907092639e-06, "loss": 0.1763, "step": 4505 }, { "epoch": 0.7007776049766719, "grad_norm": 0.8749718439162775, "learning_rate": 2.051200706674394e-06, "loss": 0.1585, "step": 4506 }, { "epoch": 0.7009331259720062, "grad_norm": 1.1347689578873372, "learning_rate": 2.049228210175662e-06, "loss": 0.1434, "step": 4507 }, { "epoch": 0.7010886469673406, "grad_norm": 0.9173804847962802, "learning_rate": 2.0472564180673048e-06, "loss": 0.1588, "step": 4508 }, { "epoch": 0.701244167962675, "grad_norm": 1.1561301638062123, "learning_rate": 2.045285330820018e-06, "loss": 0.1786, "step": 4509 }, { "epoch": 0.7013996889580093, "grad_norm": 1.3891567203481496, "learning_rate": 2.0433149489043296e-06, "loss": 0.1127, "step": 4510 }, { "epoch": 0.7015552099533438, "grad_norm": 2.035373983915273, "learning_rate": 2.0413452727905936e-06, "loss": 0.1938, "step": 4511 }, { "epoch": 0.7017107309486781, "grad_norm": 0.9278381676454944, "learning_rate": 2.0393763029490027e-06, "loss": 0.2236, "step": 4512 }, { "epoch": 0.7018662519440124, "grad_norm": 0.8881381000266305, "learning_rate": 2.037408039849575e-06, "loss": 0.1578, "step": 4513 }, { "epoch": 0.7020217729393469, "grad_norm": 0.9419333776063396, "learning_rate": 2.0354404839621617e-06, "loss": 0.1661, "step": 4514 }, { "epoch": 0.7021772939346812, "grad_norm": 1.102468054295542, "learning_rate": 2.033473635756449e-06, "loss": 0.2081, "step": 4515 }, { "epoch": 0.7023328149300155, "grad_norm": 1.423762758792734, "learning_rate": 2.0315074957019486e-06, "loss": 0.2662, "step": 4516 }, { "epoch": 0.7024883359253499, "grad_norm": 1.0786051520370283, "learning_rate": 2.029542064268004e-06, "loss": 0.1635, "step": 4517 }, { "epoch": 0.7026438569206843, "grad_norm": 1.1414248723565281, "learning_rate": 2.027577341923794e-06, "loss": 0.205, "step": 4518 }, { "epoch": 0.7027993779160187, "grad_norm": 0.9136837385574679, "learning_rate": 2.0256133291383257e-06, "loss": 0.094, "step": 4519 }, { "epoch": 0.702954898911353, "grad_norm": 1.2421504783473125, "learning_rate": 2.0236500263804355e-06, "loss": 0.1785, "step": 4520 }, { "epoch": 0.7031104199066874, "grad_norm": 1.4841393776263467, "learning_rate": 2.021687434118788e-06, "loss": 0.1805, "step": 4521 }, { "epoch": 0.7032659409020218, "grad_norm": 0.9144213230746508, "learning_rate": 2.019725552821886e-06, "loss": 0.1364, "step": 4522 }, { "epoch": 0.7034214618973561, "grad_norm": 0.7034689923614892, "learning_rate": 2.0177643829580567e-06, "loss": 0.0937, "step": 4523 }, { "epoch": 0.7035769828926906, "grad_norm": 1.6864823244987914, "learning_rate": 2.0158039249954552e-06, "loss": 0.0981, "step": 4524 }, { "epoch": 0.7037325038880249, "grad_norm": 0.9122327435781903, "learning_rate": 2.0138441794020734e-06, "loss": 0.103, "step": 4525 }, { "epoch": 0.7038880248833592, "grad_norm": 1.0076526375863684, "learning_rate": 2.011885146645731e-06, "loss": 0.1429, "step": 4526 }, { "epoch": 0.7040435458786937, "grad_norm": 1.344388997775513, "learning_rate": 2.009926827194073e-06, "loss": 0.1605, "step": 4527 }, { "epoch": 0.704199066874028, "grad_norm": 0.8907778027903134, "learning_rate": 2.007969221514581e-06, "loss": 0.1565, "step": 4528 }, { "epoch": 0.7043545878693623, "grad_norm": 0.8148335406074245, "learning_rate": 2.0060123300745613e-06, "loss": 0.1528, "step": 4529 }, { "epoch": 0.7045101088646968, "grad_norm": 0.9810676024858072, "learning_rate": 2.0040561533411494e-06, "loss": 0.1527, "step": 4530 }, { "epoch": 0.7046656298600311, "grad_norm": 1.2358420137279482, "learning_rate": 2.002100691781316e-06, "loss": 0.1799, "step": 4531 }, { "epoch": 0.7048211508553655, "grad_norm": 0.8924222514036928, "learning_rate": 2.0001459458618533e-06, "loss": 0.1313, "step": 4532 }, { "epoch": 0.7049766718506999, "grad_norm": 1.512479594462006, "learning_rate": 1.9981919160493868e-06, "loss": 0.1713, "step": 4533 }, { "epoch": 0.7051321928460342, "grad_norm": 0.6891261817977348, "learning_rate": 1.9962386028103713e-06, "loss": 0.1501, "step": 4534 }, { "epoch": 0.7052877138413686, "grad_norm": 1.248105794518286, "learning_rate": 1.994286006611092e-06, "loss": 0.1655, "step": 4535 }, { "epoch": 0.7054432348367029, "grad_norm": 1.4934132974107261, "learning_rate": 1.9923341279176596e-06, "loss": 0.2675, "step": 4536 }, { "epoch": 0.7055987558320373, "grad_norm": 1.0752647319971977, "learning_rate": 1.9903829671960117e-06, "loss": 0.1833, "step": 4537 }, { "epoch": 0.7057542768273717, "grad_norm": 0.7856599288966126, "learning_rate": 1.9884325249119226e-06, "loss": 0.1613, "step": 4538 }, { "epoch": 0.705909797822706, "grad_norm": 1.2424832432582555, "learning_rate": 1.986482801530987e-06, "loss": 0.1643, "step": 4539 }, { "epoch": 0.7060653188180405, "grad_norm": 1.147111412331209, "learning_rate": 1.9845337975186297e-06, "loss": 0.1393, "step": 4540 }, { "epoch": 0.7062208398133748, "grad_norm": 1.432469449459083, "learning_rate": 1.982585513340108e-06, "loss": 0.1102, "step": 4541 }, { "epoch": 0.7063763608087091, "grad_norm": 1.0477852570761823, "learning_rate": 1.9806379494605043e-06, "loss": 0.1391, "step": 4542 }, { "epoch": 0.7065318818040436, "grad_norm": 1.0127210483104907, "learning_rate": 1.978691106344727e-06, "loss": 0.1737, "step": 4543 }, { "epoch": 0.7066874027993779, "grad_norm": 0.8519376082287473, "learning_rate": 1.9767449844575187e-06, "loss": 0.0927, "step": 4544 }, { "epoch": 0.7068429237947123, "grad_norm": 1.581003753360095, "learning_rate": 1.974799584263443e-06, "loss": 0.1408, "step": 4545 }, { "epoch": 0.7069984447900467, "grad_norm": 0.8853562671216283, "learning_rate": 1.9728549062268925e-06, "loss": 0.1502, "step": 4546 }, { "epoch": 0.707153965785381, "grad_norm": 4.004219722861764, "learning_rate": 1.9709109508120926e-06, "loss": 0.2456, "step": 4547 }, { "epoch": 0.7073094867807154, "grad_norm": 1.2165185984895674, "learning_rate": 1.968967718483091e-06, "loss": 0.1728, "step": 4548 }, { "epoch": 0.7074650077760498, "grad_norm": 0.9879717842988374, "learning_rate": 1.9670252097037622e-06, "loss": 0.102, "step": 4549 }, { "epoch": 0.7076205287713841, "grad_norm": 1.2498243200907329, "learning_rate": 1.9650834249378125e-06, "loss": 0.9596, "step": 4550 }, { "epoch": 0.7077760497667185, "grad_norm": 1.1082759217657163, "learning_rate": 1.9631423646487746e-06, "loss": 0.139, "step": 4551 }, { "epoch": 0.7079315707620529, "grad_norm": 0.7599638834811892, "learning_rate": 1.961202029300002e-06, "loss": 0.1598, "step": 4552 }, { "epoch": 0.7080870917573873, "grad_norm": 1.3676275092476513, "learning_rate": 1.9592624193546855e-06, "loss": 0.1139, "step": 4553 }, { "epoch": 0.7082426127527216, "grad_norm": 1.2949850313248332, "learning_rate": 1.9573235352758335e-06, "loss": 0.1408, "step": 4554 }, { "epoch": 0.7083981337480559, "grad_norm": 1.1149368942075353, "learning_rate": 1.9553853775262854e-06, "loss": 0.1436, "step": 4555 }, { "epoch": 0.7085536547433904, "grad_norm": 1.1375622713042188, "learning_rate": 1.9534479465687046e-06, "loss": 0.136, "step": 4556 }, { "epoch": 0.7087091757387247, "grad_norm": 0.8516001453334673, "learning_rate": 1.951511242865584e-06, "loss": 0.2084, "step": 4557 }, { "epoch": 0.708864696734059, "grad_norm": 1.2985501514752515, "learning_rate": 1.9495752668792445e-06, "loss": 0.1701, "step": 4558 }, { "epoch": 0.7090202177293935, "grad_norm": 1.126828496058495, "learning_rate": 1.947640019071827e-06, "loss": 0.1829, "step": 4559 }, { "epoch": 0.7091757387247278, "grad_norm": 1.065271468941405, "learning_rate": 1.945705499905305e-06, "loss": 0.1272, "step": 4560 }, { "epoch": 0.7093312597200622, "grad_norm": 0.9456283548751514, "learning_rate": 1.943771709841474e-06, "loss": 0.1138, "step": 4561 }, { "epoch": 0.7094867807153966, "grad_norm": 1.4867738136766095, "learning_rate": 1.9418386493419545e-06, "loss": 0.145, "step": 4562 }, { "epoch": 0.709642301710731, "grad_norm": 0.9922920002464805, "learning_rate": 1.9399063188681987e-06, "loss": 0.139, "step": 4563 }, { "epoch": 0.7097978227060653, "grad_norm": 0.9584250549462505, "learning_rate": 1.9379747188814796e-06, "loss": 0.1753, "step": 4564 }, { "epoch": 0.7099533437013997, "grad_norm": 1.5922387905131676, "learning_rate": 1.9360438498428942e-06, "loss": 0.1005, "step": 4565 }, { "epoch": 0.7101088646967341, "grad_norm": 2.3044048920731104, "learning_rate": 1.93411371221337e-06, "loss": 0.1774, "step": 4566 }, { "epoch": 0.7102643856920684, "grad_norm": 1.1133971409935761, "learning_rate": 1.9321843064536606e-06, "loss": 0.1547, "step": 4567 }, { "epoch": 0.7104199066874028, "grad_norm": 1.038031116246459, "learning_rate": 1.9302556330243372e-06, "loss": 0.1659, "step": 4568 }, { "epoch": 0.7105754276827372, "grad_norm": 2.4809296179942915, "learning_rate": 1.9283276923858048e-06, "loss": 0.181, "step": 4569 }, { "epoch": 0.7107309486780715, "grad_norm": 0.8592588942833261, "learning_rate": 1.926400484998289e-06, "loss": 0.0884, "step": 4570 }, { "epoch": 0.710886469673406, "grad_norm": 1.6487907530798562, "learning_rate": 1.9244740113218376e-06, "loss": 0.1637, "step": 4571 }, { "epoch": 0.7110419906687403, "grad_norm": 1.4376430717203181, "learning_rate": 1.9225482718163315e-06, "loss": 0.1572, "step": 4572 }, { "epoch": 0.7111975116640746, "grad_norm": 0.654474516527469, "learning_rate": 1.9206232669414676e-06, "loss": 0.0999, "step": 4573 }, { "epoch": 0.711353032659409, "grad_norm": 0.8604147978161887, "learning_rate": 1.918698997156775e-06, "loss": 0.1353, "step": 4574 }, { "epoch": 0.7115085536547434, "grad_norm": 0.9342434704484676, "learning_rate": 1.916775462921601e-06, "loss": 0.1598, "step": 4575 }, { "epoch": 0.7116640746500777, "grad_norm": 1.1377999341328582, "learning_rate": 1.9148526646951217e-06, "loss": 0.1145, "step": 4576 }, { "epoch": 0.7118195956454121, "grad_norm": 1.3430791924323482, "learning_rate": 1.912930602936336e-06, "loss": 0.162, "step": 4577 }, { "epoch": 0.7119751166407465, "grad_norm": 1.0341889982231582, "learning_rate": 1.911009278104064e-06, "loss": 0.1978, "step": 4578 }, { "epoch": 0.7121306376360809, "grad_norm": 1.2496709992578903, "learning_rate": 1.9090886906569565e-06, "loss": 0.0983, "step": 4579 }, { "epoch": 0.7122861586314152, "grad_norm": 0.8013991554262484, "learning_rate": 1.9071688410534828e-06, "loss": 0.1322, "step": 4580 }, { "epoch": 0.7124416796267496, "grad_norm": 1.1264395544987054, "learning_rate": 1.9052497297519362e-06, "loss": 0.1742, "step": 4581 }, { "epoch": 0.712597200622084, "grad_norm": 1.0794432155224039, "learning_rate": 1.9033313572104366e-06, "loss": 0.204, "step": 4582 }, { "epoch": 0.7127527216174183, "grad_norm": 0.7096749705721701, "learning_rate": 1.901413723886929e-06, "loss": 0.1167, "step": 4583 }, { "epoch": 0.7129082426127528, "grad_norm": 1.4224251842282922, "learning_rate": 1.8994968302391747e-06, "loss": 0.1355, "step": 4584 }, { "epoch": 0.7130637636080871, "grad_norm": 1.0538821839755372, "learning_rate": 1.8975806767247668e-06, "loss": 0.1161, "step": 4585 }, { "epoch": 0.7132192846034214, "grad_norm": 0.9756898922640862, "learning_rate": 1.8956652638011157e-06, "loss": 0.1509, "step": 4586 }, { "epoch": 0.7133748055987559, "grad_norm": 0.8240430164579899, "learning_rate": 1.8937505919254561e-06, "loss": 0.155, "step": 4587 }, { "epoch": 0.7135303265940902, "grad_norm": 1.3644152477683644, "learning_rate": 1.8918366615548506e-06, "loss": 0.1513, "step": 4588 }, { "epoch": 0.7136858475894245, "grad_norm": 1.219454464127777, "learning_rate": 1.8899234731461763e-06, "loss": 0.1473, "step": 4589 }, { "epoch": 0.713841368584759, "grad_norm": 0.8746603283094898, "learning_rate": 1.8880110271561415e-06, "loss": 0.1569, "step": 4590 }, { "epoch": 0.7139968895800933, "grad_norm": 0.7015560661837315, "learning_rate": 1.8860993240412733e-06, "loss": 0.1281, "step": 4591 }, { "epoch": 0.7141524105754277, "grad_norm": 0.6690295683894842, "learning_rate": 1.8841883642579222e-06, "loss": 0.1252, "step": 4592 }, { "epoch": 0.714307931570762, "grad_norm": 1.1332338550724446, "learning_rate": 1.8822781482622593e-06, "loss": 0.1565, "step": 4593 }, { "epoch": 0.7144634525660964, "grad_norm": 1.1744127648003795, "learning_rate": 1.8803686765102797e-06, "loss": 0.1874, "step": 4594 }, { "epoch": 0.7146189735614308, "grad_norm": 1.0324897109389453, "learning_rate": 1.8784599494578026e-06, "loss": 0.1139, "step": 4595 }, { "epoch": 0.7147744945567651, "grad_norm": 1.1379003461584456, "learning_rate": 1.8765519675604676e-06, "loss": 0.1361, "step": 4596 }, { "epoch": 0.7149300155520996, "grad_norm": 1.267267218076302, "learning_rate": 1.8746447312737343e-06, "loss": 0.1307, "step": 4597 }, { "epoch": 0.7150855365474339, "grad_norm": 1.0015204022541095, "learning_rate": 1.8727382410528877e-06, "loss": 0.1481, "step": 4598 }, { "epoch": 0.7152410575427682, "grad_norm": 1.086692303822751, "learning_rate": 1.8708324973530361e-06, "loss": 0.1675, "step": 4599 }, { "epoch": 0.7153965785381027, "grad_norm": 0.9903060218722843, "learning_rate": 1.8689275006291035e-06, "loss": 0.2138, "step": 4600 }, { "epoch": 0.7153965785381027, "eval_loss": 0.1650892049074173, "eval_runtime": 9.4444, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 4600 }, { "epoch": 0.715552099533437, "grad_norm": 1.15397445012902, "learning_rate": 1.8670232513358427e-06, "loss": 0.1489, "step": 4601 }, { "epoch": 0.7157076205287713, "grad_norm": 1.3832737637602122, "learning_rate": 1.8651197499278228e-06, "loss": 0.1718, "step": 4602 }, { "epoch": 0.7158631415241058, "grad_norm": 0.8641233676083006, "learning_rate": 1.8632169968594338e-06, "loss": 0.1492, "step": 4603 }, { "epoch": 0.7160186625194401, "grad_norm": 0.8507818907284558, "learning_rate": 1.8613149925848939e-06, "loss": 0.0905, "step": 4604 }, { "epoch": 0.7161741835147745, "grad_norm": 0.6800404948296217, "learning_rate": 1.8594137375582334e-06, "loss": 0.1272, "step": 4605 }, { "epoch": 0.7163297045101089, "grad_norm": 1.042049779551716, "learning_rate": 1.8575132322333111e-06, "loss": 0.2212, "step": 4606 }, { "epoch": 0.7164852255054432, "grad_norm": 1.069136936313498, "learning_rate": 1.8556134770638057e-06, "loss": 0.1091, "step": 4607 }, { "epoch": 0.7166407465007776, "grad_norm": 1.0282590392224245, "learning_rate": 1.853714472503213e-06, "loss": 0.0974, "step": 4608 }, { "epoch": 0.716796267496112, "grad_norm": 1.0455395795175115, "learning_rate": 1.8518162190048506e-06, "loss": 0.1599, "step": 4609 }, { "epoch": 0.7169517884914464, "grad_norm": 1.106281146071488, "learning_rate": 1.8499187170218614e-06, "loss": 0.1992, "step": 4610 }, { "epoch": 0.7171073094867807, "grad_norm": 0.9769163955160892, "learning_rate": 1.8480219670072037e-06, "loss": 0.0825, "step": 4611 }, { "epoch": 0.717262830482115, "grad_norm": 1.0831637970871146, "learning_rate": 1.8461259694136586e-06, "loss": 0.2133, "step": 4612 }, { "epoch": 0.7174183514774495, "grad_norm": 1.2502644001477874, "learning_rate": 1.8442307246938245e-06, "loss": 0.23, "step": 4613 }, { "epoch": 0.7175738724727838, "grad_norm": 1.5498386052684854, "learning_rate": 1.842336233300126e-06, "loss": 0.1313, "step": 4614 }, { "epoch": 0.7177293934681181, "grad_norm": 0.9227994739388249, "learning_rate": 1.8404424956848055e-06, "loss": 0.161, "step": 4615 }, { "epoch": 0.7178849144634526, "grad_norm": 1.3722221834645953, "learning_rate": 1.8385495122999203e-06, "loss": 0.1114, "step": 4616 }, { "epoch": 0.7180404354587869, "grad_norm": 1.206490443356825, "learning_rate": 1.8366572835973567e-06, "loss": 0.2333, "step": 4617 }, { "epoch": 0.7181959564541213, "grad_norm": 1.0133623011443706, "learning_rate": 1.834765810028814e-06, "loss": 0.1757, "step": 4618 }, { "epoch": 0.7183514774494557, "grad_norm": 0.7568920915786574, "learning_rate": 1.8328750920458117e-06, "loss": 0.1889, "step": 4619 }, { "epoch": 0.71850699844479, "grad_norm": 0.9078901413395668, "learning_rate": 1.8309851300996934e-06, "loss": 0.1347, "step": 4620 }, { "epoch": 0.7186625194401244, "grad_norm": 1.347356090537096, "learning_rate": 1.8290959246416163e-06, "loss": 0.1579, "step": 4621 }, { "epoch": 0.7188180404354588, "grad_norm": 0.9004151048198076, "learning_rate": 1.8272074761225617e-06, "loss": 0.1647, "step": 4622 }, { "epoch": 0.7189735614307932, "grad_norm": 1.6683473772106923, "learning_rate": 1.8253197849933303e-06, "loss": 0.1435, "step": 4623 }, { "epoch": 0.7191290824261275, "grad_norm": 1.137057514824073, "learning_rate": 1.8234328517045392e-06, "loss": 0.1368, "step": 4624 }, { "epoch": 0.7192846034214619, "grad_norm": 0.9697239409757642, "learning_rate": 1.8215466767066226e-06, "loss": 0.1816, "step": 4625 }, { "epoch": 0.7194401244167963, "grad_norm": 1.032400600524485, "learning_rate": 1.8196612604498414e-06, "loss": 0.1294, "step": 4626 }, { "epoch": 0.7195956454121306, "grad_norm": 1.305038795058337, "learning_rate": 1.8177766033842691e-06, "loss": 0.1916, "step": 4627 }, { "epoch": 0.7197511664074651, "grad_norm": 0.9936570134416011, "learning_rate": 1.8158927059597976e-06, "loss": 0.0767, "step": 4628 }, { "epoch": 0.7199066874027994, "grad_norm": 0.9057189979414394, "learning_rate": 1.814009568626142e-06, "loss": 0.2345, "step": 4629 }, { "epoch": 0.7200622083981337, "grad_norm": 0.6951718000321233, "learning_rate": 1.8121271918328314e-06, "loss": 0.1025, "step": 4630 }, { "epoch": 0.7202177293934681, "grad_norm": 0.9918094003475194, "learning_rate": 1.8102455760292186e-06, "loss": 0.0857, "step": 4631 }, { "epoch": 0.7203732503888025, "grad_norm": 1.3366734481302882, "learning_rate": 1.8083647216644672e-06, "loss": 0.1917, "step": 4632 }, { "epoch": 0.7205287713841368, "grad_norm": 0.9142631000374659, "learning_rate": 1.8064846291875676e-06, "loss": 0.1373, "step": 4633 }, { "epoch": 0.7206842923794712, "grad_norm": 1.189173154618085, "learning_rate": 1.8046052990473224e-06, "loss": 0.0672, "step": 4634 }, { "epoch": 0.7208398133748056, "grad_norm": 0.9934099236126293, "learning_rate": 1.8027267316923514e-06, "loss": 0.16, "step": 4635 }, { "epoch": 0.72099533437014, "grad_norm": 0.8765485722029014, "learning_rate": 1.800848927571099e-06, "loss": 0.1093, "step": 4636 }, { "epoch": 0.7211508553654743, "grad_norm": 1.3730513724033024, "learning_rate": 1.7989718871318195e-06, "loss": 0.1898, "step": 4637 }, { "epoch": 0.7213063763608087, "grad_norm": 1.052447785631237, "learning_rate": 1.7970956108225906e-06, "loss": 0.1444, "step": 4638 }, { "epoch": 0.7214618973561431, "grad_norm": 0.8296329915072237, "learning_rate": 1.7952200990913071e-06, "loss": 0.1317, "step": 4639 }, { "epoch": 0.7216174183514774, "grad_norm": 0.9169335846001785, "learning_rate": 1.793345352385678e-06, "loss": 0.1168, "step": 4640 }, { "epoch": 0.7217729393468119, "grad_norm": 1.319845457516792, "learning_rate": 1.79147137115323e-06, "loss": 0.1288, "step": 4641 }, { "epoch": 0.7219284603421462, "grad_norm": 0.7292909677638076, "learning_rate": 1.7895981558413123e-06, "loss": 0.1394, "step": 4642 }, { "epoch": 0.7220839813374805, "grad_norm": 1.0461497719908444, "learning_rate": 1.7877257068970849e-06, "loss": 0.153, "step": 4643 }, { "epoch": 0.722239502332815, "grad_norm": 1.2377914328715023, "learning_rate": 1.7858540247675266e-06, "loss": 0.1662, "step": 4644 }, { "epoch": 0.7223950233281493, "grad_norm": 1.1276854842551973, "learning_rate": 1.783983109899437e-06, "loss": 0.105, "step": 4645 }, { "epoch": 0.7225505443234836, "grad_norm": 0.9401921316991168, "learning_rate": 1.7821129627394262e-06, "loss": 0.0872, "step": 4646 }, { "epoch": 0.7227060653188181, "grad_norm": 1.6444946751637444, "learning_rate": 1.7802435837339267e-06, "loss": 0.1524, "step": 4647 }, { "epoch": 0.7228615863141524, "grad_norm": 1.3828166827428823, "learning_rate": 1.7783749733291862e-06, "loss": 0.1554, "step": 4648 }, { "epoch": 0.7230171073094868, "grad_norm": 0.8356335748931965, "learning_rate": 1.7765071319712662e-06, "loss": 0.1197, "step": 4649 }, { "epoch": 0.7231726283048211, "grad_norm": 1.1304485537646554, "learning_rate": 1.7746400601060476e-06, "loss": 0.1919, "step": 4650 }, { "epoch": 0.7233281493001555, "grad_norm": 0.9130245706259102, "learning_rate": 1.7727737581792242e-06, "loss": 0.0919, "step": 4651 }, { "epoch": 0.7234836702954899, "grad_norm": 1.1118872817215657, "learning_rate": 1.7709082266363115e-06, "loss": 0.127, "step": 4652 }, { "epoch": 0.7236391912908242, "grad_norm": 1.0938754259122485, "learning_rate": 1.7690434659226346e-06, "loss": 0.1453, "step": 4653 }, { "epoch": 0.7237947122861587, "grad_norm": 0.9120592993794178, "learning_rate": 1.7671794764833395e-06, "loss": 0.127, "step": 4654 }, { "epoch": 0.723950233281493, "grad_norm": 0.9595305580522147, "learning_rate": 1.7653162587633888e-06, "loss": 0.1727, "step": 4655 }, { "epoch": 0.7241057542768273, "grad_norm": 3.8491181576875975, "learning_rate": 1.7634538132075557e-06, "loss": 0.098, "step": 4656 }, { "epoch": 0.7242612752721618, "grad_norm": 1.0751966125757824, "learning_rate": 1.7615921402604314e-06, "loss": 0.1257, "step": 4657 }, { "epoch": 0.7244167962674961, "grad_norm": 1.0453389114043958, "learning_rate": 1.7597312403664257e-06, "loss": 0.1356, "step": 4658 }, { "epoch": 0.7245723172628304, "grad_norm": 0.6671676846472874, "learning_rate": 1.7578711139697607e-06, "loss": 0.0807, "step": 4659 }, { "epoch": 0.7247278382581649, "grad_norm": 1.0846694332398512, "learning_rate": 1.7560117615144717e-06, "loss": 0.1265, "step": 4660 }, { "epoch": 0.7248833592534992, "grad_norm": 0.9751882374451909, "learning_rate": 1.754153183444416e-06, "loss": 0.1245, "step": 4661 }, { "epoch": 0.7250388802488336, "grad_norm": 1.0879759865718945, "learning_rate": 1.7522953802032584e-06, "loss": 0.1847, "step": 4662 }, { "epoch": 0.725194401244168, "grad_norm": 0.7083347590980091, "learning_rate": 1.7504383522344848e-06, "loss": 0.1496, "step": 4663 }, { "epoch": 0.7253499222395023, "grad_norm": 1.2714910783698299, "learning_rate": 1.7485820999813947e-06, "loss": 0.193, "step": 4664 }, { "epoch": 0.7255054432348367, "grad_norm": 1.139098552306346, "learning_rate": 1.7467266238870994e-06, "loss": 0.1789, "step": 4665 }, { "epoch": 0.7256609642301711, "grad_norm": 0.9119766707698675, "learning_rate": 1.7448719243945266e-06, "loss": 0.1105, "step": 4666 }, { "epoch": 0.7258164852255055, "grad_norm": 1.0314517035206534, "learning_rate": 1.743018001946421e-06, "loss": 0.11, "step": 4667 }, { "epoch": 0.7259720062208398, "grad_norm": 0.7365169399544169, "learning_rate": 1.741164856985339e-06, "loss": 0.1401, "step": 4668 }, { "epoch": 0.7261275272161741, "grad_norm": 1.305467113364108, "learning_rate": 1.7393124899536495e-06, "loss": 0.1395, "step": 4669 }, { "epoch": 0.7262830482115086, "grad_norm": 0.8635900537206621, "learning_rate": 1.7374609012935412e-06, "loss": 0.1518, "step": 4670 }, { "epoch": 0.7264385692068429, "grad_norm": 0.9743909417288582, "learning_rate": 1.7356100914470143e-06, "loss": 0.148, "step": 4671 }, { "epoch": 0.7265940902021772, "grad_norm": 0.753301647273116, "learning_rate": 1.7337600608558825e-06, "loss": 0.1049, "step": 4672 }, { "epoch": 0.7267496111975117, "grad_norm": 0.8900878595420585, "learning_rate": 1.731910809961772e-06, "loss": 0.114, "step": 4673 }, { "epoch": 0.726905132192846, "grad_norm": 1.0424021484052521, "learning_rate": 1.7300623392061278e-06, "loss": 0.1727, "step": 4674 }, { "epoch": 0.7270606531881804, "grad_norm": 0.7428680236914553, "learning_rate": 1.7282146490302038e-06, "loss": 0.0785, "step": 4675 }, { "epoch": 0.7272161741835148, "grad_norm": 2.3337330658859843, "learning_rate": 1.7263677398750683e-06, "loss": 0.1963, "step": 4676 }, { "epoch": 0.7273716951788491, "grad_norm": 1.475278177573427, "learning_rate": 1.7245216121816073e-06, "loss": 0.1923, "step": 4677 }, { "epoch": 0.7275272161741835, "grad_norm": 0.6831987219697702, "learning_rate": 1.7226762663905139e-06, "loss": 0.1478, "step": 4678 }, { "epoch": 0.7276827371695179, "grad_norm": 1.9104557295804927, "learning_rate": 1.7208317029422995e-06, "loss": 0.1509, "step": 4679 }, { "epoch": 0.7278382581648523, "grad_norm": 1.3298573277567565, "learning_rate": 1.7189879222772894e-06, "loss": 0.118, "step": 4680 }, { "epoch": 0.7279937791601866, "grad_norm": 0.9404738473429161, "learning_rate": 1.717144924835617e-06, "loss": 0.1865, "step": 4681 }, { "epoch": 0.728149300155521, "grad_norm": 0.7722829247736142, "learning_rate": 1.7153027110572307e-06, "loss": 0.1065, "step": 4682 }, { "epoch": 0.7283048211508554, "grad_norm": 0.9918277067170752, "learning_rate": 1.7134612813818952e-06, "loss": 0.2329, "step": 4683 }, { "epoch": 0.7284603421461897, "grad_norm": 1.4977048576570202, "learning_rate": 1.7116206362491843e-06, "loss": 0.1175, "step": 4684 }, { "epoch": 0.7286158631415242, "grad_norm": 1.1581434898997767, "learning_rate": 1.7097807760984842e-06, "loss": 0.159, "step": 4685 }, { "epoch": 0.7287713841368585, "grad_norm": 1.2096596783934668, "learning_rate": 1.7079417013689963e-06, "loss": 0.2577, "step": 4686 }, { "epoch": 0.7289269051321928, "grad_norm": 0.9646805861827605, "learning_rate": 1.7061034124997356e-06, "loss": 0.1684, "step": 4687 }, { "epoch": 0.7290824261275272, "grad_norm": 1.3103269243361997, "learning_rate": 1.7042659099295255e-06, "loss": 0.1708, "step": 4688 }, { "epoch": 0.7292379471228616, "grad_norm": 1.1454747452200154, "learning_rate": 1.7024291940970012e-06, "loss": 0.1243, "step": 4689 }, { "epoch": 0.7293934681181959, "grad_norm": 1.0183974088474113, "learning_rate": 1.7005932654406165e-06, "loss": 0.168, "step": 4690 }, { "epoch": 0.7295489891135303, "grad_norm": 0.8134707671273607, "learning_rate": 1.6987581243986307e-06, "loss": 0.0979, "step": 4691 }, { "epoch": 0.7297045101088647, "grad_norm": 1.2013631381129761, "learning_rate": 1.6969237714091169e-06, "loss": 0.1127, "step": 4692 }, { "epoch": 0.7298600311041991, "grad_norm": 1.3805398141238312, "learning_rate": 1.6950902069099634e-06, "loss": 0.1336, "step": 4693 }, { "epoch": 0.7300155520995334, "grad_norm": 0.984245521854967, "learning_rate": 1.6932574313388644e-06, "loss": 0.1824, "step": 4694 }, { "epoch": 0.7301710730948678, "grad_norm": 1.2011395080748524, "learning_rate": 1.6914254451333307e-06, "loss": 0.1152, "step": 4695 }, { "epoch": 0.7303265940902022, "grad_norm": 0.7292588175715882, "learning_rate": 1.6895942487306843e-06, "loss": 0.1694, "step": 4696 }, { "epoch": 0.7304821150855365, "grad_norm": 1.032341114587998, "learning_rate": 1.687763842568056e-06, "loss": 0.149, "step": 4697 }, { "epoch": 0.730637636080871, "grad_norm": 1.1198829999456932, "learning_rate": 1.6859342270823875e-06, "loss": 0.2013, "step": 4698 }, { "epoch": 0.7307931570762053, "grad_norm": 1.1375865967662984, "learning_rate": 1.6841054027104375e-06, "loss": 0.124, "step": 4699 }, { "epoch": 0.7309486780715396, "grad_norm": 1.1865084588469168, "learning_rate": 1.682277369888769e-06, "loss": 0.2013, "step": 4700 }, { "epoch": 0.7309486780715396, "eval_loss": 0.16376589238643646, "eval_runtime": 9.4456, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 4700 }, { "epoch": 0.7311041990668741, "grad_norm": 0.7799196080944473, "learning_rate": 1.6804501290537583e-06, "loss": 0.1243, "step": 4701 }, { "epoch": 0.7312597200622084, "grad_norm": 0.9803324545548049, "learning_rate": 1.6786236806415945e-06, "loss": 0.1624, "step": 4702 }, { "epoch": 0.7314152410575427, "grad_norm": 1.1666613168065287, "learning_rate": 1.6767980250882777e-06, "loss": 0.1371, "step": 4703 }, { "epoch": 0.7315707620528772, "grad_norm": 1.2934172331064167, "learning_rate": 1.6749731628296145e-06, "loss": 0.1337, "step": 4704 }, { "epoch": 0.7317262830482115, "grad_norm": 0.9542869911940447, "learning_rate": 1.673149094301228e-06, "loss": 0.1634, "step": 4705 }, { "epoch": 0.7318818040435459, "grad_norm": 1.2035487621107432, "learning_rate": 1.6713258199385474e-06, "loss": 0.2308, "step": 4706 }, { "epoch": 0.7320373250388802, "grad_norm": 0.8826766688280042, "learning_rate": 1.6695033401768136e-06, "loss": 0.1062, "step": 4707 }, { "epoch": 0.7321928460342146, "grad_norm": 1.2237345024455522, "learning_rate": 1.6676816554510762e-06, "loss": 0.1886, "step": 4708 }, { "epoch": 0.732348367029549, "grad_norm": 1.0244376859043156, "learning_rate": 1.6658607661962001e-06, "loss": 0.1567, "step": 4709 }, { "epoch": 0.7325038880248833, "grad_norm": 0.7559357528808817, "learning_rate": 1.6640406728468534e-06, "loss": 0.123, "step": 4710 }, { "epoch": 0.7326594090202178, "grad_norm": 1.2563378735359483, "learning_rate": 1.66222137583752e-06, "loss": 0.1598, "step": 4711 }, { "epoch": 0.7328149300155521, "grad_norm": 1.0284357570533968, "learning_rate": 1.6604028756024926e-06, "loss": 0.1816, "step": 4712 }, { "epoch": 0.7329704510108864, "grad_norm": 0.9226558210368945, "learning_rate": 1.6585851725758711e-06, "loss": 0.1499, "step": 4713 }, { "epoch": 0.7331259720062209, "grad_norm": 1.2308062428577127, "learning_rate": 1.6567682671915653e-06, "loss": 0.1959, "step": 4714 }, { "epoch": 0.7332814930015552, "grad_norm": 1.0038745655052408, "learning_rate": 1.6549521598832985e-06, "loss": 0.1297, "step": 4715 }, { "epoch": 0.7334370139968895, "grad_norm": 3.3639462213946008, "learning_rate": 1.6531368510845992e-06, "loss": 0.1272, "step": 4716 }, { "epoch": 0.733592534992224, "grad_norm": 1.1921811940539924, "learning_rate": 1.651322341228806e-06, "loss": 0.1075, "step": 4717 }, { "epoch": 0.7337480559875583, "grad_norm": 0.8490835075986206, "learning_rate": 1.649508630749068e-06, "loss": 0.0838, "step": 4718 }, { "epoch": 0.7339035769828927, "grad_norm": 0.9603842206493567, "learning_rate": 1.6476957200783466e-06, "loss": 0.1223, "step": 4719 }, { "epoch": 0.7340590979782271, "grad_norm": 0.9819032357753432, "learning_rate": 1.6458836096494046e-06, "loss": 0.1333, "step": 4720 }, { "epoch": 0.7342146189735614, "grad_norm": 0.9273954942706978, "learning_rate": 1.6440722998948211e-06, "loss": 0.112, "step": 4721 }, { "epoch": 0.7343701399688958, "grad_norm": 0.6893112896608466, "learning_rate": 1.6422617912469801e-06, "loss": 0.1155, "step": 4722 }, { "epoch": 0.7345256609642302, "grad_norm": 1.0670457679996763, "learning_rate": 1.6404520841380728e-06, "loss": 0.1217, "step": 4723 }, { "epoch": 0.7346811819595646, "grad_norm": 0.939303103734395, "learning_rate": 1.6386431790001051e-06, "loss": 0.2283, "step": 4724 }, { "epoch": 0.7348367029548989, "grad_norm": 1.3960518791969474, "learning_rate": 1.6368350762648866e-06, "loss": 0.1754, "step": 4725 }, { "epoch": 0.7349922239502333, "grad_norm": 1.4159015710580038, "learning_rate": 1.6350277763640348e-06, "loss": 0.1385, "step": 4726 }, { "epoch": 0.7351477449455677, "grad_norm": 0.9435846407187777, "learning_rate": 1.6332212797289787e-06, "loss": 0.0849, "step": 4727 }, { "epoch": 0.735303265940902, "grad_norm": 1.144664924932682, "learning_rate": 1.631415586790956e-06, "loss": 0.1983, "step": 4728 }, { "epoch": 0.7354587869362363, "grad_norm": 0.6907496251950994, "learning_rate": 1.6296106979810094e-06, "loss": 0.1199, "step": 4729 }, { "epoch": 0.7356143079315708, "grad_norm": 0.9305192529455809, "learning_rate": 1.6278066137299898e-06, "loss": 0.127, "step": 4730 }, { "epoch": 0.7357698289269051, "grad_norm": 1.1532606510293084, "learning_rate": 1.6260033344685594e-06, "loss": 0.2188, "step": 4731 }, { "epoch": 0.7359253499222395, "grad_norm": 1.3555530823510245, "learning_rate": 1.6242008606271853e-06, "loss": 0.1886, "step": 4732 }, { "epoch": 0.7360808709175739, "grad_norm": 1.3065985109108444, "learning_rate": 1.6223991926361416e-06, "loss": 0.2088, "step": 4733 }, { "epoch": 0.7362363919129082, "grad_norm": 1.7346929087932126, "learning_rate": 1.6205983309255125e-06, "loss": 0.205, "step": 4734 }, { "epoch": 0.7363919129082426, "grad_norm": 1.0272107532696482, "learning_rate": 1.6187982759251909e-06, "loss": 0.1248, "step": 4735 }, { "epoch": 0.736547433903577, "grad_norm": 0.6618156023657897, "learning_rate": 1.6169990280648717e-06, "loss": 0.1465, "step": 4736 }, { "epoch": 0.7367029548989114, "grad_norm": 0.8970050006516335, "learning_rate": 1.6152005877740635e-06, "loss": 0.089, "step": 4737 }, { "epoch": 0.7368584758942457, "grad_norm": 0.714581060925677, "learning_rate": 1.613402955482078e-06, "loss": 0.0847, "step": 4738 }, { "epoch": 0.7370139968895801, "grad_norm": 0.7109352913337605, "learning_rate": 1.6116061316180332e-06, "loss": 0.1111, "step": 4739 }, { "epoch": 0.7371695178849145, "grad_norm": 0.9178207138544197, "learning_rate": 1.6098101166108593e-06, "loss": 0.2165, "step": 4740 }, { "epoch": 0.7373250388802488, "grad_norm": 1.1862117910199579, "learning_rate": 1.6080149108892878e-06, "loss": 0.1215, "step": 4741 }, { "epoch": 0.7374805598755833, "grad_norm": 1.1469281856807412, "learning_rate": 1.6062205148818588e-06, "loss": 0.1951, "step": 4742 }, { "epoch": 0.7376360808709176, "grad_norm": 0.9526775359684546, "learning_rate": 1.6044269290169208e-06, "loss": 0.0773, "step": 4743 }, { "epoch": 0.7377916018662519, "grad_norm": 0.8487808346003366, "learning_rate": 1.6026341537226292e-06, "loss": 0.1418, "step": 4744 }, { "epoch": 0.7379471228615864, "grad_norm": 1.0163065538045097, "learning_rate": 1.6008421894269415e-06, "loss": 0.1781, "step": 4745 }, { "epoch": 0.7381026438569207, "grad_norm": 1.308208079901526, "learning_rate": 1.5990510365576277e-06, "loss": 0.0956, "step": 4746 }, { "epoch": 0.738258164852255, "grad_norm": 0.8834044329808994, "learning_rate": 1.5972606955422599e-06, "loss": 0.1442, "step": 4747 }, { "epoch": 0.7384136858475894, "grad_norm": 1.188707788509102, "learning_rate": 1.5954711668082162e-06, "loss": 0.1304, "step": 4748 }, { "epoch": 0.7385692068429238, "grad_norm": 1.1850763346434818, "learning_rate": 1.5936824507826815e-06, "loss": 0.174, "step": 4749 }, { "epoch": 0.7387247278382582, "grad_norm": 0.978958101079245, "learning_rate": 1.5918945478926484e-06, "loss": 0.1589, "step": 4750 }, { "epoch": 0.7388802488335925, "grad_norm": 0.9703936206434888, "learning_rate": 1.5901074585649163e-06, "loss": 0.1203, "step": 4751 }, { "epoch": 0.7390357698289269, "grad_norm": 1.100209487100694, "learning_rate": 1.5883211832260843e-06, "loss": 0.1338, "step": 4752 }, { "epoch": 0.7391912908242613, "grad_norm": 1.9866996100664727, "learning_rate": 1.5865357223025652e-06, "loss": 0.2597, "step": 4753 }, { "epoch": 0.7393468118195956, "grad_norm": 1.200130821968161, "learning_rate": 1.584751076220572e-06, "loss": 0.1066, "step": 4754 }, { "epoch": 0.73950233281493, "grad_norm": 0.8047816315650913, "learning_rate": 1.5829672454061224e-06, "loss": 0.1532, "step": 4755 }, { "epoch": 0.7396578538102644, "grad_norm": 0.9843461991459651, "learning_rate": 1.581184230285045e-06, "loss": 0.118, "step": 4756 }, { "epoch": 0.7398133748055987, "grad_norm": 1.0499984643804037, "learning_rate": 1.5794020312829689e-06, "loss": 0.1517, "step": 4757 }, { "epoch": 0.7399688958009332, "grad_norm": 1.1791048770834867, "learning_rate": 1.5776206488253282e-06, "loss": 0.2536, "step": 4758 }, { "epoch": 0.7401244167962675, "grad_norm": 2.6529223735794867, "learning_rate": 1.5758400833373654e-06, "loss": 0.1062, "step": 4759 }, { "epoch": 0.7402799377916018, "grad_norm": 0.8911696536531238, "learning_rate": 1.5740603352441281e-06, "loss": 0.0923, "step": 4760 }, { "epoch": 0.7404354587869363, "grad_norm": 1.2468987539536127, "learning_rate": 1.5722814049704633e-06, "loss": 0.1552, "step": 4761 }, { "epoch": 0.7405909797822706, "grad_norm": 0.8404877484292913, "learning_rate": 1.5705032929410296e-06, "loss": 0.0828, "step": 4762 }, { "epoch": 0.740746500777605, "grad_norm": 1.0768395343505048, "learning_rate": 1.5687259995802867e-06, "loss": 0.1503, "step": 4763 }, { "epoch": 0.7409020217729394, "grad_norm": 0.8989957391436278, "learning_rate": 1.5669495253124967e-06, "loss": 0.1051, "step": 4764 }, { "epoch": 0.7410575427682737, "grad_norm": 1.2076142231507692, "learning_rate": 1.5651738705617314e-06, "loss": 0.254, "step": 4765 }, { "epoch": 0.7412130637636081, "grad_norm": 0.7914086250377828, "learning_rate": 1.563399035751863e-06, "loss": 0.1011, "step": 4766 }, { "epoch": 0.7413685847589424, "grad_norm": 0.8718299055295512, "learning_rate": 1.561625021306571e-06, "loss": 0.1043, "step": 4767 }, { "epoch": 0.7415241057542769, "grad_norm": 1.2158221120013328, "learning_rate": 1.5598518276493341e-06, "loss": 0.1288, "step": 4768 }, { "epoch": 0.7416796267496112, "grad_norm": 1.428017091517197, "learning_rate": 1.5580794552034428e-06, "loss": 0.125, "step": 4769 }, { "epoch": 0.7418351477449455, "grad_norm": 1.190663602962483, "learning_rate": 1.5563079043919843e-06, "loss": 0.1696, "step": 4770 }, { "epoch": 0.74199066874028, "grad_norm": 1.3707956457351915, "learning_rate": 1.554537175637852e-06, "loss": 0.1923, "step": 4771 }, { "epoch": 0.7421461897356143, "grad_norm": 0.9918507657030989, "learning_rate": 1.5527672693637453e-06, "loss": 0.0743, "step": 4772 }, { "epoch": 0.7423017107309486, "grad_norm": 1.346998511823733, "learning_rate": 1.5509981859921652e-06, "loss": 0.1648, "step": 4773 }, { "epoch": 0.7424572317262831, "grad_norm": 1.0512687867288288, "learning_rate": 1.5492299259454147e-06, "loss": 0.1663, "step": 4774 }, { "epoch": 0.7426127527216174, "grad_norm": 0.9549565528833723, "learning_rate": 1.547462489645603e-06, "loss": 0.0673, "step": 4775 }, { "epoch": 0.7427682737169518, "grad_norm": 0.8666813232801317, "learning_rate": 1.5456958775146446e-06, "loss": 0.1491, "step": 4776 }, { "epoch": 0.7429237947122862, "grad_norm": 1.0714975248075627, "learning_rate": 1.5439300899742505e-06, "loss": 0.1492, "step": 4777 }, { "epoch": 0.7430793157076205, "grad_norm": 1.335417627580456, "learning_rate": 1.5421651274459415e-06, "loss": 0.1494, "step": 4778 }, { "epoch": 0.7432348367029549, "grad_norm": 1.217231962330948, "learning_rate": 1.540400990351038e-06, "loss": 0.2142, "step": 4779 }, { "epoch": 0.7433903576982893, "grad_norm": 1.5974426202606204, "learning_rate": 1.5386376791106627e-06, "loss": 0.162, "step": 4780 }, { "epoch": 0.7435458786936237, "grad_norm": 1.02768771498619, "learning_rate": 1.536875194145745e-06, "loss": 0.1363, "step": 4781 }, { "epoch": 0.743701399688958, "grad_norm": 0.8873249704622823, "learning_rate": 1.535113535877012e-06, "loss": 0.2165, "step": 4782 }, { "epoch": 0.7438569206842924, "grad_norm": 1.286272164963589, "learning_rate": 1.5333527047249992e-06, "loss": 0.184, "step": 4783 }, { "epoch": 0.7440124416796268, "grad_norm": 1.2209591441455947, "learning_rate": 1.5315927011100378e-06, "loss": 0.1101, "step": 4784 }, { "epoch": 0.7441679626749611, "grad_norm": 1.139820811579968, "learning_rate": 1.5298335254522695e-06, "loss": 0.1545, "step": 4785 }, { "epoch": 0.7443234836702954, "grad_norm": 0.8189882414061721, "learning_rate": 1.5280751781716313e-06, "loss": 0.1787, "step": 4786 }, { "epoch": 0.7444790046656299, "grad_norm": 1.7034760885853824, "learning_rate": 1.5263176596878643e-06, "loss": 0.0878, "step": 4787 }, { "epoch": 0.7446345256609642, "grad_norm": 1.361430477848424, "learning_rate": 1.5245609704205161e-06, "loss": 0.0909, "step": 4788 }, { "epoch": 0.7447900466562986, "grad_norm": 1.07817089969399, "learning_rate": 1.5228051107889303e-06, "loss": 0.1165, "step": 4789 }, { "epoch": 0.744945567651633, "grad_norm": 0.8211158057741629, "learning_rate": 1.5210500812122548e-06, "loss": 0.1049, "step": 4790 }, { "epoch": 0.7451010886469673, "grad_norm": 1.07772953818359, "learning_rate": 1.51929588210944e-06, "loss": 0.159, "step": 4791 }, { "epoch": 0.7452566096423017, "grad_norm": 1.2900901022918059, "learning_rate": 1.51754251389924e-06, "loss": 0.1223, "step": 4792 }, { "epoch": 0.7454121306376361, "grad_norm": 0.717465781765538, "learning_rate": 1.5157899770002055e-06, "loss": 0.1494, "step": 4793 }, { "epoch": 0.7455676516329705, "grad_norm": 1.1538746572992948, "learning_rate": 1.5140382718306933e-06, "loss": 0.1437, "step": 4794 }, { "epoch": 0.7457231726283048, "grad_norm": 1.0560778519637724, "learning_rate": 1.5122873988088594e-06, "loss": 0.1322, "step": 4795 }, { "epoch": 0.7458786936236392, "grad_norm": 0.9224640105068801, "learning_rate": 1.5105373583526594e-06, "loss": 0.156, "step": 4796 }, { "epoch": 0.7460342146189736, "grad_norm": 1.169190308618248, "learning_rate": 1.5087881508798564e-06, "loss": 0.1157, "step": 4797 }, { "epoch": 0.7461897356143079, "grad_norm": 1.0542610715919543, "learning_rate": 1.5070397768080063e-06, "loss": 0.1718, "step": 4798 }, { "epoch": 0.7463452566096423, "grad_norm": 1.4744102849910794, "learning_rate": 1.5052922365544741e-06, "loss": 0.1269, "step": 4799 }, { "epoch": 0.7465007776049767, "grad_norm": 0.6684896505792133, "learning_rate": 1.5035455305364188e-06, "loss": 0.0985, "step": 4800 }, { "epoch": 0.7465007776049767, "eval_loss": 0.16456378996372223, "eval_runtime": 9.4617, "eval_samples_per_second": 2.748, "eval_steps_per_second": 0.74, "step": 4800 }, { "epoch": 0.746656298600311, "grad_norm": 1.7891315911765118, "learning_rate": 1.5017996591708073e-06, "loss": 0.1853, "step": 4801 }, { "epoch": 0.7468118195956455, "grad_norm": 0.9390930747530568, "learning_rate": 1.5000546228743989e-06, "loss": 0.1111, "step": 4802 }, { "epoch": 0.7469673405909798, "grad_norm": 0.7350106138782887, "learning_rate": 1.4983104220637623e-06, "loss": 0.145, "step": 4803 }, { "epoch": 0.7471228615863141, "grad_norm": 1.0597292480658829, "learning_rate": 1.496567057155261e-06, "loss": 0.2124, "step": 4804 }, { "epoch": 0.7472783825816485, "grad_norm": 1.0083704241062854, "learning_rate": 1.4948245285650602e-06, "loss": 0.1541, "step": 4805 }, { "epoch": 0.7474339035769829, "grad_norm": 0.7805444110761147, "learning_rate": 1.4930828367091239e-06, "loss": 0.1741, "step": 4806 }, { "epoch": 0.7475894245723173, "grad_norm": 1.4869529512387427, "learning_rate": 1.4913419820032205e-06, "loss": 0.0869, "step": 4807 }, { "epoch": 0.7477449455676516, "grad_norm": 0.8511693493930795, "learning_rate": 1.4896019648629174e-06, "loss": 0.1246, "step": 4808 }, { "epoch": 0.747900466562986, "grad_norm": 0.7847507601255682, "learning_rate": 1.4878627857035777e-06, "loss": 0.155, "step": 4809 }, { "epoch": 0.7480559875583204, "grad_norm": 1.0238066210771986, "learning_rate": 1.4861244449403717e-06, "loss": 0.1229, "step": 4810 }, { "epoch": 0.7482115085536547, "grad_norm": 0.9539379097669529, "learning_rate": 1.484386942988263e-06, "loss": 0.1331, "step": 4811 }, { "epoch": 0.7483670295489891, "grad_norm": 0.8177480892784577, "learning_rate": 1.4826502802620164e-06, "loss": 0.1016, "step": 4812 }, { "epoch": 0.7485225505443235, "grad_norm": 1.353634964816855, "learning_rate": 1.4809144571762001e-06, "loss": 0.1176, "step": 4813 }, { "epoch": 0.7486780715396578, "grad_norm": 1.0827190080181996, "learning_rate": 1.479179474145177e-06, "loss": 0.1395, "step": 4814 }, { "epoch": 0.7488335925349923, "grad_norm": 1.3407514796575009, "learning_rate": 1.4774453315831149e-06, "loss": 0.1715, "step": 4815 }, { "epoch": 0.7489891135303266, "grad_norm": 0.8956591607290386, "learning_rate": 1.475712029903974e-06, "loss": 0.1668, "step": 4816 }, { "epoch": 0.7491446345256609, "grad_norm": 1.3136059482308051, "learning_rate": 1.4739795695215215e-06, "loss": 0.1457, "step": 4817 }, { "epoch": 0.7493001555209954, "grad_norm": 1.1365314102051522, "learning_rate": 1.4722479508493154e-06, "loss": 0.1549, "step": 4818 }, { "epoch": 0.7494556765163297, "grad_norm": 0.8829613772185011, "learning_rate": 1.4705171743007219e-06, "loss": 0.2129, "step": 4819 }, { "epoch": 0.749611197511664, "grad_norm": 0.9643676619858396, "learning_rate": 1.4687872402888991e-06, "loss": 0.1811, "step": 4820 }, { "epoch": 0.7497667185069985, "grad_norm": 1.344990409997875, "learning_rate": 1.467058149226805e-06, "loss": 0.1496, "step": 4821 }, { "epoch": 0.7499222395023328, "grad_norm": 0.9029665451269865, "learning_rate": 1.4653299015272004e-06, "loss": 0.1495, "step": 4822 }, { "epoch": 0.7500777604976672, "grad_norm": 1.2992048808400887, "learning_rate": 1.4636024976026403e-06, "loss": 0.1932, "step": 4823 }, { "epoch": 0.7502332814930015, "grad_norm": 1.3032264506376432, "learning_rate": 1.4618759378654817e-06, "loss": 0.2227, "step": 4824 }, { "epoch": 0.750388802488336, "grad_norm": 1.1296789758828913, "learning_rate": 1.4601502227278762e-06, "loss": 0.1449, "step": 4825 }, { "epoch": 0.7505443234836703, "grad_norm": 1.3420285176357567, "learning_rate": 1.458425352601779e-06, "loss": 0.1589, "step": 4826 }, { "epoch": 0.7506998444790046, "grad_norm": 1.0613761730803712, "learning_rate": 1.456701327898939e-06, "loss": 0.1058, "step": 4827 }, { "epoch": 0.7508553654743391, "grad_norm": 1.2086704021055008, "learning_rate": 1.4549781490309039e-06, "loss": 0.1491, "step": 4828 }, { "epoch": 0.7510108864696734, "grad_norm": 1.151529808843173, "learning_rate": 1.4532558164090226e-06, "loss": 0.1253, "step": 4829 }, { "epoch": 0.7511664074650077, "grad_norm": 1.3749040693347492, "learning_rate": 1.451534330444438e-06, "loss": 0.2152, "step": 4830 }, { "epoch": 0.7513219284603422, "grad_norm": 1.3324271545222897, "learning_rate": 1.4498136915480954e-06, "loss": 0.1759, "step": 4831 }, { "epoch": 0.7514774494556765, "grad_norm": 1.621674671024891, "learning_rate": 1.448093900130732e-06, "loss": 0.149, "step": 4832 }, { "epoch": 0.7516329704510109, "grad_norm": 1.161190461123551, "learning_rate": 1.4463749566028889e-06, "loss": 0.1304, "step": 4833 }, { "epoch": 0.7517884914463453, "grad_norm": 1.3604018493145238, "learning_rate": 1.4446568613748996e-06, "loss": 0.1857, "step": 4834 }, { "epoch": 0.7519440124416796, "grad_norm": 0.9569924821173782, "learning_rate": 1.4429396148568997e-06, "loss": 0.0877, "step": 4835 }, { "epoch": 0.752099533437014, "grad_norm": 1.2709938777469127, "learning_rate": 1.4412232174588186e-06, "loss": 0.122, "step": 4836 }, { "epoch": 0.7522550544323484, "grad_norm": 1.644415924339911, "learning_rate": 1.4395076695903831e-06, "loss": 0.1336, "step": 4837 }, { "epoch": 0.7524105754276827, "grad_norm": 0.8808824318487543, "learning_rate": 1.4377929716611211e-06, "loss": 0.1151, "step": 4838 }, { "epoch": 0.7525660964230171, "grad_norm": 1.1904284698229148, "learning_rate": 1.4360791240803523e-06, "loss": 0.1995, "step": 4839 }, { "epoch": 0.7527216174183515, "grad_norm": 0.9632810520118924, "learning_rate": 1.4343661272571967e-06, "loss": 0.2192, "step": 4840 }, { "epoch": 0.7528771384136859, "grad_norm": 0.7355361790497928, "learning_rate": 1.432653981600573e-06, "loss": 0.1505, "step": 4841 }, { "epoch": 0.7530326594090202, "grad_norm": 1.2050174525260375, "learning_rate": 1.4309426875191917e-06, "loss": 0.1849, "step": 4842 }, { "epoch": 0.7531881804043545, "grad_norm": 0.8670033555052911, "learning_rate": 1.4292322454215634e-06, "loss": 0.1016, "step": 4843 }, { "epoch": 0.753343701399689, "grad_norm": 0.7650236534958725, "learning_rate": 1.4275226557159927e-06, "loss": 0.1444, "step": 4844 }, { "epoch": 0.7534992223950233, "grad_norm": 1.015274288529243, "learning_rate": 1.4258139188105858e-06, "loss": 0.1684, "step": 4845 }, { "epoch": 0.7536547433903577, "grad_norm": 1.4651955495958553, "learning_rate": 1.4241060351132386e-06, "loss": 0.1486, "step": 4846 }, { "epoch": 0.7538102643856921, "grad_norm": 0.7446585835921073, "learning_rate": 1.4223990050316494e-06, "loss": 0.0985, "step": 4847 }, { "epoch": 0.7539657853810264, "grad_norm": 1.5581334395307767, "learning_rate": 1.420692828973308e-06, "loss": 0.1807, "step": 4848 }, { "epoch": 0.7541213063763608, "grad_norm": 1.2460902756394572, "learning_rate": 1.4189875073455051e-06, "loss": 0.1923, "step": 4849 }, { "epoch": 0.7542768273716952, "grad_norm": 1.0070484367487307, "learning_rate": 1.4172830405553216e-06, "loss": 0.1407, "step": 4850 }, { "epoch": 0.7544323483670295, "grad_norm": 1.2375775565874008, "learning_rate": 1.4155794290096404e-06, "loss": 0.2091, "step": 4851 }, { "epoch": 0.7545878693623639, "grad_norm": 0.8493993540440667, "learning_rate": 1.4138766731151365e-06, "loss": 0.1453, "step": 4852 }, { "epoch": 0.7547433903576983, "grad_norm": 0.9865288111333512, "learning_rate": 1.4121747732782788e-06, "loss": 0.2089, "step": 4853 }, { "epoch": 0.7548989113530327, "grad_norm": 1.2709808847817152, "learning_rate": 1.4104737299053384e-06, "loss": 0.2205, "step": 4854 }, { "epoch": 0.755054432348367, "grad_norm": 1.065923960845324, "learning_rate": 1.4087735434023748e-06, "loss": 0.1086, "step": 4855 }, { "epoch": 0.7552099533437014, "grad_norm": 1.2244379448852702, "learning_rate": 1.407074214175248e-06, "loss": 0.2693, "step": 4856 }, { "epoch": 0.7553654743390358, "grad_norm": 0.815599663767941, "learning_rate": 1.4053757426296127e-06, "loss": 0.137, "step": 4857 }, { "epoch": 0.7555209953343701, "grad_norm": 1.0705070338386804, "learning_rate": 1.4036781291709168e-06, "loss": 0.235, "step": 4858 }, { "epoch": 0.7556765163297046, "grad_norm": 0.8804431752408354, "learning_rate": 1.401981374204402e-06, "loss": 0.1381, "step": 4859 }, { "epoch": 0.7558320373250389, "grad_norm": 0.8356819588855672, "learning_rate": 1.4002854781351104e-06, "loss": 0.1966, "step": 4860 }, { "epoch": 0.7559875583203732, "grad_norm": 1.15488031450221, "learning_rate": 1.3985904413678757e-06, "loss": 0.0801, "step": 4861 }, { "epoch": 0.7561430793157076, "grad_norm": 1.1120179308480729, "learning_rate": 1.3968962643073242e-06, "loss": 0.1335, "step": 4862 }, { "epoch": 0.756298600311042, "grad_norm": 0.9622617673128306, "learning_rate": 1.3952029473578833e-06, "loss": 0.1339, "step": 4863 }, { "epoch": 0.7564541213063763, "grad_norm": 0.6267635851441599, "learning_rate": 1.3935104909237678e-06, "loss": 0.1338, "step": 4864 }, { "epoch": 0.7566096423017107, "grad_norm": 1.1169745206587787, "learning_rate": 1.3918188954089939e-06, "loss": 0.1197, "step": 4865 }, { "epoch": 0.7567651632970451, "grad_norm": 1.0074105820762949, "learning_rate": 1.3901281612173656e-06, "loss": 0.1103, "step": 4866 }, { "epoch": 0.7569206842923795, "grad_norm": 0.9286071286810654, "learning_rate": 1.388438288752489e-06, "loss": 0.1538, "step": 4867 }, { "epoch": 0.7570762052877138, "grad_norm": 1.105868354748469, "learning_rate": 1.386749278417757e-06, "loss": 0.09, "step": 4868 }, { "epoch": 0.7572317262830482, "grad_norm": 0.8744533167049136, "learning_rate": 1.3850611306163597e-06, "loss": 0.1625, "step": 4869 }, { "epoch": 0.7573872472783826, "grad_norm": 1.0187360986235985, "learning_rate": 1.3833738457512842e-06, "loss": 0.1052, "step": 4870 }, { "epoch": 0.7575427682737169, "grad_norm": 1.217536543021923, "learning_rate": 1.3816874242253054e-06, "loss": 0.1512, "step": 4871 }, { "epoch": 0.7576982892690514, "grad_norm": 0.8525947055770187, "learning_rate": 1.3800018664409974e-06, "loss": 0.1176, "step": 4872 }, { "epoch": 0.7578538102643857, "grad_norm": 0.7622790500156104, "learning_rate": 1.3783171728007289e-06, "loss": 0.1279, "step": 4873 }, { "epoch": 0.75800933125972, "grad_norm": 1.356079454379559, "learning_rate": 1.376633343706657e-06, "loss": 0.14, "step": 4874 }, { "epoch": 0.7581648522550545, "grad_norm": 0.8450791699408621, "learning_rate": 1.3749503795607338e-06, "loss": 0.1337, "step": 4875 }, { "epoch": 0.7583203732503888, "grad_norm": 1.0466790770150878, "learning_rate": 1.3732682807647096e-06, "loss": 0.1344, "step": 4876 }, { "epoch": 0.7584758942457231, "grad_norm": 1.4296455289536676, "learning_rate": 1.3715870477201237e-06, "loss": 0.231, "step": 4877 }, { "epoch": 0.7586314152410576, "grad_norm": 1.025074694900758, "learning_rate": 1.369906680828308e-06, "loss": 0.191, "step": 4878 }, { "epoch": 0.7587869362363919, "grad_norm": 0.7211111915224813, "learning_rate": 1.3682271804903918e-06, "loss": 0.1342, "step": 4879 }, { "epoch": 0.7589424572317263, "grad_norm": 0.8467733441724873, "learning_rate": 1.3665485471072937e-06, "loss": 0.1008, "step": 4880 }, { "epoch": 0.7590979782270606, "grad_norm": 1.733832213848631, "learning_rate": 1.3648707810797291e-06, "loss": 0.1394, "step": 4881 }, { "epoch": 0.759253499222395, "grad_norm": 0.8850406306783191, "learning_rate": 1.363193882808201e-06, "loss": 0.1321, "step": 4882 }, { "epoch": 0.7594090202177294, "grad_norm": 1.169996568162923, "learning_rate": 1.3615178526930112e-06, "loss": 0.1809, "step": 4883 }, { "epoch": 0.7595645412130637, "grad_norm": 1.1016633416281578, "learning_rate": 1.359842691134251e-06, "loss": 0.1521, "step": 4884 }, { "epoch": 0.7597200622083982, "grad_norm": 1.1532856856496672, "learning_rate": 1.3581683985318023e-06, "loss": 0.157, "step": 4885 }, { "epoch": 0.7598755832037325, "grad_norm": 1.1456530881252367, "learning_rate": 1.3564949752853456e-06, "loss": 0.1005, "step": 4886 }, { "epoch": 0.7600311041990668, "grad_norm": 1.375649855872582, "learning_rate": 1.3548224217943473e-06, "loss": 0.1612, "step": 4887 }, { "epoch": 0.7601866251944013, "grad_norm": 0.8777067507092644, "learning_rate": 1.353150738458071e-06, "loss": 0.1215, "step": 4888 }, { "epoch": 0.7603421461897356, "grad_norm": 1.1375076560143125, "learning_rate": 1.3514799256755717e-06, "loss": 0.1602, "step": 4889 }, { "epoch": 0.76049766718507, "grad_norm": 0.9707077594091942, "learning_rate": 1.3498099838456947e-06, "loss": 0.1594, "step": 4890 }, { "epoch": 0.7606531881804044, "grad_norm": 0.7922975037807678, "learning_rate": 1.3481409133670765e-06, "loss": 0.0978, "step": 4891 }, { "epoch": 0.7608087091757387, "grad_norm": 0.9957800760950859, "learning_rate": 1.3464727146381507e-06, "loss": 0.1202, "step": 4892 }, { "epoch": 0.7609642301710731, "grad_norm": 0.8710335903419382, "learning_rate": 1.3448053880571382e-06, "loss": 0.1793, "step": 4893 }, { "epoch": 0.7611197511664075, "grad_norm": 0.953919418246849, "learning_rate": 1.3431389340220513e-06, "loss": 0.1321, "step": 4894 }, { "epoch": 0.7612752721617418, "grad_norm": 0.9063965151727927, "learning_rate": 1.3414733529306996e-06, "loss": 0.1349, "step": 4895 }, { "epoch": 0.7614307931570762, "grad_norm": 1.272481036814765, "learning_rate": 1.339808645180677e-06, "loss": 0.1331, "step": 4896 }, { "epoch": 0.7615863141524106, "grad_norm": 1.4383916351870587, "learning_rate": 1.3381448111693735e-06, "loss": 0.1523, "step": 4897 }, { "epoch": 0.761741835147745, "grad_norm": 1.3171779595726563, "learning_rate": 1.3364818512939714e-06, "loss": 0.1691, "step": 4898 }, { "epoch": 0.7618973561430793, "grad_norm": 1.296305396799674, "learning_rate": 1.3348197659514411e-06, "loss": 0.1664, "step": 4899 }, { "epoch": 0.7620528771384136, "grad_norm": 3.5786564527171145, "learning_rate": 1.3331585555385458e-06, "loss": 0.1566, "step": 4900 }, { "epoch": 0.7620528771384136, "eval_loss": 0.16375580430030823, "eval_runtime": 9.4295, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 4900 }, { "epoch": 0.7622083981337481, "grad_norm": 0.9313520977851478, "learning_rate": 1.3314982204518368e-06, "loss": 0.0993, "step": 4901 }, { "epoch": 0.7623639191290824, "grad_norm": 1.0765216502627253, "learning_rate": 1.329838761087664e-06, "loss": 0.1401, "step": 4902 }, { "epoch": 0.7625194401244167, "grad_norm": 1.3726988283259063, "learning_rate": 1.3281801778421594e-06, "loss": 0.1216, "step": 4903 }, { "epoch": 0.7626749611197512, "grad_norm": 1.5905842565828034, "learning_rate": 1.3265224711112512e-06, "loss": 0.1275, "step": 4904 }, { "epoch": 0.7628304821150855, "grad_norm": 1.3730966177914223, "learning_rate": 1.3248656412906596e-06, "loss": 0.1355, "step": 4905 }, { "epoch": 0.7629860031104199, "grad_norm": 0.6409467527434934, "learning_rate": 1.3232096887758905e-06, "loss": 0.0696, "step": 4906 }, { "epoch": 0.7631415241057543, "grad_norm": 1.1482875070861511, "learning_rate": 1.3215546139622421e-06, "loss": 0.2172, "step": 4907 }, { "epoch": 0.7632970451010886, "grad_norm": 1.1742329817778654, "learning_rate": 1.3199004172448065e-06, "loss": 0.1534, "step": 4908 }, { "epoch": 0.763452566096423, "grad_norm": 1.2769048506186145, "learning_rate": 1.3182470990184621e-06, "loss": 0.1532, "step": 4909 }, { "epoch": 0.7636080870917574, "grad_norm": 1.1971625986277596, "learning_rate": 1.3165946596778773e-06, "loss": 0.1091, "step": 4910 }, { "epoch": 0.7637636080870918, "grad_norm": 1.111127108317417, "learning_rate": 1.314943099617516e-06, "loss": 0.1788, "step": 4911 }, { "epoch": 0.7639191290824261, "grad_norm": 1.346855052684181, "learning_rate": 1.3132924192316249e-06, "loss": 0.1805, "step": 4912 }, { "epoch": 0.7640746500777605, "grad_norm": 1.0619017064775056, "learning_rate": 1.3116426189142468e-06, "loss": 0.1342, "step": 4913 }, { "epoch": 0.7642301710730949, "grad_norm": 0.8758670515347072, "learning_rate": 1.3099936990592128e-06, "loss": 0.1523, "step": 4914 }, { "epoch": 0.7643856920684292, "grad_norm": 1.2729258299889572, "learning_rate": 1.3083456600601413e-06, "loss": 0.1725, "step": 4915 }, { "epoch": 0.7645412130637637, "grad_norm": 1.2299261356735247, "learning_rate": 1.3066985023104412e-06, "loss": 0.0979, "step": 4916 }, { "epoch": 0.764696734059098, "grad_norm": 2.666313802929025, "learning_rate": 1.3050522262033156e-06, "loss": 0.0953, "step": 4917 }, { "epoch": 0.7648522550544323, "grad_norm": 0.8778515006040425, "learning_rate": 1.3034068321317512e-06, "loss": 0.177, "step": 4918 }, { "epoch": 0.7650077760497667, "grad_norm": 1.5156098476542834, "learning_rate": 1.3017623204885249e-06, "loss": 0.1136, "step": 4919 }, { "epoch": 0.7651632970451011, "grad_norm": 4.368783964129528, "learning_rate": 1.3001186916662066e-06, "loss": 0.1394, "step": 4920 }, { "epoch": 0.7653188180404354, "grad_norm": 1.1402988316630773, "learning_rate": 1.298475946057155e-06, "loss": 0.1347, "step": 4921 }, { "epoch": 0.7654743390357698, "grad_norm": 1.1899099135450624, "learning_rate": 1.2968340840535143e-06, "loss": 0.1657, "step": 4922 }, { "epoch": 0.7656298600311042, "grad_norm": 0.8404541872935504, "learning_rate": 1.295193106047219e-06, "loss": 0.1072, "step": 4923 }, { "epoch": 0.7657853810264386, "grad_norm": 1.2762066907937772, "learning_rate": 1.2935530124299954e-06, "loss": 0.1346, "step": 4924 }, { "epoch": 0.7659409020217729, "grad_norm": 1.0973681466833667, "learning_rate": 1.291913803593357e-06, "loss": 0.1215, "step": 4925 }, { "epoch": 0.7660964230171073, "grad_norm": 0.8660219045119206, "learning_rate": 1.2902754799286027e-06, "loss": 0.2023, "step": 4926 }, { "epoch": 0.7662519440124417, "grad_norm": 0.9736074473169334, "learning_rate": 1.2886380418268268e-06, "loss": 0.1338, "step": 4927 }, { "epoch": 0.766407465007776, "grad_norm": 1.1901574055517592, "learning_rate": 1.2870014896789057e-06, "loss": 0.1586, "step": 4928 }, { "epoch": 0.7665629860031105, "grad_norm": 1.1494216595545377, "learning_rate": 1.2853658238755085e-06, "loss": 0.1291, "step": 4929 }, { "epoch": 0.7667185069984448, "grad_norm": 0.7979989598038516, "learning_rate": 1.2837310448070929e-06, "loss": 0.1123, "step": 4930 }, { "epoch": 0.7668740279937791, "grad_norm": 1.4166491996820125, "learning_rate": 1.282097152863902e-06, "loss": 0.1889, "step": 4931 }, { "epoch": 0.7670295489891136, "grad_norm": 1.1071529035463956, "learning_rate": 1.280464148435967e-06, "loss": 0.1716, "step": 4932 }, { "epoch": 0.7671850699844479, "grad_norm": 0.7147988270221063, "learning_rate": 1.2788320319131125e-06, "loss": 0.1372, "step": 4933 }, { "epoch": 0.7673405909797822, "grad_norm": 1.5928816470086447, "learning_rate": 1.2772008036849454e-06, "loss": 0.1424, "step": 4934 }, { "epoch": 0.7674961119751167, "grad_norm": 1.0314219022598945, "learning_rate": 1.275570464140861e-06, "loss": 0.1165, "step": 4935 }, { "epoch": 0.767651632970451, "grad_norm": 0.9353714461556175, "learning_rate": 1.2739410136700458e-06, "loss": 0.1088, "step": 4936 }, { "epoch": 0.7678071539657854, "grad_norm": 1.0891519423773195, "learning_rate": 1.2723124526614744e-06, "loss": 0.1473, "step": 4937 }, { "epoch": 0.7679626749611198, "grad_norm": 1.5315082223357976, "learning_rate": 1.270684781503903e-06, "loss": 0.1425, "step": 4938 }, { "epoch": 0.7681181959564541, "grad_norm": 1.2825596401885673, "learning_rate": 1.2690580005858827e-06, "loss": 0.1287, "step": 4939 }, { "epoch": 0.7682737169517885, "grad_norm": 0.8943933617635227, "learning_rate": 1.2674321102957476e-06, "loss": 0.1181, "step": 4940 }, { "epoch": 0.7684292379471228, "grad_norm": 1.0276430134711467, "learning_rate": 1.2658071110216202e-06, "loss": 0.1311, "step": 4941 }, { "epoch": 0.7685847589424573, "grad_norm": 0.9479246930806259, "learning_rate": 1.2641830031514096e-06, "loss": 0.1516, "step": 4942 }, { "epoch": 0.7687402799377916, "grad_norm": 1.0887937078822407, "learning_rate": 1.2625597870728145e-06, "loss": 0.1855, "step": 4943 }, { "epoch": 0.7688958009331259, "grad_norm": 0.8565408012702929, "learning_rate": 1.2609374631733179e-06, "loss": 0.1729, "step": 4944 }, { "epoch": 0.7690513219284604, "grad_norm": 0.9293093521859053, "learning_rate": 1.2593160318401914e-06, "loss": 0.1504, "step": 4945 }, { "epoch": 0.7692068429237947, "grad_norm": 0.9138052958801124, "learning_rate": 1.2576954934604957e-06, "loss": 0.098, "step": 4946 }, { "epoch": 0.769362363919129, "grad_norm": 0.7752139244550458, "learning_rate": 1.2560758484210738e-06, "loss": 0.1067, "step": 4947 }, { "epoch": 0.7695178849144635, "grad_norm": 1.0303552142103594, "learning_rate": 1.2544570971085563e-06, "loss": 0.0979, "step": 4948 }, { "epoch": 0.7696734059097978, "grad_norm": 1.1909358590886256, "learning_rate": 1.2528392399093642e-06, "loss": 0.1331, "step": 4949 }, { "epoch": 0.7698289269051322, "grad_norm": 1.5396336498042984, "learning_rate": 1.251222277209702e-06, "loss": 0.0993, "step": 4950 }, { "epoch": 0.7699844479004666, "grad_norm": 3.062537308583278, "learning_rate": 1.2496062093955591e-06, "loss": 0.1254, "step": 4951 }, { "epoch": 0.7701399688958009, "grad_norm": 1.34750942534762, "learning_rate": 1.2479910368527149e-06, "loss": 0.1867, "step": 4952 }, { "epoch": 0.7702954898911353, "grad_norm": 1.1956337562686283, "learning_rate": 1.2463767599667353e-06, "loss": 0.175, "step": 4953 }, { "epoch": 0.7704510108864697, "grad_norm": 0.9256778794609304, "learning_rate": 1.244763379122968e-06, "loss": 0.1015, "step": 4954 }, { "epoch": 0.7706065318818041, "grad_norm": 0.955364699535579, "learning_rate": 1.2431508947065523e-06, "loss": 0.1458, "step": 4955 }, { "epoch": 0.7707620528771384, "grad_norm": 1.0080604195872513, "learning_rate": 1.241539307102409e-06, "loss": 0.099, "step": 4956 }, { "epoch": 0.7709175738724728, "grad_norm": 1.9194918429680277, "learning_rate": 1.239928616695245e-06, "loss": 0.3251, "step": 4957 }, { "epoch": 0.7710730948678072, "grad_norm": 1.0745371262031558, "learning_rate": 1.2383188238695575e-06, "loss": 0.1642, "step": 4958 }, { "epoch": 0.7712286158631415, "grad_norm": 1.118544544550129, "learning_rate": 1.2367099290096258e-06, "loss": 0.1752, "step": 4959 }, { "epoch": 0.7713841368584758, "grad_norm": 1.5769114195357372, "learning_rate": 1.2351019324995128e-06, "loss": 0.1668, "step": 4960 }, { "epoch": 0.7715396578538103, "grad_norm": 0.7820461985163338, "learning_rate": 1.2334948347230725e-06, "loss": 0.1102, "step": 4961 }, { "epoch": 0.7716951788491446, "grad_norm": 1.071690854267506, "learning_rate": 1.231888636063942e-06, "loss": 0.2669, "step": 4962 }, { "epoch": 0.771850699844479, "grad_norm": 1.1066869945468456, "learning_rate": 1.2302833369055422e-06, "loss": 0.1303, "step": 4963 }, { "epoch": 0.7720062208398134, "grad_norm": 1.0183725115007074, "learning_rate": 1.2286789376310793e-06, "loss": 0.1599, "step": 4964 }, { "epoch": 0.7721617418351477, "grad_norm": 1.0745132672099362, "learning_rate": 1.2270754386235479e-06, "loss": 0.13, "step": 4965 }, { "epoch": 0.7723172628304821, "grad_norm": 0.8756329437340231, "learning_rate": 1.2254728402657245e-06, "loss": 0.1311, "step": 4966 }, { "epoch": 0.7724727838258165, "grad_norm": 1.0293191219896698, "learning_rate": 1.2238711429401705e-06, "loss": 0.1651, "step": 4967 }, { "epoch": 0.7726283048211509, "grad_norm": 1.0851129038480638, "learning_rate": 1.2222703470292352e-06, "loss": 0.1281, "step": 4968 }, { "epoch": 0.7727838258164852, "grad_norm": 0.9975304092552916, "learning_rate": 1.220670452915051e-06, "loss": 0.1316, "step": 4969 }, { "epoch": 0.7729393468118196, "grad_norm": 1.1640969641408954, "learning_rate": 1.2190714609795334e-06, "loss": 0.1045, "step": 4970 }, { "epoch": 0.773094867807154, "grad_norm": 1.0970711222454157, "learning_rate": 1.2174733716043858e-06, "loss": 0.1072, "step": 4971 }, { "epoch": 0.7732503888024883, "grad_norm": 1.381959601079731, "learning_rate": 1.2158761851710943e-06, "loss": 0.1783, "step": 4972 }, { "epoch": 0.7734059097978228, "grad_norm": 1.0037948172575368, "learning_rate": 1.214279902060927e-06, "loss": 0.1432, "step": 4973 }, { "epoch": 0.7735614307931571, "grad_norm": 0.9299306965778402, "learning_rate": 1.2126845226549422e-06, "loss": 0.0636, "step": 4974 }, { "epoch": 0.7737169517884914, "grad_norm": 0.943738694381375, "learning_rate": 1.2110900473339776e-06, "loss": 0.0895, "step": 4975 }, { "epoch": 0.7738724727838259, "grad_norm": 1.2630966286981273, "learning_rate": 1.209496476478656e-06, "loss": 0.1319, "step": 4976 }, { "epoch": 0.7740279937791602, "grad_norm": 0.9634443564803252, "learning_rate": 1.2079038104693852e-06, "loss": 0.1378, "step": 4977 }, { "epoch": 0.7741835147744945, "grad_norm": 1.3328562538107285, "learning_rate": 1.2063120496863595e-06, "loss": 0.1743, "step": 4978 }, { "epoch": 0.7743390357698289, "grad_norm": 1.198874944356959, "learning_rate": 1.2047211945095523e-06, "loss": 0.1844, "step": 4979 }, { "epoch": 0.7744945567651633, "grad_norm": 0.9075310284645752, "learning_rate": 1.203131245318721e-06, "loss": 0.0857, "step": 4980 }, { "epoch": 0.7746500777604977, "grad_norm": 2.10799941048558, "learning_rate": 1.2015422024934126e-06, "loss": 0.191, "step": 4981 }, { "epoch": 0.774805598755832, "grad_norm": 1.381866828226641, "learning_rate": 1.1999540664129516e-06, "loss": 0.2085, "step": 4982 }, { "epoch": 0.7749611197511664, "grad_norm": 1.2554131039330523, "learning_rate": 1.1983668374564471e-06, "loss": 0.1438, "step": 4983 }, { "epoch": 0.7751166407465008, "grad_norm": 0.8960081779310591, "learning_rate": 1.196780516002794e-06, "loss": 0.1815, "step": 4984 }, { "epoch": 0.7752721617418351, "grad_norm": 1.4171641059787177, "learning_rate": 1.1951951024306712e-06, "loss": 0.1814, "step": 4985 }, { "epoch": 0.7754276827371696, "grad_norm": 0.9245351416190208, "learning_rate": 1.1936105971185358e-06, "loss": 0.1473, "step": 4986 }, { "epoch": 0.7755832037325039, "grad_norm": 0.9472853731856695, "learning_rate": 1.1920270004446337e-06, "loss": 0.1466, "step": 4987 }, { "epoch": 0.7757387247278382, "grad_norm": 0.9575335400962642, "learning_rate": 1.1904443127869913e-06, "loss": 0.1079, "step": 4988 }, { "epoch": 0.7758942457231727, "grad_norm": 0.9642048147256979, "learning_rate": 1.1888625345234155e-06, "loss": 0.1881, "step": 4989 }, { "epoch": 0.776049766718507, "grad_norm": 1.1306038993968166, "learning_rate": 1.1872816660315029e-06, "loss": 0.1236, "step": 4990 }, { "epoch": 0.7762052877138413, "grad_norm": 1.2870701750214886, "learning_rate": 1.1857017076886269e-06, "loss": 0.1107, "step": 4991 }, { "epoch": 0.7763608087091758, "grad_norm": 1.0556653990748335, "learning_rate": 1.1841226598719435e-06, "loss": 0.1925, "step": 4992 }, { "epoch": 0.7765163297045101, "grad_norm": 1.1511339432970626, "learning_rate": 1.182544522958396e-06, "loss": 0.1547, "step": 4993 }, { "epoch": 0.7766718506998445, "grad_norm": 1.1786514905128782, "learning_rate": 1.1809672973247082e-06, "loss": 0.1425, "step": 4994 }, { "epoch": 0.7768273716951789, "grad_norm": 0.8665283899245577, "learning_rate": 1.1793909833473831e-06, "loss": 0.2221, "step": 4995 }, { "epoch": 0.7769828926905132, "grad_norm": 1.1152647747562363, "learning_rate": 1.1778155814027126e-06, "loss": 0.1291, "step": 4996 }, { "epoch": 0.7771384136858476, "grad_norm": 1.2746989945024203, "learning_rate": 1.176241091866765e-06, "loss": 0.1154, "step": 4997 }, { "epoch": 0.7772939346811819, "grad_norm": 1.0041695380303857, "learning_rate": 1.1746675151153925e-06, "loss": 0.1363, "step": 4998 }, { "epoch": 0.7774494556765164, "grad_norm": 1.1717680597179665, "learning_rate": 1.173094851524229e-06, "loss": 0.1139, "step": 4999 }, { "epoch": 0.7776049766718507, "grad_norm": 1.0745124415965217, "learning_rate": 1.171523101468693e-06, "loss": 0.1004, "step": 5000 }, { "epoch": 0.7776049766718507, "eval_loss": 0.16409926116466522, "eval_runtime": 9.4425, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 5000 }, { "epoch": 0.777760497667185, "grad_norm": 0.8533699844558775, "learning_rate": 1.169952265323983e-06, "loss": 0.1337, "step": 5001 }, { "epoch": 0.7779160186625195, "grad_norm": 0.9042909673802058, "learning_rate": 1.1683823434650788e-06, "loss": 0.0998, "step": 5002 }, { "epoch": 0.7780715396578538, "grad_norm": 0.9571476219704854, "learning_rate": 1.1668133362667439e-06, "loss": 0.1298, "step": 5003 }, { "epoch": 0.7782270606531881, "grad_norm": 1.0671268282716748, "learning_rate": 1.165245244103521e-06, "loss": 0.1881, "step": 5004 }, { "epoch": 0.7783825816485226, "grad_norm": 0.9864484682402658, "learning_rate": 1.163678067349735e-06, "loss": 0.0867, "step": 5005 }, { "epoch": 0.7785381026438569, "grad_norm": 2.197925090827152, "learning_rate": 1.162111806379495e-06, "loss": 0.1731, "step": 5006 }, { "epoch": 0.7786936236391913, "grad_norm": 0.7757278387810537, "learning_rate": 1.1605464615666877e-06, "loss": 0.1204, "step": 5007 }, { "epoch": 0.7788491446345257, "grad_norm": 1.1245592163826827, "learning_rate": 1.158982033284982e-06, "loss": 0.1314, "step": 5008 }, { "epoch": 0.77900466562986, "grad_norm": 1.1623348922888919, "learning_rate": 1.1574185219078299e-06, "loss": 0.1288, "step": 5009 }, { "epoch": 0.7791601866251944, "grad_norm": 1.0722299420732997, "learning_rate": 1.1558559278084647e-06, "loss": 0.1265, "step": 5010 }, { "epoch": 0.7793157076205288, "grad_norm": 0.9192503474696424, "learning_rate": 1.1542942513598966e-06, "loss": 0.1145, "step": 5011 }, { "epoch": 0.7794712286158632, "grad_norm": 0.5238792091056697, "learning_rate": 1.152733492934922e-06, "loss": 0.1125, "step": 5012 }, { "epoch": 0.7796267496111975, "grad_norm": 1.2664160732057734, "learning_rate": 1.1511736529061146e-06, "loss": 0.1309, "step": 5013 }, { "epoch": 0.7797822706065319, "grad_norm": 1.173915686327792, "learning_rate": 1.1496147316458289e-06, "loss": 0.1633, "step": 5014 }, { "epoch": 0.7799377916018663, "grad_norm": 0.7621349711779675, "learning_rate": 1.1480567295262035e-06, "loss": 0.1148, "step": 5015 }, { "epoch": 0.7800933125972006, "grad_norm": 1.6907185740707709, "learning_rate": 1.146499646919153e-06, "loss": 0.0876, "step": 5016 }, { "epoch": 0.7802488335925349, "grad_norm": 1.1165175754870342, "learning_rate": 1.1449434841963763e-06, "loss": 0.1438, "step": 5017 }, { "epoch": 0.7804043545878694, "grad_norm": 2.4799241173962243, "learning_rate": 1.1433882417293502e-06, "loss": 0.1551, "step": 5018 }, { "epoch": 0.7805598755832037, "grad_norm": 0.9060677085255289, "learning_rate": 1.1418339198893335e-06, "loss": 0.161, "step": 5019 }, { "epoch": 0.7807153965785381, "grad_norm": 1.2559248172790907, "learning_rate": 1.1402805190473649e-06, "loss": 0.1959, "step": 5020 }, { "epoch": 0.7808709175738725, "grad_norm": 0.9478098437473376, "learning_rate": 1.1387280395742601e-06, "loss": 0.1497, "step": 5021 }, { "epoch": 0.7810264385692068, "grad_norm": 0.8380955929126747, "learning_rate": 1.1371764818406211e-06, "loss": 0.1024, "step": 5022 }, { "epoch": 0.7811819595645412, "grad_norm": 1.129663274911471, "learning_rate": 1.1356258462168251e-06, "loss": 0.1485, "step": 5023 }, { "epoch": 0.7813374805598756, "grad_norm": 0.8963917502817385, "learning_rate": 1.1340761330730289e-06, "loss": 0.1297, "step": 5024 }, { "epoch": 0.78149300155521, "grad_norm": 1.4369042933597402, "learning_rate": 1.1325273427791717e-06, "loss": 0.128, "step": 5025 }, { "epoch": 0.7816485225505443, "grad_norm": 0.9773950244273666, "learning_rate": 1.130979475704973e-06, "loss": 0.0957, "step": 5026 }, { "epoch": 0.7818040435458787, "grad_norm": 1.16261783855377, "learning_rate": 1.1294325322199272e-06, "loss": 0.1734, "step": 5027 }, { "epoch": 0.7819595645412131, "grad_norm": 1.2657917121092612, "learning_rate": 1.1278865126933147e-06, "loss": 0.1524, "step": 5028 }, { "epoch": 0.7821150855365474, "grad_norm": 1.0476970594780515, "learning_rate": 1.1263414174941894e-06, "loss": 0.1387, "step": 5029 }, { "epoch": 0.7822706065318819, "grad_norm": 1.0713328129803532, "learning_rate": 1.124797246991387e-06, "loss": 0.0861, "step": 5030 }, { "epoch": 0.7824261275272162, "grad_norm": 0.9558766678566981, "learning_rate": 1.1232540015535248e-06, "loss": 0.2114, "step": 5031 }, { "epoch": 0.7825816485225505, "grad_norm": 1.0398136405648164, "learning_rate": 1.1217116815489941e-06, "loss": 0.1059, "step": 5032 }, { "epoch": 0.782737169517885, "grad_norm": 1.3120387538305966, "learning_rate": 1.1201702873459697e-06, "loss": 0.1342, "step": 5033 }, { "epoch": 0.7828926905132193, "grad_norm": 0.8452771459703474, "learning_rate": 1.1186298193124046e-06, "loss": 0.1365, "step": 5034 }, { "epoch": 0.7830482115085536, "grad_norm": 1.2129685628384415, "learning_rate": 1.1170902778160297e-06, "loss": 0.1279, "step": 5035 }, { "epoch": 0.783203732503888, "grad_norm": 1.0820217006098714, "learning_rate": 1.1155516632243545e-06, "loss": 0.1459, "step": 5036 }, { "epoch": 0.7833592534992224, "grad_norm": 1.7095620600683674, "learning_rate": 1.1140139759046664e-06, "loss": 0.1025, "step": 5037 }, { "epoch": 0.7835147744945568, "grad_norm": 0.9207253167917396, "learning_rate": 1.1124772162240354e-06, "loss": 0.1393, "step": 5038 }, { "epoch": 0.7836702954898911, "grad_norm": 0.8585109129272239, "learning_rate": 1.1109413845493066e-06, "loss": 0.1521, "step": 5039 }, { "epoch": 0.7838258164852255, "grad_norm": 1.0901097283736958, "learning_rate": 1.1094064812471028e-06, "loss": 0.181, "step": 5040 }, { "epoch": 0.7839813374805599, "grad_norm": 0.9090735125682089, "learning_rate": 1.1078725066838281e-06, "loss": 0.1056, "step": 5041 }, { "epoch": 0.7841368584758942, "grad_norm": 1.0901485407057245, "learning_rate": 1.106339461225665e-06, "loss": 0.0913, "step": 5042 }, { "epoch": 0.7842923794712287, "grad_norm": 1.0337463115870933, "learning_rate": 1.1048073452385699e-06, "loss": 0.1504, "step": 5043 }, { "epoch": 0.784447900466563, "grad_norm": 1.357723266017358, "learning_rate": 1.1032761590882834e-06, "loss": 0.2524, "step": 5044 }, { "epoch": 0.7846034214618973, "grad_norm": 1.0121938287952548, "learning_rate": 1.1017459031403194e-06, "loss": 0.1601, "step": 5045 }, { "epoch": 0.7847589424572318, "grad_norm": 1.1119641680688381, "learning_rate": 1.1002165777599704e-06, "loss": 0.1982, "step": 5046 }, { "epoch": 0.7849144634525661, "grad_norm": 0.9288643348489117, "learning_rate": 1.0986881833123097e-06, "loss": 0.1078, "step": 5047 }, { "epoch": 0.7850699844479004, "grad_norm": 1.194090118910171, "learning_rate": 1.0971607201621836e-06, "loss": 0.1995, "step": 5048 }, { "epoch": 0.7852255054432349, "grad_norm": 0.7853406308273965, "learning_rate": 1.0956341886742211e-06, "loss": 0.1241, "step": 5049 }, { "epoch": 0.7853810264385692, "grad_norm": 1.1018735780462365, "learning_rate": 1.0941085892128272e-06, "loss": 0.1493, "step": 5050 }, { "epoch": 0.7855365474339036, "grad_norm": 0.7573945273395968, "learning_rate": 1.0925839221421824e-06, "loss": 0.1352, "step": 5051 }, { "epoch": 0.785692068429238, "grad_norm": 1.458300328682292, "learning_rate": 1.0910601878262456e-06, "loss": 0.1659, "step": 5052 }, { "epoch": 0.7858475894245723, "grad_norm": 0.7708133427498459, "learning_rate": 1.0895373866287545e-06, "loss": 0.0961, "step": 5053 }, { "epoch": 0.7860031104199067, "grad_norm": 0.7330716977582163, "learning_rate": 1.088015518913223e-06, "loss": 0.0663, "step": 5054 }, { "epoch": 0.786158631415241, "grad_norm": 1.1097480206682726, "learning_rate": 1.0864945850429414e-06, "loss": 0.1499, "step": 5055 }, { "epoch": 0.7863141524105755, "grad_norm": 0.9998020016463693, "learning_rate": 1.0849745853809772e-06, "loss": 0.1532, "step": 5056 }, { "epoch": 0.7864696734059098, "grad_norm": 0.9099127492110944, "learning_rate": 1.0834555202901758e-06, "loss": 0.1401, "step": 5057 }, { "epoch": 0.7866251944012441, "grad_norm": 0.9893454165102558, "learning_rate": 1.0819373901331615e-06, "loss": 0.0878, "step": 5058 }, { "epoch": 0.7867807153965786, "grad_norm": 1.0990155559925987, "learning_rate": 1.0804201952723304e-06, "loss": 0.1224, "step": 5059 }, { "epoch": 0.7869362363919129, "grad_norm": 1.053880657315413, "learning_rate": 1.07890393606986e-06, "loss": 0.1148, "step": 5060 }, { "epoch": 0.7870917573872472, "grad_norm": 1.1839469662361468, "learning_rate": 1.077388612887702e-06, "loss": 0.0949, "step": 5061 }, { "epoch": 0.7872472783825817, "grad_norm": 0.9810912051497737, "learning_rate": 1.0758742260875832e-06, "loss": 0.1312, "step": 5062 }, { "epoch": 0.787402799377916, "grad_norm": 0.8567351968682857, "learning_rate": 1.074360776031012e-06, "loss": 0.0915, "step": 5063 }, { "epoch": 0.7875583203732504, "grad_norm": 1.3394154587346774, "learning_rate": 1.072848263079267e-06, "loss": 0.1426, "step": 5064 }, { "epoch": 0.7877138413685848, "grad_norm": 1.1016094974941877, "learning_rate": 1.0713366875934078e-06, "loss": 0.1144, "step": 5065 }, { "epoch": 0.7878693623639191, "grad_norm": 1.6653141937909557, "learning_rate": 1.0698260499342694e-06, "loss": 0.1748, "step": 5066 }, { "epoch": 0.7880248833592535, "grad_norm": 1.6127188414495073, "learning_rate": 1.0683163504624616e-06, "loss": 0.0978, "step": 5067 }, { "epoch": 0.7881804043545879, "grad_norm": 1.038432685039484, "learning_rate": 1.0668075895383684e-06, "loss": 0.1173, "step": 5068 }, { "epoch": 0.7883359253499223, "grad_norm": 1.0577584932344914, "learning_rate": 1.0652997675221555e-06, "loss": 0.1106, "step": 5069 }, { "epoch": 0.7884914463452566, "grad_norm": 1.106809728880819, "learning_rate": 1.0637928847737594e-06, "loss": 0.2414, "step": 5070 }, { "epoch": 0.788646967340591, "grad_norm": 1.3374672465699144, "learning_rate": 1.0622869416528925e-06, "loss": 0.1944, "step": 5071 }, { "epoch": 0.7888024883359254, "grad_norm": 0.9189106903562557, "learning_rate": 1.0607819385190476e-06, "loss": 0.1128, "step": 5072 }, { "epoch": 0.7889580093312597, "grad_norm": 1.1242672545271248, "learning_rate": 1.0592778757314865e-06, "loss": 0.176, "step": 5073 }, { "epoch": 0.789113530326594, "grad_norm": 1.0969032632215536, "learning_rate": 1.0577747536492534e-06, "loss": 0.1242, "step": 5074 }, { "epoch": 0.7892690513219285, "grad_norm": 0.8651567542106003, "learning_rate": 1.0562725726311613e-06, "loss": 0.132, "step": 5075 }, { "epoch": 0.7894245723172628, "grad_norm": 1.24018470415228, "learning_rate": 1.054771333035805e-06, "loss": 0.1285, "step": 5076 }, { "epoch": 0.7895800933125972, "grad_norm": 1.1304032613008461, "learning_rate": 1.053271035221549e-06, "loss": 0.1693, "step": 5077 }, { "epoch": 0.7897356143079316, "grad_norm": 0.921698464748386, "learning_rate": 1.0517716795465354e-06, "loss": 0.1237, "step": 5078 }, { "epoch": 0.7898911353032659, "grad_norm": 1.6549996653227173, "learning_rate": 1.0502732663686827e-06, "loss": 0.327, "step": 5079 }, { "epoch": 0.7900466562986003, "grad_norm": 1.1786575123794896, "learning_rate": 1.0487757960456812e-06, "loss": 0.1201, "step": 5080 }, { "epoch": 0.7902021772939347, "grad_norm": 1.1626985770986011, "learning_rate": 1.0472792689349987e-06, "loss": 0.1989, "step": 5081 }, { "epoch": 0.790357698289269, "grad_norm": 0.7966161234699778, "learning_rate": 1.0457836853938786e-06, "loss": 0.1475, "step": 5082 }, { "epoch": 0.7905132192846034, "grad_norm": 1.0705003566916258, "learning_rate": 1.0442890457793365e-06, "loss": 0.1248, "step": 5083 }, { "epoch": 0.7906687402799378, "grad_norm": 1.2640202775888951, "learning_rate": 1.042795350448162e-06, "loss": 0.2783, "step": 5084 }, { "epoch": 0.7908242612752722, "grad_norm": 0.8783801692804495, "learning_rate": 1.0413025997569238e-06, "loss": 0.1499, "step": 5085 }, { "epoch": 0.7909797822706065, "grad_norm": 0.9066373875969631, "learning_rate": 1.0398107940619606e-06, "loss": 0.1454, "step": 5086 }, { "epoch": 0.791135303265941, "grad_norm": 1.1108501558663046, "learning_rate": 1.0383199337193873e-06, "loss": 0.1733, "step": 5087 }, { "epoch": 0.7912908242612753, "grad_norm": 0.7065399989878971, "learning_rate": 1.036830019085094e-06, "loss": 0.0661, "step": 5088 }, { "epoch": 0.7914463452566096, "grad_norm": 1.1162128071949498, "learning_rate": 1.035341050514742e-06, "loss": 0.1621, "step": 5089 }, { "epoch": 0.7916018662519441, "grad_norm": 0.859218489577035, "learning_rate": 1.0338530283637704e-06, "loss": 0.122, "step": 5090 }, { "epoch": 0.7917573872472784, "grad_norm": 0.8856864544520967, "learning_rate": 1.0323659529873925e-06, "loss": 0.1314, "step": 5091 }, { "epoch": 0.7919129082426127, "grad_norm": 1.3577430397804249, "learning_rate": 1.030879824740592e-06, "loss": 0.1218, "step": 5092 }, { "epoch": 0.7920684292379471, "grad_norm": 0.9649175722305443, "learning_rate": 1.0293946439781289e-06, "loss": 0.1101, "step": 5093 }, { "epoch": 0.7922239502332815, "grad_norm": 1.3130614567645313, "learning_rate": 1.0279104110545346e-06, "loss": 0.1172, "step": 5094 }, { "epoch": 0.7923794712286159, "grad_norm": 1.1591615265536053, "learning_rate": 1.0264271263241192e-06, "loss": 0.1795, "step": 5095 }, { "epoch": 0.7925349922239502, "grad_norm": 1.0086870634473704, "learning_rate": 1.0249447901409609e-06, "loss": 0.134, "step": 5096 }, { "epoch": 0.7926905132192846, "grad_norm": 1.2698875240054683, "learning_rate": 1.0234634028589158e-06, "loss": 0.1508, "step": 5097 }, { "epoch": 0.792846034214619, "grad_norm": 0.8166225953997334, "learning_rate": 1.021982964831612e-06, "loss": 0.177, "step": 5098 }, { "epoch": 0.7930015552099533, "grad_norm": 0.9578457367080772, "learning_rate": 1.02050347641245e-06, "loss": 0.1104, "step": 5099 }, { "epoch": 0.7931570762052877, "grad_norm": 0.7310283769638923, "learning_rate": 1.0190249379546024e-06, "loss": 0.1242, "step": 5100 }, { "epoch": 0.7931570762052877, "eval_loss": 0.1631857007741928, "eval_runtime": 9.4457, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 5100 }, { "epoch": 0.7933125972006221, "grad_norm": 1.1498037518767232, "learning_rate": 1.0175473498110206e-06, "loss": 0.1714, "step": 5101 }, { "epoch": 0.7934681181959564, "grad_norm": 0.9635782108066961, "learning_rate": 1.0160707123344238e-06, "loss": 0.1044, "step": 5102 }, { "epoch": 0.7936236391912909, "grad_norm": 0.8639808302591088, "learning_rate": 1.014595025877304e-06, "loss": 0.094, "step": 5103 }, { "epoch": 0.7937791601866252, "grad_norm": 0.8273667391367782, "learning_rate": 1.0131202907919318e-06, "loss": 0.1457, "step": 5104 }, { "epoch": 0.7939346811819595, "grad_norm": 1.2257412395914327, "learning_rate": 1.0116465074303434e-06, "loss": 0.2207, "step": 5105 }, { "epoch": 0.794090202177294, "grad_norm": 0.8543425197549029, "learning_rate": 1.0101736761443531e-06, "loss": 0.0999, "step": 5106 }, { "epoch": 0.7942457231726283, "grad_norm": 0.8768229898991009, "learning_rate": 1.0087017972855478e-06, "loss": 0.1307, "step": 5107 }, { "epoch": 0.7944012441679627, "grad_norm": 1.0242882625562697, "learning_rate": 1.0072308712052847e-06, "loss": 0.1478, "step": 5108 }, { "epoch": 0.7945567651632971, "grad_norm": 0.7078771494368054, "learning_rate": 1.0057608982546918e-06, "loss": 0.1169, "step": 5109 }, { "epoch": 0.7947122861586314, "grad_norm": 0.8217127734646672, "learning_rate": 1.0042918787846757e-06, "loss": 0.1294, "step": 5110 }, { "epoch": 0.7948678071539658, "grad_norm": 0.786053707023061, "learning_rate": 1.00282381314591e-06, "loss": 0.0868, "step": 5111 }, { "epoch": 0.7950233281493001, "grad_norm": 1.1287667274689805, "learning_rate": 1.0013567016888415e-06, "loss": 0.1154, "step": 5112 }, { "epoch": 0.7951788491446345, "grad_norm": 0.9459810006748774, "learning_rate": 9.998905447636908e-07, "loss": 0.1175, "step": 5113 }, { "epoch": 0.7953343701399689, "grad_norm": 0.9158618079905247, "learning_rate": 9.984253427204526e-07, "loss": 0.135, "step": 5114 }, { "epoch": 0.7954898911353032, "grad_norm": 1.0663691428308197, "learning_rate": 9.969610959088887e-07, "loss": 0.1331, "step": 5115 }, { "epoch": 0.7956454121306377, "grad_norm": 0.7810972258451782, "learning_rate": 9.95497804678534e-07, "loss": 0.0587, "step": 5116 }, { "epoch": 0.795800933125972, "grad_norm": 0.7604546010651961, "learning_rate": 9.940354693786997e-07, "loss": 0.1117, "step": 5117 }, { "epoch": 0.7959564541213063, "grad_norm": 1.019634683563963, "learning_rate": 9.92574090358464e-07, "loss": 0.1469, "step": 5118 }, { "epoch": 0.7961119751166408, "grad_norm": 1.775661249707838, "learning_rate": 9.911136679666773e-07, "loss": 0.1829, "step": 5119 }, { "epoch": 0.7962674961119751, "grad_norm": 1.6608353605019899, "learning_rate": 9.896542025519645e-07, "loss": 0.1307, "step": 5120 }, { "epoch": 0.7964230171073094, "grad_norm": 0.9958676136673916, "learning_rate": 9.88195694462719e-07, "loss": 0.1855, "step": 5121 }, { "epoch": 0.7965785381026439, "grad_norm": 0.9344177089881905, "learning_rate": 9.86738144047108e-07, "loss": 0.0895, "step": 5122 }, { "epoch": 0.7967340590979782, "grad_norm": 1.0244490872763514, "learning_rate": 9.852815516530694e-07, "loss": 0.1164, "step": 5123 }, { "epoch": 0.7968895800933126, "grad_norm": 1.2100343271912266, "learning_rate": 9.838259176283126e-07, "loss": 0.1689, "step": 5124 }, { "epoch": 0.797045101088647, "grad_norm": 1.0036675494605025, "learning_rate": 9.823712423203146e-07, "loss": 0.205, "step": 5125 }, { "epoch": 0.7972006220839813, "grad_norm": 0.9797365471582973, "learning_rate": 9.809175260763308e-07, "loss": 0.235, "step": 5126 }, { "epoch": 0.7973561430793157, "grad_norm": 2.4816702501020504, "learning_rate": 9.794647692433813e-07, "loss": 0.115, "step": 5127 }, { "epoch": 0.7975116640746501, "grad_norm": 1.1917352769641987, "learning_rate": 9.780129721682585e-07, "loss": 0.2218, "step": 5128 }, { "epoch": 0.7976671850699845, "grad_norm": 0.7506631012142823, "learning_rate": 9.76562135197528e-07, "loss": 0.1161, "step": 5129 }, { "epoch": 0.7978227060653188, "grad_norm": 1.029894514406552, "learning_rate": 9.751122586775253e-07, "loss": 0.1618, "step": 5130 }, { "epoch": 0.7979782270606531, "grad_norm": 0.8083269507255393, "learning_rate": 9.736633429543564e-07, "loss": 0.125, "step": 5131 }, { "epoch": 0.7981337480559876, "grad_norm": 1.274063004155709, "learning_rate": 9.722153883738944e-07, "loss": 0.2581, "step": 5132 }, { "epoch": 0.7982892690513219, "grad_norm": 1.1773801940157476, "learning_rate": 9.707683952817904e-07, "loss": 0.1146, "step": 5133 }, { "epoch": 0.7984447900466562, "grad_norm": 1.198856094970494, "learning_rate": 9.6932236402346e-07, "loss": 0.1018, "step": 5134 }, { "epoch": 0.7986003110419907, "grad_norm": 1.002127385024415, "learning_rate": 9.6787729494409e-07, "loss": 0.105, "step": 5135 }, { "epoch": 0.798755832037325, "grad_norm": 1.2040404580578263, "learning_rate": 9.664331883886408e-07, "loss": 0.1425, "step": 5136 }, { "epoch": 0.7989113530326594, "grad_norm": 1.3197854081918656, "learning_rate": 9.649900447018383e-07, "loss": 0.1694, "step": 5137 }, { "epoch": 0.7990668740279938, "grad_norm": 1.5000090920907971, "learning_rate": 9.63547864228182e-07, "loss": 0.1736, "step": 5138 }, { "epoch": 0.7992223950233281, "grad_norm": 0.8819594827582473, "learning_rate": 9.62106647311943e-07, "loss": 0.169, "step": 5139 }, { "epoch": 0.7993779160186625, "grad_norm": 1.056992271644876, "learning_rate": 9.606663942971568e-07, "loss": 0.2334, "step": 5140 }, { "epoch": 0.7995334370139969, "grad_norm": 0.939760177713112, "learning_rate": 9.592271055276315e-07, "loss": 0.0983, "step": 5141 }, { "epoch": 0.7996889580093313, "grad_norm": 1.3608072093838364, "learning_rate": 9.577887813469483e-07, "loss": 0.162, "step": 5142 }, { "epoch": 0.7998444790046656, "grad_norm": 0.9904732436591782, "learning_rate": 9.563514220984532e-07, "loss": 0.1664, "step": 5143 }, { "epoch": 0.8, "grad_norm": 1.5157294435974666, "learning_rate": 9.549150281252633e-07, "loss": 0.1138, "step": 5144 }, { "epoch": 0.8001555209953344, "grad_norm": 0.7144460777329649, "learning_rate": 9.534795997702667e-07, "loss": 0.1247, "step": 5145 }, { "epoch": 0.8003110419906687, "grad_norm": 0.8880934387855349, "learning_rate": 9.520451373761219e-07, "loss": 0.1007, "step": 5146 }, { "epoch": 0.8004665629860032, "grad_norm": 0.925119636551832, "learning_rate": 9.506116412852517e-07, "loss": 0.1097, "step": 5147 }, { "epoch": 0.8006220839813375, "grad_norm": 0.8144948962841034, "learning_rate": 9.491791118398552e-07, "loss": 0.0995, "step": 5148 }, { "epoch": 0.8007776049766718, "grad_norm": 1.217495353854283, "learning_rate": 9.477475493818949e-07, "loss": 0.1615, "step": 5149 }, { "epoch": 0.8009331259720062, "grad_norm": 0.7357797524831408, "learning_rate": 9.463169542531059e-07, "loss": 0.1621, "step": 5150 }, { "epoch": 0.8010886469673406, "grad_norm": 1.557068576800722, "learning_rate": 9.448873267949887e-07, "loss": 0.1088, "step": 5151 }, { "epoch": 0.801244167962675, "grad_norm": 1.1518805927249798, "learning_rate": 9.434586673488183e-07, "loss": 0.1918, "step": 5152 }, { "epoch": 0.8013996889580093, "grad_norm": 1.131164547680081, "learning_rate": 9.420309762556335e-07, "loss": 0.139, "step": 5153 }, { "epoch": 0.8015552099533437, "grad_norm": 1.5105542368771059, "learning_rate": 9.406042538562449e-07, "loss": 0.1929, "step": 5154 }, { "epoch": 0.8017107309486781, "grad_norm": 1.0465013603638373, "learning_rate": 9.391785004912324e-07, "loss": 0.1253, "step": 5155 }, { "epoch": 0.8018662519440124, "grad_norm": 1.498109300438602, "learning_rate": 9.37753716500942e-07, "loss": 0.1707, "step": 5156 }, { "epoch": 0.8020217729393468, "grad_norm": 1.2384187693531923, "learning_rate": 9.363299022254885e-07, "loss": 0.1137, "step": 5157 }, { "epoch": 0.8021772939346812, "grad_norm": 1.730699143535896, "learning_rate": 9.349070580047581e-07, "loss": 0.1978, "step": 5158 }, { "epoch": 0.8023328149300155, "grad_norm": 1.0987241208448566, "learning_rate": 9.334851841784026e-07, "loss": 0.1761, "step": 5159 }, { "epoch": 0.80248833592535, "grad_norm": 1.403262042023598, "learning_rate": 9.320642810858421e-07, "loss": 0.0966, "step": 5160 }, { "epoch": 0.8026438569206843, "grad_norm": 1.192436375292074, "learning_rate": 9.30644349066267e-07, "loss": 0.2344, "step": 5161 }, { "epoch": 0.8027993779160186, "grad_norm": 0.8946041287897295, "learning_rate": 9.292253884586361e-07, "loss": 0.1463, "step": 5162 }, { "epoch": 0.8029548989113531, "grad_norm": 1.3399919389812165, "learning_rate": 9.278073996016729e-07, "loss": 0.1704, "step": 5163 }, { "epoch": 0.8031104199066874, "grad_norm": 1.1185706525651515, "learning_rate": 9.263903828338727e-07, "loss": 0.1877, "step": 5164 }, { "epoch": 0.8032659409020217, "grad_norm": 0.863121634973482, "learning_rate": 9.249743384934973e-07, "loss": 0.1266, "step": 5165 }, { "epoch": 0.8034214618973562, "grad_norm": 1.242525647595965, "learning_rate": 9.235592669185739e-07, "loss": 0.1232, "step": 5166 }, { "epoch": 0.8035769828926905, "grad_norm": 1.2442917312311024, "learning_rate": 9.221451684469029e-07, "loss": 0.1123, "step": 5167 }, { "epoch": 0.8037325038880249, "grad_norm": 1.2474635192966739, "learning_rate": 9.207320434160477e-07, "loss": 0.2611, "step": 5168 }, { "epoch": 0.8038880248833593, "grad_norm": 0.6598786116744019, "learning_rate": 9.1931989216334e-07, "loss": 0.0994, "step": 5169 }, { "epoch": 0.8040435458786936, "grad_norm": 0.6700468178672696, "learning_rate": 9.179087150258814e-07, "loss": 0.1293, "step": 5170 }, { "epoch": 0.804199066874028, "grad_norm": 1.306779369065659, "learning_rate": 9.164985123405401e-07, "loss": 0.1414, "step": 5171 }, { "epoch": 0.8043545878693623, "grad_norm": 1.140405785337438, "learning_rate": 9.150892844439502e-07, "loss": 0.1193, "step": 5172 }, { "epoch": 0.8045101088646968, "grad_norm": 1.085341833985666, "learning_rate": 9.136810316725131e-07, "loss": 0.1328, "step": 5173 }, { "epoch": 0.8046656298600311, "grad_norm": 1.260713461252275, "learning_rate": 9.12273754362401e-07, "loss": 0.1324, "step": 5174 }, { "epoch": 0.8048211508553654, "grad_norm": 0.8851689410544645, "learning_rate": 9.108674528495487e-07, "loss": 0.169, "step": 5175 }, { "epoch": 0.8049766718506999, "grad_norm": 1.848927864861967, "learning_rate": 9.094621274696591e-07, "loss": 0.1763, "step": 5176 }, { "epoch": 0.8051321928460342, "grad_norm": 0.9628727785517058, "learning_rate": 9.080577785582045e-07, "loss": 0.1643, "step": 5177 }, { "epoch": 0.8052877138413685, "grad_norm": 1.1718360883475751, "learning_rate": 9.066544064504229e-07, "loss": 0.1042, "step": 5178 }, { "epoch": 0.805443234836703, "grad_norm": 0.7216760669801315, "learning_rate": 9.052520114813174e-07, "loss": 0.1473, "step": 5179 }, { "epoch": 0.8055987558320373, "grad_norm": 0.8969247136436705, "learning_rate": 9.038505939856612e-07, "loss": 0.1685, "step": 5180 }, { "epoch": 0.8057542768273717, "grad_norm": 0.740094649578267, "learning_rate": 9.024501542979913e-07, "loss": 0.1525, "step": 5181 }, { "epoch": 0.8059097978227061, "grad_norm": 0.8050870703003438, "learning_rate": 9.010506927526103e-07, "loss": 0.119, "step": 5182 }, { "epoch": 0.8060653188180404, "grad_norm": 1.2072877226378849, "learning_rate": 8.996522096835924e-07, "loss": 0.1599, "step": 5183 }, { "epoch": 0.8062208398133748, "grad_norm": 0.9294117992899331, "learning_rate": 8.982547054247731e-07, "loss": 0.0723, "step": 5184 }, { "epoch": 0.8063763608087092, "grad_norm": 1.0044735152890412, "learning_rate": 8.968581803097548e-07, "loss": 0.1652, "step": 5185 }, { "epoch": 0.8065318818040436, "grad_norm": 1.0223862733993037, "learning_rate": 8.954626346719098e-07, "loss": 0.1143, "step": 5186 }, { "epoch": 0.8066874027993779, "grad_norm": 1.2738252482838306, "learning_rate": 8.940680688443748e-07, "loss": 0.1055, "step": 5187 }, { "epoch": 0.8068429237947123, "grad_norm": 1.213634046708657, "learning_rate": 8.926744831600498e-07, "loss": 0.1801, "step": 5188 }, { "epoch": 0.8069984447900467, "grad_norm": 0.7360553916239143, "learning_rate": 8.912818779516053e-07, "loss": 0.0722, "step": 5189 }, { "epoch": 0.807153965785381, "grad_norm": 1.0306812167912693, "learning_rate": 8.898902535514747e-07, "loss": 0.12, "step": 5190 }, { "epoch": 0.8073094867807153, "grad_norm": 0.8013649689959341, "learning_rate": 8.88499610291858e-07, "loss": 0.2201, "step": 5191 }, { "epoch": 0.8074650077760498, "grad_norm": 0.9365877102334953, "learning_rate": 8.871099485047202e-07, "loss": 0.1883, "step": 5192 }, { "epoch": 0.8076205287713841, "grad_norm": 1.008094526684965, "learning_rate": 8.857212685217948e-07, "loss": 0.1222, "step": 5193 }, { "epoch": 0.8077760497667185, "grad_norm": 1.0930621267844127, "learning_rate": 8.84333570674577e-07, "loss": 0.1165, "step": 5194 }, { "epoch": 0.8079315707620529, "grad_norm": 1.7798165698255384, "learning_rate": 8.82946855294331e-07, "loss": 0.1535, "step": 5195 }, { "epoch": 0.8080870917573872, "grad_norm": 0.8820927020269812, "learning_rate": 8.815611227120863e-07, "loss": 0.0982, "step": 5196 }, { "epoch": 0.8082426127527216, "grad_norm": 8.119750110309722, "learning_rate": 8.801763732586355e-07, "loss": 0.1448, "step": 5197 }, { "epoch": 0.808398133748056, "grad_norm": 0.9134492340763061, "learning_rate": 8.787926072645358e-07, "loss": 0.1353, "step": 5198 }, { "epoch": 0.8085536547433904, "grad_norm": 1.4803735768817767, "learning_rate": 8.774098250601143e-07, "loss": 0.1401, "step": 5199 }, { "epoch": 0.8087091757387247, "grad_norm": 1.1960974776927698, "learning_rate": 8.76028026975459e-07, "loss": 0.1069, "step": 5200 }, { "epoch": 0.8087091757387247, "eval_loss": 0.1622847467660904, "eval_runtime": 9.4377, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.742, "step": 5200 }, { "epoch": 0.8088646967340591, "grad_norm": 1.4381068402660147, "learning_rate": 8.746472133404232e-07, "loss": 0.1183, "step": 5201 }, { "epoch": 0.8090202177293935, "grad_norm": 1.4485977420398677, "learning_rate": 8.732673844846274e-07, "loss": 0.1859, "step": 5202 }, { "epoch": 0.8091757387247278, "grad_norm": 1.3683585937666445, "learning_rate": 8.718885407374578e-07, "loss": 0.174, "step": 5203 }, { "epoch": 0.8093312597200623, "grad_norm": 1.1050131285188118, "learning_rate": 8.705106824280607e-07, "loss": 0.1351, "step": 5204 }, { "epoch": 0.8094867807153966, "grad_norm": 0.9734148907534854, "learning_rate": 8.69133809885353e-07, "loss": 0.1485, "step": 5205 }, { "epoch": 0.8096423017107309, "grad_norm": 0.8556911237124321, "learning_rate": 8.677579234380112e-07, "loss": 0.1108, "step": 5206 }, { "epoch": 0.8097978227060654, "grad_norm": 0.914000294870213, "learning_rate": 8.663830234144782e-07, "loss": 0.1624, "step": 5207 }, { "epoch": 0.8099533437013997, "grad_norm": 0.917433270988088, "learning_rate": 8.650091101429642e-07, "loss": 0.134, "step": 5208 }, { "epoch": 0.810108864696734, "grad_norm": 0.7912643516194876, "learning_rate": 8.636361839514401e-07, "loss": 0.1135, "step": 5209 }, { "epoch": 0.8102643856920684, "grad_norm": 1.2424991043874263, "learning_rate": 8.62264245167641e-07, "loss": 0.1558, "step": 5210 }, { "epoch": 0.8104199066874028, "grad_norm": 1.0755355405727958, "learning_rate": 8.608932941190696e-07, "loss": 0.1052, "step": 5211 }, { "epoch": 0.8105754276827372, "grad_norm": 1.3677936175095249, "learning_rate": 8.59523331132992e-07, "loss": 0.2115, "step": 5212 }, { "epoch": 0.8107309486780715, "grad_norm": 1.080064384174892, "learning_rate": 8.581543565364358e-07, "loss": 0.1352, "step": 5213 }, { "epoch": 0.8108864696734059, "grad_norm": 1.6619750483603934, "learning_rate": 8.567863706561941e-07, "loss": 0.2166, "step": 5214 }, { "epoch": 0.8110419906687403, "grad_norm": 0.7028279763839975, "learning_rate": 8.55419373818826e-07, "loss": 0.0943, "step": 5215 }, { "epoch": 0.8111975116640746, "grad_norm": 0.8436877359771799, "learning_rate": 8.540533663506511e-07, "loss": 0.0989, "step": 5216 }, { "epoch": 0.8113530326594091, "grad_norm": 0.901258048317428, "learning_rate": 8.526883485777543e-07, "loss": 0.1218, "step": 5217 }, { "epoch": 0.8115085536547434, "grad_norm": 1.139302297274668, "learning_rate": 8.513243208259841e-07, "loss": 0.1241, "step": 5218 }, { "epoch": 0.8116640746500777, "grad_norm": 1.5625240649802756, "learning_rate": 8.499612834209559e-07, "loss": 0.1944, "step": 5219 }, { "epoch": 0.8118195956454122, "grad_norm": 1.2909934865804964, "learning_rate": 8.485992366880419e-07, "loss": 0.1365, "step": 5220 }, { "epoch": 0.8119751166407465, "grad_norm": 1.0386899435083528, "learning_rate": 8.472381809523849e-07, "loss": 0.1006, "step": 5221 }, { "epoch": 0.8121306376360808, "grad_norm": 1.597035379799794, "learning_rate": 8.458781165388863e-07, "loss": 0.1944, "step": 5222 }, { "epoch": 0.8122861586314153, "grad_norm": 0.8429340788358408, "learning_rate": 8.445190437722112e-07, "loss": 0.091, "step": 5223 }, { "epoch": 0.8124416796267496, "grad_norm": 1.4291908812216705, "learning_rate": 8.431609629767917e-07, "loss": 0.1636, "step": 5224 }, { "epoch": 0.812597200622084, "grad_norm": 1.4591309383634294, "learning_rate": 8.418038744768197e-07, "loss": 0.1148, "step": 5225 }, { "epoch": 0.8127527216174184, "grad_norm": 1.0153710208111126, "learning_rate": 8.4044777859625e-07, "loss": 0.1412, "step": 5226 }, { "epoch": 0.8129082426127527, "grad_norm": 0.8032844987396476, "learning_rate": 8.390926756588019e-07, "loss": 0.1105, "step": 5227 }, { "epoch": 0.8130637636080871, "grad_norm": 2.2416814455247414, "learning_rate": 8.3773856598796e-07, "loss": 0.1506, "step": 5228 }, { "epoch": 0.8132192846034214, "grad_norm": 0.8563811458323806, "learning_rate": 8.363854499069668e-07, "loss": 0.1121, "step": 5229 }, { "epoch": 0.8133748055987559, "grad_norm": 0.6884669273312958, "learning_rate": 8.35033327738829e-07, "loss": 0.1211, "step": 5230 }, { "epoch": 0.8135303265940902, "grad_norm": 1.5237225206489, "learning_rate": 8.336821998063193e-07, "loss": 0.1838, "step": 5231 }, { "epoch": 0.8136858475894245, "grad_norm": 0.9904058737457814, "learning_rate": 8.323320664319701e-07, "loss": 0.1875, "step": 5232 }, { "epoch": 0.813841368584759, "grad_norm": 1.0719359908618293, "learning_rate": 8.309829279380754e-07, "loss": 0.1901, "step": 5233 }, { "epoch": 0.8139968895800933, "grad_norm": 1.1574483899698451, "learning_rate": 8.296347846466951e-07, "loss": 0.155, "step": 5234 }, { "epoch": 0.8141524105754276, "grad_norm": 1.639251341481778, "learning_rate": 8.282876368796505e-07, "loss": 0.2098, "step": 5235 }, { "epoch": 0.8143079315707621, "grad_norm": 1.0316343044481986, "learning_rate": 8.269414849585228e-07, "loss": 0.1837, "step": 5236 }, { "epoch": 0.8144634525660964, "grad_norm": 1.1462160466775755, "learning_rate": 8.255963292046587e-07, "loss": 0.1317, "step": 5237 }, { "epoch": 0.8146189735614308, "grad_norm": 0.9400246964050845, "learning_rate": 8.242521699391647e-07, "loss": 0.1441, "step": 5238 }, { "epoch": 0.8147744945567652, "grad_norm": 1.3752734760729721, "learning_rate": 8.229090074829099e-07, "loss": 0.19, "step": 5239 }, { "epoch": 0.8149300155520995, "grad_norm": 0.8765870327927434, "learning_rate": 8.215668421565276e-07, "loss": 0.1098, "step": 5240 }, { "epoch": 0.8150855365474339, "grad_norm": 0.9373135598709361, "learning_rate": 8.202256742804104e-07, "loss": 0.1265, "step": 5241 }, { "epoch": 0.8152410575427683, "grad_norm": 1.0629022885248562, "learning_rate": 8.188855041747123e-07, "loss": 0.1285, "step": 5242 }, { "epoch": 0.8153965785381027, "grad_norm": 0.8290818041510629, "learning_rate": 8.175463321593518e-07, "loss": 0.0904, "step": 5243 }, { "epoch": 0.815552099533437, "grad_norm": 1.2131974094159745, "learning_rate": 8.162081585540099e-07, "loss": 0.1306, "step": 5244 }, { "epoch": 0.8157076205287714, "grad_norm": 1.1422103699652495, "learning_rate": 8.148709836781243e-07, "loss": 0.1247, "step": 5245 }, { "epoch": 0.8158631415241058, "grad_norm": 0.901947295367284, "learning_rate": 8.135348078508997e-07, "loss": 0.1831, "step": 5246 }, { "epoch": 0.8160186625194401, "grad_norm": 0.951092425063768, "learning_rate": 8.121996313912989e-07, "loss": 0.1357, "step": 5247 }, { "epoch": 0.8161741835147744, "grad_norm": 1.1813903780904016, "learning_rate": 8.108654546180467e-07, "loss": 0.1036, "step": 5248 }, { "epoch": 0.8163297045101089, "grad_norm": 0.907985955259586, "learning_rate": 8.095322778496289e-07, "loss": 0.123, "step": 5249 }, { "epoch": 0.8164852255054432, "grad_norm": 0.7882181735790773, "learning_rate": 8.082001014042945e-07, "loss": 0.1563, "step": 5250 }, { "epoch": 0.8166407465007776, "grad_norm": 1.1218136456369971, "learning_rate": 8.068689256000539e-07, "loss": 0.1095, "step": 5251 }, { "epoch": 0.816796267496112, "grad_norm": 1.5952945301747776, "learning_rate": 8.055387507546747e-07, "loss": 0.2286, "step": 5252 }, { "epoch": 0.8169517884914463, "grad_norm": 1.0923613765612683, "learning_rate": 8.042095771856906e-07, "loss": 0.2257, "step": 5253 }, { "epoch": 0.8171073094867807, "grad_norm": 1.014529049500127, "learning_rate": 8.028814052103928e-07, "loss": 0.1606, "step": 5254 }, { "epoch": 0.8172628304821151, "grad_norm": 0.9067812047163438, "learning_rate": 8.015542351458333e-07, "loss": 0.1348, "step": 5255 }, { "epoch": 0.8174183514774495, "grad_norm": 0.9164522901519364, "learning_rate": 8.002280673088287e-07, "loss": 0.1762, "step": 5256 }, { "epoch": 0.8175738724727838, "grad_norm": 0.7694869098559198, "learning_rate": 7.989029020159517e-07, "loss": 0.0244, "step": 5257 }, { "epoch": 0.8177293934681182, "grad_norm": 1.0390152589582176, "learning_rate": 7.975787395835377e-07, "loss": 0.1708, "step": 5258 }, { "epoch": 0.8178849144634526, "grad_norm": 1.2840127092327531, "learning_rate": 7.962555803276834e-07, "loss": 0.1629, "step": 5259 }, { "epoch": 0.8180404354587869, "grad_norm": 0.7785099985629484, "learning_rate": 7.949334245642459e-07, "loss": 0.1356, "step": 5260 }, { "epoch": 0.8181959564541214, "grad_norm": 1.8523279391448342, "learning_rate": 7.936122726088408e-07, "loss": 0.1976, "step": 5261 }, { "epoch": 0.8183514774494557, "grad_norm": 1.455593916336306, "learning_rate": 7.922921247768478e-07, "loss": 0.1593, "step": 5262 }, { "epoch": 0.81850699844479, "grad_norm": 0.8605511466111299, "learning_rate": 7.909729813834027e-07, "loss": 0.0963, "step": 5263 }, { "epoch": 0.8186625194401245, "grad_norm": 0.9509514732334857, "learning_rate": 7.896548427434031e-07, "loss": 0.1498, "step": 5264 }, { "epoch": 0.8188180404354588, "grad_norm": 0.7713702726721933, "learning_rate": 7.883377091715089e-07, "loss": 0.136, "step": 5265 }, { "epoch": 0.8189735614307931, "grad_norm": 1.3032068920872113, "learning_rate": 7.870215809821357e-07, "loss": 0.1766, "step": 5266 }, { "epoch": 0.8191290824261275, "grad_norm": 0.8559688950144617, "learning_rate": 7.857064584894647e-07, "loss": 0.1721, "step": 5267 }, { "epoch": 0.8192846034214619, "grad_norm": 1.0857115261243198, "learning_rate": 7.843923420074311e-07, "loss": 0.1358, "step": 5268 }, { "epoch": 0.8194401244167963, "grad_norm": 1.1832428957120278, "learning_rate": 7.830792318497359e-07, "loss": 0.1404, "step": 5269 }, { "epoch": 0.8195956454121306, "grad_norm": 1.1403537693579286, "learning_rate": 7.817671283298345e-07, "loss": 0.1618, "step": 5270 }, { "epoch": 0.819751166407465, "grad_norm": 0.9693890513566523, "learning_rate": 7.804560317609438e-07, "loss": 0.1722, "step": 5271 }, { "epoch": 0.8199066874027994, "grad_norm": 0.8418212680830457, "learning_rate": 7.791459424560433e-07, "loss": 0.102, "step": 5272 }, { "epoch": 0.8200622083981337, "grad_norm": 1.1642013266813889, "learning_rate": 7.778368607278675e-07, "loss": 0.1245, "step": 5273 }, { "epoch": 0.8202177293934682, "grad_norm": 1.104996438655127, "learning_rate": 7.765287868889126e-07, "loss": 0.1238, "step": 5274 }, { "epoch": 0.8203732503888025, "grad_norm": 1.1749161240468298, "learning_rate": 7.752217212514346e-07, "loss": 0.2294, "step": 5275 }, { "epoch": 0.8205287713841368, "grad_norm": 0.8560742151720416, "learning_rate": 7.739156641274492e-07, "loss": 0.1857, "step": 5276 }, { "epoch": 0.8206842923794713, "grad_norm": 1.2131785958315993, "learning_rate": 7.726106158287289e-07, "loss": 0.16, "step": 5277 }, { "epoch": 0.8208398133748056, "grad_norm": 1.215118865034051, "learning_rate": 7.713065766668082e-07, "loss": 0.1475, "step": 5278 }, { "epoch": 0.8209953343701399, "grad_norm": 1.4439018240195154, "learning_rate": 7.700035469529793e-07, "loss": 0.141, "step": 5279 }, { "epoch": 0.8211508553654744, "grad_norm": 2.3253123978250794, "learning_rate": 7.687015269982917e-07, "loss": 0.2199, "step": 5280 }, { "epoch": 0.8213063763608087, "grad_norm": 0.8876791981899205, "learning_rate": 7.674005171135585e-07, "loss": 0.1091, "step": 5281 }, { "epoch": 0.8214618973561431, "grad_norm": 1.228486465473228, "learning_rate": 7.661005176093461e-07, "loss": 0.2012, "step": 5282 }, { "epoch": 0.8216174183514775, "grad_norm": 1.1142555874960098, "learning_rate": 7.648015287959843e-07, "loss": 0.1763, "step": 5283 }, { "epoch": 0.8217729393468118, "grad_norm": 1.117018550919326, "learning_rate": 7.635035509835603e-07, "loss": 0.1808, "step": 5284 }, { "epoch": 0.8219284603421462, "grad_norm": 0.756165291597364, "learning_rate": 7.622065844819188e-07, "loss": 0.1164, "step": 5285 }, { "epoch": 0.8220839813374805, "grad_norm": 0.8788289974605208, "learning_rate": 7.609106296006636e-07, "loss": 0.1233, "step": 5286 }, { "epoch": 0.822239502332815, "grad_norm": 0.9777204577919729, "learning_rate": 7.59615686649156e-07, "loss": 0.1354, "step": 5287 }, { "epoch": 0.8223950233281493, "grad_norm": 1.2068800670314601, "learning_rate": 7.583217559365197e-07, "loss": 0.1714, "step": 5288 }, { "epoch": 0.8225505443234836, "grad_norm": 1.2200896795529956, "learning_rate": 7.570288377716323e-07, "loss": 0.173, "step": 5289 }, { "epoch": 0.8227060653188181, "grad_norm": 1.2044450428743056, "learning_rate": 7.557369324631303e-07, "loss": 0.1389, "step": 5290 }, { "epoch": 0.8228615863141524, "grad_norm": 1.0864328594591324, "learning_rate": 7.544460403194109e-07, "loss": 0.1019, "step": 5291 }, { "epoch": 0.8230171073094867, "grad_norm": 1.4980915690985102, "learning_rate": 7.531561616486294e-07, "loss": 0.153, "step": 5292 }, { "epoch": 0.8231726283048212, "grad_norm": 1.0578418969595453, "learning_rate": 7.518672967586954e-07, "loss": 0.146, "step": 5293 }, { "epoch": 0.8233281493001555, "grad_norm": 1.2644263029296434, "learning_rate": 7.505794459572807e-07, "loss": 0.1186, "step": 5294 }, { "epoch": 0.8234836702954899, "grad_norm": 1.2431725231810036, "learning_rate": 7.492926095518127e-07, "loss": 0.0822, "step": 5295 }, { "epoch": 0.8236391912908243, "grad_norm": 1.205154690446287, "learning_rate": 7.480067878494757e-07, "loss": 0.187, "step": 5296 }, { "epoch": 0.8237947122861586, "grad_norm": 2.1995191262492684, "learning_rate": 7.467219811572158e-07, "loss": 0.1272, "step": 5297 }, { "epoch": 0.823950233281493, "grad_norm": 0.7306480507052485, "learning_rate": 7.45438189781732e-07, "loss": 0.1073, "step": 5298 }, { "epoch": 0.8241057542768274, "grad_norm": 1.2314972167633065, "learning_rate": 7.441554140294843e-07, "loss": 0.1059, "step": 5299 }, { "epoch": 0.8242612752721618, "grad_norm": 1.3084180924668811, "learning_rate": 7.4287365420669e-07, "loss": 0.1956, "step": 5300 }, { "epoch": 0.8242612752721618, "eval_loss": 0.161608025431633, "eval_runtime": 9.4426, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 5300 }, { "epoch": 0.8244167962674961, "grad_norm": 0.9183884320257549, "learning_rate": 7.41592910619322e-07, "loss": 0.1605, "step": 5301 }, { "epoch": 0.8245723172628305, "grad_norm": 0.6046520690383564, "learning_rate": 7.403131835731109e-07, "loss": 0.0894, "step": 5302 }, { "epoch": 0.8247278382581649, "grad_norm": 1.0480869216923623, "learning_rate": 7.390344733735477e-07, "loss": 0.1732, "step": 5303 }, { "epoch": 0.8248833592534992, "grad_norm": 0.9990406514288712, "learning_rate": 7.377567803258762e-07, "loss": 0.1292, "step": 5304 }, { "epoch": 0.8250388802488335, "grad_norm": 1.1687134639713503, "learning_rate": 7.364801047351011e-07, "loss": 0.0921, "step": 5305 }, { "epoch": 0.825194401244168, "grad_norm": 1.1183493982466974, "learning_rate": 7.352044469059799e-07, "loss": 0.0866, "step": 5306 }, { "epoch": 0.8253499222395023, "grad_norm": 1.316070867676314, "learning_rate": 7.339298071430323e-07, "loss": 0.1129, "step": 5307 }, { "epoch": 0.8255054432348367, "grad_norm": 1.0176162950441818, "learning_rate": 7.326561857505337e-07, "loss": 0.1548, "step": 5308 }, { "epoch": 0.8256609642301711, "grad_norm": 1.0217593933350988, "learning_rate": 7.313835830325123e-07, "loss": 0.1327, "step": 5309 }, { "epoch": 0.8258164852255054, "grad_norm": 1.8003468052537954, "learning_rate": 7.301119992927585e-07, "loss": 0.2126, "step": 5310 }, { "epoch": 0.8259720062208398, "grad_norm": 1.0064067301421227, "learning_rate": 7.28841434834816e-07, "loss": 0.0835, "step": 5311 }, { "epoch": 0.8261275272161742, "grad_norm": 1.4943106143803713, "learning_rate": 7.275718899619855e-07, "loss": 0.1033, "step": 5312 }, { "epoch": 0.8262830482115086, "grad_norm": 1.3992816284320915, "learning_rate": 7.263033649773271e-07, "loss": 0.133, "step": 5313 }, { "epoch": 0.8264385692068429, "grad_norm": 1.502475042752789, "learning_rate": 7.250358601836533e-07, "loss": 0.1344, "step": 5314 }, { "epoch": 0.8265940902021773, "grad_norm": 1.1584923388757802, "learning_rate": 7.237693758835357e-07, "loss": 0.2045, "step": 5315 }, { "epoch": 0.8267496111975117, "grad_norm": 1.1463885319007763, "learning_rate": 7.225039123793038e-07, "loss": 0.177, "step": 5316 }, { "epoch": 0.826905132192846, "grad_norm": 0.6981448142072617, "learning_rate": 7.212394699730396e-07, "loss": 0.1198, "step": 5317 }, { "epoch": 0.8270606531881805, "grad_norm": 0.9821099169393385, "learning_rate": 7.199760489665819e-07, "loss": 0.1207, "step": 5318 }, { "epoch": 0.8272161741835148, "grad_norm": 0.6139196905566119, "learning_rate": 7.187136496615299e-07, "loss": 0.1275, "step": 5319 }, { "epoch": 0.8273716951788491, "grad_norm": 1.1152439908755625, "learning_rate": 7.174522723592342e-07, "loss": 0.1648, "step": 5320 }, { "epoch": 0.8275272161741836, "grad_norm": 1.0328375572640396, "learning_rate": 7.161919173608023e-07, "loss": 0.176, "step": 5321 }, { "epoch": 0.8276827371695179, "grad_norm": 1.3342439651121083, "learning_rate": 7.149325849671001e-07, "loss": 0.1723, "step": 5322 }, { "epoch": 0.8278382581648522, "grad_norm": 0.7969863164435922, "learning_rate": 7.136742754787468e-07, "loss": 0.1844, "step": 5323 }, { "epoch": 0.8279937791601866, "grad_norm": 1.4746239760938487, "learning_rate": 7.124169891961196e-07, "loss": 0.1328, "step": 5324 }, { "epoch": 0.828149300155521, "grad_norm": 1.1626237275901252, "learning_rate": 7.111607264193482e-07, "loss": 0.1617, "step": 5325 }, { "epoch": 0.8283048211508554, "grad_norm": 1.2201212253539002, "learning_rate": 7.099054874483224e-07, "loss": 0.1583, "step": 5326 }, { "epoch": 0.8284603421461897, "grad_norm": 1.0518475553047062, "learning_rate": 7.086512725826849e-07, "loss": 0.3202, "step": 5327 }, { "epoch": 0.8286158631415241, "grad_norm": 0.9586215454934622, "learning_rate": 7.073980821218318e-07, "loss": 0.1048, "step": 5328 }, { "epoch": 0.8287713841368585, "grad_norm": 1.3069893551084466, "learning_rate": 7.061459163649203e-07, "loss": 0.1234, "step": 5329 }, { "epoch": 0.8289269051321928, "grad_norm": 1.1204682369570356, "learning_rate": 7.048947756108576e-07, "loss": 0.136, "step": 5330 }, { "epoch": 0.8290824261275272, "grad_norm": 0.8281666760666855, "learning_rate": 7.036446601583091e-07, "loss": 0.183, "step": 5331 }, { "epoch": 0.8292379471228616, "grad_norm": 1.1360904050571579, "learning_rate": 7.023955703056956e-07, "loss": 0.0911, "step": 5332 }, { "epoch": 0.8293934681181959, "grad_norm": 0.803306370277369, "learning_rate": 7.011475063511925e-07, "loss": 0.1512, "step": 5333 }, { "epoch": 0.8295489891135304, "grad_norm": 0.9464704267324799, "learning_rate": 6.999004685927275e-07, "loss": 0.1347, "step": 5334 }, { "epoch": 0.8297045101088647, "grad_norm": 0.9848397389922443, "learning_rate": 6.986544573279891e-07, "loss": 0.1473, "step": 5335 }, { "epoch": 0.829860031104199, "grad_norm": 1.3558972044748199, "learning_rate": 6.974094728544156e-07, "loss": 0.1899, "step": 5336 }, { "epoch": 0.8300155520995335, "grad_norm": 1.0602444470997987, "learning_rate": 6.961655154692021e-07, "loss": 0.1407, "step": 5337 }, { "epoch": 0.8301710730948678, "grad_norm": 0.9222066331831632, "learning_rate": 6.949225854692998e-07, "loss": 0.1617, "step": 5338 }, { "epoch": 0.8303265940902022, "grad_norm": 1.4046458078902235, "learning_rate": 6.936806831514121e-07, "loss": 0.1196, "step": 5339 }, { "epoch": 0.8304821150855366, "grad_norm": 0.8480945240224557, "learning_rate": 6.924398088119988e-07, "loss": 0.1151, "step": 5340 }, { "epoch": 0.8306376360808709, "grad_norm": 0.8843917820311208, "learning_rate": 6.911999627472748e-07, "loss": 0.1307, "step": 5341 }, { "epoch": 0.8307931570762053, "grad_norm": 1.0308825723331592, "learning_rate": 6.899611452532085e-07, "loss": 0.116, "step": 5342 }, { "epoch": 0.8309486780715396, "grad_norm": 1.3204175593680982, "learning_rate": 6.887233566255225e-07, "loss": 0.1057, "step": 5343 }, { "epoch": 0.831104199066874, "grad_norm": 0.8822711152706587, "learning_rate": 6.874865971596928e-07, "loss": 0.0907, "step": 5344 }, { "epoch": 0.8312597200622084, "grad_norm": 1.0999055408261513, "learning_rate": 6.862508671509538e-07, "loss": 0.1157, "step": 5345 }, { "epoch": 0.8314152410575427, "grad_norm": 0.7931446995947834, "learning_rate": 6.850161668942889e-07, "loss": 0.1253, "step": 5346 }, { "epoch": 0.8315707620528772, "grad_norm": 1.046377242302317, "learning_rate": 6.837824966844397e-07, "loss": 0.1218, "step": 5347 }, { "epoch": 0.8317262830482115, "grad_norm": 1.2294341016236352, "learning_rate": 6.825498568159011e-07, "loss": 0.1477, "step": 5348 }, { "epoch": 0.8318818040435458, "grad_norm": 1.1807958926012059, "learning_rate": 6.813182475829205e-07, "loss": 0.1483, "step": 5349 }, { "epoch": 0.8320373250388803, "grad_norm": 1.3339387752587357, "learning_rate": 6.800876692794994e-07, "loss": 0.1776, "step": 5350 }, { "epoch": 0.8321928460342146, "grad_norm": 1.3635318205548363, "learning_rate": 6.788581221993962e-07, "loss": 0.1607, "step": 5351 }, { "epoch": 0.832348367029549, "grad_norm": 1.6163120884570954, "learning_rate": 6.776296066361193e-07, "loss": 0.1635, "step": 5352 }, { "epoch": 0.8325038880248834, "grad_norm": 1.2419919605157625, "learning_rate": 6.764021228829321e-07, "loss": 0.1633, "step": 5353 }, { "epoch": 0.8326594090202177, "grad_norm": 1.1735686347704823, "learning_rate": 6.751756712328539e-07, "loss": 0.1612, "step": 5354 }, { "epoch": 0.8328149300155521, "grad_norm": 0.6280994448913608, "learning_rate": 6.739502519786534e-07, "loss": 0.045, "step": 5355 }, { "epoch": 0.8329704510108865, "grad_norm": 0.9914249573644739, "learning_rate": 6.727258654128571e-07, "loss": 0.1302, "step": 5356 }, { "epoch": 0.8331259720062208, "grad_norm": 1.3453354065484557, "learning_rate": 6.715025118277435e-07, "loss": 0.1369, "step": 5357 }, { "epoch": 0.8332814930015552, "grad_norm": 1.0921580814862846, "learning_rate": 6.702801915153434e-07, "loss": 0.1358, "step": 5358 }, { "epoch": 0.8334370139968896, "grad_norm": 1.144762893644582, "learning_rate": 6.690589047674401e-07, "loss": 0.0843, "step": 5359 }, { "epoch": 0.833592534992224, "grad_norm": 0.7866609930794485, "learning_rate": 6.678386518755747e-07, "loss": 0.1226, "step": 5360 }, { "epoch": 0.8337480559875583, "grad_norm": 1.0539600400094358, "learning_rate": 6.666194331310377e-07, "loss": 0.118, "step": 5361 }, { "epoch": 0.8339035769828926, "grad_norm": 0.7649591763558078, "learning_rate": 6.654012488248713e-07, "loss": 0.1131, "step": 5362 }, { "epoch": 0.8340590979782271, "grad_norm": 0.9636038220821082, "learning_rate": 6.641840992478755e-07, "loss": 0.094, "step": 5363 }, { "epoch": 0.8342146189735614, "grad_norm": 1.3772151685286937, "learning_rate": 6.629679846906006e-07, "loss": 0.1079, "step": 5364 }, { "epoch": 0.8343701399688958, "grad_norm": 1.1845698221745644, "learning_rate": 6.617529054433503e-07, "loss": 0.1668, "step": 5365 }, { "epoch": 0.8345256609642302, "grad_norm": 0.6861121237100982, "learning_rate": 6.605388617961794e-07, "loss": 0.072, "step": 5366 }, { "epoch": 0.8346811819595645, "grad_norm": 1.5448569902740539, "learning_rate": 6.593258540388991e-07, "loss": 0.1002, "step": 5367 }, { "epoch": 0.8348367029548989, "grad_norm": 1.0506593165822316, "learning_rate": 6.5811388246107e-07, "loss": 0.135, "step": 5368 }, { "epoch": 0.8349922239502333, "grad_norm": 0.794838477906218, "learning_rate": 6.569029473520061e-07, "loss": 0.1947, "step": 5369 }, { "epoch": 0.8351477449455676, "grad_norm": 1.5377499851533645, "learning_rate": 6.556930490007762e-07, "loss": 0.1884, "step": 5370 }, { "epoch": 0.835303265940902, "grad_norm": 1.3920543469047566, "learning_rate": 6.544841876961977e-07, "loss": 0.1308, "step": 5371 }, { "epoch": 0.8354587869362364, "grad_norm": 1.0804736251661284, "learning_rate": 6.532763637268441e-07, "loss": 0.1834, "step": 5372 }, { "epoch": 0.8356143079315708, "grad_norm": 0.8860646166902433, "learning_rate": 6.520695773810409e-07, "loss": 0.1124, "step": 5373 }, { "epoch": 0.8357698289269051, "grad_norm": 1.3271386555951636, "learning_rate": 6.508638289468633e-07, "loss": 0.2199, "step": 5374 }, { "epoch": 0.8359253499222395, "grad_norm": 1.8648848098718793, "learning_rate": 6.496591187121398e-07, "loss": 0.1464, "step": 5375 }, { "epoch": 0.8360808709175739, "grad_norm": 1.215057936653217, "learning_rate": 6.484554469644533e-07, "loss": 0.1279, "step": 5376 }, { "epoch": 0.8362363919129082, "grad_norm": 0.9065232806336396, "learning_rate": 6.472528139911361e-07, "loss": 0.1688, "step": 5377 }, { "epoch": 0.8363919129082427, "grad_norm": 1.0548427376582483, "learning_rate": 6.460512200792718e-07, "loss": 0.2057, "step": 5378 }, { "epoch": 0.836547433903577, "grad_norm": 0.9968087601328377, "learning_rate": 6.448506655156994e-07, "loss": 0.131, "step": 5379 }, { "epoch": 0.8367029548989113, "grad_norm": 1.407926145459382, "learning_rate": 6.436511505870091e-07, "loss": 0.1246, "step": 5380 }, { "epoch": 0.8368584758942457, "grad_norm": 1.1288248823711307, "learning_rate": 6.424526755795407e-07, "loss": 0.1342, "step": 5381 }, { "epoch": 0.8370139968895801, "grad_norm": 1.9746766117275305, "learning_rate": 6.412552407793854e-07, "loss": 0.0982, "step": 5382 }, { "epoch": 0.8371695178849144, "grad_norm": 1.184794585548143, "learning_rate": 6.4005884647239e-07, "loss": 0.1838, "step": 5383 }, { "epoch": 0.8373250388802488, "grad_norm": 0.8216696917351101, "learning_rate": 6.388634929441495e-07, "loss": 0.0817, "step": 5384 }, { "epoch": 0.8374805598755832, "grad_norm": 1.2179743720519673, "learning_rate": 6.376691804800106e-07, "loss": 0.3074, "step": 5385 }, { "epoch": 0.8376360808709176, "grad_norm": 0.8462158647773919, "learning_rate": 6.364759093650741e-07, "loss": 0.1461, "step": 5386 }, { "epoch": 0.8377916018662519, "grad_norm": 0.9877337536351601, "learning_rate": 6.352836798841882e-07, "loss": 0.0996, "step": 5387 }, { "epoch": 0.8379471228615863, "grad_norm": 1.5296870982308424, "learning_rate": 6.34092492321956e-07, "loss": 0.136, "step": 5388 }, { "epoch": 0.8381026438569207, "grad_norm": 1.1285825750804055, "learning_rate": 6.329023469627316e-07, "loss": 0.1363, "step": 5389 }, { "epoch": 0.838258164852255, "grad_norm": 1.1385922883554642, "learning_rate": 6.317132440906188e-07, "loss": 0.1841, "step": 5390 }, { "epoch": 0.8384136858475895, "grad_norm": 0.9435535999873713, "learning_rate": 6.305251839894711e-07, "loss": 0.1027, "step": 5391 }, { "epoch": 0.8385692068429238, "grad_norm": 0.947272546106738, "learning_rate": 6.293381669428972e-07, "loss": 0.1221, "step": 5392 }, { "epoch": 0.8387247278382581, "grad_norm": 1.1221599426940856, "learning_rate": 6.281521932342544e-07, "loss": 0.1655, "step": 5393 }, { "epoch": 0.8388802488335926, "grad_norm": 0.9296111284726964, "learning_rate": 6.269672631466489e-07, "loss": 0.0894, "step": 5394 }, { "epoch": 0.8390357698289269, "grad_norm": 1.2806859975359395, "learning_rate": 6.257833769629424e-07, "loss": 0.1717, "step": 5395 }, { "epoch": 0.8391912908242612, "grad_norm": 0.9566390803606257, "learning_rate": 6.246005349657452e-07, "loss": 0.1189, "step": 5396 }, { "epoch": 0.8393468118195957, "grad_norm": 1.2562853228683564, "learning_rate": 6.234187374374162e-07, "loss": 0.2148, "step": 5397 }, { "epoch": 0.83950233281493, "grad_norm": 1.1898690456767762, "learning_rate": 6.222379846600696e-07, "loss": 0.1644, "step": 5398 }, { "epoch": 0.8396578538102644, "grad_norm": 0.9316838003611182, "learning_rate": 6.21058276915566e-07, "loss": 0.1308, "step": 5399 }, { "epoch": 0.8398133748055988, "grad_norm": 0.8783220889685971, "learning_rate": 6.198796144855168e-07, "loss": 0.1319, "step": 5400 }, { "epoch": 0.8398133748055988, "eval_loss": 0.1616022288799286, "eval_runtime": 9.4658, "eval_samples_per_second": 2.747, "eval_steps_per_second": 0.74, "step": 5400 }, { "epoch": 0.8399688958009331, "grad_norm": 0.7596863144644015, "learning_rate": 6.187019976512876e-07, "loss": 0.112, "step": 5401 }, { "epoch": 0.8401244167962675, "grad_norm": 1.1197534014843953, "learning_rate": 6.175254266939912e-07, "loss": 0.1013, "step": 5402 }, { "epoch": 0.8402799377916018, "grad_norm": 0.9240350264048309, "learning_rate": 6.163499018944902e-07, "loss": 0.1546, "step": 5403 }, { "epoch": 0.8404354587869363, "grad_norm": 0.6844120695364478, "learning_rate": 6.151754235333989e-07, "loss": 0.0623, "step": 5404 }, { "epoch": 0.8405909797822706, "grad_norm": 1.0024305301349785, "learning_rate": 6.140019918910845e-07, "loss": 0.2048, "step": 5405 }, { "epoch": 0.8407465007776049, "grad_norm": 1.2569405687238568, "learning_rate": 6.128296072476591e-07, "loss": 0.1279, "step": 5406 }, { "epoch": 0.8409020217729394, "grad_norm": 1.0753493306771365, "learning_rate": 6.116582698829871e-07, "loss": 0.132, "step": 5407 }, { "epoch": 0.8410575427682737, "grad_norm": 0.9893417537275409, "learning_rate": 6.104879800766838e-07, "loss": 0.158, "step": 5408 }, { "epoch": 0.841213063763608, "grad_norm": 0.9817110102300572, "learning_rate": 6.093187381081145e-07, "loss": 0.1448, "step": 5409 }, { "epoch": 0.8413685847589425, "grad_norm": 0.8170052291556689, "learning_rate": 6.081505442563912e-07, "loss": 0.1182, "step": 5410 }, { "epoch": 0.8415241057542768, "grad_norm": 1.0018500114148856, "learning_rate": 6.069833988003793e-07, "loss": 0.1126, "step": 5411 }, { "epoch": 0.8416796267496112, "grad_norm": 1.0131655538588993, "learning_rate": 6.058173020186936e-07, "loss": 0.1162, "step": 5412 }, { "epoch": 0.8418351477449456, "grad_norm": 0.8215662836362867, "learning_rate": 6.046522541896966e-07, "loss": 0.137, "step": 5413 }, { "epoch": 0.84199066874028, "grad_norm": 1.2426353620646349, "learning_rate": 6.034882555915023e-07, "loss": 0.1122, "step": 5414 }, { "epoch": 0.8421461897356143, "grad_norm": 0.768880419789266, "learning_rate": 6.023253065019729e-07, "loss": 0.1664, "step": 5415 }, { "epoch": 0.8423017107309487, "grad_norm": 1.1938617575677026, "learning_rate": 6.0116340719872e-07, "loss": 0.1679, "step": 5416 }, { "epoch": 0.8424572317262831, "grad_norm": 0.9525856699138132, "learning_rate": 6.000025579591062e-07, "loss": 0.095, "step": 5417 }, { "epoch": 0.8426127527216174, "grad_norm": 1.3057079367420166, "learning_rate": 5.988427590602424e-07, "loss": 0.115, "step": 5418 }, { "epoch": 0.8427682737169518, "grad_norm": 1.0158846943238318, "learning_rate": 5.976840107789872e-07, "loss": 0.164, "step": 5419 }, { "epoch": 0.8429237947122862, "grad_norm": 1.72533736051024, "learning_rate": 5.965263133919508e-07, "loss": 0.1789, "step": 5420 }, { "epoch": 0.8430793157076205, "grad_norm": 0.820337692404724, "learning_rate": 5.953696671754928e-07, "loss": 0.1412, "step": 5421 }, { "epoch": 0.8432348367029548, "grad_norm": 0.6447029781765707, "learning_rate": 5.942140724057205e-07, "loss": 0.1257, "step": 5422 }, { "epoch": 0.8433903576982893, "grad_norm": 1.3788605339958224, "learning_rate": 5.930595293584884e-07, "loss": 0.1691, "step": 5423 }, { "epoch": 0.8435458786936236, "grad_norm": 1.1287868566581434, "learning_rate": 5.919060383094049e-07, "loss": 0.1605, "step": 5424 }, { "epoch": 0.843701399688958, "grad_norm": 0.8788289970323909, "learning_rate": 5.907535995338232e-07, "loss": 0.118, "step": 5425 }, { "epoch": 0.8438569206842924, "grad_norm": 0.9616798624108981, "learning_rate": 5.896022133068452e-07, "loss": 0.1092, "step": 5426 }, { "epoch": 0.8440124416796267, "grad_norm": 1.131394158448979, "learning_rate": 5.88451879903324e-07, "loss": 0.1702, "step": 5427 }, { "epoch": 0.8441679626749611, "grad_norm": 0.8295931220176931, "learning_rate": 5.873025995978616e-07, "loss": 0.1194, "step": 5428 }, { "epoch": 0.8443234836702955, "grad_norm": 0.9797247700043263, "learning_rate": 5.861543726648045e-07, "loss": 0.1715, "step": 5429 }, { "epoch": 0.8444790046656299, "grad_norm": 1.2318628447543556, "learning_rate": 5.850071993782525e-07, "loss": 0.1442, "step": 5430 }, { "epoch": 0.8446345256609642, "grad_norm": 1.2158526659760822, "learning_rate": 5.83861080012052e-07, "loss": 0.1428, "step": 5431 }, { "epoch": 0.8447900466562986, "grad_norm": 1.1316840077529913, "learning_rate": 5.827160148397954e-07, "loss": 0.1238, "step": 5432 }, { "epoch": 0.844945567651633, "grad_norm": 1.2498551511538933, "learning_rate": 5.815720041348283e-07, "loss": 0.1842, "step": 5433 }, { "epoch": 0.8451010886469673, "grad_norm": 0.9885997464945713, "learning_rate": 5.804290481702412e-07, "loss": 0.1379, "step": 5434 }, { "epoch": 0.8452566096423018, "grad_norm": 1.0477193259971673, "learning_rate": 5.792871472188721e-07, "loss": 0.1674, "step": 5435 }, { "epoch": 0.8454121306376361, "grad_norm": 1.0515676253535184, "learning_rate": 5.7814630155331e-07, "loss": 0.17, "step": 5436 }, { "epoch": 0.8455676516329704, "grad_norm": 1.402565436620513, "learning_rate": 5.770065114458923e-07, "loss": 0.1568, "step": 5437 }, { "epoch": 0.8457231726283049, "grad_norm": 1.0566249053921297, "learning_rate": 5.758677771686994e-07, "loss": 0.1726, "step": 5438 }, { "epoch": 0.8458786936236392, "grad_norm": 0.7953821732802839, "learning_rate": 5.747300989935656e-07, "loss": 0.1551, "step": 5439 }, { "epoch": 0.8460342146189735, "grad_norm": 1.0944763742146142, "learning_rate": 5.735934771920704e-07, "loss": 0.1939, "step": 5440 }, { "epoch": 0.8461897356143079, "grad_norm": 2.5751477870938837, "learning_rate": 5.7245791203554e-07, "loss": 0.0926, "step": 5441 }, { "epoch": 0.8463452566096423, "grad_norm": 1.0823296810624412, "learning_rate": 5.713234037950494e-07, "loss": 0.1728, "step": 5442 }, { "epoch": 0.8465007776049767, "grad_norm": 1.0415361575517994, "learning_rate": 5.701899527414223e-07, "loss": 0.1467, "step": 5443 }, { "epoch": 0.846656298600311, "grad_norm": 12.374071625843884, "learning_rate": 5.690575591452308e-07, "loss": 0.9607, "step": 5444 }, { "epoch": 0.8468118195956454, "grad_norm": 1.1299399117525881, "learning_rate": 5.679262232767902e-07, "loss": 0.2084, "step": 5445 }, { "epoch": 0.8469673405909798, "grad_norm": 1.3008769142274024, "learning_rate": 5.667959454061683e-07, "loss": 0.079, "step": 5446 }, { "epoch": 0.8471228615863141, "grad_norm": 1.0190851194249608, "learning_rate": 5.656667258031779e-07, "loss": 0.1159, "step": 5447 }, { "epoch": 0.8472783825816486, "grad_norm": 0.8865792468159991, "learning_rate": 5.645385647373774e-07, "loss": 0.1526, "step": 5448 }, { "epoch": 0.8474339035769829, "grad_norm": 0.827769912873066, "learning_rate": 5.634114624780773e-07, "loss": 0.2395, "step": 5449 }, { "epoch": 0.8475894245723172, "grad_norm": 5.19706682152406, "learning_rate": 5.622854192943317e-07, "loss": 0.1846, "step": 5450 }, { "epoch": 0.8477449455676517, "grad_norm": 1.00358020495566, "learning_rate": 5.611604354549416e-07, "loss": 0.124, "step": 5451 }, { "epoch": 0.847900466562986, "grad_norm": 1.2378342318692006, "learning_rate": 5.600365112284578e-07, "loss": 0.146, "step": 5452 }, { "epoch": 0.8480559875583203, "grad_norm": 0.8764935735210747, "learning_rate": 5.589136468831763e-07, "loss": 0.0934, "step": 5453 }, { "epoch": 0.8482115085536548, "grad_norm": 1.0082253803215078, "learning_rate": 5.577918426871398e-07, "loss": 0.1324, "step": 5454 }, { "epoch": 0.8483670295489891, "grad_norm": 1.101862666604851, "learning_rate": 5.566710989081403e-07, "loss": 0.1555, "step": 5455 }, { "epoch": 0.8485225505443235, "grad_norm": 1.4958180359807571, "learning_rate": 5.555514158137143e-07, "loss": 0.147, "step": 5456 }, { "epoch": 0.8486780715396579, "grad_norm": 1.2224899504041187, "learning_rate": 5.544327936711436e-07, "loss": 0.1215, "step": 5457 }, { "epoch": 0.8488335925349922, "grad_norm": 1.1339666343089687, "learning_rate": 5.533152327474623e-07, "loss": 0.1347, "step": 5458 }, { "epoch": 0.8489891135303266, "grad_norm": 0.9405666876526453, "learning_rate": 5.521987333094447e-07, "loss": 0.1354, "step": 5459 }, { "epoch": 0.8491446345256609, "grad_norm": 1.1294546450043745, "learning_rate": 5.510832956236173e-07, "loss": 0.2123, "step": 5460 }, { "epoch": 0.8493001555209954, "grad_norm": 0.7272458852936855, "learning_rate": 5.49968919956248e-07, "loss": 0.1473, "step": 5461 }, { "epoch": 0.8494556765163297, "grad_norm": 0.9843946009876944, "learning_rate": 5.488556065733569e-07, "loss": 0.1339, "step": 5462 }, { "epoch": 0.849611197511664, "grad_norm": 0.8113909754264533, "learning_rate": 5.477433557407064e-07, "loss": 0.1346, "step": 5463 }, { "epoch": 0.8497667185069985, "grad_norm": 1.1387042665854659, "learning_rate": 5.46632167723804e-07, "loss": 0.0838, "step": 5464 }, { "epoch": 0.8499222395023328, "grad_norm": 1.156663762915337, "learning_rate": 5.455220427879088e-07, "loss": 0.1723, "step": 5465 }, { "epoch": 0.8500777604976671, "grad_norm": 1.232809239284718, "learning_rate": 5.44412981198022e-07, "loss": 0.0977, "step": 5466 }, { "epoch": 0.8502332814930016, "grad_norm": 1.2238884071438756, "learning_rate": 5.433049832188903e-07, "loss": 0.1997, "step": 5467 }, { "epoch": 0.8503888024883359, "grad_norm": 0.764865129870327, "learning_rate": 5.421980491150103e-07, "loss": 0.0786, "step": 5468 }, { "epoch": 0.8505443234836703, "grad_norm": 1.0941286830031527, "learning_rate": 5.410921791506235e-07, "loss": 0.1503, "step": 5469 }, { "epoch": 0.8506998444790047, "grad_norm": 1.2640394515227331, "learning_rate": 5.399873735897137e-07, "loss": 0.1399, "step": 5470 }, { "epoch": 0.850855365474339, "grad_norm": 0.9753836815701394, "learning_rate": 5.388836326960161e-07, "loss": 0.168, "step": 5471 }, { "epoch": 0.8510108864696734, "grad_norm": 1.288501302932176, "learning_rate": 5.377809567330078e-07, "loss": 0.1667, "step": 5472 }, { "epoch": 0.8511664074650078, "grad_norm": 0.7572921566702671, "learning_rate": 5.366793459639119e-07, "loss": 0.0699, "step": 5473 }, { "epoch": 0.8513219284603422, "grad_norm": 0.7756241992152608, "learning_rate": 5.355788006517004e-07, "loss": 0.1377, "step": 5474 }, { "epoch": 0.8514774494556765, "grad_norm": 1.214663676934432, "learning_rate": 5.344793210590866e-07, "loss": 0.1364, "step": 5475 }, { "epoch": 0.8516329704510109, "grad_norm": 0.5772269956208795, "learning_rate": 5.33380907448534e-07, "loss": 0.1169, "step": 5476 }, { "epoch": 0.8517884914463453, "grad_norm": 1.309626810269974, "learning_rate": 5.322835600822468e-07, "loss": 0.1848, "step": 5477 }, { "epoch": 0.8519440124416796, "grad_norm": 1.6301968727920582, "learning_rate": 5.31187279222179e-07, "loss": 0.1456, "step": 5478 }, { "epoch": 0.852099533437014, "grad_norm": 1.5593244709653238, "learning_rate": 5.300920651300278e-07, "loss": 0.1745, "step": 5479 }, { "epoch": 0.8522550544323484, "grad_norm": 1.0239230854905843, "learning_rate": 5.289979180672344e-07, "loss": 0.0836, "step": 5480 }, { "epoch": 0.8524105754276827, "grad_norm": 1.1233505567281579, "learning_rate": 5.279048382949892e-07, "loss": 0.2183, "step": 5481 }, { "epoch": 0.8525660964230171, "grad_norm": 0.7482866946657345, "learning_rate": 5.26812826074225e-07, "loss": 0.1286, "step": 5482 }, { "epoch": 0.8527216174183515, "grad_norm": 1.1953749706205612, "learning_rate": 5.25721881665619e-07, "loss": 0.1525, "step": 5483 }, { "epoch": 0.8528771384136858, "grad_norm": 5.066358674435666, "learning_rate": 5.246320053295955e-07, "loss": 0.1071, "step": 5484 }, { "epoch": 0.8530326594090202, "grad_norm": 1.4326716395405845, "learning_rate": 5.235431973263245e-07, "loss": 0.1411, "step": 5485 }, { "epoch": 0.8531881804043546, "grad_norm": 0.9880502802622516, "learning_rate": 5.224554579157182e-07, "loss": 0.1728, "step": 5486 }, { "epoch": 0.853343701399689, "grad_norm": 1.1547901883669662, "learning_rate": 5.213687873574363e-07, "loss": 0.1425, "step": 5487 }, { "epoch": 0.8534992223950233, "grad_norm": 0.8261129980044236, "learning_rate": 5.202831859108815e-07, "loss": 0.0938, "step": 5488 }, { "epoch": 0.8536547433903577, "grad_norm": 0.8644666571047549, "learning_rate": 5.191986538352007e-07, "loss": 0.1283, "step": 5489 }, { "epoch": 0.8538102643856921, "grad_norm": 1.0868194471303323, "learning_rate": 5.181151913892896e-07, "loss": 0.1612, "step": 5490 }, { "epoch": 0.8539657853810264, "grad_norm": 0.951712029575812, "learning_rate": 5.170327988317836e-07, "loss": 0.1577, "step": 5491 }, { "epoch": 0.8541213063763609, "grad_norm": 0.9528801801052349, "learning_rate": 5.159514764210666e-07, "loss": 0.1333, "step": 5492 }, { "epoch": 0.8542768273716952, "grad_norm": 1.1096962240605845, "learning_rate": 5.148712244152631e-07, "loss": 0.1583, "step": 5493 }, { "epoch": 0.8544323483670295, "grad_norm": 0.8120149407352528, "learning_rate": 5.137920430722465e-07, "loss": 0.1079, "step": 5494 }, { "epoch": 0.854587869362364, "grad_norm": 1.009662786474378, "learning_rate": 5.127139326496311e-07, "loss": 0.1625, "step": 5495 }, { "epoch": 0.8547433903576983, "grad_norm": 1.8538255552543559, "learning_rate": 5.116368934047778e-07, "loss": 0.1464, "step": 5496 }, { "epoch": 0.8548989113530326, "grad_norm": 2.131756051432387, "learning_rate": 5.105609255947907e-07, "loss": 0.1694, "step": 5497 }, { "epoch": 0.855054432348367, "grad_norm": 0.9386070520379609, "learning_rate": 5.094860294765186e-07, "loss": 0.1663, "step": 5498 }, { "epoch": 0.8552099533437014, "grad_norm": 0.8856496102850174, "learning_rate": 5.084122053065521e-07, "loss": 0.1181, "step": 5499 }, { "epoch": 0.8553654743390358, "grad_norm": 0.8501403555856336, "learning_rate": 5.073394533412296e-07, "loss": 0.0767, "step": 5500 }, { "epoch": 0.8553654743390358, "eval_loss": 0.16114845871925354, "eval_runtime": 9.4463, "eval_samples_per_second": 2.752, "eval_steps_per_second": 0.741, "step": 5500 }, { "epoch": 0.8555209953343701, "grad_norm": 0.9013646217627421, "learning_rate": 5.062677738366329e-07, "loss": 0.1823, "step": 5501 }, { "epoch": 0.8556765163297045, "grad_norm": 1.2335904246528187, "learning_rate": 5.05197167048585e-07, "loss": 0.1683, "step": 5502 }, { "epoch": 0.8558320373250389, "grad_norm": 1.0071832995766743, "learning_rate": 5.041276332326567e-07, "loss": 0.1401, "step": 5503 }, { "epoch": 0.8559875583203732, "grad_norm": 0.8675748008190723, "learning_rate": 5.030591726441592e-07, "loss": 0.1764, "step": 5504 }, { "epoch": 0.8561430793157077, "grad_norm": 0.8867540175067826, "learning_rate": 5.019917855381484e-07, "loss": 0.1305, "step": 5505 }, { "epoch": 0.856298600311042, "grad_norm": 0.8114108157942236, "learning_rate": 5.009254721694262e-07, "loss": 0.1439, "step": 5506 }, { "epoch": 0.8564541213063763, "grad_norm": 1.1880220076806367, "learning_rate": 4.998602327925344e-07, "loss": 0.1203, "step": 5507 }, { "epoch": 0.8566096423017108, "grad_norm": 1.3684074312001706, "learning_rate": 4.987960676617626e-07, "loss": 0.1881, "step": 5508 }, { "epoch": 0.8567651632970451, "grad_norm": 0.9672859122535615, "learning_rate": 4.977329770311406e-07, "loss": 0.1353, "step": 5509 }, { "epoch": 0.8569206842923794, "grad_norm": 0.8784653861229427, "learning_rate": 4.966709611544435e-07, "loss": 0.1792, "step": 5510 }, { "epoch": 0.8570762052877139, "grad_norm": 1.1101766684801433, "learning_rate": 4.956100202851888e-07, "loss": 0.1725, "step": 5511 }, { "epoch": 0.8572317262830482, "grad_norm": 1.1448018342235393, "learning_rate": 4.945501546766384e-07, "loss": 0.1599, "step": 5512 }, { "epoch": 0.8573872472783826, "grad_norm": 0.9810694384499137, "learning_rate": 4.934913645817973e-07, "loss": 0.1136, "step": 5513 }, { "epoch": 0.857542768273717, "grad_norm": 0.8483823524228962, "learning_rate": 4.924336502534122e-07, "loss": 0.1252, "step": 5514 }, { "epoch": 0.8576982892690513, "grad_norm": 1.1711186982919686, "learning_rate": 4.913770119439753e-07, "loss": 0.1643, "step": 5515 }, { "epoch": 0.8578538102643857, "grad_norm": 0.9563379003722419, "learning_rate": 4.903214499057201e-07, "loss": 0.1606, "step": 5516 }, { "epoch": 0.85800933125972, "grad_norm": 0.8842502453979008, "learning_rate": 4.892669643906251e-07, "loss": 0.171, "step": 5517 }, { "epoch": 0.8581648522550545, "grad_norm": 0.9355255190717527, "learning_rate": 4.882135556504092e-07, "loss": 0.0887, "step": 5518 }, { "epoch": 0.8583203732503888, "grad_norm": 0.8912598259424035, "learning_rate": 4.871612239365381e-07, "loss": 0.1363, "step": 5519 }, { "epoch": 0.8584758942457231, "grad_norm": 1.1524540054293018, "learning_rate": 4.861099695002158e-07, "loss": 0.1156, "step": 5520 }, { "epoch": 0.8586314152410576, "grad_norm": 1.0856315709506636, "learning_rate": 4.850597925923916e-07, "loss": 0.1056, "step": 5521 }, { "epoch": 0.8587869362363919, "grad_norm": 0.7761694969498629, "learning_rate": 4.84010693463759e-07, "loss": 0.1433, "step": 5522 }, { "epoch": 0.8589424572317262, "grad_norm": 1.3078739234921666, "learning_rate": 4.829626723647502e-07, "loss": 0.1398, "step": 5523 }, { "epoch": 0.8590979782270607, "grad_norm": 0.8227858013417971, "learning_rate": 4.819157295455451e-07, "loss": 0.0884, "step": 5524 }, { "epoch": 0.859253499222395, "grad_norm": 1.076486043318519, "learning_rate": 4.808698652560612e-07, "loss": 0.0926, "step": 5525 }, { "epoch": 0.8594090202177294, "grad_norm": 0.8172720437984277, "learning_rate": 4.79825079745963e-07, "loss": 0.1519, "step": 5526 }, { "epoch": 0.8595645412130638, "grad_norm": 1.1193611305780606, "learning_rate": 4.787813732646529e-07, "loss": 0.211, "step": 5527 }, { "epoch": 0.8597200622083981, "grad_norm": 0.9714540736664157, "learning_rate": 4.777387460612809e-07, "loss": 0.1737, "step": 5528 }, { "epoch": 0.8598755832037325, "grad_norm": 1.4849481138542455, "learning_rate": 4.7669719838473573e-07, "loss": 0.1563, "step": 5529 }, { "epoch": 0.8600311041990669, "grad_norm": 1.1707807242860675, "learning_rate": 4.7565673048364735e-07, "loss": 0.16, "step": 5530 }, { "epoch": 0.8601866251944013, "grad_norm": 1.0291637343560769, "learning_rate": 4.746173426063927e-07, "loss": 0.1414, "step": 5531 }, { "epoch": 0.8603421461897356, "grad_norm": 0.8646785130540109, "learning_rate": 4.7357903500108603e-07, "loss": 0.0793, "step": 5532 }, { "epoch": 0.86049766718507, "grad_norm": 1.0655369139766717, "learning_rate": 4.725418079155869e-07, "loss": 0.1558, "step": 5533 }, { "epoch": 0.8606531881804044, "grad_norm": 1.301411178864828, "learning_rate": 4.715056615974972e-07, "loss": 0.1382, "step": 5534 }, { "epoch": 0.8608087091757387, "grad_norm": 1.4345306198798067, "learning_rate": 4.704705962941575e-07, "loss": 0.1617, "step": 5535 }, { "epoch": 0.860964230171073, "grad_norm": 0.7582687745456371, "learning_rate": 4.6943661225265335e-07, "loss": 0.1475, "step": 5536 }, { "epoch": 0.8611197511664075, "grad_norm": 0.6815522367316157, "learning_rate": 4.6840370971980965e-07, "loss": 0.0917, "step": 5537 }, { "epoch": 0.8612752721617418, "grad_norm": 0.9282008295250717, "learning_rate": 4.673718889421969e-07, "loss": 0.171, "step": 5538 }, { "epoch": 0.8614307931570762, "grad_norm": 1.2132411717447686, "learning_rate": 4.6634115016612303e-07, "loss": 0.1508, "step": 5539 }, { "epoch": 0.8615863141524106, "grad_norm": 1.0655635283537512, "learning_rate": 4.6531149363764126e-07, "loss": 0.1638, "step": 5540 }, { "epoch": 0.8617418351477449, "grad_norm": 0.8098214868725473, "learning_rate": 4.642829196025439e-07, "loss": 0.1688, "step": 5541 }, { "epoch": 0.8618973561430793, "grad_norm": 0.8540448076285858, "learning_rate": 4.632554283063667e-07, "loss": 0.1018, "step": 5542 }, { "epoch": 0.8620528771384137, "grad_norm": 1.2037043181779212, "learning_rate": 4.6222901999438573e-07, "loss": 0.1468, "step": 5543 }, { "epoch": 0.862208398133748, "grad_norm": 1.609688907284011, "learning_rate": 4.612036949116194e-07, "loss": 0.1394, "step": 5544 }, { "epoch": 0.8623639191290824, "grad_norm": 0.9355803202925081, "learning_rate": 4.601794533028275e-07, "loss": 0.1188, "step": 5545 }, { "epoch": 0.8625194401244168, "grad_norm": 1.1209701913037342, "learning_rate": 4.591562954125084e-07, "loss": 0.1421, "step": 5546 }, { "epoch": 0.8626749611197512, "grad_norm": 1.0149752442098143, "learning_rate": 4.5813422148490714e-07, "loss": 0.1685, "step": 5547 }, { "epoch": 0.8628304821150855, "grad_norm": 0.690330570819564, "learning_rate": 4.5711323176400536e-07, "loss": 0.1496, "step": 5548 }, { "epoch": 0.86298600311042, "grad_norm": 0.88741469368052, "learning_rate": 4.5609332649352687e-07, "loss": 0.1173, "step": 5549 }, { "epoch": 0.8631415241057543, "grad_norm": 1.0199049267832376, "learning_rate": 4.550745059169398e-07, "loss": 0.1383, "step": 5550 }, { "epoch": 0.8632970451010886, "grad_norm": 1.1303337086160663, "learning_rate": 4.540567702774495e-07, "loss": 0.1791, "step": 5551 }, { "epoch": 0.8634525660964231, "grad_norm": 1.4180777877466966, "learning_rate": 4.530401198180023e-07, "loss": 0.2162, "step": 5552 }, { "epoch": 0.8636080870917574, "grad_norm": 0.7332456711218964, "learning_rate": 4.520245547812896e-07, "loss": 0.1295, "step": 5553 }, { "epoch": 0.8637636080870917, "grad_norm": 1.513068319185478, "learning_rate": 4.510100754097391e-07, "loss": 0.1507, "step": 5554 }, { "epoch": 0.8639191290824261, "grad_norm": 2.228480695151488, "learning_rate": 4.499966819455204e-07, "loss": 0.1016, "step": 5555 }, { "epoch": 0.8640746500777605, "grad_norm": 0.8486466840105873, "learning_rate": 4.4898437463054665e-07, "loss": 0.1115, "step": 5556 }, { "epoch": 0.8642301710730949, "grad_norm": 1.217472370028131, "learning_rate": 4.479731537064674e-07, "loss": 0.202, "step": 5557 }, { "epoch": 0.8643856920684292, "grad_norm": 1.569571738667737, "learning_rate": 4.4696301941467845e-07, "loss": 0.1981, "step": 5558 }, { "epoch": 0.8645412130637636, "grad_norm": 0.9963819318311314, "learning_rate": 4.4595397199630964e-07, "loss": 0.1458, "step": 5559 }, { "epoch": 0.864696734059098, "grad_norm": 0.7184089851062957, "learning_rate": 4.4494601169223715e-07, "loss": 0.1072, "step": 5560 }, { "epoch": 0.8648522550544323, "grad_norm": 1.0006492941597194, "learning_rate": 4.4393913874307415e-07, "loss": 0.1639, "step": 5561 }, { "epoch": 0.8650077760497668, "grad_norm": 1.3703795288617195, "learning_rate": 4.429333533891739e-07, "loss": 0.2079, "step": 5562 }, { "epoch": 0.8651632970451011, "grad_norm": 1.100350633839963, "learning_rate": 4.419286558706343e-07, "loss": 0.2371, "step": 5563 }, { "epoch": 0.8653188180404354, "grad_norm": 0.7454373690982873, "learning_rate": 4.409250464272874e-07, "loss": 0.1106, "step": 5564 }, { "epoch": 0.8654743390357699, "grad_norm": 1.311829178619018, "learning_rate": 4.39922525298711e-07, "loss": 0.1456, "step": 5565 }, { "epoch": 0.8656298600311042, "grad_norm": 0.8124478614469509, "learning_rate": 4.38921092724221e-07, "loss": 0.1101, "step": 5566 }, { "epoch": 0.8657853810264385, "grad_norm": 0.7839538000434724, "learning_rate": 4.379207489428727e-07, "loss": 0.1168, "step": 5567 }, { "epoch": 0.865940902021773, "grad_norm": 1.5479699316843798, "learning_rate": 4.369214941934613e-07, "loss": 0.1676, "step": 5568 }, { "epoch": 0.8660964230171073, "grad_norm": 1.0430090894666677, "learning_rate": 4.3592332871452423e-07, "loss": 0.1393, "step": 5569 }, { "epoch": 0.8662519440124417, "grad_norm": 1.1752526305866566, "learning_rate": 4.349262527443371e-07, "loss": 0.1144, "step": 5570 }, { "epoch": 0.8664074650077761, "grad_norm": 1.0734741045074672, "learning_rate": 4.339302665209144e-07, "loss": 0.1351, "step": 5571 }, { "epoch": 0.8665629860031104, "grad_norm": 1.0865200870496945, "learning_rate": 4.329353702820144e-07, "loss": 0.209, "step": 5572 }, { "epoch": 0.8667185069984448, "grad_norm": 1.4622766772289388, "learning_rate": 4.3194156426513036e-07, "loss": 0.2169, "step": 5573 }, { "epoch": 0.8668740279937791, "grad_norm": 1.002527067918335, "learning_rate": 4.3094884870749976e-07, "loss": 0.1756, "step": 5574 }, { "epoch": 0.8670295489891136, "grad_norm": 1.0728969904710302, "learning_rate": 4.299572238460953e-07, "loss": 0.1498, "step": 5575 }, { "epoch": 0.8671850699844479, "grad_norm": 0.8900384577526537, "learning_rate": 4.289666899176342e-07, "loss": 0.131, "step": 5576 }, { "epoch": 0.8673405909797822, "grad_norm": 0.5904105822340004, "learning_rate": 4.279772471585697e-07, "loss": 0.0959, "step": 5577 }, { "epoch": 0.8674961119751167, "grad_norm": 1.1352707912859432, "learning_rate": 4.269888958050944e-07, "loss": 0.1989, "step": 5578 }, { "epoch": 0.867651632970451, "grad_norm": 0.8639283391888362, "learning_rate": 4.2600163609314295e-07, "loss": 0.1861, "step": 5579 }, { "epoch": 0.8678071539657853, "grad_norm": 0.8712165607663875, "learning_rate": 4.2501546825838735e-07, "loss": 0.1935, "step": 5580 }, { "epoch": 0.8679626749611198, "grad_norm": 0.8040753030227624, "learning_rate": 4.2403039253623937e-07, "loss": 0.0809, "step": 5581 }, { "epoch": 0.8681181959564541, "grad_norm": 1.1932391044891624, "learning_rate": 4.230464091618519e-07, "loss": 0.1199, "step": 5582 }, { "epoch": 0.8682737169517885, "grad_norm": 1.5453837968559256, "learning_rate": 4.220635183701144e-07, "loss": 0.1553, "step": 5583 }, { "epoch": 0.8684292379471229, "grad_norm": 0.9674909184972574, "learning_rate": 4.2108172039565576e-07, "loss": 0.1096, "step": 5584 }, { "epoch": 0.8685847589424572, "grad_norm": 0.7601733165779556, "learning_rate": 4.2010101547284633e-07, "loss": 0.1362, "step": 5585 }, { "epoch": 0.8687402799377916, "grad_norm": 1.1379773674734968, "learning_rate": 4.191214038357938e-07, "loss": 0.1925, "step": 5586 }, { "epoch": 0.868895800933126, "grad_norm": 1.235765080204881, "learning_rate": 4.181428857183439e-07, "loss": 0.1602, "step": 5587 }, { "epoch": 0.8690513219284604, "grad_norm": 1.4424260314886321, "learning_rate": 4.171654613540843e-07, "loss": 0.1526, "step": 5588 }, { "epoch": 0.8692068429237947, "grad_norm": 0.8063004224848834, "learning_rate": 4.161891309763377e-07, "loss": 0.1311, "step": 5589 }, { "epoch": 0.8693623639191291, "grad_norm": 1.6840373239580282, "learning_rate": 4.152138948181689e-07, "loss": 0.1475, "step": 5590 }, { "epoch": 0.8695178849144635, "grad_norm": 0.8931694374535459, "learning_rate": 4.142397531123821e-07, "loss": 0.1021, "step": 5591 }, { "epoch": 0.8696734059097978, "grad_norm": 1.114969792275346, "learning_rate": 4.132667060915163e-07, "loss": 0.1043, "step": 5592 }, { "epoch": 0.8698289269051321, "grad_norm": 0.7790754524934486, "learning_rate": 4.122947539878519e-07, "loss": 0.101, "step": 5593 }, { "epoch": 0.8699844479004666, "grad_norm": 1.3159039768783751, "learning_rate": 4.1132389703340715e-07, "loss": 0.1015, "step": 5594 }, { "epoch": 0.8701399688958009, "grad_norm": 1.333783514781497, "learning_rate": 4.1035413545994e-07, "loss": 0.1855, "step": 5595 }, { "epoch": 0.8702954898911353, "grad_norm": 1.1084516176264616, "learning_rate": 4.0938546949894467e-07, "loss": 0.1143, "step": 5596 }, { "epoch": 0.8704510108864697, "grad_norm": 1.0694146629621495, "learning_rate": 4.084178993816562e-07, "loss": 0.1199, "step": 5597 }, { "epoch": 0.870606531881804, "grad_norm": 1.1763984528214346, "learning_rate": 4.074514253390482e-07, "loss": 0.1123, "step": 5598 }, { "epoch": 0.8707620528771384, "grad_norm": 0.9638476213607905, "learning_rate": 4.0648604760182955e-07, "loss": 0.1273, "step": 5599 }, { "epoch": 0.8709175738724728, "grad_norm": 0.8125028874293225, "learning_rate": 4.0552176640045017e-07, "loss": 0.1163, "step": 5600 }, { "epoch": 0.8709175738724728, "eval_loss": 0.16097532212734222, "eval_runtime": 9.4382, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.742, "step": 5600 }, { "epoch": 0.8710730948678072, "grad_norm": 1.0508545044227033, "learning_rate": 4.045585819650977e-07, "loss": 0.1094, "step": 5601 }, { "epoch": 0.8712286158631415, "grad_norm": 1.299476581495267, "learning_rate": 4.0359649452569705e-07, "loss": 0.214, "step": 5602 }, { "epoch": 0.8713841368584759, "grad_norm": 0.9461131470732053, "learning_rate": 4.026355043119118e-07, "loss": 0.1225, "step": 5603 }, { "epoch": 0.8715396578538103, "grad_norm": 1.1186090981371073, "learning_rate": 4.0167561155314507e-07, "loss": 0.1351, "step": 5604 }, { "epoch": 0.8716951788491446, "grad_norm": 1.1864180420196955, "learning_rate": 4.0071681647853523e-07, "loss": 0.1408, "step": 5605 }, { "epoch": 0.871850699844479, "grad_norm": 1.0971833109820022, "learning_rate": 3.997591193169598e-07, "loss": 0.1645, "step": 5606 }, { "epoch": 0.8720062208398134, "grad_norm": 0.9609333113354045, "learning_rate": 3.988025202970364e-07, "loss": 0.1389, "step": 5607 }, { "epoch": 0.8721617418351477, "grad_norm": 1.1690197005559966, "learning_rate": 3.9784701964711736e-07, "loss": 0.2375, "step": 5608 }, { "epoch": 0.8723172628304822, "grad_norm": 0.9396335497170089, "learning_rate": 3.968926175952931e-07, "loss": 0.0567, "step": 5609 }, { "epoch": 0.8724727838258165, "grad_norm": 1.5892374554380668, "learning_rate": 3.959393143693946e-07, "loss": 0.1431, "step": 5610 }, { "epoch": 0.8726283048211508, "grad_norm": 1.150926520993341, "learning_rate": 3.949871101969871e-07, "loss": 0.2298, "step": 5611 }, { "epoch": 0.8727838258164852, "grad_norm": 0.9391144733633057, "learning_rate": 3.9403600530537546e-07, "loss": 0.13, "step": 5612 }, { "epoch": 0.8729393468118196, "grad_norm": 0.9439969876451616, "learning_rate": 3.9308599992160136e-07, "loss": 0.1155, "step": 5613 }, { "epoch": 0.873094867807154, "grad_norm": 1.5227815796777815, "learning_rate": 3.921370942724451e-07, "loss": 0.2464, "step": 5614 }, { "epoch": 0.8732503888024883, "grad_norm": 0.9627663527081136, "learning_rate": 3.9118928858442426e-07, "loss": 0.1475, "step": 5615 }, { "epoch": 0.8734059097978227, "grad_norm": 1.3037055881244475, "learning_rate": 3.902425830837908e-07, "loss": 0.1519, "step": 5616 }, { "epoch": 0.8735614307931571, "grad_norm": 1.4564282286641836, "learning_rate": 3.892969779965394e-07, "loss": 0.1555, "step": 5617 }, { "epoch": 0.8737169517884914, "grad_norm": 1.2304588008715174, "learning_rate": 3.883524735483973e-07, "loss": 0.1547, "step": 5618 }, { "epoch": 0.8738724727838258, "grad_norm": 1.167147869010711, "learning_rate": 3.8740906996483084e-07, "loss": 0.1487, "step": 5619 }, { "epoch": 0.8740279937791602, "grad_norm": 0.9015803503504535, "learning_rate": 3.864667674710454e-07, "loss": 0.1386, "step": 5620 }, { "epoch": 0.8741835147744945, "grad_norm": 1.3926080470312676, "learning_rate": 3.8552556629197935e-07, "loss": 0.1783, "step": 5621 }, { "epoch": 0.874339035769829, "grad_norm": 0.6137181331367628, "learning_rate": 3.845854666523119e-07, "loss": 0.0937, "step": 5622 }, { "epoch": 0.8744945567651633, "grad_norm": 1.0834584672753056, "learning_rate": 3.8364646877645797e-07, "loss": 0.1321, "step": 5623 }, { "epoch": 0.8746500777604976, "grad_norm": 1.0291141128116255, "learning_rate": 3.827085728885699e-07, "loss": 0.1348, "step": 5624 }, { "epoch": 0.8748055987558321, "grad_norm": 1.120667055128189, "learning_rate": 3.817717792125347e-07, "loss": 0.1689, "step": 5625 }, { "epoch": 0.8749611197511664, "grad_norm": 2.5015920141981485, "learning_rate": 3.808360879719802e-07, "loss": 0.151, "step": 5626 }, { "epoch": 0.8751166407465008, "grad_norm": 1.162784196111815, "learning_rate": 3.799014993902683e-07, "loss": 0.2232, "step": 5627 }, { "epoch": 0.8752721617418352, "grad_norm": 1.24891612100839, "learning_rate": 3.789680136904972e-07, "loss": 0.1968, "step": 5628 }, { "epoch": 0.8754276827371695, "grad_norm": 1.2747663282921629, "learning_rate": 3.78035631095503e-07, "loss": 0.1585, "step": 5629 }, { "epoch": 0.8755832037325039, "grad_norm": 1.0957491326588649, "learning_rate": 3.7710435182786053e-07, "loss": 0.0962, "step": 5630 }, { "epoch": 0.8757387247278383, "grad_norm": 0.9381971490392556, "learning_rate": 3.761741761098769e-07, "loss": 0.1951, "step": 5631 }, { "epoch": 0.8758942457231726, "grad_norm": 1.4265852434813433, "learning_rate": 3.752451041636002e-07, "loss": 0.0934, "step": 5632 }, { "epoch": 0.876049766718507, "grad_norm": 1.3424884792690246, "learning_rate": 3.743171362108111e-07, "loss": 0.1551, "step": 5633 }, { "epoch": 0.8762052877138413, "grad_norm": 1.086621453372041, "learning_rate": 3.7339027247302927e-07, "loss": 0.1369, "step": 5634 }, { "epoch": 0.8763608087091758, "grad_norm": 1.1199390152504427, "learning_rate": 3.7246451317150877e-07, "loss": 0.0734, "step": 5635 }, { "epoch": 0.8765163297045101, "grad_norm": 1.6290276975346751, "learning_rate": 3.715398585272428e-07, "loss": 0.0838, "step": 5636 }, { "epoch": 0.8766718506998444, "grad_norm": 0.8074002168517241, "learning_rate": 3.706163087609582e-07, "loss": 0.1182, "step": 5637 }, { "epoch": 0.8768273716951789, "grad_norm": 1.5869196846247098, "learning_rate": 3.696938640931197e-07, "loss": 0.2027, "step": 5638 }, { "epoch": 0.8769828926905132, "grad_norm": 0.8141653026358605, "learning_rate": 3.687725247439283e-07, "loss": 0.1308, "step": 5639 }, { "epoch": 0.8771384136858476, "grad_norm": 1.4359039552995176, "learning_rate": 3.6785229093331987e-07, "loss": 0.1574, "step": 5640 }, { "epoch": 0.877293934681182, "grad_norm": 1.0148648467503092, "learning_rate": 3.6693316288096694e-07, "loss": 0.1156, "step": 5641 }, { "epoch": 0.8774494556765163, "grad_norm": 1.0779559562909966, "learning_rate": 3.6601514080627895e-07, "loss": 0.1225, "step": 5642 }, { "epoch": 0.8776049766718507, "grad_norm": 1.4027761784943797, "learning_rate": 3.6509822492839964e-07, "loss": 0.1783, "step": 5643 }, { "epoch": 0.8777604976671851, "grad_norm": 1.4872122916929145, "learning_rate": 3.6418241546620936e-07, "loss": 0.1895, "step": 5644 }, { "epoch": 0.8779160186625194, "grad_norm": 1.1087288904207173, "learning_rate": 3.632677126383255e-07, "loss": 0.2114, "step": 5645 }, { "epoch": 0.8780715396578538, "grad_norm": 4.106958027581052, "learning_rate": 3.623541166631006e-07, "loss": 0.1102, "step": 5646 }, { "epoch": 0.8782270606531882, "grad_norm": 0.9306591139236339, "learning_rate": 3.614416277586219e-07, "loss": 0.0809, "step": 5647 }, { "epoch": 0.8783825816485226, "grad_norm": 1.1080535272802399, "learning_rate": 3.6053024614271405e-07, "loss": 0.1278, "step": 5648 }, { "epoch": 0.8785381026438569, "grad_norm": 0.9905467465618378, "learning_rate": 3.596199720329363e-07, "loss": 0.1016, "step": 5649 }, { "epoch": 0.8786936236391913, "grad_norm": 0.7708474289494227, "learning_rate": 3.587108056465827e-07, "loss": 0.0996, "step": 5650 }, { "epoch": 0.8788491446345257, "grad_norm": 0.9612047681736022, "learning_rate": 3.5780274720068566e-07, "loss": 0.1696, "step": 5651 }, { "epoch": 0.87900466562986, "grad_norm": 1.142631613371655, "learning_rate": 3.568957969120107e-07, "loss": 0.2184, "step": 5652 }, { "epoch": 0.8791601866251944, "grad_norm": 1.8417573949021275, "learning_rate": 3.5598995499705857e-07, "loss": 0.1412, "step": 5653 }, { "epoch": 0.8793157076205288, "grad_norm": 0.8565253917025486, "learning_rate": 3.550852216720668e-07, "loss": 0.1505, "step": 5654 }, { "epoch": 0.8794712286158631, "grad_norm": 0.7181202448802291, "learning_rate": 3.541815971530094e-07, "loss": 0.1199, "step": 5655 }, { "epoch": 0.8796267496111975, "grad_norm": 1.2180316886942035, "learning_rate": 3.5327908165559254e-07, "loss": 0.1311, "step": 5656 }, { "epoch": 0.8797822706065319, "grad_norm": 0.9356108015873734, "learning_rate": 3.52377675395259e-07, "loss": 0.1624, "step": 5657 }, { "epoch": 0.8799377916018662, "grad_norm": 1.0503431400095533, "learning_rate": 3.514773785871889e-07, "loss": 0.1556, "step": 5658 }, { "epoch": 0.8800933125972006, "grad_norm": 1.2513323575433914, "learning_rate": 3.505781914462941e-07, "loss": 0.1574, "step": 5659 }, { "epoch": 0.880248833592535, "grad_norm": 1.0504588489857518, "learning_rate": 3.496801141872225e-07, "loss": 0.1721, "step": 5660 }, { "epoch": 0.8804043545878694, "grad_norm": 1.005279229043833, "learning_rate": 3.487831470243591e-07, "loss": 0.2151, "step": 5661 }, { "epoch": 0.8805598755832037, "grad_norm": 0.9103940902473099, "learning_rate": 3.478872901718222e-07, "loss": 0.1727, "step": 5662 }, { "epoch": 0.8807153965785381, "grad_norm": 3.3831730502372066, "learning_rate": 3.469925438434646e-07, "loss": 0.1305, "step": 5663 }, { "epoch": 0.8808709175738725, "grad_norm": 0.8810538262680552, "learning_rate": 3.460989082528765e-07, "loss": 0.1746, "step": 5664 }, { "epoch": 0.8810264385692068, "grad_norm": 0.8635001813462627, "learning_rate": 3.45206383613379e-07, "loss": 0.1154, "step": 5665 }, { "epoch": 0.8811819595645413, "grad_norm": 1.0979454165012401, "learning_rate": 3.443149701380311e-07, "loss": 0.1153, "step": 5666 }, { "epoch": 0.8813374805598756, "grad_norm": 0.8226525066529897, "learning_rate": 3.434246680396258e-07, "loss": 0.0865, "step": 5667 }, { "epoch": 0.8814930015552099, "grad_norm": 0.759984203815009, "learning_rate": 3.425354775306911e-07, "loss": 0.1004, "step": 5668 }, { "epoch": 0.8816485225505444, "grad_norm": 0.8990896750927304, "learning_rate": 3.4164739882348694e-07, "loss": 0.1002, "step": 5669 }, { "epoch": 0.8818040435458787, "grad_norm": 1.3679141945181652, "learning_rate": 3.407604321300123e-07, "loss": 0.1435, "step": 5670 }, { "epoch": 0.881959564541213, "grad_norm": 1.2149944805144912, "learning_rate": 3.3987457766199883e-07, "loss": 0.1028, "step": 5671 }, { "epoch": 0.8821150855365474, "grad_norm": 2.270543658792335, "learning_rate": 3.389898356309107e-07, "loss": 0.1795, "step": 5672 }, { "epoch": 0.8822706065318818, "grad_norm": 1.118531448452699, "learning_rate": 3.381062062479484e-07, "loss": 0.1395, "step": 5673 }, { "epoch": 0.8824261275272162, "grad_norm": 1.0184599450921639, "learning_rate": 3.372236897240477e-07, "loss": 0.134, "step": 5674 }, { "epoch": 0.8825816485225505, "grad_norm": 0.8841022193543031, "learning_rate": 3.363422862698773e-07, "loss": 0.1082, "step": 5675 }, { "epoch": 0.882737169517885, "grad_norm": 1.2667192891095662, "learning_rate": 3.354619960958394e-07, "loss": 0.2047, "step": 5676 }, { "epoch": 0.8828926905132193, "grad_norm": 0.8509216197340151, "learning_rate": 3.345828194120715e-07, "loss": 0.1362, "step": 5677 }, { "epoch": 0.8830482115085536, "grad_norm": 1.1418661120228855, "learning_rate": 3.337047564284479e-07, "loss": 0.1259, "step": 5678 }, { "epoch": 0.8832037325038881, "grad_norm": 1.5905962915649294, "learning_rate": 3.32827807354571e-07, "loss": 0.1488, "step": 5679 }, { "epoch": 0.8833592534992224, "grad_norm": 0.8965269984095786, "learning_rate": 3.3195197239978384e-07, "loss": 0.1197, "step": 5680 }, { "epoch": 0.8835147744945567, "grad_norm": 1.1515617420932518, "learning_rate": 3.310772517731592e-07, "loss": 0.2124, "step": 5681 }, { "epoch": 0.8836702954898912, "grad_norm": 1.1377122683537477, "learning_rate": 3.3020364568350394e-07, "loss": 0.1654, "step": 5682 }, { "epoch": 0.8838258164852255, "grad_norm": 1.126097834090154, "learning_rate": 3.293311543393618e-07, "loss": 0.1102, "step": 5683 }, { "epoch": 0.8839813374805598, "grad_norm": 0.8729376358665645, "learning_rate": 3.2845977794900783e-07, "loss": 0.13, "step": 5684 }, { "epoch": 0.8841368584758943, "grad_norm": 0.9833122732149159, "learning_rate": 3.275895167204507e-07, "loss": 0.0891, "step": 5685 }, { "epoch": 0.8842923794712286, "grad_norm": 0.788411826270717, "learning_rate": 3.267203708614353e-07, "loss": 0.1294, "step": 5686 }, { "epoch": 0.884447900466563, "grad_norm": 1.847813576408218, "learning_rate": 3.258523405794395e-07, "loss": 0.2338, "step": 5687 }, { "epoch": 0.8846034214618974, "grad_norm": 0.9518351125644693, "learning_rate": 3.2498542608167206e-07, "loss": 0.1251, "step": 5688 }, { "epoch": 0.8847589424572317, "grad_norm": 0.9515782919949121, "learning_rate": 3.241196275750802e-07, "loss": 0.101, "step": 5689 }, { "epoch": 0.8849144634525661, "grad_norm": 1.3698895713963906, "learning_rate": 3.232549452663403e-07, "loss": 0.0719, "step": 5690 }, { "epoch": 0.8850699844479004, "grad_norm": 0.8435105327812104, "learning_rate": 3.223913793618644e-07, "loss": 0.1393, "step": 5691 }, { "epoch": 0.8852255054432349, "grad_norm": 1.0058190257006756, "learning_rate": 3.215289300677965e-07, "loss": 0.128, "step": 5692 }, { "epoch": 0.8853810264385692, "grad_norm": 0.950551430379818, "learning_rate": 3.206675975900175e-07, "loss": 0.182, "step": 5693 }, { "epoch": 0.8855365474339035, "grad_norm": 0.7341387678939632, "learning_rate": 3.1980738213413856e-07, "loss": 0.1393, "step": 5694 }, { "epoch": 0.885692068429238, "grad_norm": 1.2436063897571437, "learning_rate": 3.189482839055047e-07, "loss": 0.1472, "step": 5695 }, { "epoch": 0.8858475894245723, "grad_norm": 0.7790076244102134, "learning_rate": 3.180903031091953e-07, "loss": 0.1072, "step": 5696 }, { "epoch": 0.8860031104199066, "grad_norm": 0.7964728265761083, "learning_rate": 3.1723343995002265e-07, "loss": 0.1454, "step": 5697 }, { "epoch": 0.8861586314152411, "grad_norm": 1.001493669714912, "learning_rate": 3.163776946325303e-07, "loss": 0.1729, "step": 5698 }, { "epoch": 0.8863141524105754, "grad_norm": 1.198644724071747, "learning_rate": 3.155230673609988e-07, "loss": 0.1464, "step": 5699 }, { "epoch": 0.8864696734059098, "grad_norm": 0.7669442244287487, "learning_rate": 3.146695583394377e-07, "loss": 0.0927, "step": 5700 }, { "epoch": 0.8864696734059098, "eval_loss": 0.16071347892284393, "eval_runtime": 9.4275, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.743, "step": 5700 }, { "epoch": 0.8866251944012442, "grad_norm": 1.0509178825432375, "learning_rate": 3.138171677715918e-07, "loss": 0.1361, "step": 5701 }, { "epoch": 0.8867807153965785, "grad_norm": 0.9290580255569608, "learning_rate": 3.129658958609394e-07, "loss": 0.1147, "step": 5702 }, { "epoch": 0.8869362363919129, "grad_norm": 1.033165012734603, "learning_rate": 3.121157428106908e-07, "loss": 0.1167, "step": 5703 }, { "epoch": 0.8870917573872473, "grad_norm": 0.7151902240010587, "learning_rate": 3.112667088237892e-07, "loss": 0.118, "step": 5704 }, { "epoch": 0.8872472783825817, "grad_norm": 1.7548311408170536, "learning_rate": 3.1041879410291076e-07, "loss": 0.1625, "step": 5705 }, { "epoch": 0.887402799377916, "grad_norm": 0.7533605434252014, "learning_rate": 3.095719988504653e-07, "loss": 0.1491, "step": 5706 }, { "epoch": 0.8875583203732504, "grad_norm": 1.3280456384265253, "learning_rate": 3.0872632326859276e-07, "loss": 0.0804, "step": 5707 }, { "epoch": 0.8877138413685848, "grad_norm": 0.9771703854119773, "learning_rate": 3.0788176755917e-07, "loss": 0.1871, "step": 5708 }, { "epoch": 0.8878693623639191, "grad_norm": 0.7684667032204046, "learning_rate": 3.070383319238024e-07, "loss": 0.1132, "step": 5709 }, { "epoch": 0.8880248833592534, "grad_norm": 1.0534409785669496, "learning_rate": 3.061960165638317e-07, "loss": 0.155, "step": 5710 }, { "epoch": 0.8881804043545879, "grad_norm": 1.149601974227962, "learning_rate": 3.0535482168032816e-07, "loss": 0.1028, "step": 5711 }, { "epoch": 0.8883359253499222, "grad_norm": 0.8723980458166707, "learning_rate": 3.0451474747409835e-07, "loss": 0.1859, "step": 5712 }, { "epoch": 0.8884914463452566, "grad_norm": 1.1322504497938706, "learning_rate": 3.0367579414567973e-07, "loss": 0.1923, "step": 5713 }, { "epoch": 0.888646967340591, "grad_norm": 1.1477230155583613, "learning_rate": 3.0283796189534034e-07, "loss": 0.1188, "step": 5714 }, { "epoch": 0.8888024883359253, "grad_norm": 1.3354061226440932, "learning_rate": 3.020012509230846e-07, "loss": 0.1571, "step": 5715 }, { "epoch": 0.8889580093312597, "grad_norm": 0.8888762859514483, "learning_rate": 3.0116566142864565e-07, "loss": 0.1865, "step": 5716 }, { "epoch": 0.8891135303265941, "grad_norm": 1.5600997194893549, "learning_rate": 3.0033119361149053e-07, "loss": 0.2172, "step": 5717 }, { "epoch": 0.8892690513219285, "grad_norm": 1.53771611439838, "learning_rate": 2.994978476708188e-07, "loss": 0.1293, "step": 5718 }, { "epoch": 0.8894245723172628, "grad_norm": 1.8961527626530987, "learning_rate": 2.986656238055624e-07, "loss": 0.1642, "step": 5719 }, { "epoch": 0.8895800933125972, "grad_norm": 0.7106757838342468, "learning_rate": 2.9783452221438304e-07, "loss": 0.07, "step": 5720 }, { "epoch": 0.8897356143079316, "grad_norm": 0.8983542311656352, "learning_rate": 2.970045430956781e-07, "loss": 0.1223, "step": 5721 }, { "epoch": 0.8898911353032659, "grad_norm": 0.9359705538544465, "learning_rate": 2.96175686647574e-07, "loss": 0.1678, "step": 5722 }, { "epoch": 0.8900466562986004, "grad_norm": 0.8247078534938106, "learning_rate": 2.9534795306792987e-07, "loss": 0.1055, "step": 5723 }, { "epoch": 0.8902021772939347, "grad_norm": 0.7553038020543926, "learning_rate": 2.945213425543392e-07, "loss": 0.1282, "step": 5724 }, { "epoch": 0.890357698289269, "grad_norm": 1.1721368292137098, "learning_rate": 2.9369585530412304e-07, "loss": 0.0863, "step": 5725 }, { "epoch": 0.8905132192846035, "grad_norm": 1.1870637710277545, "learning_rate": 2.9287149151433827e-07, "loss": 0.1653, "step": 5726 }, { "epoch": 0.8906687402799378, "grad_norm": 0.9793697183668253, "learning_rate": 2.9204825138177186e-07, "loss": 0.0801, "step": 5727 }, { "epoch": 0.8908242612752721, "grad_norm": 0.9568283271694411, "learning_rate": 2.912261351029433e-07, "loss": 0.1591, "step": 5728 }, { "epoch": 0.8909797822706065, "grad_norm": 0.9963298858144942, "learning_rate": 2.904051428741017e-07, "loss": 0.0951, "step": 5729 }, { "epoch": 0.8911353032659409, "grad_norm": 0.6277877726241657, "learning_rate": 2.895852748912298e-07, "loss": 0.1061, "step": 5730 }, { "epoch": 0.8912908242612753, "grad_norm": 1.0205039501400026, "learning_rate": 2.8876653135004264e-07, "loss": 0.153, "step": 5731 }, { "epoch": 0.8914463452566096, "grad_norm": 0.8370272461522827, "learning_rate": 2.8794891244598445e-07, "loss": 0.1149, "step": 5732 }, { "epoch": 0.891601866251944, "grad_norm": 1.035040570021154, "learning_rate": 2.8713241837423243e-07, "loss": 0.1816, "step": 5733 }, { "epoch": 0.8917573872472784, "grad_norm": 0.9232723942687356, "learning_rate": 2.8631704932969516e-07, "loss": 0.1238, "step": 5734 }, { "epoch": 0.8919129082426127, "grad_norm": 1.1064903572487017, "learning_rate": 2.855028055070136e-07, "loss": 0.1403, "step": 5735 }, { "epoch": 0.8920684292379472, "grad_norm": 1.505821845492436, "learning_rate": 2.8468968710055723e-07, "loss": 0.1337, "step": 5736 }, { "epoch": 0.8922239502332815, "grad_norm": 0.8861410727732335, "learning_rate": 2.838776943044308e-07, "loss": 0.1282, "step": 5737 }, { "epoch": 0.8923794712286158, "grad_norm": 1.0635726299573731, "learning_rate": 2.830668273124676e-07, "loss": 0.1384, "step": 5738 }, { "epoch": 0.8925349922239503, "grad_norm": 1.0049417110553251, "learning_rate": 2.8225708631823166e-07, "loss": 0.118, "step": 5739 }, { "epoch": 0.8926905132192846, "grad_norm": 1.0839987099095318, "learning_rate": 2.814484715150212e-07, "loss": 0.1448, "step": 5740 }, { "epoch": 0.8928460342146189, "grad_norm": 0.9681436040910736, "learning_rate": 2.8064098309586287e-07, "loss": 0.1525, "step": 5741 }, { "epoch": 0.8930015552099534, "grad_norm": 1.0575051067168553, "learning_rate": 2.7983462125351523e-07, "loss": 0.1582, "step": 5742 }, { "epoch": 0.8931570762052877, "grad_norm": 1.1249856690667421, "learning_rate": 2.790293861804688e-07, "loss": 0.0835, "step": 5743 }, { "epoch": 0.8933125972006221, "grad_norm": 1.0636539769446378, "learning_rate": 2.782252780689448e-07, "loss": 0.1192, "step": 5744 }, { "epoch": 0.8934681181959565, "grad_norm": 0.7852100198392726, "learning_rate": 2.774222971108931e-07, "loss": 0.1799, "step": 5745 }, { "epoch": 0.8936236391912908, "grad_norm": 0.9698042546540431, "learning_rate": 2.766204434979991e-07, "loss": 0.1384, "step": 5746 }, { "epoch": 0.8937791601866252, "grad_norm": 0.9928621750800279, "learning_rate": 2.7581971742167425e-07, "loss": 0.1423, "step": 5747 }, { "epoch": 0.8939346811819595, "grad_norm": 1.0040917284700857, "learning_rate": 2.7502011907306447e-07, "loss": 0.0858, "step": 5748 }, { "epoch": 0.894090202177294, "grad_norm": 1.5373315627701283, "learning_rate": 2.7422164864304325e-07, "loss": 0.0999, "step": 5749 }, { "epoch": 0.8942457231726283, "grad_norm": 1.0478022520144608, "learning_rate": 2.734243063222181e-07, "loss": 0.1276, "step": 5750 }, { "epoch": 0.8944012441679626, "grad_norm": 0.9143200558158329, "learning_rate": 2.726280923009261e-07, "loss": 0.1731, "step": 5751 }, { "epoch": 0.8945567651632971, "grad_norm": 1.062136143898708, "learning_rate": 2.718330067692332e-07, "loss": 0.1314, "step": 5752 }, { "epoch": 0.8947122861586314, "grad_norm": 1.0892249684597173, "learning_rate": 2.7103904991693907e-07, "loss": 0.1718, "step": 5753 }, { "epoch": 0.8948678071539657, "grad_norm": 1.0602390076235193, "learning_rate": 2.702462219335711e-07, "loss": 0.1699, "step": 5754 }, { "epoch": 0.8950233281493002, "grad_norm": 0.8862983413664659, "learning_rate": 2.6945452300838834e-07, "loss": 0.1445, "step": 5755 }, { "epoch": 0.8951788491446345, "grad_norm": 1.1837157160585903, "learning_rate": 2.686639533303809e-07, "loss": 0.1623, "step": 5756 }, { "epoch": 0.8953343701399689, "grad_norm": 1.1879950281817209, "learning_rate": 2.678745130882682e-07, "loss": 0.2018, "step": 5757 }, { "epoch": 0.8954898911353033, "grad_norm": 0.7386961849682563, "learning_rate": 2.670862024705012e-07, "loss": 0.1459, "step": 5758 }, { "epoch": 0.8956454121306376, "grad_norm": 1.0795421486274008, "learning_rate": 2.6629902166526057e-07, "loss": 0.1256, "step": 5759 }, { "epoch": 0.895800933125972, "grad_norm": 0.9551573865556224, "learning_rate": 2.655129708604576e-07, "loss": 0.1248, "step": 5760 }, { "epoch": 0.8959564541213064, "grad_norm": 0.98610637798951, "learning_rate": 2.647280502437327e-07, "loss": 0.1668, "step": 5761 }, { "epoch": 0.8961119751166408, "grad_norm": 1.3406273704931664, "learning_rate": 2.639442600024583e-07, "loss": 0.1542, "step": 5762 }, { "epoch": 0.8962674961119751, "grad_norm": 0.5488848294356269, "learning_rate": 2.6316160032373585e-07, "loss": 0.0956, "step": 5763 }, { "epoch": 0.8964230171073095, "grad_norm": 1.2230466816491865, "learning_rate": 2.623800713943958e-07, "loss": 0.1184, "step": 5764 }, { "epoch": 0.8965785381026439, "grad_norm": 0.9608421543075575, "learning_rate": 2.6159967340100233e-07, "loss": 0.1282, "step": 5765 }, { "epoch": 0.8967340590979782, "grad_norm": 0.9609848722848647, "learning_rate": 2.608204065298453e-07, "loss": 0.0581, "step": 5766 }, { "epoch": 0.8968895800933125, "grad_norm": 1.1805148374000831, "learning_rate": 2.600422709669476e-07, "loss": 0.1247, "step": 5767 }, { "epoch": 0.897045101088647, "grad_norm": 1.094129096069794, "learning_rate": 2.5926526689806054e-07, "loss": 0.1934, "step": 5768 }, { "epoch": 0.8972006220839813, "grad_norm": 1.0768047169682036, "learning_rate": 2.5848939450866695e-07, "loss": 0.1453, "step": 5769 }, { "epoch": 0.8973561430793157, "grad_norm": 0.9494661662825569, "learning_rate": 2.5771465398397757e-07, "loss": 0.1834, "step": 5770 }, { "epoch": 0.8975116640746501, "grad_norm": 1.140067795739051, "learning_rate": 2.569410455089327e-07, "loss": 0.124, "step": 5771 }, { "epoch": 0.8976671850699844, "grad_norm": 0.8675924506234418, "learning_rate": 2.561685692682053e-07, "loss": 0.1541, "step": 5772 }, { "epoch": 0.8978227060653188, "grad_norm": 1.465398013862432, "learning_rate": 2.5539722544619506e-07, "loss": 0.1945, "step": 5773 }, { "epoch": 0.8979782270606532, "grad_norm": 0.9219660346229158, "learning_rate": 2.5462701422703296e-07, "loss": 0.101, "step": 5774 }, { "epoch": 0.8981337480559876, "grad_norm": 1.4945066615514613, "learning_rate": 2.5385793579457916e-07, "loss": 0.0915, "step": 5775 }, { "epoch": 0.8982892690513219, "grad_norm": 0.7954207958970004, "learning_rate": 2.530899903324241e-07, "loss": 0.1482, "step": 5776 }, { "epoch": 0.8984447900466563, "grad_norm": 0.6515846822770607, "learning_rate": 2.5232317802388497e-07, "loss": 0.0623, "step": 5777 }, { "epoch": 0.8986003110419907, "grad_norm": 0.8881173319080791, "learning_rate": 2.515574990520131e-07, "loss": 0.1564, "step": 5778 }, { "epoch": 0.898755832037325, "grad_norm": 1.2786301740639319, "learning_rate": 2.5079295359958567e-07, "loss": 0.1474, "step": 5779 }, { "epoch": 0.8989113530326595, "grad_norm": 1.247846198087779, "learning_rate": 2.5002954184910887e-07, "loss": 0.1499, "step": 5780 }, { "epoch": 0.8990668740279938, "grad_norm": 1.114106826386461, "learning_rate": 2.492672639828225e-07, "loss": 0.2496, "step": 5781 }, { "epoch": 0.8992223950233281, "grad_norm": 1.1608820471296273, "learning_rate": 2.485061201826899e-07, "loss": 0.151, "step": 5782 }, { "epoch": 0.8993779160186626, "grad_norm": 1.320103214295542, "learning_rate": 2.477461106304091e-07, "loss": 0.1613, "step": 5783 }, { "epoch": 0.8995334370139969, "grad_norm": 1.657771294287026, "learning_rate": 2.4698723550740486e-07, "loss": 0.2372, "step": 5784 }, { "epoch": 0.8996889580093312, "grad_norm": 1.4918033751753583, "learning_rate": 2.4622949499483016e-07, "loss": 0.1231, "step": 5785 }, { "epoch": 0.8998444790046656, "grad_norm": 0.6957920826355418, "learning_rate": 2.454728892735686e-07, "loss": 0.1008, "step": 5786 }, { "epoch": 0.9, "grad_norm": 0.8090941814728845, "learning_rate": 2.447174185242324e-07, "loss": 0.0988, "step": 5787 }, { "epoch": 0.9001555209953344, "grad_norm": 0.9658131908603245, "learning_rate": 2.439630829271633e-07, "loss": 0.1373, "step": 5788 }, { "epoch": 0.9003110419906687, "grad_norm": 1.0255087092541433, "learning_rate": 2.432098826624313e-07, "loss": 0.1488, "step": 5789 }, { "epoch": 0.9004665629860031, "grad_norm": 1.1793900126749541, "learning_rate": 2.424578179098358e-07, "loss": 0.1188, "step": 5790 }, { "epoch": 0.9006220839813375, "grad_norm": 0.960682796630534, "learning_rate": 2.4170688884890605e-07, "loss": 0.2025, "step": 5791 }, { "epoch": 0.9007776049766718, "grad_norm": 1.1735562701203022, "learning_rate": 2.4095709565889857e-07, "loss": 0.1719, "step": 5792 }, { "epoch": 0.9009331259720063, "grad_norm": 0.807140491011167, "learning_rate": 2.4020843851879916e-07, "loss": 0.1143, "step": 5793 }, { "epoch": 0.9010886469673406, "grad_norm": 0.8002063419262229, "learning_rate": 2.3946091760732373e-07, "loss": 0.1189, "step": 5794 }, { "epoch": 0.9012441679626749, "grad_norm": 1.53471443225601, "learning_rate": 2.38714533102915e-07, "loss": 0.2026, "step": 5795 }, { "epoch": 0.9013996889580094, "grad_norm": 0.827309167454249, "learning_rate": 2.3796928518374562e-07, "loss": 0.1157, "step": 5796 }, { "epoch": 0.9015552099533437, "grad_norm": 0.7706791800359526, "learning_rate": 2.372251740277165e-07, "loss": 0.1544, "step": 5797 }, { "epoch": 0.901710730948678, "grad_norm": 0.83706851092758, "learning_rate": 2.364821998124578e-07, "loss": 0.117, "step": 5798 }, { "epoch": 0.9018662519440125, "grad_norm": 0.9243468529339535, "learning_rate": 2.357403627153271e-07, "loss": 0.1914, "step": 5799 }, { "epoch": 0.9020217729393468, "grad_norm": 1.0390635457770556, "learning_rate": 2.3499966291341213e-07, "loss": 0.1271, "step": 5800 }, { "epoch": 0.9020217729393468, "eval_loss": 0.16070416569709778, "eval_runtime": 9.4392, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.742, "step": 5800 }, { "epoch": 0.9021772939346812, "grad_norm": 1.3382791728970873, "learning_rate": 2.3426010058352822e-07, "loss": 0.1491, "step": 5801 }, { "epoch": 0.9023328149300156, "grad_norm": 0.6405885903520063, "learning_rate": 2.3352167590221797e-07, "loss": 0.1103, "step": 5802 }, { "epoch": 0.9024883359253499, "grad_norm": 0.9924692095650187, "learning_rate": 2.327843890457554e-07, "loss": 0.1096, "step": 5803 }, { "epoch": 0.9026438569206843, "grad_norm": 0.9803079837000307, "learning_rate": 2.3204824019014027e-07, "loss": 0.0979, "step": 5804 }, { "epoch": 0.9027993779160186, "grad_norm": 1.0314701568680384, "learning_rate": 2.313132295111009e-07, "loss": 0.1141, "step": 5805 }, { "epoch": 0.902954898911353, "grad_norm": 0.8893599405884106, "learning_rate": 2.305793571840953e-07, "loss": 0.1211, "step": 5806 }, { "epoch": 0.9031104199066874, "grad_norm": 1.3178590760687472, "learning_rate": 2.2984662338430995e-07, "loss": 0.2034, "step": 5807 }, { "epoch": 0.9032659409020217, "grad_norm": 0.9827794228998218, "learning_rate": 2.2911502828665722e-07, "loss": 0.1763, "step": 5808 }, { "epoch": 0.9034214618973562, "grad_norm": 0.9291808223017586, "learning_rate": 2.2838457206577957e-07, "loss": 0.1099, "step": 5809 }, { "epoch": 0.9035769828926905, "grad_norm": 0.9486189660565154, "learning_rate": 2.2765525489604702e-07, "loss": 0.0902, "step": 5810 }, { "epoch": 0.9037325038880248, "grad_norm": 0.8252721889561242, "learning_rate": 2.2692707695155802e-07, "loss": 0.123, "step": 5811 }, { "epoch": 0.9038880248833593, "grad_norm": 1.0545722659419408, "learning_rate": 2.26200038406138e-07, "loss": 0.1176, "step": 5812 }, { "epoch": 0.9040435458786936, "grad_norm": 0.6978931891299445, "learning_rate": 2.254741394333426e-07, "loss": 0.1154, "step": 5813 }, { "epoch": 0.904199066874028, "grad_norm": 0.8830353442689887, "learning_rate": 2.2474938020645266e-07, "loss": 0.098, "step": 5814 }, { "epoch": 0.9043545878693624, "grad_norm": 0.8100325028699273, "learning_rate": 2.240257608984786e-07, "loss": 0.0723, "step": 5815 }, { "epoch": 0.9045101088646967, "grad_norm": 0.930063372129811, "learning_rate": 2.2330328168215897e-07, "loss": 0.1224, "step": 5816 }, { "epoch": 0.9046656298600311, "grad_norm": 0.7681692632731774, "learning_rate": 2.225819427299597e-07, "loss": 0.1206, "step": 5817 }, { "epoch": 0.9048211508553655, "grad_norm": 0.8340542445800614, "learning_rate": 2.2186174421407358e-07, "loss": 0.1188, "step": 5818 }, { "epoch": 0.9049766718506999, "grad_norm": 1.5521378435199726, "learning_rate": 2.211426863064231e-07, "loss": 0.2122, "step": 5819 }, { "epoch": 0.9051321928460342, "grad_norm": 1.195121931795032, "learning_rate": 2.2042476917865706e-07, "loss": 0.1106, "step": 5820 }, { "epoch": 0.9052877138413686, "grad_norm": 1.397913510870534, "learning_rate": 2.1970799300215117e-07, "loss": 0.1809, "step": 5821 }, { "epoch": 0.905443234836703, "grad_norm": 1.2190335094944609, "learning_rate": 2.1899235794801132e-07, "loss": 0.1448, "step": 5822 }, { "epoch": 0.9055987558320373, "grad_norm": 0.8281351013867018, "learning_rate": 2.182778641870692e-07, "loss": 0.1348, "step": 5823 }, { "epoch": 0.9057542768273716, "grad_norm": 1.0330310069258557, "learning_rate": 2.1756451188988503e-07, "loss": 0.1392, "step": 5824 }, { "epoch": 0.9059097978227061, "grad_norm": 0.9362416130812174, "learning_rate": 2.1685230122674428e-07, "loss": 0.1237, "step": 5825 }, { "epoch": 0.9060653188180404, "grad_norm": 1.3516109321252394, "learning_rate": 2.161412323676626e-07, "loss": 0.1624, "step": 5826 }, { "epoch": 0.9062208398133748, "grad_norm": 1.3140385458360873, "learning_rate": 2.1543130548238255e-07, "loss": 0.2124, "step": 5827 }, { "epoch": 0.9063763608087092, "grad_norm": 1.563854675448706, "learning_rate": 2.1472252074037248e-07, "loss": 0.2525, "step": 5828 }, { "epoch": 0.9065318818040435, "grad_norm": 1.061939522270165, "learning_rate": 2.1401487831083034e-07, "loss": 0.1165, "step": 5829 }, { "epoch": 0.9066874027993779, "grad_norm": 1.278324616535262, "learning_rate": 2.1330837836267882e-07, "loss": 0.1615, "step": 5830 }, { "epoch": 0.9068429237947123, "grad_norm": 0.8431753468536463, "learning_rate": 2.126030210645702e-07, "loss": 0.1008, "step": 5831 }, { "epoch": 0.9069984447900467, "grad_norm": 3.5714215270928014, "learning_rate": 2.118988065848837e-07, "loss": 0.1321, "step": 5832 }, { "epoch": 0.907153965785381, "grad_norm": 0.8410131456093862, "learning_rate": 2.111957350917243e-07, "loss": 0.1384, "step": 5833 }, { "epoch": 0.9073094867807154, "grad_norm": 0.9357086359142014, "learning_rate": 2.1049380675292496e-07, "loss": 0.0672, "step": 5834 }, { "epoch": 0.9074650077760498, "grad_norm": 0.7852493179878384, "learning_rate": 2.097930217360461e-07, "loss": 0.1616, "step": 5835 }, { "epoch": 0.9076205287713841, "grad_norm": 1.0361048880704697, "learning_rate": 2.09093380208375e-07, "loss": 0.1407, "step": 5836 }, { "epoch": 0.9077760497667186, "grad_norm": 1.3224042285204285, "learning_rate": 2.0839488233692472e-07, "loss": 0.157, "step": 5837 }, { "epoch": 0.9079315707620529, "grad_norm": 1.3735495095491266, "learning_rate": 2.0769752828843748e-07, "loss": 0.1149, "step": 5838 }, { "epoch": 0.9080870917573872, "grad_norm": 1.419623016793329, "learning_rate": 2.0700131822938174e-07, "loss": 0.2109, "step": 5839 }, { "epoch": 0.9082426127527217, "grad_norm": 1.3061291210995953, "learning_rate": 2.0630625232595126e-07, "loss": 0.2181, "step": 5840 }, { "epoch": 0.908398133748056, "grad_norm": 1.0437797974214051, "learning_rate": 2.0561233074406938e-07, "loss": 0.1767, "step": 5841 }, { "epoch": 0.9085536547433903, "grad_norm": 1.3521734451842407, "learning_rate": 2.0491955364938475e-07, "loss": 0.131, "step": 5842 }, { "epoch": 0.9087091757387247, "grad_norm": 1.5056890506101497, "learning_rate": 2.0422792120727174e-07, "loss": 0.2563, "step": 5843 }, { "epoch": 0.9088646967340591, "grad_norm": 0.9175489128753485, "learning_rate": 2.035374335828333e-07, "loss": 0.1516, "step": 5844 }, { "epoch": 0.9090202177293935, "grad_norm": 1.3133945415818635, "learning_rate": 2.0284809094089862e-07, "loss": 0.1425, "step": 5845 }, { "epoch": 0.9091757387247278, "grad_norm": 1.2298001887021108, "learning_rate": 2.0215989344602281e-07, "loss": 0.1799, "step": 5846 }, { "epoch": 0.9093312597200622, "grad_norm": 1.2613941904579857, "learning_rate": 2.0147284126248834e-07, "loss": 0.1633, "step": 5847 }, { "epoch": 0.9094867807153966, "grad_norm": 0.993463526453324, "learning_rate": 2.0078693455430565e-07, "loss": 0.0798, "step": 5848 }, { "epoch": 0.9096423017107309, "grad_norm": 1.046865647253922, "learning_rate": 2.0010217348520876e-07, "loss": 0.1207, "step": 5849 }, { "epoch": 0.9097978227060654, "grad_norm": 1.5153608992808094, "learning_rate": 1.9941855821865918e-07, "loss": 0.2109, "step": 5850 }, { "epoch": 0.9099533437013997, "grad_norm": 1.174448418010533, "learning_rate": 1.987360889178469e-07, "loss": 0.1396, "step": 5851 }, { "epoch": 0.910108864696734, "grad_norm": 0.68288577175286, "learning_rate": 1.980547657456866e-07, "loss": 0.1409, "step": 5852 }, { "epoch": 0.9102643856920685, "grad_norm": 1.2654371962290238, "learning_rate": 1.973745888648182e-07, "loss": 0.1477, "step": 5853 }, { "epoch": 0.9104199066874028, "grad_norm": 0.8985554477528374, "learning_rate": 1.966955584376101e-07, "loss": 0.1302, "step": 5854 }, { "epoch": 0.9105754276827371, "grad_norm": 1.2430872743206671, "learning_rate": 1.9601767462615773e-07, "loss": 0.1159, "step": 5855 }, { "epoch": 0.9107309486780716, "grad_norm": 1.1230673564836533, "learning_rate": 1.9534093759227936e-07, "loss": 0.1656, "step": 5856 }, { "epoch": 0.9108864696734059, "grad_norm": 1.0381610677978066, "learning_rate": 1.9466534749752353e-07, "loss": 0.1727, "step": 5857 }, { "epoch": 0.9110419906687403, "grad_norm": 1.3299510737119313, "learning_rate": 1.9399090450316126e-07, "loss": 0.2107, "step": 5858 }, { "epoch": 0.9111975116640747, "grad_norm": 0.7707691263162277, "learning_rate": 1.9331760877019212e-07, "loss": 0.12, "step": 5859 }, { "epoch": 0.911353032659409, "grad_norm": 1.0653998490243637, "learning_rate": 1.9264546045934196e-07, "loss": 0.1231, "step": 5860 }, { "epoch": 0.9115085536547434, "grad_norm": 1.1911683237219262, "learning_rate": 1.9197445973106076e-07, "loss": 0.1021, "step": 5861 }, { "epoch": 0.9116640746500778, "grad_norm": 0.7944862739594745, "learning_rate": 1.9130460674552654e-07, "loss": 0.0905, "step": 5862 }, { "epoch": 0.9118195956454122, "grad_norm": 0.943478961381292, "learning_rate": 1.9063590166264134e-07, "loss": 0.1011, "step": 5863 }, { "epoch": 0.9119751166407465, "grad_norm": 1.7734264712188579, "learning_rate": 1.899683446420364e-07, "loss": 0.1576, "step": 5864 }, { "epoch": 0.9121306376360808, "grad_norm": 1.0772039723617315, "learning_rate": 1.8930193584306588e-07, "loss": 0.1412, "step": 5865 }, { "epoch": 0.9122861586314153, "grad_norm": 0.963313328962512, "learning_rate": 1.8863667542481035e-07, "loss": 0.107, "step": 5866 }, { "epoch": 0.9124416796267496, "grad_norm": 1.1474937086567898, "learning_rate": 1.8797256354607774e-07, "loss": 0.2025, "step": 5867 }, { "epoch": 0.9125972006220839, "grad_norm": 0.995148260293707, "learning_rate": 1.8730960036540015e-07, "loss": 0.1861, "step": 5868 }, { "epoch": 0.9127527216174184, "grad_norm": 0.8001200372357383, "learning_rate": 1.866477860410354e-07, "loss": 0.1414, "step": 5869 }, { "epoch": 0.9129082426127527, "grad_norm": 0.7757029133612263, "learning_rate": 1.859871207309688e-07, "loss": 0.0752, "step": 5870 }, { "epoch": 0.913063763608087, "grad_norm": 1.0743446409584438, "learning_rate": 1.8532760459291088e-07, "loss": 0.1423, "step": 5871 }, { "epoch": 0.9132192846034215, "grad_norm": 1.2702983002286288, "learning_rate": 1.8466923778429624e-07, "loss": 0.1227, "step": 5872 }, { "epoch": 0.9133748055987558, "grad_norm": 1.2815089959523107, "learning_rate": 1.840120204622875e-07, "loss": 0.1796, "step": 5873 }, { "epoch": 0.9135303265940902, "grad_norm": 0.7044208577897189, "learning_rate": 1.8335595278377028e-07, "loss": 0.1276, "step": 5874 }, { "epoch": 0.9136858475894246, "grad_norm": 1.1330713796777785, "learning_rate": 1.8270103490535708e-07, "loss": 0.1958, "step": 5875 }, { "epoch": 0.913841368584759, "grad_norm": 1.0674558923502933, "learning_rate": 1.8204726698338725e-07, "loss": 0.162, "step": 5876 }, { "epoch": 0.9139968895800933, "grad_norm": 0.8175939641063887, "learning_rate": 1.8139464917392325e-07, "loss": 0.1067, "step": 5877 }, { "epoch": 0.9141524105754277, "grad_norm": 1.3404982095938547, "learning_rate": 1.8074318163275372e-07, "loss": 0.1815, "step": 5878 }, { "epoch": 0.9143079315707621, "grad_norm": 0.8618563219564925, "learning_rate": 1.8009286451539377e-07, "loss": 0.1217, "step": 5879 }, { "epoch": 0.9144634525660964, "grad_norm": 0.8187670337209523, "learning_rate": 1.7944369797708362e-07, "loss": 0.1846, "step": 5880 }, { "epoch": 0.9146189735614308, "grad_norm": 0.9548148378505019, "learning_rate": 1.787956821727871e-07, "loss": 0.1903, "step": 5881 }, { "epoch": 0.9147744945567652, "grad_norm": 1.4773750189142691, "learning_rate": 1.7814881725719545e-07, "loss": 0.158, "step": 5882 }, { "epoch": 0.9149300155520995, "grad_norm": 0.702427624206725, "learning_rate": 1.7750310338472464e-07, "loss": 0.1347, "step": 5883 }, { "epoch": 0.9150855365474339, "grad_norm": 0.9154761568845805, "learning_rate": 1.7685854070951525e-07, "loss": 0.212, "step": 5884 }, { "epoch": 0.9152410575427683, "grad_norm": 0.8195571514557887, "learning_rate": 1.762151293854325e-07, "loss": 0.1907, "step": 5885 }, { "epoch": 0.9153965785381026, "grad_norm": 0.9228292220157248, "learning_rate": 1.7557286956606855e-07, "loss": 0.1461, "step": 5886 }, { "epoch": 0.915552099533437, "grad_norm": 1.4018717295029453, "learning_rate": 1.7493176140473966e-07, "loss": 0.112, "step": 5887 }, { "epoch": 0.9157076205287714, "grad_norm": 1.3619597264572296, "learning_rate": 1.742918050544873e-07, "loss": 0.0956, "step": 5888 }, { "epoch": 0.9158631415241058, "grad_norm": 1.2785758856442475, "learning_rate": 1.736530006680781e-07, "loss": 0.1773, "step": 5889 }, { "epoch": 0.9160186625194401, "grad_norm": 0.9146967364226447, "learning_rate": 1.7301534839800348e-07, "loss": 0.1717, "step": 5890 }, { "epoch": 0.9161741835147745, "grad_norm": 0.991343173201121, "learning_rate": 1.7237884839647944e-07, "loss": 0.1254, "step": 5891 }, { "epoch": 0.9163297045101089, "grad_norm": 1.1539944969198275, "learning_rate": 1.7174350081544832e-07, "loss": 0.1538, "step": 5892 }, { "epoch": 0.9164852255054432, "grad_norm": 0.9699466579316269, "learning_rate": 1.7110930580657547e-07, "loss": 0.1599, "step": 5893 }, { "epoch": 0.9166407465007776, "grad_norm": 1.2324657617883283, "learning_rate": 1.7047626352125256e-07, "loss": 0.1265, "step": 5894 }, { "epoch": 0.916796267496112, "grad_norm": 1.9142177692156517, "learning_rate": 1.698443741105954e-07, "loss": 0.1632, "step": 5895 }, { "epoch": 0.9169517884914463, "grad_norm": 1.4497818956223705, "learning_rate": 1.6921363772544553e-07, "loss": 0.1205, "step": 5896 }, { "epoch": 0.9171073094867808, "grad_norm": 1.1134879865949807, "learning_rate": 1.685840545163675e-07, "loss": 0.1993, "step": 5897 }, { "epoch": 0.9172628304821151, "grad_norm": 1.5260416083404948, "learning_rate": 1.6795562463365279e-07, "loss": 0.1261, "step": 5898 }, { "epoch": 0.9174183514774494, "grad_norm": 1.1566856619200792, "learning_rate": 1.6732834822731582e-07, "loss": 0.1325, "step": 5899 }, { "epoch": 0.9175738724727839, "grad_norm": 0.8462902314260414, "learning_rate": 1.6670222544709515e-07, "loss": 0.0913, "step": 5900 }, { "epoch": 0.9175738724727839, "eval_loss": 0.16040350496768951, "eval_runtime": 9.409, "eval_samples_per_second": 2.763, "eval_steps_per_second": 0.744, "step": 5900 }, { "epoch": 0.9177293934681182, "grad_norm": 1.2767184183118734, "learning_rate": 1.6607725644245675e-07, "loss": 0.0951, "step": 5901 }, { "epoch": 0.9178849144634526, "grad_norm": 0.7274703169092662, "learning_rate": 1.65453441362588e-07, "loss": 0.1219, "step": 5902 }, { "epoch": 0.9180404354587869, "grad_norm": 1.1600599732901438, "learning_rate": 1.6483078035640364e-07, "loss": 0.1551, "step": 5903 }, { "epoch": 0.9181959564541213, "grad_norm": 0.8997731759628078, "learning_rate": 1.642092735725398e-07, "loss": 0.1736, "step": 5904 }, { "epoch": 0.9183514774494557, "grad_norm": 0.8050289671736852, "learning_rate": 1.6358892115936054e-07, "loss": 0.116, "step": 5905 }, { "epoch": 0.91850699844479, "grad_norm": 0.8912125019380509, "learning_rate": 1.6296972326495242e-07, "loss": 0.1539, "step": 5906 }, { "epoch": 0.9186625194401244, "grad_norm": 1.1242916172381823, "learning_rate": 1.6235168003712498e-07, "loss": 0.1014, "step": 5907 }, { "epoch": 0.9188180404354588, "grad_norm": 1.0503651369558498, "learning_rate": 1.6173479162341577e-07, "loss": 0.177, "step": 5908 }, { "epoch": 0.9189735614307931, "grad_norm": 0.9283592603706473, "learning_rate": 1.6111905817108308e-07, "loss": 0.1426, "step": 5909 }, { "epoch": 0.9191290824261276, "grad_norm": 0.9380491969953575, "learning_rate": 1.6050447982711214e-07, "loss": 0.1943, "step": 5910 }, { "epoch": 0.9192846034214619, "grad_norm": 1.1079649623889694, "learning_rate": 1.5989105673821005e-07, "loss": 0.211, "step": 5911 }, { "epoch": 0.9194401244167962, "grad_norm": 0.8470097793894495, "learning_rate": 1.5927878905081185e-07, "loss": 0.1049, "step": 5912 }, { "epoch": 0.9195956454121307, "grad_norm": 1.3767436259339856, "learning_rate": 1.5866767691107178e-07, "loss": 0.1277, "step": 5913 }, { "epoch": 0.919751166407465, "grad_norm": 0.7871075270961281, "learning_rate": 1.5805772046487255e-07, "loss": 0.1245, "step": 5914 }, { "epoch": 0.9199066874027994, "grad_norm": 0.9397528773290647, "learning_rate": 1.5744891985781885e-07, "loss": 0.1008, "step": 5915 }, { "epoch": 0.9200622083981338, "grad_norm": 1.0771372965561161, "learning_rate": 1.568412752352394e-07, "loss": 0.1495, "step": 5916 }, { "epoch": 0.9202177293934681, "grad_norm": 0.9977364364799063, "learning_rate": 1.5623478674218762e-07, "loss": 0.0969, "step": 5917 }, { "epoch": 0.9203732503888025, "grad_norm": 1.2921483362327275, "learning_rate": 1.5562945452344103e-07, "loss": 0.1268, "step": 5918 }, { "epoch": 0.9205287713841369, "grad_norm": 1.1351211692963334, "learning_rate": 1.5502527872350127e-07, "loss": 0.1562, "step": 5919 }, { "epoch": 0.9206842923794712, "grad_norm": 0.9212445632544811, "learning_rate": 1.5442225948659183e-07, "loss": 0.1499, "step": 5920 }, { "epoch": 0.9208398133748056, "grad_norm": 0.8564484740173748, "learning_rate": 1.5382039695666428e-07, "loss": 0.1431, "step": 5921 }, { "epoch": 0.9209953343701399, "grad_norm": 0.841094832581025, "learning_rate": 1.5321969127738977e-07, "loss": 0.1032, "step": 5922 }, { "epoch": 0.9211508553654744, "grad_norm": 1.28419682387889, "learning_rate": 1.526201425921653e-07, "loss": 0.1463, "step": 5923 }, { "epoch": 0.9213063763608087, "grad_norm": 0.9646659414096255, "learning_rate": 1.5202175104411242e-07, "loss": 0.1789, "step": 5924 }, { "epoch": 0.921461897356143, "grad_norm": 1.120591024982702, "learning_rate": 1.514245167760753e-07, "loss": 0.1931, "step": 5925 }, { "epoch": 0.9216174183514775, "grad_norm": 1.3544635786504204, "learning_rate": 1.508284399306209e-07, "loss": 0.1813, "step": 5926 }, { "epoch": 0.9217729393468118, "grad_norm": 1.058395178169264, "learning_rate": 1.502335206500416e-07, "loss": 0.1741, "step": 5927 }, { "epoch": 0.9219284603421461, "grad_norm": 1.3357918931243613, "learning_rate": 1.496397590763543e-07, "loss": 0.1431, "step": 5928 }, { "epoch": 0.9220839813374806, "grad_norm": 1.2987366658278365, "learning_rate": 1.4904715535129623e-07, "loss": 0.1406, "step": 5929 }, { "epoch": 0.9222395023328149, "grad_norm": 1.238552719525775, "learning_rate": 1.4845570961633192e-07, "loss": 0.1431, "step": 5930 }, { "epoch": 0.9223950233281493, "grad_norm": 1.1954507992501249, "learning_rate": 1.4786542201264687e-07, "loss": 0.2027, "step": 5931 }, { "epoch": 0.9225505443234837, "grad_norm": 1.4871689421754393, "learning_rate": 1.4727629268115052e-07, "loss": 0.1055, "step": 5932 }, { "epoch": 0.922706065318818, "grad_norm": 0.9094483384755923, "learning_rate": 1.4668832176247706e-07, "loss": 0.1005, "step": 5933 }, { "epoch": 0.9228615863141524, "grad_norm": 1.0419522802530807, "learning_rate": 1.461015093969831e-07, "loss": 0.2126, "step": 5934 }, { "epoch": 0.9230171073094868, "grad_norm": 0.7666197449820589, "learning_rate": 1.4551585572474825e-07, "loss": 0.1297, "step": 5935 }, { "epoch": 0.9231726283048212, "grad_norm": 0.8739417820274286, "learning_rate": 1.449313608855768e-07, "loss": 0.1792, "step": 5936 }, { "epoch": 0.9233281493001555, "grad_norm": 0.678371348683818, "learning_rate": 1.4434802501899604e-07, "loss": 0.0847, "step": 5937 }, { "epoch": 0.92348367029549, "grad_norm": 1.3696685335571568, "learning_rate": 1.4376584826425622e-07, "loss": 0.156, "step": 5938 }, { "epoch": 0.9236391912908243, "grad_norm": 1.0235906680364073, "learning_rate": 1.4318483076033064e-07, "loss": 0.1615, "step": 5939 }, { "epoch": 0.9237947122861586, "grad_norm": 1.635902679505808, "learning_rate": 1.426049726459172e-07, "loss": 0.1639, "step": 5940 }, { "epoch": 0.923950233281493, "grad_norm": 1.1093545294591476, "learning_rate": 1.4202627405943582e-07, "loss": 0.1387, "step": 5941 }, { "epoch": 0.9241057542768274, "grad_norm": 0.6839074328019549, "learning_rate": 1.4144873513902868e-07, "loss": 0.1409, "step": 5942 }, { "epoch": 0.9242612752721617, "grad_norm": 1.4190794540731169, "learning_rate": 1.4087235602256333e-07, "loss": 0.121, "step": 5943 }, { "epoch": 0.9244167962674961, "grad_norm": 1.0160403675073009, "learning_rate": 1.402971368476297e-07, "loss": 0.1447, "step": 5944 }, { "epoch": 0.9245723172628305, "grad_norm": 0.9629883505522094, "learning_rate": 1.3972307775154015e-07, "loss": 0.1369, "step": 5945 }, { "epoch": 0.9247278382581648, "grad_norm": 1.2577099049801175, "learning_rate": 1.3915017887133176e-07, "loss": 0.2046, "step": 5946 }, { "epoch": 0.9248833592534992, "grad_norm": 1.046451985680252, "learning_rate": 1.3857844034376233e-07, "loss": 0.1233, "step": 5947 }, { "epoch": 0.9250388802488336, "grad_norm": 0.9579950634285156, "learning_rate": 1.3800786230531327e-07, "loss": 0.1993, "step": 5948 }, { "epoch": 0.925194401244168, "grad_norm": 0.8629151981526569, "learning_rate": 1.3743844489219117e-07, "loss": 0.2031, "step": 5949 }, { "epoch": 0.9253499222395023, "grad_norm": 0.9503879938377606, "learning_rate": 1.368701882403234e-07, "loss": 0.1463, "step": 5950 }, { "epoch": 0.9255054432348367, "grad_norm": 1.2092127106631048, "learning_rate": 1.363030924853592e-07, "loss": 0.1787, "step": 5951 }, { "epoch": 0.9256609642301711, "grad_norm": 0.6466213265120724, "learning_rate": 1.3573715776267371e-07, "loss": 0.1427, "step": 5952 }, { "epoch": 0.9258164852255054, "grad_norm": 1.647804399566043, "learning_rate": 1.3517238420736378e-07, "loss": 0.2161, "step": 5953 }, { "epoch": 0.9259720062208399, "grad_norm": 0.933753573190566, "learning_rate": 1.346087719542477e-07, "loss": 0.112, "step": 5954 }, { "epoch": 0.9261275272161742, "grad_norm": 1.154355630430787, "learning_rate": 1.340463211378684e-07, "loss": 0.2161, "step": 5955 }, { "epoch": 0.9262830482115085, "grad_norm": 1.1501664166953949, "learning_rate": 1.3348503189249018e-07, "loss": 0.1297, "step": 5956 }, { "epoch": 0.926438569206843, "grad_norm": 1.0898658969391852, "learning_rate": 1.3292490435210027e-07, "loss": 0.1013, "step": 5957 }, { "epoch": 0.9265940902021773, "grad_norm": 0.9763562115484917, "learning_rate": 1.3236593865041002e-07, "loss": 0.1725, "step": 5958 }, { "epoch": 0.9267496111975116, "grad_norm": 1.036767910613003, "learning_rate": 1.3180813492085165e-07, "loss": 0.2167, "step": 5959 }, { "epoch": 0.926905132192846, "grad_norm": 1.3651866462030835, "learning_rate": 1.3125149329658083e-07, "loss": 0.1631, "step": 5960 }, { "epoch": 0.9270606531881804, "grad_norm": 1.0413923301132262, "learning_rate": 1.3069601391047515e-07, "loss": 0.1797, "step": 5961 }, { "epoch": 0.9272161741835148, "grad_norm": 1.0842466638871253, "learning_rate": 1.3014169689513633e-07, "loss": 0.169, "step": 5962 }, { "epoch": 0.9273716951788491, "grad_norm": 0.9248741302501171, "learning_rate": 1.2958854238288686e-07, "loss": 0.1083, "step": 5963 }, { "epoch": 0.9275272161741835, "grad_norm": 0.989529887090528, "learning_rate": 1.2903655050577224e-07, "loss": 0.1095, "step": 5964 }, { "epoch": 0.9276827371695179, "grad_norm": 1.029216029105682, "learning_rate": 1.2848572139556147e-07, "loss": 0.0679, "step": 5965 }, { "epoch": 0.9278382581648522, "grad_norm": 0.7994601607975655, "learning_rate": 1.2793605518374442e-07, "loss": 0.1408, "step": 5966 }, { "epoch": 0.9279937791601867, "grad_norm": 0.822947579450496, "learning_rate": 1.2738755200153442e-07, "loss": 0.1624, "step": 5967 }, { "epoch": 0.928149300155521, "grad_norm": 1.3667630960070987, "learning_rate": 1.2684021197986618e-07, "loss": 0.1507, "step": 5968 }, { "epoch": 0.9283048211508553, "grad_norm": 0.7322581799016704, "learning_rate": 1.2629403524939853e-07, "loss": 0.0851, "step": 5969 }, { "epoch": 0.9284603421461898, "grad_norm": 0.8213804703688914, "learning_rate": 1.2574902194050996e-07, "loss": 0.1516, "step": 5970 }, { "epoch": 0.9286158631415241, "grad_norm": 1.4917962950307126, "learning_rate": 1.2520517218330474e-07, "loss": 0.184, "step": 5971 }, { "epoch": 0.9287713841368584, "grad_norm": 0.9071613152152974, "learning_rate": 1.2466248610760622e-07, "loss": 0.1358, "step": 5972 }, { "epoch": 0.9289269051321929, "grad_norm": 1.027977697584738, "learning_rate": 1.2412096384296079e-07, "loss": 0.2289, "step": 5973 }, { "epoch": 0.9290824261275272, "grad_norm": 1.0682841441741995, "learning_rate": 1.2358060551863782e-07, "loss": 0.1285, "step": 5974 }, { "epoch": 0.9292379471228616, "grad_norm": 1.2690716126484567, "learning_rate": 1.2304141126362855e-07, "loss": 0.146, "step": 5975 }, { "epoch": 0.929393468118196, "grad_norm": 0.9763353919454301, "learning_rate": 1.2250338120664563e-07, "loss": 0.1068, "step": 5976 }, { "epoch": 0.9295489891135303, "grad_norm": 0.9520611001593948, "learning_rate": 1.2196651547612514e-07, "loss": 0.1424, "step": 5977 }, { "epoch": 0.9297045101088647, "grad_norm": 0.7401110765175115, "learning_rate": 1.214308142002235e-07, "loss": 0.113, "step": 5978 }, { "epoch": 0.929860031104199, "grad_norm": 0.8751282531073911, "learning_rate": 1.208962775068212e-07, "loss": 0.142, "step": 5979 }, { "epoch": 0.9300155520995335, "grad_norm": 0.8735460425943997, "learning_rate": 1.2036290552351838e-07, "loss": 0.1454, "step": 5980 }, { "epoch": 0.9301710730948678, "grad_norm": 1.1392041931116736, "learning_rate": 1.198306983776393e-07, "loss": 0.1484, "step": 5981 }, { "epoch": 0.9303265940902021, "grad_norm": 1.1067333939166482, "learning_rate": 1.192996561962284e-07, "loss": 0.1134, "step": 5982 }, { "epoch": 0.9304821150855366, "grad_norm": 0.8002540604538603, "learning_rate": 1.1876977910605325e-07, "loss": 0.1269, "step": 5983 }, { "epoch": 0.9306376360808709, "grad_norm": 1.2809439549044799, "learning_rate": 1.1824106723360317e-07, "loss": 0.1845, "step": 5984 }, { "epoch": 0.9307931570762052, "grad_norm": 1.0275796735715907, "learning_rate": 1.177135207050889e-07, "loss": 0.1009, "step": 5985 }, { "epoch": 0.9309486780715397, "grad_norm": 1.7802598425805964, "learning_rate": 1.1718713964644302e-07, "loss": 0.2296, "step": 5986 }, { "epoch": 0.931104199066874, "grad_norm": 0.6748680760879335, "learning_rate": 1.1666192418332057e-07, "loss": 0.1199, "step": 5987 }, { "epoch": 0.9312597200622084, "grad_norm": 1.1110414795093149, "learning_rate": 1.1613787444109681e-07, "loss": 0.1263, "step": 5988 }, { "epoch": 0.9314152410575428, "grad_norm": 0.8920883524754907, "learning_rate": 1.1561499054487058e-07, "loss": 0.1533, "step": 5989 }, { "epoch": 0.9315707620528771, "grad_norm": 0.913940432769334, "learning_rate": 1.150932726194609e-07, "loss": 0.067, "step": 5990 }, { "epoch": 0.9317262830482115, "grad_norm": 0.9396159645605604, "learning_rate": 1.1457272078940929e-07, "loss": 0.1424, "step": 5991 }, { "epoch": 0.9318818040435459, "grad_norm": 0.835234235841558, "learning_rate": 1.140533351789791e-07, "loss": 0.0667, "step": 5992 }, { "epoch": 0.9320373250388803, "grad_norm": 1.3228728722082568, "learning_rate": 1.1353511591215505e-07, "loss": 0.1578, "step": 5993 }, { "epoch": 0.9321928460342146, "grad_norm": 1.053212523295092, "learning_rate": 1.1301806311264318e-07, "loss": 0.0992, "step": 5994 }, { "epoch": 0.932348367029549, "grad_norm": 1.0917074228513728, "learning_rate": 1.1250217690387033e-07, "loss": 0.1217, "step": 5995 }, { "epoch": 0.9325038880248834, "grad_norm": 1.2480962195714367, "learning_rate": 1.1198745740898631e-07, "loss": 0.1772, "step": 5996 }, { "epoch": 0.9326594090202177, "grad_norm": 1.12661460500748, "learning_rate": 1.1147390475086284e-07, "loss": 0.2115, "step": 5997 }, { "epoch": 0.932814930015552, "grad_norm": 1.3774596183295298, "learning_rate": 1.1096151905209074e-07, "loss": 0.1474, "step": 5998 }, { "epoch": 0.9329704510108865, "grad_norm": 1.4903923806352466, "learning_rate": 1.1045030043498328e-07, "loss": 0.149, "step": 5999 }, { "epoch": 0.9331259720062208, "grad_norm": 1.1072662064477183, "learning_rate": 1.0994024902157674e-07, "loss": 0.1398, "step": 6000 }, { "epoch": 0.9331259720062208, "eval_loss": 0.1602540910243988, "eval_runtime": 9.4318, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 6000 }, { "epoch": 0.9332814930015552, "grad_norm": 1.2357222526088094, "learning_rate": 1.0943136493362705e-07, "loss": 0.0857, "step": 6001 }, { "epoch": 0.9334370139968896, "grad_norm": 1.0101944049435716, "learning_rate": 1.0892364829261204e-07, "loss": 0.1675, "step": 6002 }, { "epoch": 0.9335925349922239, "grad_norm": 0.9050188711792648, "learning_rate": 1.0841709921973087e-07, "loss": 0.1794, "step": 6003 }, { "epoch": 0.9337480559875583, "grad_norm": 1.1450372561751612, "learning_rate": 1.0791171783590349e-07, "loss": 0.1366, "step": 6004 }, { "epoch": 0.9339035769828927, "grad_norm": 1.4581708095750583, "learning_rate": 1.0740750426177116e-07, "loss": 0.188, "step": 6005 }, { "epoch": 0.9340590979782271, "grad_norm": 0.7378067918814023, "learning_rate": 1.0690445861769761e-07, "loss": 0.1193, "step": 6006 }, { "epoch": 0.9342146189735614, "grad_norm": 1.0088145980126888, "learning_rate": 1.0640258102376566e-07, "loss": 0.1228, "step": 6007 }, { "epoch": 0.9343701399688958, "grad_norm": 0.9782329255806054, "learning_rate": 1.0590187159978171e-07, "loss": 0.1327, "step": 6008 }, { "epoch": 0.9345256609642302, "grad_norm": 0.8725035329836632, "learning_rate": 1.0540233046527182e-07, "loss": 0.1472, "step": 6009 }, { "epoch": 0.9346811819595645, "grad_norm": 1.3269741110519633, "learning_rate": 1.0490395773948336e-07, "loss": 0.1746, "step": 6010 }, { "epoch": 0.934836702954899, "grad_norm": 0.8629451703537575, "learning_rate": 1.0440675354138396e-07, "loss": 0.1502, "step": 6011 }, { "epoch": 0.9349922239502333, "grad_norm": 0.8934082379306171, "learning_rate": 1.0391071798966479e-07, "loss": 0.1242, "step": 6012 }, { "epoch": 0.9351477449455676, "grad_norm": 1.4424114084528608, "learning_rate": 1.0341585120273501e-07, "loss": 0.1683, "step": 6013 }, { "epoch": 0.9353032659409021, "grad_norm": 1.1878456988446975, "learning_rate": 1.0292215329872679e-07, "loss": 0.1613, "step": 6014 }, { "epoch": 0.9354587869362364, "grad_norm": 1.0258801567548261, "learning_rate": 1.0242962439549309e-07, "loss": 0.1303, "step": 6015 }, { "epoch": 0.9356143079315707, "grad_norm": 0.8350193893992409, "learning_rate": 1.0193826461060652e-07, "loss": 0.0892, "step": 6016 }, { "epoch": 0.9357698289269051, "grad_norm": 0.7560607034668472, "learning_rate": 1.0144807406136215e-07, "loss": 0.1146, "step": 6017 }, { "epoch": 0.9359253499222395, "grad_norm": 1.3295674657970333, "learning_rate": 1.0095905286477526e-07, "loss": 0.1688, "step": 6018 }, { "epoch": 0.9360808709175739, "grad_norm": 0.7111534809814972, "learning_rate": 1.0047120113758246e-07, "loss": 0.0884, "step": 6019 }, { "epoch": 0.9362363919129082, "grad_norm": 1.3705024181899894, "learning_rate": 9.998451899624007e-08, "loss": 0.1438, "step": 6020 }, { "epoch": 0.9363919129082426, "grad_norm": 1.0867623378491467, "learning_rate": 9.949900655692512e-08, "loss": 0.1413, "step": 6021 }, { "epoch": 0.936547433903577, "grad_norm": 1.0047481141634953, "learning_rate": 9.901466393553827e-08, "loss": 0.1112, "step": 6022 }, { "epoch": 0.9367029548989113, "grad_norm": 0.8006067810444659, "learning_rate": 9.8531491247697e-08, "loss": 0.1094, "step": 6023 }, { "epoch": 0.9368584758942458, "grad_norm": 0.7100586637094772, "learning_rate": 9.804948860874241e-08, "loss": 0.1061, "step": 6024 }, { "epoch": 0.9370139968895801, "grad_norm": 0.9724328249869837, "learning_rate": 9.756865613373523e-08, "loss": 0.1032, "step": 6025 }, { "epoch": 0.9371695178849144, "grad_norm": 1.3034875738805856, "learning_rate": 9.708899393745696e-08, "loss": 0.1403, "step": 6026 }, { "epoch": 0.9373250388802489, "grad_norm": 1.2861265826238562, "learning_rate": 9.661050213440881e-08, "loss": 0.6124, "step": 6027 }, { "epoch": 0.9374805598755832, "grad_norm": 0.8048030105539355, "learning_rate": 9.613318083881384e-08, "loss": 0.0788, "step": 6028 }, { "epoch": 0.9376360808709175, "grad_norm": 1.2658097218407347, "learning_rate": 9.565703016461591e-08, "loss": 0.0971, "step": 6029 }, { "epoch": 0.937791601866252, "grad_norm": 0.8337859644626585, "learning_rate": 9.5182050225478e-08, "loss": 0.0888, "step": 6030 }, { "epoch": 0.9379471228615863, "grad_norm": 1.15136664157168, "learning_rate": 9.470824113478494e-08, "loss": 0.1587, "step": 6031 }, { "epoch": 0.9381026438569207, "grad_norm": 1.03598127019173, "learning_rate": 9.423560300564072e-08, "loss": 0.1527, "step": 6032 }, { "epoch": 0.9382581648522551, "grad_norm": 1.2210260015539705, "learning_rate": 9.376413595087175e-08, "loss": 0.1604, "step": 6033 }, { "epoch": 0.9384136858475894, "grad_norm": 1.3915252080280553, "learning_rate": 9.329384008302355e-08, "loss": 0.173, "step": 6034 }, { "epoch": 0.9385692068429238, "grad_norm": 0.8429187230331056, "learning_rate": 9.282471551436245e-08, "loss": 0.1143, "step": 6035 }, { "epoch": 0.9387247278382581, "grad_norm": 0.8911057865829357, "learning_rate": 9.23567623568744e-08, "loss": 0.128, "step": 6036 }, { "epoch": 0.9388802488335926, "grad_norm": 1.4591913517919903, "learning_rate": 9.188998072226618e-08, "loss": 0.2235, "step": 6037 }, { "epoch": 0.9390357698289269, "grad_norm": 1.264034169543466, "learning_rate": 9.142437072196642e-08, "loss": 0.2541, "step": 6038 }, { "epoch": 0.9391912908242612, "grad_norm": 1.3222125179500017, "learning_rate": 9.095993246712065e-08, "loss": 0.1632, "step": 6039 }, { "epoch": 0.9393468118195957, "grad_norm": 1.3974390390101483, "learning_rate": 9.049666606859852e-08, "loss": 0.1378, "step": 6040 }, { "epoch": 0.93950233281493, "grad_norm": 0.9648560440405011, "learning_rate": 9.003457163698825e-08, "loss": 0.1265, "step": 6041 }, { "epoch": 0.9396578538102643, "grad_norm": 0.9668723286034735, "learning_rate": 8.957364928259715e-08, "loss": 0.1153, "step": 6042 }, { "epoch": 0.9398133748055988, "grad_norm": 1.210975934418288, "learning_rate": 8.911389911545388e-08, "loss": 0.1235, "step": 6043 }, { "epoch": 0.9399688958009331, "grad_norm": 1.1499265554294855, "learning_rate": 8.86553212453084e-08, "loss": 0.1689, "step": 6044 }, { "epoch": 0.9401244167962675, "grad_norm": 1.296449365417157, "learning_rate": 8.819791578162818e-08, "loss": 0.2152, "step": 6045 }, { "epoch": 0.9402799377916019, "grad_norm": 1.0349805116019808, "learning_rate": 8.774168283360307e-08, "loss": 0.1544, "step": 6046 }, { "epoch": 0.9404354587869362, "grad_norm": 1.0894620853606836, "learning_rate": 8.728662251014208e-08, "loss": 0.1625, "step": 6047 }, { "epoch": 0.9405909797822706, "grad_norm": 0.9820035180287559, "learning_rate": 8.683273491987443e-08, "loss": 0.1307, "step": 6048 }, { "epoch": 0.940746500777605, "grad_norm": 0.961919443142456, "learning_rate": 8.638002017114899e-08, "loss": 0.1057, "step": 6049 }, { "epoch": 0.9409020217729394, "grad_norm": 1.0698156536254062, "learning_rate": 8.592847837203655e-08, "loss": 0.1905, "step": 6050 }, { "epoch": 0.9410575427682737, "grad_norm": 0.9431676032623935, "learning_rate": 8.547810963032533e-08, "loss": 0.146, "step": 6051 }, { "epoch": 0.9412130637636081, "grad_norm": 0.606489279462274, "learning_rate": 8.502891405352375e-08, "loss": 0.0764, "step": 6052 }, { "epoch": 0.9413685847589425, "grad_norm": 1.2776211704254008, "learning_rate": 8.458089174886331e-08, "loss": 0.1925, "step": 6053 }, { "epoch": 0.9415241057542768, "grad_norm": 0.811928972821667, "learning_rate": 8.413404282329118e-08, "loss": 0.1419, "step": 6054 }, { "epoch": 0.9416796267496111, "grad_norm": 0.910515361265882, "learning_rate": 8.368836738347708e-08, "loss": 0.1667, "step": 6055 }, { "epoch": 0.9418351477449456, "grad_norm": 0.9460061529433832, "learning_rate": 8.324386553581032e-08, "loss": 0.1229, "step": 6056 }, { "epoch": 0.9419906687402799, "grad_norm": 0.9075423611684704, "learning_rate": 8.280053738639937e-08, "loss": 0.1437, "step": 6057 }, { "epoch": 0.9421461897356143, "grad_norm": 1.0394672743955962, "learning_rate": 8.235838304107291e-08, "loss": 0.1537, "step": 6058 }, { "epoch": 0.9423017107309487, "grad_norm": 0.6803881274441064, "learning_rate": 8.191740260537929e-08, "loss": 0.0675, "step": 6059 }, { "epoch": 0.942457231726283, "grad_norm": 0.8542553773043837, "learning_rate": 8.147759618458706e-08, "loss": 0.177, "step": 6060 }, { "epoch": 0.9426127527216174, "grad_norm": 1.059574378387779, "learning_rate": 8.103896388368337e-08, "loss": 0.1544, "step": 6061 }, { "epoch": 0.9427682737169518, "grad_norm": 1.1316568249450571, "learning_rate": 8.060150580737614e-08, "loss": 0.1393, "step": 6062 }, { "epoch": 0.9429237947122862, "grad_norm": 1.1064998069567815, "learning_rate": 8.016522206009347e-08, "loss": 0.1282, "step": 6063 }, { "epoch": 0.9430793157076205, "grad_norm": 1.886685419813936, "learning_rate": 7.973011274598153e-08, "loss": 0.1333, "step": 6064 }, { "epoch": 0.9432348367029549, "grad_norm": 0.9811968699761139, "learning_rate": 7.929617796890665e-08, "loss": 0.1023, "step": 6065 }, { "epoch": 0.9433903576982893, "grad_norm": 0.6377493057013237, "learning_rate": 7.886341783245654e-08, "loss": 0.0829, "step": 6066 }, { "epoch": 0.9435458786936236, "grad_norm": 1.263656567800681, "learning_rate": 7.843183243993635e-08, "loss": 0.2038, "step": 6067 }, { "epoch": 0.943701399688958, "grad_norm": 0.8705334779700498, "learning_rate": 7.800142189437033e-08, "loss": 0.1623, "step": 6068 }, { "epoch": 0.9438569206842924, "grad_norm": 1.105712636097588, "learning_rate": 7.757218629850571e-08, "loss": 0.1444, "step": 6069 }, { "epoch": 0.9440124416796267, "grad_norm": 1.0374859309553155, "learning_rate": 7.714412575480556e-08, "loss": 0.1044, "step": 6070 }, { "epoch": 0.9441679626749612, "grad_norm": 2.6869839320689892, "learning_rate": 7.67172403654537e-08, "loss": 0.1024, "step": 6071 }, { "epoch": 0.9443234836702955, "grad_norm": 0.9160351155695365, "learning_rate": 7.629153023235414e-08, "loss": 0.102, "step": 6072 }, { "epoch": 0.9444790046656298, "grad_norm": 1.2179414439806853, "learning_rate": 7.586699545713061e-08, "loss": 0.0885, "step": 6073 }, { "epoch": 0.9446345256609642, "grad_norm": 0.7980270927472214, "learning_rate": 7.544363614112427e-08, "loss": 0.1672, "step": 6074 }, { "epoch": 0.9447900466562986, "grad_norm": 0.8822631689876437, "learning_rate": 7.502145238539705e-08, "loss": 0.1189, "step": 6075 }, { "epoch": 0.944945567651633, "grad_norm": 0.9889286563237285, "learning_rate": 7.46004442907311e-08, "loss": 0.1737, "step": 6076 }, { "epoch": 0.9451010886469673, "grad_norm": 1.277895198423775, "learning_rate": 7.418061195762549e-08, "loss": 0.1346, "step": 6077 }, { "epoch": 0.9452566096423017, "grad_norm": 1.2660022461705556, "learning_rate": 7.376195548630117e-08, "loss": 0.1695, "step": 6078 }, { "epoch": 0.9454121306376361, "grad_norm": 1.1337854527882503, "learning_rate": 7.33444749766965e-08, "loss": 0.1717, "step": 6079 }, { "epoch": 0.9455676516329704, "grad_norm": 1.4973570182473095, "learning_rate": 7.292817052847068e-08, "loss": 0.0743, "step": 6080 }, { "epoch": 0.9457231726283049, "grad_norm": 0.9520239419632378, "learning_rate": 7.25130422410003e-08, "loss": 0.1717, "step": 6081 }, { "epoch": 0.9458786936236392, "grad_norm": 0.9292126097925694, "learning_rate": 7.209909021338335e-08, "loss": 0.1098, "step": 6082 }, { "epoch": 0.9460342146189735, "grad_norm": 1.9582064477096914, "learning_rate": 7.168631454443576e-08, "loss": 0.2291, "step": 6083 }, { "epoch": 0.946189735614308, "grad_norm": 0.8470818348300218, "learning_rate": 7.127471533269259e-08, "loss": 0.1358, "step": 6084 }, { "epoch": 0.9463452566096423, "grad_norm": 0.9838797660877596, "learning_rate": 7.086429267640804e-08, "loss": 0.1274, "step": 6085 }, { "epoch": 0.9465007776049766, "grad_norm": 0.7068655375771491, "learning_rate": 7.045504667355651e-08, "loss": 0.0959, "step": 6086 }, { "epoch": 0.9466562986003111, "grad_norm": 0.9651593515084791, "learning_rate": 7.004697742182986e-08, "loss": 0.1808, "step": 6087 }, { "epoch": 0.9468118195956454, "grad_norm": 1.5237537245713069, "learning_rate": 6.964008501864018e-08, "loss": 0.1642, "step": 6088 }, { "epoch": 0.9469673405909798, "grad_norm": 0.9678590265265065, "learning_rate": 6.923436956111862e-08, "loss": 0.119, "step": 6089 }, { "epoch": 0.9471228615863142, "grad_norm": 0.819809447539096, "learning_rate": 6.882983114611497e-08, "loss": 0.1389, "step": 6090 }, { "epoch": 0.9472783825816485, "grad_norm": 1.1459551875148117, "learning_rate": 6.842646987019808e-08, "loss": 0.1578, "step": 6091 }, { "epoch": 0.9474339035769829, "grad_norm": 1.1258378293263585, "learning_rate": 6.802428582965648e-08, "loss": 0.0899, "step": 6092 }, { "epoch": 0.9475894245723173, "grad_norm": 0.8547070071520748, "learning_rate": 6.762327912049616e-08, "loss": 0.0791, "step": 6093 }, { "epoch": 0.9477449455676517, "grad_norm": 0.9418792069239789, "learning_rate": 6.722344983844387e-08, "loss": 0.1545, "step": 6094 }, { "epoch": 0.947900466562986, "grad_norm": 0.905078794437441, "learning_rate": 6.682479807894381e-08, "loss": 0.0982, "step": 6095 }, { "epoch": 0.9480559875583203, "grad_norm": 1.304093682335632, "learning_rate": 6.64273239371599e-08, "loss": 0.084, "step": 6096 }, { "epoch": 0.9482115085536548, "grad_norm": 0.8920218883916073, "learning_rate": 6.603102750797452e-08, "loss": 0.1307, "step": 6097 }, { "epoch": 0.9483670295489891, "grad_norm": 1.1551060108400917, "learning_rate": 6.563590888599036e-08, "loss": 0.1667, "step": 6098 }, { "epoch": 0.9485225505443234, "grad_norm": 1.0567542355427968, "learning_rate": 6.524196816552641e-08, "loss": 0.1775, "step": 6099 }, { "epoch": 0.9486780715396579, "grad_norm": 1.0152059774082154, "learning_rate": 6.484920544062245e-08, "loss": 0.1328, "step": 6100 }, { "epoch": 0.9486780715396579, "eval_loss": 0.16048476099967957, "eval_runtime": 9.4295, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 6100 }, { "epoch": 0.9488335925349922, "grad_norm": 1.481019065945732, "learning_rate": 6.445762080503626e-08, "loss": 0.1755, "step": 6101 }, { "epoch": 0.9489891135303266, "grad_norm": 0.8967802609679155, "learning_rate": 6.406721435224417e-08, "loss": 0.1375, "step": 6102 }, { "epoch": 0.949144634525661, "grad_norm": 0.8649345574164956, "learning_rate": 6.367798617544219e-08, "loss": 0.1091, "step": 6103 }, { "epoch": 0.9493001555209953, "grad_norm": 1.034401448727601, "learning_rate": 6.328993636754377e-08, "loss": 0.1819, "step": 6104 }, { "epoch": 0.9494556765163297, "grad_norm": 1.2946220855129367, "learning_rate": 6.290306502118316e-08, "loss": 0.1058, "step": 6105 }, { "epoch": 0.9496111975116641, "grad_norm": 1.1247141276931807, "learning_rate": 6.25173722287109e-08, "loss": 0.1076, "step": 6106 }, { "epoch": 0.9497667185069985, "grad_norm": 0.7844372071584512, "learning_rate": 6.213285808219726e-08, "loss": 0.1817, "step": 6107 }, { "epoch": 0.9499222395023328, "grad_norm": 0.8975781066588662, "learning_rate": 6.174952267343215e-08, "loss": 0.1072, "step": 6108 }, { "epoch": 0.9500777604976672, "grad_norm": 1.201532492116242, "learning_rate": 6.136736609392124e-08, "loss": 0.157, "step": 6109 }, { "epoch": 0.9502332814930016, "grad_norm": 1.1396008555108934, "learning_rate": 6.098638843489213e-08, "loss": 0.1114, "step": 6110 }, { "epoch": 0.9503888024883359, "grad_norm": 0.9566214017863358, "learning_rate": 6.060658978728928e-08, "loss": 0.1444, "step": 6111 }, { "epoch": 0.9505443234836704, "grad_norm": 0.9206877741420252, "learning_rate": 6.022797024177518e-08, "loss": 0.1542, "step": 6112 }, { "epoch": 0.9506998444790047, "grad_norm": 1.217063878739373, "learning_rate": 5.985052988873252e-08, "loss": 0.1771, "step": 6113 }, { "epoch": 0.950855365474339, "grad_norm": 0.8957705306627637, "learning_rate": 5.9474268818260905e-08, "loss": 0.1194, "step": 6114 }, { "epoch": 0.9510108864696734, "grad_norm": 1.0676981242216075, "learning_rate": 5.909918712017959e-08, "loss": 0.08, "step": 6115 }, { "epoch": 0.9511664074650078, "grad_norm": 1.1056664311030722, "learning_rate": 5.8725284884025294e-08, "loss": 0.0859, "step": 6116 }, { "epoch": 0.9513219284603421, "grad_norm": 1.0330376959682035, "learning_rate": 5.835256219905438e-08, "loss": 0.0984, "step": 6117 }, { "epoch": 0.9514774494556765, "grad_norm": 0.7993082892256115, "learning_rate": 5.79810191542407e-08, "loss": 0.1562, "step": 6118 }, { "epoch": 0.9516329704510109, "grad_norm": 1.0888368187166595, "learning_rate": 5.7610655838276074e-08, "loss": 0.1068, "step": 6119 }, { "epoch": 0.9517884914463453, "grad_norm": 1.0524933689961635, "learning_rate": 5.7241472339572e-08, "loss": 0.134, "step": 6120 }, { "epoch": 0.9519440124416796, "grad_norm": 1.1513156540205667, "learning_rate": 5.687346874625799e-08, "loss": 0.1666, "step": 6121 }, { "epoch": 0.952099533437014, "grad_norm": 1.102168461688061, "learning_rate": 5.650664514618043e-08, "loss": 0.1479, "step": 6122 }, { "epoch": 0.9522550544323484, "grad_norm": 1.414027094244146, "learning_rate": 5.6141001626907054e-08, "loss": 0.174, "step": 6123 }, { "epoch": 0.9524105754276827, "grad_norm": 1.1033273344727272, "learning_rate": 5.577653827572083e-08, "loss": 0.154, "step": 6124 }, { "epoch": 0.9525660964230172, "grad_norm": 0.7856423815647753, "learning_rate": 5.541325517962437e-08, "loss": 0.1921, "step": 6125 }, { "epoch": 0.9527216174183515, "grad_norm": 0.9810659819265313, "learning_rate": 5.5051152425338875e-08, "loss": 0.1249, "step": 6126 }, { "epoch": 0.9528771384136858, "grad_norm": 0.896722557605337, "learning_rate": 5.469023009930296e-08, "loss": 0.0905, "step": 6127 }, { "epoch": 0.9530326594090203, "grad_norm": 0.9925204289274568, "learning_rate": 5.433048828767329e-08, "loss": 0.2114, "step": 6128 }, { "epoch": 0.9531881804043546, "grad_norm": 0.9828556179644782, "learning_rate": 5.397192707632615e-08, "loss": 0.0833, "step": 6129 }, { "epoch": 0.9533437013996889, "grad_norm": 1.4227120652230405, "learning_rate": 5.361454655085529e-08, "loss": 0.1747, "step": 6130 }, { "epoch": 0.9534992223950234, "grad_norm": 0.9393358702249112, "learning_rate": 5.325834679657138e-08, "loss": 0.1312, "step": 6131 }, { "epoch": 0.9536547433903577, "grad_norm": 1.0502853796854734, "learning_rate": 5.290332789850472e-08, "loss": 0.1124, "step": 6132 }, { "epoch": 0.953810264385692, "grad_norm": 1.261869712036898, "learning_rate": 5.2549489941404187e-08, "loss": 0.197, "step": 6133 }, { "epoch": 0.9539657853810264, "grad_norm": 1.418326047131124, "learning_rate": 5.219683300973444e-08, "loss": 0.2134, "step": 6134 }, { "epoch": 0.9541213063763608, "grad_norm": 0.9865223525416306, "learning_rate": 5.18453571876798e-08, "loss": 0.176, "step": 6135 }, { "epoch": 0.9542768273716952, "grad_norm": 0.7444518530411957, "learning_rate": 5.149506255914316e-08, "loss": 0.0876, "step": 6136 }, { "epoch": 0.9544323483670295, "grad_norm": 0.7946789886712093, "learning_rate": 5.1145949207744294e-08, "loss": 0.077, "step": 6137 }, { "epoch": 0.954587869362364, "grad_norm": 1.0620718409635537, "learning_rate": 5.079801721682154e-08, "loss": 0.1164, "step": 6138 }, { "epoch": 0.9547433903576983, "grad_norm": 1.2085494630292628, "learning_rate": 5.045126666943123e-08, "loss": 0.1609, "step": 6139 }, { "epoch": 0.9548989113530326, "grad_norm": 0.9247233325824317, "learning_rate": 5.0105697648347716e-08, "loss": 0.1213, "step": 6140 }, { "epoch": 0.9550544323483671, "grad_norm": 0.9747472026598205, "learning_rate": 4.976131023606223e-08, "loss": 0.1187, "step": 6141 }, { "epoch": 0.9552099533437014, "grad_norm": 1.368177888940013, "learning_rate": 4.941810451478624e-08, "loss": 0.1396, "step": 6142 }, { "epoch": 0.9553654743390357, "grad_norm": 1.1267444578254817, "learning_rate": 4.9076080566446416e-08, "loss": 0.1416, "step": 6143 }, { "epoch": 0.9555209953343702, "grad_norm": 0.9301841161914337, "learning_rate": 4.873523847268913e-08, "loss": 0.1038, "step": 6144 }, { "epoch": 0.9556765163297045, "grad_norm": 0.731513320809688, "learning_rate": 4.839557831487873e-08, "loss": 0.1575, "step": 6145 }, { "epoch": 0.9558320373250389, "grad_norm": 0.9314967257546614, "learning_rate": 4.8057100174095926e-08, "loss": 0.1437, "step": 6146 }, { "epoch": 0.9559875583203733, "grad_norm": 1.00797077710473, "learning_rate": 4.771980413114052e-08, "loss": 0.1599, "step": 6147 }, { "epoch": 0.9561430793157076, "grad_norm": 0.9361262754244674, "learning_rate": 4.7383690266530335e-08, "loss": 0.1184, "step": 6148 }, { "epoch": 0.956298600311042, "grad_norm": 0.765301411053877, "learning_rate": 4.704875866049952e-08, "loss": 0.1049, "step": 6149 }, { "epoch": 0.9564541213063764, "grad_norm": 1.676641960382829, "learning_rate": 4.671500939300133e-08, "loss": 0.1181, "step": 6150 }, { "epoch": 0.9566096423017107, "grad_norm": 0.8302392068219521, "learning_rate": 4.63824425437065e-08, "loss": 0.1387, "step": 6151 }, { "epoch": 0.9567651632970451, "grad_norm": 0.9803005446845957, "learning_rate": 4.6051058192002615e-08, "loss": 0.1256, "step": 6152 }, { "epoch": 0.9569206842923794, "grad_norm": 1.4778184556763196, "learning_rate": 4.572085641699697e-08, "loss": 0.1524, "step": 6153 }, { "epoch": 0.9570762052877139, "grad_norm": 1.3219839647858083, "learning_rate": 4.539183729751262e-08, "loss": 0.2011, "step": 6154 }, { "epoch": 0.9572317262830482, "grad_norm": 1.0356389553577878, "learning_rate": 4.506400091209118e-08, "loss": 0.1075, "step": 6155 }, { "epoch": 0.9573872472783825, "grad_norm": 1.114340835610953, "learning_rate": 4.473734733899227e-08, "loss": 0.1296, "step": 6156 }, { "epoch": 0.957542768273717, "grad_norm": 0.828911977651467, "learning_rate": 4.441187665619129e-08, "loss": 0.0428, "step": 6157 }, { "epoch": 0.9576982892690513, "grad_norm": 0.8492263261648972, "learning_rate": 4.408758894138387e-08, "loss": 0.1892, "step": 6158 }, { "epoch": 0.9578538102643857, "grad_norm": 1.142616982224722, "learning_rate": 4.3764484271981435e-08, "loss": 0.2257, "step": 6159 }, { "epoch": 0.9580093312597201, "grad_norm": 1.0387175625931329, "learning_rate": 4.344256272511338e-08, "loss": 0.1691, "step": 6160 }, { "epoch": 0.9581648522550544, "grad_norm": 0.8994994101412701, "learning_rate": 4.312182437762769e-08, "loss": 0.1225, "step": 6161 }, { "epoch": 0.9583203732503888, "grad_norm": 0.9561009034016593, "learning_rate": 4.2802269306088105e-08, "loss": 0.1524, "step": 6162 }, { "epoch": 0.9584758942457232, "grad_norm": 1.0065788699666465, "learning_rate": 4.248389758677751e-08, "loss": 0.1007, "step": 6163 }, { "epoch": 0.9586314152410575, "grad_norm": 1.083172333906424, "learning_rate": 4.216670929569622e-08, "loss": 0.1754, "step": 6164 }, { "epoch": 0.9587869362363919, "grad_norm": 1.2868327720083226, "learning_rate": 4.185070450856032e-08, "loss": 0.1793, "step": 6165 }, { "epoch": 0.9589424572317263, "grad_norm": 0.8139214385153428, "learning_rate": 4.1535883300805045e-08, "loss": 0.0919, "step": 6166 }, { "epoch": 0.9590979782270607, "grad_norm": 1.0872479362394658, "learning_rate": 4.122224574758249e-08, "loss": 0.1023, "step": 6167 }, { "epoch": 0.959253499222395, "grad_norm": 1.263047914544462, "learning_rate": 4.090979192376277e-08, "loss": 0.1161, "step": 6168 }, { "epoch": 0.9594090202177294, "grad_norm": 1.0396005757258633, "learning_rate": 4.0598521903931765e-08, "loss": 0.1031, "step": 6169 }, { "epoch": 0.9595645412130638, "grad_norm": 1.1850100629628733, "learning_rate": 4.0288435762396164e-08, "loss": 0.1144, "step": 6170 }, { "epoch": 0.9597200622083981, "grad_norm": 1.2081062284971396, "learning_rate": 3.997953357317563e-08, "loss": 0.1317, "step": 6171 }, { "epoch": 0.9598755832037325, "grad_norm": 0.7283082268446268, "learning_rate": 3.967181541001119e-08, "loss": 0.1352, "step": 6172 }, { "epoch": 0.9600311041990669, "grad_norm": 1.0881718295256737, "learning_rate": 3.936528134635742e-08, "loss": 0.1429, "step": 6173 }, { "epoch": 0.9601866251944012, "grad_norm": 1.464464285475614, "learning_rate": 3.905993145539022e-08, "loss": 0.2096, "step": 6174 }, { "epoch": 0.9603421461897356, "grad_norm": 0.9352290309200556, "learning_rate": 3.875576581000018e-08, "loss": 0.1136, "step": 6175 }, { "epoch": 0.96049766718507, "grad_norm": 0.7501554857608408, "learning_rate": 3.8452784482795324e-08, "loss": 0.1264, "step": 6176 }, { "epoch": 0.9606531881804043, "grad_norm": 0.8618503344337196, "learning_rate": 3.8150987546102246e-08, "loss": 0.1472, "step": 6177 }, { "epoch": 0.9608087091757387, "grad_norm": 1.3695257457827745, "learning_rate": 3.7850375071963875e-08, "loss": 0.1805, "step": 6178 }, { "epoch": 0.9609642301710731, "grad_norm": 0.9213469865329909, "learning_rate": 3.7550947132140584e-08, "loss": 0.1569, "step": 6179 }, { "epoch": 0.9611197511664075, "grad_norm": 1.0545445657985666, "learning_rate": 3.725270379811019e-08, "loss": 0.1945, "step": 6180 }, { "epoch": 0.9612752721617418, "grad_norm": 1.004226335056963, "learning_rate": 3.6955645141066865e-08, "loss": 0.1214, "step": 6181 }, { "epoch": 0.9614307931570762, "grad_norm": 0.9537900757144409, "learning_rate": 3.665977123192333e-08, "loss": 0.1894, "step": 6182 }, { "epoch": 0.9615863141524106, "grad_norm": 1.2264614710617254, "learning_rate": 3.636508214130863e-08, "loss": 0.109, "step": 6183 }, { "epoch": 0.9617418351477449, "grad_norm": 1.2864112856046779, "learning_rate": 3.607157793956928e-08, "loss": 0.1631, "step": 6184 }, { "epoch": 0.9618973561430794, "grad_norm": 1.1565247773945686, "learning_rate": 3.5779258696768126e-08, "loss": 0.1546, "step": 6185 }, { "epoch": 0.9620528771384137, "grad_norm": 0.7454632880596697, "learning_rate": 3.5488124482687125e-08, "loss": 0.0897, "step": 6186 }, { "epoch": 0.962208398133748, "grad_norm": 1.3049306045004603, "learning_rate": 3.5198175366822906e-08, "loss": 0.1911, "step": 6187 }, { "epoch": 0.9623639191290825, "grad_norm": 0.8881463858636415, "learning_rate": 3.490941141839066e-08, "loss": 0.1991, "step": 6188 }, { "epoch": 0.9625194401244168, "grad_norm": 1.24935186607701, "learning_rate": 3.4621832706323023e-08, "loss": 0.1672, "step": 6189 }, { "epoch": 0.9626749611197511, "grad_norm": 1.247183677185879, "learning_rate": 3.4335439299268414e-08, "loss": 0.1025, "step": 6190 }, { "epoch": 0.9628304821150855, "grad_norm": 0.9080215188189259, "learning_rate": 3.40502312655927e-08, "loss": 0.19, "step": 6191 }, { "epoch": 0.9629860031104199, "grad_norm": 1.1022207258221952, "learning_rate": 3.3766208673379764e-08, "loss": 0.1613, "step": 6192 }, { "epoch": 0.9631415241057543, "grad_norm": 0.7491284560831681, "learning_rate": 3.3483371590428695e-08, "loss": 0.077, "step": 6193 }, { "epoch": 0.9632970451010886, "grad_norm": 1.1148877638168753, "learning_rate": 3.320172008425771e-08, "loss": 0.141, "step": 6194 }, { "epoch": 0.963452566096423, "grad_norm": 1.1745029310727895, "learning_rate": 3.2921254222100796e-08, "loss": 0.1142, "step": 6195 }, { "epoch": 0.9636080870917574, "grad_norm": 1.0400302808357298, "learning_rate": 3.2641974070908854e-08, "loss": 0.1275, "step": 6196 }, { "epoch": 0.9637636080870917, "grad_norm": 1.318661904427293, "learning_rate": 3.236387969734967e-08, "loss": 0.1367, "step": 6197 }, { "epoch": 0.9639191290824262, "grad_norm": 1.046935753429947, "learning_rate": 3.208697116780846e-08, "loss": 0.168, "step": 6198 }, { "epoch": 0.9640746500777605, "grad_norm": 1.3813740928452907, "learning_rate": 3.1811248548387354e-08, "loss": 0.1665, "step": 6199 }, { "epoch": 0.9642301710730948, "grad_norm": 1.323485213366165, "learning_rate": 3.1536711904904816e-08, "loss": 0.1169, "step": 6200 }, { "epoch": 0.9642301710730948, "eval_loss": 0.16031643748283386, "eval_runtime": 9.4526, "eval_samples_per_second": 2.751, "eval_steps_per_second": 0.741, "step": 6200 }, { "epoch": 0.9643856920684293, "grad_norm": 0.7964222468782227, "learning_rate": 3.126336130289676e-08, "loss": 0.1296, "step": 6201 }, { "epoch": 0.9645412130637636, "grad_norm": 2.601367728492524, "learning_rate": 3.099119680761598e-08, "loss": 0.2108, "step": 6202 }, { "epoch": 0.964696734059098, "grad_norm": 1.1293683862693211, "learning_rate": 3.072021848403217e-08, "loss": 0.0937, "step": 6203 }, { "epoch": 0.9648522550544324, "grad_norm": 0.8375637849933584, "learning_rate": 3.045042639683082e-08, "loss": 0.0928, "step": 6204 }, { "epoch": 0.9650077760497667, "grad_norm": 0.8557956827882293, "learning_rate": 3.018182061041541e-08, "loss": 0.1289, "step": 6205 }, { "epoch": 0.9651632970451011, "grad_norm": 1.137120235255635, "learning_rate": 2.991440118890632e-08, "loss": 0.1037, "step": 6206 }, { "epoch": 0.9653188180404355, "grad_norm": 0.7436953808664888, "learning_rate": 2.9648168196139693e-08, "loss": 0.0902, "step": 6207 }, { "epoch": 0.9654743390357698, "grad_norm": 1.0089279254249324, "learning_rate": 2.9383121695669147e-08, "loss": 0.1678, "step": 6208 }, { "epoch": 0.9656298600311042, "grad_norm": 1.4716525048901559, "learning_rate": 2.9119261750765183e-08, "loss": 0.1094, "step": 6209 }, { "epoch": 0.9657853810264385, "grad_norm": 1.2169663179021877, "learning_rate": 2.8856588424414632e-08, "loss": 0.157, "step": 6210 }, { "epoch": 0.965940902021773, "grad_norm": 1.065141153109308, "learning_rate": 2.8595101779321787e-08, "loss": 0.113, "step": 6211 }, { "epoch": 0.9660964230171073, "grad_norm": 1.3294895012941623, "learning_rate": 2.8334801877906714e-08, "loss": 0.1218, "step": 6212 }, { "epoch": 0.9662519440124416, "grad_norm": 1.1533401305464612, "learning_rate": 2.807568878230693e-08, "loss": 0.1339, "step": 6213 }, { "epoch": 0.9664074650077761, "grad_norm": 1.2503298979148232, "learning_rate": 2.7817762554375737e-08, "loss": 0.1646, "step": 6214 }, { "epoch": 0.9665629860031104, "grad_norm": 0.9522198548303137, "learning_rate": 2.7561023255684438e-08, "loss": 0.1454, "step": 6215 }, { "epoch": 0.9667185069984447, "grad_norm": 0.821864479790256, "learning_rate": 2.7305470947519562e-08, "loss": 0.1632, "step": 6216 }, { "epoch": 0.9668740279937792, "grad_norm": 1.025703481939874, "learning_rate": 2.7051105690885648e-08, "loss": 0.1997, "step": 6217 }, { "epoch": 0.9670295489891135, "grad_norm": 0.9057902434917936, "learning_rate": 2.679792754650301e-08, "loss": 0.1042, "step": 6218 }, { "epoch": 0.9671850699844479, "grad_norm": 1.108043413056384, "learning_rate": 2.654593657480886e-08, "loss": 0.1204, "step": 6219 }, { "epoch": 0.9673405909797823, "grad_norm": 1.075066725181999, "learning_rate": 2.6295132835956748e-08, "loss": 0.1507, "step": 6220 }, { "epoch": 0.9674961119751166, "grad_norm": 1.1952852000428664, "learning_rate": 2.604551638981767e-08, "loss": 0.1338, "step": 6221 }, { "epoch": 0.967651632970451, "grad_norm": 1.194182400649156, "learning_rate": 2.5797087295977297e-08, "loss": 0.1412, "step": 6222 }, { "epoch": 0.9678071539657854, "grad_norm": 0.7120269334540772, "learning_rate": 2.554984561374041e-08, "loss": 0.1355, "step": 6223 }, { "epoch": 0.9679626749611198, "grad_norm": 1.1834841310870512, "learning_rate": 2.5303791402126465e-08, "loss": 0.1025, "step": 6224 }, { "epoch": 0.9681181959564541, "grad_norm": 1.038551353353866, "learning_rate": 2.5058924719871258e-08, "loss": 0.1029, "step": 6225 }, { "epoch": 0.9682737169517885, "grad_norm": 1.248456306998982, "learning_rate": 2.4815245625429696e-08, "loss": 0.1889, "step": 6226 }, { "epoch": 0.9684292379471229, "grad_norm": 0.9354594613709177, "learning_rate": 2.4572754176969692e-08, "loss": 0.1987, "step": 6227 }, { "epoch": 0.9685847589424572, "grad_norm": 1.591908535395492, "learning_rate": 2.4331450432378833e-08, "loss": 0.1809, "step": 6228 }, { "epoch": 0.9687402799377915, "grad_norm": 1.3857628590629225, "learning_rate": 2.4091334449258263e-08, "loss": 0.1246, "step": 6229 }, { "epoch": 0.968895800933126, "grad_norm": 0.996678876939594, "learning_rate": 2.3852406284927687e-08, "loss": 0.1131, "step": 6230 }, { "epoch": 0.9690513219284603, "grad_norm": 0.8945385105589122, "learning_rate": 2.3614665996422592e-08, "loss": 0.0688, "step": 6231 }, { "epoch": 0.9692068429237947, "grad_norm": 1.5416987668615487, "learning_rate": 2.3378113640494805e-08, "loss": 0.146, "step": 6232 }, { "epoch": 0.9693623639191291, "grad_norm": 1.0827442977756248, "learning_rate": 2.3142749273612487e-08, "loss": 0.1339, "step": 6233 }, { "epoch": 0.9695178849144634, "grad_norm": 1.2597042800864637, "learning_rate": 2.290857295196125e-08, "loss": 0.1453, "step": 6234 }, { "epoch": 0.9696734059097978, "grad_norm": 1.0695874149047513, "learning_rate": 2.2675584731440826e-08, "loss": 0.093, "step": 6235 }, { "epoch": 0.9698289269051322, "grad_norm": 0.7973609732351543, "learning_rate": 2.2443784667670054e-08, "loss": 0.0981, "step": 6236 }, { "epoch": 0.9699844479004666, "grad_norm": 1.091218549826648, "learning_rate": 2.2213172815982454e-08, "loss": 0.1042, "step": 6237 }, { "epoch": 0.9701399688958009, "grad_norm": 1.034374525311031, "learning_rate": 2.1983749231427877e-08, "loss": 0.1757, "step": 6238 }, { "epoch": 0.9702954898911353, "grad_norm": 0.8805677734384761, "learning_rate": 2.175551396877307e-08, "loss": 0.0861, "step": 6239 }, { "epoch": 0.9704510108864697, "grad_norm": 0.7776376212870187, "learning_rate": 2.152846708250167e-08, "loss": 0.1292, "step": 6240 }, { "epoch": 0.970606531881804, "grad_norm": 1.7439553790761453, "learning_rate": 2.1302608626811994e-08, "loss": 0.155, "step": 6241 }, { "epoch": 0.9707620528771385, "grad_norm": 1.4504484839638818, "learning_rate": 2.1077938655619802e-08, "loss": 0.1459, "step": 6242 }, { "epoch": 0.9709175738724728, "grad_norm": 0.9849439127653121, "learning_rate": 2.0854457222557188e-08, "loss": 0.1607, "step": 6243 }, { "epoch": 0.9710730948678071, "grad_norm": 0.8498288769352798, "learning_rate": 2.0632164380972598e-08, "loss": 0.0901, "step": 6244 }, { "epoch": 0.9712286158631416, "grad_norm": 1.11753501037739, "learning_rate": 2.0411060183929687e-08, "loss": 0.1672, "step": 6245 }, { "epoch": 0.9713841368584759, "grad_norm": 1.2666957799375014, "learning_rate": 2.0191144684210128e-08, "loss": 0.1595, "step": 6246 }, { "epoch": 0.9715396578538102, "grad_norm": 1.2061147760570528, "learning_rate": 1.9972417934309706e-08, "loss": 0.1062, "step": 6247 }, { "epoch": 0.9716951788491446, "grad_norm": 1.1508470149955843, "learning_rate": 1.975487998644221e-08, "loss": 0.1365, "step": 6248 }, { "epoch": 0.971850699844479, "grad_norm": 1.286456410849191, "learning_rate": 1.9538530892536655e-08, "loss": 0.1821, "step": 6249 }, { "epoch": 0.9720062208398134, "grad_norm": 1.0286631471857703, "learning_rate": 1.93233707042384e-08, "loss": 0.1089, "step": 6250 }, { "epoch": 0.9721617418351477, "grad_norm": 1.11950285802662, "learning_rate": 1.9109399472910238e-08, "loss": 0.1436, "step": 6251 }, { "epoch": 0.9723172628304821, "grad_norm": 0.8987991419955842, "learning_rate": 1.88966172496291e-08, "loss": 0.1117, "step": 6252 }, { "epoch": 0.9724727838258165, "grad_norm": 1.2820203901617107, "learning_rate": 1.8685024085189352e-08, "loss": 0.0974, "step": 6253 }, { "epoch": 0.9726283048211508, "grad_norm": 0.9828123340924062, "learning_rate": 1.8474620030101142e-08, "loss": 0.1059, "step": 6254 }, { "epoch": 0.9727838258164853, "grad_norm": 0.6598638980180037, "learning_rate": 1.8265405134590963e-08, "loss": 0.0541, "step": 6255 }, { "epoch": 0.9729393468118196, "grad_norm": 1.1959443824063662, "learning_rate": 1.8057379448601086e-08, "loss": 0.1059, "step": 6256 }, { "epoch": 0.9730948678071539, "grad_norm": 0.9326553766808711, "learning_rate": 1.785054302179068e-08, "loss": 0.1309, "step": 6257 }, { "epoch": 0.9732503888024884, "grad_norm": 1.0049743817843433, "learning_rate": 1.7644895903533575e-08, "loss": 0.1614, "step": 6258 }, { "epoch": 0.9734059097978227, "grad_norm": 0.9357796762376644, "learning_rate": 1.7440438142921623e-08, "loss": 0.0803, "step": 6259 }, { "epoch": 0.973561430793157, "grad_norm": 0.8967089920568344, "learning_rate": 1.723716978876133e-08, "loss": 0.1214, "step": 6260 }, { "epoch": 0.9737169517884915, "grad_norm": 0.8991658692357358, "learning_rate": 1.7035090889575556e-08, "loss": 0.0685, "step": 6261 }, { "epoch": 0.9738724727838258, "grad_norm": 1.151710465427922, "learning_rate": 1.6834201493603485e-08, "loss": 0.1922, "step": 6262 }, { "epoch": 0.9740279937791602, "grad_norm": 0.9072939707973526, "learning_rate": 1.6634501648800648e-08, "loss": 0.1485, "step": 6263 }, { "epoch": 0.9741835147744946, "grad_norm": 1.100424040669396, "learning_rate": 1.643599140283725e-08, "loss": 0.2008, "step": 6264 }, { "epoch": 0.9743390357698289, "grad_norm": 1.3641450158691153, "learning_rate": 1.623867080310093e-08, "loss": 0.1228, "step": 6265 }, { "epoch": 0.9744945567651633, "grad_norm": 1.4403238232540572, "learning_rate": 1.6042539896694577e-08, "loss": 0.1861, "step": 6266 }, { "epoch": 0.9746500777604976, "grad_norm": 1.1954705312770386, "learning_rate": 1.584759873043795e-08, "loss": 0.159, "step": 6267 }, { "epoch": 0.9748055987558321, "grad_norm": 0.8911920217648157, "learning_rate": 1.5653847350866057e-08, "loss": 0.1117, "step": 6268 }, { "epoch": 0.9749611197511664, "grad_norm": 0.8828575852846045, "learning_rate": 1.5461285804230232e-08, "loss": 0.0831, "step": 6269 }, { "epoch": 0.9751166407465007, "grad_norm": 1.1269571560965936, "learning_rate": 1.5269914136497033e-08, "loss": 0.1731, "step": 6270 }, { "epoch": 0.9752721617418352, "grad_norm": 0.9855920106424969, "learning_rate": 1.5079732393349366e-08, "loss": 0.1133, "step": 6271 }, { "epoch": 0.9754276827371695, "grad_norm": 1.300630265795775, "learning_rate": 1.4890740620187028e-08, "loss": 0.1549, "step": 6272 }, { "epoch": 0.9755832037325038, "grad_norm": 0.8822711789717151, "learning_rate": 1.4702938862124483e-08, "loss": 0.1487, "step": 6273 }, { "epoch": 0.9757387247278383, "grad_norm": 1.2075893485364713, "learning_rate": 1.4516327163993093e-08, "loss": 0.2083, "step": 6274 }, { "epoch": 0.9758942457231726, "grad_norm": 0.6306452388699071, "learning_rate": 1.4330905570339448e-08, "loss": 0.1383, "step": 6275 }, { "epoch": 0.976049766718507, "grad_norm": 1.2267479745568575, "learning_rate": 1.4146674125426474e-08, "loss": 0.0988, "step": 6276 }, { "epoch": 0.9762052877138414, "grad_norm": 0.8099334083040386, "learning_rate": 1.396363287323177e-08, "loss": 0.1173, "step": 6277 }, { "epoch": 0.9763608087091757, "grad_norm": 1.3381009571088922, "learning_rate": 1.3781781857451492e-08, "loss": 0.1097, "step": 6278 }, { "epoch": 0.9765163297045101, "grad_norm": 0.7693261453960534, "learning_rate": 1.3601121121494809e-08, "loss": 0.1843, "step": 6279 }, { "epoch": 0.9766718506998445, "grad_norm": 0.9366392016992208, "learning_rate": 1.3421650708487777e-08, "loss": 0.1305, "step": 6280 }, { "epoch": 0.9768273716951789, "grad_norm": 1.1195502983712664, "learning_rate": 1.3243370661273347e-08, "loss": 0.1211, "step": 6281 }, { "epoch": 0.9769828926905132, "grad_norm": 0.8612832502587932, "learning_rate": 1.3066281022409145e-08, "loss": 0.0634, "step": 6282 }, { "epoch": 0.9771384136858476, "grad_norm": 1.2692198112978264, "learning_rate": 1.2890381834168575e-08, "loss": 0.1134, "step": 6283 }, { "epoch": 0.977293934681182, "grad_norm": 1.1425367777429847, "learning_rate": 1.271567313854194e-08, "loss": 0.1705, "step": 6284 }, { "epoch": 0.9774494556765163, "grad_norm": 1.3195866320397094, "learning_rate": 1.2542154977234212e-08, "loss": 0.134, "step": 6285 }, { "epoch": 0.9776049766718506, "grad_norm": 1.071507325552433, "learning_rate": 1.23698273916667e-08, "loss": 0.1604, "step": 6286 }, { "epoch": 0.9777604976671851, "grad_norm": 1.0389408417183448, "learning_rate": 1.2198690422975945e-08, "loss": 0.1548, "step": 6287 }, { "epoch": 0.9779160186625194, "grad_norm": 1.0128578883404868, "learning_rate": 1.2028744112015378e-08, "loss": 0.1263, "step": 6288 }, { "epoch": 0.9780715396578538, "grad_norm": 0.8814100259938773, "learning_rate": 1.1859988499353103e-08, "loss": 0.1352, "step": 6289 }, { "epoch": 0.9782270606531882, "grad_norm": 0.988955253653772, "learning_rate": 1.1692423625273563e-08, "loss": 0.1037, "step": 6290 }, { "epoch": 0.9783825816485225, "grad_norm": 0.9755012943315347, "learning_rate": 1.1526049529776984e-08, "loss": 0.1884, "step": 6291 }, { "epoch": 0.9785381026438569, "grad_norm": 1.3064477688090788, "learning_rate": 1.1360866252579372e-08, "loss": 0.1652, "step": 6292 }, { "epoch": 0.9786936236391913, "grad_norm": 1.0864821605220956, "learning_rate": 1.119687383311141e-08, "loss": 0.1537, "step": 6293 }, { "epoch": 0.9788491446345257, "grad_norm": 1.0355943777081922, "learning_rate": 1.1034072310521226e-08, "loss": 0.0977, "step": 6294 }, { "epoch": 0.97900466562986, "grad_norm": 0.9872841812309893, "learning_rate": 1.0872461723671624e-08, "loss": 0.1235, "step": 6295 }, { "epoch": 0.9791601866251944, "grad_norm": 1.1026166511582043, "learning_rate": 1.0712042111141185e-08, "loss": 0.103, "step": 6296 }, { "epoch": 0.9793157076205288, "grad_norm": 1.5990532500579155, "learning_rate": 1.055281351122428e-08, "loss": 0.1655, "step": 6297 }, { "epoch": 0.9794712286158631, "grad_norm": 0.5579872876702263, "learning_rate": 1.0394775961931058e-08, "loss": 0.0884, "step": 6298 }, { "epoch": 0.9796267496111976, "grad_norm": 1.0590397537767569, "learning_rate": 1.0237929500987453e-08, "loss": 0.1151, "step": 6299 }, { "epoch": 0.9797822706065319, "grad_norm": 0.8820420460270124, "learning_rate": 1.0082274165834627e-08, "loss": 0.1498, "step": 6300 }, { "epoch": 0.9797822706065319, "eval_loss": 0.16043274104595184, "eval_runtime": 9.4318, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 6300 }, { "epoch": 0.9799377916018662, "grad_norm": 0.7621829613790806, "learning_rate": 9.927809993629522e-09, "loss": 0.1593, "step": 6301 }, { "epoch": 0.9800933125972007, "grad_norm": 0.919673314374682, "learning_rate": 9.774537021245422e-09, "loss": 0.1642, "step": 6302 }, { "epoch": 0.980248833592535, "grad_norm": 0.9992284747073868, "learning_rate": 9.622455285270838e-09, "loss": 0.2006, "step": 6303 }, { "epoch": 0.9804043545878693, "grad_norm": 1.4249470832606403, "learning_rate": 9.471564822008949e-09, "loss": 0.1843, "step": 6304 }, { "epoch": 0.9805598755832037, "grad_norm": 1.6150348431985972, "learning_rate": 9.321865667479835e-09, "loss": 0.1526, "step": 6305 }, { "epoch": 0.9807153965785381, "grad_norm": 0.8005541187233789, "learning_rate": 9.173357857418797e-09, "loss": 0.1011, "step": 6306 }, { "epoch": 0.9808709175738725, "grad_norm": 1.1482232487159838, "learning_rate": 9.026041427276922e-09, "loss": 0.1897, "step": 6307 }, { "epoch": 0.9810264385692068, "grad_norm": 1.3332669667195762, "learning_rate": 8.879916412220524e-09, "loss": 0.151, "step": 6308 }, { "epoch": 0.9811819595645412, "grad_norm": 1.1534524891197455, "learning_rate": 8.7349828471317e-09, "loss": 0.1519, "step": 6309 }, { "epoch": 0.9813374805598756, "grad_norm": 1.2637372914646152, "learning_rate": 8.591240766607779e-09, "loss": 0.1821, "step": 6310 }, { "epoch": 0.9814930015552099, "grad_norm": 1.5556564629995404, "learning_rate": 8.448690204962973e-09, "loss": 0.2541, "step": 6311 }, { "epoch": 0.9816485225505444, "grad_norm": 1.0111828551400739, "learning_rate": 8.307331196225066e-09, "loss": 0.1523, "step": 6312 }, { "epoch": 0.9818040435458787, "grad_norm": 1.099743235112141, "learning_rate": 8.167163774138731e-09, "loss": 0.1269, "step": 6313 }, { "epoch": 0.981959564541213, "grad_norm": 0.8097770540170601, "learning_rate": 8.02818797216387e-09, "loss": 0.1493, "step": 6314 }, { "epoch": 0.9821150855365475, "grad_norm": 1.2783780738209316, "learning_rate": 7.890403823476168e-09, "loss": 0.1742, "step": 6315 }, { "epoch": 0.9822706065318818, "grad_norm": 1.112583669506246, "learning_rate": 7.75381136096598e-09, "loss": 0.2081, "step": 6316 }, { "epoch": 0.9824261275272161, "grad_norm": 0.9790658435127209, "learning_rate": 7.618410617241112e-09, "loss": 0.1558, "step": 6317 }, { "epoch": 0.9825816485225506, "grad_norm": 1.028549036704973, "learning_rate": 7.484201624622378e-09, "loss": 0.0862, "step": 6318 }, { "epoch": 0.9827371695178849, "grad_norm": 1.178673436768954, "learning_rate": 7.351184415148039e-09, "loss": 0.1684, "step": 6319 }, { "epoch": 0.9828926905132193, "grad_norm": 1.242693138538055, "learning_rate": 7.219359020570471e-09, "loss": 0.1407, "step": 6320 }, { "epoch": 0.9830482115085537, "grad_norm": 1.2503017470543931, "learning_rate": 7.088725472358948e-09, "loss": 0.1376, "step": 6321 }, { "epoch": 0.983203732503888, "grad_norm": 1.4370844896900314, "learning_rate": 6.959283801697414e-09, "loss": 0.1577, "step": 6322 }, { "epoch": 0.9833592534992224, "grad_norm": 1.1198477070028017, "learning_rate": 6.831034039485041e-09, "loss": 0.1919, "step": 6323 }, { "epoch": 0.9835147744945568, "grad_norm": 1.1911317127815277, "learning_rate": 6.703976216336783e-09, "loss": 0.1508, "step": 6324 }, { "epoch": 0.9836702954898912, "grad_norm": 1.2637791163280694, "learning_rate": 6.578110362583379e-09, "loss": 0.2184, "step": 6325 }, { "epoch": 0.9838258164852255, "grad_norm": 1.2277712119737252, "learning_rate": 6.453436508270794e-09, "loss": 0.1897, "step": 6326 }, { "epoch": 0.9839813374805598, "grad_norm": 1.5220589587522342, "learning_rate": 6.329954683160777e-09, "loss": 0.1826, "step": 6327 }, { "epoch": 0.9841368584758943, "grad_norm": 0.9470109101794946, "learning_rate": 6.207664916729194e-09, "loss": 0.124, "step": 6328 }, { "epoch": 0.9842923794712286, "grad_norm": 0.9312647399430644, "learning_rate": 6.0865672381693605e-09, "loss": 0.1429, "step": 6329 }, { "epoch": 0.9844479004665629, "grad_norm": 1.0976162939533385, "learning_rate": 5.966661676388152e-09, "loss": 0.1168, "step": 6330 }, { "epoch": 0.9846034214618974, "grad_norm": 0.9405244801922693, "learning_rate": 5.847948260008784e-09, "loss": 0.1056, "step": 6331 }, { "epoch": 0.9847589424572317, "grad_norm": 0.7426189784270935, "learning_rate": 5.730427017370255e-09, "loss": 0.1095, "step": 6332 }, { "epoch": 0.9849144634525661, "grad_norm": 1.618165637714386, "learning_rate": 5.6140979765267885e-09, "loss": 0.1944, "step": 6333 }, { "epoch": 0.9850699844479005, "grad_norm": 1.253837632394512, "learning_rate": 5.49896116524673e-09, "loss": 0.2029, "step": 6334 }, { "epoch": 0.9852255054432348, "grad_norm": 1.136873503484721, "learning_rate": 5.385016611015315e-09, "loss": 0.1513, "step": 6335 }, { "epoch": 0.9853810264385692, "grad_norm": 0.9449204337278451, "learning_rate": 5.272264341033006e-09, "loss": 0.1798, "step": 6336 }, { "epoch": 0.9855365474339036, "grad_norm": 1.0971005202315647, "learning_rate": 5.160704382215498e-09, "loss": 0.0888, "step": 6337 }, { "epoch": 0.985692068429238, "grad_norm": 0.9768082429342902, "learning_rate": 5.050336761192598e-09, "loss": 0.1228, "step": 6338 }, { "epoch": 0.9858475894245723, "grad_norm": 2.0016798034910805, "learning_rate": 4.941161504311564e-09, "loss": 0.1639, "step": 6339 }, { "epoch": 0.9860031104199067, "grad_norm": 0.9795979650440577, "learning_rate": 4.833178637633773e-09, "loss": 0.1194, "step": 6340 }, { "epoch": 0.9861586314152411, "grad_norm": 0.7996144910651797, "learning_rate": 4.726388186936381e-09, "loss": 0.1826, "step": 6341 }, { "epoch": 0.9863141524105754, "grad_norm": 1.2043509852516918, "learning_rate": 4.620790177711776e-09, "loss": 0.1102, "step": 6342 }, { "epoch": 0.9864696734059099, "grad_norm": 0.8213745199126574, "learning_rate": 4.5163846351675696e-09, "loss": 0.0623, "step": 6343 }, { "epoch": 0.9866251944012442, "grad_norm": 1.4260413273908958, "learning_rate": 4.413171584226605e-09, "loss": 0.1766, "step": 6344 }, { "epoch": 0.9867807153965785, "grad_norm": 1.0346570796243992, "learning_rate": 4.311151049527507e-09, "loss": 0.1157, "step": 6345 }, { "epoch": 0.9869362363919129, "grad_norm": 0.6945913913695937, "learning_rate": 4.210323055424126e-09, "loss": 0.1063, "step": 6346 }, { "epoch": 0.9870917573872473, "grad_norm": 1.3635140322095076, "learning_rate": 4.110687625984988e-09, "loss": 0.1042, "step": 6347 }, { "epoch": 0.9872472783825816, "grad_norm": 0.9012517655918755, "learning_rate": 4.012244784994956e-09, "loss": 0.1621, "step": 6348 }, { "epoch": 0.987402799377916, "grad_norm": 0.6706583454626702, "learning_rate": 3.914994555954121e-09, "loss": 0.1135, "step": 6349 }, { "epoch": 0.9875583203732504, "grad_norm": 0.7566554168880338, "learning_rate": 3.818936962076136e-09, "loss": 0.1442, "step": 6350 }, { "epoch": 0.9877138413685848, "grad_norm": 1.3700075232756939, "learning_rate": 3.7240720262926577e-09, "loss": 0.1899, "step": 6351 }, { "epoch": 0.9878693623639191, "grad_norm": 0.9115873135971353, "learning_rate": 3.6303997712483498e-09, "loss": 0.0772, "step": 6352 }, { "epoch": 0.9880248833592535, "grad_norm": 0.9001857914264059, "learning_rate": 3.537920219304214e-09, "loss": 0.1476, "step": 6353 }, { "epoch": 0.9881804043545879, "grad_norm": 1.4634143067398333, "learning_rate": 3.4466333925370354e-09, "loss": 0.1156, "step": 6354 }, { "epoch": 0.9883359253499222, "grad_norm": 0.9996639136704001, "learning_rate": 3.3565393127377165e-09, "loss": 0.1417, "step": 6355 }, { "epoch": 0.9884914463452567, "grad_norm": 1.1493130335373207, "learning_rate": 3.2676380014129427e-09, "loss": 0.1747, "step": 6356 }, { "epoch": 0.988646967340591, "grad_norm": 1.3619139428323221, "learning_rate": 3.179929479785182e-09, "loss": 0.1809, "step": 6357 }, { "epoch": 0.9888024883359253, "grad_norm": 1.406460001985442, "learning_rate": 3.0934137687910205e-09, "loss": 0.1706, "step": 6358 }, { "epoch": 0.9889580093312598, "grad_norm": 0.9319088929679163, "learning_rate": 3.0080908890833815e-09, "loss": 0.0598, "step": 6359 }, { "epoch": 0.9891135303265941, "grad_norm": 1.4193919594293938, "learning_rate": 2.9239608610298618e-09, "loss": 0.1761, "step": 6360 }, { "epoch": 0.9892690513219284, "grad_norm": 1.1635149617410365, "learning_rate": 2.841023704713841e-09, "loss": 0.129, "step": 6361 }, { "epoch": 0.9894245723172629, "grad_norm": 0.8051984883787606, "learning_rate": 2.759279439933371e-09, "loss": 0.1234, "step": 6362 }, { "epoch": 0.9895800933125972, "grad_norm": 1.127808618730805, "learning_rate": 2.6787280862011766e-09, "loss": 0.1271, "step": 6363 }, { "epoch": 0.9897356143079316, "grad_norm": 1.2329424449379407, "learning_rate": 2.5993696627468758e-09, "loss": 0.1144, "step": 6364 }, { "epoch": 0.9898911353032659, "grad_norm": 0.6759294793649148, "learning_rate": 2.5212041885147585e-09, "loss": 0.1373, "step": 6365 }, { "epoch": 0.9900466562986003, "grad_norm": 1.5127514954919077, "learning_rate": 2.444231682163234e-09, "loss": 0.1724, "step": 6366 }, { "epoch": 0.9902021772939347, "grad_norm": 0.8537555183162342, "learning_rate": 2.3684521620664925e-09, "loss": 0.1386, "step": 6367 }, { "epoch": 0.990357698289269, "grad_norm": 1.2970665235357177, "learning_rate": 2.2938656463150633e-09, "loss": 0.1499, "step": 6368 }, { "epoch": 0.9905132192846035, "grad_norm": 0.7931728811350028, "learning_rate": 2.220472152713038e-09, "loss": 0.1211, "step": 6369 }, { "epoch": 0.9906687402799378, "grad_norm": 1.0017571118927273, "learning_rate": 2.148271698781401e-09, "loss": 0.0903, "step": 6370 }, { "epoch": 0.9908242612752721, "grad_norm": 1.051722678501817, "learning_rate": 2.077264301754145e-09, "loss": 0.1269, "step": 6371 }, { "epoch": 0.9909797822706066, "grad_norm": 1.2123410583079337, "learning_rate": 2.0074499785827096e-09, "loss": 0.1697, "step": 6372 }, { "epoch": 0.9911353032659409, "grad_norm": 1.179736173698801, "learning_rate": 1.9388287459320974e-09, "loss": 0.1168, "step": 6373 }, { "epoch": 0.9912908242612752, "grad_norm": 1.1721055345788074, "learning_rate": 1.87140062018365e-09, "loss": 0.1249, "step": 6374 }, { "epoch": 0.9914463452566097, "grad_norm": 1.5455661759409929, "learning_rate": 1.8051656174333798e-09, "loss": 0.1527, "step": 6375 }, { "epoch": 0.991601866251944, "grad_norm": 1.0928243770882569, "learning_rate": 1.7401237534919735e-09, "loss": 0.1468, "step": 6376 }, { "epoch": 0.9917573872472784, "grad_norm": 1.100409006797048, "learning_rate": 1.676275043886455e-09, "loss": 0.1204, "step": 6377 }, { "epoch": 0.9919129082426128, "grad_norm": 0.989255911260247, "learning_rate": 1.6136195038579661e-09, "loss": 0.1958, "step": 6378 }, { "epoch": 0.9920684292379471, "grad_norm": 0.9099594169349633, "learning_rate": 1.5521571483634313e-09, "loss": 0.1461, "step": 6379 }, { "epoch": 0.9922239502332815, "grad_norm": 1.0753295904891556, "learning_rate": 1.4918879920750029e-09, "loss": 0.1094, "step": 6380 }, { "epoch": 0.9923794712286159, "grad_norm": 1.2567034531232073, "learning_rate": 1.4328120493795061e-09, "loss": 0.1399, "step": 6381 }, { "epoch": 0.9925349922239503, "grad_norm": 1.0750777848390272, "learning_rate": 1.3749293343789937e-09, "loss": 0.1711, "step": 6382 }, { "epoch": 0.9926905132192846, "grad_norm": 1.118362079079034, "learning_rate": 1.3182398608913016e-09, "loss": 0.1532, "step": 6383 }, { "epoch": 0.9928460342146189, "grad_norm": 0.9040309386090117, "learning_rate": 1.2627436424489381e-09, "loss": 0.1178, "step": 6384 }, { "epoch": 0.9930015552099534, "grad_norm": 1.313116453462254, "learning_rate": 1.2084406922990844e-09, "loss": 0.1696, "step": 6385 }, { "epoch": 0.9931570762052877, "grad_norm": 1.0936455247027816, "learning_rate": 1.1553310234052596e-09, "loss": 0.1586, "step": 6386 }, { "epoch": 0.993312597200622, "grad_norm": 0.9451450536474856, "learning_rate": 1.1034146484451003e-09, "loss": 0.1286, "step": 6387 }, { "epoch": 0.9934681181959565, "grad_norm": 0.9741154210440472, "learning_rate": 1.0526915798120263e-09, "loss": 0.1072, "step": 6388 }, { "epoch": 0.9936236391912908, "grad_norm": 0.6567667010672732, "learning_rate": 1.003161829614685e-09, "loss": 0.1233, "step": 6389 }, { "epoch": 0.9937791601866252, "grad_norm": 0.8767591152122706, "learning_rate": 9.548254096752862e-10, "loss": 0.1761, "step": 6390 }, { "epoch": 0.9939346811819596, "grad_norm": 1.7174729531998503, "learning_rate": 9.076823315334882e-10, "loss": 0.1544, "step": 6391 }, { "epoch": 0.9940902021772939, "grad_norm": 1.7154657621148628, "learning_rate": 8.617326064430664e-10, "loss": 0.1852, "step": 6392 }, { "epoch": 0.9942457231726283, "grad_norm": 0.8762237034015482, "learning_rate": 8.169762453719143e-10, "loss": 0.1011, "step": 6393 }, { "epoch": 0.9944012441679627, "grad_norm": 1.0409141682692484, "learning_rate": 7.734132590048182e-10, "loss": 0.1731, "step": 6394 }, { "epoch": 0.994556765163297, "grad_norm": 1.5069741029288812, "learning_rate": 7.310436577406821e-10, "loss": 0.2029, "step": 6395 }, { "epoch": 0.9947122861586314, "grad_norm": 1.1322810831428436, "learning_rate": 6.898674516936377e-10, "loss": 0.2123, "step": 6396 }, { "epoch": 0.9948678071539658, "grad_norm": 1.0049648031612368, "learning_rate": 6.498846506924894e-10, "loss": 0.1169, "step": 6397 }, { "epoch": 0.9950233281493002, "grad_norm": 0.7914926167425006, "learning_rate": 6.110952642829349e-10, "loss": 0.1125, "step": 6398 }, { "epoch": 0.9951788491446345, "grad_norm": 1.1156549527862143, "learning_rate": 5.73499301723679e-10, "loss": 0.097, "step": 6399 }, { "epoch": 0.995334370139969, "grad_norm": 0.8493322416061793, "learning_rate": 5.370967719897646e-10, "loss": 0.1662, "step": 6400 }, { "epoch": 0.995334370139969, "eval_loss": 0.16027696430683136, "eval_runtime": 9.4535, "eval_samples_per_second": 2.75, "eval_steps_per_second": 0.74, "step": 6400 }, { "epoch": 0.9954898911353033, "grad_norm": 1.2829208634591882, "learning_rate": 5.018876837703523e-10, "loss": 0.1207, "step": 6401 }, { "epoch": 0.9956454121306376, "grad_norm": 1.2345979027242246, "learning_rate": 4.678720454709407e-10, "loss": 0.1583, "step": 6402 }, { "epoch": 0.995800933125972, "grad_norm": 0.9222293209766532, "learning_rate": 4.350498652117008e-10, "loss": 0.1548, "step": 6403 }, { "epoch": 0.9959564541213064, "grad_norm": 1.2558380112016958, "learning_rate": 4.0342115082692144e-10, "loss": 0.1554, "step": 6404 }, { "epoch": 0.9961119751166407, "grad_norm": 1.3046318634021232, "learning_rate": 3.729859098677846e-10, "loss": 0.1785, "step": 6405 }, { "epoch": 0.9962674961119751, "grad_norm": 0.9625780548790225, "learning_rate": 3.4374414959903456e-10, "loss": 0.108, "step": 6406 }, { "epoch": 0.9964230171073095, "grad_norm": 1.2696202994447932, "learning_rate": 3.156958770006435e-10, "loss": 0.1979, "step": 6407 }, { "epoch": 0.9965785381026439, "grad_norm": 0.571742346474067, "learning_rate": 2.8884109876947676e-10, "loss": 0.0929, "step": 6408 }, { "epoch": 0.9967340590979782, "grad_norm": 1.0289893306365017, "learning_rate": 2.6317982131485176e-10, "loss": 0.113, "step": 6409 }, { "epoch": 0.9968895800933126, "grad_norm": 1.1282050959156955, "learning_rate": 2.387120507629792e-10, "loss": 0.169, "step": 6410 }, { "epoch": 0.997045101088647, "grad_norm": 1.505028579714464, "learning_rate": 2.1543779295474244e-10, "loss": 0.1723, "step": 6411 }, { "epoch": 0.9972006220839813, "grad_norm": 1.0858929682271188, "learning_rate": 1.9335705344625256e-10, "loss": 0.1091, "step": 6412 }, { "epoch": 0.9973561430793157, "grad_norm": 1.2662906034455852, "learning_rate": 1.7246983750773827e-10, "loss": 0.1553, "step": 6413 }, { "epoch": 0.9975116640746501, "grad_norm": 0.8465704876268566, "learning_rate": 1.527761501257663e-10, "loss": 0.1186, "step": 6414 }, { "epoch": 0.9976671850699844, "grad_norm": 1.2937900605528176, "learning_rate": 1.3427599600157603e-10, "loss": 0.1676, "step": 6415 }, { "epoch": 0.9978227060653189, "grad_norm": 0.8454514688996834, "learning_rate": 1.1696937955107956e-10, "loss": 0.1117, "step": 6416 }, { "epoch": 0.9979782270606532, "grad_norm": 0.8908716009569493, "learning_rate": 1.0085630490597187e-10, "loss": 0.1324, "step": 6417 }, { "epoch": 0.9981337480559875, "grad_norm": 1.0090217064725013, "learning_rate": 8.593677591262062e-11, "loss": 0.1491, "step": 6418 }, { "epoch": 0.998289269051322, "grad_norm": 1.0147431944359704, "learning_rate": 7.221079613206617e-11, "loss": 0.1821, "step": 6419 }, { "epoch": 0.9984447900466563, "grad_norm": 1.155022481904756, "learning_rate": 5.967836884168687e-11, "loss": 0.1519, "step": 6420 }, { "epoch": 0.9986003110419907, "grad_norm": 1.2613723256592264, "learning_rate": 4.833949703242358e-11, "loss": 0.1352, "step": 6421 }, { "epoch": 0.998755832037325, "grad_norm": 0.9284936560088762, "learning_rate": 3.819418341100001e-11, "loss": 0.1277, "step": 6422 }, { "epoch": 0.9989113530326594, "grad_norm": 1.5446204479591665, "learning_rate": 2.924243039992281e-11, "loss": 0.1631, "step": 6423 }, { "epoch": 0.9990668740279938, "grad_norm": 1.1241553005553095, "learning_rate": 2.148424013526107e-11, "loss": 0.1441, "step": 6424 }, { "epoch": 0.9992223950233281, "grad_norm": 0.7542838536816375, "learning_rate": 1.491961446997703e-11, "loss": 0.1218, "step": 6425 }, { "epoch": 0.9993779160186625, "grad_norm": 1.0617817354244825, "learning_rate": 9.548554970040258e-12, "loss": 0.1502, "step": 6426 }, { "epoch": 0.9995334370139969, "grad_norm": 1.10954371674041, "learning_rate": 5.371062918868575e-12, "loss": 0.0986, "step": 6427 }, { "epoch": 0.9996889580093312, "grad_norm": 1.0899286885381663, "learning_rate": 2.3871393123320317e-12, "loss": 0.1188, "step": 6428 }, { "epoch": 0.9998444790046657, "grad_norm": 1.5584912348337274, "learning_rate": 5.967848637489227e-13, "loss": 0.1828, "step": 6429 }, { "epoch": 1.0, "grad_norm": 0.8479386347901516, "learning_rate": 0.0, "loss": 0.0473, "step": 6430 }, { "epoch": 1.0, "step": 6430, "total_flos": 617240494080000.0, "train_loss": 0.17388578839020724, "train_runtime": 34020.6223, "train_samples_per_second": 0.756, "train_steps_per_second": 0.189 } ], "logging_steps": 1, "max_steps": 6430, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 617240494080000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }