{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3110419906687403, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015552099533437013, "grad_norm": 4.662841910859059, "learning_rate": 9.999999403215137e-06, "loss": 0.5705, "step": 1 }, { "epoch": 0.00031104199066874026, "grad_norm": 3.2101601568771754, "learning_rate": 9.999997612860688e-06, "loss": 0.5483, "step": 2 }, { "epoch": 0.00046656298600311044, "grad_norm": 4.156927485367742, "learning_rate": 9.999994628937082e-06, "loss": 0.5274, "step": 3 }, { "epoch": 0.0006220839813374805, "grad_norm": 3.3680995992825533, "learning_rate": 9.99999045144503e-06, "loss": 0.5289, "step": 4 }, { "epoch": 0.0007776049766718507, "grad_norm": 3.3625564866317608, "learning_rate": 9.99998508038553e-06, "loss": 0.379, "step": 5 }, { "epoch": 0.0009331259720062209, "grad_norm": 2.7876669390361055, "learning_rate": 9.999978515759865e-06, "loss": 0.4019, "step": 6 }, { "epoch": 0.001088646967340591, "grad_norm": 2.8774460632164955, "learning_rate": 9.999970757569602e-06, "loss": 0.4947, "step": 7 }, { "epoch": 0.001244167962674961, "grad_norm": 2.126555417939935, "learning_rate": 9.999961805816589e-06, "loss": 0.3555, "step": 8 }, { "epoch": 0.0013996889580093312, "grad_norm": 1.9764001175562893, "learning_rate": 9.999951660502969e-06, "loss": 0.3102, "step": 9 }, { "epoch": 0.0015552099533437014, "grad_norm": 1.122915497849003, "learning_rate": 9.999940321631158e-06, "loss": 0.2802, "step": 10 }, { "epoch": 0.0017107309486780716, "grad_norm": 2.6849893357934436, "learning_rate": 9.99992778920387e-06, "loss": 0.3883, "step": 11 }, { "epoch": 0.0018662519440124418, "grad_norm": 1.5146624591903277, "learning_rate": 9.999914063224088e-06, "loss": 0.2749, "step": 12 }, { "epoch": 0.002021772939346812, "grad_norm": 1.4042567809467454, "learning_rate": 9.999899143695095e-06, "loss": 0.296, "step": 13 }, { "epoch": 0.002177293934681182, "grad_norm": 1.817010561320059, "learning_rate": 9.99988303062045e-06, "loss": 0.3278, "step": 14 }, { "epoch": 0.0023328149300155523, "grad_norm": 1.795433870197738, "learning_rate": 9.999865724003998e-06, "loss": 0.3146, "step": 15 }, { "epoch": 0.002488335925349922, "grad_norm": 1.7826807436565577, "learning_rate": 9.999847223849875e-06, "loss": 0.3233, "step": 16 }, { "epoch": 0.0026438569206842922, "grad_norm": 1.7206284351475924, "learning_rate": 9.999827530162493e-06, "loss": 0.3246, "step": 17 }, { "epoch": 0.0027993779160186624, "grad_norm": 3.8760819362380867, "learning_rate": 9.999806642946554e-06, "loss": 0.2648, "step": 18 }, { "epoch": 0.0029548989113530326, "grad_norm": 1.5644293516985985, "learning_rate": 9.999784562207046e-06, "loss": 0.3096, "step": 19 }, { "epoch": 0.003110419906687403, "grad_norm": 2.0190853150190877, "learning_rate": 9.999761287949237e-06, "loss": 0.307, "step": 20 }, { "epoch": 0.003265940902021773, "grad_norm": 2.763319388592032, "learning_rate": 9.999736820178686e-06, "loss": 0.4327, "step": 21 }, { "epoch": 0.003421461897356143, "grad_norm": 1.6605096033172442, "learning_rate": 9.999711158901231e-06, "loss": 0.3918, "step": 22 }, { "epoch": 0.0035769828926905133, "grad_norm": 1.7508401571856476, "learning_rate": 9.999684304123e-06, "loss": 0.3852, "step": 23 }, { "epoch": 0.0037325038880248835, "grad_norm": 2.0163360210179335, "learning_rate": 9.999656255850401e-06, "loss": 0.3567, "step": 24 }, { "epoch": 0.0038880248833592537, "grad_norm": 1.5224484473221345, "learning_rate": 9.999627014090133e-06, "loss": 0.3185, "step": 25 }, { "epoch": 0.004043545878693624, "grad_norm": 1.5651644136387708, "learning_rate": 9.999596578849173e-06, "loss": 0.2548, "step": 26 }, { "epoch": 0.004199066874027994, "grad_norm": 1.506984699015577, "learning_rate": 9.999564950134788e-06, "loss": 0.2719, "step": 27 }, { "epoch": 0.004354587869362364, "grad_norm": 1.5707101400798584, "learning_rate": 9.99953212795453e-06, "loss": 0.2585, "step": 28 }, { "epoch": 0.004510108864696734, "grad_norm": 1.6678601949561362, "learning_rate": 9.999498112316231e-06, "loss": 0.2642, "step": 29 }, { "epoch": 0.004665629860031105, "grad_norm": 1.1937228959267376, "learning_rate": 9.99946290322801e-06, "loss": 0.3348, "step": 30 }, { "epoch": 0.004821150855365474, "grad_norm": 1.474398491556367, "learning_rate": 9.999426500698277e-06, "loss": 0.2936, "step": 31 }, { "epoch": 0.004976671850699844, "grad_norm": 1.4230321858584387, "learning_rate": 9.999388904735718e-06, "loss": 0.316, "step": 32 }, { "epoch": 0.005132192846034215, "grad_norm": 2.5118600752998645, "learning_rate": 9.999350115349309e-06, "loss": 0.3135, "step": 33 }, { "epoch": 0.0052877138413685845, "grad_norm": 1.7910755988881728, "learning_rate": 9.999310132548308e-06, "loss": 0.249, "step": 34 }, { "epoch": 0.005443234836702955, "grad_norm": 1.4981333944055653, "learning_rate": 9.999268956342261e-06, "loss": 0.2594, "step": 35 }, { "epoch": 0.005598755832037325, "grad_norm": 0.9261919071743852, "learning_rate": 9.999226586740995e-06, "loss": 0.2333, "step": 36 }, { "epoch": 0.0057542768273716955, "grad_norm": 1.26246346078558, "learning_rate": 9.999183023754628e-06, "loss": 0.1787, "step": 37 }, { "epoch": 0.005909797822706065, "grad_norm": 1.9545697787374448, "learning_rate": 9.999138267393557e-06, "loss": 0.3246, "step": 38 }, { "epoch": 0.006065318818040436, "grad_norm": 1.4285410822616305, "learning_rate": 9.999092317668467e-06, "loss": 0.223, "step": 39 }, { "epoch": 0.006220839813374806, "grad_norm": 1.4526856529113084, "learning_rate": 9.999045174590324e-06, "loss": 0.182, "step": 40 }, { "epoch": 0.006376360808709175, "grad_norm": 2.4846217662340995, "learning_rate": 9.998996838170387e-06, "loss": 0.36, "step": 41 }, { "epoch": 0.006531881804043546, "grad_norm": 1.2772759621800358, "learning_rate": 9.998947308420189e-06, "loss": 0.241, "step": 42 }, { "epoch": 0.006687402799377916, "grad_norm": 2.7720889102611945, "learning_rate": 9.998896585351557e-06, "loss": 0.3213, "step": 43 }, { "epoch": 0.006842923794712286, "grad_norm": 1.7490095603308047, "learning_rate": 9.998844668976595e-06, "loss": 0.3155, "step": 44 }, { "epoch": 0.006998444790046656, "grad_norm": 1.3823301922226903, "learning_rate": 9.998791559307702e-06, "loss": 0.2149, "step": 45 }, { "epoch": 0.007153965785381027, "grad_norm": 1.288871141891326, "learning_rate": 9.998737256357551e-06, "loss": 0.2887, "step": 46 }, { "epoch": 0.007309486780715396, "grad_norm": 3.483009451782568, "learning_rate": 9.99868176013911e-06, "loss": 0.263, "step": 47 }, { "epoch": 0.007465007776049767, "grad_norm": 1.652490483156804, "learning_rate": 9.998625070665622e-06, "loss": 0.2664, "step": 48 }, { "epoch": 0.007620528771384137, "grad_norm": 1.8206039592741312, "learning_rate": 9.99856718795062e-06, "loss": 0.224, "step": 49 }, { "epoch": 0.007776049766718507, "grad_norm": 3.2471818448644743, "learning_rate": 9.998508112007925e-06, "loss": 0.293, "step": 50 }, { "epoch": 0.007931570762052876, "grad_norm": 2.4630640416023, "learning_rate": 9.998447842851638e-06, "loss": 0.2958, "step": 51 }, { "epoch": 0.008087091757387248, "grad_norm": 2.1952255314920817, "learning_rate": 9.998386380496144e-06, "loss": 0.2841, "step": 52 }, { "epoch": 0.008242612752721618, "grad_norm": 1.7440998263562653, "learning_rate": 9.998323724956114e-06, "loss": 0.2392, "step": 53 }, { "epoch": 0.008398133748055987, "grad_norm": 1.7713538170023606, "learning_rate": 9.998259876246509e-06, "loss": 0.2148, "step": 54 }, { "epoch": 0.008553654743390357, "grad_norm": 2.196248357816803, "learning_rate": 9.998194834382567e-06, "loss": 0.2314, "step": 55 }, { "epoch": 0.008709175738724729, "grad_norm": 1.5241920091736059, "learning_rate": 9.998128599379817e-06, "loss": 0.3538, "step": 56 }, { "epoch": 0.008864696734059098, "grad_norm": 1.084932443566165, "learning_rate": 9.998061171254068e-06, "loss": 0.2061, "step": 57 }, { "epoch": 0.009020217729393468, "grad_norm": 1.7028355052947441, "learning_rate": 9.997992550021418e-06, "loss": 0.2286, "step": 58 }, { "epoch": 0.009175738724727838, "grad_norm": 1.7850241306158636, "learning_rate": 9.997922735698247e-06, "loss": 0.1935, "step": 59 }, { "epoch": 0.00933125972006221, "grad_norm": 2.7780720350287287, "learning_rate": 9.997851728301219e-06, "loss": 0.2658, "step": 60 }, { "epoch": 0.009486780715396579, "grad_norm": 1.8811033125325856, "learning_rate": 9.997779527847287e-06, "loss": 0.1963, "step": 61 }, { "epoch": 0.009642301710730949, "grad_norm": 1.3758579938738247, "learning_rate": 9.997706134353687e-06, "loss": 0.2529, "step": 62 }, { "epoch": 0.009797822706065318, "grad_norm": 1.9634000227706385, "learning_rate": 9.997631547837934e-06, "loss": 0.2544, "step": 63 }, { "epoch": 0.009953343701399688, "grad_norm": 1.594710372018227, "learning_rate": 9.997555768317838e-06, "loss": 0.3528, "step": 64 }, { "epoch": 0.01010886469673406, "grad_norm": 1.8005547220704254, "learning_rate": 9.997478795811486e-06, "loss": 0.2165, "step": 65 }, { "epoch": 0.01026438569206843, "grad_norm": 2.290269323202059, "learning_rate": 9.997400630337254e-06, "loss": 0.2786, "step": 66 }, { "epoch": 0.0104199066874028, "grad_norm": 1.5486051696063095, "learning_rate": 9.997321271913801e-06, "loss": 0.2188, "step": 67 }, { "epoch": 0.010575427682737169, "grad_norm": 0.9684733219759649, "learning_rate": 9.997240720560068e-06, "loss": 0.2043, "step": 68 }, { "epoch": 0.01073094867807154, "grad_norm": 2.1081587478577437, "learning_rate": 9.997158976295288e-06, "loss": 0.2908, "step": 69 }, { "epoch": 0.01088646967340591, "grad_norm": 3.6233628477076736, "learning_rate": 9.99707603913897e-06, "loss": 0.2579, "step": 70 }, { "epoch": 0.01104199066874028, "grad_norm": 1.090209411261846, "learning_rate": 9.996991909110918e-06, "loss": 0.2864, "step": 71 }, { "epoch": 0.01119751166407465, "grad_norm": 1.3430452010098815, "learning_rate": 9.99690658623121e-06, "loss": 0.2217, "step": 72 }, { "epoch": 0.01135303265940902, "grad_norm": 2.3549515267664005, "learning_rate": 9.996820070520216e-06, "loss": 0.2822, "step": 73 }, { "epoch": 0.011508553654743391, "grad_norm": 1.5602820881890913, "learning_rate": 9.996732361998588e-06, "loss": 0.2456, "step": 74 }, { "epoch": 0.01166407465007776, "grad_norm": 1.5856862134183374, "learning_rate": 9.996643460687264e-06, "loss": 0.3056, "step": 75 }, { "epoch": 0.01181959564541213, "grad_norm": 1.6134033436501471, "learning_rate": 9.996553366607464e-06, "loss": 0.2141, "step": 76 }, { "epoch": 0.0119751166407465, "grad_norm": 1.3597955630988308, "learning_rate": 9.996462079780696e-06, "loss": 0.2295, "step": 77 }, { "epoch": 0.012130637636080872, "grad_norm": 1.1374281802105086, "learning_rate": 9.996369600228753e-06, "loss": 0.2487, "step": 78 }, { "epoch": 0.012286158631415241, "grad_norm": 1.4298077500438133, "learning_rate": 9.99627592797371e-06, "loss": 0.2446, "step": 79 }, { "epoch": 0.012441679626749611, "grad_norm": 1.3975983522660094, "learning_rate": 9.996181063037924e-06, "loss": 0.2611, "step": 80 }, { "epoch": 0.012597200622083981, "grad_norm": 1.5544782250742402, "learning_rate": 9.996085005444046e-06, "loss": 0.2311, "step": 81 }, { "epoch": 0.01275272161741835, "grad_norm": 1.3603452791878323, "learning_rate": 9.995987755215006e-06, "loss": 0.2003, "step": 82 }, { "epoch": 0.012908242612752722, "grad_norm": 1.3071118505273163, "learning_rate": 9.995889312374016e-06, "loss": 0.2338, "step": 83 }, { "epoch": 0.013063763608087092, "grad_norm": 1.7380116089178919, "learning_rate": 9.995789676944576e-06, "loss": 0.2645, "step": 84 }, { "epoch": 0.013219284603421462, "grad_norm": 1.263086313797395, "learning_rate": 9.995688848950473e-06, "loss": 0.2215, "step": 85 }, { "epoch": 0.013374805598755831, "grad_norm": 1.5100086739523095, "learning_rate": 9.995586828415774e-06, "loss": 0.2444, "step": 86 }, { "epoch": 0.013530326594090203, "grad_norm": 1.0847005275250092, "learning_rate": 9.995483615364833e-06, "loss": 0.2129, "step": 87 }, { "epoch": 0.013685847589424573, "grad_norm": 1.3155329082198164, "learning_rate": 9.995379209822289e-06, "loss": 0.2788, "step": 88 }, { "epoch": 0.013841368584758942, "grad_norm": 1.8214452427995387, "learning_rate": 9.995273611813065e-06, "loss": 0.3027, "step": 89 }, { "epoch": 0.013996889580093312, "grad_norm": 0.8312908694387112, "learning_rate": 9.995166821362368e-06, "loss": 0.226, "step": 90 }, { "epoch": 0.014152410575427682, "grad_norm": 1.6627232520442479, "learning_rate": 9.995058838495689e-06, "loss": 0.2742, "step": 91 }, { "epoch": 0.014307931570762053, "grad_norm": 0.9378761990044046, "learning_rate": 9.994949663238809e-06, "loss": 0.267, "step": 92 }, { "epoch": 0.014463452566096423, "grad_norm": 2.122534441012584, "learning_rate": 9.994839295617786e-06, "loss": 0.2438, "step": 93 }, { "epoch": 0.014618973561430793, "grad_norm": 1.577456726662404, "learning_rate": 9.994727735658968e-06, "loss": 0.2659, "step": 94 }, { "epoch": 0.014774494556765163, "grad_norm": 1.6054087269070507, "learning_rate": 9.994614983388986e-06, "loss": 0.2404, "step": 95 }, { "epoch": 0.014930015552099534, "grad_norm": 1.5558443339273214, "learning_rate": 9.994501038834755e-06, "loss": 0.2703, "step": 96 }, { "epoch": 0.015085536547433904, "grad_norm": 1.7101494645663162, "learning_rate": 9.994385902023474e-06, "loss": 0.2148, "step": 97 }, { "epoch": 0.015241057542768274, "grad_norm": 1.5422168422798725, "learning_rate": 9.99426957298263e-06, "loss": 0.2045, "step": 98 }, { "epoch": 0.015396578538102643, "grad_norm": 1.3177834617215995, "learning_rate": 9.994152051739991e-06, "loss": 0.2097, "step": 99 }, { "epoch": 0.015552099533437015, "grad_norm": 1.7250496500116883, "learning_rate": 9.994033338323612e-06, "loss": 0.2309, "step": 100 }, { "epoch": 0.015552099533437015, "eval_loss": 0.255514532327652, "eval_runtime": 9.4404, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 100 }, { "epoch": 0.015707620528771383, "grad_norm": 1.2242797466034228, "learning_rate": 9.993913432761831e-06, "loss": 0.2309, "step": 101 }, { "epoch": 0.015863141524105753, "grad_norm": 1.2091727169379591, "learning_rate": 9.993792335083272e-06, "loss": 0.215, "step": 102 }, { "epoch": 0.016018662519440126, "grad_norm": 1.6991288183534923, "learning_rate": 9.99367004531684e-06, "loss": 0.2716, "step": 103 }, { "epoch": 0.016174183514774496, "grad_norm": 1.8626540300463013, "learning_rate": 9.99354656349173e-06, "loss": 0.287, "step": 104 }, { "epoch": 0.016329704510108865, "grad_norm": 1.2626220604624867, "learning_rate": 9.993421889637418e-06, "loss": 0.1737, "step": 105 }, { "epoch": 0.016485225505443235, "grad_norm": 0.8807151838598477, "learning_rate": 9.993296023783664e-06, "loss": 0.227, "step": 106 }, { "epoch": 0.016640746500777605, "grad_norm": 1.4662006360846318, "learning_rate": 9.993168965960515e-06, "loss": 0.2698, "step": 107 }, { "epoch": 0.016796267496111975, "grad_norm": 2.5676508719496383, "learning_rate": 9.993040716198304e-06, "loss": 0.2231, "step": 108 }, { "epoch": 0.016951788491446344, "grad_norm": 2.144008184181988, "learning_rate": 9.992911274527641e-06, "loss": 0.2729, "step": 109 }, { "epoch": 0.017107309486780714, "grad_norm": 1.3871826576036752, "learning_rate": 9.99278064097943e-06, "loss": 0.2078, "step": 110 }, { "epoch": 0.017262830482115084, "grad_norm": 1.9299054218636398, "learning_rate": 9.992648815584853e-06, "loss": 0.2543, "step": 111 }, { "epoch": 0.017418351477449457, "grad_norm": 6.182669074382352, "learning_rate": 9.992515798375379e-06, "loss": 0.2442, "step": 112 }, { "epoch": 0.017573872472783827, "grad_norm": 1.9218049477099652, "learning_rate": 9.992381589382761e-06, "loss": 0.2909, "step": 113 }, { "epoch": 0.017729393468118197, "grad_norm": 1.7558505152868706, "learning_rate": 9.992246188639035e-06, "loss": 0.2182, "step": 114 }, { "epoch": 0.017884914463452566, "grad_norm": 1.3145893008937046, "learning_rate": 9.992109596176525e-06, "loss": 0.2445, "step": 115 }, { "epoch": 0.018040435458786936, "grad_norm": 2.3756692802265094, "learning_rate": 9.991971812027836e-06, "loss": 0.2961, "step": 116 }, { "epoch": 0.018195956454121306, "grad_norm": 2.027933705938777, "learning_rate": 9.991832836225863e-06, "loss": 0.2459, "step": 117 }, { "epoch": 0.018351477449455676, "grad_norm": 1.9997556478308784, "learning_rate": 9.991692668803775e-06, "loss": 0.2108, "step": 118 }, { "epoch": 0.018506998444790045, "grad_norm": 1.39831187226532, "learning_rate": 9.991551309795038e-06, "loss": 0.1902, "step": 119 }, { "epoch": 0.01866251944012442, "grad_norm": 1.6377700259822823, "learning_rate": 9.991408759233394e-06, "loss": 0.2491, "step": 120 }, { "epoch": 0.018818040435458788, "grad_norm": 2.09576564356888, "learning_rate": 9.991265017152869e-06, "loss": 0.2526, "step": 121 }, { "epoch": 0.018973561430793158, "grad_norm": 2.031216743667695, "learning_rate": 9.991120083587779e-06, "loss": 0.2418, "step": 122 }, { "epoch": 0.019129082426127528, "grad_norm": 1.9897151692182136, "learning_rate": 9.990973958572723e-06, "loss": 0.2786, "step": 123 }, { "epoch": 0.019284603421461897, "grad_norm": 1.7503968375792016, "learning_rate": 9.990826642142581e-06, "loss": 0.3231, "step": 124 }, { "epoch": 0.019440124416796267, "grad_norm": 0.8307156104752434, "learning_rate": 9.990678134332521e-06, "loss": 0.2058, "step": 125 }, { "epoch": 0.019595645412130637, "grad_norm": 2.105265419067902, "learning_rate": 9.990528435177992e-06, "loss": 0.2665, "step": 126 }, { "epoch": 0.019751166407465007, "grad_norm": 0.845573052530141, "learning_rate": 9.99037754471473e-06, "loss": 0.1706, "step": 127 }, { "epoch": 0.019906687402799376, "grad_norm": 1.3561288374051286, "learning_rate": 9.990225462978756e-06, "loss": 0.2834, "step": 128 }, { "epoch": 0.02006220839813375, "grad_norm": 1.4639985615099256, "learning_rate": 9.990072190006371e-06, "loss": 0.2775, "step": 129 }, { "epoch": 0.02021772939346812, "grad_norm": 1.424715750901468, "learning_rate": 9.989917725834166e-06, "loss": 0.2331, "step": 130 }, { "epoch": 0.02037325038880249, "grad_norm": 1.4908712495988423, "learning_rate": 9.989762070499015e-06, "loss": 0.2326, "step": 131 }, { "epoch": 0.02052877138413686, "grad_norm": 1.9371986234951772, "learning_rate": 9.98960522403807e-06, "loss": 0.248, "step": 132 }, { "epoch": 0.02068429237947123, "grad_norm": 1.7802271420639102, "learning_rate": 9.989447186488777e-06, "loss": 0.2881, "step": 133 }, { "epoch": 0.0208398133748056, "grad_norm": 1.1250396512690675, "learning_rate": 9.98928795788886e-06, "loss": 0.2309, "step": 134 }, { "epoch": 0.020995334370139968, "grad_norm": 1.6801724252117862, "learning_rate": 9.989127538276329e-06, "loss": 0.2292, "step": 135 }, { "epoch": 0.021150855365474338, "grad_norm": 1.1771299351260398, "learning_rate": 9.98896592768948e-06, "loss": 0.1553, "step": 136 }, { "epoch": 0.021306376360808708, "grad_norm": 2.1842202518230645, "learning_rate": 9.988803126166889e-06, "loss": 0.3029, "step": 137 }, { "epoch": 0.02146189735614308, "grad_norm": 1.3745547142156036, "learning_rate": 9.988639133747422e-06, "loss": 0.1702, "step": 138 }, { "epoch": 0.02161741835147745, "grad_norm": 1.8504088238591443, "learning_rate": 9.988473950470223e-06, "loss": 0.2318, "step": 139 }, { "epoch": 0.02177293934681182, "grad_norm": 1.7870069125473158, "learning_rate": 9.988307576374727e-06, "loss": 0.2008, "step": 140 }, { "epoch": 0.02192846034214619, "grad_norm": 2.3953898044564883, "learning_rate": 9.988140011500647e-06, "loss": 0.2007, "step": 141 }, { "epoch": 0.02208398133748056, "grad_norm": 1.1845465157973594, "learning_rate": 9.987971255887985e-06, "loss": 0.2334, "step": 142 }, { "epoch": 0.02223950233281493, "grad_norm": 1.747163885973197, "learning_rate": 9.987801309577026e-06, "loss": 0.2559, "step": 143 }, { "epoch": 0.0223950233281493, "grad_norm": 1.6909380164686145, "learning_rate": 9.987630172608333e-06, "loss": 0.2819, "step": 144 }, { "epoch": 0.02255054432348367, "grad_norm": 1.6459040836915735, "learning_rate": 9.987457845022767e-06, "loss": 0.2283, "step": 145 }, { "epoch": 0.02270606531881804, "grad_norm": 1.0639213494130906, "learning_rate": 9.987284326861459e-06, "loss": 0.2947, "step": 146 }, { "epoch": 0.022861586314152412, "grad_norm": 1.423659630662775, "learning_rate": 9.987109618165832e-06, "loss": 0.1895, "step": 147 }, { "epoch": 0.023017107309486782, "grad_norm": 2.1171729246911966, "learning_rate": 9.986933718977591e-06, "loss": 0.1967, "step": 148 }, { "epoch": 0.02317262830482115, "grad_norm": 1.4659656443481106, "learning_rate": 9.986756629338728e-06, "loss": 0.1553, "step": 149 }, { "epoch": 0.02332814930015552, "grad_norm": 3.3524464413937762, "learning_rate": 9.986578349291514e-06, "loss": 0.2472, "step": 150 }, { "epoch": 0.02348367029548989, "grad_norm": 1.4421209559287633, "learning_rate": 9.986398878878507e-06, "loss": 0.1791, "step": 151 }, { "epoch": 0.02363919129082426, "grad_norm": 1.7313564339261944, "learning_rate": 9.98621821814255e-06, "loss": 0.2238, "step": 152 }, { "epoch": 0.02379471228615863, "grad_norm": 1.7017996756379121, "learning_rate": 9.986036367126769e-06, "loss": 0.2007, "step": 153 }, { "epoch": 0.023950233281493, "grad_norm": 1.515471002124247, "learning_rate": 9.985853325874575e-06, "loss": 0.2688, "step": 154 }, { "epoch": 0.02410575427682737, "grad_norm": 0.8049651881516254, "learning_rate": 9.985669094429662e-06, "loss": 0.1865, "step": 155 }, { "epoch": 0.024261275272161743, "grad_norm": 1.2861650933813724, "learning_rate": 9.985483672836007e-06, "loss": 0.2403, "step": 156 }, { "epoch": 0.024416796267496113, "grad_norm": 2.173379700965189, "learning_rate": 9.985297061137877e-06, "loss": 0.2045, "step": 157 }, { "epoch": 0.024572317262830483, "grad_norm": 1.5915407935889336, "learning_rate": 9.985109259379813e-06, "loss": 0.2063, "step": 158 }, { "epoch": 0.024727838258164853, "grad_norm": 1.877271886192633, "learning_rate": 9.98492026760665e-06, "loss": 0.226, "step": 159 }, { "epoch": 0.024883359253499222, "grad_norm": 1.590999803444347, "learning_rate": 9.984730085863504e-06, "loss": 0.2243, "step": 160 }, { "epoch": 0.025038880248833592, "grad_norm": 2.2602490405621016, "learning_rate": 9.98453871419577e-06, "loss": 0.2599, "step": 161 }, { "epoch": 0.025194401244167962, "grad_norm": 1.8247502790432317, "learning_rate": 9.984346152649135e-06, "loss": 0.2575, "step": 162 }, { "epoch": 0.02534992223950233, "grad_norm": 1.6317702406563646, "learning_rate": 9.984152401269562e-06, "loss": 0.2513, "step": 163 }, { "epoch": 0.0255054432348367, "grad_norm": 1.479820350518653, "learning_rate": 9.983957460103307e-06, "loss": 0.2134, "step": 164 }, { "epoch": 0.025660964230171075, "grad_norm": 2.2204278110409716, "learning_rate": 9.9837613291969e-06, "loss": 0.2288, "step": 165 }, { "epoch": 0.025816485225505444, "grad_norm": 1.8249773963334357, "learning_rate": 9.983564008597164e-06, "loss": 0.2342, "step": 166 }, { "epoch": 0.025972006220839814, "grad_norm": 1.892476010698033, "learning_rate": 9.9833654983512e-06, "loss": 0.2263, "step": 167 }, { "epoch": 0.026127527216174184, "grad_norm": 1.593847254715758, "learning_rate": 9.983165798506398e-06, "loss": 0.2163, "step": 168 }, { "epoch": 0.026283048211508554, "grad_norm": 1.7653992228114257, "learning_rate": 9.982964909110426e-06, "loss": 0.2938, "step": 169 }, { "epoch": 0.026438569206842923, "grad_norm": 1.3352350617943483, "learning_rate": 9.982762830211239e-06, "loss": 0.2069, "step": 170 }, { "epoch": 0.026594090202177293, "grad_norm": 1.6623662216358996, "learning_rate": 9.982559561857079e-06, "loss": 0.213, "step": 171 }, { "epoch": 0.026749611197511663, "grad_norm": 1.1923151136153478, "learning_rate": 9.982355104096468e-06, "loss": 0.2068, "step": 172 }, { "epoch": 0.026905132192846033, "grad_norm": 1.5009321240819553, "learning_rate": 9.98214945697821e-06, "loss": 0.3292, "step": 173 }, { "epoch": 0.027060653188180406, "grad_norm": 1.6168504596283289, "learning_rate": 9.981942620551399e-06, "loss": 0.2001, "step": 174 }, { "epoch": 0.027216174183514776, "grad_norm": 1.0410735731938325, "learning_rate": 9.98173459486541e-06, "loss": 0.2697, "step": 175 }, { "epoch": 0.027371695178849145, "grad_norm": 1.477725722611291, "learning_rate": 9.9815253799699e-06, "loss": 0.1796, "step": 176 }, { "epoch": 0.027527216174183515, "grad_norm": 1.5159741098115525, "learning_rate": 9.981314975914811e-06, "loss": 0.2203, "step": 177 }, { "epoch": 0.027682737169517885, "grad_norm": 0.8954975243967727, "learning_rate": 9.981103382750372e-06, "loss": 0.2662, "step": 178 }, { "epoch": 0.027838258164852255, "grad_norm": 1.418625218985406, "learning_rate": 9.980890600527092e-06, "loss": 0.2484, "step": 179 }, { "epoch": 0.027993779160186624, "grad_norm": 1.4411516373436362, "learning_rate": 9.980676629295763e-06, "loss": 0.302, "step": 180 }, { "epoch": 0.028149300155520994, "grad_norm": 0.9480510792156464, "learning_rate": 9.980461469107463e-06, "loss": 0.2075, "step": 181 }, { "epoch": 0.028304821150855364, "grad_norm": 2.081864475923441, "learning_rate": 9.980245120013558e-06, "loss": 0.2942, "step": 182 }, { "epoch": 0.028460342146189737, "grad_norm": 1.2615838373847896, "learning_rate": 9.980027582065691e-06, "loss": 0.2018, "step": 183 }, { "epoch": 0.028615863141524107, "grad_norm": 1.2086223544731691, "learning_rate": 9.979808855315792e-06, "loss": 0.2743, "step": 184 }, { "epoch": 0.028771384136858476, "grad_norm": 0.9412206342605678, "learning_rate": 9.979588939816071e-06, "loss": 0.2318, "step": 185 }, { "epoch": 0.028926905132192846, "grad_norm": 1.365479987499767, "learning_rate": 9.979367835619029e-06, "loss": 0.2813, "step": 186 }, { "epoch": 0.029082426127527216, "grad_norm": 1.1385427599520912, "learning_rate": 9.979145542777444e-06, "loss": 0.2627, "step": 187 }, { "epoch": 0.029237947122861586, "grad_norm": 1.560448582637042, "learning_rate": 9.97892206134438e-06, "loss": 0.2042, "step": 188 }, { "epoch": 0.029393468118195955, "grad_norm": 1.9585068672638826, "learning_rate": 9.97869739137319e-06, "loss": 0.2647, "step": 189 }, { "epoch": 0.029548989113530325, "grad_norm": 1.612253014357388, "learning_rate": 9.9784715329175e-06, "loss": 0.2573, "step": 190 }, { "epoch": 0.0297045101088647, "grad_norm": 1.2782552177366555, "learning_rate": 9.978244486031228e-06, "loss": 0.1914, "step": 191 }, { "epoch": 0.029860031104199068, "grad_norm": 2.1188620010348163, "learning_rate": 9.978016250768573e-06, "loss": 0.245, "step": 192 }, { "epoch": 0.030015552099533438, "grad_norm": 1.9777647488169638, "learning_rate": 9.977786827184019e-06, "loss": 0.2774, "step": 193 }, { "epoch": 0.030171073094867808, "grad_norm": 2.0157801185629407, "learning_rate": 9.977556215332332e-06, "loss": 0.297, "step": 194 }, { "epoch": 0.030326594090202177, "grad_norm": 0.8845310906810993, "learning_rate": 9.97732441526856e-06, "loss": 0.1756, "step": 195 }, { "epoch": 0.030482115085536547, "grad_norm": 1.2647941053184737, "learning_rate": 9.97709142704804e-06, "loss": 0.1773, "step": 196 }, { "epoch": 0.030637636080870917, "grad_norm": 1.1823797462719756, "learning_rate": 9.976857250726389e-06, "loss": 0.2501, "step": 197 }, { "epoch": 0.030793157076205287, "grad_norm": 1.643272741263538, "learning_rate": 9.976621886359506e-06, "loss": 0.2794, "step": 198 }, { "epoch": 0.030948678071539656, "grad_norm": 1.6415813649465196, "learning_rate": 9.976385334003577e-06, "loss": 0.2562, "step": 199 }, { "epoch": 0.03110419906687403, "grad_norm": 1.4019518238717095, "learning_rate": 9.976147593715074e-06, "loss": 0.2066, "step": 200 }, { "epoch": 0.03110419906687403, "eval_loss": 0.2431146204471588, "eval_runtime": 9.4441, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 200 }, { "epoch": 0.031259720062208396, "grad_norm": 1.5261705881041825, "learning_rate": 9.975908665550742e-06, "loss": 0.168, "step": 201 }, { "epoch": 0.031415241057542766, "grad_norm": 1.3552305394454693, "learning_rate": 9.975668549567623e-06, "loss": 0.2513, "step": 202 }, { "epoch": 0.031570762052877135, "grad_norm": 1.09704983539552, "learning_rate": 9.97542724582303e-06, "loss": 0.1877, "step": 203 }, { "epoch": 0.031726283048211505, "grad_norm": 1.8452203060592092, "learning_rate": 9.975184754374572e-06, "loss": 0.3442, "step": 204 }, { "epoch": 0.03188180404354588, "grad_norm": 1.4512649025391702, "learning_rate": 9.974941075280128e-06, "loss": 0.2172, "step": 205 }, { "epoch": 0.03203732503888025, "grad_norm": 1.5376722263850107, "learning_rate": 9.974696208597874e-06, "loss": 0.2206, "step": 206 }, { "epoch": 0.03219284603421462, "grad_norm": 1.6097488768932668, "learning_rate": 9.97445015438626e-06, "loss": 0.2134, "step": 207 }, { "epoch": 0.03234836702954899, "grad_norm": 1.2381378734127797, "learning_rate": 9.974202912704022e-06, "loss": 0.2026, "step": 208 }, { "epoch": 0.03250388802488336, "grad_norm": 2.0110329862327663, "learning_rate": 9.973954483610184e-06, "loss": 0.2117, "step": 209 }, { "epoch": 0.03265940902021773, "grad_norm": 4.938465463538487, "learning_rate": 9.973704867164044e-06, "loss": 0.2787, "step": 210 }, { "epoch": 0.0328149300155521, "grad_norm": 1.9318587506840115, "learning_rate": 9.973454063425191e-06, "loss": 0.2901, "step": 211 }, { "epoch": 0.03297045101088647, "grad_norm": 1.5730776773238022, "learning_rate": 9.973202072453498e-06, "loss": 0.3557, "step": 212 }, { "epoch": 0.03312597200622084, "grad_norm": 2.333406801079277, "learning_rate": 9.972948894309116e-06, "loss": 0.2553, "step": 213 }, { "epoch": 0.03328149300155521, "grad_norm": 1.2613725609366824, "learning_rate": 9.972694529052482e-06, "loss": 0.2721, "step": 214 }, { "epoch": 0.03343701399688958, "grad_norm": 1.233807021429561, "learning_rate": 9.972438976744317e-06, "loss": 0.194, "step": 215 }, { "epoch": 0.03359253499222395, "grad_norm": 1.0922019141763, "learning_rate": 9.972182237445624e-06, "loss": 0.2625, "step": 216 }, { "epoch": 0.03374805598755832, "grad_norm": 1.5332376003824164, "learning_rate": 9.971924311217693e-06, "loss": 0.2369, "step": 217 }, { "epoch": 0.03390357698289269, "grad_norm": 2.1386234582292856, "learning_rate": 9.971665198122093e-06, "loss": 0.2691, "step": 218 }, { "epoch": 0.03405909797822706, "grad_norm": 1.4374027394103162, "learning_rate": 9.97140489822068e-06, "loss": 0.2217, "step": 219 }, { "epoch": 0.03421461897356143, "grad_norm": 1.7261766376116665, "learning_rate": 9.971143411575585e-06, "loss": 0.3063, "step": 220 }, { "epoch": 0.0343701399688958, "grad_norm": 1.5632670578977363, "learning_rate": 9.970880738249236e-06, "loss": 0.2333, "step": 221 }, { "epoch": 0.03452566096423017, "grad_norm": 1.6709935682257062, "learning_rate": 9.97061687830433e-06, "loss": 0.2808, "step": 222 }, { "epoch": 0.034681181959564544, "grad_norm": 1.7747486994884278, "learning_rate": 9.970351831803862e-06, "loss": 0.3182, "step": 223 }, { "epoch": 0.034836702954898914, "grad_norm": 1.2079739996818415, "learning_rate": 9.970085598811094e-06, "loss": 0.2426, "step": 224 }, { "epoch": 0.034992223950233284, "grad_norm": 2.269795435480081, "learning_rate": 9.969818179389586e-06, "loss": 0.1933, "step": 225 }, { "epoch": 0.035147744945567654, "grad_norm": 1.28324330975912, "learning_rate": 9.96954957360317e-06, "loss": 0.2078, "step": 226 }, { "epoch": 0.03530326594090202, "grad_norm": 3.0240429569891147, "learning_rate": 9.969279781515967e-06, "loss": 0.2865, "step": 227 }, { "epoch": 0.03545878693623639, "grad_norm": 1.4022531253860526, "learning_rate": 9.969008803192385e-06, "loss": 0.189, "step": 228 }, { "epoch": 0.03561430793157076, "grad_norm": 1.4481645110880101, "learning_rate": 9.968736638697105e-06, "loss": 0.2038, "step": 229 }, { "epoch": 0.03576982892690513, "grad_norm": 1.2439638440320844, "learning_rate": 9.968463288095096e-06, "loss": 0.1962, "step": 230 }, { "epoch": 0.0359253499222395, "grad_norm": 1.550618674775446, "learning_rate": 9.968188751451613e-06, "loss": 0.2461, "step": 231 }, { "epoch": 0.03608087091757387, "grad_norm": 1.2590441656933422, "learning_rate": 9.967913028832192e-06, "loss": 0.28, "step": 232 }, { "epoch": 0.03623639191290824, "grad_norm": 15.743047596573488, "learning_rate": 9.96763612030265e-06, "loss": 0.2272, "step": 233 }, { "epoch": 0.03639191290824261, "grad_norm": 1.0832646660805165, "learning_rate": 9.967358025929092e-06, "loss": 0.2766, "step": 234 }, { "epoch": 0.03654743390357698, "grad_norm": 1.496152606461021, "learning_rate": 9.9670787457779e-06, "loss": 0.1928, "step": 235 }, { "epoch": 0.03670295489891135, "grad_norm": 1.5049076518304147, "learning_rate": 9.966798279915744e-06, "loss": 0.2023, "step": 236 }, { "epoch": 0.03685847589424572, "grad_norm": 0.9377725167524534, "learning_rate": 9.966516628409573e-06, "loss": 0.1657, "step": 237 }, { "epoch": 0.03701399688958009, "grad_norm": 1.5646202349920761, "learning_rate": 9.96623379132662e-06, "loss": 0.2157, "step": 238 }, { "epoch": 0.03716951788491446, "grad_norm": 1.14277577769819, "learning_rate": 9.965949768734409e-06, "loss": 0.2163, "step": 239 }, { "epoch": 0.03732503888024884, "grad_norm": 2.158716016882222, "learning_rate": 9.965664560700734e-06, "loss": 0.2041, "step": 240 }, { "epoch": 0.03748055987558321, "grad_norm": 1.8568349342429766, "learning_rate": 9.965378167293679e-06, "loss": 0.2266, "step": 241 }, { "epoch": 0.037636080870917576, "grad_norm": 2.035673543431871, "learning_rate": 9.965090588581609e-06, "loss": 0.2893, "step": 242 }, { "epoch": 0.037791601866251946, "grad_norm": 1.2421527558787024, "learning_rate": 9.964801824633177e-06, "loss": 0.166, "step": 243 }, { "epoch": 0.037947122861586316, "grad_norm": 1.7368625294642988, "learning_rate": 9.964511875517313e-06, "loss": 0.2593, "step": 244 }, { "epoch": 0.038102643856920686, "grad_norm": 1.274064232837515, "learning_rate": 9.964220741303232e-06, "loss": 0.1676, "step": 245 }, { "epoch": 0.038258164852255055, "grad_norm": 1.3271094523398685, "learning_rate": 9.963928422060432e-06, "loss": 0.2048, "step": 246 }, { "epoch": 0.038413685847589425, "grad_norm": 1.441894820882409, "learning_rate": 9.963634917858692e-06, "loss": 0.2102, "step": 247 }, { "epoch": 0.038569206842923795, "grad_norm": 1.3882607946902543, "learning_rate": 9.963340228768077e-06, "loss": 0.1862, "step": 248 }, { "epoch": 0.038724727838258165, "grad_norm": 1.1529068772443192, "learning_rate": 9.963044354858934e-06, "loss": 0.2519, "step": 249 }, { "epoch": 0.038880248833592534, "grad_norm": 2.236043321099024, "learning_rate": 9.962747296201891e-06, "loss": 0.1635, "step": 250 }, { "epoch": 0.039035769828926904, "grad_norm": 1.8503487939836718, "learning_rate": 9.96244905286786e-06, "loss": 0.181, "step": 251 }, { "epoch": 0.039191290824261274, "grad_norm": 1.4083157880171735, "learning_rate": 9.962149624928037e-06, "loss": 0.1781, "step": 252 }, { "epoch": 0.039346811819595644, "grad_norm": 1.6536407646222175, "learning_rate": 9.961849012453899e-06, "loss": 0.2699, "step": 253 }, { "epoch": 0.039502332814930013, "grad_norm": 1.3154495432198843, "learning_rate": 9.961547215517206e-06, "loss": 0.2096, "step": 254 }, { "epoch": 0.03965785381026438, "grad_norm": 1.222944730470649, "learning_rate": 9.961244234190001e-06, "loss": 0.209, "step": 255 }, { "epoch": 0.03981337480559875, "grad_norm": 1.3903861430735245, "learning_rate": 9.96094006854461e-06, "loss": 0.177, "step": 256 }, { "epoch": 0.03996889580093312, "grad_norm": 1.8733569984170189, "learning_rate": 9.960634718653644e-06, "loss": 0.4051, "step": 257 }, { "epoch": 0.0401244167962675, "grad_norm": 1.3013086938531622, "learning_rate": 9.96032818458999e-06, "loss": 0.2215, "step": 258 }, { "epoch": 0.04027993779160187, "grad_norm": 1.9062067810307814, "learning_rate": 9.960020466426825e-06, "loss": 0.2131, "step": 259 }, { "epoch": 0.04043545878693624, "grad_norm": 1.240725461727028, "learning_rate": 9.959711564237603e-06, "loss": 0.2376, "step": 260 }, { "epoch": 0.04059097978227061, "grad_norm": 1.504578258989953, "learning_rate": 9.95940147809607e-06, "loss": 0.2238, "step": 261 }, { "epoch": 0.04074650077760498, "grad_norm": 1.112441665378311, "learning_rate": 9.959090208076239e-06, "loss": 0.175, "step": 262 }, { "epoch": 0.04090202177293935, "grad_norm": 1.492328645699945, "learning_rate": 9.958777754252418e-06, "loss": 0.2332, "step": 263 }, { "epoch": 0.04105754276827372, "grad_norm": 1.4626777112927891, "learning_rate": 9.958464116699196e-06, "loss": 0.2093, "step": 264 }, { "epoch": 0.04121306376360809, "grad_norm": 2.4304182018626266, "learning_rate": 9.958149295491441e-06, "loss": 0.2495, "step": 265 }, { "epoch": 0.04136858475894246, "grad_norm": 2.1830670676642256, "learning_rate": 9.957833290704305e-06, "loss": 0.2151, "step": 266 }, { "epoch": 0.04152410575427683, "grad_norm": 0.9776646131405466, "learning_rate": 9.957516102413223e-06, "loss": 0.2215, "step": 267 }, { "epoch": 0.0416796267496112, "grad_norm": 0.9811824757237497, "learning_rate": 9.957197730693912e-06, "loss": 0.2671, "step": 268 }, { "epoch": 0.04183514774494557, "grad_norm": 1.025030756788744, "learning_rate": 9.956878175622372e-06, "loss": 0.1935, "step": 269 }, { "epoch": 0.041990668740279936, "grad_norm": 1.715248799705313, "learning_rate": 9.956557437274887e-06, "loss": 0.2639, "step": 270 }, { "epoch": 0.042146189735614306, "grad_norm": 1.4715136542514509, "learning_rate": 9.95623551572802e-06, "loss": 0.1863, "step": 271 }, { "epoch": 0.042301710730948676, "grad_norm": 2.0941396313348766, "learning_rate": 9.955912411058616e-06, "loss": 0.1764, "step": 272 }, { "epoch": 0.042457231726283046, "grad_norm": 1.4113410003708207, "learning_rate": 9.955588123343808e-06, "loss": 0.2635, "step": 273 }, { "epoch": 0.042612752721617415, "grad_norm": 1.0999635349018924, "learning_rate": 9.955262652661009e-06, "loss": 0.2424, "step": 274 }, { "epoch": 0.042768273716951785, "grad_norm": 1.0847541480257452, "learning_rate": 9.954935999087908e-06, "loss": 0.276, "step": 275 }, { "epoch": 0.04292379471228616, "grad_norm": 1.695906274664277, "learning_rate": 9.954608162702488e-06, "loss": 0.2316, "step": 276 }, { "epoch": 0.04307931570762053, "grad_norm": 1.428650374776818, "learning_rate": 9.954279143583003e-06, "loss": 0.234, "step": 277 }, { "epoch": 0.0432348367029549, "grad_norm": 1.261831528775643, "learning_rate": 9.953948941807998e-06, "loss": 0.2331, "step": 278 }, { "epoch": 0.04339035769828927, "grad_norm": 1.1389240235405695, "learning_rate": 9.953617557456295e-06, "loss": 0.1813, "step": 279 }, { "epoch": 0.04354587869362364, "grad_norm": 2.1356821017337264, "learning_rate": 9.953284990607e-06, "loss": 0.2716, "step": 280 }, { "epoch": 0.04370139968895801, "grad_norm": 1.256196669200449, "learning_rate": 9.952951241339501e-06, "loss": 0.2586, "step": 281 }, { "epoch": 0.04385692068429238, "grad_norm": 1.6264279435141102, "learning_rate": 9.952616309733471e-06, "loss": 0.2138, "step": 282 }, { "epoch": 0.04401244167962675, "grad_norm": 1.0771562874552736, "learning_rate": 9.952280195868859e-06, "loss": 0.2798, "step": 283 }, { "epoch": 0.04416796267496112, "grad_norm": 1.6634031368562676, "learning_rate": 9.951942899825906e-06, "loss": 0.3159, "step": 284 }, { "epoch": 0.04432348367029549, "grad_norm": 1.5379741925800816, "learning_rate": 9.951604421685121e-06, "loss": 0.3275, "step": 285 }, { "epoch": 0.04447900466562986, "grad_norm": 1.4489954817264272, "learning_rate": 9.951264761527311e-06, "loss": 0.1989, "step": 286 }, { "epoch": 0.04463452566096423, "grad_norm": 1.6369744606712289, "learning_rate": 9.950923919433555e-06, "loss": 0.2068, "step": 287 }, { "epoch": 0.0447900466562986, "grad_norm": 1.8400125131547473, "learning_rate": 9.950581895485214e-06, "loss": 0.1977, "step": 288 }, { "epoch": 0.04494556765163297, "grad_norm": 2.1448208174547743, "learning_rate": 9.950238689763937e-06, "loss": 0.1882, "step": 289 }, { "epoch": 0.04510108864696734, "grad_norm": 1.1002755110550755, "learning_rate": 9.949894302351653e-06, "loss": 0.2422, "step": 290 }, { "epoch": 0.04525660964230171, "grad_norm": 0.8557887132764603, "learning_rate": 9.94954873333057e-06, "loss": 0.2249, "step": 291 }, { "epoch": 0.04541213063763608, "grad_norm": 1.800548229871832, "learning_rate": 9.94920198278318e-06, "loss": 0.2462, "step": 292 }, { "epoch": 0.04556765163297045, "grad_norm": 1.077848623865367, "learning_rate": 9.948854050792256e-06, "loss": 0.1693, "step": 293 }, { "epoch": 0.045723172628304824, "grad_norm": 1.3420617788641933, "learning_rate": 9.948504937440857e-06, "loss": 0.2632, "step": 294 }, { "epoch": 0.045878693623639194, "grad_norm": 1.786889545891979, "learning_rate": 9.948154642812321e-06, "loss": 0.1812, "step": 295 }, { "epoch": 0.046034214618973564, "grad_norm": 1.6608331504976344, "learning_rate": 9.947803166990267e-06, "loss": 0.2781, "step": 296 }, { "epoch": 0.046189735614307934, "grad_norm": 1.479079510539959, "learning_rate": 9.947450510058596e-06, "loss": 0.2176, "step": 297 }, { "epoch": 0.0463452566096423, "grad_norm": 1.1205653962227666, "learning_rate": 9.947096672101496e-06, "loss": 0.2189, "step": 298 }, { "epoch": 0.04650077760497667, "grad_norm": 1.6903970393534788, "learning_rate": 9.94674165320343e-06, "loss": 0.1715, "step": 299 }, { "epoch": 0.04665629860031104, "grad_norm": 3.020535469766265, "learning_rate": 9.946385453449145e-06, "loss": 0.2334, "step": 300 }, { "epoch": 0.04665629860031104, "eval_loss": 0.23520340025424957, "eval_runtime": 9.4655, "eval_samples_per_second": 2.747, "eval_steps_per_second": 0.74, "step": 300 }, { "epoch": 0.04681181959564541, "grad_norm": 1.2625213750296742, "learning_rate": 9.946028072923675e-06, "loss": 0.2153, "step": 301 }, { "epoch": 0.04696734059097978, "grad_norm": 1.326552639234392, "learning_rate": 9.945669511712328e-06, "loss": 0.1378, "step": 302 }, { "epoch": 0.04712286158631415, "grad_norm": 1.1353660480206176, "learning_rate": 9.945309769900698e-06, "loss": 0.2505, "step": 303 }, { "epoch": 0.04727838258164852, "grad_norm": 1.2591178630665596, "learning_rate": 9.944948847574662e-06, "loss": 0.1704, "step": 304 }, { "epoch": 0.04743390357698289, "grad_norm": 1.3520689396483014, "learning_rate": 9.944586744820377e-06, "loss": 0.2324, "step": 305 }, { "epoch": 0.04758942457231726, "grad_norm": 1.0116417439713241, "learning_rate": 9.94422346172428e-06, "loss": 0.1512, "step": 306 }, { "epoch": 0.04774494556765163, "grad_norm": 1.479626380132595, "learning_rate": 9.943858998373093e-06, "loss": 0.2121, "step": 307 }, { "epoch": 0.047900466562986, "grad_norm": 1.4227055232441543, "learning_rate": 9.94349335485382e-06, "loss": 0.2667, "step": 308 }, { "epoch": 0.04805598755832037, "grad_norm": 1.583200032514501, "learning_rate": 9.943126531253744e-06, "loss": 0.289, "step": 309 }, { "epoch": 0.04821150855365474, "grad_norm": 1.8189938486203978, "learning_rate": 9.942758527660429e-06, "loss": 0.3084, "step": 310 }, { "epoch": 0.04836702954898912, "grad_norm": 1.146189412882889, "learning_rate": 9.942389344161724e-06, "loss": 0.1669, "step": 311 }, { "epoch": 0.04852255054432349, "grad_norm": 1.547896984860253, "learning_rate": 9.94201898084576e-06, "loss": 0.2064, "step": 312 }, { "epoch": 0.048678071539657856, "grad_norm": 1.5949794296702688, "learning_rate": 9.941647437800946e-06, "loss": 0.1929, "step": 313 }, { "epoch": 0.048833592534992226, "grad_norm": 1.803377063241175, "learning_rate": 9.941274715115976e-06, "loss": 0.2791, "step": 314 }, { "epoch": 0.048989113530326596, "grad_norm": 1.3837921692775779, "learning_rate": 9.940900812879822e-06, "loss": 0.1767, "step": 315 }, { "epoch": 0.049144634525660966, "grad_norm": 1.3433932609509933, "learning_rate": 9.940525731181741e-06, "loss": 0.2084, "step": 316 }, { "epoch": 0.049300155520995335, "grad_norm": 1.357062528683942, "learning_rate": 9.940149470111269e-06, "loss": 0.2047, "step": 317 }, { "epoch": 0.049455676516329705, "grad_norm": 1.6539883727473814, "learning_rate": 9.939772029758225e-06, "loss": 0.2925, "step": 318 }, { "epoch": 0.049611197511664075, "grad_norm": 1.2278880982790155, "learning_rate": 9.939393410212713e-06, "loss": 0.2649, "step": 319 }, { "epoch": 0.049766718506998445, "grad_norm": 1.6247947056783312, "learning_rate": 9.93901361156511e-06, "loss": 0.3355, "step": 320 }, { "epoch": 0.049922239502332814, "grad_norm": 1.1732603342184649, "learning_rate": 9.93863263390608e-06, "loss": 0.2603, "step": 321 }, { "epoch": 0.050077760497667184, "grad_norm": 1.4022468720638315, "learning_rate": 9.93825047732657e-06, "loss": 0.3171, "step": 322 }, { "epoch": 0.050233281493001554, "grad_norm": 1.3668475608164796, "learning_rate": 9.937867141917804e-06, "loss": 0.2952, "step": 323 }, { "epoch": 0.050388802488335924, "grad_norm": 1.4553813573539522, "learning_rate": 9.93748262777129e-06, "loss": 0.1581, "step": 324 }, { "epoch": 0.05054432348367029, "grad_norm": 1.9871080316775154, "learning_rate": 9.937096934978819e-06, "loss": 0.2368, "step": 325 }, { "epoch": 0.05069984447900466, "grad_norm": 1.2900065629907207, "learning_rate": 9.936710063632457e-06, "loss": 0.2831, "step": 326 }, { "epoch": 0.05085536547433903, "grad_norm": 0.9263549089146618, "learning_rate": 9.93632201382456e-06, "loss": 0.2086, "step": 327 }, { "epoch": 0.0510108864696734, "grad_norm": 1.9892589335821493, "learning_rate": 9.935932785647756e-06, "loss": 0.2717, "step": 328 }, { "epoch": 0.05116640746500778, "grad_norm": 1.1155547773179386, "learning_rate": 9.935542379194965e-06, "loss": 0.2731, "step": 329 }, { "epoch": 0.05132192846034215, "grad_norm": 1.0330106857849222, "learning_rate": 9.935150794559379e-06, "loss": 0.1841, "step": 330 }, { "epoch": 0.05147744945567652, "grad_norm": 1.52093348670823, "learning_rate": 9.934758031834475e-06, "loss": 0.2061, "step": 331 }, { "epoch": 0.05163297045101089, "grad_norm": 1.1824055834479263, "learning_rate": 9.93436409111401e-06, "loss": 0.2613, "step": 332 }, { "epoch": 0.05178849144634526, "grad_norm": 1.5329142188470473, "learning_rate": 9.933968972492026e-06, "loss": 0.2541, "step": 333 }, { "epoch": 0.05194401244167963, "grad_norm": 1.0304282737168275, "learning_rate": 9.933572676062841e-06, "loss": 0.2024, "step": 334 }, { "epoch": 0.052099533437014, "grad_norm": 1.1252175849664872, "learning_rate": 9.933175201921057e-06, "loss": 0.201, "step": 335 }, { "epoch": 0.05225505443234837, "grad_norm": 1.6828294804696526, "learning_rate": 9.932776550161559e-06, "loss": 0.2298, "step": 336 }, { "epoch": 0.05241057542768274, "grad_norm": 1.2831001226274117, "learning_rate": 9.932376720879503e-06, "loss": 0.2352, "step": 337 }, { "epoch": 0.05256609642301711, "grad_norm": 2.152789286567263, "learning_rate": 9.931975714170345e-06, "loss": 0.3382, "step": 338 }, { "epoch": 0.05272161741835148, "grad_norm": 1.702657664273862, "learning_rate": 9.931573530129803e-06, "loss": 0.2368, "step": 339 }, { "epoch": 0.05287713841368585, "grad_norm": 2.05056832602719, "learning_rate": 9.931170168853886e-06, "loss": 0.2992, "step": 340 }, { "epoch": 0.053032659409020216, "grad_norm": 1.5775290622934088, "learning_rate": 9.930765630438882e-06, "loss": 0.212, "step": 341 }, { "epoch": 0.053188180404354586, "grad_norm": 1.166034186090071, "learning_rate": 9.93035991498136e-06, "loss": 0.2081, "step": 342 }, { "epoch": 0.053343701399688956, "grad_norm": 1.4555896083998001, "learning_rate": 9.929953022578171e-06, "loss": 0.1857, "step": 343 }, { "epoch": 0.053499222395023326, "grad_norm": 1.343927833342108, "learning_rate": 9.929544953326445e-06, "loss": 0.2691, "step": 344 }, { "epoch": 0.053654743390357695, "grad_norm": 1.8890642830307378, "learning_rate": 9.929135707323592e-06, "loss": 0.1967, "step": 345 }, { "epoch": 0.053810264385692065, "grad_norm": 1.4990308791372666, "learning_rate": 9.928725284667308e-06, "loss": 0.1774, "step": 346 }, { "epoch": 0.05396578538102644, "grad_norm": 1.615806257387967, "learning_rate": 9.928313685455565e-06, "loss": 0.2234, "step": 347 }, { "epoch": 0.05412130637636081, "grad_norm": 1.3758078431089233, "learning_rate": 9.927900909786617e-06, "loss": 0.259, "step": 348 }, { "epoch": 0.05427682737169518, "grad_norm": 0.855435278326685, "learning_rate": 9.927486957759001e-06, "loss": 0.2068, "step": 349 }, { "epoch": 0.05443234836702955, "grad_norm": 1.5217482862634222, "learning_rate": 9.927071829471531e-06, "loss": 0.1551, "step": 350 }, { "epoch": 0.05458786936236392, "grad_norm": 1.5111503264835533, "learning_rate": 9.926655525023304e-06, "loss": 0.2599, "step": 351 }, { "epoch": 0.05474339035769829, "grad_norm": 0.8967930843733002, "learning_rate": 9.9262380445137e-06, "loss": 0.169, "step": 352 }, { "epoch": 0.05489891135303266, "grad_norm": 1.9464375941159884, "learning_rate": 9.925819388042374e-06, "loss": 0.2983, "step": 353 }, { "epoch": 0.05505443234836703, "grad_norm": 1.574189824318599, "learning_rate": 9.925399555709269e-06, "loss": 0.1937, "step": 354 }, { "epoch": 0.0552099533437014, "grad_norm": 3.1438752373638232, "learning_rate": 9.924978547614604e-06, "loss": 0.2181, "step": 355 }, { "epoch": 0.05536547433903577, "grad_norm": 1.6348127637741856, "learning_rate": 9.924556363858877e-06, "loss": 0.1847, "step": 356 }, { "epoch": 0.05552099533437014, "grad_norm": 1.724455721347507, "learning_rate": 9.92413300454287e-06, "loss": 0.1924, "step": 357 }, { "epoch": 0.05567651632970451, "grad_norm": 0.9215074637606898, "learning_rate": 9.923708469767645e-06, "loss": 0.1484, "step": 358 }, { "epoch": 0.05583203732503888, "grad_norm": 1.0048144642733263, "learning_rate": 9.923282759634547e-06, "loss": 0.139, "step": 359 }, { "epoch": 0.05598755832037325, "grad_norm": 1.6563473574979655, "learning_rate": 9.922855874245197e-06, "loss": 0.2462, "step": 360 }, { "epoch": 0.05614307931570762, "grad_norm": 1.0753481257964308, "learning_rate": 9.922427813701495e-06, "loss": 0.2543, "step": 361 }, { "epoch": 0.05629860031104199, "grad_norm": 1.1607722120362791, "learning_rate": 9.92199857810563e-06, "loss": 0.1919, "step": 362 }, { "epoch": 0.05645412130637636, "grad_norm": 1.0235707105593828, "learning_rate": 9.921568167560065e-06, "loss": 0.1851, "step": 363 }, { "epoch": 0.05660964230171073, "grad_norm": 1.443489161948352, "learning_rate": 9.921136582167545e-06, "loss": 0.2566, "step": 364 }, { "epoch": 0.056765163297045104, "grad_norm": 1.1047251832726421, "learning_rate": 9.920703822031094e-06, "loss": 0.2268, "step": 365 }, { "epoch": 0.056920684292379474, "grad_norm": 1.8071891113724519, "learning_rate": 9.92026988725402e-06, "loss": 0.286, "step": 366 }, { "epoch": 0.057076205287713844, "grad_norm": 1.127534519608966, "learning_rate": 9.919834777939908e-06, "loss": 0.2078, "step": 367 }, { "epoch": 0.05723172628304821, "grad_norm": 1.3537981754957027, "learning_rate": 9.919398494192625e-06, "loss": 0.2574, "step": 368 }, { "epoch": 0.05738724727838258, "grad_norm": 1.5740289483284484, "learning_rate": 9.918961036116317e-06, "loss": 0.2168, "step": 369 }, { "epoch": 0.05754276827371695, "grad_norm": 2.1521943324617854, "learning_rate": 9.918522403815414e-06, "loss": 0.5388, "step": 370 }, { "epoch": 0.05769828926905132, "grad_norm": 0.9621156840694527, "learning_rate": 9.918082597394621e-06, "loss": 0.2206, "step": 371 }, { "epoch": 0.05785381026438569, "grad_norm": 0.8374473543740336, "learning_rate": 9.91764161695893e-06, "loss": 0.1931, "step": 372 }, { "epoch": 0.05800933125972006, "grad_norm": 1.594565893913882, "learning_rate": 9.917199462613601e-06, "loss": 0.2664, "step": 373 }, { "epoch": 0.05816485225505443, "grad_norm": 2.539276249800021, "learning_rate": 9.916756134464191e-06, "loss": 0.3158, "step": 374 }, { "epoch": 0.0583203732503888, "grad_norm": 1.0461962066836652, "learning_rate": 9.916311632616525e-06, "loss": 0.2489, "step": 375 }, { "epoch": 0.05847589424572317, "grad_norm": 1.1340444520472663, "learning_rate": 9.915865957176709e-06, "loss": 0.2718, "step": 376 }, { "epoch": 0.05863141524105754, "grad_norm": 1.467480205738983, "learning_rate": 9.915419108251138e-06, "loss": 0.1753, "step": 377 }, { "epoch": 0.05878693623639191, "grad_norm": 1.4394725259816188, "learning_rate": 9.914971085946476e-06, "loss": 0.1973, "step": 378 }, { "epoch": 0.05894245723172628, "grad_norm": 1.2534669496284443, "learning_rate": 9.914521890369676e-06, "loss": 0.2127, "step": 379 }, { "epoch": 0.05909797822706065, "grad_norm": 1.282361137311585, "learning_rate": 9.914071521627964e-06, "loss": 0.1881, "step": 380 }, { "epoch": 0.05925349922239502, "grad_norm": 1.7744186005576332, "learning_rate": 9.913619979828851e-06, "loss": 0.1875, "step": 381 }, { "epoch": 0.0594090202177294, "grad_norm": 1.5020250209663002, "learning_rate": 9.913167265080126e-06, "loss": 0.1684, "step": 382 }, { "epoch": 0.05956454121306377, "grad_norm": 1.259074929221576, "learning_rate": 9.912713377489858e-06, "loss": 0.2268, "step": 383 }, { "epoch": 0.059720062208398136, "grad_norm": 1.7761373693512776, "learning_rate": 9.912258317166398e-06, "loss": 0.223, "step": 384 }, { "epoch": 0.059875583203732506, "grad_norm": 2.38865888975245, "learning_rate": 9.911802084218374e-06, "loss": 0.2401, "step": 385 }, { "epoch": 0.060031104199066876, "grad_norm": 0.8949382740792282, "learning_rate": 9.911344678754694e-06, "loss": 0.1922, "step": 386 }, { "epoch": 0.060186625194401246, "grad_norm": 1.5889982876131292, "learning_rate": 9.910886100884547e-06, "loss": 0.1943, "step": 387 }, { "epoch": 0.060342146189735615, "grad_norm": 1.4147870380604834, "learning_rate": 9.910426350717404e-06, "loss": 0.1812, "step": 388 }, { "epoch": 0.060497667185069985, "grad_norm": 1.8231195124047115, "learning_rate": 9.909965428363012e-06, "loss": 0.2312, "step": 389 }, { "epoch": 0.060653188180404355, "grad_norm": 1.8874621933930384, "learning_rate": 9.909503333931402e-06, "loss": 0.287, "step": 390 }, { "epoch": 0.060808709175738725, "grad_norm": 1.7665216636429069, "learning_rate": 9.90904006753288e-06, "loss": 0.2185, "step": 391 }, { "epoch": 0.060964230171073094, "grad_norm": 1.256357590139898, "learning_rate": 9.908575629278034e-06, "loss": 0.1919, "step": 392 }, { "epoch": 0.061119751166407464, "grad_norm": 4.375967721306914, "learning_rate": 9.908110019277735e-06, "loss": 0.1781, "step": 393 }, { "epoch": 0.061275272161741834, "grad_norm": 1.4286735960699084, "learning_rate": 9.907643237643127e-06, "loss": 0.253, "step": 394 }, { "epoch": 0.061430793157076204, "grad_norm": 1.6229980414007696, "learning_rate": 9.90717528448564e-06, "loss": 0.2598, "step": 395 }, { "epoch": 0.06158631415241057, "grad_norm": 1.654127403226531, "learning_rate": 9.906706159916977e-06, "loss": 0.2677, "step": 396 }, { "epoch": 0.06174183514774494, "grad_norm": 0.7489317566220969, "learning_rate": 9.90623586404913e-06, "loss": 0.1595, "step": 397 }, { "epoch": 0.06189735614307931, "grad_norm": 1.0243584995437751, "learning_rate": 9.90576439699436e-06, "loss": 0.2089, "step": 398 }, { "epoch": 0.06205287713841368, "grad_norm": 1.2843274122650117, "learning_rate": 9.905291758865217e-06, "loss": 0.2458, "step": 399 }, { "epoch": 0.06220839813374806, "grad_norm": 1.482986812845832, "learning_rate": 9.904817949774524e-06, "loss": 0.2611, "step": 400 }, { "epoch": 0.06220839813374806, "eval_loss": 0.23184187710285187, "eval_runtime": 9.4466, "eval_samples_per_second": 2.752, "eval_steps_per_second": 0.741, "step": 400 }, { "epoch": 0.06236391912908243, "grad_norm": 2.01899839511783, "learning_rate": 9.904342969835385e-06, "loss": 0.2178, "step": 401 }, { "epoch": 0.06251944012441679, "grad_norm": 1.4244669635257896, "learning_rate": 9.903866819161188e-06, "loss": 0.2321, "step": 402 }, { "epoch": 0.06267496111975117, "grad_norm": 1.7090867256976423, "learning_rate": 9.903389497865593e-06, "loss": 0.2071, "step": 403 }, { "epoch": 0.06283048211508553, "grad_norm": 1.305136754505658, "learning_rate": 9.902911006062543e-06, "loss": 0.1899, "step": 404 }, { "epoch": 0.06298600311041991, "grad_norm": 1.0188677304744835, "learning_rate": 9.902431343866266e-06, "loss": 0.2457, "step": 405 }, { "epoch": 0.06314152410575427, "grad_norm": 1.6042710170666996, "learning_rate": 9.901950511391259e-06, "loss": 0.1894, "step": 406 }, { "epoch": 0.06329704510108865, "grad_norm": 1.3017493690494788, "learning_rate": 9.901468508752304e-06, "loss": 0.2908, "step": 407 }, { "epoch": 0.06345256609642301, "grad_norm": 1.3230633029674432, "learning_rate": 9.900985336064463e-06, "loss": 0.2786, "step": 408 }, { "epoch": 0.06360808709175739, "grad_norm": 1.5120257860737862, "learning_rate": 9.900500993443076e-06, "loss": 0.2516, "step": 409 }, { "epoch": 0.06376360808709176, "grad_norm": 1.004582433223966, "learning_rate": 9.900015481003762e-06, "loss": 0.2232, "step": 410 }, { "epoch": 0.06391912908242613, "grad_norm": 1.399115724283105, "learning_rate": 9.89952879886242e-06, "loss": 0.2763, "step": 411 }, { "epoch": 0.0640746500777605, "grad_norm": 1.816764777159624, "learning_rate": 9.899040947135225e-06, "loss": 0.2913, "step": 412 }, { "epoch": 0.06423017107309487, "grad_norm": 1.1949304261760583, "learning_rate": 9.898551925938638e-06, "loss": 0.191, "step": 413 }, { "epoch": 0.06438569206842924, "grad_norm": 1.6899096837752585, "learning_rate": 9.898061735389395e-06, "loss": 0.2314, "step": 414 }, { "epoch": 0.0645412130637636, "grad_norm": 1.6400875402483213, "learning_rate": 9.897570375604508e-06, "loss": 0.1985, "step": 415 }, { "epoch": 0.06469673405909798, "grad_norm": 1.1700291435704913, "learning_rate": 9.897077846701274e-06, "loss": 0.2178, "step": 416 }, { "epoch": 0.06485225505443235, "grad_norm": 1.6396026705753728, "learning_rate": 9.896584148797265e-06, "loss": 0.2443, "step": 417 }, { "epoch": 0.06500777604976672, "grad_norm": 0.8511496035113331, "learning_rate": 9.896089282010338e-06, "loss": 0.1619, "step": 418 }, { "epoch": 0.06516329704510108, "grad_norm": 1.3924064844406538, "learning_rate": 9.895593246458617e-06, "loss": 0.2021, "step": 419 }, { "epoch": 0.06531881804043546, "grad_norm": 0.8605197503722029, "learning_rate": 9.895096042260517e-06, "loss": 0.1628, "step": 420 }, { "epoch": 0.06547433903576982, "grad_norm": 1.3908417494412908, "learning_rate": 9.894597669534729e-06, "loss": 0.2054, "step": 421 }, { "epoch": 0.0656298600311042, "grad_norm": 1.445540354985538, "learning_rate": 9.894098128400219e-06, "loss": 0.2197, "step": 422 }, { "epoch": 0.06578538102643856, "grad_norm": 1.3103752658839474, "learning_rate": 9.893597418976234e-06, "loss": 0.2297, "step": 423 }, { "epoch": 0.06594090202177294, "grad_norm": 1.0497805770986521, "learning_rate": 9.893095541382304e-06, "loss": 0.1747, "step": 424 }, { "epoch": 0.0660964230171073, "grad_norm": 1.513640843523071, "learning_rate": 9.892592495738229e-06, "loss": 0.1754, "step": 425 }, { "epoch": 0.06625194401244168, "grad_norm": 1.0493517604475748, "learning_rate": 9.892088282164098e-06, "loss": 0.2586, "step": 426 }, { "epoch": 0.06640746500777606, "grad_norm": 1.4678962231044086, "learning_rate": 9.89158290078027e-06, "loss": 0.2932, "step": 427 }, { "epoch": 0.06656298600311042, "grad_norm": 1.6765991678498569, "learning_rate": 9.891076351707389e-06, "loss": 0.2116, "step": 428 }, { "epoch": 0.0667185069984448, "grad_norm": 1.4655721822686016, "learning_rate": 9.890568635066373e-06, "loss": 0.1543, "step": 429 }, { "epoch": 0.06687402799377916, "grad_norm": 1.6313534003780414, "learning_rate": 9.890059750978425e-06, "loss": 0.1571, "step": 430 }, { "epoch": 0.06702954898911354, "grad_norm": 1.0261848775525118, "learning_rate": 9.889549699565017e-06, "loss": 0.2865, "step": 431 }, { "epoch": 0.0671850699844479, "grad_norm": 1.5225780156038968, "learning_rate": 9.88903848094791e-06, "loss": 0.1914, "step": 432 }, { "epoch": 0.06734059097978227, "grad_norm": 1.3350387169313882, "learning_rate": 9.888526095249138e-06, "loss": 0.2754, "step": 433 }, { "epoch": 0.06749611197511664, "grad_norm": 1.192180411270206, "learning_rate": 9.888012542591014e-06, "loss": 0.1974, "step": 434 }, { "epoch": 0.06765163297045101, "grad_norm": 1.3005497242232493, "learning_rate": 9.88749782309613e-06, "loss": 0.1903, "step": 435 }, { "epoch": 0.06780715396578538, "grad_norm": 1.1288456938448086, "learning_rate": 9.88698193688736e-06, "loss": 0.2333, "step": 436 }, { "epoch": 0.06796267496111975, "grad_norm": 1.130396483559975, "learning_rate": 9.886464884087846e-06, "loss": 0.2674, "step": 437 }, { "epoch": 0.06811819595645412, "grad_norm": 0.9035948769600225, "learning_rate": 9.885946664821021e-06, "loss": 0.1864, "step": 438 }, { "epoch": 0.0682737169517885, "grad_norm": 1.1233476167867031, "learning_rate": 9.885427279210592e-06, "loss": 0.1787, "step": 439 }, { "epoch": 0.06842923794712286, "grad_norm": 1.2410015017602511, "learning_rate": 9.88490672738054e-06, "loss": 0.2509, "step": 440 }, { "epoch": 0.06858475894245723, "grad_norm": 1.3429869818046247, "learning_rate": 9.884385009455131e-06, "loss": 0.2811, "step": 441 }, { "epoch": 0.0687402799377916, "grad_norm": 0.7587532198438675, "learning_rate": 9.883862125558904e-06, "loss": 0.1781, "step": 442 }, { "epoch": 0.06889580093312597, "grad_norm": 0.9782244567957874, "learning_rate": 9.88333807581668e-06, "loss": 0.1891, "step": 443 }, { "epoch": 0.06905132192846034, "grad_norm": 1.8354472673215871, "learning_rate": 9.882812860353558e-06, "loss": 0.2372, "step": 444 }, { "epoch": 0.06920684292379471, "grad_norm": 1.0210293095436775, "learning_rate": 9.882286479294911e-06, "loss": 0.1988, "step": 445 }, { "epoch": 0.06936236391912909, "grad_norm": 2.117567357062213, "learning_rate": 9.881758932766398e-06, "loss": 0.1992, "step": 446 }, { "epoch": 0.06951788491446345, "grad_norm": 1.1644685693150085, "learning_rate": 9.881230220893948e-06, "loss": 0.18, "step": 447 }, { "epoch": 0.06967340590979783, "grad_norm": 1.1209275418337545, "learning_rate": 9.880700343803773e-06, "loss": 0.3069, "step": 448 }, { "epoch": 0.06982892690513219, "grad_norm": 1.155686416296927, "learning_rate": 9.880169301622362e-06, "loss": 0.1744, "step": 449 }, { "epoch": 0.06998444790046657, "grad_norm": 0.9709514091501408, "learning_rate": 9.879637094476482e-06, "loss": 0.1871, "step": 450 }, { "epoch": 0.07013996889580093, "grad_norm": 1.1219093494884402, "learning_rate": 9.87910372249318e-06, "loss": 0.1932, "step": 451 }, { "epoch": 0.07029548989113531, "grad_norm": 1.9094748023939434, "learning_rate": 9.878569185799778e-06, "loss": 0.2339, "step": 452 }, { "epoch": 0.07045101088646967, "grad_norm": 1.3264334862739553, "learning_rate": 9.878033484523876e-06, "loss": 0.1407, "step": 453 }, { "epoch": 0.07060653188180405, "grad_norm": 1.667180383137504, "learning_rate": 9.877496618793356e-06, "loss": 0.1867, "step": 454 }, { "epoch": 0.07076205287713841, "grad_norm": 1.0486860196671894, "learning_rate": 9.876958588736371e-06, "loss": 0.1683, "step": 455 }, { "epoch": 0.07091757387247279, "grad_norm": 1.2507603637095628, "learning_rate": 9.876419394481363e-06, "loss": 0.1958, "step": 456 }, { "epoch": 0.07107309486780715, "grad_norm": 1.7806763122908775, "learning_rate": 9.87587903615704e-06, "loss": 0.2466, "step": 457 }, { "epoch": 0.07122861586314153, "grad_norm": 1.0570385231053188, "learning_rate": 9.875337513892395e-06, "loss": 0.1336, "step": 458 }, { "epoch": 0.07138413685847589, "grad_norm": 1.8093621923009064, "learning_rate": 9.874794827816696e-06, "loss": 0.245, "step": 459 }, { "epoch": 0.07153965785381027, "grad_norm": 1.6343174119313473, "learning_rate": 9.874250978059489e-06, "loss": 0.1878, "step": 460 }, { "epoch": 0.07169517884914463, "grad_norm": 1.2474757406216732, "learning_rate": 9.873705964750603e-06, "loss": 0.201, "step": 461 }, { "epoch": 0.071850699844479, "grad_norm": 0.9854370189019162, "learning_rate": 9.873159788020135e-06, "loss": 0.1572, "step": 462 }, { "epoch": 0.07200622083981338, "grad_norm": 1.2046716423202313, "learning_rate": 9.872612447998466e-06, "loss": 0.1644, "step": 463 }, { "epoch": 0.07216174183514774, "grad_norm": 1.6657683984708445, "learning_rate": 9.872063944816257e-06, "loss": 0.2026, "step": 464 }, { "epoch": 0.07231726283048212, "grad_norm": 1.6319780353610651, "learning_rate": 9.871514278604439e-06, "loss": 0.2361, "step": 465 }, { "epoch": 0.07247278382581648, "grad_norm": 0.930626270347552, "learning_rate": 9.870963449494228e-06, "loss": 0.2334, "step": 466 }, { "epoch": 0.07262830482115086, "grad_norm": 1.7347785771237878, "learning_rate": 9.870411457617115e-06, "loss": 0.3121, "step": 467 }, { "epoch": 0.07278382581648522, "grad_norm": 1.6861297399111428, "learning_rate": 9.869858303104864e-06, "loss": 0.2234, "step": 468 }, { "epoch": 0.0729393468118196, "grad_norm": 2.2175613812233856, "learning_rate": 9.869303986089525e-06, "loss": 0.215, "step": 469 }, { "epoch": 0.07309486780715396, "grad_norm": 1.2151103786584494, "learning_rate": 9.86874850670342e-06, "loss": 0.143, "step": 470 }, { "epoch": 0.07325038880248834, "grad_norm": 1.8347498082665927, "learning_rate": 9.868191865079149e-06, "loss": 0.1847, "step": 471 }, { "epoch": 0.0734059097978227, "grad_norm": 0.7662001443118179, "learning_rate": 9.867634061349592e-06, "loss": 0.2132, "step": 472 }, { "epoch": 0.07356143079315708, "grad_norm": 1.127229878211817, "learning_rate": 9.8670750956479e-06, "loss": 0.2405, "step": 473 }, { "epoch": 0.07371695178849144, "grad_norm": 0.8919765028163983, "learning_rate": 9.866514968107511e-06, "loss": 0.2187, "step": 474 }, { "epoch": 0.07387247278382582, "grad_norm": 0.8318099634868261, "learning_rate": 9.865953678862133e-06, "loss": 0.149, "step": 475 }, { "epoch": 0.07402799377916018, "grad_norm": 1.577340616348031, "learning_rate": 9.865391228045753e-06, "loss": 0.2319, "step": 476 }, { "epoch": 0.07418351477449456, "grad_norm": 1.116181816359047, "learning_rate": 9.864827615792637e-06, "loss": 0.1901, "step": 477 }, { "epoch": 0.07433903576982892, "grad_norm": 1.105109643192386, "learning_rate": 9.864262842237327e-06, "loss": 0.2011, "step": 478 }, { "epoch": 0.0744945567651633, "grad_norm": 1.9701207318396636, "learning_rate": 9.863696907514641e-06, "loss": 0.2409, "step": 479 }, { "epoch": 0.07465007776049767, "grad_norm": 2.2498632028053507, "learning_rate": 9.863129811759678e-06, "loss": 0.3829, "step": 480 }, { "epoch": 0.07480559875583204, "grad_norm": 1.1224194434111838, "learning_rate": 9.86256155510781e-06, "loss": 0.2114, "step": 481 }, { "epoch": 0.07496111975116641, "grad_norm": 1.5539407325523458, "learning_rate": 9.861992137694687e-06, "loss": 0.1976, "step": 482 }, { "epoch": 0.07511664074650078, "grad_norm": 1.962092802549792, "learning_rate": 9.86142155965624e-06, "loss": 0.2725, "step": 483 }, { "epoch": 0.07527216174183515, "grad_norm": 0.8983695148666645, "learning_rate": 9.860849821128668e-06, "loss": 0.154, "step": 484 }, { "epoch": 0.07542768273716952, "grad_norm": 1.398592267234838, "learning_rate": 9.86027692224846e-06, "loss": 0.1497, "step": 485 }, { "epoch": 0.07558320373250389, "grad_norm": 1.0403186420901969, "learning_rate": 9.859702863152372e-06, "loss": 0.1936, "step": 486 }, { "epoch": 0.07573872472783826, "grad_norm": 0.7470818354767621, "learning_rate": 9.859127643977438e-06, "loss": 0.1523, "step": 487 }, { "epoch": 0.07589424572317263, "grad_norm": 1.2067693893481815, "learning_rate": 9.858551264860972e-06, "loss": 0.3168, "step": 488 }, { "epoch": 0.076049766718507, "grad_norm": 1.5295551443098423, "learning_rate": 9.857973725940565e-06, "loss": 0.2194, "step": 489 }, { "epoch": 0.07620528771384137, "grad_norm": 1.618418958541224, "learning_rate": 9.857395027354085e-06, "loss": 0.2209, "step": 490 }, { "epoch": 0.07636080870917573, "grad_norm": 1.1696631104347366, "learning_rate": 9.856815169239671e-06, "loss": 0.1993, "step": 491 }, { "epoch": 0.07651632970451011, "grad_norm": 1.4918786793556023, "learning_rate": 9.856234151735744e-06, "loss": 0.2657, "step": 492 }, { "epoch": 0.07667185069984447, "grad_norm": 1.3100404095494855, "learning_rate": 9.855651974981005e-06, "loss": 0.2832, "step": 493 }, { "epoch": 0.07682737169517885, "grad_norm": 13.98784357990924, "learning_rate": 9.855068639114425e-06, "loss": 0.2488, "step": 494 }, { "epoch": 0.07698289269051321, "grad_norm": 1.3956332181045448, "learning_rate": 9.854484144275254e-06, "loss": 0.225, "step": 495 }, { "epoch": 0.07713841368584759, "grad_norm": 1.1858198275947147, "learning_rate": 9.853898490603018e-06, "loss": 0.2041, "step": 496 }, { "epoch": 0.07729393468118195, "grad_norm": 0.765411378364051, "learning_rate": 9.853311678237524e-06, "loss": 0.1492, "step": 497 }, { "epoch": 0.07744945567651633, "grad_norm": 1.2288325537770441, "learning_rate": 9.85272370731885e-06, "loss": 0.1773, "step": 498 }, { "epoch": 0.0776049766718507, "grad_norm": 1.3901203640607709, "learning_rate": 9.852134577987353e-06, "loss": 0.2091, "step": 499 }, { "epoch": 0.07776049766718507, "grad_norm": 1.5991626946866644, "learning_rate": 9.85154429038367e-06, "loss": 0.2485, "step": 500 }, { "epoch": 0.07776049766718507, "eval_loss": 0.22800126671791077, "eval_runtime": 9.4446, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 500 }, { "epoch": 0.07791601866251945, "grad_norm": 0.9946822389547595, "learning_rate": 9.850952844648705e-06, "loss": 0.2324, "step": 501 }, { "epoch": 0.07807153965785381, "grad_norm": 1.088817573789371, "learning_rate": 9.850360240923647e-06, "loss": 0.1813, "step": 502 }, { "epoch": 0.07822706065318819, "grad_norm": 5.945777339639669, "learning_rate": 9.849766479349959e-06, "loss": 0.1976, "step": 503 }, { "epoch": 0.07838258164852255, "grad_norm": 0.8593394406117729, "learning_rate": 9.84917156006938e-06, "loss": 0.2474, "step": 504 }, { "epoch": 0.07853810264385692, "grad_norm": 1.3930666133589364, "learning_rate": 9.848575483223925e-06, "loss": 0.215, "step": 505 }, { "epoch": 0.07869362363919129, "grad_norm": 1.6493288101835173, "learning_rate": 9.84797824895589e-06, "loss": 0.303, "step": 506 }, { "epoch": 0.07884914463452566, "grad_norm": 1.1106903817577367, "learning_rate": 9.847379857407835e-06, "loss": 0.1654, "step": 507 }, { "epoch": 0.07900466562986003, "grad_norm": 1.166896696404847, "learning_rate": 9.846780308722612e-06, "loss": 0.2046, "step": 508 }, { "epoch": 0.0791601866251944, "grad_norm": 1.7221901123414272, "learning_rate": 9.846179603043338e-06, "loss": 0.2543, "step": 509 }, { "epoch": 0.07931570762052877, "grad_norm": 1.0398664154595585, "learning_rate": 9.845577740513409e-06, "loss": 0.2616, "step": 510 }, { "epoch": 0.07947122861586314, "grad_norm": 1.2062182369254026, "learning_rate": 9.8449747212765e-06, "loss": 0.1641, "step": 511 }, { "epoch": 0.0796267496111975, "grad_norm": 1.3859318575453086, "learning_rate": 9.84437054547656e-06, "loss": 0.193, "step": 512 }, { "epoch": 0.07978227060653188, "grad_norm": 3.5056235741823523, "learning_rate": 9.843765213257814e-06, "loss": 0.2399, "step": 513 }, { "epoch": 0.07993779160186625, "grad_norm": 1.2578551416521373, "learning_rate": 9.843158724764762e-06, "loss": 0.2177, "step": 514 }, { "epoch": 0.08009331259720062, "grad_norm": 1.4118043035204642, "learning_rate": 9.842551080142182e-06, "loss": 0.21, "step": 515 }, { "epoch": 0.080248833592535, "grad_norm": 1.1155160124053434, "learning_rate": 9.841942279535128e-06, "loss": 0.2128, "step": 516 }, { "epoch": 0.08040435458786936, "grad_norm": 1.0287833439256027, "learning_rate": 9.84133232308893e-06, "loss": 0.1846, "step": 517 }, { "epoch": 0.08055987558320374, "grad_norm": 2.2894965228305377, "learning_rate": 9.84072121094919e-06, "loss": 0.1814, "step": 518 }, { "epoch": 0.0807153965785381, "grad_norm": 1.345886098139959, "learning_rate": 9.84010894326179e-06, "loss": 0.1912, "step": 519 }, { "epoch": 0.08087091757387248, "grad_norm": 1.9234609876851483, "learning_rate": 9.83949552017289e-06, "loss": 0.2982, "step": 520 }, { "epoch": 0.08102643856920684, "grad_norm": 1.2452886345823744, "learning_rate": 9.83888094182892e-06, "loss": 0.2144, "step": 521 }, { "epoch": 0.08118195956454122, "grad_norm": 1.2711995935698062, "learning_rate": 9.838265208376584e-06, "loss": 0.1799, "step": 522 }, { "epoch": 0.08133748055987558, "grad_norm": 1.0755729955519457, "learning_rate": 9.837648319962876e-06, "loss": 0.3311, "step": 523 }, { "epoch": 0.08149300155520996, "grad_norm": 1.5025152130217085, "learning_rate": 9.837030276735049e-06, "loss": 0.203, "step": 524 }, { "epoch": 0.08164852255054432, "grad_norm": 1.4271542860149822, "learning_rate": 9.83641107884064e-06, "loss": 0.2055, "step": 525 }, { "epoch": 0.0818040435458787, "grad_norm": 1.1896665999932865, "learning_rate": 9.83579072642746e-06, "loss": 0.2191, "step": 526 }, { "epoch": 0.08195956454121306, "grad_norm": 1.6391797544527267, "learning_rate": 9.835169219643597e-06, "loss": 0.2164, "step": 527 }, { "epoch": 0.08211508553654744, "grad_norm": 1.3905263994766632, "learning_rate": 9.834546558637412e-06, "loss": 0.2188, "step": 528 }, { "epoch": 0.0822706065318818, "grad_norm": 1.1325886941547982, "learning_rate": 9.833922743557545e-06, "loss": 0.3596, "step": 529 }, { "epoch": 0.08242612752721618, "grad_norm": 1.58458236573862, "learning_rate": 9.833297774552905e-06, "loss": 0.2725, "step": 530 }, { "epoch": 0.08258164852255054, "grad_norm": 1.2630288499628133, "learning_rate": 9.832671651772685e-06, "loss": 0.3327, "step": 531 }, { "epoch": 0.08273716951788491, "grad_norm": 1.1472998381036559, "learning_rate": 9.832044375366347e-06, "loss": 0.1758, "step": 532 }, { "epoch": 0.08289269051321929, "grad_norm": 1.148175513948538, "learning_rate": 9.831415945483634e-06, "loss": 0.189, "step": 533 }, { "epoch": 0.08304821150855365, "grad_norm": 1.2010115460022994, "learning_rate": 9.830786362274556e-06, "loss": 0.2065, "step": 534 }, { "epoch": 0.08320373250388803, "grad_norm": 1.357353814240526, "learning_rate": 9.830155625889406e-06, "loss": 0.1505, "step": 535 }, { "epoch": 0.0833592534992224, "grad_norm": 1.2541527971168078, "learning_rate": 9.829523736478748e-06, "loss": 0.2309, "step": 536 }, { "epoch": 0.08351477449455677, "grad_norm": 1.0453169347517781, "learning_rate": 9.828890694193425e-06, "loss": 0.1593, "step": 537 }, { "epoch": 0.08367029548989113, "grad_norm": 1.256435896986176, "learning_rate": 9.828256499184553e-06, "loss": 0.2081, "step": 538 }, { "epoch": 0.08382581648522551, "grad_norm": 1.4617851677608784, "learning_rate": 9.827621151603522e-06, "loss": 0.2181, "step": 539 }, { "epoch": 0.08398133748055987, "grad_norm": 2.512069587946666, "learning_rate": 9.826984651601998e-06, "loss": 0.4003, "step": 540 }, { "epoch": 0.08413685847589425, "grad_norm": 1.1168842922612399, "learning_rate": 9.826346999331923e-06, "loss": 0.2823, "step": 541 }, { "epoch": 0.08429237947122861, "grad_norm": 1.0417831453973136, "learning_rate": 9.825708194945514e-06, "loss": 0.1889, "step": 542 }, { "epoch": 0.08444790046656299, "grad_norm": 0.9096481087872343, "learning_rate": 9.82506823859526e-06, "loss": 0.2351, "step": 543 }, { "epoch": 0.08460342146189735, "grad_norm": 1.0430082881953087, "learning_rate": 9.824427130433932e-06, "loss": 0.1953, "step": 544 }, { "epoch": 0.08475894245723173, "grad_norm": 0.6608850743713712, "learning_rate": 9.823784870614568e-06, "loss": 0.1854, "step": 545 }, { "epoch": 0.08491446345256609, "grad_norm": 0.9535990803944258, "learning_rate": 9.823141459290486e-06, "loss": 0.3623, "step": 546 }, { "epoch": 0.08506998444790047, "grad_norm": 1.2084813471627978, "learning_rate": 9.822496896615276e-06, "loss": 0.2088, "step": 547 }, { "epoch": 0.08522550544323483, "grad_norm": 1.751880921507202, "learning_rate": 9.821851182742806e-06, "loss": 0.2367, "step": 548 }, { "epoch": 0.08538102643856921, "grad_norm": 0.859776642879622, "learning_rate": 9.821204317827214e-06, "loss": 0.249, "step": 549 }, { "epoch": 0.08553654743390357, "grad_norm": 1.127529266910784, "learning_rate": 9.820556302022916e-06, "loss": 0.2038, "step": 550 }, { "epoch": 0.08569206842923795, "grad_norm": 1.1762380712487397, "learning_rate": 9.819907135484607e-06, "loss": 0.1408, "step": 551 }, { "epoch": 0.08584758942457232, "grad_norm": 1.1841316789710945, "learning_rate": 9.819256818367247e-06, "loss": 0.1971, "step": 552 }, { "epoch": 0.08600311041990669, "grad_norm": 0.9978225930526609, "learning_rate": 9.818605350826078e-06, "loss": 0.2221, "step": 553 }, { "epoch": 0.08615863141524106, "grad_norm": 1.6694424755142652, "learning_rate": 9.817952733016614e-06, "loss": 0.1549, "step": 554 }, { "epoch": 0.08631415241057543, "grad_norm": 0.9346983450738274, "learning_rate": 9.817298965094644e-06, "loss": 0.1579, "step": 555 }, { "epoch": 0.0864696734059098, "grad_norm": 1.147526345911482, "learning_rate": 9.816644047216231e-06, "loss": 0.1873, "step": 556 }, { "epoch": 0.08662519440124417, "grad_norm": 1.1886850012764587, "learning_rate": 9.815987979537713e-06, "loss": 0.2347, "step": 557 }, { "epoch": 0.08678071539657854, "grad_norm": 1.6793433753087175, "learning_rate": 9.815330762215704e-06, "loss": 0.2773, "step": 558 }, { "epoch": 0.0869362363919129, "grad_norm": 0.7389091927152867, "learning_rate": 9.81467239540709e-06, "loss": 0.2376, "step": 559 }, { "epoch": 0.08709175738724728, "grad_norm": 1.5501383478894555, "learning_rate": 9.814012879269031e-06, "loss": 0.249, "step": 560 }, { "epoch": 0.08724727838258164, "grad_norm": 1.985092307546573, "learning_rate": 9.813352213958966e-06, "loss": 0.2293, "step": 561 }, { "epoch": 0.08740279937791602, "grad_norm": 1.1408911673993625, "learning_rate": 9.812690399634601e-06, "loss": 0.29, "step": 562 }, { "epoch": 0.08755832037325038, "grad_norm": 1.2461126532920535, "learning_rate": 9.812027436453924e-06, "loss": 0.2783, "step": 563 }, { "epoch": 0.08771384136858476, "grad_norm": 1.764223151926025, "learning_rate": 9.81136332457519e-06, "loss": 0.2528, "step": 564 }, { "epoch": 0.08786936236391912, "grad_norm": 1.0618642840366128, "learning_rate": 9.810698064156935e-06, "loss": 0.1723, "step": 565 }, { "epoch": 0.0880248833592535, "grad_norm": 0.8569330765683667, "learning_rate": 9.810031655357964e-06, "loss": 0.2241, "step": 566 }, { "epoch": 0.08818040435458786, "grad_norm": 1.0553303848822568, "learning_rate": 9.80936409833736e-06, "loss": 0.2312, "step": 567 }, { "epoch": 0.08833592534992224, "grad_norm": 1.8702866312005988, "learning_rate": 9.808695393254474e-06, "loss": 0.1949, "step": 568 }, { "epoch": 0.08849144634525662, "grad_norm": 0.9476538253542002, "learning_rate": 9.808025540268939e-06, "loss": 0.1783, "step": 569 }, { "epoch": 0.08864696734059098, "grad_norm": 1.4661601306937122, "learning_rate": 9.80735453954066e-06, "loss": 0.2941, "step": 570 }, { "epoch": 0.08880248833592536, "grad_norm": 1.1865752816456114, "learning_rate": 9.80668239122981e-06, "loss": 0.2196, "step": 571 }, { "epoch": 0.08895800933125972, "grad_norm": 0.9682721759722641, "learning_rate": 9.80600909549684e-06, "loss": 0.2453, "step": 572 }, { "epoch": 0.0891135303265941, "grad_norm": 1.0402552655035497, "learning_rate": 9.805334652502478e-06, "loss": 0.2528, "step": 573 }, { "epoch": 0.08926905132192846, "grad_norm": 1.1058208608284787, "learning_rate": 9.804659062407721e-06, "loss": 0.1704, "step": 574 }, { "epoch": 0.08942457231726283, "grad_norm": 0.9300562072054855, "learning_rate": 9.803982325373843e-06, "loss": 0.241, "step": 575 }, { "epoch": 0.0895800933125972, "grad_norm": 1.3452145435832572, "learning_rate": 9.803304441562391e-06, "loss": 0.179, "step": 576 }, { "epoch": 0.08973561430793157, "grad_norm": 0.934714522466104, "learning_rate": 9.802625411135183e-06, "loss": 0.2131, "step": 577 }, { "epoch": 0.08989113530326594, "grad_norm": 1.2723518042915498, "learning_rate": 9.801945234254315e-06, "loss": 0.2342, "step": 578 }, { "epoch": 0.09004665629860031, "grad_norm": 2.11692073632197, "learning_rate": 9.801263911082154e-06, "loss": 0.2148, "step": 579 }, { "epoch": 0.09020217729393468, "grad_norm": 2.6365326907523396, "learning_rate": 9.800581441781342e-06, "loss": 0.2787, "step": 580 }, { "epoch": 0.09035769828926905, "grad_norm": 1.3369047254369875, "learning_rate": 9.799897826514793e-06, "loss": 0.2365, "step": 581 }, { "epoch": 0.09051321928460342, "grad_norm": 0.9493060685693816, "learning_rate": 9.799213065445696e-06, "loss": 0.1656, "step": 582 }, { "epoch": 0.09066874027993779, "grad_norm": 1.0470819909783555, "learning_rate": 9.798527158737512e-06, "loss": 0.1578, "step": 583 }, { "epoch": 0.09082426127527216, "grad_norm": 1.0969444747176942, "learning_rate": 9.797840106553977e-06, "loss": 0.2095, "step": 584 }, { "epoch": 0.09097978227060653, "grad_norm": 1.6035875172395766, "learning_rate": 9.797151909059102e-06, "loss": 0.2682, "step": 585 }, { "epoch": 0.0911353032659409, "grad_norm": 1.3049640527657593, "learning_rate": 9.796462566417169e-06, "loss": 0.2537, "step": 586 }, { "epoch": 0.09129082426127527, "grad_norm": 1.365745492042764, "learning_rate": 9.79577207879273e-06, "loss": 0.2065, "step": 587 }, { "epoch": 0.09144634525660965, "grad_norm": 0.9500261347653985, "learning_rate": 9.795080446350616e-06, "loss": 0.1885, "step": 588 }, { "epoch": 0.09160186625194401, "grad_norm": 1.5405453397493063, "learning_rate": 9.79438766925593e-06, "loss": 0.2507, "step": 589 }, { "epoch": 0.09175738724727839, "grad_norm": 0.9919977440587929, "learning_rate": 9.79369374767405e-06, "loss": 0.1607, "step": 590 }, { "epoch": 0.09191290824261275, "grad_norm": 1.2052697190695243, "learning_rate": 9.79299868177062e-06, "loss": 0.2247, "step": 591 }, { "epoch": 0.09206842923794713, "grad_norm": 1.5911347684916193, "learning_rate": 9.792302471711564e-06, "loss": 0.1812, "step": 592 }, { "epoch": 0.09222395023328149, "grad_norm": 1.3772469912987155, "learning_rate": 9.791605117663076e-06, "loss": 0.1567, "step": 593 }, { "epoch": 0.09237947122861587, "grad_norm": 1.456752513640415, "learning_rate": 9.790906619791627e-06, "loss": 0.2009, "step": 594 }, { "epoch": 0.09253499222395023, "grad_norm": 0.9824754188966437, "learning_rate": 9.790206978263955e-06, "loss": 0.2041, "step": 595 }, { "epoch": 0.0926905132192846, "grad_norm": 1.1576177882724517, "learning_rate": 9.789506193247075e-06, "loss": 0.2304, "step": 596 }, { "epoch": 0.09284603421461897, "grad_norm": 1.3814578099918997, "learning_rate": 9.788804264908276e-06, "loss": 0.1935, "step": 597 }, { "epoch": 0.09300155520995335, "grad_norm": 0.8483069008778095, "learning_rate": 9.788101193415116e-06, "loss": 0.2148, "step": 598 }, { "epoch": 0.09315707620528771, "grad_norm": 1.3477202886979611, "learning_rate": 9.787396978935431e-06, "loss": 0.23, "step": 599 }, { "epoch": 0.09331259720062209, "grad_norm": 1.4372703771133322, "learning_rate": 9.786691621637322e-06, "loss": 0.2496, "step": 600 }, { "epoch": 0.09331259720062209, "eval_loss": 0.22426502406597137, "eval_runtime": 9.4405, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 600 }, { "epoch": 0.09346811819595645, "grad_norm": 1.9416761367047068, "learning_rate": 9.785985121689171e-06, "loss": 0.6927, "step": 601 }, { "epoch": 0.09362363919129083, "grad_norm": 1.268764907148312, "learning_rate": 9.785277479259629e-06, "loss": 0.2501, "step": 602 }, { "epoch": 0.09377916018662519, "grad_norm": 2.3273439454641514, "learning_rate": 9.784568694517618e-06, "loss": 0.2469, "step": 603 }, { "epoch": 0.09393468118195956, "grad_norm": 1.8747313801721335, "learning_rate": 9.783858767632338e-06, "loss": 0.2289, "step": 604 }, { "epoch": 0.09409020217729394, "grad_norm": 1.2586569667037595, "learning_rate": 9.783147698773257e-06, "loss": 0.1962, "step": 605 }, { "epoch": 0.0942457231726283, "grad_norm": 1.496316694651238, "learning_rate": 9.782435488110116e-06, "loss": 0.298, "step": 606 }, { "epoch": 0.09440124416796268, "grad_norm": 1.1801510466185432, "learning_rate": 9.781722135812932e-06, "loss": 0.2189, "step": 607 }, { "epoch": 0.09455676516329704, "grad_norm": 1.2565748248573585, "learning_rate": 9.78100764205199e-06, "loss": 0.2186, "step": 608 }, { "epoch": 0.09471228615863142, "grad_norm": 0.9391168040034623, "learning_rate": 9.780292006997849e-06, "loss": 0.2144, "step": 609 }, { "epoch": 0.09486780715396578, "grad_norm": 1.1387381134081225, "learning_rate": 9.779575230821344e-06, "loss": 0.1718, "step": 610 }, { "epoch": 0.09502332814930016, "grad_norm": 1.0177855034745955, "learning_rate": 9.778857313693578e-06, "loss": 0.1586, "step": 611 }, { "epoch": 0.09517884914463452, "grad_norm": 1.4624255805438011, "learning_rate": 9.778138255785928e-06, "loss": 0.2697, "step": 612 }, { "epoch": 0.0953343701399689, "grad_norm": 1.0796167649791846, "learning_rate": 9.77741805727004e-06, "loss": 0.2668, "step": 613 }, { "epoch": 0.09548989113530326, "grad_norm": 2.1747859377128806, "learning_rate": 9.776696718317842e-06, "loss": 0.2117, "step": 614 }, { "epoch": 0.09564541213063764, "grad_norm": 1.6173977205310859, "learning_rate": 9.775974239101522e-06, "loss": 0.2048, "step": 615 }, { "epoch": 0.095800933125972, "grad_norm": 1.281075534048029, "learning_rate": 9.775250619793548e-06, "loss": 0.2218, "step": 616 }, { "epoch": 0.09595645412130638, "grad_norm": 1.5623409338338163, "learning_rate": 9.77452586056666e-06, "loss": 0.2843, "step": 617 }, { "epoch": 0.09611197511664074, "grad_norm": 0.920135780872905, "learning_rate": 9.773799961593862e-06, "loss": 0.218, "step": 618 }, { "epoch": 0.09626749611197512, "grad_norm": 1.6644765009913491, "learning_rate": 9.773072923048443e-06, "loss": 0.277, "step": 619 }, { "epoch": 0.09642301710730948, "grad_norm": 1.0758387537045102, "learning_rate": 9.772344745103955e-06, "loss": 0.2405, "step": 620 }, { "epoch": 0.09657853810264386, "grad_norm": 1.1751354263981124, "learning_rate": 9.77161542793422e-06, "loss": 0.2362, "step": 621 }, { "epoch": 0.09673405909797823, "grad_norm": 2.7957127911749655, "learning_rate": 9.770884971713344e-06, "loss": 0.178, "step": 622 }, { "epoch": 0.0968895800933126, "grad_norm": 5.021758252286217, "learning_rate": 9.770153376615692e-06, "loss": 0.2095, "step": 623 }, { "epoch": 0.09704510108864697, "grad_norm": 0.8518883317455118, "learning_rate": 9.769420642815905e-06, "loss": 0.2174, "step": 624 }, { "epoch": 0.09720062208398134, "grad_norm": 1.0603512343033086, "learning_rate": 9.7686867704889e-06, "loss": 0.2437, "step": 625 }, { "epoch": 0.09735614307931571, "grad_norm": 2.7767054670419067, "learning_rate": 9.767951759809861e-06, "loss": 0.3072, "step": 626 }, { "epoch": 0.09751166407465008, "grad_norm": 0.875830402681162, "learning_rate": 9.767215610954246e-06, "loss": 0.1865, "step": 627 }, { "epoch": 0.09766718506998445, "grad_norm": 1.1746324049289305, "learning_rate": 9.766478324097784e-06, "loss": 0.1775, "step": 628 }, { "epoch": 0.09782270606531882, "grad_norm": 1.3198405804921558, "learning_rate": 9.765739899416474e-06, "loss": 0.2202, "step": 629 }, { "epoch": 0.09797822706065319, "grad_norm": 0.9040537149469751, "learning_rate": 9.76500033708659e-06, "loss": 0.134, "step": 630 }, { "epoch": 0.09813374805598755, "grad_norm": 1.1116680855923542, "learning_rate": 9.764259637284674e-06, "loss": 0.2413, "step": 631 }, { "epoch": 0.09828926905132193, "grad_norm": 1.816511140625042, "learning_rate": 9.763517800187543e-06, "loss": 0.1881, "step": 632 }, { "epoch": 0.0984447900466563, "grad_norm": 1.1808179637924803, "learning_rate": 9.762774825972284e-06, "loss": 0.1797, "step": 633 }, { "epoch": 0.09860031104199067, "grad_norm": 0.9260180174403776, "learning_rate": 9.762030714816255e-06, "loss": 0.1692, "step": 634 }, { "epoch": 0.09875583203732503, "grad_norm": 0.9809663827224766, "learning_rate": 9.761285466897086e-06, "loss": 0.1971, "step": 635 }, { "epoch": 0.09891135303265941, "grad_norm": 1.1818951833176021, "learning_rate": 9.760539082392678e-06, "loss": 0.3061, "step": 636 }, { "epoch": 0.09906687402799377, "grad_norm": 1.5126562950843534, "learning_rate": 9.759791561481201e-06, "loss": 0.2214, "step": 637 }, { "epoch": 0.09922239502332815, "grad_norm": 1.1563368410762391, "learning_rate": 9.759042904341103e-06, "loss": 0.1879, "step": 638 }, { "epoch": 0.09937791601866251, "grad_norm": 1.7465834025848672, "learning_rate": 9.758293111151094e-06, "loss": 0.2936, "step": 639 }, { "epoch": 0.09953343701399689, "grad_norm": 1.4420901394687415, "learning_rate": 9.757542182090165e-06, "loss": 0.1977, "step": 640 }, { "epoch": 0.09968895800933127, "grad_norm": 1.4320029014579423, "learning_rate": 9.756790117337569e-06, "loss": 0.235, "step": 641 }, { "epoch": 0.09984447900466563, "grad_norm": 1.0178157213981396, "learning_rate": 9.756036917072837e-06, "loss": 0.228, "step": 642 }, { "epoch": 0.1, "grad_norm": 1.634337451034447, "learning_rate": 9.755282581475769e-06, "loss": 0.174, "step": 643 }, { "epoch": 0.10015552099533437, "grad_norm": 1.3123622467109133, "learning_rate": 9.754527110726432e-06, "loss": 0.1854, "step": 644 }, { "epoch": 0.10031104199066875, "grad_norm": 1.3700959071130703, "learning_rate": 9.753770505005171e-06, "loss": 0.271, "step": 645 }, { "epoch": 0.10046656298600311, "grad_norm": 1.5589446061903662, "learning_rate": 9.753012764492596e-06, "loss": 0.1669, "step": 646 }, { "epoch": 0.10062208398133748, "grad_norm": 1.3813884723817376, "learning_rate": 9.752253889369592e-06, "loss": 0.1525, "step": 647 }, { "epoch": 0.10077760497667185, "grad_norm": 1.3858844961504873, "learning_rate": 9.75149387981731e-06, "loss": 0.2673, "step": 648 }, { "epoch": 0.10093312597200622, "grad_norm": 0.9436000404569762, "learning_rate": 9.75073273601718e-06, "loss": 0.2058, "step": 649 }, { "epoch": 0.10108864696734059, "grad_norm": 1.4599521330072638, "learning_rate": 9.749970458150893e-06, "loss": 0.2145, "step": 650 }, { "epoch": 0.10124416796267496, "grad_norm": 1.3455835009343615, "learning_rate": 9.749207046400415e-06, "loss": 0.2353, "step": 651 }, { "epoch": 0.10139968895800933, "grad_norm": 1.6299219848605395, "learning_rate": 9.748442500947988e-06, "loss": 0.2582, "step": 652 }, { "epoch": 0.1015552099533437, "grad_norm": 2.1538893724554966, "learning_rate": 9.747676821976116e-06, "loss": 0.2128, "step": 653 }, { "epoch": 0.10171073094867807, "grad_norm": 1.1642628978054306, "learning_rate": 9.746910009667577e-06, "loss": 0.2092, "step": 654 }, { "epoch": 0.10186625194401244, "grad_norm": 0.9776673463806724, "learning_rate": 9.746142064205422e-06, "loss": 0.176, "step": 655 }, { "epoch": 0.1020217729393468, "grad_norm": 1.350687490540933, "learning_rate": 9.745372985772968e-06, "loss": 0.2426, "step": 656 }, { "epoch": 0.10217729393468118, "grad_norm": 1.7681295289484116, "learning_rate": 9.744602774553807e-06, "loss": 0.2204, "step": 657 }, { "epoch": 0.10233281493001556, "grad_norm": 0.9199423619051535, "learning_rate": 9.743831430731796e-06, "loss": 0.1647, "step": 658 }, { "epoch": 0.10248833592534992, "grad_norm": 5.138426947168042, "learning_rate": 9.743058954491067e-06, "loss": 0.2107, "step": 659 }, { "epoch": 0.1026438569206843, "grad_norm": 1.446510693113484, "learning_rate": 9.742285346016024e-06, "loss": 0.2379, "step": 660 }, { "epoch": 0.10279937791601866, "grad_norm": 1.4833539837619547, "learning_rate": 9.741510605491335e-06, "loss": 0.1714, "step": 661 }, { "epoch": 0.10295489891135304, "grad_norm": 1.3228899574182327, "learning_rate": 9.74073473310194e-06, "loss": 0.2388, "step": 662 }, { "epoch": 0.1031104199066874, "grad_norm": 1.0712502633957945, "learning_rate": 9.739957729033054e-06, "loss": 0.2289, "step": 663 }, { "epoch": 0.10326594090202178, "grad_norm": 1.1587775220461487, "learning_rate": 9.739179593470156e-06, "loss": 0.1741, "step": 664 }, { "epoch": 0.10342146189735614, "grad_norm": 1.0260279383302884, "learning_rate": 9.738400326599e-06, "loss": 0.2412, "step": 665 }, { "epoch": 0.10357698289269052, "grad_norm": 1.491042707966078, "learning_rate": 9.737619928605605e-06, "loss": 0.1833, "step": 666 }, { "epoch": 0.10373250388802488, "grad_norm": 1.6710832262506907, "learning_rate": 9.736838399676266e-06, "loss": 0.1712, "step": 667 }, { "epoch": 0.10388802488335926, "grad_norm": 1.4001413138925893, "learning_rate": 9.736055739997543e-06, "loss": 0.2739, "step": 668 }, { "epoch": 0.10404354587869362, "grad_norm": 1.0413982567358797, "learning_rate": 9.735271949756269e-06, "loss": 0.1655, "step": 669 }, { "epoch": 0.104199066874028, "grad_norm": 2.062452927969995, "learning_rate": 9.734487029139544e-06, "loss": 0.2384, "step": 670 }, { "epoch": 0.10435458786936236, "grad_norm": 1.1419346714711909, "learning_rate": 9.733700978334741e-06, "loss": 0.2176, "step": 671 }, { "epoch": 0.10451010886469674, "grad_norm": 1.4704145498498906, "learning_rate": 9.7329137975295e-06, "loss": 0.2281, "step": 672 }, { "epoch": 0.1046656298600311, "grad_norm": 1.7257595787120843, "learning_rate": 9.732125486911733e-06, "loss": 0.1964, "step": 673 }, { "epoch": 0.10482115085536547, "grad_norm": 1.596182048450316, "learning_rate": 9.731336046669621e-06, "loss": 0.1863, "step": 674 }, { "epoch": 0.10497667185069985, "grad_norm": 1.741565962255971, "learning_rate": 9.730545476991613e-06, "loss": 0.1358, "step": 675 }, { "epoch": 0.10513219284603421, "grad_norm": 1.2105023861624677, "learning_rate": 9.729753778066431e-06, "loss": 0.2757, "step": 676 }, { "epoch": 0.10528771384136859, "grad_norm": 1.1483441998296096, "learning_rate": 9.728960950083062e-06, "loss": 0.2327, "step": 677 }, { "epoch": 0.10544323483670295, "grad_norm": 2.6827889453865255, "learning_rate": 9.728166993230768e-06, "loss": 0.2841, "step": 678 }, { "epoch": 0.10559875583203733, "grad_norm": 1.3531013447523792, "learning_rate": 9.727371907699075e-06, "loss": 0.2742, "step": 679 }, { "epoch": 0.1057542768273717, "grad_norm": 1.4165422039945663, "learning_rate": 9.726575693677782e-06, "loss": 0.1733, "step": 680 }, { "epoch": 0.10590979782270607, "grad_norm": 1.1633994693280907, "learning_rate": 9.725778351356958e-06, "loss": 0.1752, "step": 681 }, { "epoch": 0.10606531881804043, "grad_norm": 1.4801298044861129, "learning_rate": 9.724979880926937e-06, "loss": 0.1654, "step": 682 }, { "epoch": 0.10622083981337481, "grad_norm": 1.038476254792903, "learning_rate": 9.724180282578327e-06, "loss": 0.1796, "step": 683 }, { "epoch": 0.10637636080870917, "grad_norm": 1.1715546692057253, "learning_rate": 9.723379556502002e-06, "loss": 0.2615, "step": 684 }, { "epoch": 0.10653188180404355, "grad_norm": 0.9669903775949065, "learning_rate": 9.722577702889106e-06, "loss": 0.2217, "step": 685 }, { "epoch": 0.10668740279937791, "grad_norm": 0.9554324370526551, "learning_rate": 9.721774721931056e-06, "loss": 0.2067, "step": 686 }, { "epoch": 0.10684292379471229, "grad_norm": 1.5055382554521828, "learning_rate": 9.720970613819532e-06, "loss": 0.2886, "step": 687 }, { "epoch": 0.10699844479004665, "grad_norm": 1.4701983051316598, "learning_rate": 9.720165378746486e-06, "loss": 0.2461, "step": 688 }, { "epoch": 0.10715396578538103, "grad_norm": 0.8955915121278603, "learning_rate": 9.719359016904137e-06, "loss": 0.1296, "step": 689 }, { "epoch": 0.10730948678071539, "grad_norm": 1.1365940197104127, "learning_rate": 9.718551528484979e-06, "loss": 0.1756, "step": 690 }, { "epoch": 0.10746500777604977, "grad_norm": 1.1309854500820393, "learning_rate": 9.717742913681769e-06, "loss": 0.1685, "step": 691 }, { "epoch": 0.10762052877138413, "grad_norm": 1.228647590848163, "learning_rate": 9.716933172687533e-06, "loss": 0.1988, "step": 692 }, { "epoch": 0.1077760497667185, "grad_norm": 1.8437087557242553, "learning_rate": 9.71612230569557e-06, "loss": 0.2259, "step": 693 }, { "epoch": 0.10793157076205288, "grad_norm": 2.190128145243616, "learning_rate": 9.715310312899445e-06, "loss": 0.1593, "step": 694 }, { "epoch": 0.10808709175738725, "grad_norm": 1.9542747095305757, "learning_rate": 9.714497194492988e-06, "loss": 0.1942, "step": 695 }, { "epoch": 0.10824261275272162, "grad_norm": 1.190017072453523, "learning_rate": 9.713682950670305e-06, "loss": 0.184, "step": 696 }, { "epoch": 0.10839813374805599, "grad_norm": 1.3702585397170965, "learning_rate": 9.712867581625769e-06, "loss": 0.2747, "step": 697 }, { "epoch": 0.10855365474339036, "grad_norm": 1.1224607857205071, "learning_rate": 9.712051087554017e-06, "loss": 0.1851, "step": 698 }, { "epoch": 0.10870917573872473, "grad_norm": 1.1610995749820388, "learning_rate": 9.711233468649958e-06, "loss": 0.1651, "step": 699 }, { "epoch": 0.1088646967340591, "grad_norm": 1.0713548580433974, "learning_rate": 9.710414725108771e-06, "loss": 0.2798, "step": 700 }, { "epoch": 0.1088646967340591, "eval_loss": 0.2192843109369278, "eval_runtime": 9.4454, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.741, "step": 700 }, { "epoch": 0.10902021772939346, "grad_norm": 1.086974338576193, "learning_rate": 9.709594857125898e-06, "loss": 0.3235, "step": 701 }, { "epoch": 0.10917573872472784, "grad_norm": 3.455927159294357, "learning_rate": 9.708773864897059e-06, "loss": 0.1502, "step": 702 }, { "epoch": 0.1093312597200622, "grad_norm": 1.6070730415734276, "learning_rate": 9.707951748618229e-06, "loss": 0.2652, "step": 703 }, { "epoch": 0.10948678071539658, "grad_norm": 1.0297377958380671, "learning_rate": 9.707128508485663e-06, "loss": 0.2352, "step": 704 }, { "epoch": 0.10964230171073094, "grad_norm": 1.07292209906991, "learning_rate": 9.706304144695877e-06, "loss": 0.1471, "step": 705 }, { "epoch": 0.10979782270606532, "grad_norm": 1.2095547877752455, "learning_rate": 9.705478657445661e-06, "loss": 0.2107, "step": 706 }, { "epoch": 0.10995334370139968, "grad_norm": 1.307669146215221, "learning_rate": 9.70465204693207e-06, "loss": 0.2337, "step": 707 }, { "epoch": 0.11010886469673406, "grad_norm": 0.8004125116368356, "learning_rate": 9.703824313352428e-06, "loss": 0.2042, "step": 708 }, { "epoch": 0.11026438569206842, "grad_norm": 1.5202724190274493, "learning_rate": 9.702995456904323e-06, "loss": 0.2446, "step": 709 }, { "epoch": 0.1104199066874028, "grad_norm": 1.3109419274601464, "learning_rate": 9.702165477785618e-06, "loss": 0.2791, "step": 710 }, { "epoch": 0.11057542768273718, "grad_norm": 1.2175779348655416, "learning_rate": 9.70133437619444e-06, "loss": 0.2787, "step": 711 }, { "epoch": 0.11073094867807154, "grad_norm": 2.4619987863193824, "learning_rate": 9.700502152329182e-06, "loss": 0.2184, "step": 712 }, { "epoch": 0.11088646967340592, "grad_norm": 1.1204962171981678, "learning_rate": 9.69966880638851e-06, "loss": 0.1796, "step": 713 }, { "epoch": 0.11104199066874028, "grad_norm": 1.3460375672771012, "learning_rate": 9.698834338571355e-06, "loss": 0.1536, "step": 714 }, { "epoch": 0.11119751166407466, "grad_norm": 1.4551247859245915, "learning_rate": 9.697998749076916e-06, "loss": 0.1775, "step": 715 }, { "epoch": 0.11135303265940902, "grad_norm": 1.64865769787968, "learning_rate": 9.69716203810466e-06, "loss": 0.2341, "step": 716 }, { "epoch": 0.1115085536547434, "grad_norm": 1.8250018792840808, "learning_rate": 9.696324205854322e-06, "loss": 0.2058, "step": 717 }, { "epoch": 0.11166407465007776, "grad_norm": 1.067050937242904, "learning_rate": 9.695485252525902e-06, "loss": 0.1463, "step": 718 }, { "epoch": 0.11181959564541213, "grad_norm": 2.2821274396758127, "learning_rate": 9.694645178319673e-06, "loss": 0.2508, "step": 719 }, { "epoch": 0.1119751166407465, "grad_norm": 1.388014808020173, "learning_rate": 9.69380398343617e-06, "loss": 0.1977, "step": 720 }, { "epoch": 0.11213063763608087, "grad_norm": 1.5658859493501038, "learning_rate": 9.692961668076197e-06, "loss": 0.2291, "step": 721 }, { "epoch": 0.11228615863141524, "grad_norm": 1.0853791710998715, "learning_rate": 9.69211823244083e-06, "loss": 0.2763, "step": 722 }, { "epoch": 0.11244167962674961, "grad_norm": 1.27256020581809, "learning_rate": 9.691273676731408e-06, "loss": 0.195, "step": 723 }, { "epoch": 0.11259720062208398, "grad_norm": 0.6768405188507002, "learning_rate": 9.690428001149537e-06, "loss": 0.1839, "step": 724 }, { "epoch": 0.11275272161741835, "grad_norm": 3.309861478677342, "learning_rate": 9.68958120589709e-06, "loss": 0.1446, "step": 725 }, { "epoch": 0.11290824261275272, "grad_norm": 1.3577561463931358, "learning_rate": 9.688733291176211e-06, "loss": 0.174, "step": 726 }, { "epoch": 0.11306376360808709, "grad_norm": 0.7899130738957459, "learning_rate": 9.68788425718931e-06, "loss": 0.1819, "step": 727 }, { "epoch": 0.11321928460342146, "grad_norm": 1.9374468863177388, "learning_rate": 9.68703410413906e-06, "loss": 0.2148, "step": 728 }, { "epoch": 0.11337480559875583, "grad_norm": 0.9790173123360771, "learning_rate": 9.686182832228408e-06, "loss": 0.1842, "step": 729 }, { "epoch": 0.11353032659409021, "grad_norm": 1.8838507925348544, "learning_rate": 9.685330441660564e-06, "loss": 0.2482, "step": 730 }, { "epoch": 0.11368584758942457, "grad_norm": 1.7209011423931209, "learning_rate": 9.684476932639002e-06, "loss": 0.1938, "step": 731 }, { "epoch": 0.11384136858475895, "grad_norm": 1.3133247484457822, "learning_rate": 9.68362230536747e-06, "loss": 0.1629, "step": 732 }, { "epoch": 0.11399688958009331, "grad_norm": 1.4346328630835792, "learning_rate": 9.682766560049979e-06, "loss": 0.2393, "step": 733 }, { "epoch": 0.11415241057542769, "grad_norm": 1.416880965769396, "learning_rate": 9.681909696890805e-06, "loss": 0.2149, "step": 734 }, { "epoch": 0.11430793157076205, "grad_norm": 1.3604331981225013, "learning_rate": 9.681051716094497e-06, "loss": 0.2116, "step": 735 }, { "epoch": 0.11446345256609643, "grad_norm": 1.370682231566524, "learning_rate": 9.680192617865862e-06, "loss": 0.1574, "step": 736 }, { "epoch": 0.11461897356143079, "grad_norm": 3.11697026931608, "learning_rate": 9.679332402409983e-06, "loss": 0.1659, "step": 737 }, { "epoch": 0.11477449455676517, "grad_norm": 1.0795485204091093, "learning_rate": 9.678471069932205e-06, "loss": 0.1843, "step": 738 }, { "epoch": 0.11493001555209953, "grad_norm": 1.089003737321956, "learning_rate": 9.677608620638138e-06, "loss": 0.1289, "step": 739 }, { "epoch": 0.1150855365474339, "grad_norm": 1.9816825572482675, "learning_rate": 9.676745054733661e-06, "loss": 0.183, "step": 740 }, { "epoch": 0.11524105754276827, "grad_norm": 4.608323882578619, "learning_rate": 9.675880372424922e-06, "loss": 0.1797, "step": 741 }, { "epoch": 0.11539657853810265, "grad_norm": 0.9751878331403108, "learning_rate": 9.675014573918328e-06, "loss": 0.2649, "step": 742 }, { "epoch": 0.11555209953343701, "grad_norm": 0.913137520804308, "learning_rate": 9.67414765942056e-06, "loss": 0.1229, "step": 743 }, { "epoch": 0.11570762052877138, "grad_norm": 1.1182409613228717, "learning_rate": 9.673279629138565e-06, "loss": 0.1554, "step": 744 }, { "epoch": 0.11586314152410575, "grad_norm": 2.425925853364065, "learning_rate": 9.67241048327955e-06, "loss": 0.2414, "step": 745 }, { "epoch": 0.11601866251944012, "grad_norm": 2.1643434151507024, "learning_rate": 9.671540222050995e-06, "loss": 0.2402, "step": 746 }, { "epoch": 0.1161741835147745, "grad_norm": 1.1869224601016288, "learning_rate": 9.67066884566064e-06, "loss": 0.225, "step": 747 }, { "epoch": 0.11632970451010886, "grad_norm": 1.1850496858694712, "learning_rate": 9.669796354316497e-06, "loss": 0.1732, "step": 748 }, { "epoch": 0.11648522550544324, "grad_norm": 1.083880428656249, "learning_rate": 9.668922748226842e-06, "loss": 0.2256, "step": 749 }, { "epoch": 0.1166407465007776, "grad_norm": 0.9290306352610638, "learning_rate": 9.668048027600217e-06, "loss": 0.1814, "step": 750 }, { "epoch": 0.11679626749611198, "grad_norm": 1.1985316233321583, "learning_rate": 9.66717219264543e-06, "loss": 0.2646, "step": 751 }, { "epoch": 0.11695178849144634, "grad_norm": 1.5752976014862634, "learning_rate": 9.666295243571553e-06, "loss": 0.2212, "step": 752 }, { "epoch": 0.11710730948678072, "grad_norm": 1.554593030529623, "learning_rate": 9.665417180587928e-06, "loss": 0.2008, "step": 753 }, { "epoch": 0.11726283048211508, "grad_norm": 1.802147426905897, "learning_rate": 9.664538003904162e-06, "loss": 0.1694, "step": 754 }, { "epoch": 0.11741835147744946, "grad_norm": 1.117253074112765, "learning_rate": 9.663657713730123e-06, "loss": 0.1769, "step": 755 }, { "epoch": 0.11757387247278382, "grad_norm": 1.2713208371120763, "learning_rate": 9.662776310275954e-06, "loss": 0.3356, "step": 756 }, { "epoch": 0.1177293934681182, "grad_norm": 1.5049877808240208, "learning_rate": 9.661893793752053e-06, "loss": 0.2156, "step": 757 }, { "epoch": 0.11788491446345256, "grad_norm": 1.3646831264890733, "learning_rate": 9.661010164369092e-06, "loss": 0.2077, "step": 758 }, { "epoch": 0.11804043545878694, "grad_norm": 1.2057674637964264, "learning_rate": 9.660125422338003e-06, "loss": 0.234, "step": 759 }, { "epoch": 0.1181959564541213, "grad_norm": 1.7059599899477969, "learning_rate": 9.659239567869989e-06, "loss": 0.2019, "step": 760 }, { "epoch": 0.11835147744945568, "grad_norm": 1.359054263386884, "learning_rate": 9.658352601176514e-06, "loss": 0.2263, "step": 761 }, { "epoch": 0.11850699844479004, "grad_norm": 1.4779502971821263, "learning_rate": 9.65746452246931e-06, "loss": 0.229, "step": 762 }, { "epoch": 0.11866251944012442, "grad_norm": 1.2106031530437371, "learning_rate": 9.656575331960376e-06, "loss": 0.2075, "step": 763 }, { "epoch": 0.1188180404354588, "grad_norm": 1.5750869920441555, "learning_rate": 9.655685029861969e-06, "loss": 0.2103, "step": 764 }, { "epoch": 0.11897356143079316, "grad_norm": 1.328300416339256, "learning_rate": 9.654793616386621e-06, "loss": 0.1822, "step": 765 }, { "epoch": 0.11912908242612753, "grad_norm": 2.218866258760128, "learning_rate": 9.653901091747124e-06, "loss": 0.1909, "step": 766 }, { "epoch": 0.1192846034214619, "grad_norm": 1.8622051312400103, "learning_rate": 9.653007456156536e-06, "loss": 0.2241, "step": 767 }, { "epoch": 0.11944012441679627, "grad_norm": 1.3832228672336278, "learning_rate": 9.652112709828179e-06, "loss": 0.2256, "step": 768 }, { "epoch": 0.11959564541213064, "grad_norm": 1.0673171707909481, "learning_rate": 9.651216852975643e-06, "loss": 0.1959, "step": 769 }, { "epoch": 0.11975116640746501, "grad_norm": 1.3393619429463375, "learning_rate": 9.650319885812777e-06, "loss": 0.2727, "step": 770 }, { "epoch": 0.11990668740279938, "grad_norm": 1.0882111784771522, "learning_rate": 9.649421808553708e-06, "loss": 0.2259, "step": 771 }, { "epoch": 0.12006220839813375, "grad_norm": 4.447919742603164, "learning_rate": 9.648522621412812e-06, "loss": 0.231, "step": 772 }, { "epoch": 0.12021772939346811, "grad_norm": 1.5176403638597071, "learning_rate": 9.647622324604742e-06, "loss": 0.2824, "step": 773 }, { "epoch": 0.12037325038880249, "grad_norm": 1.7576074795768224, "learning_rate": 9.646720918344409e-06, "loss": 0.2034, "step": 774 }, { "epoch": 0.12052877138413685, "grad_norm": 1.5792838723378395, "learning_rate": 9.645818402846992e-06, "loss": 0.1677, "step": 775 }, { "epoch": 0.12068429237947123, "grad_norm": 1.0405000433648128, "learning_rate": 9.644914778327935e-06, "loss": 0.1742, "step": 776 }, { "epoch": 0.1208398133748056, "grad_norm": 1.545200668981177, "learning_rate": 9.644010045002942e-06, "loss": 0.215, "step": 777 }, { "epoch": 0.12099533437013997, "grad_norm": 1.203039484308954, "learning_rate": 9.64310420308799e-06, "loss": 0.1997, "step": 778 }, { "epoch": 0.12115085536547433, "grad_norm": 1.038062251460105, "learning_rate": 9.642197252799315e-06, "loss": 0.2001, "step": 779 }, { "epoch": 0.12130637636080871, "grad_norm": 1.3963430783849184, "learning_rate": 9.641289194353418e-06, "loss": 0.2034, "step": 780 }, { "epoch": 0.12146189735614307, "grad_norm": 1.7069918759015217, "learning_rate": 9.640380027967065e-06, "loss": 0.1763, "step": 781 }, { "epoch": 0.12161741835147745, "grad_norm": 1.1485309219449071, "learning_rate": 9.639469753857287e-06, "loss": 0.1946, "step": 782 }, { "epoch": 0.12177293934681183, "grad_norm": 0.9976269624811838, "learning_rate": 9.63855837224138e-06, "loss": 0.1797, "step": 783 }, { "epoch": 0.12192846034214619, "grad_norm": 1.413148682632424, "learning_rate": 9.6376458833369e-06, "loss": 0.1873, "step": 784 }, { "epoch": 0.12208398133748057, "grad_norm": 1.287068701523726, "learning_rate": 9.636732287361675e-06, "loss": 0.1964, "step": 785 }, { "epoch": 0.12223950233281493, "grad_norm": 1.338092957612231, "learning_rate": 9.635817584533791e-06, "loss": 0.2353, "step": 786 }, { "epoch": 0.1223950233281493, "grad_norm": 1.018985176065171, "learning_rate": 9.6349017750716e-06, "loss": 0.243, "step": 787 }, { "epoch": 0.12255054432348367, "grad_norm": 1.434405666961768, "learning_rate": 9.633984859193722e-06, "loss": 0.1622, "step": 788 }, { "epoch": 0.12270606531881804, "grad_norm": 1.2392900109261706, "learning_rate": 9.633066837119034e-06, "loss": 0.2223, "step": 789 }, { "epoch": 0.12286158631415241, "grad_norm": 0.9045673894396051, "learning_rate": 9.632147709066682e-06, "loss": 0.2079, "step": 790 }, { "epoch": 0.12301710730948678, "grad_norm": 1.14443309047443, "learning_rate": 9.631227475256072e-06, "loss": 0.1611, "step": 791 }, { "epoch": 0.12317262830482115, "grad_norm": 1.1564291271253233, "learning_rate": 9.630306135906882e-06, "loss": 0.1918, "step": 792 }, { "epoch": 0.12332814930015552, "grad_norm": 2.1831582412646138, "learning_rate": 9.629383691239043e-06, "loss": 0.3687, "step": 793 }, { "epoch": 0.12348367029548989, "grad_norm": 1.0115623861000755, "learning_rate": 9.628460141472759e-06, "loss": 0.1589, "step": 794 }, { "epoch": 0.12363919129082426, "grad_norm": 0.8936049036056027, "learning_rate": 9.627535486828491e-06, "loss": 0.1775, "step": 795 }, { "epoch": 0.12379471228615863, "grad_norm": 1.3757750926899586, "learning_rate": 9.626609727526973e-06, "loss": 0.2, "step": 796 }, { "epoch": 0.123950233281493, "grad_norm": 1.3462049704057701, "learning_rate": 9.62568286378919e-06, "loss": 0.2079, "step": 797 }, { "epoch": 0.12410575427682737, "grad_norm": 2.793319589376331, "learning_rate": 9.624754895836401e-06, "loss": 0.2297, "step": 798 }, { "epoch": 0.12426127527216174, "grad_norm": 2.1016347336310357, "learning_rate": 9.623825823890123e-06, "loss": 0.3106, "step": 799 }, { "epoch": 0.12441679626749612, "grad_norm": 1.003756031018623, "learning_rate": 9.622895648172141e-06, "loss": 0.2143, "step": 800 }, { "epoch": 0.12441679626749612, "eval_loss": 0.2170763909816742, "eval_runtime": 9.4305, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 800 }, { "epoch": 0.12457231726283048, "grad_norm": 0.897563337381756, "learning_rate": 9.621964368904497e-06, "loss": 0.1512, "step": 801 }, { "epoch": 0.12472783825816486, "grad_norm": 1.4190659163727315, "learning_rate": 9.621031986309504e-06, "loss": 0.1372, "step": 802 }, { "epoch": 0.12488335925349922, "grad_norm": 1.4031206175030444, "learning_rate": 9.620098500609734e-06, "loss": 0.1871, "step": 803 }, { "epoch": 0.12503888024883358, "grad_norm": 1.387547575925909, "learning_rate": 9.61916391202802e-06, "loss": 0.2899, "step": 804 }, { "epoch": 0.12519440124416797, "grad_norm": 1.3476031364192975, "learning_rate": 9.618228220787466e-06, "loss": 0.1693, "step": 805 }, { "epoch": 0.12534992223950234, "grad_norm": 2.5401419561208787, "learning_rate": 9.617291427111431e-06, "loss": 0.141, "step": 806 }, { "epoch": 0.1255054432348367, "grad_norm": 1.918003643731122, "learning_rate": 9.616353531223543e-06, "loss": 0.2531, "step": 807 }, { "epoch": 0.12566096423017106, "grad_norm": 0.8824574964250353, "learning_rate": 9.61541453334769e-06, "loss": 0.2257, "step": 808 }, { "epoch": 0.12581648522550545, "grad_norm": 1.2069677012195894, "learning_rate": 9.614474433708021e-06, "loss": 0.2012, "step": 809 }, { "epoch": 0.12597200622083982, "grad_norm": 0.8806254573901449, "learning_rate": 9.613533232528956e-06, "loss": 0.2312, "step": 810 }, { "epoch": 0.12612752721617418, "grad_norm": 0.9758926813848963, "learning_rate": 9.61259093003517e-06, "loss": 0.1623, "step": 811 }, { "epoch": 0.12628304821150854, "grad_norm": 1.601541464183247, "learning_rate": 9.611647526451603e-06, "loss": 0.2448, "step": 812 }, { "epoch": 0.12643856920684293, "grad_norm": 0.987236561765066, "learning_rate": 9.610703022003462e-06, "loss": 0.1833, "step": 813 }, { "epoch": 0.1265940902021773, "grad_norm": 1.1685078861500846, "learning_rate": 9.60975741691621e-06, "loss": 0.2708, "step": 814 }, { "epoch": 0.12674961119751166, "grad_norm": 1.2818789908746795, "learning_rate": 9.608810711415577e-06, "loss": 0.2132, "step": 815 }, { "epoch": 0.12690513219284602, "grad_norm": 1.7355503765107922, "learning_rate": 9.607862905727556e-06, "loss": 0.2316, "step": 816 }, { "epoch": 0.1270606531881804, "grad_norm": 2.4291900998321614, "learning_rate": 9.6069140000784e-06, "loss": 0.2607, "step": 817 }, { "epoch": 0.12721617418351477, "grad_norm": 1.2126882446943306, "learning_rate": 9.605963994694625e-06, "loss": 0.2374, "step": 818 }, { "epoch": 0.12737169517884914, "grad_norm": 1.402793253608196, "learning_rate": 9.605012889803013e-06, "loss": 0.1854, "step": 819 }, { "epoch": 0.12752721617418353, "grad_norm": 1.1350096409875572, "learning_rate": 9.604060685630608e-06, "loss": 0.2353, "step": 820 }, { "epoch": 0.1276827371695179, "grad_norm": 0.8605397955086846, "learning_rate": 9.603107382404708e-06, "loss": 0.1725, "step": 821 }, { "epoch": 0.12783825816485225, "grad_norm": 1.8193213761501528, "learning_rate": 9.602152980352884e-06, "loss": 0.191, "step": 822 }, { "epoch": 0.12799377916018662, "grad_norm": 1.0560479092155457, "learning_rate": 9.601197479702963e-06, "loss": 0.2129, "step": 823 }, { "epoch": 0.128149300155521, "grad_norm": 0.9886146739779551, "learning_rate": 9.60024088068304e-06, "loss": 0.1349, "step": 824 }, { "epoch": 0.12830482115085537, "grad_norm": 1.044208330213169, "learning_rate": 9.599283183521467e-06, "loss": 0.1611, "step": 825 }, { "epoch": 0.12846034214618973, "grad_norm": 1.105951942629371, "learning_rate": 9.598324388446856e-06, "loss": 0.25, "step": 826 }, { "epoch": 0.1286158631415241, "grad_norm": 1.2794645483672162, "learning_rate": 9.59736449568809e-06, "loss": 0.2132, "step": 827 }, { "epoch": 0.12877138413685849, "grad_norm": 1.3758053785309152, "learning_rate": 9.596403505474304e-06, "loss": 0.2149, "step": 828 }, { "epoch": 0.12892690513219285, "grad_norm": 3.355818230170184, "learning_rate": 9.595441418034903e-06, "loss": 0.3682, "step": 829 }, { "epoch": 0.1290824261275272, "grad_norm": 1.837073128336488, "learning_rate": 9.594478233599551e-06, "loss": 0.2032, "step": 830 }, { "epoch": 0.12923794712286157, "grad_norm": 1.5066969144898332, "learning_rate": 9.593513952398172e-06, "loss": 0.2378, "step": 831 }, { "epoch": 0.12939346811819596, "grad_norm": 2.2384679831338614, "learning_rate": 9.592548574660954e-06, "loss": 0.3073, "step": 832 }, { "epoch": 0.12954898911353033, "grad_norm": 0.9921790422628257, "learning_rate": 9.591582100618345e-06, "loss": 0.1937, "step": 833 }, { "epoch": 0.1297045101088647, "grad_norm": 1.198440591432804, "learning_rate": 9.590614530501057e-06, "loss": 0.1925, "step": 834 }, { "epoch": 0.12986003110419908, "grad_norm": 1.3748463927035848, "learning_rate": 9.589645864540061e-06, "loss": 0.1941, "step": 835 }, { "epoch": 0.13001555209953344, "grad_norm": 1.3610943196332044, "learning_rate": 9.588676102966593e-06, "loss": 0.166, "step": 836 }, { "epoch": 0.1301710730948678, "grad_norm": 0.8955532583487235, "learning_rate": 9.58770524601215e-06, "loss": 0.1495, "step": 837 }, { "epoch": 0.13032659409020217, "grad_norm": 1.285038495994977, "learning_rate": 9.586733293908486e-06, "loss": 0.2182, "step": 838 }, { "epoch": 0.13048211508553656, "grad_norm": 1.3128144306673817, "learning_rate": 9.585760246887618e-06, "loss": 0.2371, "step": 839 }, { "epoch": 0.13063763608087092, "grad_norm": 0.9827038114137296, "learning_rate": 9.584786105181831e-06, "loss": 0.2151, "step": 840 }, { "epoch": 0.13079315707620529, "grad_norm": 1.0846767572687748, "learning_rate": 9.583810869023663e-06, "loss": 0.2757, "step": 841 }, { "epoch": 0.13094867807153965, "grad_norm": 2.4064875629004265, "learning_rate": 9.582834538645917e-06, "loss": 0.2357, "step": 842 }, { "epoch": 0.13110419906687404, "grad_norm": 1.2894114641673238, "learning_rate": 9.581857114281656e-06, "loss": 0.1877, "step": 843 }, { "epoch": 0.1312597200622084, "grad_norm": 1.5574730662344252, "learning_rate": 9.580878596164207e-06, "loss": 0.1623, "step": 844 }, { "epoch": 0.13141524105754276, "grad_norm": 1.2710340775794473, "learning_rate": 9.579898984527154e-06, "loss": 0.187, "step": 845 }, { "epoch": 0.13157076205287713, "grad_norm": 1.4508001676942102, "learning_rate": 9.578918279604346e-06, "loss": 0.1372, "step": 846 }, { "epoch": 0.13172628304821152, "grad_norm": 1.0012618056091263, "learning_rate": 9.577936481629887e-06, "loss": 0.2201, "step": 847 }, { "epoch": 0.13188180404354588, "grad_norm": 0.8624233281967797, "learning_rate": 9.576953590838149e-06, "loss": 0.1979, "step": 848 }, { "epoch": 0.13203732503888024, "grad_norm": 1.083965076436999, "learning_rate": 9.57596960746376e-06, "loss": 0.2404, "step": 849 }, { "epoch": 0.1321928460342146, "grad_norm": 2.717551231092263, "learning_rate": 9.574984531741613e-06, "loss": 0.2745, "step": 850 }, { "epoch": 0.132348367029549, "grad_norm": 1.1408480485083061, "learning_rate": 9.573998363906858e-06, "loss": 0.207, "step": 851 }, { "epoch": 0.13250388802488336, "grad_norm": 1.6013848917828304, "learning_rate": 9.573011104194907e-06, "loss": 0.1826, "step": 852 }, { "epoch": 0.13265940902021772, "grad_norm": 1.1178529036140945, "learning_rate": 9.572022752841433e-06, "loss": 0.1676, "step": 853 }, { "epoch": 0.1328149300155521, "grad_norm": 1.4964605327939924, "learning_rate": 9.571033310082367e-06, "loss": 0.1929, "step": 854 }, { "epoch": 0.13297045101088648, "grad_norm": 1.1404147062516024, "learning_rate": 9.570042776153904e-06, "loss": 0.2274, "step": 855 }, { "epoch": 0.13312597200622084, "grad_norm": 1.037410347500119, "learning_rate": 9.5690511512925e-06, "loss": 0.1577, "step": 856 }, { "epoch": 0.1332814930015552, "grad_norm": 0.8366673014473697, "learning_rate": 9.56805843573487e-06, "loss": 0.1689, "step": 857 }, { "epoch": 0.1334370139968896, "grad_norm": 1.1452085152848681, "learning_rate": 9.567064629717986e-06, "loss": 0.1882, "step": 858 }, { "epoch": 0.13359253499222395, "grad_norm": 1.574854487100182, "learning_rate": 9.566069733479087e-06, "loss": 0.31, "step": 859 }, { "epoch": 0.13374805598755832, "grad_norm": 1.593208427145828, "learning_rate": 9.565073747255665e-06, "loss": 0.2198, "step": 860 }, { "epoch": 0.13390357698289268, "grad_norm": 1.1177720055491567, "learning_rate": 9.564076671285477e-06, "loss": 0.2164, "step": 861 }, { "epoch": 0.13405909797822707, "grad_norm": 0.8165999821951461, "learning_rate": 9.56307850580654e-06, "loss": 0.1506, "step": 862 }, { "epoch": 0.13421461897356143, "grad_norm": 1.6750367279986849, "learning_rate": 9.562079251057129e-06, "loss": 0.1732, "step": 863 }, { "epoch": 0.1343701399688958, "grad_norm": 0.8044448243559967, "learning_rate": 9.561078907275781e-06, "loss": 0.1922, "step": 864 }, { "epoch": 0.13452566096423016, "grad_norm": 1.271960150991974, "learning_rate": 9.56007747470129e-06, "loss": 0.2229, "step": 865 }, { "epoch": 0.13468118195956455, "grad_norm": 1.0004490456147865, "learning_rate": 9.559074953572713e-06, "loss": 0.171, "step": 866 }, { "epoch": 0.1348367029548989, "grad_norm": 1.312217862895249, "learning_rate": 9.558071344129368e-06, "loss": 0.1783, "step": 867 }, { "epoch": 0.13499222395023328, "grad_norm": 0.9356844106701133, "learning_rate": 9.557066646610826e-06, "loss": 0.1279, "step": 868 }, { "epoch": 0.13514774494556764, "grad_norm": 1.4966712904656105, "learning_rate": 9.556060861256928e-06, "loss": 0.1971, "step": 869 }, { "epoch": 0.13530326594090203, "grad_norm": 0.9157016732315058, "learning_rate": 9.555053988307764e-06, "loss": 0.1739, "step": 870 }, { "epoch": 0.1354587869362364, "grad_norm": 1.6187813697357434, "learning_rate": 9.554046028003691e-06, "loss": 0.2326, "step": 871 }, { "epoch": 0.13561430793157075, "grad_norm": 1.649258041134042, "learning_rate": 9.553036980585323e-06, "loss": 0.2775, "step": 872 }, { "epoch": 0.13576982892690515, "grad_norm": 0.8386386166459481, "learning_rate": 9.552026846293532e-06, "loss": 0.2225, "step": 873 }, { "epoch": 0.1359253499222395, "grad_norm": 0.96771492040488, "learning_rate": 9.551015625369455e-06, "loss": 0.1999, "step": 874 }, { "epoch": 0.13608087091757387, "grad_norm": 1.4939182411934322, "learning_rate": 9.550003318054482e-06, "loss": 0.2427, "step": 875 }, { "epoch": 0.13623639191290823, "grad_norm": 1.1599555983572944, "learning_rate": 9.548989924590263e-06, "loss": 0.2038, "step": 876 }, { "epoch": 0.13639191290824262, "grad_norm": 1.094972018927162, "learning_rate": 9.547975445218712e-06, "loss": 0.1477, "step": 877 }, { "epoch": 0.136547433903577, "grad_norm": 1.5378516224601575, "learning_rate": 9.546959880181998e-06, "loss": 0.2411, "step": 878 }, { "epoch": 0.13670295489891135, "grad_norm": 0.8702765312556789, "learning_rate": 9.545943229722553e-06, "loss": 0.1646, "step": 879 }, { "epoch": 0.1368584758942457, "grad_norm": 1.3664019719395564, "learning_rate": 9.544925494083062e-06, "loss": 0.1688, "step": 880 }, { "epoch": 0.1370139968895801, "grad_norm": 1.3206104649159593, "learning_rate": 9.543906673506474e-06, "loss": 0.1623, "step": 881 }, { "epoch": 0.13716951788491447, "grad_norm": 1.3156230503659714, "learning_rate": 9.542886768235996e-06, "loss": 0.2297, "step": 882 }, { "epoch": 0.13732503888024883, "grad_norm": 1.727680640232904, "learning_rate": 9.541865778515094e-06, "loss": 0.2824, "step": 883 }, { "epoch": 0.1374805598755832, "grad_norm": 1.3346266664784416, "learning_rate": 9.540843704587492e-06, "loss": 0.2533, "step": 884 }, { "epoch": 0.13763608087091758, "grad_norm": 1.663603312691407, "learning_rate": 9.539820546697175e-06, "loss": 0.1889, "step": 885 }, { "epoch": 0.13779160186625194, "grad_norm": 1.3931002570801638, "learning_rate": 9.53879630508838e-06, "loss": 0.2125, "step": 886 }, { "epoch": 0.1379471228615863, "grad_norm": 1.0312695868953268, "learning_rate": 9.537770980005616e-06, "loss": 0.157, "step": 887 }, { "epoch": 0.13810264385692067, "grad_norm": 1.291055270497525, "learning_rate": 9.536744571693634e-06, "loss": 0.1542, "step": 888 }, { "epoch": 0.13825816485225506, "grad_norm": 1.0586309772197517, "learning_rate": 9.535717080397458e-06, "loss": 0.1413, "step": 889 }, { "epoch": 0.13841368584758942, "grad_norm": 1.9142459890481243, "learning_rate": 9.53468850636236e-06, "loss": 0.2132, "step": 890 }, { "epoch": 0.1385692068429238, "grad_norm": 1.57785159694773, "learning_rate": 9.533658849833879e-06, "loss": 0.2704, "step": 891 }, { "epoch": 0.13872472783825818, "grad_norm": 0.6767899331815482, "learning_rate": 9.532628111057804e-06, "loss": 0.1994, "step": 892 }, { "epoch": 0.13888024883359254, "grad_norm": 0.7786068585931847, "learning_rate": 9.531596290280191e-06, "loss": 0.2215, "step": 893 }, { "epoch": 0.1390357698289269, "grad_norm": 1.1907351307303637, "learning_rate": 9.530563387747348e-06, "loss": 0.1597, "step": 894 }, { "epoch": 0.13919129082426127, "grad_norm": 0.994862972128769, "learning_rate": 9.529529403705844e-06, "loss": 0.2586, "step": 895 }, { "epoch": 0.13934681181959566, "grad_norm": 0.9549652766512168, "learning_rate": 9.528494338402502e-06, "loss": 0.1332, "step": 896 }, { "epoch": 0.13950233281493002, "grad_norm": 1.1799329518454007, "learning_rate": 9.527458192084413e-06, "loss": 0.1884, "step": 897 }, { "epoch": 0.13965785381026438, "grad_norm": 0.7863314952979764, "learning_rate": 9.526420964998915e-06, "loss": 0.1679, "step": 898 }, { "epoch": 0.13981337480559874, "grad_norm": 0.937917950726602, "learning_rate": 9.52538265739361e-06, "loss": 0.2024, "step": 899 }, { "epoch": 0.13996889580093314, "grad_norm": 1.7160775693106616, "learning_rate": 9.524343269516354e-06, "loss": 0.2127, "step": 900 }, { "epoch": 0.13996889580093314, "eval_loss": 0.21867091953754425, "eval_runtime": 9.4128, "eval_samples_per_second": 2.762, "eval_steps_per_second": 0.744, "step": 900 }, { "epoch": 0.1401244167962675, "grad_norm": 1.4496209630087886, "learning_rate": 9.523302801615266e-06, "loss": 0.2026, "step": 901 }, { "epoch": 0.14027993779160186, "grad_norm": 0.9035504049737524, "learning_rate": 9.522261253938721e-06, "loss": 0.237, "step": 902 }, { "epoch": 0.14043545878693622, "grad_norm": 1.0344016899215176, "learning_rate": 9.521218626735347e-06, "loss": 0.2079, "step": 903 }, { "epoch": 0.14059097978227061, "grad_norm": 0.8764502702407341, "learning_rate": 9.52017492025404e-06, "loss": 0.1512, "step": 904 }, { "epoch": 0.14074650077760498, "grad_norm": 0.78362955023232, "learning_rate": 9.519130134743938e-06, "loss": 0.1544, "step": 905 }, { "epoch": 0.14090202177293934, "grad_norm": 1.331879071297993, "learning_rate": 9.518084270454456e-06, "loss": 0.208, "step": 906 }, { "epoch": 0.14105754276827373, "grad_norm": 1.0576721252655992, "learning_rate": 9.51703732763525e-06, "loss": 0.1777, "step": 907 }, { "epoch": 0.1412130637636081, "grad_norm": 0.9777650095779323, "learning_rate": 9.515989306536241e-06, "loss": 0.2431, "step": 908 }, { "epoch": 0.14136858475894246, "grad_norm": 1.2351460184737522, "learning_rate": 9.514940207407608e-06, "loss": 0.164, "step": 909 }, { "epoch": 0.14152410575427682, "grad_norm": 1.0466682687606328, "learning_rate": 9.513890030499786e-06, "loss": 0.1862, "step": 910 }, { "epoch": 0.1416796267496112, "grad_norm": 1.667573553968496, "learning_rate": 9.512838776063464e-06, "loss": 0.1881, "step": 911 }, { "epoch": 0.14183514774494557, "grad_norm": 1.0309274313381354, "learning_rate": 9.51178644434959e-06, "loss": 0.1894, "step": 912 }, { "epoch": 0.14199066874027994, "grad_norm": 1.1516030880613233, "learning_rate": 9.510733035609376e-06, "loss": 0.1906, "step": 913 }, { "epoch": 0.1421461897356143, "grad_norm": 1.1964374362259393, "learning_rate": 9.509678550094282e-06, "loss": 0.2193, "step": 914 }, { "epoch": 0.1423017107309487, "grad_norm": 1.018131456622998, "learning_rate": 9.508622988056026e-06, "loss": 0.18, "step": 915 }, { "epoch": 0.14245723172628305, "grad_norm": 0.9878879365994556, "learning_rate": 9.50756634974659e-06, "loss": 0.2303, "step": 916 }, { "epoch": 0.14261275272161741, "grad_norm": 0.9092163587106824, "learning_rate": 9.506508635418203e-06, "loss": 0.1565, "step": 917 }, { "epoch": 0.14276827371695178, "grad_norm": 1.2600000274625656, "learning_rate": 9.505449845323362e-06, "loss": 0.2203, "step": 918 }, { "epoch": 0.14292379471228617, "grad_norm": 1.0177653430547444, "learning_rate": 9.504389979714812e-06, "loss": 0.1708, "step": 919 }, { "epoch": 0.14307931570762053, "grad_norm": 1.323141251202386, "learning_rate": 9.503329038845556e-06, "loss": 0.2041, "step": 920 }, { "epoch": 0.1432348367029549, "grad_norm": 0.8666613786933973, "learning_rate": 9.50226702296886e-06, "loss": 0.1709, "step": 921 }, { "epoch": 0.14339035769828926, "grad_norm": 1.4717207003269144, "learning_rate": 9.501203932338238e-06, "loss": 0.1531, "step": 922 }, { "epoch": 0.14354587869362365, "grad_norm": 0.9850527774643847, "learning_rate": 9.500139767207465e-06, "loss": 0.2673, "step": 923 }, { "epoch": 0.143701399688958, "grad_norm": 0.795383661376322, "learning_rate": 9.499074527830576e-06, "loss": 0.1514, "step": 924 }, { "epoch": 0.14385692068429237, "grad_norm": 1.5926732733378721, "learning_rate": 9.498008214461854e-06, "loss": 0.1919, "step": 925 }, { "epoch": 0.14401244167962676, "grad_norm": 1.0577956165619293, "learning_rate": 9.496940827355843e-06, "loss": 0.2541, "step": 926 }, { "epoch": 0.14416796267496113, "grad_norm": 1.0853608193427453, "learning_rate": 9.495872366767345e-06, "loss": 0.3026, "step": 927 }, { "epoch": 0.1443234836702955, "grad_norm": 1.5841584604687593, "learning_rate": 9.494802832951416e-06, "loss": 0.237, "step": 928 }, { "epoch": 0.14447900466562985, "grad_norm": 1.2668912692543315, "learning_rate": 9.493732226163368e-06, "loss": 0.1962, "step": 929 }, { "epoch": 0.14463452566096424, "grad_norm": 1.1865934879383473, "learning_rate": 9.492660546658771e-06, "loss": 0.205, "step": 930 }, { "epoch": 0.1447900466562986, "grad_norm": 1.16907334182334, "learning_rate": 9.491587794693448e-06, "loss": 0.1649, "step": 931 }, { "epoch": 0.14494556765163297, "grad_norm": 2.6694118671679035, "learning_rate": 9.490513970523482e-06, "loss": 0.1716, "step": 932 }, { "epoch": 0.14510108864696733, "grad_norm": 1.2693916754547256, "learning_rate": 9.489439074405211e-06, "loss": 0.2102, "step": 933 }, { "epoch": 0.14525660964230172, "grad_norm": 1.4815910522621762, "learning_rate": 9.488363106595223e-06, "loss": 0.2146, "step": 934 }, { "epoch": 0.14541213063763608, "grad_norm": 1.5330200808441012, "learning_rate": 9.48728606735037e-06, "loss": 0.1767, "step": 935 }, { "epoch": 0.14556765163297045, "grad_norm": 1.123567228978502, "learning_rate": 9.486207956927756e-06, "loss": 0.1864, "step": 936 }, { "epoch": 0.1457231726283048, "grad_norm": 0.9960966752159592, "learning_rate": 9.485128775584737e-06, "loss": 0.2118, "step": 937 }, { "epoch": 0.1458786936236392, "grad_norm": 1.2303193618017887, "learning_rate": 9.484048523578934e-06, "loss": 0.2106, "step": 938 }, { "epoch": 0.14603421461897356, "grad_norm": 1.2867421133114936, "learning_rate": 9.482967201168218e-06, "loss": 0.2252, "step": 939 }, { "epoch": 0.14618973561430793, "grad_norm": 1.3372951799730566, "learning_rate": 9.481884808610712e-06, "loss": 0.2662, "step": 940 }, { "epoch": 0.1463452566096423, "grad_norm": 0.6808561025624517, "learning_rate": 9.4808013461648e-06, "loss": 0.1613, "step": 941 }, { "epoch": 0.14650077760497668, "grad_norm": 1.0617639952793092, "learning_rate": 9.479716814089119e-06, "loss": 0.22, "step": 942 }, { "epoch": 0.14665629860031104, "grad_norm": 1.2088515247514138, "learning_rate": 9.478631212642565e-06, "loss": 0.2027, "step": 943 }, { "epoch": 0.1468118195956454, "grad_norm": 0.9673478217504623, "learning_rate": 9.477544542084283e-06, "loss": 0.2291, "step": 944 }, { "epoch": 0.1469673405909798, "grad_norm": 1.3295783157520016, "learning_rate": 9.476456802673677e-06, "loss": 0.2153, "step": 945 }, { "epoch": 0.14712286158631416, "grad_norm": 1.1001160858062626, "learning_rate": 9.475367994670406e-06, "loss": 0.2195, "step": 946 }, { "epoch": 0.14727838258164852, "grad_norm": 1.291866801296516, "learning_rate": 9.474278118334382e-06, "loss": 0.2213, "step": 947 }, { "epoch": 0.14743390357698288, "grad_norm": 1.2674302718543788, "learning_rate": 9.473187173925777e-06, "loss": 0.1371, "step": 948 }, { "epoch": 0.14758942457231727, "grad_norm": 1.4168689609608738, "learning_rate": 9.472095161705014e-06, "loss": 0.1902, "step": 949 }, { "epoch": 0.14774494556765164, "grad_norm": 1.0439332293475743, "learning_rate": 9.471002081932767e-06, "loss": 0.2069, "step": 950 }, { "epoch": 0.147900466562986, "grad_norm": 1.346490441102045, "learning_rate": 9.469907934869974e-06, "loss": 0.1982, "step": 951 }, { "epoch": 0.14805598755832036, "grad_norm": 1.1817129831636979, "learning_rate": 9.468812720777822e-06, "loss": 0.1626, "step": 952 }, { "epoch": 0.14821150855365475, "grad_norm": 0.846186520557803, "learning_rate": 9.467716439917753e-06, "loss": 0.1659, "step": 953 }, { "epoch": 0.14836702954898912, "grad_norm": 1.77057726290962, "learning_rate": 9.466619092551467e-06, "loss": 0.1571, "step": 954 }, { "epoch": 0.14852255054432348, "grad_norm": 1.503606666530362, "learning_rate": 9.465520678940913e-06, "loss": 0.2317, "step": 955 }, { "epoch": 0.14867807153965784, "grad_norm": 1.2988561500793663, "learning_rate": 9.4644211993483e-06, "loss": 0.184, "step": 956 }, { "epoch": 0.14883359253499223, "grad_norm": 0.9494708116205622, "learning_rate": 9.463320654036088e-06, "loss": 0.2061, "step": 957 }, { "epoch": 0.1489891135303266, "grad_norm": 1.1960711999747602, "learning_rate": 9.462219043266993e-06, "loss": 0.1595, "step": 958 }, { "epoch": 0.14914463452566096, "grad_norm": 1.456286481771, "learning_rate": 9.461116367303985e-06, "loss": 0.1803, "step": 959 }, { "epoch": 0.14930015552099535, "grad_norm": 2.193608162058263, "learning_rate": 9.460012626410286e-06, "loss": 0.2372, "step": 960 }, { "epoch": 0.1494556765163297, "grad_norm": 1.1257027932111565, "learning_rate": 9.458907820849378e-06, "loss": 0.2183, "step": 961 }, { "epoch": 0.14961119751166407, "grad_norm": 1.2699403552308035, "learning_rate": 9.457801950884991e-06, "loss": 0.2112, "step": 962 }, { "epoch": 0.14976671850699844, "grad_norm": 2.0211225561288986, "learning_rate": 9.456695016781112e-06, "loss": 0.3771, "step": 963 }, { "epoch": 0.14992223950233283, "grad_norm": 1.6233952494139523, "learning_rate": 9.455587018801979e-06, "loss": 0.1654, "step": 964 }, { "epoch": 0.1500777604976672, "grad_norm": 0.9536635356305013, "learning_rate": 9.454477957212092e-06, "loss": 0.1971, "step": 965 }, { "epoch": 0.15023328149300155, "grad_norm": 1.2024688455270478, "learning_rate": 9.453367832276196e-06, "loss": 0.2073, "step": 966 }, { "epoch": 0.15038880248833592, "grad_norm": 1.0163258023024337, "learning_rate": 9.452256644259296e-06, "loss": 0.1622, "step": 967 }, { "epoch": 0.1505443234836703, "grad_norm": 1.4838973791587633, "learning_rate": 9.451144393426643e-06, "loss": 0.2058, "step": 968 }, { "epoch": 0.15069984447900467, "grad_norm": 1.0443777554962437, "learning_rate": 9.450031080043752e-06, "loss": 0.165, "step": 969 }, { "epoch": 0.15085536547433903, "grad_norm": 1.1175170370729908, "learning_rate": 9.448916704376384e-06, "loss": 0.1419, "step": 970 }, { "epoch": 0.1510108864696734, "grad_norm": 1.2857861611804626, "learning_rate": 9.447801266690557e-06, "loss": 0.2171, "step": 971 }, { "epoch": 0.15116640746500778, "grad_norm": 0.7407729973632995, "learning_rate": 9.446684767252539e-06, "loss": 0.1714, "step": 972 }, { "epoch": 0.15132192846034215, "grad_norm": 2.195989894115042, "learning_rate": 9.445567206328857e-06, "loss": 0.1989, "step": 973 }, { "epoch": 0.1514774494556765, "grad_norm": 0.989971668490221, "learning_rate": 9.444448584186288e-06, "loss": 0.1664, "step": 974 }, { "epoch": 0.15163297045101087, "grad_norm": 1.081538706581427, "learning_rate": 9.44332890109186e-06, "loss": 0.2066, "step": 975 }, { "epoch": 0.15178849144634526, "grad_norm": 1.4377035491264887, "learning_rate": 9.442208157312859e-06, "loss": 0.2057, "step": 976 }, { "epoch": 0.15194401244167963, "grad_norm": 1.5898783963503191, "learning_rate": 9.441086353116825e-06, "loss": 0.1665, "step": 977 }, { "epoch": 0.152099533437014, "grad_norm": 0.899579074969373, "learning_rate": 9.439963488771543e-06, "loss": 0.2091, "step": 978 }, { "epoch": 0.15225505443234838, "grad_norm": 1.4218933674345213, "learning_rate": 9.438839564545059e-06, "loss": 0.2344, "step": 979 }, { "epoch": 0.15241057542768274, "grad_norm": 1.2490316562718224, "learning_rate": 9.437714580705671e-06, "loss": 0.1771, "step": 980 }, { "epoch": 0.1525660964230171, "grad_norm": 1.3535600594171835, "learning_rate": 9.436588537521925e-06, "loss": 0.2402, "step": 981 }, { "epoch": 0.15272161741835147, "grad_norm": 1.2653882449622933, "learning_rate": 9.435461435262623e-06, "loss": 0.2368, "step": 982 }, { "epoch": 0.15287713841368586, "grad_norm": 1.4171554003791706, "learning_rate": 9.434333274196822e-06, "loss": 0.16, "step": 983 }, { "epoch": 0.15303265940902022, "grad_norm": 0.9372171947174371, "learning_rate": 9.433204054593832e-06, "loss": 0.1464, "step": 984 }, { "epoch": 0.15318818040435458, "grad_norm": 0.9807519101904891, "learning_rate": 9.43207377672321e-06, "loss": 0.1743, "step": 985 }, { "epoch": 0.15334370139968895, "grad_norm": 1.9830197584350164, "learning_rate": 9.430942440854772e-06, "loss": 0.2979, "step": 986 }, { "epoch": 0.15349922239502334, "grad_norm": 1.013327149062581, "learning_rate": 9.429810047258578e-06, "loss": 0.2257, "step": 987 }, { "epoch": 0.1536547433903577, "grad_norm": 1.3644569563063227, "learning_rate": 9.428676596204953e-06, "loss": 0.227, "step": 988 }, { "epoch": 0.15381026438569206, "grad_norm": 1.2971192291816034, "learning_rate": 9.427542087964462e-06, "loss": 0.2012, "step": 989 }, { "epoch": 0.15396578538102643, "grad_norm": 1.063681975107411, "learning_rate": 9.426406522807932e-06, "loss": 0.2299, "step": 990 }, { "epoch": 0.15412130637636082, "grad_norm": 1.0390353297783406, "learning_rate": 9.425269901006435e-06, "loss": 0.1438, "step": 991 }, { "epoch": 0.15427682737169518, "grad_norm": 1.821321152512482, "learning_rate": 9.424132222831301e-06, "loss": 0.1797, "step": 992 }, { "epoch": 0.15443234836702954, "grad_norm": 1.0266940584964872, "learning_rate": 9.422993488554108e-06, "loss": 0.1524, "step": 993 }, { "epoch": 0.1545878693623639, "grad_norm": 1.2357982408354415, "learning_rate": 9.42185369844669e-06, "loss": 0.1765, "step": 994 }, { "epoch": 0.1547433903576983, "grad_norm": 1.3007180654461126, "learning_rate": 9.420712852781129e-06, "loss": 0.2278, "step": 995 }, { "epoch": 0.15489891135303266, "grad_norm": 1.3519816843089092, "learning_rate": 9.419570951829761e-06, "loss": 0.2261, "step": 996 }, { "epoch": 0.15505443234836702, "grad_norm": 0.814621189176537, "learning_rate": 9.418427995865174e-06, "loss": 0.2172, "step": 997 }, { "epoch": 0.1552099533437014, "grad_norm": 1.7543842879443927, "learning_rate": 9.417283985160206e-06, "loss": 0.2164, "step": 998 }, { "epoch": 0.15536547433903578, "grad_norm": 0.8276231350286671, "learning_rate": 9.41613891998795e-06, "loss": 0.1975, "step": 999 }, { "epoch": 0.15552099533437014, "grad_norm": 1.1550898822511304, "learning_rate": 9.414992800621749e-06, "loss": 0.1501, "step": 1000 }, { "epoch": 0.15552099533437014, "eval_loss": 0.21367190778255463, "eval_runtime": 9.4284, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.742, "step": 1000 }, { "epoch": 0.1556765163297045, "grad_norm": 1.6764153048318766, "learning_rate": 9.413845627335197e-06, "loss": 0.2071, "step": 1001 }, { "epoch": 0.1558320373250389, "grad_norm": 1.1886246410449919, "learning_rate": 9.41269740040214e-06, "loss": 0.1956, "step": 1002 }, { "epoch": 0.15598755832037325, "grad_norm": 1.0793500722611682, "learning_rate": 9.411548120096676e-06, "loss": 0.144, "step": 1003 }, { "epoch": 0.15614307931570762, "grad_norm": 1.2449924636096124, "learning_rate": 9.410397786693157e-06, "loss": 0.2734, "step": 1004 }, { "epoch": 0.15629860031104198, "grad_norm": 0.8611732851449306, "learning_rate": 9.409246400466178e-06, "loss": 0.1923, "step": 1005 }, { "epoch": 0.15645412130637637, "grad_norm": 6.74577569453225, "learning_rate": 9.408093961690596e-06, "loss": 0.1956, "step": 1006 }, { "epoch": 0.15660964230171073, "grad_norm": 1.2060004741533563, "learning_rate": 9.406940470641512e-06, "loss": 0.2739, "step": 1007 }, { "epoch": 0.1567651632970451, "grad_norm": 1.6202727992084955, "learning_rate": 9.405785927594281e-06, "loss": 0.3171, "step": 1008 }, { "epoch": 0.15692068429237946, "grad_norm": 2.0124632761977534, "learning_rate": 9.404630332824509e-06, "loss": 0.2104, "step": 1009 }, { "epoch": 0.15707620528771385, "grad_norm": 2.0142886633624286, "learning_rate": 9.40347368660805e-06, "loss": 0.2548, "step": 1010 }, { "epoch": 0.1572317262830482, "grad_norm": 1.3434989581281018, "learning_rate": 9.402315989221013e-06, "loss": 0.2411, "step": 1011 }, { "epoch": 0.15738724727838257, "grad_norm": 1.3315974814677487, "learning_rate": 9.40115724093976e-06, "loss": 0.2839, "step": 1012 }, { "epoch": 0.15754276827371697, "grad_norm": 1.1186058721777734, "learning_rate": 9.399997442040894e-06, "loss": 0.167, "step": 1013 }, { "epoch": 0.15769828926905133, "grad_norm": 1.4492217703231243, "learning_rate": 9.39883659280128e-06, "loss": 0.1268, "step": 1014 }, { "epoch": 0.1578538102643857, "grad_norm": 1.257425749091041, "learning_rate": 9.39767469349803e-06, "loss": 0.1433, "step": 1015 }, { "epoch": 0.15800933125972005, "grad_norm": 1.7996939549666984, "learning_rate": 9.396511744408498e-06, "loss": 0.2012, "step": 1016 }, { "epoch": 0.15816485225505444, "grad_norm": 0.8429015986655448, "learning_rate": 9.395347745810304e-06, "loss": 0.1935, "step": 1017 }, { "epoch": 0.1583203732503888, "grad_norm": 1.370521795316769, "learning_rate": 9.394182697981306e-06, "loss": 0.2183, "step": 1018 }, { "epoch": 0.15847589424572317, "grad_norm": 1.622770939923456, "learning_rate": 9.393016601199622e-06, "loss": 0.1593, "step": 1019 }, { "epoch": 0.15863141524105753, "grad_norm": 1.011909638401176, "learning_rate": 9.39184945574361e-06, "loss": 0.2053, "step": 1020 }, { "epoch": 0.15878693623639192, "grad_norm": 1.6110438711648936, "learning_rate": 9.390681261891887e-06, "loss": 0.222, "step": 1021 }, { "epoch": 0.1589424572317263, "grad_norm": 1.4859951673056488, "learning_rate": 9.389512019923318e-06, "loss": 0.231, "step": 1022 }, { "epoch": 0.15909797822706065, "grad_norm": 1.166598629738374, "learning_rate": 9.388341730117015e-06, "loss": 0.1917, "step": 1023 }, { "epoch": 0.159253499222395, "grad_norm": 1.0987845208229972, "learning_rate": 9.387170392752342e-06, "loss": 0.184, "step": 1024 }, { "epoch": 0.1594090202177294, "grad_norm": 1.5795930559063704, "learning_rate": 9.385998008108917e-06, "loss": 0.2097, "step": 1025 }, { "epoch": 0.15956454121306377, "grad_norm": 1.4302193933514027, "learning_rate": 9.384824576466601e-06, "loss": 0.2194, "step": 1026 }, { "epoch": 0.15972006220839813, "grad_norm": 0.9372034033824603, "learning_rate": 9.383650098105512e-06, "loss": 0.243, "step": 1027 }, { "epoch": 0.1598755832037325, "grad_norm": 1.0038945695499553, "learning_rate": 9.382474573306011e-06, "loss": 0.1861, "step": 1028 }, { "epoch": 0.16003110419906688, "grad_norm": 0.9989868346004813, "learning_rate": 9.381298002348713e-06, "loss": 0.2324, "step": 1029 }, { "epoch": 0.16018662519440124, "grad_norm": 1.4240189031581216, "learning_rate": 9.380120385514484e-06, "loss": 0.1974, "step": 1030 }, { "epoch": 0.1603421461897356, "grad_norm": 1.378754367931683, "learning_rate": 9.378941723084436e-06, "loss": 0.245, "step": 1031 }, { "epoch": 0.16049766718507, "grad_norm": 1.8715129600892846, "learning_rate": 9.37776201533993e-06, "loss": 0.3174, "step": 1032 }, { "epoch": 0.16065318818040436, "grad_norm": 1.1921962243878195, "learning_rate": 9.376581262562584e-06, "loss": 0.1917, "step": 1033 }, { "epoch": 0.16080870917573872, "grad_norm": 1.2635206395103649, "learning_rate": 9.375399465034257e-06, "loss": 0.1878, "step": 1034 }, { "epoch": 0.16096423017107309, "grad_norm": 1.2398545424205532, "learning_rate": 9.374216623037057e-06, "loss": 0.2344, "step": 1035 }, { "epoch": 0.16111975116640748, "grad_norm": 0.9462934166321078, "learning_rate": 9.373032736853352e-06, "loss": 0.187, "step": 1036 }, { "epoch": 0.16127527216174184, "grad_norm": 1.5590735847268282, "learning_rate": 9.371847806765749e-06, "loss": 0.2097, "step": 1037 }, { "epoch": 0.1614307931570762, "grad_norm": 1.160888284446341, "learning_rate": 9.370661833057103e-06, "loss": 0.1506, "step": 1038 }, { "epoch": 0.16158631415241057, "grad_norm": 1.1778543046473768, "learning_rate": 9.36947481601053e-06, "loss": 0.1716, "step": 1039 }, { "epoch": 0.16174183514774496, "grad_norm": 1.4532605779910739, "learning_rate": 9.368286755909383e-06, "loss": 0.182, "step": 1040 }, { "epoch": 0.16189735614307932, "grad_norm": 0.9502972420425978, "learning_rate": 9.36709765303727e-06, "loss": 0.2161, "step": 1041 }, { "epoch": 0.16205287713841368, "grad_norm": 1.4588748874097772, "learning_rate": 9.365907507678045e-06, "loss": 0.2338, "step": 1042 }, { "epoch": 0.16220839813374804, "grad_norm": 1.4225573142040282, "learning_rate": 9.364716320115813e-06, "loss": 0.1781, "step": 1043 }, { "epoch": 0.16236391912908243, "grad_norm": 1.029996429205044, "learning_rate": 9.363524090634928e-06, "loss": 0.2257, "step": 1044 }, { "epoch": 0.1625194401244168, "grad_norm": 1.379085736135871, "learning_rate": 9.362330819519991e-06, "loss": 0.2186, "step": 1045 }, { "epoch": 0.16267496111975116, "grad_norm": 1.2962827183429935, "learning_rate": 9.361136507055853e-06, "loss": 0.1916, "step": 1046 }, { "epoch": 0.16283048211508552, "grad_norm": 0.9451500150098339, "learning_rate": 9.359941153527612e-06, "loss": 0.1859, "step": 1047 }, { "epoch": 0.1629860031104199, "grad_norm": 1.0944328685975881, "learning_rate": 9.358744759220614e-06, "loss": 0.2225, "step": 1048 }, { "epoch": 0.16314152410575428, "grad_norm": 1.1266179070522002, "learning_rate": 9.357547324420461e-06, "loss": 0.2039, "step": 1049 }, { "epoch": 0.16329704510108864, "grad_norm": 1.26823288307141, "learning_rate": 9.356348849412991e-06, "loss": 0.2686, "step": 1050 }, { "epoch": 0.16345256609642303, "grad_norm": 1.3783372129870655, "learning_rate": 9.355149334484302e-06, "loss": 0.2715, "step": 1051 }, { "epoch": 0.1636080870917574, "grad_norm": 0.950454440753535, "learning_rate": 9.35394877992073e-06, "loss": 0.1697, "step": 1052 }, { "epoch": 0.16376360808709176, "grad_norm": 2.4437577046740895, "learning_rate": 9.352747186008865e-06, "loss": 0.2087, "step": 1053 }, { "epoch": 0.16391912908242612, "grad_norm": 1.4140943006046114, "learning_rate": 9.351544553035547e-06, "loss": 0.2063, "step": 1054 }, { "epoch": 0.1640746500777605, "grad_norm": 0.967217619359645, "learning_rate": 9.350340881287861e-06, "loss": 0.2008, "step": 1055 }, { "epoch": 0.16423017107309487, "grad_norm": 1.4590565286071695, "learning_rate": 9.349136171053139e-06, "loss": 0.1897, "step": 1056 }, { "epoch": 0.16438569206842923, "grad_norm": 1.0794053199949247, "learning_rate": 9.34793042261896e-06, "loss": 0.1037, "step": 1057 }, { "epoch": 0.1645412130637636, "grad_norm": 1.15272662266887, "learning_rate": 9.346723636273157e-06, "loss": 0.239, "step": 1058 }, { "epoch": 0.164696734059098, "grad_norm": 1.3755496055051248, "learning_rate": 9.345515812303802e-06, "loss": 0.2655, "step": 1059 }, { "epoch": 0.16485225505443235, "grad_norm": 1.1623669619389423, "learning_rate": 9.344306950999226e-06, "loss": 0.2254, "step": 1060 }, { "epoch": 0.1650077760497667, "grad_norm": 1.1373510201117636, "learning_rate": 9.343097052647996e-06, "loss": 0.2515, "step": 1061 }, { "epoch": 0.16516329704510108, "grad_norm": 1.349812652007435, "learning_rate": 9.341886117538931e-06, "loss": 0.2367, "step": 1062 }, { "epoch": 0.16531881804043547, "grad_norm": 1.0436524504014346, "learning_rate": 9.340674145961101e-06, "loss": 0.1552, "step": 1063 }, { "epoch": 0.16547433903576983, "grad_norm": 1.3297059840324263, "learning_rate": 9.339461138203821e-06, "loss": 0.2201, "step": 1064 }, { "epoch": 0.1656298600311042, "grad_norm": 1.7541537167845238, "learning_rate": 9.338247094556651e-06, "loss": 0.2076, "step": 1065 }, { "epoch": 0.16578538102643858, "grad_norm": 1.442252163275357, "learning_rate": 9.3370320153094e-06, "loss": 0.1753, "step": 1066 }, { "epoch": 0.16594090202177295, "grad_norm": 1.143025605577321, "learning_rate": 9.335815900752125e-06, "loss": 0.2217, "step": 1067 }, { "epoch": 0.1660964230171073, "grad_norm": 1.178025675869792, "learning_rate": 9.33459875117513e-06, "loss": 0.1621, "step": 1068 }, { "epoch": 0.16625194401244167, "grad_norm": 0.8859479026343935, "learning_rate": 9.333380566868963e-06, "loss": 0.2214, "step": 1069 }, { "epoch": 0.16640746500777606, "grad_norm": 1.1580516447127225, "learning_rate": 9.332161348124426e-06, "loss": 0.2104, "step": 1070 }, { "epoch": 0.16656298600311042, "grad_norm": 0.9322363288405592, "learning_rate": 9.33094109523256e-06, "loss": 0.1524, "step": 1071 }, { "epoch": 0.1667185069984448, "grad_norm": 1.2071920671355123, "learning_rate": 9.32971980848466e-06, "loss": 0.2204, "step": 1072 }, { "epoch": 0.16687402799377915, "grad_norm": 1.4321090820471434, "learning_rate": 9.328497488172256e-06, "loss": 0.2185, "step": 1073 }, { "epoch": 0.16702954898911354, "grad_norm": 1.5323210185604608, "learning_rate": 9.327274134587144e-06, "loss": 0.1967, "step": 1074 }, { "epoch": 0.1671850699844479, "grad_norm": 1.2827697157454871, "learning_rate": 9.326049748021348e-06, "loss": 0.1835, "step": 1075 }, { "epoch": 0.16734059097978227, "grad_norm": 0.9598851088099357, "learning_rate": 9.324824328767148e-06, "loss": 0.1524, "step": 1076 }, { "epoch": 0.16749611197511663, "grad_norm": 1.1012363230038584, "learning_rate": 9.323597877117069e-06, "loss": 0.1934, "step": 1077 }, { "epoch": 0.16765163297045102, "grad_norm": 1.7979943018863753, "learning_rate": 9.322370393363881e-06, "loss": 0.2809, "step": 1078 }, { "epoch": 0.16780715396578538, "grad_norm": 0.9525483556320685, "learning_rate": 9.321141877800604e-06, "loss": 0.1544, "step": 1079 }, { "epoch": 0.16796267496111975, "grad_norm": 1.1079754408286966, "learning_rate": 9.319912330720502e-06, "loss": 0.1939, "step": 1080 }, { "epoch": 0.1681181959564541, "grad_norm": 1.4615045454023567, "learning_rate": 9.31868175241708e-06, "loss": 0.1879, "step": 1081 }, { "epoch": 0.1682737169517885, "grad_norm": 0.9677318917431114, "learning_rate": 9.3174501431841e-06, "loss": 0.1572, "step": 1082 }, { "epoch": 0.16842923794712286, "grad_norm": 1.1156223371393144, "learning_rate": 9.316217503315562e-06, "loss": 0.2477, "step": 1083 }, { "epoch": 0.16858475894245722, "grad_norm": 0.9283556985369971, "learning_rate": 9.314983833105713e-06, "loss": 0.1855, "step": 1084 }, { "epoch": 0.16874027993779162, "grad_norm": 0.9107625137180413, "learning_rate": 9.313749132849048e-06, "loss": 0.1941, "step": 1085 }, { "epoch": 0.16889580093312598, "grad_norm": 1.1200752990922627, "learning_rate": 9.312513402840308e-06, "loss": 0.1714, "step": 1086 }, { "epoch": 0.16905132192846034, "grad_norm": 1.5919484746453285, "learning_rate": 9.311276643374478e-06, "loss": 0.1907, "step": 1087 }, { "epoch": 0.1692068429237947, "grad_norm": 1.6737891841333687, "learning_rate": 9.310038854746793e-06, "loss": 0.3096, "step": 1088 }, { "epoch": 0.1693623639191291, "grad_norm": 0.9356610939198378, "learning_rate": 9.308800037252726e-06, "loss": 0.215, "step": 1089 }, { "epoch": 0.16951788491446346, "grad_norm": 0.9978911792591384, "learning_rate": 9.307560191188e-06, "loss": 0.2023, "step": 1090 }, { "epoch": 0.16967340590979782, "grad_norm": 0.8618605808228078, "learning_rate": 9.30631931684859e-06, "loss": 0.1835, "step": 1091 }, { "epoch": 0.16982892690513218, "grad_norm": 1.073899023320524, "learning_rate": 9.305077414530701e-06, "loss": 0.2856, "step": 1092 }, { "epoch": 0.16998444790046657, "grad_norm": 1.390799646940327, "learning_rate": 9.303834484530798e-06, "loss": 0.1768, "step": 1093 }, { "epoch": 0.17013996889580094, "grad_norm": 1.1517992631531213, "learning_rate": 9.302590527145585e-06, "loss": 0.1661, "step": 1094 }, { "epoch": 0.1702954898911353, "grad_norm": 1.0942354595322217, "learning_rate": 9.301345542672012e-06, "loss": 0.2161, "step": 1095 }, { "epoch": 0.17045101088646966, "grad_norm": 0.8079291053355052, "learning_rate": 9.300099531407273e-06, "loss": 0.1768, "step": 1096 }, { "epoch": 0.17060653188180405, "grad_norm": 0.8090971826904667, "learning_rate": 9.298852493648808e-06, "loss": 0.1761, "step": 1097 }, { "epoch": 0.17076205287713841, "grad_norm": 1.2570428694136606, "learning_rate": 9.297604429694305e-06, "loss": 0.1742, "step": 1098 }, { "epoch": 0.17091757387247278, "grad_norm": 1.4714283316352859, "learning_rate": 9.296355339841692e-06, "loss": 0.2716, "step": 1099 }, { "epoch": 0.17107309486780714, "grad_norm": 1.07865700806752, "learning_rate": 9.295105224389144e-06, "loss": 0.1507, "step": 1100 }, { "epoch": 0.17107309486780714, "eval_loss": 0.21004652976989746, "eval_runtime": 9.4236, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 1100 }, { "epoch": 0.17122861586314153, "grad_norm": 0.8784655316390252, "learning_rate": 9.293854083635081e-06, "loss": 0.1673, "step": 1101 }, { "epoch": 0.1713841368584759, "grad_norm": 1.025281186756548, "learning_rate": 9.292601917878169e-06, "loss": 0.1715, "step": 1102 }, { "epoch": 0.17153965785381026, "grad_norm": 1.409333718683306, "learning_rate": 9.291348727417318e-06, "loss": 0.2155, "step": 1103 }, { "epoch": 0.17169517884914465, "grad_norm": 1.0469534251307742, "learning_rate": 9.290094512551679e-06, "loss": 0.1918, "step": 1104 }, { "epoch": 0.171850699844479, "grad_norm": 1.275008024365504, "learning_rate": 9.288839273580652e-06, "loss": 0.1264, "step": 1105 }, { "epoch": 0.17200622083981337, "grad_norm": 1.2168876399929267, "learning_rate": 9.287583010803882e-06, "loss": 0.2855, "step": 1106 }, { "epoch": 0.17216174183514774, "grad_norm": 1.2066762279123466, "learning_rate": 9.286325724521254e-06, "loss": 0.2242, "step": 1107 }, { "epoch": 0.17231726283048213, "grad_norm": 2.4948253959447144, "learning_rate": 9.285067415032902e-06, "loss": 0.2875, "step": 1108 }, { "epoch": 0.1724727838258165, "grad_norm": 1.8284540511597713, "learning_rate": 9.283808082639198e-06, "loss": 0.2049, "step": 1109 }, { "epoch": 0.17262830482115085, "grad_norm": 1.3355119525104016, "learning_rate": 9.282547727640767e-06, "loss": 0.1717, "step": 1110 }, { "epoch": 0.17278382581648521, "grad_norm": 1.0266534905254066, "learning_rate": 9.281286350338472e-06, "loss": 0.2066, "step": 1111 }, { "epoch": 0.1729393468118196, "grad_norm": 1.2099083780797275, "learning_rate": 9.280023951033418e-06, "loss": 0.2807, "step": 1112 }, { "epoch": 0.17309486780715397, "grad_norm": 0.949550488293792, "learning_rate": 9.278760530026963e-06, "loss": 0.1992, "step": 1113 }, { "epoch": 0.17325038880248833, "grad_norm": 1.0598653084819885, "learning_rate": 9.277496087620696e-06, "loss": 0.2358, "step": 1114 }, { "epoch": 0.1734059097978227, "grad_norm": 1.4050304182051088, "learning_rate": 9.276230624116464e-06, "loss": 0.2222, "step": 1115 }, { "epoch": 0.17356143079315708, "grad_norm": 0.9817712530234229, "learning_rate": 9.274964139816347e-06, "loss": 0.1931, "step": 1116 }, { "epoch": 0.17371695178849145, "grad_norm": 1.7060543693066812, "learning_rate": 9.273696635022674e-06, "loss": 0.2343, "step": 1117 }, { "epoch": 0.1738724727838258, "grad_norm": 1.2527360379181598, "learning_rate": 9.272428110038016e-06, "loss": 0.1717, "step": 1118 }, { "epoch": 0.17402799377916017, "grad_norm": 1.0592648467758805, "learning_rate": 9.271158565165186e-06, "loss": 0.1338, "step": 1119 }, { "epoch": 0.17418351477449456, "grad_norm": 1.1697431614729739, "learning_rate": 9.269888000707243e-06, "loss": 0.0937, "step": 1120 }, { "epoch": 0.17433903576982893, "grad_norm": 1.3666630215902802, "learning_rate": 9.26861641696749e-06, "loss": 0.195, "step": 1121 }, { "epoch": 0.1744945567651633, "grad_norm": 0.9618565647030869, "learning_rate": 9.267343814249468e-06, "loss": 0.175, "step": 1122 }, { "epoch": 0.17465007776049768, "grad_norm": 1.4220832361635052, "learning_rate": 9.266070192856968e-06, "loss": 0.1593, "step": 1123 }, { "epoch": 0.17480559875583204, "grad_norm": 0.776257033559064, "learning_rate": 9.264795553094022e-06, "loss": 0.2249, "step": 1124 }, { "epoch": 0.1749611197511664, "grad_norm": 1.2113799530837854, "learning_rate": 9.263519895264901e-06, "loss": 0.1907, "step": 1125 }, { "epoch": 0.17511664074650077, "grad_norm": 1.3082437362032786, "learning_rate": 9.262243219674126e-06, "loss": 0.2666, "step": 1126 }, { "epoch": 0.17527216174183516, "grad_norm": 1.872862944531211, "learning_rate": 9.260965526626452e-06, "loss": 0.1784, "step": 1127 }, { "epoch": 0.17542768273716952, "grad_norm": 1.3432522813757912, "learning_rate": 9.25968681642689e-06, "loss": 0.1451, "step": 1128 }, { "epoch": 0.17558320373250388, "grad_norm": 0.9703679937198076, "learning_rate": 9.258407089380679e-06, "loss": 0.1297, "step": 1129 }, { "epoch": 0.17573872472783825, "grad_norm": 1.0365436632456377, "learning_rate": 9.25712634579331e-06, "loss": 0.1761, "step": 1130 }, { "epoch": 0.17589424572317264, "grad_norm": 2.1522303469420994, "learning_rate": 9.255844585970516e-06, "loss": 0.1296, "step": 1131 }, { "epoch": 0.176049766718507, "grad_norm": 1.291217930882477, "learning_rate": 9.254561810218269e-06, "loss": 0.2044, "step": 1132 }, { "epoch": 0.17620528771384136, "grad_norm": 0.9937462574500329, "learning_rate": 9.253278018842786e-06, "loss": 0.1997, "step": 1133 }, { "epoch": 0.17636080870917573, "grad_norm": 0.9450489875743622, "learning_rate": 9.251993212150525e-06, "loss": 0.1747, "step": 1134 }, { "epoch": 0.17651632970451012, "grad_norm": 1.4735357191672043, "learning_rate": 9.250707390448187e-06, "loss": 0.2377, "step": 1135 }, { "epoch": 0.17667185069984448, "grad_norm": 0.957023692443933, "learning_rate": 9.24942055404272e-06, "loss": 0.1319, "step": 1136 }, { "epoch": 0.17682737169517884, "grad_norm": 0.9533362941250507, "learning_rate": 9.248132703241306e-06, "loss": 0.142, "step": 1137 }, { "epoch": 0.17698289269051323, "grad_norm": 1.1321821260027138, "learning_rate": 9.246843838351371e-06, "loss": 0.185, "step": 1138 }, { "epoch": 0.1771384136858476, "grad_norm": 0.6564569809439412, "learning_rate": 9.24555395968059e-06, "loss": 0.1511, "step": 1139 }, { "epoch": 0.17729393468118196, "grad_norm": 0.8235534803965409, "learning_rate": 9.244263067536872e-06, "loss": 0.1851, "step": 1140 }, { "epoch": 0.17744945567651632, "grad_norm": 0.97851675810554, "learning_rate": 9.24297116222837e-06, "loss": 0.2184, "step": 1141 }, { "epoch": 0.1776049766718507, "grad_norm": 1.1485004351012151, "learning_rate": 9.241678244063482e-06, "loss": 0.2106, "step": 1142 }, { "epoch": 0.17776049766718507, "grad_norm": 1.081146125371241, "learning_rate": 9.240384313350845e-06, "loss": 0.1844, "step": 1143 }, { "epoch": 0.17791601866251944, "grad_norm": 1.4013409835542678, "learning_rate": 9.239089370399338e-06, "loss": 0.2538, "step": 1144 }, { "epoch": 0.1780715396578538, "grad_norm": 6.587281038828778, "learning_rate": 9.237793415518083e-06, "loss": 0.2319, "step": 1145 }, { "epoch": 0.1782270606531882, "grad_norm": 1.463087775034242, "learning_rate": 9.23649644901644e-06, "loss": 0.1833, "step": 1146 }, { "epoch": 0.17838258164852255, "grad_norm": 0.8603221586452274, "learning_rate": 9.235198471204017e-06, "loss": 0.1652, "step": 1147 }, { "epoch": 0.17853810264385692, "grad_norm": 1.243900965186844, "learning_rate": 9.233899482390654e-06, "loss": 0.1688, "step": 1148 }, { "epoch": 0.17869362363919128, "grad_norm": 2.2219504182745964, "learning_rate": 9.232599482886444e-06, "loss": 0.2472, "step": 1149 }, { "epoch": 0.17884914463452567, "grad_norm": 0.8152250444616337, "learning_rate": 9.23129847300171e-06, "loss": 0.1542, "step": 1150 }, { "epoch": 0.17900466562986003, "grad_norm": 0.8972000242254355, "learning_rate": 9.229996453047022e-06, "loss": 0.1914, "step": 1151 }, { "epoch": 0.1791601866251944, "grad_norm": 1.3946215944007783, "learning_rate": 9.228693423333192e-06, "loss": 0.2517, "step": 1152 }, { "epoch": 0.17931570762052876, "grad_norm": 1.7211813642698215, "learning_rate": 9.227389384171272e-06, "loss": 0.1639, "step": 1153 }, { "epoch": 0.17947122861586315, "grad_norm": 1.045567391255685, "learning_rate": 9.22608433587255e-06, "loss": 0.1269, "step": 1154 }, { "epoch": 0.1796267496111975, "grad_norm": 1.6046875031988923, "learning_rate": 9.224778278748567e-06, "loss": 0.279, "step": 1155 }, { "epoch": 0.17978227060653187, "grad_norm": 1.204453994991899, "learning_rate": 9.223471213111089e-06, "loss": 0.1925, "step": 1156 }, { "epoch": 0.17993779160186626, "grad_norm": 1.3023690662744187, "learning_rate": 9.222163139272134e-06, "loss": 0.1788, "step": 1157 }, { "epoch": 0.18009331259720063, "grad_norm": 1.1433449264456945, "learning_rate": 9.220854057543958e-06, "loss": 0.2228, "step": 1158 }, { "epoch": 0.180248833592535, "grad_norm": 1.2657407961939997, "learning_rate": 9.219543968239057e-06, "loss": 0.1985, "step": 1159 }, { "epoch": 0.18040435458786935, "grad_norm": 1.0010295228905417, "learning_rate": 9.218232871670168e-06, "loss": 0.1976, "step": 1160 }, { "epoch": 0.18055987558320374, "grad_norm": 1.0125003198196167, "learning_rate": 9.216920768150266e-06, "loss": 0.1886, "step": 1161 }, { "epoch": 0.1807153965785381, "grad_norm": 1.2238980097949077, "learning_rate": 9.215607657992569e-06, "loss": 0.2848, "step": 1162 }, { "epoch": 0.18087091757387247, "grad_norm": 1.6192762802858285, "learning_rate": 9.214293541510537e-06, "loss": 0.1714, "step": 1163 }, { "epoch": 0.18102643856920683, "grad_norm": 1.0273533259054548, "learning_rate": 9.212978419017864e-06, "loss": 0.2001, "step": 1164 }, { "epoch": 0.18118195956454122, "grad_norm": 1.1061300881511378, "learning_rate": 9.211662290828493e-06, "loss": 0.2214, "step": 1165 }, { "epoch": 0.18133748055987559, "grad_norm": 1.061080909625091, "learning_rate": 9.210345157256597e-06, "loss": 0.1914, "step": 1166 }, { "epoch": 0.18149300155520995, "grad_norm": 0.6997117059310394, "learning_rate": 9.209027018616598e-06, "loss": 0.1434, "step": 1167 }, { "epoch": 0.1816485225505443, "grad_norm": 2.4894170703666125, "learning_rate": 9.207707875223153e-06, "loss": 0.154, "step": 1168 }, { "epoch": 0.1818040435458787, "grad_norm": 1.294402841120763, "learning_rate": 9.20638772739116e-06, "loss": 0.1398, "step": 1169 }, { "epoch": 0.18195956454121306, "grad_norm": 1.4691556974020672, "learning_rate": 9.205066575435754e-06, "loss": 0.2599, "step": 1170 }, { "epoch": 0.18211508553654743, "grad_norm": 1.5109698106731952, "learning_rate": 9.203744419672318e-06, "loss": 0.2715, "step": 1171 }, { "epoch": 0.1822706065318818, "grad_norm": 0.7824851605920647, "learning_rate": 9.202421260416464e-06, "loss": 0.155, "step": 1172 }, { "epoch": 0.18242612752721618, "grad_norm": 1.229104135640711, "learning_rate": 9.20109709798405e-06, "loss": 0.173, "step": 1173 }, { "epoch": 0.18258164852255054, "grad_norm": 1.493187696337834, "learning_rate": 9.199771932691172e-06, "loss": 0.1874, "step": 1174 }, { "epoch": 0.1827371695178849, "grad_norm": 1.3355865457774434, "learning_rate": 9.198445764854166e-06, "loss": 0.1868, "step": 1175 }, { "epoch": 0.1828926905132193, "grad_norm": 1.4822915990950787, "learning_rate": 9.19711859478961e-06, "loss": 0.1936, "step": 1176 }, { "epoch": 0.18304821150855366, "grad_norm": 1.3568523945836255, "learning_rate": 9.19579042281431e-06, "loss": 0.2351, "step": 1177 }, { "epoch": 0.18320373250388802, "grad_norm": 1.1221237537622042, "learning_rate": 9.194461249245326e-06, "loss": 0.1651, "step": 1178 }, { "epoch": 0.18335925349922239, "grad_norm": 1.0427220049147299, "learning_rate": 9.193131074399949e-06, "loss": 0.2095, "step": 1179 }, { "epoch": 0.18351477449455678, "grad_norm": 1.1443234808493088, "learning_rate": 9.191799898595706e-06, "loss": 0.1987, "step": 1180 }, { "epoch": 0.18367029548989114, "grad_norm": 0.8812799774315752, "learning_rate": 9.190467722150373e-06, "loss": 0.2529, "step": 1181 }, { "epoch": 0.1838258164852255, "grad_norm": 0.9190808713383141, "learning_rate": 9.189134545381954e-06, "loss": 0.2043, "step": 1182 }, { "epoch": 0.18398133748055986, "grad_norm": 1.1496814316391453, "learning_rate": 9.187800368608703e-06, "loss": 0.2166, "step": 1183 }, { "epoch": 0.18413685847589426, "grad_norm": 1.3800541644049227, "learning_rate": 9.1864651921491e-06, "loss": 0.2258, "step": 1184 }, { "epoch": 0.18429237947122862, "grad_norm": 0.91743359427612, "learning_rate": 9.185129016321877e-06, "loss": 0.1383, "step": 1185 }, { "epoch": 0.18444790046656298, "grad_norm": 1.4610869068656602, "learning_rate": 9.18379184144599e-06, "loss": 0.1508, "step": 1186 }, { "epoch": 0.18460342146189734, "grad_norm": 1.675711445184492, "learning_rate": 9.18245366784065e-06, "loss": 0.303, "step": 1187 }, { "epoch": 0.18475894245723173, "grad_norm": 0.7182617914658281, "learning_rate": 9.18111449582529e-06, "loss": 0.1663, "step": 1188 }, { "epoch": 0.1849144634525661, "grad_norm": 2.5919566299762105, "learning_rate": 9.179774325719593e-06, "loss": 0.1913, "step": 1189 }, { "epoch": 0.18506998444790046, "grad_norm": 1.5246187638405735, "learning_rate": 9.178433157843474e-06, "loss": 0.1974, "step": 1190 }, { "epoch": 0.18522550544323485, "grad_norm": 1.1870049850604156, "learning_rate": 9.17709099251709e-06, "loss": 0.1889, "step": 1191 }, { "epoch": 0.1853810264385692, "grad_norm": 1.2780528349437963, "learning_rate": 9.175747830060837e-06, "loss": 0.1682, "step": 1192 }, { "epoch": 0.18553654743390358, "grad_norm": 0.852064776201917, "learning_rate": 9.174403670795342e-06, "loss": 0.1786, "step": 1193 }, { "epoch": 0.18569206842923794, "grad_norm": 0.982736851978155, "learning_rate": 9.173058515041477e-06, "loss": 0.1759, "step": 1194 }, { "epoch": 0.18584758942457233, "grad_norm": 5.383045313258924, "learning_rate": 9.171712363120351e-06, "loss": 0.3862, "step": 1195 }, { "epoch": 0.1860031104199067, "grad_norm": 0.9849374073337689, "learning_rate": 9.170365215353306e-06, "loss": 0.1981, "step": 1196 }, { "epoch": 0.18615863141524105, "grad_norm": 1.1001803535527055, "learning_rate": 9.169017072061926e-06, "loss": 0.1989, "step": 1197 }, { "epoch": 0.18631415241057542, "grad_norm": 1.1570335250140034, "learning_rate": 9.167667933568032e-06, "loss": 0.1822, "step": 1198 }, { "epoch": 0.1864696734059098, "grad_norm": 1.6984581879530103, "learning_rate": 9.166317800193683e-06, "loss": 0.2171, "step": 1199 }, { "epoch": 0.18662519440124417, "grad_norm": 1.650860536979747, "learning_rate": 9.164966672261171e-06, "loss": 0.3055, "step": 1200 }, { "epoch": 0.18662519440124417, "eval_loss": 0.210090771317482, "eval_runtime": 9.4293, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 1200 }, { "epoch": 0.18678071539657853, "grad_norm": 1.416615143797259, "learning_rate": 9.163614550093035e-06, "loss": 0.1347, "step": 1201 }, { "epoch": 0.1869362363919129, "grad_norm": 1.3794733777830905, "learning_rate": 9.16226143401204e-06, "loss": 0.2041, "step": 1202 }, { "epoch": 0.1870917573872473, "grad_norm": 0.9282701621282511, "learning_rate": 9.160907324341199e-06, "loss": 0.1589, "step": 1203 }, { "epoch": 0.18724727838258165, "grad_norm": 1.4894253244171338, "learning_rate": 9.159552221403752e-06, "loss": 0.174, "step": 1204 }, { "epoch": 0.187402799377916, "grad_norm": 1.1504157025776975, "learning_rate": 9.158196125523182e-06, "loss": 0.1942, "step": 1205 }, { "epoch": 0.18755832037325038, "grad_norm": 0.7255523870962133, "learning_rate": 9.156839037023209e-06, "loss": 0.1925, "step": 1206 }, { "epoch": 0.18771384136858477, "grad_norm": 1.3297160614851913, "learning_rate": 9.155480956227789e-06, "loss": 0.2448, "step": 1207 }, { "epoch": 0.18786936236391913, "grad_norm": 1.2394203928257357, "learning_rate": 9.154121883461115e-06, "loss": 0.1644, "step": 1208 }, { "epoch": 0.1880248833592535, "grad_norm": 1.110942304313815, "learning_rate": 9.152761819047617e-06, "loss": 0.158, "step": 1209 }, { "epoch": 0.18818040435458788, "grad_norm": 0.8597754146450871, "learning_rate": 9.151400763311958e-06, "loss": 0.1765, "step": 1210 }, { "epoch": 0.18833592534992225, "grad_norm": 1.1244255534137637, "learning_rate": 9.150038716579046e-06, "loss": 0.14, "step": 1211 }, { "epoch": 0.1884914463452566, "grad_norm": 0.9441808017939254, "learning_rate": 9.148675679174017e-06, "loss": 0.1685, "step": 1212 }, { "epoch": 0.18864696734059097, "grad_norm": 1.49569762403274, "learning_rate": 9.147311651422248e-06, "loss": 0.1637, "step": 1213 }, { "epoch": 0.18880248833592536, "grad_norm": 1.0568658204953814, "learning_rate": 9.145946633649352e-06, "loss": 0.1713, "step": 1214 }, { "epoch": 0.18895800933125972, "grad_norm": 1.2127109888393217, "learning_rate": 9.144580626181176e-06, "loss": 0.161, "step": 1215 }, { "epoch": 0.1891135303265941, "grad_norm": 0.8503234486008238, "learning_rate": 9.143213629343807e-06, "loss": 0.1489, "step": 1216 }, { "epoch": 0.18926905132192845, "grad_norm": 0.9120088478974758, "learning_rate": 9.141845643463565e-06, "loss": 0.1939, "step": 1217 }, { "epoch": 0.18942457231726284, "grad_norm": 1.0121267789823751, "learning_rate": 9.140476668867008e-06, "loss": 0.15, "step": 1218 }, { "epoch": 0.1895800933125972, "grad_norm": 1.3638566134338714, "learning_rate": 9.13910670588093e-06, "loss": 0.2105, "step": 1219 }, { "epoch": 0.18973561430793157, "grad_norm": 1.6276021550806605, "learning_rate": 9.13773575483236e-06, "loss": 0.2869, "step": 1220 }, { "epoch": 0.18989113530326593, "grad_norm": 1.6764188720931026, "learning_rate": 9.136363816048562e-06, "loss": 0.1458, "step": 1221 }, { "epoch": 0.19004665629860032, "grad_norm": 0.6701780576831128, "learning_rate": 9.134990889857036e-06, "loss": 0.1842, "step": 1222 }, { "epoch": 0.19020217729393468, "grad_norm": 1.1322931167082202, "learning_rate": 9.133616976585522e-06, "loss": 0.2556, "step": 1223 }, { "epoch": 0.19035769828926905, "grad_norm": 1.2524154763717683, "learning_rate": 9.13224207656199e-06, "loss": 0.2104, "step": 1224 }, { "epoch": 0.1905132192846034, "grad_norm": 0.9592897430767787, "learning_rate": 9.130866190114649e-06, "loss": 0.2833, "step": 1225 }, { "epoch": 0.1906687402799378, "grad_norm": 1.7651472837705433, "learning_rate": 9.12948931757194e-06, "loss": 0.2524, "step": 1226 }, { "epoch": 0.19082426127527216, "grad_norm": 0.9879072001537496, "learning_rate": 9.128111459262543e-06, "loss": 0.1624, "step": 1227 }, { "epoch": 0.19097978227060652, "grad_norm": 1.320308534660155, "learning_rate": 9.126732615515373e-06, "loss": 0.2937, "step": 1228 }, { "epoch": 0.19113530326594091, "grad_norm": 1.6528470759003213, "learning_rate": 9.125352786659577e-06, "loss": 0.1824, "step": 1229 }, { "epoch": 0.19129082426127528, "grad_norm": 1.099113810582022, "learning_rate": 9.123971973024543e-06, "loss": 0.2282, "step": 1230 }, { "epoch": 0.19144634525660964, "grad_norm": 0.9906932002367946, "learning_rate": 9.122590174939887e-06, "loss": 0.1908, "step": 1231 }, { "epoch": 0.191601866251944, "grad_norm": 1.3700619269813867, "learning_rate": 9.121207392735465e-06, "loss": 0.1736, "step": 1232 }, { "epoch": 0.1917573872472784, "grad_norm": 0.9132669255091096, "learning_rate": 9.119823626741367e-06, "loss": 0.2559, "step": 1233 }, { "epoch": 0.19191290824261276, "grad_norm": 1.0158832597362466, "learning_rate": 9.118438877287913e-06, "loss": 0.218, "step": 1234 }, { "epoch": 0.19206842923794712, "grad_norm": 0.9172450560816615, "learning_rate": 9.11705314470567e-06, "loss": 0.2038, "step": 1235 }, { "epoch": 0.19222395023328148, "grad_norm": 1.0457809289045787, "learning_rate": 9.115666429325424e-06, "loss": 0.2383, "step": 1236 }, { "epoch": 0.19237947122861587, "grad_norm": 0.6123808194220389, "learning_rate": 9.114278731478207e-06, "loss": 0.1059, "step": 1237 }, { "epoch": 0.19253499222395024, "grad_norm": 0.8957445923668392, "learning_rate": 9.112890051495281e-06, "loss": 0.1753, "step": 1238 }, { "epoch": 0.1926905132192846, "grad_norm": 1.010302756648279, "learning_rate": 9.111500389708144e-06, "loss": 0.2162, "step": 1239 }, { "epoch": 0.19284603421461896, "grad_norm": 1.26307408847368, "learning_rate": 9.110109746448527e-06, "loss": 0.1901, "step": 1240 }, { "epoch": 0.19300155520995335, "grad_norm": 1.237621554432501, "learning_rate": 9.108718122048395e-06, "loss": 0.1746, "step": 1241 }, { "epoch": 0.19315707620528771, "grad_norm": 0.9172927280641415, "learning_rate": 9.107325516839952e-06, "loss": 0.1556, "step": 1242 }, { "epoch": 0.19331259720062208, "grad_norm": 1.7268710214147918, "learning_rate": 9.105931931155626e-06, "loss": 0.2808, "step": 1243 }, { "epoch": 0.19346811819595647, "grad_norm": 0.8932022562830918, "learning_rate": 9.10453736532809e-06, "loss": 0.1527, "step": 1244 }, { "epoch": 0.19362363919129083, "grad_norm": 1.2202712676463288, "learning_rate": 9.103141819690246e-06, "loss": 0.1376, "step": 1245 }, { "epoch": 0.1937791601866252, "grad_norm": 1.0826681500025592, "learning_rate": 9.101745294575227e-06, "loss": 0.1449, "step": 1246 }, { "epoch": 0.19393468118195956, "grad_norm": 1.1807575757930213, "learning_rate": 9.100347790316409e-06, "loss": 0.2126, "step": 1247 }, { "epoch": 0.19409020217729395, "grad_norm": 0.941763687751761, "learning_rate": 9.098949307247391e-06, "loss": 0.1632, "step": 1248 }, { "epoch": 0.1942457231726283, "grad_norm": 1.378441641768549, "learning_rate": 9.097549845702009e-06, "loss": 0.1906, "step": 1249 }, { "epoch": 0.19440124416796267, "grad_norm": 1.2339116886059447, "learning_rate": 9.09614940601434e-06, "loss": 0.2006, "step": 1250 }, { "epoch": 0.19455676516329704, "grad_norm": 1.1239344680494445, "learning_rate": 9.094747988518683e-06, "loss": 0.2336, "step": 1251 }, { "epoch": 0.19471228615863143, "grad_norm": 0.927588276459713, "learning_rate": 9.093345593549579e-06, "loss": 0.1449, "step": 1252 }, { "epoch": 0.1948678071539658, "grad_norm": 1.13724282637735, "learning_rate": 9.091942221441797e-06, "loss": 0.2126, "step": 1253 }, { "epoch": 0.19502332814930015, "grad_norm": 1.0365698182525573, "learning_rate": 9.090537872530343e-06, "loss": 0.1867, "step": 1254 }, { "epoch": 0.19517884914463451, "grad_norm": 0.9371814591941575, "learning_rate": 9.089132547150453e-06, "loss": 0.1618, "step": 1255 }, { "epoch": 0.1953343701399689, "grad_norm": 1.0697225550230685, "learning_rate": 9.0877262456376e-06, "loss": 0.1849, "step": 1256 }, { "epoch": 0.19548989113530327, "grad_norm": 1.5583498729530745, "learning_rate": 9.086318968327488e-06, "loss": 0.2014, "step": 1257 }, { "epoch": 0.19564541213063763, "grad_norm": 1.2271229677253923, "learning_rate": 9.084910715556052e-06, "loss": 0.2017, "step": 1258 }, { "epoch": 0.195800933125972, "grad_norm": 1.0026506309270833, "learning_rate": 9.083501487659461e-06, "loss": 0.1646, "step": 1259 }, { "epoch": 0.19595645412130638, "grad_norm": 1.2598951391108157, "learning_rate": 9.08209128497412e-06, "loss": 0.1851, "step": 1260 }, { "epoch": 0.19611197511664075, "grad_norm": 1.5838356552966606, "learning_rate": 9.080680107836662e-06, "loss": 0.1948, "step": 1261 }, { "epoch": 0.1962674961119751, "grad_norm": 1.1087104243969894, "learning_rate": 9.079267956583953e-06, "loss": 0.1687, "step": 1262 }, { "epoch": 0.1964230171073095, "grad_norm": 1.6020412697904411, "learning_rate": 9.077854831553097e-06, "loss": 0.1854, "step": 1263 }, { "epoch": 0.19657853810264386, "grad_norm": 1.0315547992066338, "learning_rate": 9.076440733081426e-06, "loss": 0.2211, "step": 1264 }, { "epoch": 0.19673405909797823, "grad_norm": 1.0349194289967332, "learning_rate": 9.075025661506505e-06, "loss": 0.182, "step": 1265 }, { "epoch": 0.1968895800933126, "grad_norm": 0.8148640872234216, "learning_rate": 9.073609617166129e-06, "loss": 0.2319, "step": 1266 }, { "epoch": 0.19704510108864698, "grad_norm": 0.8956967698145264, "learning_rate": 9.072192600398328e-06, "loss": 0.2318, "step": 1267 }, { "epoch": 0.19720062208398134, "grad_norm": 1.512397062737358, "learning_rate": 9.070774611541366e-06, "loss": 0.1279, "step": 1268 }, { "epoch": 0.1973561430793157, "grad_norm": 1.089155641459757, "learning_rate": 9.069355650933732e-06, "loss": 0.132, "step": 1269 }, { "epoch": 0.19751166407465007, "grad_norm": 1.15341700389814, "learning_rate": 9.06793571891416e-06, "loss": 0.1416, "step": 1270 }, { "epoch": 0.19766718506998446, "grad_norm": 1.2188604321419376, "learning_rate": 9.0665148158216e-06, "loss": 0.1635, "step": 1271 }, { "epoch": 0.19782270606531882, "grad_norm": 1.6133883720632236, "learning_rate": 9.065092941995245e-06, "loss": 0.185, "step": 1272 }, { "epoch": 0.19797822706065318, "grad_norm": 1.4486872766212289, "learning_rate": 9.063670097774513e-06, "loss": 0.2325, "step": 1273 }, { "epoch": 0.19813374805598755, "grad_norm": 1.557263365124596, "learning_rate": 9.062246283499058e-06, "loss": 0.1712, "step": 1274 }, { "epoch": 0.19828926905132194, "grad_norm": 1.9875754585690109, "learning_rate": 9.060821499508769e-06, "loss": 0.1843, "step": 1275 }, { "epoch": 0.1984447900466563, "grad_norm": 1.1418131416263584, "learning_rate": 9.059395746143756e-06, "loss": 0.1777, "step": 1276 }, { "epoch": 0.19860031104199066, "grad_norm": 1.0395361627239141, "learning_rate": 9.057969023744367e-06, "loss": 0.2194, "step": 1277 }, { "epoch": 0.19875583203732503, "grad_norm": 1.305159234748547, "learning_rate": 9.056541332651183e-06, "loss": 0.2141, "step": 1278 }, { "epoch": 0.19891135303265942, "grad_norm": 1.0849932011185046, "learning_rate": 9.055112673205014e-06, "loss": 0.1821, "step": 1279 }, { "epoch": 0.19906687402799378, "grad_norm": 0.979089764226756, "learning_rate": 9.053683045746897e-06, "loss": 0.269, "step": 1280 }, { "epoch": 0.19922239502332814, "grad_norm": 1.078405593629792, "learning_rate": 9.052252450618106e-06, "loss": 0.1413, "step": 1281 }, { "epoch": 0.19937791601866253, "grad_norm": 1.2031448135959215, "learning_rate": 9.050820888160145e-06, "loss": 0.2268, "step": 1282 }, { "epoch": 0.1995334370139969, "grad_norm": 0.9432997632179643, "learning_rate": 9.049388358714747e-06, "loss": 0.0856, "step": 1283 }, { "epoch": 0.19968895800933126, "grad_norm": 1.1798467376681538, "learning_rate": 9.04795486262388e-06, "loss": 0.1487, "step": 1284 }, { "epoch": 0.19984447900466562, "grad_norm": 0.9959594825238516, "learning_rate": 9.046520400229734e-06, "loss": 0.1363, "step": 1285 }, { "epoch": 0.2, "grad_norm": 1.2777597650080654, "learning_rate": 9.045084971874738e-06, "loss": 0.2053, "step": 1286 }, { "epoch": 0.20015552099533437, "grad_norm": 1.3807813898572032, "learning_rate": 9.04364857790155e-06, "loss": 0.1608, "step": 1287 }, { "epoch": 0.20031104199066874, "grad_norm": 1.213101350130223, "learning_rate": 9.042211218653054e-06, "loss": 0.1783, "step": 1288 }, { "epoch": 0.2004665629860031, "grad_norm": 1.270497799974636, "learning_rate": 9.040772894472369e-06, "loss": 0.1335, "step": 1289 }, { "epoch": 0.2006220839813375, "grad_norm": 1.143678584624158, "learning_rate": 9.039333605702844e-06, "loss": 0.2566, "step": 1290 }, { "epoch": 0.20077760497667185, "grad_norm": 0.9321591383595857, "learning_rate": 9.03789335268806e-06, "loss": 0.1517, "step": 1291 }, { "epoch": 0.20093312597200622, "grad_norm": 0.8482625172580437, "learning_rate": 9.036452135771818e-06, "loss": 0.2284, "step": 1292 }, { "epoch": 0.20108864696734058, "grad_norm": 1.5799008472731184, "learning_rate": 9.035009955298163e-06, "loss": 0.2491, "step": 1293 }, { "epoch": 0.20124416796267497, "grad_norm": 1.5021594414320747, "learning_rate": 9.03356681161136e-06, "loss": 0.1623, "step": 1294 }, { "epoch": 0.20139968895800933, "grad_norm": 1.1207507593154515, "learning_rate": 9.032122705055912e-06, "loss": 0.1996, "step": 1295 }, { "epoch": 0.2015552099533437, "grad_norm": 1.1753346897113919, "learning_rate": 9.030677635976542e-06, "loss": 0.156, "step": 1296 }, { "epoch": 0.20171073094867809, "grad_norm": 1.582912014985177, "learning_rate": 9.02923160471821e-06, "loss": 0.2852, "step": 1297 }, { "epoch": 0.20186625194401245, "grad_norm": 4.24419003235004, "learning_rate": 9.027784611626108e-06, "loss": 0.1857, "step": 1298 }, { "epoch": 0.2020217729393468, "grad_norm": 3.465507316165179, "learning_rate": 9.026336657045646e-06, "loss": 0.1331, "step": 1299 }, { "epoch": 0.20217729393468117, "grad_norm": 0.8992554022243577, "learning_rate": 9.024887741322475e-06, "loss": 0.1649, "step": 1300 }, { "epoch": 0.20217729393468117, "eval_loss": 0.20873166620731354, "eval_runtime": 9.4107, "eval_samples_per_second": 2.763, "eval_steps_per_second": 0.744, "step": 1300 }, { "epoch": 0.20233281493001556, "grad_norm": 1.2089278079623347, "learning_rate": 9.023437864802472e-06, "loss": 0.2705, "step": 1301 }, { "epoch": 0.20248833592534993, "grad_norm": 1.2901991665649666, "learning_rate": 9.021987027831743e-06, "loss": 0.1672, "step": 1302 }, { "epoch": 0.2026438569206843, "grad_norm": 1.5354719963652408, "learning_rate": 9.02053523075662e-06, "loss": 0.239, "step": 1303 }, { "epoch": 0.20279937791601865, "grad_norm": 1.214882523492219, "learning_rate": 9.01908247392367e-06, "loss": 0.1566, "step": 1304 }, { "epoch": 0.20295489891135304, "grad_norm": 1.473765899129253, "learning_rate": 9.017628757679685e-06, "loss": 0.1931, "step": 1305 }, { "epoch": 0.2031104199066874, "grad_norm": 2.6517165969707683, "learning_rate": 9.01617408237169e-06, "loss": 0.1307, "step": 1306 }, { "epoch": 0.20326594090202177, "grad_norm": 1.4993932954062734, "learning_rate": 9.01471844834693e-06, "loss": 0.2079, "step": 1307 }, { "epoch": 0.20342146189735613, "grad_norm": 1.0866992812991043, "learning_rate": 9.013261855952893e-06, "loss": 0.2361, "step": 1308 }, { "epoch": 0.20357698289269052, "grad_norm": 1.4691858213747517, "learning_rate": 9.011804305537281e-06, "loss": 0.2062, "step": 1309 }, { "epoch": 0.20373250388802489, "grad_norm": 1.218397331201916, "learning_rate": 9.010345797448037e-06, "loss": 0.1295, "step": 1310 }, { "epoch": 0.20388802488335925, "grad_norm": 1.317910015288317, "learning_rate": 9.008886332033323e-06, "loss": 0.221, "step": 1311 }, { "epoch": 0.2040435458786936, "grad_norm": 1.4368413534493716, "learning_rate": 9.007425909641538e-06, "loss": 0.3292, "step": 1312 }, { "epoch": 0.204199066874028, "grad_norm": 1.24467623609956, "learning_rate": 9.005964530621301e-06, "loss": 0.2276, "step": 1313 }, { "epoch": 0.20435458786936236, "grad_norm": 0.9849662601801316, "learning_rate": 9.004502195321468e-06, "loss": 0.1825, "step": 1314 }, { "epoch": 0.20451010886469673, "grad_norm": 3.783152250453029, "learning_rate": 9.003038904091113e-06, "loss": 0.1834, "step": 1315 }, { "epoch": 0.20466562986003112, "grad_norm": 1.0234608190416166, "learning_rate": 9.001574657279548e-06, "loss": 0.2172, "step": 1316 }, { "epoch": 0.20482115085536548, "grad_norm": 1.3240981295825394, "learning_rate": 9.00010945523631e-06, "loss": 0.1857, "step": 1317 }, { "epoch": 0.20497667185069984, "grad_norm": 1.1823107793426477, "learning_rate": 8.99864329831116e-06, "loss": 0.2747, "step": 1318 }, { "epoch": 0.2051321928460342, "grad_norm": 1.183188676477308, "learning_rate": 8.997176186854091e-06, "loss": 0.2091, "step": 1319 }, { "epoch": 0.2052877138413686, "grad_norm": 1.1306812200844953, "learning_rate": 8.995708121215325e-06, "loss": 0.1789, "step": 1320 }, { "epoch": 0.20544323483670296, "grad_norm": 1.3914844708441778, "learning_rate": 8.994239101745309e-06, "loss": 0.1626, "step": 1321 }, { "epoch": 0.20559875583203732, "grad_norm": 1.3328736681097808, "learning_rate": 8.992769128794717e-06, "loss": 0.1699, "step": 1322 }, { "epoch": 0.20575427682737168, "grad_norm": 1.3262550452320387, "learning_rate": 8.991298202714453e-06, "loss": 0.1985, "step": 1323 }, { "epoch": 0.20590979782270608, "grad_norm": 1.5863201904107513, "learning_rate": 8.989826323855647e-06, "loss": 0.2729, "step": 1324 }, { "epoch": 0.20606531881804044, "grad_norm": 1.0484153422588192, "learning_rate": 8.988353492569657e-06, "loss": 0.2243, "step": 1325 }, { "epoch": 0.2062208398133748, "grad_norm": 0.9724310873787251, "learning_rate": 8.986879709208069e-06, "loss": 0.2349, "step": 1326 }, { "epoch": 0.20637636080870916, "grad_norm": 1.319839764006134, "learning_rate": 8.985404974122699e-06, "loss": 0.1796, "step": 1327 }, { "epoch": 0.20653188180404355, "grad_norm": 1.7134943634197457, "learning_rate": 8.983929287665579e-06, "loss": 0.2289, "step": 1328 }, { "epoch": 0.20668740279937792, "grad_norm": 1.1812406274342315, "learning_rate": 8.98245265018898e-06, "loss": 0.2123, "step": 1329 }, { "epoch": 0.20684292379471228, "grad_norm": 1.4771839041530355, "learning_rate": 8.980975062045398e-06, "loss": 0.2228, "step": 1330 }, { "epoch": 0.20699844479004664, "grad_norm": 1.0073337669892177, "learning_rate": 8.979496523587552e-06, "loss": 0.1455, "step": 1331 }, { "epoch": 0.20715396578538103, "grad_norm": 0.8665969448850475, "learning_rate": 8.978017035168389e-06, "loss": 0.1689, "step": 1332 }, { "epoch": 0.2073094867807154, "grad_norm": 1.0555827692971853, "learning_rate": 8.976536597141085e-06, "loss": 0.1708, "step": 1333 }, { "epoch": 0.20746500777604976, "grad_norm": 0.8842215270037568, "learning_rate": 8.97505520985904e-06, "loss": 0.1751, "step": 1334 }, { "epoch": 0.20762052877138415, "grad_norm": 1.6924145041248846, "learning_rate": 8.973572873675882e-06, "loss": 0.1697, "step": 1335 }, { "epoch": 0.2077760497667185, "grad_norm": 1.1225115788471978, "learning_rate": 8.972089588945467e-06, "loss": 0.22, "step": 1336 }, { "epoch": 0.20793157076205288, "grad_norm": 1.1964311921620439, "learning_rate": 8.970605356021873e-06, "loss": 0.1953, "step": 1337 }, { "epoch": 0.20808709175738724, "grad_norm": 1.1874827397504135, "learning_rate": 8.96912017525941e-06, "loss": 0.1541, "step": 1338 }, { "epoch": 0.20824261275272163, "grad_norm": 1.2996586003784654, "learning_rate": 8.967634047012607e-06, "loss": 0.2543, "step": 1339 }, { "epoch": 0.208398133748056, "grad_norm": 1.9568915465615424, "learning_rate": 8.96614697163623e-06, "loss": 0.1742, "step": 1340 }, { "epoch": 0.20855365474339035, "grad_norm": 1.327702070183964, "learning_rate": 8.96465894948526e-06, "loss": 0.1688, "step": 1341 }, { "epoch": 0.20870917573872472, "grad_norm": 0.998729186682604, "learning_rate": 8.963169980914908e-06, "loss": 0.2165, "step": 1342 }, { "epoch": 0.2088646967340591, "grad_norm": 0.9250328323650552, "learning_rate": 8.961680066280614e-06, "loss": 0.1978, "step": 1343 }, { "epoch": 0.20902021772939347, "grad_norm": 0.685484375204563, "learning_rate": 8.96018920593804e-06, "loss": 0.1521, "step": 1344 }, { "epoch": 0.20917573872472783, "grad_norm": 1.2194077898180222, "learning_rate": 8.958697400243077e-06, "loss": 0.129, "step": 1345 }, { "epoch": 0.2093312597200622, "grad_norm": 1.3390006867631312, "learning_rate": 8.957204649551838e-06, "loss": 0.2295, "step": 1346 }, { "epoch": 0.2094867807153966, "grad_norm": 1.0791715779616644, "learning_rate": 8.955710954220664e-06, "loss": 0.1922, "step": 1347 }, { "epoch": 0.20964230171073095, "grad_norm": 1.0448818497216468, "learning_rate": 8.954216314606123e-06, "loss": 0.2074, "step": 1348 }, { "epoch": 0.2097978227060653, "grad_norm": 1.0968024521734823, "learning_rate": 8.952720731065e-06, "loss": 0.1956, "step": 1349 }, { "epoch": 0.2099533437013997, "grad_norm": 1.1729159260054676, "learning_rate": 8.95122420395432e-06, "loss": 0.1032, "step": 1350 }, { "epoch": 0.21010886469673407, "grad_norm": 0.7605452577854958, "learning_rate": 8.949726733631319e-06, "loss": 0.2173, "step": 1351 }, { "epoch": 0.21026438569206843, "grad_norm": 0.7896405561018206, "learning_rate": 8.948228320453465e-06, "loss": 0.1411, "step": 1352 }, { "epoch": 0.2104199066874028, "grad_norm": 1.3664851820052848, "learning_rate": 8.946728964778452e-06, "loss": 0.2043, "step": 1353 }, { "epoch": 0.21057542768273718, "grad_norm": 1.0930532560076165, "learning_rate": 8.945228666964197e-06, "loss": 0.2112, "step": 1354 }, { "epoch": 0.21073094867807154, "grad_norm": 1.3370376996193614, "learning_rate": 8.94372742736884e-06, "loss": 0.2763, "step": 1355 }, { "epoch": 0.2108864696734059, "grad_norm": 1.1733695403983486, "learning_rate": 8.942225246350748e-06, "loss": 0.1383, "step": 1356 }, { "epoch": 0.21104199066874027, "grad_norm": 1.518123240050466, "learning_rate": 8.940722124268515e-06, "loss": 0.2035, "step": 1357 }, { "epoch": 0.21119751166407466, "grad_norm": 0.7154774393150748, "learning_rate": 8.939218061480955e-06, "loss": 0.1513, "step": 1358 }, { "epoch": 0.21135303265940902, "grad_norm": 1.7277749667928948, "learning_rate": 8.937713058347109e-06, "loss": 0.1852, "step": 1359 }, { "epoch": 0.2115085536547434, "grad_norm": 0.8101754008908368, "learning_rate": 8.936207115226242e-06, "loss": 0.1755, "step": 1360 }, { "epoch": 0.21166407465007775, "grad_norm": 2.154263107894285, "learning_rate": 8.934700232477845e-06, "loss": 0.2284, "step": 1361 }, { "epoch": 0.21181959564541214, "grad_norm": 2.9946702775104552, "learning_rate": 8.933192410461632e-06, "loss": 0.1571, "step": 1362 }, { "epoch": 0.2119751166407465, "grad_norm": 1.3293853025848206, "learning_rate": 8.931683649537539e-06, "loss": 0.1818, "step": 1363 }, { "epoch": 0.21213063763608087, "grad_norm": 1.069623910831374, "learning_rate": 8.93017395006573e-06, "loss": 0.2389, "step": 1364 }, { "epoch": 0.21228615863141523, "grad_norm": 1.2692486168753456, "learning_rate": 8.928663312406593e-06, "loss": 0.1725, "step": 1365 }, { "epoch": 0.21244167962674962, "grad_norm": 2.31269662319102, "learning_rate": 8.927151736920733e-06, "loss": 0.3472, "step": 1366 }, { "epoch": 0.21259720062208398, "grad_norm": 1.3024374295612378, "learning_rate": 8.925639223968989e-06, "loss": 0.1601, "step": 1367 }, { "epoch": 0.21275272161741834, "grad_norm": 1.475662600105692, "learning_rate": 8.924125773912418e-06, "loss": 0.1652, "step": 1368 }, { "epoch": 0.21290824261275273, "grad_norm": 0.8719883727219597, "learning_rate": 8.9226113871123e-06, "loss": 0.2406, "step": 1369 }, { "epoch": 0.2130637636080871, "grad_norm": 1.355947295843189, "learning_rate": 8.921096063930141e-06, "loss": 0.2387, "step": 1370 }, { "epoch": 0.21321928460342146, "grad_norm": 1.462171782992857, "learning_rate": 8.919579804727671e-06, "loss": 0.2075, "step": 1371 }, { "epoch": 0.21337480559875582, "grad_norm": 1.4186556891621878, "learning_rate": 8.91806260986684e-06, "loss": 0.1906, "step": 1372 }, { "epoch": 0.21353032659409021, "grad_norm": 1.0297515081183366, "learning_rate": 8.916544479709826e-06, "loss": 0.1813, "step": 1373 }, { "epoch": 0.21368584758942458, "grad_norm": 0.8517207332254344, "learning_rate": 8.915025414619025e-06, "loss": 0.2314, "step": 1374 }, { "epoch": 0.21384136858475894, "grad_norm": 1.4500725099182117, "learning_rate": 8.91350541495706e-06, "loss": 0.2702, "step": 1375 }, { "epoch": 0.2139968895800933, "grad_norm": 1.4840249529134437, "learning_rate": 8.911984481086779e-06, "loss": 0.1957, "step": 1376 }, { "epoch": 0.2141524105754277, "grad_norm": 1.0812621557572404, "learning_rate": 8.910462613371246e-06, "loss": 0.1773, "step": 1377 }, { "epoch": 0.21430793157076206, "grad_norm": 0.8285771638848516, "learning_rate": 8.908939812173756e-06, "loss": 0.1879, "step": 1378 }, { "epoch": 0.21446345256609642, "grad_norm": 1.5413069191948623, "learning_rate": 8.907416077857818e-06, "loss": 0.2024, "step": 1379 }, { "epoch": 0.21461897356143078, "grad_norm": 1.5546998088262725, "learning_rate": 8.905891410787174e-06, "loss": 0.1297, "step": 1380 }, { "epoch": 0.21477449455676517, "grad_norm": 1.0276705986435684, "learning_rate": 8.904365811325779e-06, "loss": 0.1777, "step": 1381 }, { "epoch": 0.21493001555209953, "grad_norm": 2.186178551364591, "learning_rate": 8.902839279837818e-06, "loss": 0.1936, "step": 1382 }, { "epoch": 0.2150855365474339, "grad_norm": 1.409142378067793, "learning_rate": 8.901311816687693e-06, "loss": 0.2347, "step": 1383 }, { "epoch": 0.21524105754276826, "grad_norm": 0.909249039104448, "learning_rate": 8.899783422240031e-06, "loss": 0.1858, "step": 1384 }, { "epoch": 0.21539657853810265, "grad_norm": 1.389710830109919, "learning_rate": 8.898254096859681e-06, "loss": 0.2546, "step": 1385 }, { "epoch": 0.215552099533437, "grad_norm": 1.1722812780197163, "learning_rate": 8.896723840911718e-06, "loss": 0.2451, "step": 1386 }, { "epoch": 0.21570762052877138, "grad_norm": 1.0186256750739588, "learning_rate": 8.89519265476143e-06, "loss": 0.1423, "step": 1387 }, { "epoch": 0.21586314152410577, "grad_norm": 1.463755060922718, "learning_rate": 8.893660538774335e-06, "loss": 0.678, "step": 1388 }, { "epoch": 0.21601866251944013, "grad_norm": 1.2144290461428764, "learning_rate": 8.892127493316172e-06, "loss": 0.1289, "step": 1389 }, { "epoch": 0.2161741835147745, "grad_norm": 1.2754281076641276, "learning_rate": 8.8905935187529e-06, "loss": 0.1775, "step": 1390 }, { "epoch": 0.21632970451010886, "grad_norm": 0.8239843617970345, "learning_rate": 8.889058615450695e-06, "loss": 0.1379, "step": 1391 }, { "epoch": 0.21648522550544325, "grad_norm": 0.8183516543340216, "learning_rate": 8.887522783775965e-06, "loss": 0.4396, "step": 1392 }, { "epoch": 0.2166407465007776, "grad_norm": 1.163898200737944, "learning_rate": 8.885986024095334e-06, "loss": 0.1788, "step": 1393 }, { "epoch": 0.21679626749611197, "grad_norm": 1.0398663598746642, "learning_rate": 8.884448336775647e-06, "loss": 0.2058, "step": 1394 }, { "epoch": 0.21695178849144633, "grad_norm": 1.1038527572141106, "learning_rate": 8.882909722183973e-06, "loss": 0.1603, "step": 1395 }, { "epoch": 0.21710730948678073, "grad_norm": 1.1407224011212185, "learning_rate": 8.881370180687597e-06, "loss": 0.212, "step": 1396 }, { "epoch": 0.2172628304821151, "grad_norm": 1.171491183176733, "learning_rate": 8.879829712654032e-06, "loss": 0.156, "step": 1397 }, { "epoch": 0.21741835147744945, "grad_norm": 1.0914587320494888, "learning_rate": 8.878288318451006e-06, "loss": 0.0999, "step": 1398 }, { "epoch": 0.2175738724727838, "grad_norm": 1.0719935831541472, "learning_rate": 8.876745998446477e-06, "loss": 0.2026, "step": 1399 }, { "epoch": 0.2177293934681182, "grad_norm": 0.8893812774700685, "learning_rate": 8.875202753008614e-06, "loss": 0.1152, "step": 1400 }, { "epoch": 0.2177293934681182, "eval_loss": 0.20550738275051117, "eval_runtime": 9.4165, "eval_samples_per_second": 2.761, "eval_steps_per_second": 0.743, "step": 1400 }, { "epoch": 0.21788491446345257, "grad_norm": 1.1732595194107243, "learning_rate": 8.873658582505813e-06, "loss": 0.184, "step": 1401 }, { "epoch": 0.21804043545878693, "grad_norm": 1.3681566501491238, "learning_rate": 8.872113487306686e-06, "loss": 0.1787, "step": 1402 }, { "epoch": 0.2181959564541213, "grad_norm": 0.9384518321736989, "learning_rate": 8.870567467780073e-06, "loss": 0.1907, "step": 1403 }, { "epoch": 0.21835147744945568, "grad_norm": 1.6918321800893066, "learning_rate": 8.86902052429503e-06, "loss": 0.1814, "step": 1404 }, { "epoch": 0.21850699844479005, "grad_norm": 1.0615675392544648, "learning_rate": 8.867472657220829e-06, "loss": 0.1807, "step": 1405 }, { "epoch": 0.2186625194401244, "grad_norm": 1.2104557155019795, "learning_rate": 8.865923866926973e-06, "loss": 0.2046, "step": 1406 }, { "epoch": 0.2188180404354588, "grad_norm": 1.409015102478802, "learning_rate": 8.864374153783177e-06, "loss": 0.2415, "step": 1407 }, { "epoch": 0.21897356143079316, "grad_norm": 1.2088161026937052, "learning_rate": 8.86282351815938e-06, "loss": 0.1573, "step": 1408 }, { "epoch": 0.21912908242612752, "grad_norm": 1.0288225427805875, "learning_rate": 8.861271960425741e-06, "loss": 0.1812, "step": 1409 }, { "epoch": 0.2192846034214619, "grad_norm": 1.1067487085965078, "learning_rate": 8.859719480952637e-06, "loss": 0.1955, "step": 1410 }, { "epoch": 0.21944012441679628, "grad_norm": 1.331527983707418, "learning_rate": 8.858166080110666e-06, "loss": 0.2153, "step": 1411 }, { "epoch": 0.21959564541213064, "grad_norm": 1.3966673201995545, "learning_rate": 8.85661175827065e-06, "loss": 0.1861, "step": 1412 }, { "epoch": 0.219751166407465, "grad_norm": 1.7346922539447693, "learning_rate": 8.855056515803624e-06, "loss": 0.2217, "step": 1413 }, { "epoch": 0.21990668740279937, "grad_norm": 1.0429561703393233, "learning_rate": 8.853500353080848e-06, "loss": 0.137, "step": 1414 }, { "epoch": 0.22006220839813376, "grad_norm": 1.099146007367247, "learning_rate": 8.851943270473797e-06, "loss": 0.1888, "step": 1415 }, { "epoch": 0.22021772939346812, "grad_norm": 1.0622173162674204, "learning_rate": 8.850385268354171e-06, "loss": 0.2054, "step": 1416 }, { "epoch": 0.22037325038880248, "grad_norm": 1.7275165681110787, "learning_rate": 8.848826347093887e-06, "loss": 0.1839, "step": 1417 }, { "epoch": 0.22052877138413685, "grad_norm": 1.4049206778214125, "learning_rate": 8.84726650706508e-06, "loss": 0.2719, "step": 1418 }, { "epoch": 0.22068429237947124, "grad_norm": 0.984134518775913, "learning_rate": 8.845705748640104e-06, "loss": 0.2118, "step": 1419 }, { "epoch": 0.2208398133748056, "grad_norm": 0.8575267757080008, "learning_rate": 8.844144072191537e-06, "loss": 0.1633, "step": 1420 }, { "epoch": 0.22099533437013996, "grad_norm": 1.2572159208716647, "learning_rate": 8.842581478092172e-06, "loss": 0.2397, "step": 1421 }, { "epoch": 0.22115085536547435, "grad_norm": 1.2016606507273602, "learning_rate": 8.841017966715019e-06, "loss": 0.2033, "step": 1422 }, { "epoch": 0.22130637636080872, "grad_norm": 1.3276461025791215, "learning_rate": 8.839453538433314e-06, "loss": 0.1925, "step": 1423 }, { "epoch": 0.22146189735614308, "grad_norm": 0.8224092915871075, "learning_rate": 8.837888193620506e-06, "loss": 0.1358, "step": 1424 }, { "epoch": 0.22161741835147744, "grad_norm": 1.4495835386689406, "learning_rate": 8.836321932650266e-06, "loss": 0.2432, "step": 1425 }, { "epoch": 0.22177293934681183, "grad_norm": 1.4755944744177818, "learning_rate": 8.83475475589648e-06, "loss": 0.1231, "step": 1426 }, { "epoch": 0.2219284603421462, "grad_norm": 0.8119316049057401, "learning_rate": 8.833186663733258e-06, "loss": 0.2097, "step": 1427 }, { "epoch": 0.22208398133748056, "grad_norm": 1.0060829041279713, "learning_rate": 8.83161765653492e-06, "loss": 0.1738, "step": 1428 }, { "epoch": 0.22223950233281492, "grad_norm": 2.4145754711073733, "learning_rate": 8.830047734676018e-06, "loss": 0.2858, "step": 1429 }, { "epoch": 0.2223950233281493, "grad_norm": 1.1242173153619541, "learning_rate": 8.828476898531308e-06, "loss": 0.2166, "step": 1430 }, { "epoch": 0.22255054432348367, "grad_norm": 0.9324040289076934, "learning_rate": 8.826905148475772e-06, "loss": 0.1157, "step": 1431 }, { "epoch": 0.22270606531881804, "grad_norm": 1.1091005510043248, "learning_rate": 8.82533248488461e-06, "loss": 0.2387, "step": 1432 }, { "epoch": 0.2228615863141524, "grad_norm": 1.5660091935097067, "learning_rate": 8.823758908133237e-06, "loss": 0.1783, "step": 1433 }, { "epoch": 0.2230171073094868, "grad_norm": 1.7595194847301099, "learning_rate": 8.822184418597289e-06, "loss": 0.1971, "step": 1434 }, { "epoch": 0.22317262830482115, "grad_norm": 1.1991294408769844, "learning_rate": 8.820609016652616e-06, "loss": 0.1993, "step": 1435 }, { "epoch": 0.22332814930015552, "grad_norm": 1.16155323748872, "learning_rate": 8.819032702675293e-06, "loss": 0.1663, "step": 1436 }, { "epoch": 0.22348367029548988, "grad_norm": 1.144471577400653, "learning_rate": 8.817455477041605e-06, "loss": 0.1553, "step": 1437 }, { "epoch": 0.22363919129082427, "grad_norm": 1.1758756635872867, "learning_rate": 8.815877340128059e-06, "loss": 0.1997, "step": 1438 }, { "epoch": 0.22379471228615863, "grad_norm": 1.0774573442962538, "learning_rate": 8.814298292311376e-06, "loss": 0.224, "step": 1439 }, { "epoch": 0.223950233281493, "grad_norm": 1.018897824496983, "learning_rate": 8.812718333968498e-06, "loss": 0.1969, "step": 1440 }, { "epoch": 0.22410575427682738, "grad_norm": 0.7464671714955523, "learning_rate": 8.811137465476584e-06, "loss": 0.1704, "step": 1441 }, { "epoch": 0.22426127527216175, "grad_norm": 1.120267062163412, "learning_rate": 8.80955568721301e-06, "loss": 0.1941, "step": 1442 }, { "epoch": 0.2244167962674961, "grad_norm": 1.57559360058438, "learning_rate": 8.807972999555368e-06, "loss": 0.2603, "step": 1443 }, { "epoch": 0.22457231726283047, "grad_norm": 0.939702806688543, "learning_rate": 8.806389402881466e-06, "loss": 0.2116, "step": 1444 }, { "epoch": 0.22472783825816486, "grad_norm": 1.2188628504615986, "learning_rate": 8.80480489756933e-06, "loss": 0.2424, "step": 1445 }, { "epoch": 0.22488335925349923, "grad_norm": 1.0944670304974327, "learning_rate": 8.803219483997205e-06, "loss": 0.1828, "step": 1446 }, { "epoch": 0.2250388802488336, "grad_norm": 1.6182633129569433, "learning_rate": 8.801633162543555e-06, "loss": 0.1964, "step": 1447 }, { "epoch": 0.22519440124416795, "grad_norm": 0.8387542620173406, "learning_rate": 8.800045933587052e-06, "loss": 0.1585, "step": 1448 }, { "epoch": 0.22534992223950234, "grad_norm": 1.2464384825217707, "learning_rate": 8.798457797506588e-06, "loss": 0.1134, "step": 1449 }, { "epoch": 0.2255054432348367, "grad_norm": 0.9893884401535724, "learning_rate": 8.79686875468128e-06, "loss": 0.2421, "step": 1450 }, { "epoch": 0.22566096423017107, "grad_norm": 1.0480572189617101, "learning_rate": 8.79527880549045e-06, "loss": 0.1921, "step": 1451 }, { "epoch": 0.22581648522550543, "grad_norm": 1.3152458887234093, "learning_rate": 8.793687950313643e-06, "loss": 0.1597, "step": 1452 }, { "epoch": 0.22597200622083982, "grad_norm": 1.0970403207876425, "learning_rate": 8.792096189530614e-06, "loss": 0.1854, "step": 1453 }, { "epoch": 0.22612752721617418, "grad_norm": 1.0705518033654797, "learning_rate": 8.790503523521346e-06, "loss": 0.1839, "step": 1454 }, { "epoch": 0.22628304821150855, "grad_norm": 1.132932961220967, "learning_rate": 8.788909952666024e-06, "loss": 0.1871, "step": 1455 }, { "epoch": 0.2264385692068429, "grad_norm": 1.4797221000535143, "learning_rate": 8.787315477345059e-06, "loss": 0.2295, "step": 1456 }, { "epoch": 0.2265940902021773, "grad_norm": 1.0944162670416104, "learning_rate": 8.785720097939075e-06, "loss": 0.1745, "step": 1457 }, { "epoch": 0.22674961119751166, "grad_norm": 1.6430830107526284, "learning_rate": 8.784123814828908e-06, "loss": 0.3592, "step": 1458 }, { "epoch": 0.22690513219284603, "grad_norm": 1.1438907408683774, "learning_rate": 8.782526628395616e-06, "loss": 0.1613, "step": 1459 }, { "epoch": 0.22706065318818042, "grad_norm": 2.655708868757693, "learning_rate": 8.780928539020467e-06, "loss": 0.1821, "step": 1460 }, { "epoch": 0.22721617418351478, "grad_norm": 0.9605535718803637, "learning_rate": 8.779329547084949e-06, "loss": 0.1707, "step": 1461 }, { "epoch": 0.22737169517884914, "grad_norm": 2.2075086894366036, "learning_rate": 8.777729652970765e-06, "loss": 0.1383, "step": 1462 }, { "epoch": 0.2275272161741835, "grad_norm": 1.1974721511606266, "learning_rate": 8.77612885705983e-06, "loss": 0.2615, "step": 1463 }, { "epoch": 0.2276827371695179, "grad_norm": 1.076273551290465, "learning_rate": 8.774527159734277e-06, "loss": 0.2094, "step": 1464 }, { "epoch": 0.22783825816485226, "grad_norm": 1.3601919661341624, "learning_rate": 8.772924561376454e-06, "loss": 0.2324, "step": 1465 }, { "epoch": 0.22799377916018662, "grad_norm": 1.4328079078867457, "learning_rate": 8.771321062368922e-06, "loss": 0.1763, "step": 1466 }, { "epoch": 0.22814930015552098, "grad_norm": 1.1869126356200645, "learning_rate": 8.76971666309446e-06, "loss": 0.1093, "step": 1467 }, { "epoch": 0.22830482115085537, "grad_norm": 0.8016043523305539, "learning_rate": 8.768111363936058e-06, "loss": 0.1716, "step": 1468 }, { "epoch": 0.22846034214618974, "grad_norm": 1.1279000832737547, "learning_rate": 8.766505165276928e-06, "loss": 0.1415, "step": 1469 }, { "epoch": 0.2286158631415241, "grad_norm": 1.4632653437041683, "learning_rate": 8.764898067500488e-06, "loss": 0.1682, "step": 1470 }, { "epoch": 0.22877138413685846, "grad_norm": 1.427331448842405, "learning_rate": 8.763290070990377e-06, "loss": 0.261, "step": 1471 }, { "epoch": 0.22892690513219285, "grad_norm": 0.9926126679211552, "learning_rate": 8.761681176130443e-06, "loss": 0.1625, "step": 1472 }, { "epoch": 0.22908242612752722, "grad_norm": 1.690385156533882, "learning_rate": 8.760071383304755e-06, "loss": 0.2803, "step": 1473 }, { "epoch": 0.22923794712286158, "grad_norm": 1.0976612977720204, "learning_rate": 8.758460692897593e-06, "loss": 0.1802, "step": 1474 }, { "epoch": 0.22939346811819597, "grad_norm": 1.2314757179900722, "learning_rate": 8.756849105293447e-06, "loss": 0.1768, "step": 1475 }, { "epoch": 0.22954898911353033, "grad_norm": 1.1327643054428198, "learning_rate": 8.755236620877033e-06, "loss": 0.1865, "step": 1476 }, { "epoch": 0.2297045101088647, "grad_norm": 1.1639229615649782, "learning_rate": 8.753623240033265e-06, "loss": 0.1524, "step": 1477 }, { "epoch": 0.22986003110419906, "grad_norm": 0.9603164098229106, "learning_rate": 8.752008963147285e-06, "loss": 0.1721, "step": 1478 }, { "epoch": 0.23001555209953345, "grad_norm": 1.38792631561096, "learning_rate": 8.750393790604442e-06, "loss": 0.2342, "step": 1479 }, { "epoch": 0.2301710730948678, "grad_norm": 1.2479053691859538, "learning_rate": 8.7487777227903e-06, "loss": 0.1938, "step": 1480 }, { "epoch": 0.23032659409020217, "grad_norm": 1.2509939431760002, "learning_rate": 8.747160760090637e-06, "loss": 0.1844, "step": 1481 }, { "epoch": 0.23048211508553654, "grad_norm": 1.465934150389407, "learning_rate": 8.745542902891444e-06, "loss": 0.205, "step": 1482 }, { "epoch": 0.23063763608087093, "grad_norm": 1.0510694170069674, "learning_rate": 8.743924151578928e-06, "loss": 0.1759, "step": 1483 }, { "epoch": 0.2307931570762053, "grad_norm": 1.2869382169156265, "learning_rate": 8.742304506539506e-06, "loss": 0.1634, "step": 1484 }, { "epoch": 0.23094867807153965, "grad_norm": 2.0849533877813067, "learning_rate": 8.740683968159808e-06, "loss": 0.1834, "step": 1485 }, { "epoch": 0.23110419906687402, "grad_norm": 0.5391088701503829, "learning_rate": 8.739062536826683e-06, "loss": 0.1062, "step": 1486 }, { "epoch": 0.2312597200622084, "grad_norm": 1.339043790882886, "learning_rate": 8.737440212927188e-06, "loss": 0.154, "step": 1487 }, { "epoch": 0.23141524105754277, "grad_norm": 1.2239049109865379, "learning_rate": 8.735816996848592e-06, "loss": 0.1694, "step": 1488 }, { "epoch": 0.23157076205287713, "grad_norm": 0.8785721668205927, "learning_rate": 8.734192888978381e-06, "loss": 0.1501, "step": 1489 }, { "epoch": 0.2317262830482115, "grad_norm": 1.1018359589714184, "learning_rate": 8.732567889704253e-06, "loss": 0.2004, "step": 1490 }, { "epoch": 0.23188180404354589, "grad_norm": 1.2782960384351885, "learning_rate": 8.730941999414117e-06, "loss": 0.1514, "step": 1491 }, { "epoch": 0.23203732503888025, "grad_norm": 0.7470536578634075, "learning_rate": 8.729315218496097e-06, "loss": 0.1828, "step": 1492 }, { "epoch": 0.2321928460342146, "grad_norm": 1.0314729949458916, "learning_rate": 8.727687547338527e-06, "loss": 0.1766, "step": 1493 }, { "epoch": 0.232348367029549, "grad_norm": 1.435780946058732, "learning_rate": 8.726058986329954e-06, "loss": 0.2574, "step": 1494 }, { "epoch": 0.23250388802488337, "grad_norm": 1.3013711909380183, "learning_rate": 8.72442953585914e-06, "loss": 0.2304, "step": 1495 }, { "epoch": 0.23265940902021773, "grad_norm": 1.3258835525000316, "learning_rate": 8.722799196315057e-06, "loss": 0.1649, "step": 1496 }, { "epoch": 0.2328149300155521, "grad_norm": 1.4810824648278473, "learning_rate": 8.721167968086888e-06, "loss": 0.2786, "step": 1497 }, { "epoch": 0.23297045101088648, "grad_norm": 0.8879588001193606, "learning_rate": 8.719535851564034e-06, "loss": 0.1662, "step": 1498 }, { "epoch": 0.23312597200622084, "grad_norm": 1.0006636128134747, "learning_rate": 8.7179028471361e-06, "loss": 0.144, "step": 1499 }, { "epoch": 0.2332814930015552, "grad_norm": 1.0732426035660707, "learning_rate": 8.716268955192908e-06, "loss": 0.1799, "step": 1500 }, { "epoch": 0.2332814930015552, "eval_loss": 0.20381511747837067, "eval_runtime": 9.4315, "eval_samples_per_second": 2.757, "eval_steps_per_second": 0.742, "step": 1500 }, { "epoch": 0.23343701399688957, "grad_norm": 1.1848798776210054, "learning_rate": 8.714634176124492e-06, "loss": 0.2192, "step": 1501 }, { "epoch": 0.23359253499222396, "grad_norm": 1.1734962627193575, "learning_rate": 8.712998510321095e-06, "loss": 0.2218, "step": 1502 }, { "epoch": 0.23374805598755832, "grad_norm": 1.0346380522248477, "learning_rate": 8.711361958173175e-06, "loss": 0.1561, "step": 1503 }, { "epoch": 0.23390357698289269, "grad_norm": 0.8380236750022618, "learning_rate": 8.709724520071399e-06, "loss": 0.1238, "step": 1504 }, { "epoch": 0.23405909797822705, "grad_norm": 0.8234400155679666, "learning_rate": 8.708086196406646e-06, "loss": 0.1887, "step": 1505 }, { "epoch": 0.23421461897356144, "grad_norm": 1.3627952832885772, "learning_rate": 8.706446987570005e-06, "loss": 0.1739, "step": 1506 }, { "epoch": 0.2343701399688958, "grad_norm": 2.486707766460104, "learning_rate": 8.704806893952782e-06, "loss": 0.1462, "step": 1507 }, { "epoch": 0.23452566096423016, "grad_norm": 1.041812062354574, "learning_rate": 8.703165915946488e-06, "loss": 0.2247, "step": 1508 }, { "epoch": 0.23468118195956453, "grad_norm": 1.2090827115985525, "learning_rate": 8.701524053942846e-06, "loss": 0.1931, "step": 1509 }, { "epoch": 0.23483670295489892, "grad_norm": 0.7956311279751848, "learning_rate": 8.699881308333794e-06, "loss": 0.1801, "step": 1510 }, { "epoch": 0.23499222395023328, "grad_norm": 2.3005427634248017, "learning_rate": 8.698237679511476e-06, "loss": 0.2116, "step": 1511 }, { "epoch": 0.23514774494556764, "grad_norm": 1.1297158899245439, "learning_rate": 8.696593167868252e-06, "loss": 0.2319, "step": 1512 }, { "epoch": 0.23530326594090203, "grad_norm": 0.960775125545338, "learning_rate": 8.694947773796685e-06, "loss": 0.1543, "step": 1513 }, { "epoch": 0.2354587869362364, "grad_norm": 1.213893040863673, "learning_rate": 8.69330149768956e-06, "loss": 0.2041, "step": 1514 }, { "epoch": 0.23561430793157076, "grad_norm": 0.8074468351762752, "learning_rate": 8.69165433993986e-06, "loss": 0.1965, "step": 1515 }, { "epoch": 0.23576982892690512, "grad_norm": 1.1267774919804718, "learning_rate": 8.690006300940789e-06, "loss": 0.1823, "step": 1516 }, { "epoch": 0.2359253499222395, "grad_norm": 1.4711843699980223, "learning_rate": 8.688357381085753e-06, "loss": 0.1753, "step": 1517 }, { "epoch": 0.23608087091757388, "grad_norm": 1.0215570051060534, "learning_rate": 8.686707580768376e-06, "loss": 0.214, "step": 1518 }, { "epoch": 0.23623639191290824, "grad_norm": 1.4485746749390973, "learning_rate": 8.685056900382486e-06, "loss": 0.1742, "step": 1519 }, { "epoch": 0.2363919129082426, "grad_norm": 1.6525523323599767, "learning_rate": 8.683405340322123e-06, "loss": 0.3261, "step": 1520 }, { "epoch": 0.236547433903577, "grad_norm": 1.411135121552525, "learning_rate": 8.681752900981539e-06, "loss": 0.1753, "step": 1521 }, { "epoch": 0.23670295489891136, "grad_norm": 1.4707330597490842, "learning_rate": 8.680099582755196e-06, "loss": 0.1668, "step": 1522 }, { "epoch": 0.23685847589424572, "grad_norm": 1.0942391175538886, "learning_rate": 8.678445386037759e-06, "loss": 0.1601, "step": 1523 }, { "epoch": 0.23701399688958008, "grad_norm": 1.470588177448403, "learning_rate": 8.67679031122411e-06, "loss": 0.246, "step": 1524 }, { "epoch": 0.23716951788491447, "grad_norm": 0.9581346042453303, "learning_rate": 8.675134358709341e-06, "loss": 0.1574, "step": 1525 }, { "epoch": 0.23732503888024883, "grad_norm": 1.4763786660245666, "learning_rate": 8.67347752888875e-06, "loss": 0.1907, "step": 1526 }, { "epoch": 0.2374805598755832, "grad_norm": 1.0363167034974192, "learning_rate": 8.671819822157842e-06, "loss": 0.1531, "step": 1527 }, { "epoch": 0.2376360808709176, "grad_norm": 1.1924345869848432, "learning_rate": 8.670161238912338e-06, "loss": 0.1347, "step": 1528 }, { "epoch": 0.23779160186625195, "grad_norm": 1.3358065512422586, "learning_rate": 8.668501779548165e-06, "loss": 0.1827, "step": 1529 }, { "epoch": 0.2379471228615863, "grad_norm": 2.021234266844145, "learning_rate": 8.666841444461456e-06, "loss": 0.1368, "step": 1530 }, { "epoch": 0.23810264385692068, "grad_norm": 1.4808660901110622, "learning_rate": 8.665180234048561e-06, "loss": 0.2527, "step": 1531 }, { "epoch": 0.23825816485225507, "grad_norm": 1.065494309629267, "learning_rate": 8.66351814870603e-06, "loss": 0.1645, "step": 1532 }, { "epoch": 0.23841368584758943, "grad_norm": 1.154174016882306, "learning_rate": 8.661855188830626e-06, "loss": 0.2328, "step": 1533 }, { "epoch": 0.2385692068429238, "grad_norm": 1.1447203609781391, "learning_rate": 8.660191354819324e-06, "loss": 0.1794, "step": 1534 }, { "epoch": 0.23872472783825816, "grad_norm": 0.9991428522588004, "learning_rate": 8.658526647069303e-06, "loss": 0.1233, "step": 1535 }, { "epoch": 0.23888024883359255, "grad_norm": 0.7670014014044277, "learning_rate": 8.65686106597795e-06, "loss": 0.1834, "step": 1536 }, { "epoch": 0.2390357698289269, "grad_norm": 1.5945089662017708, "learning_rate": 8.655194611942863e-06, "loss": 0.1921, "step": 1537 }, { "epoch": 0.23919129082426127, "grad_norm": 1.2997434550841578, "learning_rate": 8.65352728536185e-06, "loss": 0.1873, "step": 1538 }, { "epoch": 0.23934681181959563, "grad_norm": 0.7625665208100638, "learning_rate": 8.651859086632924e-06, "loss": 0.1049, "step": 1539 }, { "epoch": 0.23950233281493002, "grad_norm": 2.315830524891549, "learning_rate": 8.650190016154307e-06, "loss": 0.2199, "step": 1540 }, { "epoch": 0.2396578538102644, "grad_norm": 0.893513036921711, "learning_rate": 8.648520074324429e-06, "loss": 0.1486, "step": 1541 }, { "epoch": 0.23981337480559875, "grad_norm": 1.0954057776977126, "learning_rate": 8.64684926154193e-06, "loss": 0.143, "step": 1542 }, { "epoch": 0.2399688958009331, "grad_norm": 1.1636396222045602, "learning_rate": 8.645177578205654e-06, "loss": 0.1386, "step": 1543 }, { "epoch": 0.2401244167962675, "grad_norm": 1.6636278556595083, "learning_rate": 8.643505024714656e-06, "loss": 0.2057, "step": 1544 }, { "epoch": 0.24027993779160187, "grad_norm": 0.847583750776468, "learning_rate": 8.641831601468198e-06, "loss": 0.1272, "step": 1545 }, { "epoch": 0.24043545878693623, "grad_norm": 1.1676164916999088, "learning_rate": 8.640157308865751e-06, "loss": 0.2057, "step": 1546 }, { "epoch": 0.24059097978227062, "grad_norm": 1.1944835161358125, "learning_rate": 8.63848214730699e-06, "loss": 0.2237, "step": 1547 }, { "epoch": 0.24074650077760498, "grad_norm": 1.3051952058816747, "learning_rate": 8.6368061171918e-06, "loss": 0.1398, "step": 1548 }, { "epoch": 0.24090202177293935, "grad_norm": 1.2433159998532273, "learning_rate": 8.635129218920272e-06, "loss": 0.1514, "step": 1549 }, { "epoch": 0.2410575427682737, "grad_norm": 1.6469350149721569, "learning_rate": 8.633451452892707e-06, "loss": 0.2141, "step": 1550 }, { "epoch": 0.2412130637636081, "grad_norm": 1.0473985194623197, "learning_rate": 8.631772819509609e-06, "loss": 0.1629, "step": 1551 }, { "epoch": 0.24136858475894246, "grad_norm": 1.081030634052537, "learning_rate": 8.630093319171692e-06, "loss": 0.1647, "step": 1552 }, { "epoch": 0.24152410575427682, "grad_norm": 1.0002048515938975, "learning_rate": 8.628412952279879e-06, "loss": 0.1636, "step": 1553 }, { "epoch": 0.2416796267496112, "grad_norm": 1.2635804994332953, "learning_rate": 8.62673171923529e-06, "loss": 0.1922, "step": 1554 }, { "epoch": 0.24183514774494558, "grad_norm": 1.0841589283406547, "learning_rate": 8.625049620439266e-06, "loss": 0.1796, "step": 1555 }, { "epoch": 0.24199066874027994, "grad_norm": 1.2588626615586416, "learning_rate": 8.623366656293345e-06, "loss": 0.2045, "step": 1556 }, { "epoch": 0.2421461897356143, "grad_norm": 1.114070429674418, "learning_rate": 8.621682827199271e-06, "loss": 0.2155, "step": 1557 }, { "epoch": 0.24230171073094867, "grad_norm": 1.122877032526039, "learning_rate": 8.619998133559001e-06, "loss": 0.1647, "step": 1558 }, { "epoch": 0.24245723172628306, "grad_norm": 2.039494379737774, "learning_rate": 8.618312575774696e-06, "loss": 0.2327, "step": 1559 }, { "epoch": 0.24261275272161742, "grad_norm": 1.1450723191422727, "learning_rate": 8.616626154248717e-06, "loss": 0.1879, "step": 1560 }, { "epoch": 0.24276827371695178, "grad_norm": 1.1035439479736404, "learning_rate": 8.614938869383643e-06, "loss": 0.1987, "step": 1561 }, { "epoch": 0.24292379471228615, "grad_norm": 9.183796995970361, "learning_rate": 8.613250721582244e-06, "loss": 0.1657, "step": 1562 }, { "epoch": 0.24307931570762054, "grad_norm": 10.346790090579951, "learning_rate": 8.611561711247512e-06, "loss": 0.1277, "step": 1563 }, { "epoch": 0.2432348367029549, "grad_norm": 1.0950378522648088, "learning_rate": 8.609871838782636e-06, "loss": 0.1792, "step": 1564 }, { "epoch": 0.24339035769828926, "grad_norm": 1.2442899837619454, "learning_rate": 8.608181104591008e-06, "loss": 0.2481, "step": 1565 }, { "epoch": 0.24354587869362365, "grad_norm": 0.9579587283389649, "learning_rate": 8.606489509076232e-06, "loss": 0.1464, "step": 1566 }, { "epoch": 0.24370139968895801, "grad_norm": 1.3434609920952423, "learning_rate": 8.604797052642118e-06, "loss": 0.167, "step": 1567 }, { "epoch": 0.24385692068429238, "grad_norm": 1.3932778191886934, "learning_rate": 8.603103735692678e-06, "loss": 0.222, "step": 1568 }, { "epoch": 0.24401244167962674, "grad_norm": 1.2606515150004263, "learning_rate": 8.601409558632125e-06, "loss": 0.1734, "step": 1569 }, { "epoch": 0.24416796267496113, "grad_norm": 0.7524170445152542, "learning_rate": 8.59971452186489e-06, "loss": 0.1377, "step": 1570 }, { "epoch": 0.2443234836702955, "grad_norm": 1.8039225543958133, "learning_rate": 8.5980186257956e-06, "loss": 0.1645, "step": 1571 }, { "epoch": 0.24447900466562986, "grad_norm": 1.2660119379119157, "learning_rate": 8.596321870829084e-06, "loss": 0.1297, "step": 1572 }, { "epoch": 0.24463452566096422, "grad_norm": 0.9837487875887194, "learning_rate": 8.594624257370388e-06, "loss": 0.2292, "step": 1573 }, { "epoch": 0.2447900466562986, "grad_norm": 1.4946436207685003, "learning_rate": 8.592925785824753e-06, "loss": 0.171, "step": 1574 }, { "epoch": 0.24494556765163297, "grad_norm": 1.0654266730537136, "learning_rate": 8.591226456597626e-06, "loss": 0.1375, "step": 1575 }, { "epoch": 0.24510108864696734, "grad_norm": 0.971876018180366, "learning_rate": 8.589526270094664e-06, "loss": 0.1924, "step": 1576 }, { "epoch": 0.2452566096423017, "grad_norm": 1.0087644300116139, "learning_rate": 8.587825226721722e-06, "loss": 0.1687, "step": 1577 }, { "epoch": 0.2454121306376361, "grad_norm": 1.1652659496533695, "learning_rate": 8.586123326884865e-06, "loss": 0.186, "step": 1578 }, { "epoch": 0.24556765163297045, "grad_norm": 1.4775732365533967, "learning_rate": 8.584420570990361e-06, "loss": 0.1889, "step": 1579 }, { "epoch": 0.24572317262830481, "grad_norm": 1.0459439420285532, "learning_rate": 8.582716959444679e-06, "loss": 0.1928, "step": 1580 }, { "epoch": 0.2458786936236392, "grad_norm": 1.5372117734449058, "learning_rate": 8.581012492654495e-06, "loss": 0.1877, "step": 1581 }, { "epoch": 0.24603421461897357, "grad_norm": 1.9347395817267816, "learning_rate": 8.579307171026693e-06, "loss": 0.2777, "step": 1582 }, { "epoch": 0.24618973561430793, "grad_norm": 0.9029125279631515, "learning_rate": 8.577600994968352e-06, "loss": 0.1297, "step": 1583 }, { "epoch": 0.2463452566096423, "grad_norm": 0.8355029037365392, "learning_rate": 8.575893964886763e-06, "loss": 0.2099, "step": 1584 }, { "epoch": 0.24650077760497668, "grad_norm": 1.6899413873191795, "learning_rate": 8.574186081189416e-06, "loss": 0.2022, "step": 1585 }, { "epoch": 0.24665629860031105, "grad_norm": 1.087509710593699, "learning_rate": 8.572477344284009e-06, "loss": 0.1751, "step": 1586 }, { "epoch": 0.2468118195956454, "grad_norm": 1.0292806428751466, "learning_rate": 8.570767754578438e-06, "loss": 0.1593, "step": 1587 }, { "epoch": 0.24696734059097977, "grad_norm": 1.188609591991913, "learning_rate": 8.56905731248081e-06, "loss": 0.1491, "step": 1588 }, { "epoch": 0.24712286158631416, "grad_norm": 1.2300883239133906, "learning_rate": 8.567346018399427e-06, "loss": 0.165, "step": 1589 }, { "epoch": 0.24727838258164853, "grad_norm": 1.2064414577216789, "learning_rate": 8.565633872742803e-06, "loss": 0.2524, "step": 1590 }, { "epoch": 0.2474339035769829, "grad_norm": 0.8406003864640567, "learning_rate": 8.56392087591965e-06, "loss": 0.1658, "step": 1591 }, { "epoch": 0.24758942457231725, "grad_norm": 2.634699334807654, "learning_rate": 8.56220702833888e-06, "loss": 0.1692, "step": 1592 }, { "epoch": 0.24774494556765164, "grad_norm": 0.9815581638651881, "learning_rate": 8.560492330409618e-06, "loss": 0.1678, "step": 1593 }, { "epoch": 0.247900466562986, "grad_norm": 1.3909573488426212, "learning_rate": 8.558776782541183e-06, "loss": 0.2397, "step": 1594 }, { "epoch": 0.24805598755832037, "grad_norm": 1.2613818557792364, "learning_rate": 8.557060385143102e-06, "loss": 0.2273, "step": 1595 }, { "epoch": 0.24821150855365473, "grad_norm": 0.9777010646149178, "learning_rate": 8.5553431386251e-06, "loss": 0.1713, "step": 1596 }, { "epoch": 0.24836702954898912, "grad_norm": 1.2012423072130696, "learning_rate": 8.553625043397112e-06, "loss": 0.2192, "step": 1597 }, { "epoch": 0.24852255054432348, "grad_norm": 1.0747389022970961, "learning_rate": 8.551906099869269e-06, "loss": 0.1555, "step": 1598 }, { "epoch": 0.24867807153965785, "grad_norm": 0.9987345212261577, "learning_rate": 8.550186308451906e-06, "loss": 0.2117, "step": 1599 }, { "epoch": 0.24883359253499224, "grad_norm": 1.1743809541983374, "learning_rate": 8.548465669555564e-06, "loss": 0.1547, "step": 1600 }, { "epoch": 0.24883359253499224, "eval_loss": 0.2037108987569809, "eval_runtime": 9.4238, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 1600 }, { "epoch": 0.2489891135303266, "grad_norm": 1.0755504197866683, "learning_rate": 8.546744183590979e-06, "loss": 0.1448, "step": 1601 }, { "epoch": 0.24914463452566096, "grad_norm": 1.293645268455303, "learning_rate": 8.545021850969097e-06, "loss": 0.2045, "step": 1602 }, { "epoch": 0.24930015552099533, "grad_norm": 1.644496498579518, "learning_rate": 8.543298672101063e-06, "loss": 0.1745, "step": 1603 }, { "epoch": 0.24945567651632972, "grad_norm": 1.8853737644375217, "learning_rate": 8.541574647398224e-06, "loss": 0.1785, "step": 1604 }, { "epoch": 0.24961119751166408, "grad_norm": 0.8348472318309339, "learning_rate": 8.539849777272125e-06, "loss": 0.1976, "step": 1605 }, { "epoch": 0.24976671850699844, "grad_norm": 1.6007239985640846, "learning_rate": 8.538124062134521e-06, "loss": 0.1766, "step": 1606 }, { "epoch": 0.2499222395023328, "grad_norm": 2.1944156006209194, "learning_rate": 8.53639750239736e-06, "loss": 0.2715, "step": 1607 }, { "epoch": 0.25007776049766717, "grad_norm": 1.105749977206952, "learning_rate": 8.534670098472802e-06, "loss": 0.1564, "step": 1608 }, { "epoch": 0.25023328149300156, "grad_norm": 0.8083237797522677, "learning_rate": 8.532941850773195e-06, "loss": 0.1668, "step": 1609 }, { "epoch": 0.25038880248833595, "grad_norm": 1.172486307255137, "learning_rate": 8.531212759711103e-06, "loss": 0.2302, "step": 1610 }, { "epoch": 0.2505443234836703, "grad_norm": 1.268322758173216, "learning_rate": 8.52948282569928e-06, "loss": 0.1789, "step": 1611 }, { "epoch": 0.2506998444790047, "grad_norm": 0.9091823227567202, "learning_rate": 8.527752049150685e-06, "loss": 0.0784, "step": 1612 }, { "epoch": 0.250855365474339, "grad_norm": 1.3902158634610304, "learning_rate": 8.52602043047848e-06, "loss": 0.1681, "step": 1613 }, { "epoch": 0.2510108864696734, "grad_norm": 1.4942303280111533, "learning_rate": 8.524287970096026e-06, "loss": 0.217, "step": 1614 }, { "epoch": 0.2511664074650078, "grad_norm": 0.8627158934582907, "learning_rate": 8.522554668416887e-06, "loss": 0.2181, "step": 1615 }, { "epoch": 0.2513219284603421, "grad_norm": 1.0390290867530942, "learning_rate": 8.520820525854824e-06, "loss": 0.1764, "step": 1616 }, { "epoch": 0.2514774494556765, "grad_norm": 1.4108685539031005, "learning_rate": 8.519085542823802e-06, "loss": 0.2164, "step": 1617 }, { "epoch": 0.2516329704510109, "grad_norm": 1.371077345528009, "learning_rate": 8.517349719737984e-06, "loss": 0.1561, "step": 1618 }, { "epoch": 0.25178849144634524, "grad_norm": 1.2763042021188964, "learning_rate": 8.51561305701174e-06, "loss": 0.1526, "step": 1619 }, { "epoch": 0.25194401244167963, "grad_norm": 1.077695325158449, "learning_rate": 8.51387555505963e-06, "loss": 0.1876, "step": 1620 }, { "epoch": 0.252099533437014, "grad_norm": 1.3164226998591637, "learning_rate": 8.512137214296422e-06, "loss": 0.2131, "step": 1621 }, { "epoch": 0.25225505443234836, "grad_norm": 1.7522341912294899, "learning_rate": 8.510398035137083e-06, "loss": 0.133, "step": 1622 }, { "epoch": 0.25241057542768275, "grad_norm": 4.615604310333582, "learning_rate": 8.50865801799678e-06, "loss": 0.1955, "step": 1623 }, { "epoch": 0.2525660964230171, "grad_norm": 2.3506074867763536, "learning_rate": 8.506917163290877e-06, "loss": 0.3199, "step": 1624 }, { "epoch": 0.2527216174183515, "grad_norm": 0.7483739763165084, "learning_rate": 8.505175471434943e-06, "loss": 0.2213, "step": 1625 }, { "epoch": 0.25287713841368586, "grad_norm": 2.0095572169442333, "learning_rate": 8.50343294284474e-06, "loss": 0.2356, "step": 1626 }, { "epoch": 0.2530326594090202, "grad_norm": 0.9367298995041891, "learning_rate": 8.501689577936238e-06, "loss": 0.1567, "step": 1627 }, { "epoch": 0.2531881804043546, "grad_norm": 1.2746896918156698, "learning_rate": 8.499945377125602e-06, "loss": 0.1465, "step": 1628 }, { "epoch": 0.253343701399689, "grad_norm": 0.7971645300115215, "learning_rate": 8.498200340829195e-06, "loss": 0.1419, "step": 1629 }, { "epoch": 0.2534992223950233, "grad_norm": 1.7131432725110083, "learning_rate": 8.496454469463583e-06, "loss": 0.1437, "step": 1630 }, { "epoch": 0.2536547433903577, "grad_norm": 1.3945635968284718, "learning_rate": 8.494707763445526e-06, "loss": 0.2116, "step": 1631 }, { "epoch": 0.25381026438569204, "grad_norm": 1.130700720901677, "learning_rate": 8.492960223191994e-06, "loss": 0.1783, "step": 1632 }, { "epoch": 0.25396578538102643, "grad_norm": 0.9910207975897489, "learning_rate": 8.491211849120146e-06, "loss": 0.1275, "step": 1633 }, { "epoch": 0.2541213063763608, "grad_norm": 1.6819299813099522, "learning_rate": 8.48946264164734e-06, "loss": 0.2092, "step": 1634 }, { "epoch": 0.25427682737169516, "grad_norm": 0.8070165110990363, "learning_rate": 8.487712601191143e-06, "loss": 0.2104, "step": 1635 }, { "epoch": 0.25443234836702955, "grad_norm": 0.7832453865024183, "learning_rate": 8.485961728169308e-06, "loss": 0.1491, "step": 1636 }, { "epoch": 0.25458786936236394, "grad_norm": 1.570863259158348, "learning_rate": 8.484210022999795e-06, "loss": 0.1337, "step": 1637 }, { "epoch": 0.2547433903576983, "grad_norm": 2.094162070797788, "learning_rate": 8.482457486100761e-06, "loss": 0.1732, "step": 1638 }, { "epoch": 0.25489891135303266, "grad_norm": 1.3293274255208316, "learning_rate": 8.48070411789056e-06, "loss": 0.1587, "step": 1639 }, { "epoch": 0.25505443234836706, "grad_norm": 0.9704592907631973, "learning_rate": 8.478949918787746e-06, "loss": 0.167, "step": 1640 }, { "epoch": 0.2552099533437014, "grad_norm": 2.2927511192581935, "learning_rate": 8.47719488921107e-06, "loss": 0.1731, "step": 1641 }, { "epoch": 0.2553654743390358, "grad_norm": 1.2113969832398468, "learning_rate": 8.475439029579487e-06, "loss": 0.1636, "step": 1642 }, { "epoch": 0.2555209953343701, "grad_norm": 1.2700840486141427, "learning_rate": 8.473682340312136e-06, "loss": 0.2251, "step": 1643 }, { "epoch": 0.2556765163297045, "grad_norm": 0.8692629936958125, "learning_rate": 8.47192482182837e-06, "loss": 0.1944, "step": 1644 }, { "epoch": 0.2558320373250389, "grad_norm": 1.0546031026829716, "learning_rate": 8.470166474547731e-06, "loss": 0.1963, "step": 1645 }, { "epoch": 0.25598755832037323, "grad_norm": 1.8035421603246344, "learning_rate": 8.468407298889962e-06, "loss": 0.1678, "step": 1646 }, { "epoch": 0.2561430793157076, "grad_norm": 0.8593243264529278, "learning_rate": 8.466647295275002e-06, "loss": 0.1272, "step": 1647 }, { "epoch": 0.256298600311042, "grad_norm": 1.5174530612382813, "learning_rate": 8.464886464122988e-06, "loss": 0.2685, "step": 1648 }, { "epoch": 0.25645412130637635, "grad_norm": 1.5250972376290421, "learning_rate": 8.463124805854257e-06, "loss": 0.1674, "step": 1649 }, { "epoch": 0.25660964230171074, "grad_norm": 1.1663575092987046, "learning_rate": 8.461362320889338e-06, "loss": 0.1577, "step": 1650 }, { "epoch": 0.2567651632970451, "grad_norm": 1.474673013106268, "learning_rate": 8.459599009648964e-06, "loss": 0.1769, "step": 1651 }, { "epoch": 0.25692068429237946, "grad_norm": 1.1672631965692757, "learning_rate": 8.45783487255406e-06, "loss": 0.2249, "step": 1652 }, { "epoch": 0.25707620528771385, "grad_norm": 1.1953181883355133, "learning_rate": 8.456069910025751e-06, "loss": 0.2018, "step": 1653 }, { "epoch": 0.2572317262830482, "grad_norm": 1.1089828464331577, "learning_rate": 8.454304122485358e-06, "loss": 0.1419, "step": 1654 }, { "epoch": 0.2573872472783826, "grad_norm": 1.2716710060074294, "learning_rate": 8.452537510354397e-06, "loss": 0.1966, "step": 1655 }, { "epoch": 0.25754276827371697, "grad_norm": 1.952579937166782, "learning_rate": 8.450770074054586e-06, "loss": 0.2699, "step": 1656 }, { "epoch": 0.2576982892690513, "grad_norm": 0.7319931402583304, "learning_rate": 8.449001814007838e-06, "loss": 0.1401, "step": 1657 }, { "epoch": 0.2578538102643857, "grad_norm": 1.627013708512288, "learning_rate": 8.447232730636257e-06, "loss": 0.2617, "step": 1658 }, { "epoch": 0.2580093312597201, "grad_norm": 1.0492953509552387, "learning_rate": 8.44546282436215e-06, "loss": 0.1922, "step": 1659 }, { "epoch": 0.2581648522550544, "grad_norm": 0.9166534435780459, "learning_rate": 8.443692095608019e-06, "loss": 0.2099, "step": 1660 }, { "epoch": 0.2583203732503888, "grad_norm": 1.1458120209760718, "learning_rate": 8.441920544796558e-06, "loss": 0.1724, "step": 1661 }, { "epoch": 0.25847589424572315, "grad_norm": 1.071395804244241, "learning_rate": 8.440148172350666e-06, "loss": 0.1728, "step": 1662 }, { "epoch": 0.25863141524105754, "grad_norm": 1.2413704662622753, "learning_rate": 8.43837497869343e-06, "loss": 0.2031, "step": 1663 }, { "epoch": 0.25878693623639193, "grad_norm": 1.1068242296182698, "learning_rate": 8.436600964248138e-06, "loss": 0.1951, "step": 1664 }, { "epoch": 0.25894245723172626, "grad_norm": 0.8699381605693407, "learning_rate": 8.43482612943827e-06, "loss": 0.1764, "step": 1665 }, { "epoch": 0.25909797822706065, "grad_norm": 1.2048052321069596, "learning_rate": 8.433050474687505e-06, "loss": 0.2311, "step": 1666 }, { "epoch": 0.25925349922239505, "grad_norm": 1.315498269766704, "learning_rate": 8.431274000419716e-06, "loss": 0.2412, "step": 1667 }, { "epoch": 0.2594090202177294, "grad_norm": 0.6128855898398873, "learning_rate": 8.42949670705897e-06, "loss": 0.1068, "step": 1668 }, { "epoch": 0.25956454121306377, "grad_norm": 0.9552988172621262, "learning_rate": 8.427718595029537e-06, "loss": 0.1458, "step": 1669 }, { "epoch": 0.25972006220839816, "grad_norm": 1.411892967173632, "learning_rate": 8.425939664755874e-06, "loss": 0.2327, "step": 1670 }, { "epoch": 0.2598755832037325, "grad_norm": 1.066036249369497, "learning_rate": 8.424159916662636e-06, "loss": 0.1845, "step": 1671 }, { "epoch": 0.2600311041990669, "grad_norm": 1.0078601069914832, "learning_rate": 8.422379351174673e-06, "loss": 0.129, "step": 1672 }, { "epoch": 0.2601866251944012, "grad_norm": 0.9627418389301211, "learning_rate": 8.420597968717033e-06, "loss": 0.2346, "step": 1673 }, { "epoch": 0.2603421461897356, "grad_norm": 1.0190302705099263, "learning_rate": 8.418815769714956e-06, "loss": 0.1291, "step": 1674 }, { "epoch": 0.26049766718507, "grad_norm": 0.8536213147159897, "learning_rate": 8.417032754593879e-06, "loss": 0.1759, "step": 1675 }, { "epoch": 0.26065318818040434, "grad_norm": 0.9477728405361937, "learning_rate": 8.415248923779431e-06, "loss": 0.1708, "step": 1676 }, { "epoch": 0.26080870917573873, "grad_norm": 1.0305276755799404, "learning_rate": 8.413464277697436e-06, "loss": 0.3205, "step": 1677 }, { "epoch": 0.2609642301710731, "grad_norm": 1.324545893865915, "learning_rate": 8.411678816773916e-06, "loss": 0.2936, "step": 1678 }, { "epoch": 0.26111975116640745, "grad_norm": 1.3383489149505705, "learning_rate": 8.409892541435085e-06, "loss": 0.2406, "step": 1679 }, { "epoch": 0.26127527216174184, "grad_norm": 0.9651270377598534, "learning_rate": 8.408105452107353e-06, "loss": 0.1511, "step": 1680 }, { "epoch": 0.2614307931570762, "grad_norm": 0.6783781205233194, "learning_rate": 8.40631754921732e-06, "loss": 0.1567, "step": 1681 }, { "epoch": 0.26158631415241057, "grad_norm": 1.198981860000486, "learning_rate": 8.404528833191786e-06, "loss": 0.2125, "step": 1682 }, { "epoch": 0.26174183514774496, "grad_norm": 0.7449630196962097, "learning_rate": 8.402739304457743e-06, "loss": 0.179, "step": 1683 }, { "epoch": 0.2618973561430793, "grad_norm": 1.3499907032342544, "learning_rate": 8.400948963442373e-06, "loss": 0.1492, "step": 1684 }, { "epoch": 0.2620528771384137, "grad_norm": 1.2324653573954145, "learning_rate": 8.39915781057306e-06, "loss": 0.1442, "step": 1685 }, { "epoch": 0.2622083981337481, "grad_norm": 1.5240761421711815, "learning_rate": 8.397365846277371e-06, "loss": 0.3141, "step": 1686 }, { "epoch": 0.2623639191290824, "grad_norm": 0.9242701212113029, "learning_rate": 8.39557307098308e-06, "loss": 0.175, "step": 1687 }, { "epoch": 0.2625194401244168, "grad_norm": 1.0215723172112428, "learning_rate": 8.393779485118142e-06, "loss": 0.1572, "step": 1688 }, { "epoch": 0.2626749611197512, "grad_norm": 1.4272441271545482, "learning_rate": 8.391985089110715e-06, "loss": 0.2086, "step": 1689 }, { "epoch": 0.26283048211508553, "grad_norm": 0.98493015131112, "learning_rate": 8.390189883389143e-06, "loss": 0.1758, "step": 1690 }, { "epoch": 0.2629860031104199, "grad_norm": 1.412962012368002, "learning_rate": 8.388393868381967e-06, "loss": 0.137, "step": 1691 }, { "epoch": 0.26314152410575425, "grad_norm": 0.8439849086089997, "learning_rate": 8.386597044517923e-06, "loss": 0.1794, "step": 1692 }, { "epoch": 0.26329704510108864, "grad_norm": 0.9027272166442722, "learning_rate": 8.384799412225936e-06, "loss": 0.1827, "step": 1693 }, { "epoch": 0.26345256609642304, "grad_norm": 1.0861962602589315, "learning_rate": 8.383000971935129e-06, "loss": 0.1736, "step": 1694 }, { "epoch": 0.26360808709175737, "grad_norm": 1.4467531133479765, "learning_rate": 8.38120172407481e-06, "loss": 0.2872, "step": 1695 }, { "epoch": 0.26376360808709176, "grad_norm": 0.7243899321635017, "learning_rate": 8.379401669074489e-06, "loss": 0.1568, "step": 1696 }, { "epoch": 0.26391912908242615, "grad_norm": 0.8947544881090379, "learning_rate": 8.37760080736386e-06, "loss": 0.1516, "step": 1697 }, { "epoch": 0.2640746500777605, "grad_norm": 1.1759725115418023, "learning_rate": 8.375799139372818e-06, "loss": 0.1384, "step": 1698 }, { "epoch": 0.2642301710730949, "grad_norm": 0.8519187195565056, "learning_rate": 8.373996665531443e-06, "loss": 0.2027, "step": 1699 }, { "epoch": 0.2643856920684292, "grad_norm": 1.4756118825078526, "learning_rate": 8.37219338627001e-06, "loss": 0.2323, "step": 1700 }, { "epoch": 0.2643856920684292, "eval_loss": 0.19943906366825104, "eval_runtime": 9.4244, "eval_samples_per_second": 2.759, "eval_steps_per_second": 0.743, "step": 1700 }, { "epoch": 0.2645412130637636, "grad_norm": 1.1415194682343677, "learning_rate": 8.370389302018993e-06, "loss": 0.1627, "step": 1701 }, { "epoch": 0.264696734059098, "grad_norm": 0.9887030475180681, "learning_rate": 8.368584413209044e-06, "loss": 0.1913, "step": 1702 }, { "epoch": 0.26485225505443233, "grad_norm": 1.579433234849522, "learning_rate": 8.366778720271022e-06, "loss": 0.2494, "step": 1703 }, { "epoch": 0.2650077760497667, "grad_norm": 1.1581416599961576, "learning_rate": 8.364972223635967e-06, "loss": 0.1984, "step": 1704 }, { "epoch": 0.2651632970451011, "grad_norm": 1.4481396852315895, "learning_rate": 8.363164923735116e-06, "loss": 0.1772, "step": 1705 }, { "epoch": 0.26531881804043544, "grad_norm": 2.2248131911902918, "learning_rate": 8.361356820999897e-06, "loss": 0.2035, "step": 1706 }, { "epoch": 0.26547433903576984, "grad_norm": 1.296906679431483, "learning_rate": 8.359547915861927e-06, "loss": 0.1906, "step": 1707 }, { "epoch": 0.2656298600311042, "grad_norm": 1.4510599043288837, "learning_rate": 8.357738208753022e-06, "loss": 0.215, "step": 1708 }, { "epoch": 0.26578538102643856, "grad_norm": 1.3812180344156422, "learning_rate": 8.35592770010518e-06, "loss": 0.2366, "step": 1709 }, { "epoch": 0.26594090202177295, "grad_norm": 0.7624028953564842, "learning_rate": 8.354116390350594e-06, "loss": 0.1337, "step": 1710 }, { "epoch": 0.2660964230171073, "grad_norm": 1.0938571817018024, "learning_rate": 8.352304279921655e-06, "loss": 0.1739, "step": 1711 }, { "epoch": 0.2662519440124417, "grad_norm": 1.3112579396126312, "learning_rate": 8.350491369250933e-06, "loss": 0.2866, "step": 1712 }, { "epoch": 0.26640746500777607, "grad_norm": 1.4175431035953647, "learning_rate": 8.348677658771197e-06, "loss": 0.1308, "step": 1713 }, { "epoch": 0.2665629860031104, "grad_norm": 2.1014926949253327, "learning_rate": 8.346863148915402e-06, "loss": 0.1549, "step": 1714 }, { "epoch": 0.2667185069984448, "grad_norm": 1.132911689146343, "learning_rate": 8.345047840116704e-06, "loss": 0.2182, "step": 1715 }, { "epoch": 0.2668740279937792, "grad_norm": 0.6535130581015213, "learning_rate": 8.343231732808435e-06, "loss": 0.1748, "step": 1716 }, { "epoch": 0.2670295489891135, "grad_norm": 0.9808104365320156, "learning_rate": 8.34141482742413e-06, "loss": 0.1512, "step": 1717 }, { "epoch": 0.2671850699844479, "grad_norm": 1.2630125658621263, "learning_rate": 8.339597124397509e-06, "loss": 0.1698, "step": 1718 }, { "epoch": 0.26734059097978224, "grad_norm": 1.279259047820582, "learning_rate": 8.33777862416248e-06, "loss": 0.1769, "step": 1719 }, { "epoch": 0.26749611197511663, "grad_norm": 1.1242790219258612, "learning_rate": 8.335959327153148e-06, "loss": 0.2224, "step": 1720 }, { "epoch": 0.267651632970451, "grad_norm": 1.0035835372337707, "learning_rate": 8.334139233803801e-06, "loss": 0.1697, "step": 1721 }, { "epoch": 0.26780715396578536, "grad_norm": 1.9776796243145607, "learning_rate": 8.332318344548926e-06, "loss": 0.2033, "step": 1722 }, { "epoch": 0.26796267496111975, "grad_norm": 1.1521258085682824, "learning_rate": 8.330496659823189e-06, "loss": 0.1729, "step": 1723 }, { "epoch": 0.26811819595645414, "grad_norm": 1.0253842887133877, "learning_rate": 8.328674180061453e-06, "loss": 0.2185, "step": 1724 }, { "epoch": 0.2682737169517885, "grad_norm": 0.871091469827773, "learning_rate": 8.326850905698774e-06, "loss": 0.1359, "step": 1725 }, { "epoch": 0.26842923794712287, "grad_norm": 1.7009594103702224, "learning_rate": 8.325026837170386e-06, "loss": 0.2348, "step": 1726 }, { "epoch": 0.26858475894245726, "grad_norm": 1.367926551681483, "learning_rate": 8.323201974911723e-06, "loss": 0.1842, "step": 1727 }, { "epoch": 0.2687402799377916, "grad_norm": 1.148927442910907, "learning_rate": 8.321376319358407e-06, "loss": 0.1096, "step": 1728 }, { "epoch": 0.268895800933126, "grad_norm": 1.3075658909675654, "learning_rate": 8.319549870946244e-06, "loss": 0.1543, "step": 1729 }, { "epoch": 0.2690513219284603, "grad_norm": 0.8291270774545968, "learning_rate": 8.317722630111233e-06, "loss": 0.1093, "step": 1730 }, { "epoch": 0.2692068429237947, "grad_norm": 2.2622896049282706, "learning_rate": 8.315894597289565e-06, "loss": 0.2042, "step": 1731 }, { "epoch": 0.2693623639191291, "grad_norm": 0.7046996138148661, "learning_rate": 8.314065772917612e-06, "loss": 0.1303, "step": 1732 }, { "epoch": 0.26951788491446343, "grad_norm": 0.9333196367153322, "learning_rate": 8.312236157431946e-06, "loss": 0.169, "step": 1733 }, { "epoch": 0.2696734059097978, "grad_norm": 1.1869718333049797, "learning_rate": 8.310405751269318e-06, "loss": 0.2494, "step": 1734 }, { "epoch": 0.2698289269051322, "grad_norm": 0.9186255111712875, "learning_rate": 8.30857455486667e-06, "loss": 0.1449, "step": 1735 }, { "epoch": 0.26998444790046655, "grad_norm": 1.7158457711756847, "learning_rate": 8.306742568661137e-06, "loss": 0.2472, "step": 1736 }, { "epoch": 0.27013996889580094, "grad_norm": 0.9091734067747751, "learning_rate": 8.304909793090039e-06, "loss": 0.1517, "step": 1737 }, { "epoch": 0.2702954898911353, "grad_norm": 0.9472038650945157, "learning_rate": 8.303076228590885e-06, "loss": 0.1293, "step": 1738 }, { "epoch": 0.27045101088646967, "grad_norm": 1.359961162735269, "learning_rate": 8.301241875601371e-06, "loss": 0.1687, "step": 1739 }, { "epoch": 0.27060653188180406, "grad_norm": 1.3706412614563859, "learning_rate": 8.299406734559385e-06, "loss": 0.1151, "step": 1740 }, { "epoch": 0.2707620528771384, "grad_norm": 1.4633698039358347, "learning_rate": 8.297570805903e-06, "loss": 0.1834, "step": 1741 }, { "epoch": 0.2709175738724728, "grad_norm": 1.2706325476878815, "learning_rate": 8.295734090070477e-06, "loss": 0.1889, "step": 1742 }, { "epoch": 0.2710730948678072, "grad_norm": 1.40063937560449, "learning_rate": 8.293896587500266e-06, "loss": 0.1644, "step": 1743 }, { "epoch": 0.2712286158631415, "grad_norm": 1.756399176307069, "learning_rate": 8.292058298631003e-06, "loss": 0.2121, "step": 1744 }, { "epoch": 0.2713841368584759, "grad_norm": 1.3118943702099763, "learning_rate": 8.290219223901517e-06, "loss": 0.1657, "step": 1745 }, { "epoch": 0.2715396578538103, "grad_norm": 1.221070247479925, "learning_rate": 8.288379363750818e-06, "loss": 0.1799, "step": 1746 }, { "epoch": 0.2716951788491446, "grad_norm": 1.30049039400021, "learning_rate": 8.286538718618107e-06, "loss": 0.1659, "step": 1747 }, { "epoch": 0.271850699844479, "grad_norm": 0.8218052779463395, "learning_rate": 8.28469728894277e-06, "loss": 0.1417, "step": 1748 }, { "epoch": 0.27200622083981335, "grad_norm": 1.318881683721639, "learning_rate": 8.282855075164386e-06, "loss": 0.2086, "step": 1749 }, { "epoch": 0.27216174183514774, "grad_norm": 1.168225071909074, "learning_rate": 8.281012077722712e-06, "loss": 0.1481, "step": 1750 }, { "epoch": 0.27231726283048213, "grad_norm": 1.387527553498744, "learning_rate": 8.2791682970577e-06, "loss": 0.224, "step": 1751 }, { "epoch": 0.27247278382581647, "grad_norm": 0.9455523699522945, "learning_rate": 8.277323733609488e-06, "loss": 0.1689, "step": 1752 }, { "epoch": 0.27262830482115086, "grad_norm": 1.301993231412919, "learning_rate": 8.275478387818394e-06, "loss": 0.17, "step": 1753 }, { "epoch": 0.27278382581648525, "grad_norm": 1.1753804485169133, "learning_rate": 8.273632260124934e-06, "loss": 0.2231, "step": 1754 }, { "epoch": 0.2729393468118196, "grad_norm": 1.080698611275427, "learning_rate": 8.271785350969799e-06, "loss": 0.1796, "step": 1755 }, { "epoch": 0.273094867807154, "grad_norm": 1.290015540604507, "learning_rate": 8.269937660793875e-06, "loss": 0.1941, "step": 1756 }, { "epoch": 0.2732503888024883, "grad_norm": 1.070538218943679, "learning_rate": 8.268089190038228e-06, "loss": 0.1909, "step": 1757 }, { "epoch": 0.2734059097978227, "grad_norm": 1.2252798699112468, "learning_rate": 8.266239939144118e-06, "loss": 0.1569, "step": 1758 }, { "epoch": 0.2735614307931571, "grad_norm": 1.2346475130597931, "learning_rate": 8.264389908552987e-06, "loss": 0.1881, "step": 1759 }, { "epoch": 0.2737169517884914, "grad_norm": 0.8909529676508143, "learning_rate": 8.26253909870646e-06, "loss": 0.1635, "step": 1760 }, { "epoch": 0.2738724727838258, "grad_norm": 1.3801819199807877, "learning_rate": 8.260687510046352e-06, "loss": 0.1957, "step": 1761 }, { "epoch": 0.2740279937791602, "grad_norm": 0.9098604615543268, "learning_rate": 8.258835143014663e-06, "loss": 0.1556, "step": 1762 }, { "epoch": 0.27418351477449454, "grad_norm": 1.479953181946323, "learning_rate": 8.25698199805358e-06, "loss": 0.1673, "step": 1763 }, { "epoch": 0.27433903576982893, "grad_norm": 1.0391961011580078, "learning_rate": 8.255128075605475e-06, "loss": 0.1678, "step": 1764 }, { "epoch": 0.2744945567651633, "grad_norm": 1.1674213515628957, "learning_rate": 8.253273376112902e-06, "loss": 0.1575, "step": 1765 }, { "epoch": 0.27465007776049766, "grad_norm": 0.776827674790433, "learning_rate": 8.251417900018606e-06, "loss": 0.2087, "step": 1766 }, { "epoch": 0.27480559875583205, "grad_norm": 1.0737505366105782, "learning_rate": 8.249561647765515e-06, "loss": 0.202, "step": 1767 }, { "epoch": 0.2749611197511664, "grad_norm": 1.0278179070478979, "learning_rate": 8.247704619796743e-06, "loss": 0.2246, "step": 1768 }, { "epoch": 0.2751166407465008, "grad_norm": 1.3308057309065462, "learning_rate": 8.245846816555588e-06, "loss": 0.1781, "step": 1769 }, { "epoch": 0.27527216174183516, "grad_norm": 1.171891225152092, "learning_rate": 8.24398823848553e-06, "loss": 0.2838, "step": 1770 }, { "epoch": 0.2754276827371695, "grad_norm": 0.9162549134019579, "learning_rate": 8.242128886030243e-06, "loss": 0.153, "step": 1771 }, { "epoch": 0.2755832037325039, "grad_norm": 1.7094368421056838, "learning_rate": 8.240268759633576e-06, "loss": 0.1769, "step": 1772 }, { "epoch": 0.2757387247278383, "grad_norm": 1.088761334959302, "learning_rate": 8.23840785973957e-06, "loss": 0.1872, "step": 1773 }, { "epoch": 0.2758942457231726, "grad_norm": 1.0467068106039534, "learning_rate": 8.236546186792446e-06, "loss": 0.1941, "step": 1774 }, { "epoch": 0.276049766718507, "grad_norm": 1.469925204114295, "learning_rate": 8.234683741236612e-06, "loss": 0.2439, "step": 1775 }, { "epoch": 0.27620528771384134, "grad_norm": 1.286843667284798, "learning_rate": 8.23282052351666e-06, "loss": 0.1825, "step": 1776 }, { "epoch": 0.27636080870917573, "grad_norm": 1.5684518100667084, "learning_rate": 8.230956534077366e-06, "loss": 0.2088, "step": 1777 }, { "epoch": 0.2765163297045101, "grad_norm": 1.3158757876867857, "learning_rate": 8.22909177336369e-06, "loss": 0.1965, "step": 1778 }, { "epoch": 0.27667185069984446, "grad_norm": 0.7862541895693009, "learning_rate": 8.227226241820779e-06, "loss": 0.1388, "step": 1779 }, { "epoch": 0.27682737169517885, "grad_norm": 0.9288123715441376, "learning_rate": 8.225359939893954e-06, "loss": 0.243, "step": 1780 }, { "epoch": 0.27698289269051324, "grad_norm": 1.491008802108701, "learning_rate": 8.223492868028736e-06, "loss": 0.2521, "step": 1781 }, { "epoch": 0.2771384136858476, "grad_norm": 1.1202886550853388, "learning_rate": 8.221625026670814e-06, "loss": 0.1688, "step": 1782 }, { "epoch": 0.27729393468118196, "grad_norm": 1.1962734383960754, "learning_rate": 8.219756416266073e-06, "loss": 0.1294, "step": 1783 }, { "epoch": 0.27744945567651635, "grad_norm": 0.6740427840476089, "learning_rate": 8.217887037260575e-06, "loss": 0.1501, "step": 1784 }, { "epoch": 0.2776049766718507, "grad_norm": 1.8752578154372959, "learning_rate": 8.216016890100564e-06, "loss": 0.2524, "step": 1785 }, { "epoch": 0.2777604976671851, "grad_norm": 1.3276982202120067, "learning_rate": 8.214145975232474e-06, "loss": 0.1611, "step": 1786 }, { "epoch": 0.2779160186625194, "grad_norm": 0.9180331686214024, "learning_rate": 8.212274293102917e-06, "loss": 0.2069, "step": 1787 }, { "epoch": 0.2780715396578538, "grad_norm": 1.1644000920434754, "learning_rate": 8.210401844158688e-06, "loss": 0.2113, "step": 1788 }, { "epoch": 0.2782270606531882, "grad_norm": 1.6247680870264813, "learning_rate": 8.20852862884677e-06, "loss": 0.2167, "step": 1789 }, { "epoch": 0.27838258164852253, "grad_norm": 2.465352962757943, "learning_rate": 8.206654647614323e-06, "loss": 0.2917, "step": 1790 }, { "epoch": 0.2785381026438569, "grad_norm": 0.9826147561106185, "learning_rate": 8.204779900908694e-06, "loss": 0.1513, "step": 1791 }, { "epoch": 0.2786936236391913, "grad_norm": 1.1924827625995933, "learning_rate": 8.202904389177409e-06, "loss": 0.2069, "step": 1792 }, { "epoch": 0.27884914463452565, "grad_norm": 1.2507233550051102, "learning_rate": 8.201028112868182e-06, "loss": 0.1713, "step": 1793 }, { "epoch": 0.27900466562986004, "grad_norm": 1.056564405898492, "learning_rate": 8.199151072428903e-06, "loss": 0.152, "step": 1794 }, { "epoch": 0.27916018662519443, "grad_norm": 1.0582767182694146, "learning_rate": 8.19727326830765e-06, "loss": 0.1313, "step": 1795 }, { "epoch": 0.27931570762052876, "grad_norm": 0.9960646193169612, "learning_rate": 8.195394700952681e-06, "loss": 0.1663, "step": 1796 }, { "epoch": 0.27947122861586315, "grad_norm": 0.8580536351756373, "learning_rate": 8.193515370812433e-06, "loss": 0.1595, "step": 1797 }, { "epoch": 0.2796267496111975, "grad_norm": 1.0831765474333348, "learning_rate": 8.191635278335533e-06, "loss": 0.1646, "step": 1798 }, { "epoch": 0.2797822706065319, "grad_norm": 1.0292790758688968, "learning_rate": 8.189754423970783e-06, "loss": 0.1294, "step": 1799 }, { "epoch": 0.27993779160186627, "grad_norm": 0.6900206697382273, "learning_rate": 8.18787280816717e-06, "loss": 0.1962, "step": 1800 }, { "epoch": 0.27993779160186627, "eval_loss": 0.1942623406648636, "eval_runtime": 9.4402, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.742, "step": 1800 }, { "epoch": 0.2800933125972006, "grad_norm": 1.225359903420926, "learning_rate": 8.18599043137386e-06, "loss": 0.1613, "step": 1801 }, { "epoch": 0.280248833592535, "grad_norm": 1.6844618005986576, "learning_rate": 8.184107294040204e-06, "loss": 0.2253, "step": 1802 }, { "epoch": 0.2804043545878694, "grad_norm": 1.0175001190789204, "learning_rate": 8.182223396615733e-06, "loss": 0.1912, "step": 1803 }, { "epoch": 0.2805598755832037, "grad_norm": 1.050408004024866, "learning_rate": 8.18033873955016e-06, "loss": 0.2061, "step": 1804 }, { "epoch": 0.2807153965785381, "grad_norm": 1.4763046668239692, "learning_rate": 8.178453323293378e-06, "loss": 0.2781, "step": 1805 }, { "epoch": 0.28087091757387245, "grad_norm": 0.8219546561822222, "learning_rate": 8.176567148295462e-06, "loss": 0.2129, "step": 1806 }, { "epoch": 0.28102643856920684, "grad_norm": 0.9534941567105831, "learning_rate": 8.174680215006671e-06, "loss": 0.1653, "step": 1807 }, { "epoch": 0.28118195956454123, "grad_norm": 1.0531235123680651, "learning_rate": 8.172792523877439e-06, "loss": 0.1384, "step": 1808 }, { "epoch": 0.28133748055987556, "grad_norm": 1.3227244850484494, "learning_rate": 8.170904075358386e-06, "loss": 0.1878, "step": 1809 }, { "epoch": 0.28149300155520995, "grad_norm": 0.8199812475506189, "learning_rate": 8.169014869900308e-06, "loss": 0.1583, "step": 1810 }, { "epoch": 0.28164852255054434, "grad_norm": 1.1873233496157647, "learning_rate": 8.167124907954188e-06, "loss": 0.1689, "step": 1811 }, { "epoch": 0.2818040435458787, "grad_norm": 1.332689389458692, "learning_rate": 8.165234189971188e-06, "loss": 0.1509, "step": 1812 }, { "epoch": 0.28195956454121307, "grad_norm": 1.4288659016319332, "learning_rate": 8.163342716402645e-06, "loss": 0.1862, "step": 1813 }, { "epoch": 0.28211508553654746, "grad_norm": 1.314918590717926, "learning_rate": 8.16145048770008e-06, "loss": 0.226, "step": 1814 }, { "epoch": 0.2822706065318818, "grad_norm": 0.9155638179955898, "learning_rate": 8.159557504315197e-06, "loss": 0.1929, "step": 1815 }, { "epoch": 0.2824261275272162, "grad_norm": 1.0431139463881003, "learning_rate": 8.157663766699875e-06, "loss": 0.1443, "step": 1816 }, { "epoch": 0.2825816485225505, "grad_norm": 1.3294250069242237, "learning_rate": 8.155769275306178e-06, "loss": 0.193, "step": 1817 }, { "epoch": 0.2827371695178849, "grad_norm": 0.9943106694297035, "learning_rate": 8.153874030586343e-06, "loss": 0.1421, "step": 1818 }, { "epoch": 0.2828926905132193, "grad_norm": 1.165982265832558, "learning_rate": 8.151978032992798e-06, "loss": 0.1739, "step": 1819 }, { "epoch": 0.28304821150855364, "grad_norm": 0.7428727580266941, "learning_rate": 8.150081282978139e-06, "loss": 0.1572, "step": 1820 }, { "epoch": 0.28320373250388803, "grad_norm": 1.3026564844558632, "learning_rate": 8.14818378099515e-06, "loss": 0.1805, "step": 1821 }, { "epoch": 0.2833592534992224, "grad_norm": 1.2645554368075294, "learning_rate": 8.146285527496789e-06, "loss": 0.1798, "step": 1822 }, { "epoch": 0.28351477449455675, "grad_norm": 1.456481044065325, "learning_rate": 8.144386522936195e-06, "loss": 0.1598, "step": 1823 }, { "epoch": 0.28367029548989114, "grad_norm": 1.1300906728090474, "learning_rate": 8.142486767766688e-06, "loss": 0.1648, "step": 1824 }, { "epoch": 0.2838258164852255, "grad_norm": 1.1388001178297193, "learning_rate": 8.140586262441767e-06, "loss": 0.2733, "step": 1825 }, { "epoch": 0.28398133748055987, "grad_norm": 0.7532300919484063, "learning_rate": 8.138685007415109e-06, "loss": 0.1213, "step": 1826 }, { "epoch": 0.28413685847589426, "grad_norm": 1.0796067807349936, "learning_rate": 8.136783003140568e-06, "loss": 0.2189, "step": 1827 }, { "epoch": 0.2842923794712286, "grad_norm": 1.331438012696905, "learning_rate": 8.134880250072179e-06, "loss": 0.1804, "step": 1828 }, { "epoch": 0.284447900466563, "grad_norm": 1.2091930191659346, "learning_rate": 8.13297674866416e-06, "loss": 0.2194, "step": 1829 }, { "epoch": 0.2846034214618974, "grad_norm": 1.0049073814467957, "learning_rate": 8.131072499370897e-06, "loss": 0.1333, "step": 1830 }, { "epoch": 0.2847589424572317, "grad_norm": 1.0223717163539678, "learning_rate": 8.129167502646966e-06, "loss": 0.1988, "step": 1831 }, { "epoch": 0.2849144634525661, "grad_norm": 1.4867747212307119, "learning_rate": 8.127261758947114e-06, "loss": 0.1467, "step": 1832 }, { "epoch": 0.2850699844479005, "grad_norm": 0.8173079980321136, "learning_rate": 8.125355268726266e-06, "loss": 0.1058, "step": 1833 }, { "epoch": 0.28522550544323483, "grad_norm": 1.570586484505542, "learning_rate": 8.123448032439534e-06, "loss": 0.2065, "step": 1834 }, { "epoch": 0.2853810264385692, "grad_norm": 1.5595299992669573, "learning_rate": 8.121540050542198e-06, "loss": 0.2193, "step": 1835 }, { "epoch": 0.28553654743390355, "grad_norm": 1.007755342730857, "learning_rate": 8.119631323489722e-06, "loss": 0.1371, "step": 1836 }, { "epoch": 0.28569206842923794, "grad_norm": 1.301433540358406, "learning_rate": 8.117721851737744e-06, "loss": 0.176, "step": 1837 }, { "epoch": 0.28584758942457233, "grad_norm": 0.8910831403501445, "learning_rate": 8.115811635742079e-06, "loss": 0.1626, "step": 1838 }, { "epoch": 0.28600311041990667, "grad_norm": 0.7467945070581918, "learning_rate": 8.113900675958728e-06, "loss": 0.1821, "step": 1839 }, { "epoch": 0.28615863141524106, "grad_norm": 1.1530448700016815, "learning_rate": 8.111988972843859e-06, "loss": 0.1923, "step": 1840 }, { "epoch": 0.28631415241057545, "grad_norm": 2.088923036862537, "learning_rate": 8.110076526853824e-06, "loss": 0.1206, "step": 1841 }, { "epoch": 0.2864696734059098, "grad_norm": 1.2835755029352423, "learning_rate": 8.108163338445152e-06, "loss": 0.2546, "step": 1842 }, { "epoch": 0.2866251944012442, "grad_norm": 0.8829913186503389, "learning_rate": 8.106249408074544e-06, "loss": 0.1445, "step": 1843 }, { "epoch": 0.2867807153965785, "grad_norm": 1.7629923458811358, "learning_rate": 8.104334736198887e-06, "loss": 0.1544, "step": 1844 }, { "epoch": 0.2869362363919129, "grad_norm": 0.9657174681831697, "learning_rate": 8.102419323275234e-06, "loss": 0.2351, "step": 1845 }, { "epoch": 0.2870917573872473, "grad_norm": 1.492589192357126, "learning_rate": 8.100503169760827e-06, "loss": 0.186, "step": 1846 }, { "epoch": 0.28724727838258163, "grad_norm": 1.1233971084394738, "learning_rate": 8.098586276113073e-06, "loss": 0.1946, "step": 1847 }, { "epoch": 0.287402799377916, "grad_norm": 0.8577653456028049, "learning_rate": 8.096668642789565e-06, "loss": 0.1633, "step": 1848 }, { "epoch": 0.2875583203732504, "grad_norm": 1.1536545920544707, "learning_rate": 8.094750270248065e-06, "loss": 0.1603, "step": 1849 }, { "epoch": 0.28771384136858474, "grad_norm": 0.9838814306399297, "learning_rate": 8.09283115894652e-06, "loss": 0.1623, "step": 1850 }, { "epoch": 0.28786936236391913, "grad_norm": 0.9601384951644616, "learning_rate": 8.090911309343045e-06, "loss": 0.1252, "step": 1851 }, { "epoch": 0.2880248833592535, "grad_norm": 0.9976176427201153, "learning_rate": 8.088990721895938e-06, "loss": 0.1815, "step": 1852 }, { "epoch": 0.28818040435458786, "grad_norm": 0.7583399424827217, "learning_rate": 8.087069397063666e-06, "loss": 0.141, "step": 1853 }, { "epoch": 0.28833592534992225, "grad_norm": 1.5185081715928586, "learning_rate": 8.085147335304879e-06, "loss": 0.1887, "step": 1854 }, { "epoch": 0.2884914463452566, "grad_norm": 1.3061310770751247, "learning_rate": 8.083224537078401e-06, "loss": 0.1451, "step": 1855 }, { "epoch": 0.288646967340591, "grad_norm": 1.7351614485797129, "learning_rate": 8.081301002843226e-06, "loss": 0.1264, "step": 1856 }, { "epoch": 0.28880248833592537, "grad_norm": 0.9943204442132273, "learning_rate": 8.079376733058532e-06, "loss": 0.1743, "step": 1857 }, { "epoch": 0.2889580093312597, "grad_norm": 1.1594102001358773, "learning_rate": 8.07745172818367e-06, "loss": 0.1607, "step": 1858 }, { "epoch": 0.2891135303265941, "grad_norm": 1.6350163238448654, "learning_rate": 8.075525988678163e-06, "loss": 0.1813, "step": 1859 }, { "epoch": 0.2892690513219285, "grad_norm": 1.083878957563236, "learning_rate": 8.073599515001713e-06, "loss": 0.1194, "step": 1860 }, { "epoch": 0.2894245723172628, "grad_norm": 0.8178116527073355, "learning_rate": 8.071672307614195e-06, "loss": 0.228, "step": 1861 }, { "epoch": 0.2895800933125972, "grad_norm": 1.1118324651261078, "learning_rate": 8.069744366975664e-06, "loss": 0.197, "step": 1862 }, { "epoch": 0.28973561430793154, "grad_norm": 1.149751561349185, "learning_rate": 8.06781569354634e-06, "loss": 0.269, "step": 1863 }, { "epoch": 0.28989113530326593, "grad_norm": 1.1618468357632399, "learning_rate": 8.06588628778663e-06, "loss": 0.1846, "step": 1864 }, { "epoch": 0.2900466562986003, "grad_norm": 1.3277875865938236, "learning_rate": 8.063956150157107e-06, "loss": 0.1273, "step": 1865 }, { "epoch": 0.29020217729393466, "grad_norm": 2.436613602568436, "learning_rate": 8.062025281118524e-06, "loss": 0.2442, "step": 1866 }, { "epoch": 0.29035769828926905, "grad_norm": 1.1958552059286012, "learning_rate": 8.060093681131804e-06, "loss": 0.1874, "step": 1867 }, { "epoch": 0.29051321928460344, "grad_norm": 0.8401669076143116, "learning_rate": 8.058161350658047e-06, "loss": 0.1901, "step": 1868 }, { "epoch": 0.2906687402799378, "grad_norm": 0.9487811357677395, "learning_rate": 8.056228290158528e-06, "loss": 0.1346, "step": 1869 }, { "epoch": 0.29082426127527217, "grad_norm": 0.957901732333534, "learning_rate": 8.054294500094697e-06, "loss": 0.1411, "step": 1870 }, { "epoch": 0.29097978227060656, "grad_norm": 1.292131532239805, "learning_rate": 8.052359980928172e-06, "loss": 0.1827, "step": 1871 }, { "epoch": 0.2911353032659409, "grad_norm": 0.9675272253773427, "learning_rate": 8.050424733120757e-06, "loss": 0.1738, "step": 1872 }, { "epoch": 0.2912908242612753, "grad_norm": 1.367184606419033, "learning_rate": 8.048488757134416e-06, "loss": 0.1787, "step": 1873 }, { "epoch": 0.2914463452566096, "grad_norm": 1.2673549853684765, "learning_rate": 8.046552053431298e-06, "loss": 0.2333, "step": 1874 }, { "epoch": 0.291601866251944, "grad_norm": 1.9351495105907597, "learning_rate": 8.044614622473717e-06, "loss": 0.1987, "step": 1875 }, { "epoch": 0.2917573872472784, "grad_norm": 0.8606527150680897, "learning_rate": 8.042676464724169e-06, "loss": 0.172, "step": 1876 }, { "epoch": 0.29191290824261273, "grad_norm": 1.4901933699318817, "learning_rate": 8.040737580645316e-06, "loss": 0.1735, "step": 1877 }, { "epoch": 0.2920684292379471, "grad_norm": 1.1691647071434712, "learning_rate": 8.038797970699998e-06, "loss": 0.2316, "step": 1878 }, { "epoch": 0.2922239502332815, "grad_norm": 1.240770117738676, "learning_rate": 8.036857635351226e-06, "loss": 0.1667, "step": 1879 }, { "epoch": 0.29237947122861585, "grad_norm": 1.0351622011955766, "learning_rate": 8.034916575062188e-06, "loss": 0.1405, "step": 1880 }, { "epoch": 0.29253499222395024, "grad_norm": 0.9153491401389935, "learning_rate": 8.032974790296239e-06, "loss": 0.1726, "step": 1881 }, { "epoch": 0.2926905132192846, "grad_norm": 1.4362209764019978, "learning_rate": 8.031032281516913e-06, "loss": 0.1827, "step": 1882 }, { "epoch": 0.29284603421461897, "grad_norm": 1.183605507206791, "learning_rate": 8.029089049187909e-06, "loss": 0.1883, "step": 1883 }, { "epoch": 0.29300155520995336, "grad_norm": 1.0539400115284923, "learning_rate": 8.02714509377311e-06, "loss": 0.1208, "step": 1884 }, { "epoch": 0.2931570762052877, "grad_norm": 1.0217425114195149, "learning_rate": 8.02520041573656e-06, "loss": 0.174, "step": 1885 }, { "epoch": 0.2933125972006221, "grad_norm": 1.0405110359742253, "learning_rate": 8.023255015542482e-06, "loss": 0.249, "step": 1886 }, { "epoch": 0.2934681181959565, "grad_norm": 0.9949747841829932, "learning_rate": 8.021308893655273e-06, "loss": 0.1861, "step": 1887 }, { "epoch": 0.2936236391912908, "grad_norm": 0.9631918396707634, "learning_rate": 8.019362050539497e-06, "loss": 0.22, "step": 1888 }, { "epoch": 0.2937791601866252, "grad_norm": 1.471400212660711, "learning_rate": 8.017414486659894e-06, "loss": 0.2831, "step": 1889 }, { "epoch": 0.2939346811819596, "grad_norm": 1.6502542476240603, "learning_rate": 8.015466202481371e-06, "loss": 0.1856, "step": 1890 }, { "epoch": 0.2940902021772939, "grad_norm": 1.0678255046461738, "learning_rate": 8.013517198469017e-06, "loss": 0.2714, "step": 1891 }, { "epoch": 0.2942457231726283, "grad_norm": 1.5419672646129527, "learning_rate": 8.01156747508808e-06, "loss": 0.2432, "step": 1892 }, { "epoch": 0.29440124416796265, "grad_norm": 1.691620262630438, "learning_rate": 8.009617032803989e-06, "loss": 0.2494, "step": 1893 }, { "epoch": 0.29455676516329704, "grad_norm": 1.0149866152436102, "learning_rate": 8.007665872082343e-06, "loss": 0.1446, "step": 1894 }, { "epoch": 0.29471228615863143, "grad_norm": 1.2593397067130077, "learning_rate": 8.005713993388908e-06, "loss": 0.1813, "step": 1895 }, { "epoch": 0.29486780715396577, "grad_norm": 1.751259190433369, "learning_rate": 8.003761397189629e-06, "loss": 0.3067, "step": 1896 }, { "epoch": 0.29502332814930016, "grad_norm": 1.0592944557403567, "learning_rate": 8.001808083950615e-06, "loss": 0.1774, "step": 1897 }, { "epoch": 0.29517884914463455, "grad_norm": 0.7601316574689209, "learning_rate": 7.999854054138148e-06, "loss": 0.1986, "step": 1898 }, { "epoch": 0.2953343701399689, "grad_norm": 1.0763633141744329, "learning_rate": 7.997899308218687e-06, "loss": 0.1693, "step": 1899 }, { "epoch": 0.2954898911353033, "grad_norm": 0.848192935949934, "learning_rate": 7.995943846658852e-06, "loss": 0.1785, "step": 1900 }, { "epoch": 0.2954898911353033, "eval_loss": 0.19579939544200897, "eval_runtime": 9.4258, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.743, "step": 1900 }, { "epoch": 0.29564541213063766, "grad_norm": 1.1366949640186217, "learning_rate": 7.99398766992544e-06, "loss": 0.3427, "step": 1901 }, { "epoch": 0.295800933125972, "grad_norm": 1.3011369731626548, "learning_rate": 7.99203077848542e-06, "loss": 0.128, "step": 1902 }, { "epoch": 0.2959564541213064, "grad_norm": 1.6239083693901217, "learning_rate": 7.990073172805927e-06, "loss": 0.2033, "step": 1903 }, { "epoch": 0.2961119751166407, "grad_norm": 2.136757506768007, "learning_rate": 7.98811485335427e-06, "loss": 0.8244, "step": 1904 }, { "epoch": 0.2962674961119751, "grad_norm": 1.4156103108687226, "learning_rate": 7.986155820597927e-06, "loss": 0.2266, "step": 1905 }, { "epoch": 0.2964230171073095, "grad_norm": 1.3059948518525273, "learning_rate": 7.984196075004547e-06, "loss": 0.1772, "step": 1906 }, { "epoch": 0.29657853810264384, "grad_norm": 1.1897397554067446, "learning_rate": 7.982235617041947e-06, "loss": 0.2153, "step": 1907 }, { "epoch": 0.29673405909797823, "grad_norm": 1.8814984942898336, "learning_rate": 7.980274447178116e-06, "loss": 0.163, "step": 1908 }, { "epoch": 0.2968895800933126, "grad_norm": 0.8490191091642275, "learning_rate": 7.978312565881212e-06, "loss": 0.1929, "step": 1909 }, { "epoch": 0.29704510108864696, "grad_norm": 1.0730207253151238, "learning_rate": 7.976349973619567e-06, "loss": 0.152, "step": 1910 }, { "epoch": 0.29720062208398135, "grad_norm": 1.0988494794101311, "learning_rate": 7.974386670861676e-06, "loss": 0.1796, "step": 1911 }, { "epoch": 0.2973561430793157, "grad_norm": 0.8890702707468837, "learning_rate": 7.972422658076206e-06, "loss": 0.1658, "step": 1912 }, { "epoch": 0.2975116640746501, "grad_norm": 1.5485447290305507, "learning_rate": 7.970457935731996e-06, "loss": 0.219, "step": 1913 }, { "epoch": 0.29766718506998446, "grad_norm": 1.1870158533528972, "learning_rate": 7.968492504298053e-06, "loss": 0.1678, "step": 1914 }, { "epoch": 0.2978227060653188, "grad_norm": 0.8791513734953905, "learning_rate": 7.966526364243553e-06, "loss": 0.1379, "step": 1915 }, { "epoch": 0.2979782270606532, "grad_norm": 1.1547532699065137, "learning_rate": 7.96455951603784e-06, "loss": 0.1578, "step": 1916 }, { "epoch": 0.2981337480559876, "grad_norm": 1.2343036137247707, "learning_rate": 7.962591960150426e-06, "loss": 0.167, "step": 1917 }, { "epoch": 0.2982892690513219, "grad_norm": 1.199679900214714, "learning_rate": 7.960623697051e-06, "loss": 0.2216, "step": 1918 }, { "epoch": 0.2984447900466563, "grad_norm": 0.8701547919093023, "learning_rate": 7.958654727209406e-06, "loss": 0.1334, "step": 1919 }, { "epoch": 0.2986003110419907, "grad_norm": 1.0186941275746395, "learning_rate": 7.956685051095672e-06, "loss": 0.1992, "step": 1920 }, { "epoch": 0.29875583203732503, "grad_norm": 1.677907044209659, "learning_rate": 7.954714669179981e-06, "loss": 0.2557, "step": 1921 }, { "epoch": 0.2989113530326594, "grad_norm": 1.0741276350489937, "learning_rate": 7.952743581932696e-06, "loss": 0.2228, "step": 1922 }, { "epoch": 0.29906687402799376, "grad_norm": 1.1370483443720154, "learning_rate": 7.950771789824341e-06, "loss": 0.1822, "step": 1923 }, { "epoch": 0.29922239502332815, "grad_norm": 1.4805485099895457, "learning_rate": 7.948799293325607e-06, "loss": 0.2066, "step": 1924 }, { "epoch": 0.29937791601866254, "grad_norm": 1.0841471732459598, "learning_rate": 7.946826092907362e-06, "loss": 0.2086, "step": 1925 }, { "epoch": 0.2995334370139969, "grad_norm": 0.9923801848699839, "learning_rate": 7.944852189040633e-06, "loss": 0.1457, "step": 1926 }, { "epoch": 0.29968895800933126, "grad_norm": 1.1826489754247185, "learning_rate": 7.942877582196618e-06, "loss": 0.1335, "step": 1927 }, { "epoch": 0.29984447900466565, "grad_norm": 1.0374422374980892, "learning_rate": 7.940902272846684e-06, "loss": 0.1747, "step": 1928 }, { "epoch": 0.3, "grad_norm": 0.9355242713051211, "learning_rate": 7.938926261462366e-06, "loss": 0.2035, "step": 1929 }, { "epoch": 0.3001555209953344, "grad_norm": 1.3452996964657524, "learning_rate": 7.936949548515364e-06, "loss": 0.2284, "step": 1930 }, { "epoch": 0.3003110419906687, "grad_norm": 0.7948433007606517, "learning_rate": 7.93497213447755e-06, "loss": 0.2051, "step": 1931 }, { "epoch": 0.3004665629860031, "grad_norm": 1.130699049423352, "learning_rate": 7.932994019820956e-06, "loss": 0.174, "step": 1932 }, { "epoch": 0.3006220839813375, "grad_norm": 4.331642107991714, "learning_rate": 7.931015205017788e-06, "loss": 0.2259, "step": 1933 }, { "epoch": 0.30077760497667183, "grad_norm": 1.5306684316210843, "learning_rate": 7.929035690540414e-06, "loss": 0.1917, "step": 1934 }, { "epoch": 0.3009331259720062, "grad_norm": 0.8871970028065491, "learning_rate": 7.927055476861376e-06, "loss": 0.1765, "step": 1935 }, { "epoch": 0.3010886469673406, "grad_norm": 0.9400711133682595, "learning_rate": 7.925074564453376e-06, "loss": 0.1824, "step": 1936 }, { "epoch": 0.30124416796267495, "grad_norm": 0.9734009328190283, "learning_rate": 7.923092953789287e-06, "loss": 0.1575, "step": 1937 }, { "epoch": 0.30139968895800934, "grad_norm": 1.1309704131602631, "learning_rate": 7.921110645342144e-06, "loss": 0.2438, "step": 1938 }, { "epoch": 0.30155520995334373, "grad_norm": 1.2491112218817273, "learning_rate": 7.919127639585153e-06, "loss": 0.2252, "step": 1939 }, { "epoch": 0.30171073094867806, "grad_norm": 0.9626959898568382, "learning_rate": 7.917143936991688e-06, "loss": 0.1416, "step": 1940 }, { "epoch": 0.30186625194401245, "grad_norm": 0.933932349728071, "learning_rate": 7.915159538035284e-06, "loss": 0.1924, "step": 1941 }, { "epoch": 0.3020217729393468, "grad_norm": 1.198922066826054, "learning_rate": 7.913174443189645e-06, "loss": 0.1918, "step": 1942 }, { "epoch": 0.3021772939346812, "grad_norm": 0.711619672728743, "learning_rate": 7.911188652928639e-06, "loss": 0.1322, "step": 1943 }, { "epoch": 0.30233281493001557, "grad_norm": 0.9224372120486194, "learning_rate": 7.909202167726306e-06, "loss": 0.1775, "step": 1944 }, { "epoch": 0.3024883359253499, "grad_norm": 1.3276511094955517, "learning_rate": 7.907214988056844e-06, "loss": 0.2187, "step": 1945 }, { "epoch": 0.3026438569206843, "grad_norm": 0.8655219464600901, "learning_rate": 7.905227114394623e-06, "loss": 0.1465, "step": 1946 }, { "epoch": 0.3027993779160187, "grad_norm": 0.995295145761775, "learning_rate": 7.903238547214173e-06, "loss": 0.2004, "step": 1947 }, { "epoch": 0.302954898911353, "grad_norm": 1.1948454763354273, "learning_rate": 7.901249286990196e-06, "loss": 0.1755, "step": 1948 }, { "epoch": 0.3031104199066874, "grad_norm": 0.8682961110627464, "learning_rate": 7.899259334197554e-06, "loss": 0.1999, "step": 1949 }, { "epoch": 0.30326594090202175, "grad_norm": 1.0906703142458485, "learning_rate": 7.897268689311278e-06, "loss": 0.1014, "step": 1950 }, { "epoch": 0.30342146189735614, "grad_norm": 1.2664526681944839, "learning_rate": 7.895277352806562e-06, "loss": 0.2251, "step": 1951 }, { "epoch": 0.30357698289269053, "grad_norm": 0.9627019771781781, "learning_rate": 7.893285325158766e-06, "loss": 0.1591, "step": 1952 }, { "epoch": 0.30373250388802486, "grad_norm": 1.9216322578695895, "learning_rate": 7.891292606843414e-06, "loss": 0.2066, "step": 1953 }, { "epoch": 0.30388802488335925, "grad_norm": 0.9086586156841527, "learning_rate": 7.889299198336197e-06, "loss": 0.2196, "step": 1954 }, { "epoch": 0.30404354587869364, "grad_norm": 1.4203649142405548, "learning_rate": 7.887305100112967e-06, "loss": 0.1804, "step": 1955 }, { "epoch": 0.304199066874028, "grad_norm": 1.2381428600296667, "learning_rate": 7.885310312649747e-06, "loss": 0.1434, "step": 1956 }, { "epoch": 0.30435458786936237, "grad_norm": 0.7952770821447226, "learning_rate": 7.883314836422717e-06, "loss": 0.1955, "step": 1957 }, { "epoch": 0.30451010886469676, "grad_norm": 0.998488522800322, "learning_rate": 7.881318671908228e-06, "loss": 0.2239, "step": 1958 }, { "epoch": 0.3046656298600311, "grad_norm": 1.0829580403987296, "learning_rate": 7.879321819582788e-06, "loss": 0.2401, "step": 1959 }, { "epoch": 0.3048211508553655, "grad_norm": 1.043363355464928, "learning_rate": 7.877324279923078e-06, "loss": 0.1821, "step": 1960 }, { "epoch": 0.3049766718506998, "grad_norm": 1.7533270649933215, "learning_rate": 7.875326053405936e-06, "loss": 0.2513, "step": 1961 }, { "epoch": 0.3051321928460342, "grad_norm": 1.3436274607263432, "learning_rate": 7.873327140508367e-06, "loss": 0.2352, "step": 1962 }, { "epoch": 0.3052877138413686, "grad_norm": 2.0633364771352274, "learning_rate": 7.87132754170754e-06, "loss": 0.2125, "step": 1963 }, { "epoch": 0.30544323483670294, "grad_norm": 0.9097966158633792, "learning_rate": 7.869327257480787e-06, "loss": 0.1627, "step": 1964 }, { "epoch": 0.3055987558320373, "grad_norm": 1.8317277834761483, "learning_rate": 7.867326288305603e-06, "loss": 0.211, "step": 1965 }, { "epoch": 0.3057542768273717, "grad_norm": 1.1448361049962872, "learning_rate": 7.865324634659647e-06, "loss": 0.1683, "step": 1966 }, { "epoch": 0.30590979782270605, "grad_norm": 1.14865744697956, "learning_rate": 7.863322297020743e-06, "loss": 0.2238, "step": 1967 }, { "epoch": 0.30606531881804044, "grad_norm": 1.0967845312311937, "learning_rate": 7.861319275866877e-06, "loss": 0.1889, "step": 1968 }, { "epoch": 0.3062208398133748, "grad_norm": 1.2461473684464468, "learning_rate": 7.859315571676198e-06, "loss": 0.2138, "step": 1969 }, { "epoch": 0.30637636080870917, "grad_norm": 1.1992165952645324, "learning_rate": 7.857311184927015e-06, "loss": 0.2289, "step": 1970 }, { "epoch": 0.30653188180404356, "grad_norm": 0.9734656478980178, "learning_rate": 7.855306116097807e-06, "loss": 0.1798, "step": 1971 }, { "epoch": 0.3066874027993779, "grad_norm": 0.8576094794110676, "learning_rate": 7.853300365667211e-06, "loss": 0.1849, "step": 1972 }, { "epoch": 0.3068429237947123, "grad_norm": 0.9320489557446329, "learning_rate": 7.851293934114026e-06, "loss": 0.1663, "step": 1973 }, { "epoch": 0.3069984447900467, "grad_norm": 1.5628965027384294, "learning_rate": 7.849286821917217e-06, "loss": 0.2741, "step": 1974 }, { "epoch": 0.307153965785381, "grad_norm": 1.1064029390023975, "learning_rate": 7.847279029555908e-06, "loss": 0.1655, "step": 1975 }, { "epoch": 0.3073094867807154, "grad_norm": 1.1272492512254035, "learning_rate": 7.845270557509389e-06, "loss": 0.1473, "step": 1976 }, { "epoch": 0.3074650077760498, "grad_norm": 0.8321910160414181, "learning_rate": 7.843261406257108e-06, "loss": 0.1571, "step": 1977 }, { "epoch": 0.3076205287713841, "grad_norm": 0.9606345664210296, "learning_rate": 7.841251576278681e-06, "loss": 0.227, "step": 1978 }, { "epoch": 0.3077760497667185, "grad_norm": 1.0695344914586096, "learning_rate": 7.839241068053878e-06, "loss": 0.1616, "step": 1979 }, { "epoch": 0.30793157076205285, "grad_norm": 2.415757365845339, "learning_rate": 7.837229882062638e-06, "loss": 0.2091, "step": 1980 }, { "epoch": 0.30808709175738724, "grad_norm": 0.8945861519999385, "learning_rate": 7.83521801878506e-06, "loss": 0.1769, "step": 1981 }, { "epoch": 0.30824261275272163, "grad_norm": 1.1779736396630833, "learning_rate": 7.8332054787014e-06, "loss": 0.2311, "step": 1982 }, { "epoch": 0.30839813374805597, "grad_norm": 1.372493149836755, "learning_rate": 7.831192262292082e-06, "loss": 0.172, "step": 1983 }, { "epoch": 0.30855365474339036, "grad_norm": 2.4487535069237407, "learning_rate": 7.82917837003769e-06, "loss": 0.1395, "step": 1984 }, { "epoch": 0.30870917573872475, "grad_norm": 0.871516091971395, "learning_rate": 7.827163802418967e-06, "loss": 0.1437, "step": 1985 }, { "epoch": 0.3088646967340591, "grad_norm": 1.2731269701284036, "learning_rate": 7.825148559916817e-06, "loss": 0.1857, "step": 1986 }, { "epoch": 0.3090202177293935, "grad_norm": 0.9768725926218434, "learning_rate": 7.823132643012308e-06, "loss": 0.195, "step": 1987 }, { "epoch": 0.3091757387247278, "grad_norm": 0.9369720572131188, "learning_rate": 7.821116052186668e-06, "loss": 0.2034, "step": 1988 }, { "epoch": 0.3093312597200622, "grad_norm": 1.1315495162839369, "learning_rate": 7.819098787921283e-06, "loss": 0.1755, "step": 1989 }, { "epoch": 0.3094867807153966, "grad_norm": 1.0958800584806985, "learning_rate": 7.817080850697705e-06, "loss": 0.2575, "step": 1990 }, { "epoch": 0.3096423017107309, "grad_norm": 0.9336006846477578, "learning_rate": 7.815062240997642e-06, "loss": 0.1376, "step": 1991 }, { "epoch": 0.3097978227060653, "grad_norm": 0.9121065280126879, "learning_rate": 7.813042959302963e-06, "loss": 0.1212, "step": 1992 }, { "epoch": 0.3099533437013997, "grad_norm": 0.6936258475052076, "learning_rate": 7.811023006095703e-06, "loss": 0.13, "step": 1993 }, { "epoch": 0.31010886469673404, "grad_norm": 1.278051470184625, "learning_rate": 7.809002381858048e-06, "loss": 0.1686, "step": 1994 }, { "epoch": 0.31026438569206843, "grad_norm": 1.2807241898257353, "learning_rate": 7.806981087072354e-06, "loss": 0.2569, "step": 1995 }, { "epoch": 0.3104199066874028, "grad_norm": 1.6449581085415006, "learning_rate": 7.804959122221127e-06, "loss": 0.3075, "step": 1996 }, { "epoch": 0.31057542768273716, "grad_norm": 0.9051580549498448, "learning_rate": 7.802936487787045e-06, "loss": 0.1603, "step": 1997 }, { "epoch": 0.31073094867807155, "grad_norm": 1.5451818475345835, "learning_rate": 7.800913184252931e-06, "loss": 0.2057, "step": 1998 }, { "epoch": 0.3108864696734059, "grad_norm": 1.0935081143315897, "learning_rate": 7.79888921210178e-06, "loss": 0.2238, "step": 1999 }, { "epoch": 0.3110419906687403, "grad_norm": 1.262020237993972, "learning_rate": 7.796864571816745e-06, "loss": 0.1977, "step": 2000 }, { "epoch": 0.3110419906687403, "eval_loss": 0.19129334390163422, "eval_runtime": 9.4405, "eval_samples_per_second": 2.754, "eval_steps_per_second": 0.741, "step": 2000 } ], "logging_steps": 1, "max_steps": 6430, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 191987712000000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }