{ "best_global_step": 26000, "best_metric": 1.9807677268981934, "best_model_checkpoint": "./medical_qwen_finetuned_improved/checkpoint-26000", "epoch": 7.9997372273734, "eval_steps": 100, "global_step": 26632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00750778933143136, "grad_norm": 5.094513416290283, "learning_rate": 1.6893477240732053e-07, "loss": 3.2412, "step": 25 }, { "epoch": 0.01501557866286272, "grad_norm": 3.9722039699554443, "learning_rate": 3.4490849366494603e-07, "loss": 3.2095, "step": 50 }, { "epoch": 0.02252336799429408, "grad_norm": 3.527215003967285, "learning_rate": 5.208822149225716e-07, "loss": 3.1277, "step": 75 }, { "epoch": 0.03003115732572544, "grad_norm": 2.680919647216797, "learning_rate": 6.968559361801971e-07, "loss": 3.0231, "step": 100 }, { "epoch": 0.03003115732572544, "eval_loss": 2.9903318881988525, "eval_runtime": 319.3002, "eval_samples_per_second": 17.567, "eval_steps_per_second": 4.394, "step": 100 }, { "epoch": 0.0375389466571568, "grad_norm": 2.085681438446045, "learning_rate": 8.728296574378227e-07, "loss": 2.9442, "step": 125 }, { "epoch": 0.04504673598858816, "grad_norm": 1.8346056938171387, "learning_rate": 1.0488033786954481e-06, "loss": 2.8133, "step": 150 }, { "epoch": 0.05255452532001952, "grad_norm": 1.7170641422271729, "learning_rate": 1.2247770999530738e-06, "loss": 2.741, "step": 175 }, { "epoch": 0.06006231465145088, "grad_norm": 1.7041053771972656, "learning_rate": 1.4007508212106992e-06, "loss": 2.6097, "step": 200 }, { "epoch": 0.06006231465145088, "eval_loss": 2.5728588104248047, "eval_runtime": 244.8867, "eval_samples_per_second": 22.904, "eval_steps_per_second": 5.729, "step": 200 }, { "epoch": 0.06757010398288224, "grad_norm": 1.1266319751739502, "learning_rate": 1.5767245424683247e-06, "loss": 2.5265, "step": 225 }, { "epoch": 0.0750778933143136, "grad_norm": 0.9362080097198486, "learning_rate": 1.7526982637259503e-06, "loss": 2.4491, "step": 250 }, { "epoch": 0.08258568264574496, "grad_norm": 0.5819249153137207, "learning_rate": 1.928671984983576e-06, "loss": 2.3677, "step": 275 }, { "epoch": 0.09009347197717632, "grad_norm": 0.5669568777084351, "learning_rate": 2.1046457062412012e-06, "loss": 2.3315, "step": 300 }, { "epoch": 0.09009347197717632, "eval_loss": 2.288224458694458, "eval_runtime": 244.739, "eval_samples_per_second": 22.918, "eval_steps_per_second": 5.733, "step": 300 }, { "epoch": 0.09760126130860768, "grad_norm": 0.43035316467285156, "learning_rate": 2.280619427498827e-06, "loss": 2.2718, "step": 325 }, { "epoch": 0.10510905064003905, "grad_norm": 0.41122695803642273, "learning_rate": 2.4565931487564526e-06, "loss": 2.2705, "step": 350 }, { "epoch": 0.1126168399714704, "grad_norm": 0.38599300384521484, "learning_rate": 2.632566870014078e-06, "loss": 2.2149, "step": 375 }, { "epoch": 0.12012462930290176, "grad_norm": 0.36087512969970703, "learning_rate": 2.8085405912717034e-06, "loss": 2.242, "step": 400 }, { "epoch": 0.12012462930290176, "eval_loss": 2.2342677116394043, "eval_runtime": 244.7563, "eval_samples_per_second": 22.917, "eval_steps_per_second": 5.732, "step": 400 }, { "epoch": 0.1276324186343331, "grad_norm": 0.39531558752059937, "learning_rate": 2.984514312529329e-06, "loss": 2.2117, "step": 425 }, { "epoch": 0.13514020796576448, "grad_norm": 0.4547671675682068, "learning_rate": 3.1604880337869548e-06, "loss": 2.2321, "step": 450 }, { "epoch": 0.14264799729719585, "grad_norm": 0.37058719992637634, "learning_rate": 3.33646175504458e-06, "loss": 2.2581, "step": 475 }, { "epoch": 0.1501557866286272, "grad_norm": 0.3959207534790039, "learning_rate": 3.5124354763022057e-06, "loss": 2.2329, "step": 500 }, { "epoch": 0.1501557866286272, "eval_loss": 2.2151083946228027, "eval_runtime": 244.7784, "eval_samples_per_second": 22.915, "eval_steps_per_second": 5.732, "step": 500 }, { "epoch": 0.15766357596005856, "grad_norm": 0.4138086438179016, "learning_rate": 3.688409197559831e-06, "loss": 2.235, "step": 525 }, { "epoch": 0.16517136529148993, "grad_norm": 0.4153759777545929, "learning_rate": 3.864382918817457e-06, "loss": 2.2237, "step": 550 }, { "epoch": 0.17267915462292127, "grad_norm": 0.4081685245037079, "learning_rate": 4.0403566400750816e-06, "loss": 2.2002, "step": 575 }, { "epoch": 0.18018694395435264, "grad_norm": 0.38760289549827576, "learning_rate": 4.216330361332708e-06, "loss": 2.2159, "step": 600 }, { "epoch": 0.18018694395435264, "eval_loss": 2.204134464263916, "eval_runtime": 244.6583, "eval_samples_per_second": 22.926, "eval_steps_per_second": 5.735, "step": 600 }, { "epoch": 0.187694733285784, "grad_norm": 0.38073575496673584, "learning_rate": 4.392304082590333e-06, "loss": 2.1967, "step": 625 }, { "epoch": 0.19520252261721535, "grad_norm": 0.4018952250480652, "learning_rate": 4.568277803847959e-06, "loss": 2.1968, "step": 650 }, { "epoch": 0.20271031194864672, "grad_norm": 0.4137013256549835, "learning_rate": 4.744251525105584e-06, "loss": 2.1997, "step": 675 }, { "epoch": 0.2102181012800781, "grad_norm": 0.411466509103775, "learning_rate": 4.92022524636321e-06, "loss": 2.2099, "step": 700 }, { "epoch": 0.2102181012800781, "eval_loss": 2.195915699005127, "eval_runtime": 244.7304, "eval_samples_per_second": 22.919, "eval_steps_per_second": 5.733, "step": 700 }, { "epoch": 0.21772589061150943, "grad_norm": 0.41950109601020813, "learning_rate": 5.096198967620835e-06, "loss": 2.1777, "step": 725 }, { "epoch": 0.2252336799429408, "grad_norm": 0.41122791171073914, "learning_rate": 5.272172688878461e-06, "loss": 2.2063, "step": 750 }, { "epoch": 0.23274146927437217, "grad_norm": 0.44570910930633545, "learning_rate": 5.448146410136086e-06, "loss": 2.1962, "step": 775 }, { "epoch": 0.2402492586058035, "grad_norm": 0.40760159492492676, "learning_rate": 5.624120131393712e-06, "loss": 2.2007, "step": 800 }, { "epoch": 0.2402492586058035, "eval_loss": 2.1890077590942383, "eval_runtime": 244.6573, "eval_samples_per_second": 22.926, "eval_steps_per_second": 5.735, "step": 800 }, { "epoch": 0.24775704793723488, "grad_norm": 0.4488222897052765, "learning_rate": 5.800093852651337e-06, "loss": 2.2008, "step": 825 }, { "epoch": 0.2552648372686662, "grad_norm": 0.4745488166809082, "learning_rate": 5.976067573908963e-06, "loss": 2.2013, "step": 850 }, { "epoch": 0.2627726266000976, "grad_norm": 0.45855531096458435, "learning_rate": 6.152041295166589e-06, "loss": 2.1824, "step": 875 }, { "epoch": 0.27028041593152896, "grad_norm": 0.4843423366546631, "learning_rate": 6.328015016424214e-06, "loss": 2.1872, "step": 900 }, { "epoch": 0.27028041593152896, "eval_loss": 2.182678699493408, "eval_runtime": 244.7929, "eval_samples_per_second": 22.913, "eval_steps_per_second": 5.731, "step": 900 }, { "epoch": 0.27778820526296033, "grad_norm": 0.4567316770553589, "learning_rate": 6.5039887376818395e-06, "loss": 2.184, "step": 925 }, { "epoch": 0.2852959945943917, "grad_norm": 0.46967923641204834, "learning_rate": 6.679962458939465e-06, "loss": 2.1739, "step": 950 }, { "epoch": 0.29280378392582307, "grad_norm": 0.4461369216442108, "learning_rate": 6.85593618019709e-06, "loss": 2.1818, "step": 975 }, { "epoch": 0.3003115732572544, "grad_norm": 0.4638686776161194, "learning_rate": 7.031909901454717e-06, "loss": 2.194, "step": 1000 }, { "epoch": 0.3003115732572544, "eval_loss": 2.1770713329315186, "eval_runtime": 244.571, "eval_samples_per_second": 22.934, "eval_steps_per_second": 5.737, "step": 1000 }, { "epoch": 0.30781936258868575, "grad_norm": 0.4287603199481964, "learning_rate": 7.207883622712341e-06, "loss": 2.1563, "step": 1025 }, { "epoch": 0.3153271519201171, "grad_norm": 0.4473567605018616, "learning_rate": 7.383857343969968e-06, "loss": 2.1661, "step": 1050 }, { "epoch": 0.3228349412515485, "grad_norm": 0.5221546292304993, "learning_rate": 7.559831065227592e-06, "loss": 2.1744, "step": 1075 }, { "epoch": 0.33034273058297986, "grad_norm": 0.4909228980541229, "learning_rate": 7.735804786485218e-06, "loss": 2.1729, "step": 1100 }, { "epoch": 0.33034273058297986, "eval_loss": 2.1715242862701416, "eval_runtime": 245.0613, "eval_samples_per_second": 22.888, "eval_steps_per_second": 5.725, "step": 1100 }, { "epoch": 0.33785051991441123, "grad_norm": 0.5596965551376343, "learning_rate": 7.911778507742844e-06, "loss": 2.1615, "step": 1125 }, { "epoch": 0.34535830924584254, "grad_norm": 0.4983489215373993, "learning_rate": 8.08775222900047e-06, "loss": 2.1717, "step": 1150 }, { "epoch": 0.3528660985772739, "grad_norm": 0.485856831073761, "learning_rate": 8.263725950258095e-06, "loss": 2.1507, "step": 1175 }, { "epoch": 0.3603738879087053, "grad_norm": 0.5247727632522583, "learning_rate": 8.43969967151572e-06, "loss": 2.1939, "step": 1200 }, { "epoch": 0.3603738879087053, "eval_loss": 2.1655497550964355, "eval_runtime": 244.6519, "eval_samples_per_second": 22.926, "eval_steps_per_second": 5.735, "step": 1200 }, { "epoch": 0.36788167724013665, "grad_norm": 0.5695153474807739, "learning_rate": 8.615673392773347e-06, "loss": 2.1827, "step": 1225 }, { "epoch": 0.375389466571568, "grad_norm": 0.5112013816833496, "learning_rate": 8.791647114030971e-06, "loss": 2.1748, "step": 1250 }, { "epoch": 0.3828972559029994, "grad_norm": 0.46719494462013245, "learning_rate": 8.967620835288597e-06, "loss": 2.1856, "step": 1275 }, { "epoch": 0.3904050452344307, "grad_norm": 0.48362448811531067, "learning_rate": 9.143594556546222e-06, "loss": 2.1766, "step": 1300 }, { "epoch": 0.3904050452344307, "eval_loss": 2.160381317138672, "eval_runtime": 245.4635, "eval_samples_per_second": 22.851, "eval_steps_per_second": 5.716, "step": 1300 }, { "epoch": 0.3979128345658621, "grad_norm": 0.5096102356910706, "learning_rate": 9.31956827780385e-06, "loss": 2.1664, "step": 1325 }, { "epoch": 0.40542062389729344, "grad_norm": 0.6038557887077332, "learning_rate": 9.495541999061475e-06, "loss": 2.1617, "step": 1350 }, { "epoch": 0.4129284132287248, "grad_norm": 0.5893401503562927, "learning_rate": 9.671515720319098e-06, "loss": 2.1473, "step": 1375 }, { "epoch": 0.4204362025601562, "grad_norm": 0.5666929483413696, "learning_rate": 9.847489441576724e-06, "loss": 2.195, "step": 1400 }, { "epoch": 0.4204362025601562, "eval_loss": 2.154971122741699, "eval_runtime": 244.6458, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 1400 }, { "epoch": 0.42794399189158755, "grad_norm": 0.6115418672561646, "learning_rate": 1.0023463162834351e-05, "loss": 2.1356, "step": 1425 }, { "epoch": 0.43545178122301886, "grad_norm": 0.6469879150390625, "learning_rate": 1.0199436884091976e-05, "loss": 2.1755, "step": 1450 }, { "epoch": 0.44295957055445023, "grad_norm": 0.5257688760757446, "learning_rate": 1.0375410605349602e-05, "loss": 2.1731, "step": 1475 }, { "epoch": 0.4504673598858816, "grad_norm": 0.5619986653327942, "learning_rate": 1.0551384326607226e-05, "loss": 2.172, "step": 1500 }, { "epoch": 0.4504673598858816, "eval_loss": 2.15023136138916, "eval_runtime": 244.8519, "eval_samples_per_second": 22.908, "eval_steps_per_second": 5.73, "step": 1500 }, { "epoch": 0.45797514921731297, "grad_norm": 0.5681572556495667, "learning_rate": 1.0727358047864853e-05, "loss": 2.1556, "step": 1525 }, { "epoch": 0.46548293854874434, "grad_norm": 0.6319741010665894, "learning_rate": 1.0903331769122478e-05, "loss": 2.1369, "step": 1550 }, { "epoch": 0.4729907278801757, "grad_norm": 0.5815430283546448, "learning_rate": 1.1079305490380104e-05, "loss": 2.1589, "step": 1575 }, { "epoch": 0.480498517211607, "grad_norm": 0.5797183513641357, "learning_rate": 1.1255279211637729e-05, "loss": 2.1676, "step": 1600 }, { "epoch": 0.480498517211607, "eval_loss": 2.1454966068267822, "eval_runtime": 244.6098, "eval_samples_per_second": 22.93, "eval_steps_per_second": 5.736, "step": 1600 }, { "epoch": 0.4880063065430384, "grad_norm": 0.6238908171653748, "learning_rate": 1.1431252932895355e-05, "loss": 2.1451, "step": 1625 }, { "epoch": 0.49551409587446976, "grad_norm": 0.6378119587898254, "learning_rate": 1.160722665415298e-05, "loss": 2.1393, "step": 1650 }, { "epoch": 0.5030218852059011, "grad_norm": 0.5630180239677429, "learning_rate": 1.1783200375410605e-05, "loss": 2.1532, "step": 1675 }, { "epoch": 0.5105296745373324, "grad_norm": 0.5868392586708069, "learning_rate": 1.1959174096668231e-05, "loss": 2.1473, "step": 1700 }, { "epoch": 0.5105296745373324, "eval_loss": 2.141220808029175, "eval_runtime": 244.9172, "eval_samples_per_second": 22.902, "eval_steps_per_second": 5.728, "step": 1700 }, { "epoch": 0.5180374638687638, "grad_norm": 0.6577850580215454, "learning_rate": 1.2135147817925858e-05, "loss": 2.1379, "step": 1725 }, { "epoch": 0.5255452532001952, "grad_norm": 0.6026327013969421, "learning_rate": 1.2311121539183482e-05, "loss": 2.1464, "step": 1750 }, { "epoch": 0.5330530425316266, "grad_norm": 0.60736483335495, "learning_rate": 1.2487095260441107e-05, "loss": 2.1588, "step": 1775 }, { "epoch": 0.5405608318630579, "grad_norm": 0.6438941359519958, "learning_rate": 1.2663068981698733e-05, "loss": 2.1421, "step": 1800 }, { "epoch": 0.5405608318630579, "eval_loss": 2.1365151405334473, "eval_runtime": 244.577, "eval_samples_per_second": 22.933, "eval_steps_per_second": 5.736, "step": 1800 }, { "epoch": 0.5480686211944893, "grad_norm": 0.6403496861457825, "learning_rate": 1.283904270295636e-05, "loss": 2.1428, "step": 1825 }, { "epoch": 0.5555764105259207, "grad_norm": 0.645140528678894, "learning_rate": 1.3015016424213985e-05, "loss": 2.1603, "step": 1850 }, { "epoch": 0.563084199857352, "grad_norm": 0.6453937292098999, "learning_rate": 1.3190990145471609e-05, "loss": 2.156, "step": 1875 }, { "epoch": 0.5705919891887834, "grad_norm": 0.7146685123443604, "learning_rate": 1.3366963866729234e-05, "loss": 2.1016, "step": 1900 }, { "epoch": 0.5705919891887834, "eval_loss": 2.1333518028259277, "eval_runtime": 245.1598, "eval_samples_per_second": 22.879, "eval_steps_per_second": 5.723, "step": 1900 }, { "epoch": 0.5780997785202148, "grad_norm": 0.6153611540794373, "learning_rate": 1.3542937587986862e-05, "loss": 2.1577, "step": 1925 }, { "epoch": 0.5856075678516461, "grad_norm": 0.7233150601387024, "learning_rate": 1.3718911309244487e-05, "loss": 2.1348, "step": 1950 }, { "epoch": 0.5931153571830774, "grad_norm": 0.7316763401031494, "learning_rate": 1.3894885030502113e-05, "loss": 2.1316, "step": 1975 }, { "epoch": 0.6006231465145088, "grad_norm": 0.6433097124099731, "learning_rate": 1.4070858751759736e-05, "loss": 2.1445, "step": 2000 }, { "epoch": 0.6006231465145088, "eval_loss": 2.129106044769287, "eval_runtime": 244.5211, "eval_samples_per_second": 22.939, "eval_steps_per_second": 5.738, "step": 2000 }, { "epoch": 0.6081309358459401, "grad_norm": 0.6830511689186096, "learning_rate": 1.4246832473017363e-05, "loss": 2.1139, "step": 2025 }, { "epoch": 0.6156387251773715, "grad_norm": 0.6850073337554932, "learning_rate": 1.4422806194274989e-05, "loss": 2.1218, "step": 2050 }, { "epoch": 0.6231465145088029, "grad_norm": 0.6426066160202026, "learning_rate": 1.4598779915532614e-05, "loss": 2.1275, "step": 2075 }, { "epoch": 0.6306543038402342, "grad_norm": 0.6646946668624878, "learning_rate": 1.477475363679024e-05, "loss": 2.126, "step": 2100 }, { "epoch": 0.6306543038402342, "eval_loss": 2.1254663467407227, "eval_runtime": 244.8863, "eval_samples_per_second": 22.905, "eval_steps_per_second": 5.729, "step": 2100 }, { "epoch": 0.6381620931716656, "grad_norm": 0.7284884452819824, "learning_rate": 1.4950727358047865e-05, "loss": 2.116, "step": 2125 }, { "epoch": 0.645669882503097, "grad_norm": 0.8441785573959351, "learning_rate": 1.4999980024014693e-05, "loss": 2.1195, "step": 2150 }, { "epoch": 0.6531776718345284, "grad_norm": 0.7109578847885132, "learning_rate": 1.4999886001482528e-05, "loss": 2.122, "step": 2175 }, { "epoch": 0.6606854611659597, "grad_norm": 0.7228453755378723, "learning_rate": 1.4999714912309012e-05, "loss": 2.1058, "step": 2200 }, { "epoch": 0.6606854611659597, "eval_loss": 2.1222198009490967, "eval_runtime": 245.2072, "eval_samples_per_second": 22.875, "eval_steps_per_second": 5.722, "step": 2200 }, { "epoch": 0.6681932504973911, "grad_norm": 0.7744355201721191, "learning_rate": 1.4999466758252207e-05, "loss": 2.1252, "step": 2225 }, { "epoch": 0.6757010398288225, "grad_norm": 0.7705317735671997, "learning_rate": 1.4999141541862068e-05, "loss": 2.0941, "step": 2250 }, { "epoch": 0.6832088291602537, "grad_norm": 0.7709174156188965, "learning_rate": 1.4998739266480427e-05, "loss": 2.1044, "step": 2275 }, { "epoch": 0.6907166184916851, "grad_norm": 0.6840139627456665, "learning_rate": 1.4998259936240949e-05, "loss": 2.1146, "step": 2300 }, { "epoch": 0.6907166184916851, "eval_loss": 2.1187844276428223, "eval_runtime": 244.5599, "eval_samples_per_second": 22.935, "eval_steps_per_second": 5.737, "step": 2300 }, { "epoch": 0.6982244078231165, "grad_norm": 0.8008989095687866, "learning_rate": 1.4997703556069088e-05, "loss": 2.1483, "step": 2325 }, { "epoch": 0.7057321971545478, "grad_norm": 0.7936817407608032, "learning_rate": 1.499707013168205e-05, "loss": 2.1354, "step": 2350 }, { "epoch": 0.7132399864859792, "grad_norm": 0.7062814831733704, "learning_rate": 1.4996359669588714e-05, "loss": 2.1378, "step": 2375 }, { "epoch": 0.7207477758174106, "grad_norm": 0.8156118392944336, "learning_rate": 1.4995572177089582e-05, "loss": 2.0949, "step": 2400 }, { "epoch": 0.7207477758174106, "eval_loss": 2.1153197288513184, "eval_runtime": 244.9003, "eval_samples_per_second": 22.903, "eval_steps_per_second": 5.729, "step": 2400 }, { "epoch": 0.7282555651488419, "grad_norm": 0.7018394470214844, "learning_rate": 1.4994707662276703e-05, "loss": 2.1084, "step": 2425 }, { "epoch": 0.7357633544802733, "grad_norm": 0.7865644097328186, "learning_rate": 1.4993766134033573e-05, "loss": 2.1087, "step": 2450 }, { "epoch": 0.7432711438117047, "grad_norm": 0.7718919515609741, "learning_rate": 1.4992747602035062e-05, "loss": 2.1248, "step": 2475 }, { "epoch": 0.750778933143136, "grad_norm": 0.8038984537124634, "learning_rate": 1.499165207674731e-05, "loss": 2.124, "step": 2500 }, { "epoch": 0.750778933143136, "eval_loss": 2.112464666366577, "eval_runtime": 244.648, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 2500 }, { "epoch": 0.7582867224745674, "grad_norm": 0.8126859664916992, "learning_rate": 1.4990479569427615e-05, "loss": 2.0879, "step": 2525 }, { "epoch": 0.7657945118059988, "grad_norm": 0.7394261360168457, "learning_rate": 1.4989230092124322e-05, "loss": 2.1167, "step": 2550 }, { "epoch": 0.77330230113743, "grad_norm": 0.8700124621391296, "learning_rate": 1.498790365767669e-05, "loss": 2.0892, "step": 2575 }, { "epoch": 0.7808100904688614, "grad_norm": 0.7596783638000488, "learning_rate": 1.4986500279714777e-05, "loss": 2.112, "step": 2600 }, { "epoch": 0.7808100904688614, "eval_loss": 2.1093900203704834, "eval_runtime": 244.8809, "eval_samples_per_second": 22.905, "eval_steps_per_second": 5.729, "step": 2600 }, { "epoch": 0.7883178798002928, "grad_norm": 0.7278156876564026, "learning_rate": 1.4985019972659285e-05, "loss": 2.1186, "step": 2625 }, { "epoch": 0.7958256691317241, "grad_norm": 0.8945568203926086, "learning_rate": 1.4983462751721418e-05, "loss": 2.0986, "step": 2650 }, { "epoch": 0.8033334584631555, "grad_norm": 0.8277415037155151, "learning_rate": 1.498182863290272e-05, "loss": 2.1247, "step": 2675 }, { "epoch": 0.8108412477945869, "grad_norm": 0.7230107188224792, "learning_rate": 1.4980117632994925e-05, "loss": 2.1107, "step": 2700 }, { "epoch": 0.8108412477945869, "eval_loss": 2.106996774673462, "eval_runtime": 244.8592, "eval_samples_per_second": 22.907, "eval_steps_per_second": 5.73, "step": 2700 }, { "epoch": 0.8183490371260183, "grad_norm": 0.8236918449401855, "learning_rate": 1.4978329769579768e-05, "loss": 2.1138, "step": 2725 }, { "epoch": 0.8258568264574496, "grad_norm": 0.7915171384811401, "learning_rate": 1.4976465061028811e-05, "loss": 2.1113, "step": 2750 }, { "epoch": 0.833364615788881, "grad_norm": 0.8001993894577026, "learning_rate": 1.4974523526503252e-05, "loss": 2.122, "step": 2775 }, { "epoch": 0.8408724051203124, "grad_norm": 0.915046751499176, "learning_rate": 1.4972505185953739e-05, "loss": 2.1145, "step": 2800 }, { "epoch": 0.8408724051203124, "eval_loss": 2.1040894985198975, "eval_runtime": 244.924, "eval_samples_per_second": 22.901, "eval_steps_per_second": 5.728, "step": 2800 }, { "epoch": 0.8483801944517437, "grad_norm": 0.7762336134910583, "learning_rate": 1.4970410060120146e-05, "loss": 2.0905, "step": 2825 }, { "epoch": 0.8558879837831751, "grad_norm": 0.8220327496528625, "learning_rate": 1.496823817053138e-05, "loss": 2.1149, "step": 2850 }, { "epoch": 0.8633957731146064, "grad_norm": 0.8111168146133423, "learning_rate": 1.4965989539505144e-05, "loss": 2.1035, "step": 2875 }, { "epoch": 0.8709035624460377, "grad_norm": 0.7875452637672424, "learning_rate": 1.4963664190147713e-05, "loss": 2.1091, "step": 2900 }, { "epoch": 0.8709035624460377, "eval_loss": 2.101161241531372, "eval_runtime": 247.6368, "eval_samples_per_second": 22.65, "eval_steps_per_second": 5.666, "step": 2900 }, { "epoch": 0.8784113517774691, "grad_norm": 0.8538459539413452, "learning_rate": 1.4961262146353696e-05, "loss": 2.0994, "step": 2925 }, { "epoch": 0.8859191411089005, "grad_norm": 0.7686406373977661, "learning_rate": 1.4958783432805801e-05, "loss": 2.0858, "step": 2950 }, { "epoch": 0.8934269304403318, "grad_norm": 0.792827844619751, "learning_rate": 1.4956228074974561e-05, "loss": 2.1001, "step": 2975 }, { "epoch": 0.9009347197717632, "grad_norm": 0.9214953780174255, "learning_rate": 1.4953596099118089e-05, "loss": 2.0844, "step": 3000 }, { "epoch": 0.9009347197717632, "eval_loss": 2.100034713745117, "eval_runtime": 247.8843, "eval_samples_per_second": 22.627, "eval_steps_per_second": 5.66, "step": 3000 }, { "epoch": 0.9084425091031946, "grad_norm": 0.8309657573699951, "learning_rate": 1.49508875322818e-05, "loss": 2.0882, "step": 3025 }, { "epoch": 0.9159502984346259, "grad_norm": 0.8833063244819641, "learning_rate": 1.4948102402298141e-05, "loss": 2.1063, "step": 3050 }, { "epoch": 0.9234580877660573, "grad_norm": 0.7956681847572327, "learning_rate": 1.4945240737786292e-05, "loss": 2.0885, "step": 3075 }, { "epoch": 0.9309658770974887, "grad_norm": 0.8342053890228271, "learning_rate": 1.4942302568151882e-05, "loss": 2.1001, "step": 3100 }, { "epoch": 0.9309658770974887, "eval_loss": 2.0970711708068848, "eval_runtime": 245.0795, "eval_samples_per_second": 22.886, "eval_steps_per_second": 5.725, "step": 3100 }, { "epoch": 0.93847366642892, "grad_norm": 0.9061738848686218, "learning_rate": 1.493928792358669e-05, "loss": 2.1135, "step": 3125 }, { "epoch": 0.9459814557603514, "grad_norm": 0.9443092346191406, "learning_rate": 1.4936196835068322e-05, "loss": 2.0909, "step": 3150 }, { "epoch": 0.9534892450917827, "grad_norm": 0.7598241567611694, "learning_rate": 1.4933029334359898e-05, "loss": 2.1215, "step": 3175 }, { "epoch": 0.960997034423214, "grad_norm": 1.001592993736267, "learning_rate": 1.4929785454009737e-05, "loss": 2.0884, "step": 3200 }, { "epoch": 0.960997034423214, "eval_loss": 2.09686541557312, "eval_runtime": 244.53, "eval_samples_per_second": 22.938, "eval_steps_per_second": 5.738, "step": 3200 }, { "epoch": 0.9685048237546454, "grad_norm": 0.9168058633804321, "learning_rate": 1.4926465227351008e-05, "loss": 2.0785, "step": 3225 }, { "epoch": 0.9760126130860768, "grad_norm": 0.8249208331108093, "learning_rate": 1.4923068688501385e-05, "loss": 2.0841, "step": 3250 }, { "epoch": 0.9835204024175082, "grad_norm": 0.8430188298225403, "learning_rate": 1.4919595872362719e-05, "loss": 2.0969, "step": 3275 }, { "epoch": 0.9910281917489395, "grad_norm": 0.9370065927505493, "learning_rate": 1.491604681462065e-05, "loss": 2.1052, "step": 3300 }, { "epoch": 0.9910281917489395, "eval_loss": 2.0929176807403564, "eval_runtime": 244.8371, "eval_samples_per_second": 22.909, "eval_steps_per_second": 5.73, "step": 3300 }, { "epoch": 0.9985359810803709, "grad_norm": 0.7515010237693787, "learning_rate": 1.4912421551744264e-05, "loss": 2.0882, "step": 3325 }, { "epoch": 1.0063065430384024, "grad_norm": 0.8594741821289062, "learning_rate": 1.4908720120985703e-05, "loss": 2.2045, "step": 3350 }, { "epoch": 1.0138143323698336, "grad_norm": 0.852730929851532, "learning_rate": 1.4904942560379791e-05, "loss": 2.0833, "step": 3375 }, { "epoch": 1.0213221217012651, "grad_norm": 0.8965045809745789, "learning_rate": 1.4901088908743635e-05, "loss": 2.1122, "step": 3400 }, { "epoch": 1.0213221217012651, "eval_loss": 2.0909690856933594, "eval_runtime": 245.1692, "eval_samples_per_second": 22.878, "eval_steps_per_second": 5.723, "step": 3400 }, { "epoch": 1.0288299110326964, "grad_norm": 0.8129332065582275, "learning_rate": 1.4897159205676244e-05, "loss": 2.062, "step": 3425 }, { "epoch": 1.0363377003641279, "grad_norm": 0.7968320846557617, "learning_rate": 1.4893153491558093e-05, "loss": 2.1195, "step": 3450 }, { "epoch": 1.0438454896955591, "grad_norm": 0.8569227457046509, "learning_rate": 1.4889071807550734e-05, "loss": 2.0819, "step": 3475 }, { "epoch": 1.0513532790269906, "grad_norm": 0.790208101272583, "learning_rate": 1.4884914195596364e-05, "loss": 2.0831, "step": 3500 }, { "epoch": 1.0513532790269906, "eval_loss": 2.0892488956451416, "eval_runtime": 244.1949, "eval_samples_per_second": 22.969, "eval_steps_per_second": 5.745, "step": 3500 }, { "epoch": 1.0588610683584219, "grad_norm": 0.7736139893531799, "learning_rate": 1.488068069841739e-05, "loss": 2.0969, "step": 3525 }, { "epoch": 1.0663688576898531, "grad_norm": 0.9392566084861755, "learning_rate": 1.4876371359515992e-05, "loss": 2.0835, "step": 3550 }, { "epoch": 1.0738766470212846, "grad_norm": 0.9095376133918762, "learning_rate": 1.4871986223173682e-05, "loss": 2.0882, "step": 3575 }, { "epoch": 1.0813844363527159, "grad_norm": 0.999569833278656, "learning_rate": 1.4867525334450842e-05, "loss": 2.0789, "step": 3600 }, { "epoch": 1.0813844363527159, "eval_loss": 2.0872867107391357, "eval_runtime": 245.2287, "eval_samples_per_second": 22.873, "eval_steps_per_second": 5.721, "step": 3600 }, { "epoch": 1.0888922256841473, "grad_norm": 0.8475573658943176, "learning_rate": 1.4862988739186265e-05, "loss": 2.0472, "step": 3625 }, { "epoch": 1.0964000150155786, "grad_norm": 0.8783066868782043, "learning_rate": 1.4858376483996675e-05, "loss": 2.1, "step": 3650 }, { "epoch": 1.10390780434701, "grad_norm": 0.8863905072212219, "learning_rate": 1.4853688616276268e-05, "loss": 2.112, "step": 3675 }, { "epoch": 1.1114155936784413, "grad_norm": 1.0993289947509766, "learning_rate": 1.4848925184196203e-05, "loss": 2.0788, "step": 3700 }, { "epoch": 1.1114155936784413, "eval_loss": 2.0860979557037354, "eval_runtime": 245.145, "eval_samples_per_second": 22.88, "eval_steps_per_second": 5.723, "step": 3700 }, { "epoch": 1.1189233830098728, "grad_norm": 0.7591436505317688, "learning_rate": 1.4844086236704119e-05, "loss": 2.0705, "step": 3725 }, { "epoch": 1.126431172341304, "grad_norm": 0.9064419269561768, "learning_rate": 1.4839171823523628e-05, "loss": 2.0421, "step": 3750 }, { "epoch": 1.1339389616727356, "grad_norm": 0.8282918930053711, "learning_rate": 1.483418199515381e-05, "loss": 2.0621, "step": 3775 }, { "epoch": 1.1414467510041668, "grad_norm": 0.9208828806877136, "learning_rate": 1.4829116802868684e-05, "loss": 2.08, "step": 3800 }, { "epoch": 1.1414467510041668, "eval_loss": 2.0833563804626465, "eval_runtime": 245.2937, "eval_samples_per_second": 22.866, "eval_steps_per_second": 5.72, "step": 3800 }, { "epoch": 1.1489545403355983, "grad_norm": 0.8673622608184814, "learning_rate": 1.4823976298716686e-05, "loss": 2.0879, "step": 3825 }, { "epoch": 1.1564623296670296, "grad_norm": 0.9238690137863159, "learning_rate": 1.4818760535520142e-05, "loss": 2.083, "step": 3850 }, { "epoch": 1.1639701189984608, "grad_norm": 0.855536937713623, "learning_rate": 1.4813469566874711e-05, "loss": 2.0705, "step": 3875 }, { "epoch": 1.1714779083298923, "grad_norm": 0.8495576977729797, "learning_rate": 1.4808103447148845e-05, "loss": 2.092, "step": 3900 }, { "epoch": 1.1714779083298923, "eval_loss": 2.081465721130371, "eval_runtime": 244.6166, "eval_samples_per_second": 22.93, "eval_steps_per_second": 5.736, "step": 3900 }, { "epoch": 1.1789856976613236, "grad_norm": 0.9213201403617859, "learning_rate": 1.4802662231483224e-05, "loss": 2.0695, "step": 3925 }, { "epoch": 1.186493486992755, "grad_norm": 0.9453656673431396, "learning_rate": 1.4797145975790194e-05, "loss": 2.0856, "step": 3950 }, { "epoch": 1.1940012763241863, "grad_norm": 0.894378662109375, "learning_rate": 1.4791554736753193e-05, "loss": 2.0705, "step": 3975 }, { "epoch": 1.2015090656556178, "grad_norm": 0.9393320083618164, "learning_rate": 1.4785888571826158e-05, "loss": 2.0693, "step": 4000 }, { "epoch": 1.2015090656556178, "eval_loss": 2.079852819442749, "eval_runtime": 245.0032, "eval_samples_per_second": 22.894, "eval_steps_per_second": 5.726, "step": 4000 }, { "epoch": 1.209016854987049, "grad_norm": 0.8150069117546082, "learning_rate": 1.478014753923295e-05, "loss": 2.0721, "step": 4025 }, { "epoch": 1.2165246443184805, "grad_norm": 0.867784321308136, "learning_rate": 1.4774331697966743e-05, "loss": 2.1046, "step": 4050 }, { "epoch": 1.2240324336499118, "grad_norm": 0.8931713700294495, "learning_rate": 1.476844110778943e-05, "loss": 2.0718, "step": 4075 }, { "epoch": 1.231540222981343, "grad_norm": 0.9451190829277039, "learning_rate": 1.4762475829230994e-05, "loss": 2.0826, "step": 4100 }, { "epoch": 1.231540222981343, "eval_loss": 2.078012466430664, "eval_runtime": 244.9722, "eval_samples_per_second": 22.896, "eval_steps_per_second": 5.727, "step": 4100 }, { "epoch": 1.2390480123127745, "grad_norm": 0.9044253826141357, "learning_rate": 1.4756435923588899e-05, "loss": 2.0853, "step": 4125 }, { "epoch": 1.246555801644206, "grad_norm": 0.9442611336708069, "learning_rate": 1.4750321452927454e-05, "loss": 2.039, "step": 4150 }, { "epoch": 1.2540635909756372, "grad_norm": 0.8297872543334961, "learning_rate": 1.4744132480077177e-05, "loss": 2.0371, "step": 4175 }, { "epoch": 1.2615713803070685, "grad_norm": 0.783397912979126, "learning_rate": 1.4737869068634148e-05, "loss": 2.0508, "step": 4200 }, { "epoch": 1.2615713803070685, "eval_loss": 2.076925754547119, "eval_runtime": 244.7969, "eval_samples_per_second": 22.913, "eval_steps_per_second": 5.731, "step": 4200 }, { "epoch": 1.2690791696385, "grad_norm": 0.9161412119865417, "learning_rate": 1.4731531282959356e-05, "loss": 2.0785, "step": 4225 }, { "epoch": 1.2765869589699312, "grad_norm": 0.8472649455070496, "learning_rate": 1.4725119188178038e-05, "loss": 2.057, "step": 4250 }, { "epoch": 1.2840947483013627, "grad_norm": 0.777370035648346, "learning_rate": 1.4718632850179013e-05, "loss": 2.0842, "step": 4275 }, { "epoch": 1.291602537632794, "grad_norm": 0.9465096592903137, "learning_rate": 1.471207233561399e-05, "loss": 2.0788, "step": 4300 }, { "epoch": 1.291602537632794, "eval_loss": 2.0751006603240967, "eval_runtime": 244.7621, "eval_samples_per_second": 22.916, "eval_steps_per_second": 5.732, "step": 4300 }, { "epoch": 1.2991103269642255, "grad_norm": 0.9006996750831604, "learning_rate": 1.4705437711896914e-05, "loss": 2.0689, "step": 4325 }, { "epoch": 1.3066181162956567, "grad_norm": 0.8863036632537842, "learning_rate": 1.469872904720324e-05, "loss": 2.0536, "step": 4350 }, { "epoch": 1.3141259056270882, "grad_norm": 0.8076067566871643, "learning_rate": 1.4691946410469244e-05, "loss": 2.0704, "step": 4375 }, { "epoch": 1.3216336949585195, "grad_norm": 0.8585737943649292, "learning_rate": 1.4685089871391332e-05, "loss": 2.0566, "step": 4400 }, { "epoch": 1.3216336949585195, "eval_loss": 2.0732879638671875, "eval_runtime": 245.4201, "eval_samples_per_second": 22.855, "eval_steps_per_second": 5.717, "step": 4400 }, { "epoch": 1.3291414842899507, "grad_norm": 0.8773880004882812, "learning_rate": 1.4678159500425296e-05, "loss": 2.0661, "step": 4425 }, { "epoch": 1.3366492736213822, "grad_norm": 0.9763519763946533, "learning_rate": 1.4671155368785604e-05, "loss": 2.0684, "step": 4450 }, { "epoch": 1.3441570629528137, "grad_norm": 0.8556541204452515, "learning_rate": 1.4664077548444675e-05, "loss": 2.0788, "step": 4475 }, { "epoch": 1.351664852284245, "grad_norm": 0.8426047563552856, "learning_rate": 1.4656926112132124e-05, "loss": 2.0645, "step": 4500 }, { "epoch": 1.351664852284245, "eval_loss": 2.0714945793151855, "eval_runtime": 271.6463, "eval_samples_per_second": 20.648, "eval_steps_per_second": 5.165, "step": 4500 }, { "epoch": 1.3591726416156762, "grad_norm": 0.8249872326850891, "learning_rate": 1.4649701133334025e-05, "loss": 2.0679, "step": 4525 }, { "epoch": 1.3666804309471077, "grad_norm": 0.8870148658752441, "learning_rate": 1.4642402686292155e-05, "loss": 2.0873, "step": 4550 }, { "epoch": 1.374188220278539, "grad_norm": 0.8625667095184326, "learning_rate": 1.4635030846003225e-05, "loss": 2.0655, "step": 4575 }, { "epoch": 1.3816960096099704, "grad_norm": 1.0245722532272339, "learning_rate": 1.4627585688218116e-05, "loss": 2.0939, "step": 4600 }, { "epoch": 1.3816960096099704, "eval_loss": 2.0702602863311768, "eval_runtime": 244.5585, "eval_samples_per_second": 22.935, "eval_steps_per_second": 5.737, "step": 4600 }, { "epoch": 1.3892037989414017, "grad_norm": 0.9307467937469482, "learning_rate": 1.4620067289441101e-05, "loss": 2.0582, "step": 4625 }, { "epoch": 1.396711588272833, "grad_norm": 0.8650360703468323, "learning_rate": 1.461247572692905e-05, "loss": 2.0486, "step": 4650 }, { "epoch": 1.4042193776042644, "grad_norm": 0.8464282155036926, "learning_rate": 1.4604811078690648e-05, "loss": 2.0513, "step": 4675 }, { "epoch": 1.4117271669356959, "grad_norm": 0.9079179167747498, "learning_rate": 1.4597073423485583e-05, "loss": 2.0642, "step": 4700 }, { "epoch": 1.4117271669356959, "eval_loss": 2.068575143814087, "eval_runtime": 244.9525, "eval_samples_per_second": 22.898, "eval_steps_per_second": 5.728, "step": 4700 }, { "epoch": 1.4192349562671271, "grad_norm": 0.8237431049346924, "learning_rate": 1.4589262840823746e-05, "loss": 2.0619, "step": 4725 }, { "epoch": 1.4267427455985584, "grad_norm": 0.8957166075706482, "learning_rate": 1.4581379410964402e-05, "loss": 2.0896, "step": 4750 }, { "epoch": 1.4342505349299899, "grad_norm": 0.7650532722473145, "learning_rate": 1.4573423214915382e-05, "loss": 2.0554, "step": 4775 }, { "epoch": 1.4417583242614211, "grad_norm": 0.9083628058433533, "learning_rate": 1.4565394334432233e-05, "loss": 2.0811, "step": 4800 }, { "epoch": 1.4417583242614211, "eval_loss": 2.066969394683838, "eval_runtime": 244.7686, "eval_samples_per_second": 22.916, "eval_steps_per_second": 5.732, "step": 4800 }, { "epoch": 1.4492661135928526, "grad_norm": 0.963108479976654, "learning_rate": 1.4557292852017392e-05, "loss": 2.0727, "step": 4825 }, { "epoch": 1.4567739029242839, "grad_norm": 0.8735617399215698, "learning_rate": 1.454911885091933e-05, "loss": 2.0681, "step": 4850 }, { "epoch": 1.4642816922557154, "grad_norm": 1.0220097303390503, "learning_rate": 1.4540872415131695e-05, "loss": 2.0602, "step": 4875 }, { "epoch": 1.4717894815871466, "grad_norm": 0.9304827451705933, "learning_rate": 1.4532553629392455e-05, "loss": 2.0539, "step": 4900 }, { "epoch": 1.4717894815871466, "eval_loss": 2.0658257007598877, "eval_runtime": 244.4897, "eval_samples_per_second": 22.942, "eval_steps_per_second": 5.738, "step": 4900 }, { "epoch": 1.479297270918578, "grad_norm": 0.9377899765968323, "learning_rate": 1.4524162579183032e-05, "loss": 2.0552, "step": 4925 }, { "epoch": 1.4868050602500094, "grad_norm": 0.9211867451667786, "learning_rate": 1.451569935072741e-05, "loss": 2.0622, "step": 4950 }, { "epoch": 1.4943128495814406, "grad_norm": 1.0366291999816895, "learning_rate": 1.4507164030991254e-05, "loss": 2.0673, "step": 4975 }, { "epoch": 1.501820638912872, "grad_norm": 0.9624854326248169, "learning_rate": 1.449855670768102e-05, "loss": 2.0748, "step": 5000 }, { "epoch": 1.501820638912872, "eval_loss": 2.0644030570983887, "eval_runtime": 245.047, "eval_samples_per_second": 22.889, "eval_steps_per_second": 5.725, "step": 5000 }, { "epoch": 1.5093284282443036, "grad_norm": 0.8962668180465698, "learning_rate": 1.4489877469243053e-05, "loss": 2.0701, "step": 5025 }, { "epoch": 1.5168362175757348, "grad_norm": 0.8921008110046387, "learning_rate": 1.4481126404862677e-05, "loss": 2.0669, "step": 5050 }, { "epoch": 1.524344006907166, "grad_norm": 0.9402926564216614, "learning_rate": 1.4472303604463279e-05, "loss": 2.0576, "step": 5075 }, { "epoch": 1.5318517962385976, "grad_norm": 0.8990075588226318, "learning_rate": 1.4463409158705376e-05, "loss": 2.0517, "step": 5100 }, { "epoch": 1.5318517962385976, "eval_loss": 2.0629703998565674, "eval_runtime": 244.3655, "eval_samples_per_second": 22.953, "eval_steps_per_second": 5.741, "step": 5100 }, { "epoch": 1.539359585570029, "grad_norm": 1.0020679235458374, "learning_rate": 1.4454443158985708e-05, "loss": 2.0582, "step": 5125 }, { "epoch": 1.5468673749014603, "grad_norm": 0.9144858121871948, "learning_rate": 1.4445405697436267e-05, "loss": 2.0518, "step": 5150 }, { "epoch": 1.5543751642328916, "grad_norm": 0.9205281138420105, "learning_rate": 1.4436296866923373e-05, "loss": 2.0553, "step": 5175 }, { "epoch": 1.5618829535643228, "grad_norm": 1.0122096538543701, "learning_rate": 1.4427116761046714e-05, "loss": 2.0333, "step": 5200 }, { "epoch": 1.5618829535643228, "eval_loss": 2.061532735824585, "eval_runtime": 244.549, "eval_samples_per_second": 22.936, "eval_steps_per_second": 5.737, "step": 5200 }, { "epoch": 1.5693907428957543, "grad_norm": 0.9542369842529297, "learning_rate": 1.441786547413838e-05, "loss": 2.0722, "step": 5225 }, { "epoch": 1.5768985322271858, "grad_norm": 0.9306456446647644, "learning_rate": 1.4408543101261898e-05, "loss": 2.0731, "step": 5250 }, { "epoch": 1.584406321558617, "grad_norm": 0.8262733221054077, "learning_rate": 1.4399149738211251e-05, "loss": 2.0629, "step": 5275 }, { "epoch": 1.5919141108900483, "grad_norm": 0.9227537512779236, "learning_rate": 1.43896854815099e-05, "loss": 2.0832, "step": 5300 }, { "epoch": 1.5919141108900483, "eval_loss": 2.0603787899017334, "eval_runtime": 244.6958, "eval_samples_per_second": 22.922, "eval_steps_per_second": 5.734, "step": 5300 }, { "epoch": 1.5994219002214798, "grad_norm": 0.9182181358337402, "learning_rate": 1.4380150428409788e-05, "loss": 2.0516, "step": 5325 }, { "epoch": 1.6069296895529113, "grad_norm": 0.8036996126174927, "learning_rate": 1.4370544676890333e-05, "loss": 2.0531, "step": 5350 }, { "epoch": 1.6144374788843425, "grad_norm": 0.9126760363578796, "learning_rate": 1.4360868325657447e-05, "loss": 2.0665, "step": 5375 }, { "epoch": 1.6219452682157738, "grad_norm": 1.0143436193466187, "learning_rate": 1.4351121474142484e-05, "loss": 2.029, "step": 5400 }, { "epoch": 1.6219452682157738, "eval_loss": 2.0587964057922363, "eval_runtime": 244.7582, "eval_samples_per_second": 22.916, "eval_steps_per_second": 5.732, "step": 5400 }, { "epoch": 1.6294530575472053, "grad_norm": 0.9128186702728271, "learning_rate": 1.4341304222501254e-05, "loss": 2.0253, "step": 5425 }, { "epoch": 1.6369608468786367, "grad_norm": 0.915397584438324, "learning_rate": 1.4331416671612966e-05, "loss": 2.0771, "step": 5450 }, { "epoch": 1.644468636210068, "grad_norm": 0.8913278579711914, "learning_rate": 1.4321458923079216e-05, "loss": 2.0781, "step": 5475 }, { "epoch": 1.6519764255414993, "grad_norm": 1.062047004699707, "learning_rate": 1.431143107922292e-05, "loss": 2.0567, "step": 5500 }, { "epoch": 1.6519764255414993, "eval_loss": 2.057093858718872, "eval_runtime": 245.0447, "eval_samples_per_second": 22.89, "eval_steps_per_second": 5.725, "step": 5500 }, { "epoch": 1.6594842148729305, "grad_norm": 0.8677504658699036, "learning_rate": 1.4301333243087277e-05, "loss": 2.0696, "step": 5525 }, { "epoch": 1.666992004204362, "grad_norm": 0.9853184223175049, "learning_rate": 1.4291165518434707e-05, "loss": 2.0113, "step": 5550 }, { "epoch": 1.6744997935357935, "grad_norm": 0.8988690972328186, "learning_rate": 1.4280928009745786e-05, "loss": 2.0278, "step": 5575 }, { "epoch": 1.6820075828672247, "grad_norm": 0.877238929271698, "learning_rate": 1.4270620822218162e-05, "loss": 2.0231, "step": 5600 }, { "epoch": 1.6820075828672247, "eval_loss": 2.0566163063049316, "eval_runtime": 244.8536, "eval_samples_per_second": 22.908, "eval_steps_per_second": 5.73, "step": 5600 }, { "epoch": 1.689515372198656, "grad_norm": 0.8475340008735657, "learning_rate": 1.4260244061765492e-05, "loss": 2.0667, "step": 5625 }, { "epoch": 1.6970231615300875, "grad_norm": 1.0350947380065918, "learning_rate": 1.4249797835016339e-05, "loss": 2.0482, "step": 5650 }, { "epoch": 1.704530950861519, "grad_norm": 0.9984613656997681, "learning_rate": 1.4239282249313083e-05, "loss": 2.0553, "step": 5675 }, { "epoch": 1.7120387401929502, "grad_norm": 0.8884134888648987, "learning_rate": 1.4228697412710817e-05, "loss": 2.063, "step": 5700 }, { "epoch": 1.7120387401929502, "eval_loss": 2.0545597076416016, "eval_runtime": 244.9412, "eval_samples_per_second": 22.899, "eval_steps_per_second": 5.728, "step": 5700 }, { "epoch": 1.7195465295243815, "grad_norm": 0.8889881372451782, "learning_rate": 1.4218043433976232e-05, "loss": 2.0594, "step": 5725 }, { "epoch": 1.727054318855813, "grad_norm": 0.9351671934127808, "learning_rate": 1.4207320422586511e-05, "loss": 2.0317, "step": 5750 }, { "epoch": 1.7345621081872442, "grad_norm": 0.9845299124717712, "learning_rate": 1.4196528488728189e-05, "loss": 2.0613, "step": 5775 }, { "epoch": 1.7420698975186757, "grad_norm": 1.0036661624908447, "learning_rate": 1.418566774329603e-05, "loss": 2.0203, "step": 5800 }, { "epoch": 1.7420698975186757, "eval_loss": 2.052852153778076, "eval_runtime": 244.7583, "eval_samples_per_second": 22.916, "eval_steps_per_second": 5.732, "step": 5800 }, { "epoch": 1.749577686850107, "grad_norm": 1.1337708234786987, "learning_rate": 1.4174738297891891e-05, "loss": 2.035, "step": 5825 }, { "epoch": 1.7570854761815382, "grad_norm": 0.9224268198013306, "learning_rate": 1.416374026482356e-05, "loss": 2.068, "step": 5850 }, { "epoch": 1.7645932655129697, "grad_norm": 0.8932907581329346, "learning_rate": 1.4152673757103622e-05, "loss": 2.0668, "step": 5875 }, { "epoch": 1.7721010548444012, "grad_norm": 0.9014378786087036, "learning_rate": 1.414153888844828e-05, "loss": 2.0585, "step": 5900 }, { "epoch": 1.7721010548444012, "eval_loss": 2.0522830486297607, "eval_runtime": 244.4651, "eval_samples_per_second": 22.944, "eval_steps_per_second": 5.739, "step": 5900 }, { "epoch": 1.7796088441758324, "grad_norm": 0.9573795795440674, "learning_rate": 1.41303357732762e-05, "loss": 2.0726, "step": 5925 }, { "epoch": 1.7871166335072637, "grad_norm": 1.0068199634552002, "learning_rate": 1.4119064526707325e-05, "loss": 2.0117, "step": 5950 }, { "epoch": 1.7946244228386952, "grad_norm": 0.8137004971504211, "learning_rate": 1.4107725264561694e-05, "loss": 2.0531, "step": 5975 }, { "epoch": 1.8021322121701266, "grad_norm": 0.9432706832885742, "learning_rate": 1.4096318103358264e-05, "loss": 2.0528, "step": 6000 }, { "epoch": 1.8021322121701266, "eval_loss": 2.0512585639953613, "eval_runtime": 244.6438, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 6000 }, { "epoch": 1.809640001501558, "grad_norm": 0.8738940954208374, "learning_rate": 1.4084843160313693e-05, "loss": 2.0486, "step": 6025 }, { "epoch": 1.8171477908329892, "grad_norm": 0.9203903079032898, "learning_rate": 1.407330055334115e-05, "loss": 2.0431, "step": 6050 }, { "epoch": 1.8246555801644204, "grad_norm": 0.8773927688598633, "learning_rate": 1.4061690401049101e-05, "loss": 2.0336, "step": 6075 }, { "epoch": 1.832163369495852, "grad_norm": 1.0781759023666382, "learning_rate": 1.4050012822740082e-05, "loss": 2.0839, "step": 6100 }, { "epoch": 1.832163369495852, "eval_loss": 2.0504093170166016, "eval_runtime": 244.864, "eval_samples_per_second": 22.907, "eval_steps_per_second": 5.73, "step": 6100 }, { "epoch": 1.8396711588272834, "grad_norm": 0.8537021279335022, "learning_rate": 1.4038267938409481e-05, "loss": 2.0394, "step": 6125 }, { "epoch": 1.8471789481587146, "grad_norm": 0.9055094122886658, "learning_rate": 1.4026455868744306e-05, "loss": 2.0267, "step": 6150 }, { "epoch": 1.854686737490146, "grad_norm": 0.8958349227905273, "learning_rate": 1.401457673512194e-05, "loss": 2.0427, "step": 6175 }, { "epoch": 1.8621945268215774, "grad_norm": 0.8849508166313171, "learning_rate": 1.4002630659608895e-05, "loss": 2.0492, "step": 6200 }, { "epoch": 1.8621945268215774, "eval_loss": 2.0487124919891357, "eval_runtime": 244.4909, "eval_samples_per_second": 22.942, "eval_steps_per_second": 5.738, "step": 6200 }, { "epoch": 1.8697023161530089, "grad_norm": 0.9771384000778198, "learning_rate": 1.3990617764959564e-05, "loss": 2.0473, "step": 6225 }, { "epoch": 1.8772101054844401, "grad_norm": 0.9234246611595154, "learning_rate": 1.3978538174614942e-05, "loss": 2.0408, "step": 6250 }, { "epoch": 1.8847178948158714, "grad_norm": 1.0580551624298096, "learning_rate": 1.3966392012701381e-05, "loss": 2.0299, "step": 6275 }, { "epoch": 1.8922256841473029, "grad_norm": 0.8676178455352783, "learning_rate": 1.3954179404029295e-05, "loss": 2.0513, "step": 6300 }, { "epoch": 1.8922256841473029, "eval_loss": 2.0470457077026367, "eval_runtime": 244.6825, "eval_samples_per_second": 22.924, "eval_steps_per_second": 5.734, "step": 6300 }, { "epoch": 1.8997334734787343, "grad_norm": 1.0486456155776978, "learning_rate": 1.3941900474091892e-05, "loss": 2.0646, "step": 6325 }, { "epoch": 1.9072412628101656, "grad_norm": 0.963049054145813, "learning_rate": 1.3929555349063875e-05, "loss": 2.0421, "step": 6350 }, { "epoch": 1.9147490521415969, "grad_norm": 0.9626838564872742, "learning_rate": 1.391714415580015e-05, "loss": 2.0369, "step": 6375 }, { "epoch": 1.922256841473028, "grad_norm": 0.9801763296127319, "learning_rate": 1.3904667021834514e-05, "loss": 2.0114, "step": 6400 }, { "epoch": 1.922256841473028, "eval_loss": 2.046201467514038, "eval_runtime": 244.6721, "eval_samples_per_second": 22.925, "eval_steps_per_second": 5.734, "step": 6400 }, { "epoch": 1.9297646308044596, "grad_norm": 1.0865575075149536, "learning_rate": 1.3892124075378364e-05, "loss": 2.0132, "step": 6425 }, { "epoch": 1.937272420135891, "grad_norm": 0.899895429611206, "learning_rate": 1.3879515445319353e-05, "loss": 2.0412, "step": 6450 }, { "epoch": 1.9447802094673223, "grad_norm": 0.9657663106918335, "learning_rate": 1.3866841261220093e-05, "loss": 2.0367, "step": 6475 }, { "epoch": 1.9522879987987536, "grad_norm": 0.8613144159317017, "learning_rate": 1.3854101653316798e-05, "loss": 2.0456, "step": 6500 }, { "epoch": 1.9522879987987536, "eval_loss": 2.0444774627685547, "eval_runtime": 244.5805, "eval_samples_per_second": 22.933, "eval_steps_per_second": 5.736, "step": 6500 }, { "epoch": 1.959795788130185, "grad_norm": 0.8493949174880981, "learning_rate": 1.3841296752517967e-05, "loss": 2.0617, "step": 6525 }, { "epoch": 1.9673035774616165, "grad_norm": 0.9268197417259216, "learning_rate": 1.3828426690403026e-05, "loss": 2.0502, "step": 6550 }, { "epoch": 1.9748113667930478, "grad_norm": 0.9686461091041565, "learning_rate": 1.3815491599220977e-05, "loss": 2.057, "step": 6575 }, { "epoch": 1.982319156124479, "grad_norm": 0.9616640210151672, "learning_rate": 1.3802491611889048e-05, "loss": 2.0442, "step": 6600 }, { "epoch": 1.982319156124479, "eval_loss": 2.043835401535034, "eval_runtime": 244.9743, "eval_samples_per_second": 22.896, "eval_steps_per_second": 5.727, "step": 6600 }, { "epoch": 1.9898269454559105, "grad_norm": 0.8984593152999878, "learning_rate": 1.3789426861991317e-05, "loss": 2.0366, "step": 6625 }, { "epoch": 1.997334734787342, "grad_norm": 0.8971940875053406, "learning_rate": 1.3776297483777344e-05, "loss": 2.0255, "step": 6650 }, { "epoch": 2.0051052967453735, "grad_norm": 0.9031795859336853, "learning_rate": 1.3763103612160788e-05, "loss": 2.0926, "step": 6675 }, { "epoch": 2.012613086076805, "grad_norm": 0.8842533230781555, "learning_rate": 1.374984538271803e-05, "loss": 2.0172, "step": 6700 }, { "epoch": 2.012613086076805, "eval_loss": 2.0426952838897705, "eval_runtime": 244.4788, "eval_samples_per_second": 22.943, "eval_steps_per_second": 5.739, "step": 6700 }, { "epoch": 2.020120875408236, "grad_norm": 1.008647084236145, "learning_rate": 1.3736522931686765e-05, "loss": 2.0135, "step": 6725 }, { "epoch": 2.0276286647396673, "grad_norm": 1.0014972686767578, "learning_rate": 1.372313639596462e-05, "loss": 2.0175, "step": 6750 }, { "epoch": 2.0351364540710986, "grad_norm": 0.9429395198822021, "learning_rate": 1.3709685913107728e-05, "loss": 2.0228, "step": 6775 }, { "epoch": 2.0426442434025303, "grad_norm": 1.057131052017212, "learning_rate": 1.369617162132933e-05, "loss": 2.0281, "step": 6800 }, { "epoch": 2.0426442434025303, "eval_loss": 2.0424487590789795, "eval_runtime": 244.6503, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 6800 }, { "epoch": 2.0501520327339615, "grad_norm": 0.990040123462677, "learning_rate": 1.3682593659498343e-05, "loss": 2.0111, "step": 6825 }, { "epoch": 2.0576598220653928, "grad_norm": 0.9503148794174194, "learning_rate": 1.3668952167137948e-05, "loss": 2.0273, "step": 6850 }, { "epoch": 2.065167611396824, "grad_norm": 0.9117149710655212, "learning_rate": 1.3655247284424141e-05, "loss": 2.0239, "step": 6875 }, { "epoch": 2.0726754007282557, "grad_norm": 1.0101039409637451, "learning_rate": 1.36414791521843e-05, "loss": 2.0336, "step": 6900 }, { "epoch": 2.0726754007282557, "eval_loss": 2.0416696071624756, "eval_runtime": 245.4111, "eval_samples_per_second": 22.856, "eval_steps_per_second": 5.717, "step": 6900 }, { "epoch": 2.080183190059687, "grad_norm": 0.8587022423744202, "learning_rate": 1.3627647911895737e-05, "loss": 2.0239, "step": 6925 }, { "epoch": 2.0876909793911183, "grad_norm": 0.8640381693840027, "learning_rate": 1.3613753705684241e-05, "loss": 2.0079, "step": 6950 }, { "epoch": 2.0951987687225495, "grad_norm": 0.8698000907897949, "learning_rate": 1.3599796676322627e-05, "loss": 2.0181, "step": 6975 }, { "epoch": 2.102706558053981, "grad_norm": 0.9826030731201172, "learning_rate": 1.3585776967229254e-05, "loss": 2.0165, "step": 7000 }, { "epoch": 2.102706558053981, "eval_loss": 2.0403730869293213, "eval_runtime": 244.4187, "eval_samples_per_second": 22.948, "eval_steps_per_second": 5.74, "step": 7000 }, { "epoch": 2.1102143473854125, "grad_norm": 0.9374090433120728, "learning_rate": 1.3571694722466567e-05, "loss": 2.0125, "step": 7025 }, { "epoch": 2.1177221367168437, "grad_norm": 0.9569231271743774, "learning_rate": 1.3557550086739605e-05, "loss": 2.0426, "step": 7050 }, { "epoch": 2.125229926048275, "grad_norm": 1.0747652053833008, "learning_rate": 1.3543343205394521e-05, "loss": 2.0391, "step": 7075 }, { "epoch": 2.1327377153797062, "grad_norm": 0.9164227247238159, "learning_rate": 1.3529074224417086e-05, "loss": 2.0171, "step": 7100 }, { "epoch": 2.1327377153797062, "eval_loss": 2.0392725467681885, "eval_runtime": 244.3097, "eval_samples_per_second": 22.959, "eval_steps_per_second": 5.743, "step": 7100 }, { "epoch": 2.140245504711138, "grad_norm": 1.2145719528198242, "learning_rate": 1.3514743290431186e-05, "loss": 1.9985, "step": 7125 }, { "epoch": 2.147753294042569, "grad_norm": 1.0173206329345703, "learning_rate": 1.3500350550697316e-05, "loss": 2.0221, "step": 7150 }, { "epoch": 2.1552610833740005, "grad_norm": 1.0180777311325073, "learning_rate": 1.3485896153111076e-05, "loss": 2.0316, "step": 7175 }, { "epoch": 2.1627688727054317, "grad_norm": 0.9768148064613342, "learning_rate": 1.3471380246201637e-05, "loss": 2.0115, "step": 7200 }, { "epoch": 2.1627688727054317, "eval_loss": 2.038167953491211, "eval_runtime": 244.3446, "eval_samples_per_second": 22.955, "eval_steps_per_second": 5.742, "step": 7200 }, { "epoch": 2.1702766620368634, "grad_norm": 1.1061457395553589, "learning_rate": 1.3456802979130227e-05, "loss": 2.0091, "step": 7225 }, { "epoch": 2.1777844513682947, "grad_norm": 1.1214226484298706, "learning_rate": 1.3442164501688593e-05, "loss": 2.0287, "step": 7250 }, { "epoch": 2.185292240699726, "grad_norm": 0.9686478972434998, "learning_rate": 1.342746496429746e-05, "loss": 2.0485, "step": 7275 }, { "epoch": 2.192800030031157, "grad_norm": 0.971811056137085, "learning_rate": 1.3412704518004983e-05, "loss": 2.0011, "step": 7300 }, { "epoch": 2.192800030031157, "eval_loss": 2.0375237464904785, "eval_runtime": 244.4348, "eval_samples_per_second": 22.947, "eval_steps_per_second": 5.74, "step": 7300 }, { "epoch": 2.200307819362589, "grad_norm": 0.9958051443099976, "learning_rate": 1.3397883314485206e-05, "loss": 2.0151, "step": 7325 }, { "epoch": 2.20781560869402, "grad_norm": 0.9805117249488831, "learning_rate": 1.3383001506036497e-05, "loss": 2.012, "step": 7350 }, { "epoch": 2.2153233980254514, "grad_norm": 0.9299209117889404, "learning_rate": 1.3368059245579976e-05, "loss": 2.0226, "step": 7375 }, { "epoch": 2.2228311873568827, "grad_norm": 0.9592748880386353, "learning_rate": 1.3353056686657956e-05, "loss": 2.0256, "step": 7400 }, { "epoch": 2.2228311873568827, "eval_loss": 2.0365006923675537, "eval_runtime": 243.9271, "eval_samples_per_second": 22.995, "eval_steps_per_second": 5.752, "step": 7400 }, { "epoch": 2.230338976688314, "grad_norm": 0.9213986396789551, "learning_rate": 1.3337993983432353e-05, "loss": 2.0179, "step": 7425 }, { "epoch": 2.2378467660197456, "grad_norm": 0.9306337237358093, "learning_rate": 1.3322871290683117e-05, "loss": 2.0189, "step": 7450 }, { "epoch": 2.245354555351177, "grad_norm": 0.9785804152488708, "learning_rate": 1.3307688763806629e-05, "loss": 2.0228, "step": 7475 }, { "epoch": 2.252862344682608, "grad_norm": 0.9108986258506775, "learning_rate": 1.3292446558814106e-05, "loss": 2.0357, "step": 7500 }, { "epoch": 2.252862344682608, "eval_loss": 2.035933494567871, "eval_runtime": 244.2267, "eval_samples_per_second": 22.966, "eval_steps_per_second": 5.745, "step": 7500 }, { "epoch": 2.2603701340140394, "grad_norm": 0.9188127517700195, "learning_rate": 1.3277144832329998e-05, "loss": 2.0241, "step": 7525 }, { "epoch": 2.267877923345471, "grad_norm": 0.9804355502128601, "learning_rate": 1.3261783741590389e-05, "loss": 2.0234, "step": 7550 }, { "epoch": 2.2753857126769024, "grad_norm": 0.9870203137397766, "learning_rate": 1.3246363444441365e-05, "loss": 2.0078, "step": 7575 }, { "epoch": 2.2828935020083336, "grad_norm": 1.1177314519882202, "learning_rate": 1.3230884099337404e-05, "loss": 2.0186, "step": 7600 }, { "epoch": 2.2828935020083336, "eval_loss": 2.035186290740967, "eval_runtime": 244.2073, "eval_samples_per_second": 22.968, "eval_steps_per_second": 5.745, "step": 7600 }, { "epoch": 2.290401291339765, "grad_norm": 0.9781551957130432, "learning_rate": 1.3215345865339738e-05, "loss": 1.9881, "step": 7625 }, { "epoch": 2.2979090806711966, "grad_norm": 1.1340678930282593, "learning_rate": 1.3199748902114734e-05, "loss": 2.0113, "step": 7650 }, { "epoch": 2.305416870002628, "grad_norm": 0.8932919502258301, "learning_rate": 1.3184093369932237e-05, "loss": 2.0349, "step": 7675 }, { "epoch": 2.312924659334059, "grad_norm": 0.9024244546890259, "learning_rate": 1.3168379429663924e-05, "loss": 2.0241, "step": 7700 }, { "epoch": 2.312924659334059, "eval_loss": 2.0337536334991455, "eval_runtime": 243.8773, "eval_samples_per_second": 22.999, "eval_steps_per_second": 5.753, "step": 7700 }, { "epoch": 2.3204324486654904, "grad_norm": 0.9510346055030823, "learning_rate": 1.3152607242781668e-05, "loss": 2.0297, "step": 7725 }, { "epoch": 2.3279402379969216, "grad_norm": 1.004501461982727, "learning_rate": 1.313677697135586e-05, "loss": 2.0276, "step": 7750 }, { "epoch": 2.3354480273283533, "grad_norm": 1.0247652530670166, "learning_rate": 1.312088877805375e-05, "loss": 2.0152, "step": 7775 }, { "epoch": 2.3429558166597846, "grad_norm": 0.9948970675468445, "learning_rate": 1.3104942826137785e-05, "loss": 2.0104, "step": 7800 }, { "epoch": 2.3429558166597846, "eval_loss": 2.032724618911743, "eval_runtime": 244.6368, "eval_samples_per_second": 22.928, "eval_steps_per_second": 5.735, "step": 7800 }, { "epoch": 2.350463605991216, "grad_norm": 1.062002182006836, "learning_rate": 1.3088939279463914e-05, "loss": 2.0329, "step": 7825 }, { "epoch": 2.357971395322647, "grad_norm": 0.9641005396842957, "learning_rate": 1.3072878302479912e-05, "loss": 2.0121, "step": 7850 }, { "epoch": 2.3654791846540784, "grad_norm": 0.9504510164260864, "learning_rate": 1.30567600602237e-05, "loss": 2.0203, "step": 7875 }, { "epoch": 2.37298697398551, "grad_norm": 0.970635712146759, "learning_rate": 1.3040584718321629e-05, "loss": 2.0101, "step": 7900 }, { "epoch": 2.37298697398551, "eval_loss": 2.032496452331543, "eval_runtime": 243.9409, "eval_samples_per_second": 22.993, "eval_steps_per_second": 5.751, "step": 7900 }, { "epoch": 2.3804947633169413, "grad_norm": 0.9251878261566162, "learning_rate": 1.30243524429868e-05, "loss": 2.0166, "step": 7925 }, { "epoch": 2.3880025526483726, "grad_norm": 0.8651822805404663, "learning_rate": 1.300806340101734e-05, "loss": 2.0213, "step": 7950 }, { "epoch": 2.3955103419798043, "grad_norm": 1.0655325651168823, "learning_rate": 1.2991717759794689e-05, "loss": 1.9892, "step": 7975 }, { "epoch": 2.4030181313112355, "grad_norm": 0.8861711621284485, "learning_rate": 1.2975315687281895e-05, "loss": 2.0632, "step": 8000 }, { "epoch": 2.4030181313112355, "eval_loss": 2.031506299972534, "eval_runtime": 244.4184, "eval_samples_per_second": 22.948, "eval_steps_per_second": 5.74, "step": 8000 }, { "epoch": 2.410525920642667, "grad_norm": 1.0595537424087524, "learning_rate": 1.2958857352021873e-05, "loss": 2.0257, "step": 8025 }, { "epoch": 2.418033709974098, "grad_norm": 1.1569972038269043, "learning_rate": 1.2942342923135669e-05, "loss": 2.0165, "step": 8050 }, { "epoch": 2.4255414993055293, "grad_norm": 0.9342359900474548, "learning_rate": 1.2925772570320744e-05, "loss": 2.0085, "step": 8075 }, { "epoch": 2.433049288636961, "grad_norm": 0.9486634731292725, "learning_rate": 1.2909146463849207e-05, "loss": 1.9926, "step": 8100 }, { "epoch": 2.433049288636961, "eval_loss": 2.0305228233337402, "eval_runtime": 244.4927, "eval_samples_per_second": 22.941, "eval_steps_per_second": 5.738, "step": 8100 }, { "epoch": 2.4405570779683923, "grad_norm": 1.04513418674469, "learning_rate": 1.2892464774566082e-05, "loss": 2.0207, "step": 8125 }, { "epoch": 2.4480648672998235, "grad_norm": 1.0375896692276, "learning_rate": 1.2875727673887548e-05, "loss": 2.0299, "step": 8150 }, { "epoch": 2.455572656631255, "grad_norm": 0.8860157132148743, "learning_rate": 1.2858935333799161e-05, "loss": 2.0164, "step": 8175 }, { "epoch": 2.463080445962686, "grad_norm": 0.9642972350120544, "learning_rate": 1.2842087926854117e-05, "loss": 1.9905, "step": 8200 }, { "epoch": 2.463080445962686, "eval_loss": 2.029367208480835, "eval_runtime": 244.4104, "eval_samples_per_second": 22.949, "eval_steps_per_second": 5.74, "step": 8200 }, { "epoch": 2.4705882352941178, "grad_norm": 0.9699326753616333, "learning_rate": 1.282518562617145e-05, "loss": 2.05, "step": 8225 }, { "epoch": 2.478096024625549, "grad_norm": 1.12892746925354, "learning_rate": 1.2808228605434282e-05, "loss": 1.984, "step": 8250 }, { "epoch": 2.4856038139569803, "grad_norm": 0.9147679209709167, "learning_rate": 1.2791217038888008e-05, "loss": 2.0349, "step": 8275 }, { "epoch": 2.493111603288412, "grad_norm": 0.9576278328895569, "learning_rate": 1.2774151101338523e-05, "loss": 2.0547, "step": 8300 }, { "epoch": 2.493111603288412, "eval_loss": 2.0288000106811523, "eval_runtime": 244.145, "eval_samples_per_second": 22.974, "eval_steps_per_second": 5.747, "step": 8300 }, { "epoch": 2.5006193926198432, "grad_norm": 1.0111256837844849, "learning_rate": 1.2757030968150426e-05, "loss": 2.0108, "step": 8325 }, { "epoch": 2.5081271819512745, "grad_norm": 0.8969287276268005, "learning_rate": 1.2739856815245213e-05, "loss": 1.9897, "step": 8350 }, { "epoch": 2.5156349712827057, "grad_norm": 1.02077054977417, "learning_rate": 1.2722628819099472e-05, "loss": 2.0071, "step": 8375 }, { "epoch": 2.523142760614137, "grad_norm": 0.9784366488456726, "learning_rate": 1.2705347156743066e-05, "loss": 2.0018, "step": 8400 }, { "epoch": 2.523142760614137, "eval_loss": 2.027707099914551, "eval_runtime": 244.2262, "eval_samples_per_second": 22.966, "eval_steps_per_second": 5.745, "step": 8400 }, { "epoch": 2.5306505499455687, "grad_norm": 0.9159882664680481, "learning_rate": 1.2688012005757317e-05, "loss": 2.0298, "step": 8425 }, { "epoch": 2.538158339277, "grad_norm": 1.080963373184204, "learning_rate": 1.2670623544273182e-05, "loss": 2.015, "step": 8450 }, { "epoch": 2.5456661286084312, "grad_norm": 0.9042007923126221, "learning_rate": 1.2653181950969418e-05, "loss": 1.9907, "step": 8475 }, { "epoch": 2.5531739179398625, "grad_norm": 0.9830322861671448, "learning_rate": 1.2635687405070755e-05, "loss": 2.015, "step": 8500 }, { "epoch": 2.5531739179398625, "eval_loss": 2.0268571376800537, "eval_runtime": 244.5259, "eval_samples_per_second": 22.938, "eval_steps_per_second": 5.738, "step": 8500 }, { "epoch": 2.5606817072712937, "grad_norm": 0.8969373106956482, "learning_rate": 1.2618842990073232e-05, "loss": 1.985, "step": 8525 }, { "epoch": 2.5681894966027254, "grad_norm": 1.0655286312103271, "learning_rate": 1.2601245179065439e-05, "loss": 2.0409, "step": 8550 }, { "epoch": 2.5756972859341567, "grad_norm": 1.0102958679199219, "learning_rate": 1.2583594949149863e-05, "loss": 2.0358, "step": 8575 }, { "epoch": 2.583205075265588, "grad_norm": 0.9221513271331787, "learning_rate": 1.2565892481695126e-05, "loss": 2.0241, "step": 8600 }, { "epoch": 2.583205075265588, "eval_loss": 2.025696039199829, "eval_runtime": 244.8481, "eval_samples_per_second": 22.908, "eval_steps_per_second": 5.73, "step": 8600 }, { "epoch": 2.5907128645970197, "grad_norm": 1.0198999643325806, "learning_rate": 1.2548137958606616e-05, "loss": 2.0061, "step": 8625 }, { "epoch": 2.598220653928451, "grad_norm": 1.0228906869888306, "learning_rate": 1.2530331562324637e-05, "loss": 2.0183, "step": 8650 }, { "epoch": 2.605728443259882, "grad_norm": 0.9328727126121521, "learning_rate": 1.2512473475822524e-05, "loss": 2.0111, "step": 8675 }, { "epoch": 2.6132362325913134, "grad_norm": 1.0237301588058472, "learning_rate": 1.2494563882604764e-05, "loss": 2.0461, "step": 8700 }, { "epoch": 2.6132362325913134, "eval_loss": 2.025115489959717, "eval_runtime": 244.776, "eval_samples_per_second": 22.915, "eval_steps_per_second": 5.732, "step": 8700 }, { "epoch": 2.6207440219227447, "grad_norm": 1.0419483184814453, "learning_rate": 1.2476602966705117e-05, "loss": 2.0226, "step": 8725 }, { "epoch": 2.6282518112541764, "grad_norm": 1.0212359428405762, "learning_rate": 1.2458590912684718e-05, "loss": 2.0294, "step": 8750 }, { "epoch": 2.6357596005856077, "grad_norm": 0.9352961778640747, "learning_rate": 1.2440527905630174e-05, "loss": 2.0287, "step": 8775 }, { "epoch": 2.643267389917039, "grad_norm": 0.9289619326591492, "learning_rate": 1.2422414131151686e-05, "loss": 1.9629, "step": 8800 }, { "epoch": 2.643267389917039, "eval_loss": 2.023833751678467, "eval_runtime": 244.5795, "eval_samples_per_second": 22.933, "eval_steps_per_second": 5.736, "step": 8800 }, { "epoch": 2.65077517924847, "grad_norm": 1.081150770187378, "learning_rate": 1.2404249775381112e-05, "loss": 2.0166, "step": 8825 }, { "epoch": 2.6582829685799014, "grad_norm": 0.9818612933158875, "learning_rate": 1.2386035024970076e-05, "loss": 2.0314, "step": 8850 }, { "epoch": 2.665790757911333, "grad_norm": 0.9447384476661682, "learning_rate": 1.2367770067088045e-05, "loss": 2.0172, "step": 8875 }, { "epoch": 2.6732985472427644, "grad_norm": 0.9655535817146301, "learning_rate": 1.2349455089420397e-05, "loss": 2.0163, "step": 8900 }, { "epoch": 2.6732985472427644, "eval_loss": 2.0230913162231445, "eval_runtime": 244.504, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 8900 }, { "epoch": 2.6808063365741956, "grad_norm": 1.010567307472229, "learning_rate": 1.2331090280166499e-05, "loss": 2.0132, "step": 8925 }, { "epoch": 2.6883141259056273, "grad_norm": 1.014929175376892, "learning_rate": 1.2312675828037778e-05, "loss": 2.0155, "step": 8950 }, { "epoch": 2.6958219152370586, "grad_norm": 0.9091641902923584, "learning_rate": 1.2294211922255775e-05, "loss": 2.0069, "step": 8975 }, { "epoch": 2.70332970456849, "grad_norm": 1.0267935991287231, "learning_rate": 1.2275698752550196e-05, "loss": 2.0101, "step": 9000 }, { "epoch": 2.70332970456849, "eval_loss": 2.0226101875305176, "eval_runtime": 244.615, "eval_samples_per_second": 22.93, "eval_steps_per_second": 5.736, "step": 9000 }, { "epoch": 2.710837493899921, "grad_norm": 1.147930383682251, "learning_rate": 1.2257136509156978e-05, "loss": 1.9859, "step": 9025 }, { "epoch": 2.7183452832313524, "grad_norm": 1.0729800462722778, "learning_rate": 1.2238525382816322e-05, "loss": 2.0083, "step": 9050 }, { "epoch": 2.725853072562784, "grad_norm": 1.0532081127166748, "learning_rate": 1.2219865564770731e-05, "loss": 2.0317, "step": 9075 }, { "epoch": 2.7333608618942153, "grad_norm": 1.0475471019744873, "learning_rate": 1.2201157246763056e-05, "loss": 2.0117, "step": 9100 }, { "epoch": 2.7333608618942153, "eval_loss": 2.0220327377319336, "eval_runtime": 244.6775, "eval_samples_per_second": 22.924, "eval_steps_per_second": 5.734, "step": 9100 }, { "epoch": 2.7408686512256466, "grad_norm": 0.9435563683509827, "learning_rate": 1.2182400621034513e-05, "loss": 2.0271, "step": 9125 }, { "epoch": 2.748376440557078, "grad_norm": 0.9693319201469421, "learning_rate": 1.2163595880322726e-05, "loss": 2.0162, "step": 9150 }, { "epoch": 2.755884229888509, "grad_norm": 1.0163437128067017, "learning_rate": 1.2144743217859717e-05, "loss": 2.0039, "step": 9175 }, { "epoch": 2.763392019219941, "grad_norm": 0.8770220279693604, "learning_rate": 1.2125842827369955e-05, "loss": 2.0098, "step": 9200 }, { "epoch": 2.763392019219941, "eval_loss": 2.021249771118164, "eval_runtime": 244.5171, "eval_samples_per_second": 22.939, "eval_steps_per_second": 5.738, "step": 9200 }, { "epoch": 2.770899808551372, "grad_norm": 0.9660369753837585, "learning_rate": 1.2106894903068337e-05, "loss": 2.0, "step": 9225 }, { "epoch": 2.7784075978828033, "grad_norm": 1.1277518272399902, "learning_rate": 1.2087899639658208e-05, "loss": 2.0048, "step": 9250 }, { "epoch": 2.785915387214235, "grad_norm": 0.9551436305046082, "learning_rate": 1.2068857232329355e-05, "loss": 1.9856, "step": 9275 }, { "epoch": 2.793423176545666, "grad_norm": 0.9860432744026184, "learning_rate": 1.2049767876756002e-05, "loss": 2.0292, "step": 9300 }, { "epoch": 2.793423176545666, "eval_loss": 2.0205230712890625, "eval_runtime": 244.4184, "eval_samples_per_second": 22.948, "eval_steps_per_second": 5.74, "step": 9300 }, { "epoch": 2.8009309658770976, "grad_norm": 1.023398756980896, "learning_rate": 1.2030631769094799e-05, "loss": 2.0173, "step": 9325 }, { "epoch": 2.808438755208529, "grad_norm": 0.9791613817214966, "learning_rate": 1.2011449105982813e-05, "loss": 2.0237, "step": 9350 }, { "epoch": 2.81594654453996, "grad_norm": 0.9436085224151611, "learning_rate": 1.1992220084535487e-05, "loss": 1.99, "step": 9375 }, { "epoch": 2.8234543338713918, "grad_norm": 0.9325253367424011, "learning_rate": 1.1972944902344646e-05, "loss": 2.0368, "step": 9400 }, { "epoch": 2.8234543338713918, "eval_loss": 2.019615650177002, "eval_runtime": 244.3993, "eval_samples_per_second": 22.95, "eval_steps_per_second": 5.741, "step": 9400 }, { "epoch": 2.830962123202823, "grad_norm": 0.9791749119758606, "learning_rate": 1.1953623757476436e-05, "loss": 2.0055, "step": 9425 }, { "epoch": 2.8384699125342543, "grad_norm": 0.9658190608024597, "learning_rate": 1.1934256848469312e-05, "loss": 2.0166, "step": 9450 }, { "epoch": 2.8459777018656855, "grad_norm": 1.026522159576416, "learning_rate": 1.1914844374331974e-05, "loss": 1.9916, "step": 9475 }, { "epoch": 2.853485491197117, "grad_norm": 1.1535567045211792, "learning_rate": 1.1895386534541354e-05, "loss": 1.9948, "step": 9500 }, { "epoch": 2.853485491197117, "eval_loss": 2.0190258026123047, "eval_runtime": 244.5245, "eval_samples_per_second": 22.938, "eval_steps_per_second": 5.738, "step": 9500 }, { "epoch": 2.8609932805285485, "grad_norm": 0.8700292110443115, "learning_rate": 1.1875883529040534e-05, "loss": 1.9998, "step": 9525 }, { "epoch": 2.8685010698599798, "grad_norm": 1.00760018825531, "learning_rate": 1.1856335558236714e-05, "loss": 2.0286, "step": 9550 }, { "epoch": 2.876008859191411, "grad_norm": 1.0481544733047485, "learning_rate": 1.1836742822999139e-05, "loss": 2.0145, "step": 9575 }, { "epoch": 2.8835166485228423, "grad_norm": 0.9422263503074646, "learning_rate": 1.1817105524657043e-05, "loss": 2.0123, "step": 9600 }, { "epoch": 2.8835166485228423, "eval_loss": 2.018214702606201, "eval_runtime": 244.6614, "eval_samples_per_second": 22.926, "eval_steps_per_second": 5.734, "step": 9600 }, { "epoch": 2.8910244378542735, "grad_norm": 1.012352466583252, "learning_rate": 1.1797423864997577e-05, "loss": 2.0425, "step": 9625 }, { "epoch": 2.8985322271857052, "grad_norm": 1.0469133853912354, "learning_rate": 1.1777698046263735e-05, "loss": 2.0266, "step": 9650 }, { "epoch": 2.9060400165171365, "grad_norm": 1.0227727890014648, "learning_rate": 1.175792827115228e-05, "loss": 2.0272, "step": 9675 }, { "epoch": 2.9135478058485678, "grad_norm": 1.1656129360198975, "learning_rate": 1.1738114742811654e-05, "loss": 1.9813, "step": 9700 }, { "epoch": 2.9135478058485678, "eval_loss": 2.017220973968506, "eval_runtime": 244.7357, "eval_samples_per_second": 22.919, "eval_steps_per_second": 5.733, "step": 9700 }, { "epoch": 2.9210555951799995, "grad_norm": 0.9345014095306396, "learning_rate": 1.1718257664839896e-05, "loss": 1.9932, "step": 9725 }, { "epoch": 2.9285633845114307, "grad_norm": 1.0153813362121582, "learning_rate": 1.1698357241282546e-05, "loss": 2.0216, "step": 9750 }, { "epoch": 2.936071173842862, "grad_norm": 1.0141171216964722, "learning_rate": 1.167841367663056e-05, "loss": 2.0118, "step": 9775 }, { "epoch": 2.9435789631742932, "grad_norm": 1.0706440210342407, "learning_rate": 1.1658427175818184e-05, "loss": 1.9952, "step": 9800 }, { "epoch": 2.9435789631742932, "eval_loss": 2.016911029815674, "eval_runtime": 244.4656, "eval_samples_per_second": 22.944, "eval_steps_per_second": 5.739, "step": 9800 }, { "epoch": 2.9510867525057245, "grad_norm": 0.9770407676696777, "learning_rate": 1.1638397944220876e-05, "loss": 2.0154, "step": 9825 }, { "epoch": 2.958594541837156, "grad_norm": 0.9835750460624695, "learning_rate": 1.1618326187653178e-05, "loss": 2.0186, "step": 9850 }, { "epoch": 2.9661023311685875, "grad_norm": 1.0434762239456177, "learning_rate": 1.1598212112366606e-05, "loss": 1.9859, "step": 9875 }, { "epoch": 2.9736101205000187, "grad_norm": 1.0988759994506836, "learning_rate": 1.1578055925047533e-05, "loss": 2.0024, "step": 9900 }, { "epoch": 2.9736101205000187, "eval_loss": 2.0162084102630615, "eval_runtime": 244.4388, "eval_samples_per_second": 22.946, "eval_steps_per_second": 5.74, "step": 9900 }, { "epoch": 2.98111790983145, "grad_norm": 0.9690369367599487, "learning_rate": 1.1557857832815063e-05, "loss": 2.0261, "step": 9925 }, { "epoch": 2.9886256991628812, "grad_norm": 0.932151198387146, "learning_rate": 1.1537618043218898e-05, "loss": 2.0233, "step": 9950 }, { "epoch": 2.996133488494313, "grad_norm": 1.0118919610977173, "learning_rate": 1.1517336764237217e-05, "loss": 1.981, "step": 9975 }, { "epoch": 3.0039040504523444, "grad_norm": 1.0406084060668945, "learning_rate": 1.1497014204274526e-05, "loss": 2.0523, "step": 10000 }, { "epoch": 3.0039040504523444, "eval_loss": 2.0155766010284424, "eval_runtime": 243.5325, "eval_samples_per_second": 23.032, "eval_steps_per_second": 5.761, "step": 10000 }, { "epoch": 3.0114118397837757, "grad_norm": 1.0300322771072388, "learning_rate": 1.1476650572159522e-05, "loss": 1.9657, "step": 10025 }, { "epoch": 3.018919629115207, "grad_norm": 1.0281704664230347, "learning_rate": 1.1456246077142954e-05, "loss": 1.9883, "step": 10050 }, { "epoch": 3.026427418446638, "grad_norm": 1.0092098712921143, "learning_rate": 1.1435800928895464e-05, "loss": 2.003, "step": 10075 }, { "epoch": 3.03393520777807, "grad_norm": 1.0722483396530151, "learning_rate": 1.1415315337505426e-05, "loss": 1.9913, "step": 10100 }, { "epoch": 3.03393520777807, "eval_loss": 2.0157699584960938, "eval_runtime": 244.4253, "eval_samples_per_second": 22.948, "eval_steps_per_second": 5.74, "step": 10100 }, { "epoch": 3.041442997109501, "grad_norm": 0.9789544939994812, "learning_rate": 1.1394789513476809e-05, "loss": 1.9866, "step": 10125 }, { "epoch": 3.0489507864409324, "grad_norm": 1.0212770700454712, "learning_rate": 1.137422366772699e-05, "loss": 1.976, "step": 10150 }, { "epoch": 3.0564585757723637, "grad_norm": 1.1227072477340698, "learning_rate": 1.1353618011584607e-05, "loss": 1.9816, "step": 10175 }, { "epoch": 3.0639663651037954, "grad_norm": 1.0329065322875977, "learning_rate": 1.1332972756787368e-05, "loss": 1.9773, "step": 10200 }, { "epoch": 3.0639663651037954, "eval_loss": 2.01505708694458, "eval_runtime": 244.0878, "eval_samples_per_second": 22.979, "eval_steps_per_second": 5.748, "step": 10200 }, { "epoch": 3.0714741544352266, "grad_norm": 1.0419589281082153, "learning_rate": 1.1312288115479897e-05, "loss": 1.9966, "step": 10225 }, { "epoch": 3.078981943766658, "grad_norm": 1.0318610668182373, "learning_rate": 1.1291564300211533e-05, "loss": 1.9615, "step": 10250 }, { "epoch": 3.086489733098089, "grad_norm": 1.0802398920059204, "learning_rate": 1.1270801523934156e-05, "loss": 1.9815, "step": 10275 }, { "epoch": 3.0939975224295204, "grad_norm": 1.0594321489334106, "learning_rate": 1.125e-05, "loss": 2.0002, "step": 10300 }, { "epoch": 3.0939975224295204, "eval_loss": 2.0144717693328857, "eval_runtime": 244.0019, "eval_samples_per_second": 22.988, "eval_steps_per_second": 5.75, "step": 10300 }, { "epoch": 3.101505311760952, "grad_norm": 0.8644378781318665, "learning_rate": 1.122915994215946e-05, "loss": 1.9563, "step": 10325 }, { "epoch": 3.1090131010923834, "grad_norm": 1.0262008905410767, "learning_rate": 1.1208281564558895e-05, "loss": 1.9977, "step": 10350 }, { "epoch": 3.1165208904238146, "grad_norm": 1.1098688840866089, "learning_rate": 1.1187365081738422e-05, "loss": 1.9673, "step": 10375 }, { "epoch": 3.124028679755246, "grad_norm": 1.0585020780563354, "learning_rate": 1.1166410708629716e-05, "loss": 1.9967, "step": 10400 }, { "epoch": 3.124028679755246, "eval_loss": 2.014115571975708, "eval_runtime": 244.2712, "eval_samples_per_second": 22.962, "eval_steps_per_second": 5.744, "step": 10400 }, { "epoch": 3.1315364690866776, "grad_norm": 0.9442121386528015, "learning_rate": 1.1145418660553808e-05, "loss": 2.0003, "step": 10425 }, { "epoch": 3.139044258418109, "grad_norm": 1.0891814231872559, "learning_rate": 1.1124389153218861e-05, "loss": 2.0022, "step": 10450 }, { "epoch": 3.14655204774954, "grad_norm": 1.0310977697372437, "learning_rate": 1.1103322402717958e-05, "loss": 1.9881, "step": 10475 }, { "epoch": 3.1540598370809714, "grad_norm": 1.2457115650177002, "learning_rate": 1.1082218625526887e-05, "loss": 1.9545, "step": 10500 }, { "epoch": 3.1540598370809714, "eval_loss": 2.0137479305267334, "eval_runtime": 244.4917, "eval_samples_per_second": 22.941, "eval_steps_per_second": 5.738, "step": 10500 }, { "epoch": 3.161567626412403, "grad_norm": 1.0390257835388184, "learning_rate": 1.1061078038501906e-05, "loss": 1.9965, "step": 10525 }, { "epoch": 3.1690754157438343, "grad_norm": 0.9900075793266296, "learning_rate": 1.1039900858877521e-05, "loss": 2.0066, "step": 10550 }, { "epoch": 3.1765832050752656, "grad_norm": 1.074483871459961, "learning_rate": 1.1018687304264256e-05, "loss": 1.9794, "step": 10575 }, { "epoch": 3.184090994406697, "grad_norm": 0.9264243245124817, "learning_rate": 1.099743759264641e-05, "loss": 1.9793, "step": 10600 }, { "epoch": 3.184090994406697, "eval_loss": 2.013479709625244, "eval_runtime": 244.7217, "eval_samples_per_second": 22.92, "eval_steps_per_second": 5.733, "step": 10600 }, { "epoch": 3.191598783738128, "grad_norm": 1.0158064365386963, "learning_rate": 1.097615194237982e-05, "loss": 1.992, "step": 10625 }, { "epoch": 3.19910657306956, "grad_norm": 1.084500789642334, "learning_rate": 1.0954830572189625e-05, "loss": 1.981, "step": 10650 }, { "epoch": 3.206614362400991, "grad_norm": 1.1871960163116455, "learning_rate": 1.0933473701168006e-05, "loss": 2.0098, "step": 10675 }, { "epoch": 3.2141221517324223, "grad_norm": 1.0174176692962646, "learning_rate": 1.0912081548771941e-05, "loss": 1.9898, "step": 10700 }, { "epoch": 3.2141221517324223, "eval_loss": 2.012505054473877, "eval_runtime": 244.4334, "eval_samples_per_second": 22.947, "eval_steps_per_second": 5.74, "step": 10700 }, { "epoch": 3.2216299410638536, "grad_norm": 1.1954680681228638, "learning_rate": 1.089065433482095e-05, "loss": 1.9965, "step": 10725 }, { "epoch": 3.2291377303952853, "grad_norm": 1.0380609035491943, "learning_rate": 1.0869192279494832e-05, "loss": 2.0142, "step": 10750 }, { "epoch": 3.2366455197267165, "grad_norm": 1.1713154315948486, "learning_rate": 1.0847695603331412e-05, "loss": 2.0032, "step": 10775 }, { "epoch": 3.244153309058148, "grad_norm": 0.9350267648696899, "learning_rate": 1.0826164527224262e-05, "loss": 1.9926, "step": 10800 }, { "epoch": 3.244153309058148, "eval_loss": 2.0120630264282227, "eval_runtime": 244.3746, "eval_samples_per_second": 22.952, "eval_steps_per_second": 5.741, "step": 10800 }, { "epoch": 3.251661098389579, "grad_norm": 1.1291122436523438, "learning_rate": 1.0804599272420443e-05, "loss": 1.9854, "step": 10825 }, { "epoch": 3.2591688877210103, "grad_norm": 0.9929710030555725, "learning_rate": 1.0783000060518225e-05, "loss": 1.9712, "step": 10850 }, { "epoch": 3.266676677052442, "grad_norm": 0.9652737379074097, "learning_rate": 1.076136711346481e-05, "loss": 1.9767, "step": 10875 }, { "epoch": 3.2741844663838733, "grad_norm": 0.9600501656532288, "learning_rate": 1.0739700653554052e-05, "loss": 1.9792, "step": 10900 }, { "epoch": 3.2741844663838733, "eval_loss": 2.0115151405334473, "eval_runtime": 244.8887, "eval_samples_per_second": 22.904, "eval_steps_per_second": 5.729, "step": 10900 }, { "epoch": 3.2816922557153045, "grad_norm": 1.0329478979110718, "learning_rate": 1.0718000903424174e-05, "loss": 1.9961, "step": 10925 }, { "epoch": 3.289200045046736, "grad_norm": 1.1442408561706543, "learning_rate": 1.0696268086055482e-05, "loss": 1.9898, "step": 10950 }, { "epoch": 3.2967078343781675, "grad_norm": 1.0361113548278809, "learning_rate": 1.0674502424768066e-05, "loss": 1.9861, "step": 10975 }, { "epoch": 3.3042156237095988, "grad_norm": 0.997988760471344, "learning_rate": 1.0652704143219519e-05, "loss": 1.99, "step": 11000 }, { "epoch": 3.3042156237095988, "eval_loss": 2.0116584300994873, "eval_runtime": 243.9919, "eval_samples_per_second": 22.988, "eval_steps_per_second": 5.75, "step": 11000 }, { "epoch": 3.31172341304103, "grad_norm": 0.9052268266677856, "learning_rate": 1.0630873465402622e-05, "loss": 1.9942, "step": 11025 }, { "epoch": 3.3192312023724613, "grad_norm": 0.9491928815841675, "learning_rate": 1.0609010615643052e-05, "loss": 2.0145, "step": 11050 }, { "epoch": 3.326738991703893, "grad_norm": 1.0330880880355835, "learning_rate": 1.058711581859708e-05, "loss": 1.992, "step": 11075 }, { "epoch": 3.3342467810353242, "grad_norm": 1.0044811964035034, "learning_rate": 1.0565189299249254e-05, "loss": 2.0099, "step": 11100 }, { "epoch": 3.3342467810353242, "eval_loss": 2.0105700492858887, "eval_runtime": 244.4106, "eval_samples_per_second": 22.949, "eval_steps_per_second": 5.74, "step": 11100 }, { "epoch": 3.3417545703667555, "grad_norm": 1.0180730819702148, "learning_rate": 1.0543231282910093e-05, "loss": 1.9847, "step": 11125 }, { "epoch": 3.3492623596981868, "grad_norm": 1.0637898445129395, "learning_rate": 1.0521241995213771e-05, "loss": 1.9725, "step": 11150 }, { "epoch": 3.356770149029618, "grad_norm": 1.1966840028762817, "learning_rate": 1.049922166211579e-05, "loss": 1.9909, "step": 11175 }, { "epoch": 3.3642779383610497, "grad_norm": 1.0537995100021362, "learning_rate": 1.0477170509890681e-05, "loss": 2.0051, "step": 11200 }, { "epoch": 3.3642779383610497, "eval_loss": 2.0095300674438477, "eval_runtime": 244.5586, "eval_samples_per_second": 22.935, "eval_steps_per_second": 5.737, "step": 11200 }, { "epoch": 3.371785727692481, "grad_norm": 0.9709149599075317, "learning_rate": 1.0455088765129643e-05, "loss": 1.9907, "step": 11225 }, { "epoch": 3.3792935170239122, "grad_norm": 1.1112037897109985, "learning_rate": 1.043297665473825e-05, "loss": 1.9855, "step": 11250 }, { "epoch": 3.3868013063553435, "grad_norm": 0.9346416592597961, "learning_rate": 1.0410834405934099e-05, "loss": 2.0005, "step": 11275 }, { "epoch": 3.394309095686775, "grad_norm": 1.053544044494629, "learning_rate": 1.0388662246244482e-05, "loss": 1.9858, "step": 11300 }, { "epoch": 3.394309095686775, "eval_loss": 2.0087532997131348, "eval_runtime": 244.6298, "eval_samples_per_second": 22.929, "eval_steps_per_second": 5.735, "step": 11300 }, { "epoch": 3.4018168850182064, "grad_norm": 1.0392097234725952, "learning_rate": 1.0366460403504045e-05, "loss": 1.9907, "step": 11325 }, { "epoch": 3.4093246743496377, "grad_norm": 0.9744161367416382, "learning_rate": 1.0344229105852453e-05, "loss": 1.9888, "step": 11350 }, { "epoch": 3.416832463681069, "grad_norm": 1.0045557022094727, "learning_rate": 1.0321968581732035e-05, "loss": 2.0007, "step": 11375 }, { "epoch": 3.4243402530125007, "grad_norm": 1.0795562267303467, "learning_rate": 1.0299679059885441e-05, "loss": 1.9836, "step": 11400 }, { "epoch": 3.4243402530125007, "eval_loss": 2.008427381515503, "eval_runtime": 243.7629, "eval_samples_per_second": 23.01, "eval_steps_per_second": 5.756, "step": 11400 }, { "epoch": 3.431848042343932, "grad_norm": 1.0574262142181396, "learning_rate": 1.0277360769353302e-05, "loss": 1.9968, "step": 11425 }, { "epoch": 3.439355831675363, "grad_norm": 1.0723813772201538, "learning_rate": 1.0255013939471862e-05, "loss": 1.9778, "step": 11450 }, { "epoch": 3.4468636210067944, "grad_norm": 1.0221625566482544, "learning_rate": 1.0232638799870627e-05, "loss": 1.9795, "step": 11475 }, { "epoch": 3.4543714103382257, "grad_norm": 1.0293052196502686, "learning_rate": 1.0210235580470003e-05, "loss": 2.0101, "step": 11500 }, { "epoch": 3.4543714103382257, "eval_loss": 2.008002996444702, "eval_runtime": 244.5192, "eval_samples_per_second": 22.939, "eval_steps_per_second": 5.738, "step": 11500 }, { "epoch": 3.4618791996696574, "grad_norm": 0.9779027700424194, "learning_rate": 1.0187804511478948e-05, "loss": 2.0353, "step": 11525 }, { "epoch": 3.4693869890010887, "grad_norm": 1.3106768131256104, "learning_rate": 1.0165345823392577e-05, "loss": 1.9887, "step": 11550 }, { "epoch": 3.47689477833252, "grad_norm": 1.0175050497055054, "learning_rate": 1.0142859746989822e-05, "loss": 1.9838, "step": 11575 }, { "epoch": 3.484402567663951, "grad_norm": 1.142027735710144, "learning_rate": 1.0120346513331048e-05, "loss": 1.9585, "step": 11600 }, { "epoch": 3.484402567663951, "eval_loss": 2.0071005821228027, "eval_runtime": 244.0492, "eval_samples_per_second": 22.983, "eval_steps_per_second": 5.749, "step": 11600 }, { "epoch": 3.491910356995383, "grad_norm": 1.0209110975265503, "learning_rate": 1.0097806353755675e-05, "loss": 1.9731, "step": 11625 }, { "epoch": 3.499418146326814, "grad_norm": 1.046372413635254, "learning_rate": 1.0075239499879812e-05, "loss": 1.9688, "step": 11650 }, { "epoch": 3.5069259356582454, "grad_norm": 1.227776050567627, "learning_rate": 1.0052646183593868e-05, "loss": 1.9843, "step": 11675 }, { "epoch": 3.5144337249896767, "grad_norm": 1.0463147163391113, "learning_rate": 1.0030026637060175e-05, "loss": 2.0024, "step": 11700 }, { "epoch": 3.5144337249896767, "eval_loss": 2.0066797733306885, "eval_runtime": 243.8922, "eval_samples_per_second": 22.998, "eval_steps_per_second": 5.753, "step": 11700 }, { "epoch": 3.5219415143211084, "grad_norm": 1.0555408000946045, "learning_rate": 1.0007381092710587e-05, "loss": 1.9974, "step": 11725 }, { "epoch": 3.5294493036525396, "grad_norm": 1.007045865058899, "learning_rate": 9.984709783244125e-06, "loss": 2.004, "step": 11750 }, { "epoch": 3.536957092983971, "grad_norm": 1.170345425605774, "learning_rate": 9.962012941624547e-06, "loss": 1.9492, "step": 11775 }, { "epoch": 3.544464882315402, "grad_norm": 1.1506013870239258, "learning_rate": 9.939290801077979e-06, "loss": 1.9908, "step": 11800 }, { "epoch": 3.544464882315402, "eval_loss": 2.0061874389648438, "eval_runtime": 244.205, "eval_samples_per_second": 22.968, "eval_steps_per_second": 5.745, "step": 11800 }, { "epoch": 3.5519726716468334, "grad_norm": 0.9976746439933777, "learning_rate": 9.916543595090514e-06, "loss": 1.995, "step": 11825 }, { "epoch": 3.559480460978265, "grad_norm": 1.0817415714263916, "learning_rate": 9.893771557405803e-06, "loss": 1.9989, "step": 11850 }, { "epoch": 3.5669882503096964, "grad_norm": 0.9880387187004089, "learning_rate": 9.870974922022668e-06, "loss": 1.9706, "step": 11875 }, { "epoch": 3.5744960396411276, "grad_norm": 1.629197120666504, "learning_rate": 9.848153923192681e-06, "loss": 1.9957, "step": 11900 }, { "epoch": 3.5744960396411276, "eval_loss": 2.0057406425476074, "eval_runtime": 244.5085, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 11900 }, { "epoch": 3.582003828972559, "grad_norm": 1.1123307943344116, "learning_rate": 9.825308795417776e-06, "loss": 1.9746, "step": 11925 }, { "epoch": 3.58951161830399, "grad_norm": 1.107917308807373, "learning_rate": 9.802439773447818e-06, "loss": 1.983, "step": 11950 }, { "epoch": 3.597019407635422, "grad_norm": 1.0012487173080444, "learning_rate": 9.779547092278212e-06, "loss": 1.9592, "step": 11975 }, { "epoch": 3.604527196966853, "grad_norm": 0.9805944561958313, "learning_rate": 9.756630987147473e-06, "loss": 1.974, "step": 12000 }, { "epoch": 3.604527196966853, "eval_loss": 2.0051681995391846, "eval_runtime": 244.093, "eval_samples_per_second": 22.979, "eval_steps_per_second": 5.748, "step": 12000 }, { "epoch": 3.6120349862982843, "grad_norm": 0.9973050355911255, "learning_rate": 9.733691693534814e-06, "loss": 2.018, "step": 12025 }, { "epoch": 3.619542775629716, "grad_norm": 1.0701146125793457, "learning_rate": 9.710729447157725e-06, "loss": 1.9395, "step": 12050 }, { "epoch": 3.6270505649611473, "grad_norm": 0.9309558868408203, "learning_rate": 9.687744483969555e-06, "loss": 1.9866, "step": 12075 }, { "epoch": 3.6345583542925786, "grad_norm": 1.1145427227020264, "learning_rate": 9.66473704015708e-06, "loss": 1.9669, "step": 12100 }, { "epoch": 3.6345583542925786, "eval_loss": 2.004288911819458, "eval_runtime": 244.31, "eval_samples_per_second": 22.959, "eval_steps_per_second": 5.743, "step": 12100 }, { "epoch": 3.64206614362401, "grad_norm": 1.0386533737182617, "learning_rate": 9.641707352138083e-06, "loss": 1.9833, "step": 12125 }, { "epoch": 3.649573932955441, "grad_norm": 1.0102437734603882, "learning_rate": 9.618655656558927e-06, "loss": 2.0004, "step": 12150 }, { "epoch": 3.657081722286873, "grad_norm": 1.063219666481018, "learning_rate": 9.595582190292109e-06, "loss": 1.9995, "step": 12175 }, { "epoch": 3.664589511618304, "grad_norm": 1.0717073678970337, "learning_rate": 9.57248719043384e-06, "loss": 1.9995, "step": 12200 }, { "epoch": 3.664589511618304, "eval_loss": 2.0040318965911865, "eval_runtime": 244.4579, "eval_samples_per_second": 22.945, "eval_steps_per_second": 5.739, "step": 12200 }, { "epoch": 3.6720973009497353, "grad_norm": 1.0240517854690552, "learning_rate": 9.549370894301602e-06, "loss": 2.0077, "step": 12225 }, { "epoch": 3.6796050902811666, "grad_norm": 1.0465691089630127, "learning_rate": 9.526233539431713e-06, "loss": 2.0077, "step": 12250 }, { "epoch": 3.687112879612598, "grad_norm": 1.101195216178894, "learning_rate": 9.503075363576889e-06, "loss": 1.99, "step": 12275 }, { "epoch": 3.6946206689440295, "grad_norm": 1.0206913948059082, "learning_rate": 9.479896604703785e-06, "loss": 1.9897, "step": 12300 }, { "epoch": 3.6946206689440295, "eval_loss": 2.003530740737915, "eval_runtime": 244.8327, "eval_samples_per_second": 22.91, "eval_steps_per_second": 5.73, "step": 12300 }, { "epoch": 3.7021284582754608, "grad_norm": 0.9398745894432068, "learning_rate": 9.456697500990571e-06, "loss": 1.9811, "step": 12325 }, { "epoch": 3.709636247606892, "grad_norm": 1.0570793151855469, "learning_rate": 9.433478290824472e-06, "loss": 1.9719, "step": 12350 }, { "epoch": 3.7171440369383237, "grad_norm": 1.0618635416030884, "learning_rate": 9.410239212799315e-06, "loss": 1.9744, "step": 12375 }, { "epoch": 3.724651826269755, "grad_norm": 1.0616377592086792, "learning_rate": 9.387911227877156e-06, "loss": 1.9889, "step": 12400 }, { "epoch": 3.724651826269755, "eval_loss": 2.003262996673584, "eval_runtime": 244.6377, "eval_samples_per_second": 22.928, "eval_steps_per_second": 5.735, "step": 12400 }, { "epoch": 3.7321596156011863, "grad_norm": 1.0657788515090942, "learning_rate": 9.364633901740714e-06, "loss": 1.9712, "step": 12425 }, { "epoch": 3.7396674049326175, "grad_norm": 1.0607733726501465, "learning_rate": 9.341337415170081e-06, "loss": 1.9622, "step": 12450 }, { "epoch": 3.7471751942640488, "grad_norm": 1.1743979454040527, "learning_rate": 9.318022007553162e-06, "loss": 1.9693, "step": 12475 }, { "epoch": 3.7546829835954805, "grad_norm": 1.0691910982131958, "learning_rate": 9.294687918472286e-06, "loss": 1.9865, "step": 12500 }, { "epoch": 3.7546829835954805, "eval_loss": 2.0024280548095703, "eval_runtime": 244.387, "eval_samples_per_second": 22.951, "eval_steps_per_second": 5.741, "step": 12500 }, { "epoch": 3.7621907729269117, "grad_norm": 1.0780701637268066, "learning_rate": 9.271335387701745e-06, "loss": 1.9788, "step": 12525 }, { "epoch": 3.769698562258343, "grad_norm": 1.0889036655426025, "learning_rate": 9.247964655205333e-06, "loss": 2.0001, "step": 12550 }, { "epoch": 3.7772063515897742, "grad_norm": 1.0859447717666626, "learning_rate": 9.224575961133889e-06, "loss": 1.9875, "step": 12575 }, { "epoch": 3.7847141409212055, "grad_norm": 1.1142594814300537, "learning_rate": 9.201169545822806e-06, "loss": 1.9703, "step": 12600 }, { "epoch": 3.7847141409212055, "eval_loss": 2.0022220611572266, "eval_runtime": 244.6481, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 12600 }, { "epoch": 3.792221930252637, "grad_norm": 0.9859952926635742, "learning_rate": 9.177745649789582e-06, "loss": 1.9795, "step": 12625 }, { "epoch": 3.7997297195840685, "grad_norm": 1.0307040214538574, "learning_rate": 9.154304513731345e-06, "loss": 1.9635, "step": 12650 }, { "epoch": 3.8072375089154997, "grad_norm": 1.1140483617782593, "learning_rate": 9.130846378522373e-06, "loss": 1.9709, "step": 12675 }, { "epoch": 3.8147452982469314, "grad_norm": 1.2594614028930664, "learning_rate": 9.107371485211619e-06, "loss": 1.998, "step": 12700 }, { "epoch": 3.8147452982469314, "eval_loss": 2.0013692378997803, "eval_runtime": 244.2752, "eval_samples_per_second": 22.962, "eval_steps_per_second": 5.744, "step": 12700 }, { "epoch": 3.8222530875783627, "grad_norm": 1.0169751644134521, "learning_rate": 9.083880075020243e-06, "loss": 1.9712, "step": 12725 }, { "epoch": 3.829760876909794, "grad_norm": 0.9640651345252991, "learning_rate": 9.060372389339123e-06, "loss": 1.9748, "step": 12750 }, { "epoch": 3.837268666241225, "grad_norm": 1.0947884321212769, "learning_rate": 9.036848669726382e-06, "loss": 1.9854, "step": 12775 }, { "epoch": 3.8447764555726565, "grad_norm": 1.1233420372009277, "learning_rate": 9.013309157904907e-06, "loss": 1.9968, "step": 12800 }, { "epoch": 3.8447764555726565, "eval_loss": 2.001154661178589, "eval_runtime": 244.9198, "eval_samples_per_second": 22.901, "eval_steps_per_second": 5.728, "step": 12800 }, { "epoch": 3.852284244904088, "grad_norm": 0.9935488700866699, "learning_rate": 8.98975409575985e-06, "loss": 1.9756, "step": 12825 }, { "epoch": 3.8597920342355194, "grad_norm": 0.9727908372879028, "learning_rate": 8.966183725336167e-06, "loss": 1.9942, "step": 12850 }, { "epoch": 3.8672998235669507, "grad_norm": 1.1200799942016602, "learning_rate": 8.942598288836103e-06, "loss": 1.9982, "step": 12875 }, { "epoch": 3.874807612898382, "grad_norm": 1.172968864440918, "learning_rate": 8.91899802861673e-06, "loss": 1.9842, "step": 12900 }, { "epoch": 3.874807612898382, "eval_loss": 2.000430107116699, "eval_runtime": 244.7767, "eval_samples_per_second": 22.915, "eval_steps_per_second": 5.732, "step": 12900 }, { "epoch": 3.882315402229813, "grad_norm": 1.1125150918960571, "learning_rate": 8.89538318718744e-06, "loss": 1.9832, "step": 12925 }, { "epoch": 3.889823191561245, "grad_norm": 1.1382113695144653, "learning_rate": 8.871754007207454e-06, "loss": 1.9774, "step": 12950 }, { "epoch": 3.897330980892676, "grad_norm": 1.090171217918396, "learning_rate": 8.848110731483337e-06, "loss": 1.9914, "step": 12975 }, { "epoch": 3.9048387702241074, "grad_norm": 0.9999351501464844, "learning_rate": 8.824453602966493e-06, "loss": 1.9787, "step": 13000 }, { "epoch": 3.9048387702241074, "eval_loss": 2.0002853870391846, "eval_runtime": 244.3984, "eval_samples_per_second": 22.95, "eval_steps_per_second": 5.741, "step": 13000 }, { "epoch": 3.912346559555539, "grad_norm": 1.0934284925460815, "learning_rate": 8.800782864750677e-06, "loss": 1.9817, "step": 13025 }, { "epoch": 3.9198543488869704, "grad_norm": 1.0394964218139648, "learning_rate": 8.777098760069491e-06, "loss": 1.968, "step": 13050 }, { "epoch": 3.9273621382184016, "grad_norm": 1.1079460382461548, "learning_rate": 8.753401532293889e-06, "loss": 1.9757, "step": 13075 }, { "epoch": 3.934869927549833, "grad_norm": 0.9885277152061462, "learning_rate": 8.729691424929671e-06, "loss": 1.9789, "step": 13100 }, { "epoch": 3.934869927549833, "eval_loss": 1.9996843338012695, "eval_runtime": 245.1096, "eval_samples_per_second": 22.884, "eval_steps_per_second": 5.724, "step": 13100 }, { "epoch": 3.942377716881264, "grad_norm": 1.005743145942688, "learning_rate": 8.705968681614985e-06, "loss": 1.9701, "step": 13125 }, { "epoch": 3.949885506212696, "grad_norm": 1.0854625701904297, "learning_rate": 8.682233546117827e-06, "loss": 2.0009, "step": 13150 }, { "epoch": 3.957393295544127, "grad_norm": 0.9378837943077087, "learning_rate": 8.658486262333524e-06, "loss": 1.9618, "step": 13175 }, { "epoch": 3.9649010848755584, "grad_norm": 1.0081528425216675, "learning_rate": 8.63472707428224e-06, "loss": 1.9598, "step": 13200 }, { "epoch": 3.9649010848755584, "eval_loss": 1.9990559816360474, "eval_runtime": 244.3863, "eval_samples_per_second": 22.951, "eval_steps_per_second": 5.741, "step": 13200 }, { "epoch": 3.9724088742069896, "grad_norm": 1.0947321653366089, "learning_rate": 8.61095622610646e-06, "loss": 1.9754, "step": 13225 }, { "epoch": 3.979916663538421, "grad_norm": 1.01126229763031, "learning_rate": 8.587173962068493e-06, "loss": 2.0003, "step": 13250 }, { "epoch": 3.9874244528698526, "grad_norm": 1.0570297241210938, "learning_rate": 8.563380526547944e-06, "loss": 1.9662, "step": 13275 }, { "epoch": 3.994932242201284, "grad_norm": 1.103887677192688, "learning_rate": 8.539576164039218e-06, "loss": 1.9603, "step": 13300 }, { "epoch": 3.994932242201284, "eval_loss": 1.9989780187606812, "eval_runtime": 244.1926, "eval_samples_per_second": 22.97, "eval_steps_per_second": 5.745, "step": 13300 }, { "epoch": 4.002702804159315, "grad_norm": 0.9994622468948364, "learning_rate": 8.515761119149003e-06, "loss": 2.0651, "step": 13325 }, { "epoch": 4.010210593490747, "grad_norm": 1.1002482175827026, "learning_rate": 8.491935636593756e-06, "loss": 1.9639, "step": 13350 }, { "epoch": 4.017718382822178, "grad_norm": 1.1589230298995972, "learning_rate": 8.468099961197186e-06, "loss": 1.9654, "step": 13375 }, { "epoch": 4.02522617215361, "grad_norm": 1.0557494163513184, "learning_rate": 8.444254337887742e-06, "loss": 1.9567, "step": 13400 }, { "epoch": 4.02522617215361, "eval_loss": 1.9992824792861938, "eval_runtime": 244.4365, "eval_samples_per_second": 22.947, "eval_steps_per_second": 5.74, "step": 13400 }, { "epoch": 4.03273396148504, "grad_norm": 1.0956406593322754, "learning_rate": 8.420399011696096e-06, "loss": 1.9574, "step": 13425 }, { "epoch": 4.040241750816472, "grad_norm": 1.314028024673462, "learning_rate": 8.396534227752622e-06, "loss": 1.9599, "step": 13450 }, { "epoch": 4.047749540147904, "grad_norm": 1.048609972000122, "learning_rate": 8.372660231284883e-06, "loss": 1.9483, "step": 13475 }, { "epoch": 4.055257329479335, "grad_norm": 1.119491696357727, "learning_rate": 8.348777267615099e-06, "loss": 1.9838, "step": 13500 }, { "epoch": 4.055257329479335, "eval_loss": 1.998762607574463, "eval_runtime": 244.4149, "eval_samples_per_second": 22.949, "eval_steps_per_second": 5.74, "step": 13500 }, { "epoch": 4.062765118810766, "grad_norm": 1.0003256797790527, "learning_rate": 8.324885582157645e-06, "loss": 1.9629, "step": 13525 }, { "epoch": 4.070272908142197, "grad_norm": 1.059667706489563, "learning_rate": 8.300985420416509e-06, "loss": 1.9866, "step": 13550 }, { "epoch": 4.077780697473629, "grad_norm": 1.1236132383346558, "learning_rate": 8.277077027982787e-06, "loss": 1.9787, "step": 13575 }, { "epoch": 4.0852884868050605, "grad_norm": 1.0514492988586426, "learning_rate": 8.253160650532144e-06, "loss": 1.9829, "step": 13600 }, { "epoch": 4.0852884868050605, "eval_loss": 1.9986952543258667, "eval_runtime": 245.1032, "eval_samples_per_second": 22.884, "eval_steps_per_second": 5.724, "step": 13600 }, { "epoch": 4.092796276136491, "grad_norm": 1.0734481811523438, "learning_rate": 8.2292365338223e-06, "loss": 1.9832, "step": 13625 }, { "epoch": 4.100304065467923, "grad_norm": 1.0448415279388428, "learning_rate": 8.205304923690505e-06, "loss": 1.9827, "step": 13650 }, { "epoch": 4.107811854799355, "grad_norm": 1.1534922122955322, "learning_rate": 8.181366066051e-06, "loss": 1.9398, "step": 13675 }, { "epoch": 4.1153196441307855, "grad_norm": 1.0893254280090332, "learning_rate": 8.157420206892509e-06, "loss": 1.9696, "step": 13700 }, { "epoch": 4.1153196441307855, "eval_loss": 1.9981467723846436, "eval_runtime": 244.0215, "eval_samples_per_second": 22.986, "eval_steps_per_second": 5.749, "step": 13700 }, { "epoch": 4.122827433462217, "grad_norm": 1.1225614547729492, "learning_rate": 8.133467592275697e-06, "loss": 1.9785, "step": 13725 }, { "epoch": 4.130335222793648, "grad_norm": 1.1276017427444458, "learning_rate": 8.109508468330643e-06, "loss": 1.9679, "step": 13750 }, { "epoch": 4.13784301212508, "grad_norm": 1.0437787771224976, "learning_rate": 8.08554308125432e-06, "loss": 1.9794, "step": 13775 }, { "epoch": 4.1453508014565115, "grad_norm": 1.1491374969482422, "learning_rate": 8.061571677308061e-06, "loss": 1.9575, "step": 13800 }, { "epoch": 4.1453508014565115, "eval_loss": 1.9976245164871216, "eval_runtime": 244.1266, "eval_samples_per_second": 22.976, "eval_steps_per_second": 5.747, "step": 13800 }, { "epoch": 4.152858590787942, "grad_norm": 1.140905499458313, "learning_rate": 8.037594502815015e-06, "loss": 1.9591, "step": 13825 }, { "epoch": 4.160366380119374, "grad_norm": 0.9632274508476257, "learning_rate": 8.013611804157636e-06, "loss": 1.9593, "step": 13850 }, { "epoch": 4.167874169450805, "grad_norm": 1.1178561449050903, "learning_rate": 7.989623827775142e-06, "loss": 1.9729, "step": 13875 }, { "epoch": 4.1753819587822365, "grad_norm": 1.068928837776184, "learning_rate": 7.965630820160984e-06, "loss": 1.9359, "step": 13900 }, { "epoch": 4.1753819587822365, "eval_loss": 1.9976770877838135, "eval_runtime": 244.3884, "eval_samples_per_second": 22.951, "eval_steps_per_second": 5.741, "step": 13900 }, { "epoch": 4.182889748113668, "grad_norm": 1.0295666456222534, "learning_rate": 7.941633027860312e-06, "loss": 1.9739, "step": 13925 }, { "epoch": 4.190397537445099, "grad_norm": 1.0357112884521484, "learning_rate": 7.917630697467438e-06, "loss": 1.9554, "step": 13950 }, { "epoch": 4.197905326776531, "grad_norm": 1.0465984344482422, "learning_rate": 7.893624075623312e-06, "loss": 1.9688, "step": 13975 }, { "epoch": 4.205413116107962, "grad_norm": 1.0274240970611572, "learning_rate": 7.869613409012976e-06, "loss": 1.9705, "step": 14000 }, { "epoch": 4.205413116107962, "eval_loss": 1.9968942403793335, "eval_runtime": 244.9157, "eval_samples_per_second": 22.902, "eval_steps_per_second": 5.729, "step": 14000 }, { "epoch": 4.212920905439393, "grad_norm": 0.9973297119140625, "learning_rate": 7.845598944363041e-06, "loss": 1.9775, "step": 14025 }, { "epoch": 4.220428694770825, "grad_norm": 1.0587254762649536, "learning_rate": 7.821580928439141e-06, "loss": 1.9808, "step": 14050 }, { "epoch": 4.227936484102256, "grad_norm": 1.1307932138442993, "learning_rate": 7.797559608043403e-06, "loss": 1.9646, "step": 14075 }, { "epoch": 4.2354442734336875, "grad_norm": 1.0376613140106201, "learning_rate": 7.773535230011909e-06, "loss": 1.961, "step": 14100 }, { "epoch": 4.2354442734336875, "eval_loss": 1.9972692728042603, "eval_runtime": 244.4264, "eval_samples_per_second": 22.948, "eval_steps_per_second": 5.74, "step": 14100 }, { "epoch": 4.242952062765119, "grad_norm": 1.0353500843048096, "learning_rate": 7.749508041212167e-06, "loss": 1.9881, "step": 14125 }, { "epoch": 4.25045985209655, "grad_norm": 1.191989541053772, "learning_rate": 7.725478288540554e-06, "loss": 1.9307, "step": 14150 }, { "epoch": 4.257967641427982, "grad_norm": 1.1267927885055542, "learning_rate": 7.701446218919805e-06, "loss": 1.9837, "step": 14175 }, { "epoch": 4.2654754307594125, "grad_norm": 1.103934407234192, "learning_rate": 7.677412079296458e-06, "loss": 1.9557, "step": 14200 }, { "epoch": 4.2654754307594125, "eval_loss": 1.9968904256820679, "eval_runtime": 244.788, "eval_samples_per_second": 22.914, "eval_steps_per_second": 5.731, "step": 14200 }, { "epoch": 4.272983220090844, "grad_norm": 1.1149851083755493, "learning_rate": 7.653376116638324e-06, "loss": 1.9573, "step": 14225 }, { "epoch": 4.280491009422276, "grad_norm": 1.2663904428482056, "learning_rate": 7.629338577931943e-06, "loss": 1.9652, "step": 14250 }, { "epoch": 4.287998798753707, "grad_norm": 1.1402429342269897, "learning_rate": 7.605299710180056e-06, "loss": 1.9834, "step": 14275 }, { "epoch": 4.295506588085138, "grad_norm": 1.1735416650772095, "learning_rate": 7.581259760399059e-06, "loss": 1.9743, "step": 14300 }, { "epoch": 4.295506588085138, "eval_loss": 1.9964790344238281, "eval_runtime": 244.5619, "eval_samples_per_second": 22.935, "eval_steps_per_second": 5.737, "step": 14300 }, { "epoch": 4.30301437741657, "grad_norm": 1.0733146667480469, "learning_rate": 7.557218975616456e-06, "loss": 1.9297, "step": 14325 }, { "epoch": 4.310522166748001, "grad_norm": 1.0636229515075684, "learning_rate": 7.5331776028683485e-06, "loss": 2.0013, "step": 14350 }, { "epoch": 4.318029956079433, "grad_norm": 1.0287854671478271, "learning_rate": 7.509135889196871e-06, "loss": 1.9394, "step": 14375 }, { "epoch": 4.325537745410863, "grad_norm": 1.2089693546295166, "learning_rate": 7.485094081647659e-06, "loss": 1.9651, "step": 14400 }, { "epoch": 4.325537745410863, "eval_loss": 1.9961069822311401, "eval_runtime": 244.3299, "eval_samples_per_second": 22.957, "eval_steps_per_second": 5.742, "step": 14400 }, { "epoch": 4.333045534742295, "grad_norm": 1.0768557786941528, "learning_rate": 7.461052427267318e-06, "loss": 1.9671, "step": 14425 }, { "epoch": 4.340553324073727, "grad_norm": 1.1563024520874023, "learning_rate": 7.437011173100874e-06, "loss": 1.9492, "step": 14450 }, { "epoch": 4.348061113405158, "grad_norm": 1.1290167570114136, "learning_rate": 7.412970566189248e-06, "loss": 1.9858, "step": 14475 }, { "epoch": 4.355568902736589, "grad_norm": 1.0945930480957031, "learning_rate": 7.388930853566703e-06, "loss": 1.9662, "step": 14500 }, { "epoch": 4.355568902736589, "eval_loss": 1.9953595399856567, "eval_runtime": 244.4122, "eval_samples_per_second": 22.949, "eval_steps_per_second": 5.74, "step": 14500 }, { "epoch": 4.36307669206802, "grad_norm": 1.0695611238479614, "learning_rate": 7.364892282258315e-06, "loss": 1.947, "step": 14525 }, { "epoch": 4.370584481399452, "grad_norm": 1.0597783327102661, "learning_rate": 7.340855099277433e-06, "loss": 1.9644, "step": 14550 }, { "epoch": 4.378092270730884, "grad_norm": 1.0378893613815308, "learning_rate": 7.3168195516231395e-06, "loss": 1.9737, "step": 14575 }, { "epoch": 4.385600060062314, "grad_norm": 1.2585569620132446, "learning_rate": 7.2937471936532264e-06, "loss": 1.9779, "step": 14600 }, { "epoch": 4.385600060062314, "eval_loss": 1.9953750371932983, "eval_runtime": 244.2809, "eval_samples_per_second": 22.961, "eval_steps_per_second": 5.743, "step": 14600 }, { "epoch": 4.393107849393746, "grad_norm": 1.064031958580017, "learning_rate": 7.269715567667308e-06, "loss": 1.9663, "step": 14625 }, { "epoch": 4.400615638725178, "grad_norm": 1.1410213708877563, "learning_rate": 7.245686308017058e-06, "loss": 1.9573, "step": 14650 }, { "epoch": 4.408123428056609, "grad_norm": 1.088382601737976, "learning_rate": 7.221659661620141e-06, "loss": 1.9772, "step": 14675 }, { "epoch": 4.41563121738804, "grad_norm": 0.994836151599884, "learning_rate": 7.197635875367368e-06, "loss": 1.9703, "step": 14700 }, { "epoch": 4.41563121738804, "eval_loss": 1.9953012466430664, "eval_runtime": 244.5139, "eval_samples_per_second": 22.939, "eval_steps_per_second": 5.738, "step": 14700 }, { "epoch": 4.423139006719471, "grad_norm": 1.0412800312042236, "learning_rate": 7.173615196120162e-06, "loss": 1.9413, "step": 14725 }, { "epoch": 4.430646796050903, "grad_norm": 1.159559726715088, "learning_rate": 7.149597870708011e-06, "loss": 2.0046, "step": 14750 }, { "epoch": 4.4381545853823345, "grad_norm": 1.045264482498169, "learning_rate": 7.12558414592596e-06, "loss": 1.9684, "step": 14775 }, { "epoch": 4.445662374713765, "grad_norm": 1.1119288206100464, "learning_rate": 7.1015742685320326e-06, "loss": 1.9649, "step": 14800 }, { "epoch": 4.445662374713765, "eval_loss": 1.9939073324203491, "eval_runtime": 244.3543, "eval_samples_per_second": 22.954, "eval_steps_per_second": 5.742, "step": 14800 }, { "epoch": 4.453170164045197, "grad_norm": 1.2049273252487183, "learning_rate": 7.077568485244728e-06, "loss": 1.9586, "step": 14825 }, { "epoch": 4.460677953376628, "grad_norm": 1.0386916399002075, "learning_rate": 7.053567042740475e-06, "loss": 1.9811, "step": 14850 }, { "epoch": 4.46818574270806, "grad_norm": 1.0895438194274902, "learning_rate": 7.029570187651096e-06, "loss": 1.9829, "step": 14875 }, { "epoch": 4.475693532039491, "grad_norm": 1.1542959213256836, "learning_rate": 7.005578166561275e-06, "loss": 1.9678, "step": 14900 }, { "epoch": 4.475693532039491, "eval_loss": 1.9941613674163818, "eval_runtime": 244.7416, "eval_samples_per_second": 22.918, "eval_steps_per_second": 5.733, "step": 14900 }, { "epoch": 4.483201321370922, "grad_norm": 1.0670243501663208, "learning_rate": 6.9815912260060295e-06, "loss": 1.9542, "step": 14925 }, { "epoch": 4.490709110702354, "grad_norm": 1.1406601667404175, "learning_rate": 6.95760961246816e-06, "loss": 1.9947, "step": 14950 }, { "epoch": 4.4982169000337855, "grad_norm": 1.1366952657699585, "learning_rate": 6.933633572375736e-06, "loss": 1.9659, "step": 14975 }, { "epoch": 4.505724689365216, "grad_norm": 1.0811400413513184, "learning_rate": 6.909663352099552e-06, "loss": 1.9442, "step": 15000 }, { "epoch": 4.505724689365216, "eval_loss": 1.993889331817627, "eval_runtime": 244.6905, "eval_samples_per_second": 22.923, "eval_steps_per_second": 5.734, "step": 15000 }, { "epoch": 4.513232478696648, "grad_norm": 1.013418436050415, "learning_rate": 6.885699197950602e-06, "loss": 1.9702, "step": 15025 }, { "epoch": 4.520740268028079, "grad_norm": 1.097463846206665, "learning_rate": 6.86174135617754e-06, "loss": 1.9547, "step": 15050 }, { "epoch": 4.5282480573595105, "grad_norm": 1.1067347526550293, "learning_rate": 6.83779007296417e-06, "loss": 1.9772, "step": 15075 }, { "epoch": 4.535755846690942, "grad_norm": 1.0753045082092285, "learning_rate": 6.813845594426891e-06, "loss": 1.9522, "step": 15100 }, { "epoch": 4.535755846690942, "eval_loss": 1.9931405782699585, "eval_runtime": 244.8392, "eval_samples_per_second": 22.909, "eval_steps_per_second": 5.73, "step": 15100 }, { "epoch": 4.543263636022373, "grad_norm": 1.0194171667099, "learning_rate": 6.789908166612178e-06, "loss": 1.9643, "step": 15125 }, { "epoch": 4.550771425353805, "grad_norm": 1.123500108718872, "learning_rate": 6.76597803549406e-06, "loss": 1.954, "step": 15150 }, { "epoch": 4.558279214685236, "grad_norm": 1.1514633893966675, "learning_rate": 6.742055446971586e-06, "loss": 1.954, "step": 15175 }, { "epoch": 4.565787004016667, "grad_norm": 1.1776665449142456, "learning_rate": 6.718140646866296e-06, "loss": 1.9539, "step": 15200 }, { "epoch": 4.565787004016667, "eval_loss": 1.9931755065917969, "eval_runtime": 243.9594, "eval_samples_per_second": 22.992, "eval_steps_per_second": 5.751, "step": 15200 }, { "epoch": 4.573294793348099, "grad_norm": 1.1815805435180664, "learning_rate": 6.694233880919708e-06, "loss": 1.9478, "step": 15225 }, { "epoch": 4.58080258267953, "grad_norm": 1.0977429151535034, "learning_rate": 6.670335394790772e-06, "loss": 1.947, "step": 15250 }, { "epoch": 4.5883103720109615, "grad_norm": 1.1538454294204712, "learning_rate": 6.6464454340533655e-06, "loss": 1.9462, "step": 15275 }, { "epoch": 4.595818161342393, "grad_norm": 1.1371299028396606, "learning_rate": 6.622564244193754e-06, "loss": 1.9586, "step": 15300 }, { "epoch": 4.595818161342393, "eval_loss": 1.9928078651428223, "eval_runtime": 244.396, "eval_samples_per_second": 22.95, "eval_steps_per_second": 5.741, "step": 15300 }, { "epoch": 4.603325950673824, "grad_norm": 1.1348552703857422, "learning_rate": 6.598692070608083e-06, "loss": 1.9509, "step": 15325 }, { "epoch": 4.610833740005256, "grad_norm": 0.9622187614440918, "learning_rate": 6.5748291585998436e-06, "loss": 1.9359, "step": 15350 }, { "epoch": 4.6183415293366865, "grad_norm": 0.9866182208061218, "learning_rate": 6.55097575337736e-06, "loss": 1.9664, "step": 15375 }, { "epoch": 4.625849318668118, "grad_norm": 1.1888655424118042, "learning_rate": 6.5271321000512715e-06, "loss": 1.9483, "step": 15400 }, { "epoch": 4.625849318668118, "eval_loss": 1.9925552606582642, "eval_runtime": 244.5029, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 15400 }, { "epoch": 4.63335710799955, "grad_norm": 1.1576796770095825, "learning_rate": 6.503298443632006e-06, "loss": 1.9494, "step": 15425 }, { "epoch": 4.640864897330981, "grad_norm": 1.1134017705917358, "learning_rate": 6.479475029027266e-06, "loss": 1.9282, "step": 15450 }, { "epoch": 4.648372686662412, "grad_norm": 1.2257720232009888, "learning_rate": 6.45566210103951e-06, "loss": 1.9648, "step": 15475 }, { "epoch": 4.655880475993843, "grad_norm": 1.1228723526000977, "learning_rate": 6.431859904363441e-06, "loss": 1.9436, "step": 15500 }, { "epoch": 4.655880475993843, "eval_loss": 1.9922431707382202, "eval_runtime": 244.1596, "eval_samples_per_second": 22.973, "eval_steps_per_second": 5.746, "step": 15500 }, { "epoch": 4.663388265325275, "grad_norm": 1.1629343032836914, "learning_rate": 6.40806868358349e-06, "loss": 1.9939, "step": 15525 }, { "epoch": 4.670896054656707, "grad_norm": 1.229765772819519, "learning_rate": 6.38428868317131e-06, "loss": 1.9469, "step": 15550 }, { "epoch": 4.6784038439881375, "grad_norm": 1.10246741771698, "learning_rate": 6.360520147483243e-06, "loss": 1.97, "step": 15575 }, { "epoch": 4.685911633319569, "grad_norm": 1.0727444887161255, "learning_rate": 6.336763320757837e-06, "loss": 1.9598, "step": 15600 }, { "epoch": 4.685911633319569, "eval_loss": 1.9915326833724976, "eval_runtime": 244.8129, "eval_samples_per_second": 22.911, "eval_steps_per_second": 5.731, "step": 15600 }, { "epoch": 4.693419422651001, "grad_norm": 1.1504484415054321, "learning_rate": 6.313018447113308e-06, "loss": 2.0044, "step": 15625 }, { "epoch": 4.700927211982432, "grad_norm": 1.2329007387161255, "learning_rate": 6.289285770545056e-06, "loss": 1.9718, "step": 15650 }, { "epoch": 4.708435001313863, "grad_norm": 1.12412691116333, "learning_rate": 6.265565534923142e-06, "loss": 1.9716, "step": 15675 }, { "epoch": 4.715942790645294, "grad_norm": 1.1599684953689575, "learning_rate": 6.241857983989794e-06, "loss": 1.9562, "step": 15700 }, { "epoch": 4.715942790645294, "eval_loss": 1.9914188385009766, "eval_runtime": 244.1223, "eval_samples_per_second": 22.976, "eval_steps_per_second": 5.747, "step": 15700 }, { "epoch": 4.723450579976726, "grad_norm": 1.0815508365631104, "learning_rate": 6.21816336135689e-06, "loss": 1.9537, "step": 15725 }, { "epoch": 4.730958369308157, "grad_norm": 1.1671316623687744, "learning_rate": 6.1944819105034615e-06, "loss": 1.94, "step": 15750 }, { "epoch": 4.738466158639588, "grad_norm": 1.0050816535949707, "learning_rate": 6.170813874773193e-06, "loss": 1.9701, "step": 15775 }, { "epoch": 4.74597394797102, "grad_norm": 1.134464979171753, "learning_rate": 6.1471594973719145e-06, "loss": 1.9671, "step": 15800 }, { "epoch": 4.74597394797102, "eval_loss": 1.9911348819732666, "eval_runtime": 244.1165, "eval_samples_per_second": 22.977, "eval_steps_per_second": 5.747, "step": 15800 }, { "epoch": 4.753481737302451, "grad_norm": 1.177161455154419, "learning_rate": 6.123519021365107e-06, "loss": 1.9476, "step": 15825 }, { "epoch": 4.760989526633883, "grad_norm": 0.998101532459259, "learning_rate": 6.099892689675414e-06, "loss": 1.9599, "step": 15850 }, { "epoch": 4.768497315965314, "grad_norm": 1.0538263320922852, "learning_rate": 6.076280745080128e-06, "loss": 2.0034, "step": 15875 }, { "epoch": 4.776005105296745, "grad_norm": 1.2193068265914917, "learning_rate": 6.0526834302087054e-06, "loss": 1.9526, "step": 15900 }, { "epoch": 4.776005105296745, "eval_loss": 1.9908709526062012, "eval_runtime": 244.0685, "eval_samples_per_second": 22.981, "eval_steps_per_second": 5.748, "step": 15900 }, { "epoch": 4.783512894628177, "grad_norm": 1.0334097146987915, "learning_rate": 6.0291009875402705e-06, "loss": 1.9999, "step": 15925 }, { "epoch": 4.791020683959609, "grad_norm": 1.173577904701233, "learning_rate": 6.005533659401131e-06, "loss": 1.9886, "step": 15950 }, { "epoch": 4.798528473291039, "grad_norm": 1.270085334777832, "learning_rate": 5.98198168796227e-06, "loss": 1.9726, "step": 15975 }, { "epoch": 4.806036262622471, "grad_norm": 1.2580983638763428, "learning_rate": 5.958445315236885e-06, "loss": 1.9382, "step": 16000 }, { "epoch": 4.806036262622471, "eval_loss": 1.9913830757141113, "eval_runtime": 244.081, "eval_samples_per_second": 22.98, "eval_steps_per_second": 5.748, "step": 16000 }, { "epoch": 4.813544051953902, "grad_norm": 1.080772042274475, "learning_rate": 5.934924783077876e-06, "loss": 1.9402, "step": 16025 }, { "epoch": 4.821051841285334, "grad_norm": 1.1412278413772583, "learning_rate": 5.911420333175371e-06, "loss": 1.9609, "step": 16050 }, { "epoch": 4.828559630616764, "grad_norm": 1.0110164880752563, "learning_rate": 5.887932207054245e-06, "loss": 1.9922, "step": 16075 }, { "epoch": 4.836067419948196, "grad_norm": 1.1360834836959839, "learning_rate": 5.864460646071631e-06, "loss": 2.0002, "step": 16100 }, { "epoch": 4.836067419948196, "eval_loss": 1.9903969764709473, "eval_runtime": 244.6106, "eval_samples_per_second": 22.93, "eval_steps_per_second": 5.736, "step": 16100 }, { "epoch": 4.843575209279628, "grad_norm": 1.163109302520752, "learning_rate": 5.841005891414443e-06, "loss": 1.9692, "step": 16125 }, { "epoch": 4.851082998611059, "grad_norm": 1.1313296556472778, "learning_rate": 5.817568184096897e-06, "loss": 1.9648, "step": 16150 }, { "epoch": 4.85859078794249, "grad_norm": 1.1544945240020752, "learning_rate": 5.794147764958046e-06, "loss": 1.9696, "step": 16175 }, { "epoch": 4.866098577273922, "grad_norm": 1.0399378538131714, "learning_rate": 5.770744874659283e-06, "loss": 1.9396, "step": 16200 }, { "epoch": 4.866098577273922, "eval_loss": 1.9903674125671387, "eval_runtime": 244.9102, "eval_samples_per_second": 22.902, "eval_steps_per_second": 5.729, "step": 16200 }, { "epoch": 4.873606366605353, "grad_norm": 1.189995288848877, "learning_rate": 5.747359753681883e-06, "loss": 1.9542, "step": 16225 }, { "epoch": 4.8811141559367845, "grad_norm": 1.0425307750701904, "learning_rate": 5.7239926423245305e-06, "loss": 1.9764, "step": 16250 }, { "epoch": 4.888621945268216, "grad_norm": 1.1978663206100464, "learning_rate": 5.700643780700849e-06, "loss": 1.9624, "step": 16275 }, { "epoch": 4.896129734599647, "grad_norm": 1.045249104499817, "learning_rate": 5.677313408736924e-06, "loss": 1.9709, "step": 16300 }, { "epoch": 4.896129734599647, "eval_loss": 1.9895341396331787, "eval_runtime": 244.445, "eval_samples_per_second": 22.946, "eval_steps_per_second": 5.74, "step": 16300 }, { "epoch": 4.903637523931079, "grad_norm": 1.119350790977478, "learning_rate": 5.654001766168861e-06, "loss": 1.9712, "step": 16325 }, { "epoch": 4.91114531326251, "grad_norm": 1.090303897857666, "learning_rate": 5.630709092540301e-06, "loss": 1.9269, "step": 16350 }, { "epoch": 4.918653102593941, "grad_norm": 1.2654612064361572, "learning_rate": 5.607435627199961e-06, "loss": 1.9468, "step": 16375 }, { "epoch": 4.926160891925372, "grad_norm": 1.0900917053222656, "learning_rate": 5.584181609299187e-06, "loss": 1.9574, "step": 16400 }, { "epoch": 4.926160891925372, "eval_loss": 1.989732265472412, "eval_runtime": 244.1216, "eval_samples_per_second": 22.976, "eval_steps_per_second": 5.747, "step": 16400 }, { "epoch": 4.933668681256804, "grad_norm": 1.192901372909546, "learning_rate": 5.560947277789483e-06, "loss": 1.928, "step": 16425 }, { "epoch": 4.9411764705882355, "grad_norm": 1.0490167140960693, "learning_rate": 5.537732871420064e-06, "loss": 1.9452, "step": 16450 }, { "epoch": 4.948684259919666, "grad_norm": 1.0243791341781616, "learning_rate": 5.514538628735402e-06, "loss": 1.9646, "step": 16475 }, { "epoch": 4.956192049251098, "grad_norm": 1.091910481452942, "learning_rate": 5.491364788072769e-06, "loss": 1.982, "step": 16500 }, { "epoch": 4.956192049251098, "eval_loss": 1.9894477128982544, "eval_runtime": 244.6192, "eval_samples_per_second": 22.93, "eval_steps_per_second": 5.735, "step": 16500 }, { "epoch": 4.96369983858253, "grad_norm": 1.1949421167373657, "learning_rate": 5.468211587559794e-06, "loss": 1.9528, "step": 16525 }, { "epoch": 4.9712076279139605, "grad_norm": 1.0508161783218384, "learning_rate": 5.445079265112013e-06, "loss": 1.9485, "step": 16550 }, { "epoch": 4.978715417245392, "grad_norm": 1.2194594144821167, "learning_rate": 5.421968058430424e-06, "loss": 1.9324, "step": 16575 }, { "epoch": 4.986223206576824, "grad_norm": 1.1732969284057617, "learning_rate": 5.398878204999047e-06, "loss": 1.9588, "step": 16600 }, { "epoch": 4.986223206576824, "eval_loss": 1.9885412454605103, "eval_runtime": 244.0337, "eval_samples_per_second": 22.985, "eval_steps_per_second": 5.749, "step": 16600 }, { "epoch": 4.993730995908255, "grad_norm": 1.142801284790039, "learning_rate": 5.375809942082486e-06, "loss": 1.969, "step": 16625 }, { "epoch": 5.001501557866287, "grad_norm": 1.1804615259170532, "learning_rate": 5.35276350672348e-06, "loss": 2.0292, "step": 16650 }, { "epoch": 5.0090093471977175, "grad_norm": 1.184505581855774, "learning_rate": 5.329739135740479e-06, "loss": 1.9356, "step": 16675 }, { "epoch": 5.016517136529149, "grad_norm": 1.23818039894104, "learning_rate": 5.306737065725203e-06, "loss": 1.9537, "step": 16700 }, { "epoch": 5.016517136529149, "eval_loss": 1.9894059896469116, "eval_runtime": 244.4081, "eval_samples_per_second": 22.949, "eval_steps_per_second": 5.74, "step": 16700 }, { "epoch": 5.02402492586058, "grad_norm": 1.1360877752304077, "learning_rate": 5.283757533040218e-06, "loss": 1.9584, "step": 16725 }, { "epoch": 5.031532715192012, "grad_norm": 0.9773384928703308, "learning_rate": 5.260800773816495e-06, "loss": 1.9773, "step": 16750 }, { "epoch": 5.039040504523443, "grad_norm": 1.052291750907898, "learning_rate": 5.237867023951004e-06, "loss": 1.9516, "step": 16775 }, { "epoch": 5.046548293854874, "grad_norm": 1.086792230606079, "learning_rate": 5.214956519104266e-06, "loss": 1.9529, "step": 16800 }, { "epoch": 5.046548293854874, "eval_loss": 1.9890544414520264, "eval_runtime": 244.3105, "eval_samples_per_second": 22.958, "eval_steps_per_second": 5.743, "step": 16800 }, { "epoch": 5.054056083186306, "grad_norm": 1.0600550174713135, "learning_rate": 5.192069494697948e-06, "loss": 1.9553, "step": 16825 }, { "epoch": 5.061563872517737, "grad_norm": 1.2051581144332886, "learning_rate": 5.169206185912439e-06, "loss": 1.9469, "step": 16850 }, { "epoch": 5.0690716618491685, "grad_norm": 1.0981636047363281, "learning_rate": 5.146366827684433e-06, "loss": 1.9817, "step": 16875 }, { "epoch": 5.0765794511806, "grad_norm": 1.2117871046066284, "learning_rate": 5.123551654704513e-06, "loss": 1.9476, "step": 16900 }, { "epoch": 5.0765794511806, "eval_loss": 1.9889459609985352, "eval_runtime": 244.8126, "eval_samples_per_second": 22.911, "eval_steps_per_second": 5.731, "step": 16900 }, { "epoch": 5.084087240512031, "grad_norm": 1.1481753587722778, "learning_rate": 5.101672059749764e-06, "loss": 1.9257, "step": 16925 }, { "epoch": 5.091595029843463, "grad_norm": 1.0862995386123657, "learning_rate": 5.0789049696927284e-06, "loss": 1.9393, "step": 16950 }, { "epoch": 5.099102819174894, "grad_norm": 1.1692668199539185, "learning_rate": 5.056162758102157e-06, "loss": 1.9525, "step": 16975 }, { "epoch": 5.106610608506325, "grad_norm": 1.2036652565002441, "learning_rate": 5.033445658670386e-06, "loss": 1.9622, "step": 17000 }, { "epoch": 5.106610608506325, "eval_loss": 1.988864541053772, "eval_runtime": 244.6644, "eval_samples_per_second": 22.925, "eval_steps_per_second": 5.734, "step": 17000 }, { "epoch": 5.114118397837757, "grad_norm": 1.2949445247650146, "learning_rate": 5.0107539048317025e-06, "loss": 1.9454, "step": 17025 }, { "epoch": 5.121626187169188, "grad_norm": 1.3341749906539917, "learning_rate": 4.98808772975995e-06, "loss": 1.9501, "step": 17050 }, { "epoch": 5.129133976500619, "grad_norm": 1.147869348526001, "learning_rate": 4.965447366366137e-06, "loss": 1.9392, "step": 17075 }, { "epoch": 5.136641765832051, "grad_norm": 1.2364166975021362, "learning_rate": 4.9428330472960326e-06, "loss": 1.957, "step": 17100 }, { "epoch": 5.136641765832051, "eval_loss": 1.9885538816452026, "eval_runtime": 244.7413, "eval_samples_per_second": 22.918, "eval_steps_per_second": 5.733, "step": 17100 }, { "epoch": 5.144149555163482, "grad_norm": 1.0443626642227173, "learning_rate": 4.920245004927787e-06, "loss": 1.9461, "step": 17125 }, { "epoch": 5.151657344494914, "grad_norm": 1.1504952907562256, "learning_rate": 4.897683471369532e-06, "loss": 1.9492, "step": 17150 }, { "epoch": 5.1591651338263445, "grad_norm": 1.1174638271331787, "learning_rate": 4.875148678457012e-06, "loss": 1.9496, "step": 17175 }, { "epoch": 5.166672923157776, "grad_norm": 1.2215421199798584, "learning_rate": 4.852640857751181e-06, "loss": 1.9272, "step": 17200 }, { "epoch": 5.166672923157776, "eval_loss": 1.9891639947891235, "eval_runtime": 244.7795, "eval_samples_per_second": 22.915, "eval_steps_per_second": 5.732, "step": 17200 }, { "epoch": 5.174180712489208, "grad_norm": 1.179458498954773, "learning_rate": 4.830160240535846e-06, "loss": 1.965, "step": 17225 }, { "epoch": 5.181688501820639, "grad_norm": 1.1385215520858765, "learning_rate": 4.807707057815272e-06, "loss": 1.9466, "step": 17250 }, { "epoch": 5.18919629115207, "grad_norm": 1.4543343782424927, "learning_rate": 4.785281540311815e-06, "loss": 1.9864, "step": 17275 }, { "epoch": 5.196704080483501, "grad_norm": 1.1437246799468994, "learning_rate": 4.762883918463555e-06, "loss": 1.9545, "step": 17300 }, { "epoch": 5.196704080483501, "eval_loss": 1.988171935081482, "eval_runtime": 244.7849, "eval_samples_per_second": 22.914, "eval_steps_per_second": 5.732, "step": 17300 }, { "epoch": 5.204211869814933, "grad_norm": 1.154579520225525, "learning_rate": 4.740514422421921e-06, "loss": 1.9295, "step": 17325 }, { "epoch": 5.211719659146365, "grad_norm": 1.21454656124115, "learning_rate": 4.71817328204933e-06, "loss": 1.9554, "step": 17350 }, { "epoch": 5.219227448477795, "grad_norm": 1.1201882362365723, "learning_rate": 4.695860726916826e-06, "loss": 1.9313, "step": 17375 }, { "epoch": 5.226735237809227, "grad_norm": 1.1768020391464233, "learning_rate": 4.673576986301719e-06, "loss": 1.9316, "step": 17400 }, { "epoch": 5.226735237809227, "eval_loss": 1.9883191585540771, "eval_runtime": 245.1496, "eval_samples_per_second": 22.88, "eval_steps_per_second": 5.723, "step": 17400 }, { "epoch": 5.234243027140659, "grad_norm": 1.14753258228302, "learning_rate": 4.651322289185229e-06, "loss": 1.9224, "step": 17425 }, { "epoch": 5.24175081647209, "grad_norm": 1.2445884943008423, "learning_rate": 4.629096864250132e-06, "loss": 1.9336, "step": 17450 }, { "epoch": 5.249258605803521, "grad_norm": 1.0989011526107788, "learning_rate": 4.606900939878415e-06, "loss": 1.9434, "step": 17475 }, { "epoch": 5.256766395134952, "grad_norm": 1.2167655229568481, "learning_rate": 4.584734744148922e-06, "loss": 1.9219, "step": 17500 }, { "epoch": 5.256766395134952, "eval_loss": 1.9880566596984863, "eval_runtime": 245.0782, "eval_samples_per_second": 22.887, "eval_steps_per_second": 5.725, "step": 17500 }, { "epoch": 5.264274184466384, "grad_norm": 1.070075511932373, "learning_rate": 4.562598504835015e-06, "loss": 1.9723, "step": 17525 }, { "epoch": 5.2717819737978155, "grad_norm": 1.1149256229400635, "learning_rate": 4.540492449402237e-06, "loss": 1.9661, "step": 17550 }, { "epoch": 5.279289763129246, "grad_norm": 1.1833670139312744, "learning_rate": 4.5184168050059645e-06, "loss": 1.9208, "step": 17575 }, { "epoch": 5.286797552460678, "grad_norm": 1.1111880540847778, "learning_rate": 4.496371798489084e-06, "loss": 1.9621, "step": 17600 }, { "epoch": 5.286797552460678, "eval_loss": 1.9878884553909302, "eval_runtime": 245.0298, "eval_samples_per_second": 22.891, "eval_steps_per_second": 5.726, "step": 17600 }, { "epoch": 5.294305341792109, "grad_norm": 1.1070398092269897, "learning_rate": 4.47435765637965e-06, "loss": 1.9578, "step": 17625 }, { "epoch": 5.301813131123541, "grad_norm": 1.127094030380249, "learning_rate": 4.452374604888568e-06, "loss": 1.9291, "step": 17650 }, { "epoch": 5.309320920454972, "grad_norm": 1.1716080904006958, "learning_rate": 4.430422869907261e-06, "loss": 1.9694, "step": 17675 }, { "epoch": 5.316828709786403, "grad_norm": 1.0636411905288696, "learning_rate": 4.408502677005365e-06, "loss": 1.9692, "step": 17700 }, { "epoch": 5.316828709786403, "eval_loss": 1.9873278141021729, "eval_runtime": 245.0048, "eval_samples_per_second": 22.893, "eval_steps_per_second": 5.726, "step": 17700 }, { "epoch": 5.324336499117835, "grad_norm": 1.0959444046020508, "learning_rate": 4.386614251428382e-06, "loss": 1.9467, "step": 17725 }, { "epoch": 5.3318442884492665, "grad_norm": 1.3291634321212769, "learning_rate": 4.3647578180953905e-06, "loss": 1.9335, "step": 17750 }, { "epoch": 5.339352077780697, "grad_norm": 1.1393731832504272, "learning_rate": 4.342933601596728e-06, "loss": 1.9253, "step": 17775 }, { "epoch": 5.346859867112129, "grad_norm": 1.0294339656829834, "learning_rate": 4.321141826191677e-06, "loss": 1.9358, "step": 17800 }, { "epoch": 5.346859867112129, "eval_loss": 1.9870134592056274, "eval_runtime": 244.8052, "eval_samples_per_second": 22.912, "eval_steps_per_second": 5.731, "step": 17800 }, { "epoch": 5.35436765644356, "grad_norm": 1.1546337604522705, "learning_rate": 4.299382715806166e-06, "loss": 1.9828, "step": 17825 }, { "epoch": 5.3618754457749915, "grad_norm": 1.1130156517028809, "learning_rate": 4.27765649403047e-06, "loss": 1.9328, "step": 17850 }, { "epoch": 5.369383235106423, "grad_norm": 1.1644601821899414, "learning_rate": 4.2559633841169055e-06, "loss": 1.9425, "step": 17875 }, { "epoch": 5.376891024437854, "grad_norm": 1.1819103956222534, "learning_rate": 4.2343036089775444e-06, "loss": 1.9346, "step": 17900 }, { "epoch": 5.376891024437854, "eval_loss": 1.9867066144943237, "eval_runtime": 244.6669, "eval_samples_per_second": 22.925, "eval_steps_per_second": 5.734, "step": 17900 }, { "epoch": 5.384398813769286, "grad_norm": 1.126689076423645, "learning_rate": 4.212677391181919e-06, "loss": 1.9554, "step": 17925 }, { "epoch": 5.391906603100717, "grad_norm": 1.2009223699569702, "learning_rate": 4.191084952954739e-06, "loss": 1.9597, "step": 17950 }, { "epoch": 5.399414392432148, "grad_norm": 1.1235226392745972, "learning_rate": 4.169526516173596e-06, "loss": 1.9362, "step": 17975 }, { "epoch": 5.40692218176358, "grad_norm": 1.2246404886245728, "learning_rate": 4.148002302366707e-06, "loss": 1.9621, "step": 18000 }, { "epoch": 5.40692218176358, "eval_loss": 1.9868113994598389, "eval_runtime": 244.649, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 18000 }, { "epoch": 5.414429971095011, "grad_norm": 1.140601634979248, "learning_rate": 4.126512532710613e-06, "loss": 1.9313, "step": 18025 }, { "epoch": 5.4219377604264425, "grad_norm": 1.1360208988189697, "learning_rate": 4.105057428027919e-06, "loss": 1.9462, "step": 18050 }, { "epoch": 5.429445549757874, "grad_norm": 1.172402024269104, "learning_rate": 4.0836372087850255e-06, "loss": 1.9577, "step": 18075 }, { "epoch": 5.436953339089305, "grad_norm": 1.1650437116622925, "learning_rate": 4.062252095089857e-06, "loss": 1.9299, "step": 18100 }, { "epoch": 5.436953339089305, "eval_loss": 1.9864240884780884, "eval_runtime": 244.7091, "eval_samples_per_second": 22.921, "eval_steps_per_second": 5.733, "step": 18100 }, { "epoch": 5.444461128420737, "grad_norm": 1.1697797775268555, "learning_rate": 4.040902306689605e-06, "loss": 1.9483, "step": 18125 }, { "epoch": 5.4519689177521675, "grad_norm": 1.194689393043518, "learning_rate": 4.019588062968471e-06, "loss": 1.9468, "step": 18150 }, { "epoch": 5.459476707083599, "grad_norm": 1.0986504554748535, "learning_rate": 3.998309582945405e-06, "loss": 1.9472, "step": 18175 }, { "epoch": 5.466984496415031, "grad_norm": 1.2854065895080566, "learning_rate": 3.977067085271864e-06, "loss": 1.9455, "step": 18200 }, { "epoch": 5.466984496415031, "eval_loss": 1.9863779544830322, "eval_runtime": 244.6161, "eval_samples_per_second": 22.93, "eval_steps_per_second": 5.736, "step": 18200 }, { "epoch": 5.474492285746462, "grad_norm": 1.167863368988037, "learning_rate": 3.95586078822956e-06, "loss": 1.9287, "step": 18225 }, { "epoch": 5.482000075077893, "grad_norm": 1.190122365951538, "learning_rate": 3.934690909728214e-06, "loss": 1.9581, "step": 18250 }, { "epoch": 5.489507864409324, "grad_norm": 1.0951225757598877, "learning_rate": 3.913557667303326e-06, "loss": 1.93, "step": 18275 }, { "epoch": 5.497015653740756, "grad_norm": 1.052368402481079, "learning_rate": 3.8924612781139276e-06, "loss": 1.9753, "step": 18300 }, { "epoch": 5.497015653740756, "eval_loss": 1.9860328435897827, "eval_runtime": 245.3363, "eval_samples_per_second": 22.862, "eval_steps_per_second": 5.719, "step": 18300 }, { "epoch": 5.504523443072188, "grad_norm": 1.1306352615356445, "learning_rate": 3.87140195894037e-06, "loss": 1.9711, "step": 18325 }, { "epoch": 5.5120312324036185, "grad_norm": 1.174249291419983, "learning_rate": 3.850379926182069e-06, "loss": 1.9391, "step": 18350 }, { "epoch": 5.51953902173505, "grad_norm": 1.0850168466567993, "learning_rate": 3.8293953958553055e-06, "loss": 1.9709, "step": 18375 }, { "epoch": 5.527046811066482, "grad_norm": 1.1175942420959473, "learning_rate": 3.8084485835909922e-06, "loss": 1.9369, "step": 18400 }, { "epoch": 5.527046811066482, "eval_loss": 1.9858981370925903, "eval_runtime": 244.8528, "eval_samples_per_second": 22.908, "eval_steps_per_second": 5.73, "step": 18400 }, { "epoch": 5.534554600397913, "grad_norm": 1.1870449781417847, "learning_rate": 3.7875397046324636e-06, "loss": 1.9603, "step": 18425 }, { "epoch": 5.542062389729344, "grad_norm": 1.1581183671951294, "learning_rate": 3.766668973833262e-06, "loss": 1.9415, "step": 18450 }, { "epoch": 5.549570179060775, "grad_norm": 1.0704885721206665, "learning_rate": 3.7458366056549304e-06, "loss": 1.945, "step": 18475 }, { "epoch": 5.557077968392207, "grad_norm": 1.209778904914856, "learning_rate": 3.7250428141648097e-06, "loss": 1.9571, "step": 18500 }, { "epoch": 5.557077968392207, "eval_loss": 1.9858996868133545, "eval_runtime": 244.4136, "eval_samples_per_second": 22.949, "eval_steps_per_second": 5.74, "step": 18500 }, { "epoch": 5.564585757723639, "grad_norm": 1.1481252908706665, "learning_rate": 3.704287813033836e-06, "loss": 1.9445, "step": 18525 }, { "epoch": 5.572093547055069, "grad_norm": 1.1967343091964722, "learning_rate": 3.6835718155343483e-06, "loss": 1.9457, "step": 18550 }, { "epoch": 5.579601336386501, "grad_norm": 1.246741771697998, "learning_rate": 3.6628950345378965e-06, "loss": 1.951, "step": 18575 }, { "epoch": 5.587109125717932, "grad_norm": 1.1486597061157227, "learning_rate": 3.6422576825130477e-06, "loss": 1.9534, "step": 18600 }, { "epoch": 5.587109125717932, "eval_loss": 1.9850828647613525, "eval_runtime": 244.5498, "eval_samples_per_second": 22.936, "eval_steps_per_second": 5.737, "step": 18600 }, { "epoch": 5.594616915049364, "grad_norm": 1.0895804166793823, "learning_rate": 3.62165997152322e-06, "loss": 1.9507, "step": 18625 }, { "epoch": 5.602124704380795, "grad_norm": 1.150476098060608, "learning_rate": 3.6011021132244807e-06, "loss": 1.9709, "step": 18650 }, { "epoch": 5.609632493712226, "grad_norm": 1.2993876934051514, "learning_rate": 3.5805843188633868e-06, "loss": 1.9095, "step": 18675 }, { "epoch": 5.617140283043658, "grad_norm": 1.1421048641204834, "learning_rate": 3.56010679927481e-06, "loss": 1.9381, "step": 18700 }, { "epoch": 5.617140283043658, "eval_loss": 1.9856911897659302, "eval_runtime": 244.4243, "eval_samples_per_second": 22.948, "eval_steps_per_second": 5.74, "step": 18700 }, { "epoch": 5.62464807237509, "grad_norm": 1.2726351022720337, "learning_rate": 3.539669764879769e-06, "loss": 1.9533, "step": 18725 }, { "epoch": 5.63215586170652, "grad_norm": 1.3039084672927856, "learning_rate": 3.519273425683269e-06, "loss": 1.9381, "step": 18750 }, { "epoch": 5.639663651037952, "grad_norm": 1.2816251516342163, "learning_rate": 3.4989179912721443e-06, "loss": 1.9566, "step": 18775 }, { "epoch": 5.647171440369383, "grad_norm": 1.1940944194793701, "learning_rate": 3.4786036708129018e-06, "loss": 1.9684, "step": 18800 }, { "epoch": 5.647171440369383, "eval_loss": 1.985024094581604, "eval_runtime": 245.079, "eval_samples_per_second": 22.887, "eval_steps_per_second": 5.725, "step": 18800 }, { "epoch": 5.654679229700815, "grad_norm": 1.1532857418060303, "learning_rate": 3.4583306730495745e-06, "loss": 1.9131, "step": 18825 }, { "epoch": 5.662187019032246, "grad_norm": 1.1996498107910156, "learning_rate": 3.4380992063015747e-06, "loss": 1.9262, "step": 18850 }, { "epoch": 5.669694808363677, "grad_norm": 1.1328129768371582, "learning_rate": 3.4179094784615565e-06, "loss": 1.9509, "step": 18875 }, { "epoch": 5.677202597695109, "grad_norm": 1.124004602432251, "learning_rate": 3.3977616969932705e-06, "loss": 1.9334, "step": 18900 }, { "epoch": 5.677202597695109, "eval_loss": 1.9849857091903687, "eval_runtime": 244.5129, "eval_samples_per_second": 22.939, "eval_steps_per_second": 5.738, "step": 18900 }, { "epoch": 5.68471038702654, "grad_norm": 1.187667727470398, "learning_rate": 3.3776560689294486e-06, "loss": 1.9702, "step": 18925 }, { "epoch": 5.692218176357971, "grad_norm": 1.1103003025054932, "learning_rate": 3.3575928008696606e-06, "loss": 1.9825, "step": 18950 }, { "epoch": 5.699725965689403, "grad_norm": 1.1390193700790405, "learning_rate": 3.3375720989781967e-06, "loss": 1.9481, "step": 18975 }, { "epoch": 5.707233755020834, "grad_norm": 1.0689352750778198, "learning_rate": 3.3175941689819507e-06, "loss": 1.9633, "step": 19000 }, { "epoch": 5.707233755020834, "eval_loss": 1.9846566915512085, "eval_runtime": 244.7924, "eval_samples_per_second": 22.913, "eval_steps_per_second": 5.731, "step": 19000 }, { "epoch": 5.714741544352266, "grad_norm": 1.3017778396606445, "learning_rate": 3.297659216168305e-06, "loss": 1.9521, "step": 19025 }, { "epoch": 5.722249333683697, "grad_norm": 1.0697276592254639, "learning_rate": 3.277767445383023e-06, "loss": 1.926, "step": 19050 }, { "epoch": 5.729757123015128, "grad_norm": 1.2455774545669556, "learning_rate": 3.2579190610281378e-06, "loss": 1.9708, "step": 19075 }, { "epoch": 5.73726491234656, "grad_norm": 1.2440054416656494, "learning_rate": 3.238114267059859e-06, "loss": 1.9728, "step": 19100 }, { "epoch": 5.73726491234656, "eval_loss": 1.9845046997070312, "eval_runtime": 245.1879, "eval_samples_per_second": 22.876, "eval_steps_per_second": 5.722, "step": 19100 }, { "epoch": 5.744772701677991, "grad_norm": 1.1576147079467773, "learning_rate": 3.218353266986476e-06, "loss": 1.9956, "step": 19125 }, { "epoch": 5.752280491009422, "grad_norm": 1.4614973068237305, "learning_rate": 3.198636263866259e-06, "loss": 1.9471, "step": 19150 }, { "epoch": 5.759788280340854, "grad_norm": 1.2813773155212402, "learning_rate": 3.1789634603053846e-06, "loss": 1.9516, "step": 19175 }, { "epoch": 5.767296069672285, "grad_norm": 1.212929368019104, "learning_rate": 3.1593350584558446e-06, "loss": 1.9446, "step": 19200 }, { "epoch": 5.767296069672285, "eval_loss": 1.9842097759246826, "eval_runtime": 244.6725, "eval_samples_per_second": 22.925, "eval_steps_per_second": 5.734, "step": 19200 }, { "epoch": 5.7748038590037165, "grad_norm": 1.0693471431732178, "learning_rate": 3.1397512600133694e-06, "loss": 1.9767, "step": 19225 }, { "epoch": 5.782311648335147, "grad_norm": 1.2919217348098755, "learning_rate": 3.120212266215365e-06, "loss": 1.9476, "step": 19250 }, { "epoch": 5.789819437666579, "grad_norm": 1.1402935981750488, "learning_rate": 3.1007182778388315e-06, "loss": 1.9495, "step": 19275 }, { "epoch": 5.797327226998011, "grad_norm": 1.2192392349243164, "learning_rate": 3.0812694951983087e-06, "loss": 1.9633, "step": 19300 }, { "epoch": 5.797327226998011, "eval_loss": 1.9841300249099731, "eval_runtime": 245.1028, "eval_samples_per_second": 22.884, "eval_steps_per_second": 5.724, "step": 19300 }, { "epoch": 5.8048350163294415, "grad_norm": 1.232828140258789, "learning_rate": 3.0618661181438147e-06, "loss": 1.9147, "step": 19325 }, { "epoch": 5.812342805660873, "grad_norm": 1.1075702905654907, "learning_rate": 3.042508346058794e-06, "loss": 1.9493, "step": 19350 }, { "epoch": 5.819850594992305, "grad_norm": 1.3566619157791138, "learning_rate": 3.0231963778580643e-06, "loss": 1.9314, "step": 19375 }, { "epoch": 5.827358384323736, "grad_norm": 0.9761985540390015, "learning_rate": 3.0039304119857863e-06, "loss": 1.9674, "step": 19400 }, { "epoch": 5.827358384323736, "eval_loss": 1.9838725328445435, "eval_runtime": 244.9514, "eval_samples_per_second": 22.898, "eval_steps_per_second": 5.728, "step": 19400 }, { "epoch": 5.8348661736551675, "grad_norm": 1.236413836479187, "learning_rate": 2.984710646413399e-06, "loss": 1.9401, "step": 19425 }, { "epoch": 5.842373962986598, "grad_norm": 1.1869447231292725, "learning_rate": 2.965537278637612e-06, "loss": 1.9927, "step": 19450 }, { "epoch": 5.84988175231803, "grad_norm": 1.1049928665161133, "learning_rate": 2.946410505678359e-06, "loss": 1.9789, "step": 19475 }, { "epoch": 5.857389541649462, "grad_norm": 1.0848510265350342, "learning_rate": 2.927330524076784e-06, "loss": 1.9329, "step": 19500 }, { "epoch": 5.857389541649462, "eval_loss": 1.9838305711746216, "eval_runtime": 244.8394, "eval_samples_per_second": 22.909, "eval_steps_per_second": 5.73, "step": 19500 }, { "epoch": 5.8648973309808925, "grad_norm": 1.257003664970398, "learning_rate": 2.9082975298932073e-06, "loss": 1.9271, "step": 19525 }, { "epoch": 5.872405120312324, "grad_norm": 1.1217468976974487, "learning_rate": 2.889311718705135e-06, "loss": 1.9593, "step": 19550 }, { "epoch": 5.879912909643755, "grad_norm": 1.0415600538253784, "learning_rate": 2.8703732856052216e-06, "loss": 1.9436, "step": 19575 }, { "epoch": 5.887420698975187, "grad_norm": 1.1773449182510376, "learning_rate": 2.8514824251992834e-06, "loss": 1.9604, "step": 19600 }, { "epoch": 5.887420698975187, "eval_loss": 1.983793020248413, "eval_runtime": 244.4635, "eval_samples_per_second": 22.944, "eval_steps_per_second": 5.739, "step": 19600 }, { "epoch": 5.894928488306618, "grad_norm": 1.1694916486740112, "learning_rate": 2.832639331604292e-06, "loss": 1.9281, "step": 19625 }, { "epoch": 5.902436277638049, "grad_norm": 1.1439831256866455, "learning_rate": 2.813844198446383e-06, "loss": 1.9469, "step": 19650 }, { "epoch": 5.909944066969481, "grad_norm": 1.2244899272918701, "learning_rate": 2.7950972188588596e-06, "loss": 1.9203, "step": 19675 }, { "epoch": 5.917451856300913, "grad_norm": 1.0796282291412354, "learning_rate": 2.776398585480223e-06, "loss": 1.9569, "step": 19700 }, { "epoch": 5.917451856300913, "eval_loss": 1.983589768409729, "eval_runtime": 244.6683, "eval_samples_per_second": 22.925, "eval_steps_per_second": 5.734, "step": 19700 }, { "epoch": 5.9249596456323435, "grad_norm": 1.1817554235458374, "learning_rate": 2.757748490452177e-06, "loss": 1.967, "step": 19725 }, { "epoch": 5.932467434963775, "grad_norm": 1.1933224201202393, "learning_rate": 2.739147125417653e-06, "loss": 1.9553, "step": 19750 }, { "epoch": 5.939975224295206, "grad_norm": 1.0195425748825073, "learning_rate": 2.7205946815188563e-06, "loss": 1.9477, "step": 19775 }, { "epoch": 5.947483013626638, "grad_norm": 1.1039797067642212, "learning_rate": 2.7020913493952893e-06, "loss": 1.9508, "step": 19800 }, { "epoch": 5.947483013626638, "eval_loss": 1.9835751056671143, "eval_runtime": 244.4808, "eval_samples_per_second": 22.942, "eval_steps_per_second": 5.739, "step": 19800 }, { "epoch": 5.954990802958069, "grad_norm": 1.1363548040390015, "learning_rate": 2.6836373191817982e-06, "loss": 1.9466, "step": 19825 }, { "epoch": 5.9624985922895, "grad_norm": 1.182576298713684, "learning_rate": 2.6652327805066128e-06, "loss": 1.9549, "step": 19850 }, { "epoch": 5.970006381620932, "grad_norm": 1.083834171295166, "learning_rate": 2.6468779224894086e-06, "loss": 1.9421, "step": 19875 }, { "epoch": 5.977514170952363, "grad_norm": 1.1508004665374756, "learning_rate": 2.628572933739354e-06, "loss": 1.9237, "step": 19900 }, { "epoch": 5.977514170952363, "eval_loss": 1.9832342863082886, "eval_runtime": 276.0383, "eval_samples_per_second": 20.32, "eval_steps_per_second": 5.083, "step": 19900 }, { "epoch": 5.985021960283794, "grad_norm": 1.134469985961914, "learning_rate": 2.6103180023531726e-06, "loss": 1.9175, "step": 19925 }, { "epoch": 5.992529749615226, "grad_norm": 1.1148380041122437, "learning_rate": 2.592113315913217e-06, "loss": 1.96, "step": 19950 }, { "epoch": 6.000300311573257, "grad_norm": 2.1076784133911133, "learning_rate": 2.5739590614855353e-06, "loss": 2.0546, "step": 19975 }, { "epoch": 6.007808100904689, "grad_norm": 1.1790810823440552, "learning_rate": 2.5558554256179507e-06, "loss": 1.9568, "step": 20000 }, { "epoch": 6.007808100904689, "eval_loss": 1.983675241470337, "eval_runtime": 277.6925, "eval_samples_per_second": 20.199, "eval_steps_per_second": 5.052, "step": 20000 }, { "epoch": 6.01531589023612, "grad_norm": 1.3004977703094482, "learning_rate": 2.5378025943381482e-06, "loss": 1.9195, "step": 20025 }, { "epoch": 6.022823679567551, "grad_norm": 1.2546344995498657, "learning_rate": 2.519800753151757e-06, "loss": 1.9527, "step": 20050 }, { "epoch": 6.030331468898983, "grad_norm": 1.0866812467575073, "learning_rate": 2.501850087040448e-06, "loss": 1.937, "step": 20075 }, { "epoch": 6.037839258230414, "grad_norm": 1.1754050254821777, "learning_rate": 2.4839507804600274e-06, "loss": 1.8801, "step": 20100 }, { "epoch": 6.037839258230414, "eval_loss": 1.983474850654602, "eval_runtime": 244.4023, "eval_samples_per_second": 22.95, "eval_steps_per_second": 5.741, "step": 20100 }, { "epoch": 6.045347047561846, "grad_norm": 1.3076649904251099, "learning_rate": 2.466103017338552e-06, "loss": 1.9264, "step": 20125 }, { "epoch": 6.052854836893276, "grad_norm": 1.3242402076721191, "learning_rate": 2.448306981074428e-06, "loss": 1.9262, "step": 20150 }, { "epoch": 6.060362626224708, "grad_norm": 1.0890467166900635, "learning_rate": 2.4305628545345394e-06, "loss": 1.9743, "step": 20175 }, { "epoch": 6.06787041555614, "grad_norm": 1.1457139253616333, "learning_rate": 2.412870820052353e-06, "loss": 1.9558, "step": 20200 }, { "epoch": 6.06787041555614, "eval_loss": 1.983147144317627, "eval_runtime": 244.7147, "eval_samples_per_second": 22.921, "eval_steps_per_second": 5.733, "step": 20200 }, { "epoch": 6.075378204887571, "grad_norm": 1.1762498617172241, "learning_rate": 2.395231059426055e-06, "loss": 1.9198, "step": 20225 }, { "epoch": 6.082885994219002, "grad_norm": 1.1638132333755493, "learning_rate": 2.3776437539166825e-06, "loss": 1.9397, "step": 20250 }, { "epoch": 6.090393783550433, "grad_norm": 1.2441715002059937, "learning_rate": 2.3601090842462575e-06, "loss": 1.9676, "step": 20275 }, { "epoch": 6.097901572881865, "grad_norm": 1.1457374095916748, "learning_rate": 2.342627230595929e-06, "loss": 1.9574, "step": 20300 }, { "epoch": 6.097901572881865, "eval_loss": 1.9833580255508423, "eval_runtime": 322.7704, "eval_samples_per_second": 17.378, "eval_steps_per_second": 4.347, "step": 20300 }, { "epoch": 6.1054093622132966, "grad_norm": 1.2807676792144775, "learning_rate": 2.325198372604132e-06, "loss": 1.91, "step": 20325 }, { "epoch": 6.112917151544727, "grad_norm": 1.1415411233901978, "learning_rate": 2.3078226893647254e-06, "loss": 1.9255, "step": 20350 }, { "epoch": 6.120424940876159, "grad_norm": 1.1930123567581177, "learning_rate": 2.290500359425165e-06, "loss": 1.898, "step": 20375 }, { "epoch": 6.127932730207591, "grad_norm": 1.1319756507873535, "learning_rate": 2.2732315607846606e-06, "loss": 1.9043, "step": 20400 }, { "epoch": 6.127932730207591, "eval_loss": 1.9833526611328125, "eval_runtime": 244.7015, "eval_samples_per_second": 22.922, "eval_steps_per_second": 5.734, "step": 20400 }, { "epoch": 6.135440519539022, "grad_norm": 1.197733759880066, "learning_rate": 2.25601647089235e-06, "loss": 1.9325, "step": 20425 }, { "epoch": 6.142948308870453, "grad_norm": 1.1803226470947266, "learning_rate": 2.238855266645473e-06, "loss": 1.9357, "step": 20450 }, { "epoch": 6.150456098201884, "grad_norm": 1.2374463081359863, "learning_rate": 2.2217481243875666e-06, "loss": 1.9071, "step": 20475 }, { "epoch": 6.157963887533316, "grad_norm": 1.178080439567566, "learning_rate": 2.2046952199066323e-06, "loss": 1.936, "step": 20500 }, { "epoch": 6.157963887533316, "eval_loss": 1.9832499027252197, "eval_runtime": 244.9473, "eval_samples_per_second": 22.899, "eval_steps_per_second": 5.728, "step": 20500 }, { "epoch": 6.1654716768647475, "grad_norm": 1.1624387502670288, "learning_rate": 2.1876967284333436e-06, "loss": 1.9722, "step": 20525 }, { "epoch": 6.172979466196178, "grad_norm": 1.2391788959503174, "learning_rate": 2.170752824639242e-06, "loss": 1.971, "step": 20550 }, { "epoch": 6.18048725552761, "grad_norm": 1.183759331703186, "learning_rate": 2.153863682634941e-06, "loss": 1.9717, "step": 20575 }, { "epoch": 6.187995044859041, "grad_norm": 1.164625644683838, "learning_rate": 2.137029475968338e-06, "loss": 1.9668, "step": 20600 }, { "epoch": 6.187995044859041, "eval_loss": 1.982852578163147, "eval_runtime": 244.5369, "eval_samples_per_second": 22.937, "eval_steps_per_second": 5.737, "step": 20600 }, { "epoch": 6.1955028341904725, "grad_norm": 1.2210159301757812, "learning_rate": 2.1209204813122366e-06, "loss": 1.9451, "step": 20625 }, { "epoch": 6.203010623521904, "grad_norm": 1.2201151847839355, "learning_rate": 2.104194449172132e-06, "loss": 1.926, "step": 20650 }, { "epoch": 6.210518412853335, "grad_norm": 1.2492685317993164, "learning_rate": 2.0875238627562834e-06, "loss": 1.928, "step": 20675 }, { "epoch": 6.218026202184767, "grad_norm": 0.9965053796768188, "learning_rate": 2.0709088933667766e-06, "loss": 1.9374, "step": 20700 }, { "epoch": 6.218026202184767, "eval_loss": 1.9826812744140625, "eval_runtime": 244.5669, "eval_samples_per_second": 22.934, "eval_steps_per_second": 5.737, "step": 20700 }, { "epoch": 6.2255339915161985, "grad_norm": 1.0911799669265747, "learning_rate": 2.0543497117341904e-06, "loss": 1.9361, "step": 20725 }, { "epoch": 6.233041780847629, "grad_norm": 1.277250051498413, "learning_rate": 2.0378464880158453e-06, "loss": 1.9285, "step": 20750 }, { "epoch": 6.240549570179061, "grad_norm": 1.1859968900680542, "learning_rate": 2.0213993917940577e-06, "loss": 1.9531, "step": 20775 }, { "epoch": 6.248057359510492, "grad_norm": 1.2009955644607544, "learning_rate": 2.0050085920743904e-06, "loss": 1.9415, "step": 20800 }, { "epoch": 6.248057359510492, "eval_loss": 1.9828299283981323, "eval_runtime": 244.6515, "eval_samples_per_second": 22.926, "eval_steps_per_second": 5.735, "step": 20800 }, { "epoch": 6.2555651488419235, "grad_norm": 1.23262357711792, "learning_rate": 1.9886742572839227e-06, "loss": 1.9466, "step": 20825 }, { "epoch": 6.263072938173355, "grad_norm": 1.1354538202285767, "learning_rate": 1.9723965552695134e-06, "loss": 1.9538, "step": 20850 }, { "epoch": 6.270580727504786, "grad_norm": 1.2842826843261719, "learning_rate": 1.956175653296082e-06, "loss": 1.9547, "step": 20875 }, { "epoch": 6.278088516836218, "grad_norm": 1.2268083095550537, "learning_rate": 1.9400117180448872e-06, "loss": 1.9535, "step": 20900 }, { "epoch": 6.278088516836218, "eval_loss": 1.9827969074249268, "eval_runtime": 244.6424, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 20900 }, { "epoch": 6.2855963061676485, "grad_norm": 1.1584311723709106, "learning_rate": 1.923904915611814e-06, "loss": 1.9903, "step": 20925 }, { "epoch": 6.29310409549908, "grad_norm": 1.1765952110290527, "learning_rate": 1.9078554115056657e-06, "loss": 1.9313, "step": 20950 }, { "epoch": 6.300611884830512, "grad_norm": 1.0743718147277832, "learning_rate": 1.8918633706464663e-06, "loss": 1.937, "step": 20975 }, { "epoch": 6.308119674161943, "grad_norm": 1.1020997762680054, "learning_rate": 1.8759289573637645e-06, "loss": 1.9505, "step": 21000 }, { "epoch": 6.308119674161943, "eval_loss": 1.9827996492385864, "eval_runtime": 244.7905, "eval_samples_per_second": 22.913, "eval_steps_per_second": 5.731, "step": 21000 }, { "epoch": 6.3156274634933744, "grad_norm": 1.2609679698944092, "learning_rate": 1.8600523353949437e-06, "loss": 1.9424, "step": 21025 }, { "epoch": 6.323135252824806, "grad_norm": 1.2550972700119019, "learning_rate": 1.8442336678835417e-06, "loss": 1.9284, "step": 21050 }, { "epoch": 6.330643042156237, "grad_norm": 1.21172297000885, "learning_rate": 1.8284731173775695e-06, "loss": 1.9422, "step": 21075 }, { "epoch": 6.338150831487669, "grad_norm": 1.2744083404541016, "learning_rate": 1.8127708458278532e-06, "loss": 1.9512, "step": 21100 }, { "epoch": 6.338150831487669, "eval_loss": 1.9828649759292603, "eval_runtime": 244.7295, "eval_samples_per_second": 22.919, "eval_steps_per_second": 5.733, "step": 21100 }, { "epoch": 6.3456586208190995, "grad_norm": 1.006986141204834, "learning_rate": 1.7971270145863531e-06, "loss": 1.9737, "step": 21125 }, { "epoch": 6.353166410150531, "grad_norm": 1.1543078422546387, "learning_rate": 1.7815417844045175e-06, "loss": 1.9688, "step": 21150 }, { "epoch": 6.360674199481963, "grad_norm": 1.2171674966812134, "learning_rate": 1.7660153154316258e-06, "loss": 1.9549, "step": 21175 }, { "epoch": 6.368181988813394, "grad_norm": 1.1868822574615479, "learning_rate": 1.7505477672131454e-06, "loss": 1.9467, "step": 21200 }, { "epoch": 6.368181988813394, "eval_loss": 1.9822328090667725, "eval_runtime": 244.9166, "eval_samples_per_second": 22.902, "eval_steps_per_second": 5.728, "step": 21200 }, { "epoch": 6.375689778144825, "grad_norm": 1.2831307649612427, "learning_rate": 1.7351392986890915e-06, "loss": 1.9572, "step": 21225 }, { "epoch": 6.383197567476256, "grad_norm": 1.2353671789169312, "learning_rate": 1.7197900681923927e-06, "loss": 1.9286, "step": 21250 }, { "epoch": 6.390705356807688, "grad_norm": 1.2204623222351074, "learning_rate": 1.7045002334472654e-06, "loss": 1.959, "step": 21275 }, { "epoch": 6.39821314613912, "grad_norm": 1.3212610483169556, "learning_rate": 1.689269951567592e-06, "loss": 1.9591, "step": 21300 }, { "epoch": 6.39821314613912, "eval_loss": 1.9822728633880615, "eval_runtime": 244.7907, "eval_samples_per_second": 22.913, "eval_steps_per_second": 5.731, "step": 21300 }, { "epoch": 6.40572093547055, "grad_norm": 1.1899783611297607, "learning_rate": 1.674099379055308e-06, "loss": 1.9496, "step": 21325 }, { "epoch": 6.413228724801982, "grad_norm": 1.1367979049682617, "learning_rate": 1.6589886717987917e-06, "loss": 1.9283, "step": 21350 }, { "epoch": 6.420736514133413, "grad_norm": 1.343583106994629, "learning_rate": 1.6439379850712633e-06, "loss": 1.9282, "step": 21375 }, { "epoch": 6.428244303464845, "grad_norm": 1.0660362243652344, "learning_rate": 1.6289474735291935e-06, "loss": 1.9577, "step": 21400 }, { "epoch": 6.428244303464845, "eval_loss": 1.9821466207504272, "eval_runtime": 245.3705, "eval_samples_per_second": 22.859, "eval_steps_per_second": 5.718, "step": 21400 }, { "epoch": 6.435752092796276, "grad_norm": 1.1740394830703735, "learning_rate": 1.6140172912107054e-06, "loss": 1.9397, "step": 21425 }, { "epoch": 6.443259882127707, "grad_norm": 1.2688024044036865, "learning_rate": 1.5991475915339973e-06, "loss": 1.9066, "step": 21450 }, { "epoch": 6.450767671459139, "grad_norm": 1.2636834383010864, "learning_rate": 1.5843385272957686e-06, "loss": 1.9337, "step": 21475 }, { "epoch": 6.458275460790571, "grad_norm": 1.207352638244629, "learning_rate": 1.5695902506696439e-06, "loss": 1.9523, "step": 21500 }, { "epoch": 6.458275460790571, "eval_loss": 1.982376217842102, "eval_runtime": 245.0395, "eval_samples_per_second": 22.89, "eval_steps_per_second": 5.726, "step": 21500 }, { "epoch": 6.465783250122001, "grad_norm": 1.1533434391021729, "learning_rate": 1.5549029132046123e-06, "loss": 1.9335, "step": 21525 }, { "epoch": 6.473291039453433, "grad_norm": 1.138137936592102, "learning_rate": 1.5402766658234704e-06, "loss": 1.9457, "step": 21550 }, { "epoch": 6.480798828784864, "grad_norm": 1.1742804050445557, "learning_rate": 1.5257116588212709e-06, "loss": 1.9303, "step": 21575 }, { "epoch": 6.488306618116296, "grad_norm": 1.1066967248916626, "learning_rate": 1.511208041863778e-06, "loss": 1.9251, "step": 21600 }, { "epoch": 6.488306618116296, "eval_loss": 1.9820733070373535, "eval_runtime": 244.8749, "eval_samples_per_second": 22.906, "eval_steps_per_second": 5.729, "step": 21600 }, { "epoch": 6.495814407447727, "grad_norm": 1.1347085237503052, "learning_rate": 1.4967659639859308e-06, "loss": 1.9311, "step": 21625 }, { "epoch": 6.503322196779158, "grad_norm": 1.2577033042907715, "learning_rate": 1.4823855735903083e-06, "loss": 1.9354, "step": 21650 }, { "epoch": 6.51082998611059, "grad_norm": 1.1945990324020386, "learning_rate": 1.468067018445608e-06, "loss": 1.9046, "step": 21675 }, { "epoch": 6.518337775442021, "grad_norm": 1.195004940032959, "learning_rate": 1.4538104456851294e-06, "loss": 1.9374, "step": 21700 }, { "epoch": 6.518337775442021, "eval_loss": 1.9817756414413452, "eval_runtime": 244.4709, "eval_samples_per_second": 22.943, "eval_steps_per_second": 5.739, "step": 21700 }, { "epoch": 6.525845564773452, "grad_norm": 1.289342999458313, "learning_rate": 1.4396160018052555e-06, "loss": 1.9201, "step": 21725 }, { "epoch": 6.533353354104884, "grad_norm": 1.231141209602356, "learning_rate": 1.4254838326639514e-06, "loss": 1.9527, "step": 21750 }, { "epoch": 6.540861143436315, "grad_norm": 1.3896465301513672, "learning_rate": 1.4114140834792666e-06, "loss": 1.9347, "step": 21775 }, { "epoch": 6.548368932767747, "grad_norm": 1.2049739360809326, "learning_rate": 1.3974068988278402e-06, "loss": 1.969, "step": 21800 }, { "epoch": 6.548368932767747, "eval_loss": 1.9819016456604004, "eval_runtime": 244.8747, "eval_samples_per_second": 22.906, "eval_steps_per_second": 5.729, "step": 21800 }, { "epoch": 6.555876722099178, "grad_norm": 1.2919905185699463, "learning_rate": 1.3834624226434162e-06, "loss": 1.9555, "step": 21825 }, { "epoch": 6.563384511430609, "grad_norm": 1.2100296020507812, "learning_rate": 1.3695807982153666e-06, "loss": 1.9239, "step": 21850 }, { "epoch": 6.570892300762041, "grad_norm": 1.1903220415115356, "learning_rate": 1.3557621681872142e-06, "loss": 1.9201, "step": 21875 }, { "epoch": 6.578400090093472, "grad_norm": 1.1797667741775513, "learning_rate": 1.3420066745551715e-06, "loss": 1.9418, "step": 21900 }, { "epoch": 6.578400090093472, "eval_loss": 1.9816147089004517, "eval_runtime": 244.5673, "eval_samples_per_second": 22.934, "eval_steps_per_second": 5.737, "step": 21900 }, { "epoch": 6.585907879424903, "grad_norm": 1.155019760131836, "learning_rate": 1.3283144586666803e-06, "loss": 1.9466, "step": 21925 }, { "epoch": 6.593415668756335, "grad_norm": 1.2090644836425781, "learning_rate": 1.314685661218958e-06, "loss": 1.9444, "step": 21950 }, { "epoch": 6.600923458087766, "grad_norm": 1.1589152812957764, "learning_rate": 1.3011204222575515e-06, "loss": 1.9282, "step": 21975 }, { "epoch": 6.6084312474191975, "grad_norm": 1.3078739643096924, "learning_rate": 1.287618881174899e-06, "loss": 1.9273, "step": 22000 }, { "epoch": 6.6084312474191975, "eval_loss": 1.9818423986434937, "eval_runtime": 244.6939, "eval_samples_per_second": 22.923, "eval_steps_per_second": 5.734, "step": 22000 }, { "epoch": 6.615939036750628, "grad_norm": 1.070833683013916, "learning_rate": 1.2741811767089034e-06, "loss": 1.9397, "step": 22025 }, { "epoch": 6.62344682608206, "grad_norm": 1.1993708610534668, "learning_rate": 1.2608074469414949e-06, "loss": 1.959, "step": 22050 }, { "epoch": 6.630954615413492, "grad_norm": 1.1407665014266968, "learning_rate": 1.2474978292972209e-06, "loss": 1.9474, "step": 22075 }, { "epoch": 6.6384624047449226, "grad_norm": 1.2709163427352905, "learning_rate": 1.2342524605418293e-06, "loss": 1.9464, "step": 22100 }, { "epoch": 6.6384624047449226, "eval_loss": 1.9815821647644043, "eval_runtime": 244.5724, "eval_samples_per_second": 22.934, "eval_steps_per_second": 5.737, "step": 22100 }, { "epoch": 6.645970194076354, "grad_norm": 1.1849255561828613, "learning_rate": 1.221071476780867e-06, "loss": 1.9201, "step": 22125 }, { "epoch": 6.653477983407786, "grad_norm": 1.2153717279434204, "learning_rate": 1.207955013458281e-06, "loss": 1.9624, "step": 22150 }, { "epoch": 6.660985772739217, "grad_norm": 1.1668004989624023, "learning_rate": 1.1949032053550208e-06, "loss": 1.9304, "step": 22175 }, { "epoch": 6.6684935620706485, "grad_norm": 1.1738938093185425, "learning_rate": 1.1819161865876618e-06, "loss": 1.9117, "step": 22200 }, { "epoch": 6.6684935620706485, "eval_loss": 1.981676697731018, "eval_runtime": 244.4832, "eval_samples_per_second": 22.942, "eval_steps_per_second": 5.739, "step": 22200 }, { "epoch": 6.676001351402079, "grad_norm": 1.191311001777649, "learning_rate": 1.1689940906070203e-06, "loss": 1.9211, "step": 22225 }, { "epoch": 6.683509140733511, "grad_norm": 1.1772605180740356, "learning_rate": 1.1561370501967871e-06, "loss": 1.933, "step": 22250 }, { "epoch": 6.691016930064943, "grad_norm": 1.3537640571594238, "learning_rate": 1.1433451974721602e-06, "loss": 1.9239, "step": 22275 }, { "epoch": 6.6985247193963735, "grad_norm": 1.1915578842163086, "learning_rate": 1.1306186638784846e-06, "loss": 1.9429, "step": 22300 }, { "epoch": 6.6985247193963735, "eval_loss": 1.981661081314087, "eval_runtime": 244.4697, "eval_samples_per_second": 22.944, "eval_steps_per_second": 5.739, "step": 22300 }, { "epoch": 6.706032508727805, "grad_norm": 1.0470706224441528, "learning_rate": 1.1179575801899122e-06, "loss": 1.9428, "step": 22325 }, { "epoch": 6.713540298059236, "grad_norm": 1.2210986614227295, "learning_rate": 1.1053620765080458e-06, "loss": 1.9551, "step": 22350 }, { "epoch": 6.721048087390668, "grad_norm": 1.2881091833114624, "learning_rate": 1.0928322822606064e-06, "loss": 1.9365, "step": 22375 }, { "epoch": 6.728555876722099, "grad_norm": 1.2427425384521484, "learning_rate": 1.0803683262001066e-06, "loss": 1.9491, "step": 22400 }, { "epoch": 6.728555876722099, "eval_loss": 1.9814238548278809, "eval_runtime": 244.3102, "eval_samples_per_second": 22.959, "eval_steps_per_second": 5.743, "step": 22400 }, { "epoch": 6.73606366605353, "grad_norm": 1.2582703828811646, "learning_rate": 1.067970336402524e-06, "loss": 1.9398, "step": 22425 }, { "epoch": 6.743571455384962, "grad_norm": 1.2919580936431885, "learning_rate": 1.055638440265983e-06, "loss": 1.9626, "step": 22450 }, { "epoch": 6.751079244716394, "grad_norm": 1.2642123699188232, "learning_rate": 1.0433727645094574e-06, "loss": 1.9278, "step": 22475 }, { "epoch": 6.7585870340478245, "grad_norm": 1.1762003898620605, "learning_rate": 1.0311734351714533e-06, "loss": 1.9289, "step": 22500 }, { "epoch": 6.7585870340478245, "eval_loss": 1.981581687927246, "eval_runtime": 245.2807, "eval_samples_per_second": 22.868, "eval_steps_per_second": 5.72, "step": 22500 }, { "epoch": 6.766094823379256, "grad_norm": 1.1311445236206055, "learning_rate": 1.0190405776087183e-06, "loss": 1.9347, "step": 22525 }, { "epoch": 6.773602612710687, "grad_norm": 1.2351762056350708, "learning_rate": 1.0069743164949595e-06, "loss": 1.9398, "step": 22550 }, { "epoch": 6.781110402042119, "grad_norm": 1.097221851348877, "learning_rate": 9.949747758195568e-07, "loss": 1.9527, "step": 22575 }, { "epoch": 6.78861819137355, "grad_norm": 1.2283989191055298, "learning_rate": 9.830420788862903e-07, "loss": 1.9374, "step": 22600 }, { "epoch": 6.78861819137355, "eval_loss": 1.9813563823699951, "eval_runtime": 244.58, "eval_samples_per_second": 22.933, "eval_steps_per_second": 5.736, "step": 22600 }, { "epoch": 6.796125980704981, "grad_norm": 1.28317391872406, "learning_rate": 9.71176348312076e-07, "loss": 1.9048, "step": 22625 }, { "epoch": 6.803633770036413, "grad_norm": 1.3111134767532349, "learning_rate": 9.593777060257004e-07, "loss": 1.9211, "step": 22650 }, { "epoch": 6.811141559367844, "grad_norm": 1.1793931722640991, "learning_rate": 9.476462732665697e-07, "loss": 1.928, "step": 22675 }, { "epoch": 6.818649348699275, "grad_norm": 1.111651062965393, "learning_rate": 9.359821705834662e-07, "loss": 1.9336, "step": 22700 }, { "epoch": 6.818649348699275, "eval_loss": 1.9812109470367432, "eval_runtime": 244.4408, "eval_samples_per_second": 22.946, "eval_steps_per_second": 5.74, "step": 22700 }, { "epoch": 6.826157138030707, "grad_norm": 1.0364270210266113, "learning_rate": 9.243855178333066e-07, "loss": 1.9512, "step": 22725 }, { "epoch": 6.833664927362138, "grad_norm": 1.0628246068954468, "learning_rate": 9.128564341799139e-07, "loss": 1.9368, "step": 22750 }, { "epoch": 6.84117271669357, "grad_norm": 1.250428557395935, "learning_rate": 9.013950380927874e-07, "loss": 1.9603, "step": 22775 }, { "epoch": 6.848680506025001, "grad_norm": 1.2401816844940186, "learning_rate": 8.900014473458943e-07, "loss": 1.9414, "step": 22800 }, { "epoch": 6.848680506025001, "eval_loss": 1.981279969215393, "eval_runtime": 244.3713, "eval_samples_per_second": 22.953, "eval_steps_per_second": 5.741, "step": 22800 }, { "epoch": 6.856188295356432, "grad_norm": 1.2459220886230469, "learning_rate": 8.78675779016449e-07, "loss": 1.9228, "step": 22825 }, { "epoch": 6.863696084687864, "grad_norm": 1.0594732761383057, "learning_rate": 8.674181494837147e-07, "loss": 1.9627, "step": 22850 }, { "epoch": 6.871203874019295, "grad_norm": 1.1160441637039185, "learning_rate": 8.5622867442781e-07, "loss": 1.9599, "step": 22875 }, { "epoch": 6.878711663350726, "grad_norm": 1.4957025051116943, "learning_rate": 8.451074688285182e-07, "loss": 1.9485, "step": 22900 }, { "epoch": 6.878711663350726, "eval_loss": 1.9810361862182617, "eval_runtime": 244.8768, "eval_samples_per_second": 22.905, "eval_steps_per_second": 5.729, "step": 22900 }, { "epoch": 6.886219452682158, "grad_norm": 1.3058786392211914, "learning_rate": 8.340546469641027e-07, "loss": 1.9092, "step": 22925 }, { "epoch": 6.893727242013589, "grad_norm": 1.2844972610473633, "learning_rate": 8.23070322410141e-07, "loss": 1.9442, "step": 22950 }, { "epoch": 6.901235031345021, "grad_norm": 1.291462779045105, "learning_rate": 8.121546080383474e-07, "loss": 1.9241, "step": 22975 }, { "epoch": 6.908742820676451, "grad_norm": 1.2580238580703735, "learning_rate": 8.013076160154187e-07, "loss": 1.9412, "step": 23000 }, { "epoch": 6.908742820676451, "eval_loss": 1.9810972213745117, "eval_runtime": 244.8376, "eval_samples_per_second": 22.909, "eval_steps_per_second": 5.73, "step": 23000 }, { "epoch": 6.916250610007883, "grad_norm": 1.2016263008117676, "learning_rate": 7.905294578018824e-07, "loss": 1.932, "step": 23025 }, { "epoch": 6.923758399339315, "grad_norm": 1.2784329652786255, "learning_rate": 7.798202441509484e-07, "loss": 1.9505, "step": 23050 }, { "epoch": 6.931266188670746, "grad_norm": 1.3308900594711304, "learning_rate": 7.691800851073724e-07, "loss": 1.9416, "step": 23075 }, { "epoch": 6.938773978002177, "grad_norm": 1.1549803018569946, "learning_rate": 7.58609090006328e-07, "loss": 1.9469, "step": 23100 }, { "epoch": 6.938773978002177, "eval_loss": 1.9810361862182617, "eval_runtime": 244.9865, "eval_samples_per_second": 22.895, "eval_steps_per_second": 5.727, "step": 23100 }, { "epoch": 6.946281767333609, "grad_norm": 1.0882563591003418, "learning_rate": 7.481073674722763e-07, "loss": 1.9424, "step": 23125 }, { "epoch": 6.95378955666504, "grad_norm": 1.1521430015563965, "learning_rate": 7.37675025417856e-07, "loss": 1.9525, "step": 23150 }, { "epoch": 6.9612973459964715, "grad_norm": 1.1167752742767334, "learning_rate": 7.273121710427738e-07, "loss": 1.9644, "step": 23175 }, { "epoch": 6.968805135327902, "grad_norm": 1.0596712827682495, "learning_rate": 7.170189108326941e-07, "loss": 1.921, "step": 23200 }, { "epoch": 6.968805135327902, "eval_loss": 1.9809205532073975, "eval_runtime": 244.6054, "eval_samples_per_second": 22.931, "eval_steps_per_second": 5.736, "step": 23200 }, { "epoch": 6.976312924659334, "grad_norm": 1.2589685916900635, "learning_rate": 7.067953505581593e-07, "loss": 1.948, "step": 23225 }, { "epoch": 6.983820713990766, "grad_norm": 1.0607045888900757, "learning_rate": 6.966415952734953e-07, "loss": 1.9632, "step": 23250 }, { "epoch": 6.991328503322197, "grad_norm": 1.2630726099014282, "learning_rate": 6.86557749315728e-07, "loss": 1.9264, "step": 23275 }, { "epoch": 6.998836292653628, "grad_norm": 1.1883573532104492, "learning_rate": 6.765439163035183e-07, "loss": 1.9428, "step": 23300 }, { "epoch": 6.998836292653628, "eval_loss": 1.9808813333511353, "eval_runtime": 244.9512, "eval_samples_per_second": 22.898, "eval_steps_per_second": 5.728, "step": 23300 }, { "epoch": 7.006606854611659, "grad_norm": 1.3225353956222534, "learning_rate": 6.666001991360948e-07, "loss": 2.0098, "step": 23325 }, { "epoch": 7.014114643943091, "grad_norm": 1.1181532144546509, "learning_rate": 6.567266999921936e-07, "loss": 1.9435, "step": 23350 }, { "epoch": 7.021622433274523, "grad_norm": 1.167235016822815, "learning_rate": 6.469235203290125e-07, "loss": 1.9534, "step": 23375 }, { "epoch": 7.0291302226059535, "grad_norm": 1.1674944162368774, "learning_rate": 6.371907608811686e-07, "loss": 1.9374, "step": 23400 }, { "epoch": 7.0291302226059535, "eval_loss": 1.9810993671417236, "eval_runtime": 244.971, "eval_samples_per_second": 22.897, "eval_steps_per_second": 5.727, "step": 23400 }, { "epoch": 7.036638011937385, "grad_norm": 1.2668019533157349, "learning_rate": 6.275285216596583e-07, "loss": 1.9401, "step": 23425 }, { "epoch": 7.044145801268816, "grad_norm": 1.2170027494430542, "learning_rate": 6.179369019508346e-07, "loss": 1.9334, "step": 23450 }, { "epoch": 7.051653590600248, "grad_norm": 1.1965893507003784, "learning_rate": 6.084160003153849e-07, "loss": 1.9103, "step": 23475 }, { "epoch": 7.0591613799316795, "grad_norm": 1.1913440227508545, "learning_rate": 5.989659145873175e-07, "loss": 1.9268, "step": 23500 }, { "epoch": 7.0591613799316795, "eval_loss": 1.9811537265777588, "eval_runtime": 244.4854, "eval_samples_per_second": 22.942, "eval_steps_per_second": 5.739, "step": 23500 }, { "epoch": 7.06666916926311, "grad_norm": 1.3365952968597412, "learning_rate": 5.895867418729561e-07, "loss": 1.9736, "step": 23525 }, { "epoch": 7.074176958594542, "grad_norm": 1.1637241840362549, "learning_rate": 5.802785785499434e-07, "loss": 1.9338, "step": 23550 }, { "epoch": 7.081684747925973, "grad_norm": 1.279487133026123, "learning_rate": 5.710415202662539e-07, "loss": 1.9281, "step": 23575 }, { "epoch": 7.0891925372574045, "grad_norm": 1.1029129028320312, "learning_rate": 5.618756619392048e-07, "loss": 1.9513, "step": 23600 }, { "epoch": 7.0891925372574045, "eval_loss": 1.9810665845870972, "eval_runtime": 244.7676, "eval_samples_per_second": 22.916, "eval_steps_per_second": 5.732, "step": 23600 }, { "epoch": 7.096700326588836, "grad_norm": 1.3634998798370361, "learning_rate": 5.527810977544814e-07, "loss": 1.972, "step": 23625 }, { "epoch": 7.104208115920267, "grad_norm": 1.1559998989105225, "learning_rate": 5.437579211651739e-07, "loss": 1.9436, "step": 23650 }, { "epoch": 7.111715905251699, "grad_norm": 1.1781272888183594, "learning_rate": 5.348062248908126e-07, "loss": 1.9489, "step": 23675 }, { "epoch": 7.11922369458313, "grad_norm": 1.2078481912612915, "learning_rate": 5.259261009164179e-07, "loss": 1.973, "step": 23700 }, { "epoch": 7.11922369458313, "eval_loss": 1.9810374975204468, "eval_runtime": 244.6697, "eval_samples_per_second": 22.925, "eval_steps_per_second": 5.734, "step": 23700 }, { "epoch": 7.126731483914561, "grad_norm": 1.2811923027038574, "learning_rate": 5.171176404915562e-07, "loss": 1.9334, "step": 23725 }, { "epoch": 7.134239273245993, "grad_norm": 1.2578486204147339, "learning_rate": 5.08380934129396e-07, "loss": 1.9083, "step": 23750 }, { "epoch": 7.141747062577424, "grad_norm": 1.2639051675796509, "learning_rate": 4.99716071605785e-07, "loss": 1.9363, "step": 23775 }, { "epoch": 7.1492548519088555, "grad_norm": 1.4398607015609741, "learning_rate": 4.911231419583228e-07, "loss": 1.9547, "step": 23800 }, { "epoch": 7.1492548519088555, "eval_loss": 1.9809601306915283, "eval_runtime": 244.5082, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 23800 }, { "epoch": 7.156762641240287, "grad_norm": 1.1956835985183716, "learning_rate": 4.826022334854482e-07, "loss": 1.9501, "step": 23825 }, { "epoch": 7.164270430571718, "grad_norm": 1.192142367362976, "learning_rate": 4.741534337455333e-07, "loss": 1.9336, "step": 23850 }, { "epoch": 7.17177821990315, "grad_norm": 1.204443335533142, "learning_rate": 4.6577682955597804e-07, "loss": 1.9482, "step": 23875 }, { "epoch": 7.1792860092345805, "grad_norm": 1.205980896949768, "learning_rate": 4.5747250699232664e-07, "loss": 1.9229, "step": 23900 }, { "epoch": 7.1792860092345805, "eval_loss": 1.9809165000915527, "eval_runtime": 244.9164, "eval_samples_per_second": 22.902, "eval_steps_per_second": 5.728, "step": 23900 }, { "epoch": 7.186793798566012, "grad_norm": 1.1392741203308105, "learning_rate": 4.492405513873732e-07, "loss": 1.9091, "step": 23925 }, { "epoch": 7.194301587897444, "grad_norm": 1.1868606805801392, "learning_rate": 4.4108104733029506e-07, "loss": 1.9538, "step": 23950 }, { "epoch": 7.201809377228875, "grad_norm": 1.2095065116882324, "learning_rate": 4.32994078665776e-07, "loss": 1.9259, "step": 23975 }, { "epoch": 7.209317166560306, "grad_norm": 1.2137978076934814, "learning_rate": 4.2497972849314587e-07, "loss": 1.9086, "step": 24000 }, { "epoch": 7.209317166560306, "eval_loss": 1.9811201095581055, "eval_runtime": 244.5079, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 24000 }, { "epoch": 7.216824955891738, "grad_norm": 1.3852958679199219, "learning_rate": 4.170380791655323e-07, "loss": 1.9304, "step": 24025 }, { "epoch": 7.224332745223169, "grad_norm": 1.1705379486083984, "learning_rate": 4.0916921228900643e-07, "loss": 1.9256, "step": 24050 }, { "epoch": 7.231840534554601, "grad_norm": 1.2448861598968506, "learning_rate": 4.013732087217492e-07, "loss": 1.9281, "step": 24075 }, { "epoch": 7.2393483238860314, "grad_norm": 1.1513874530792236, "learning_rate": 3.9365014857322133e-07, "loss": 1.9121, "step": 24100 }, { "epoch": 7.2393483238860314, "eval_loss": 1.9809224605560303, "eval_runtime": 244.5719, "eval_samples_per_second": 22.934, "eval_steps_per_second": 5.737, "step": 24100 }, { "epoch": 7.246856113217463, "grad_norm": 1.2126648426055908, "learning_rate": 3.8600011120333483e-07, "loss": 1.9301, "step": 24125 }, { "epoch": 7.254363902548895, "grad_norm": 1.213840126991272, "learning_rate": 3.7842317522164274e-07, "loss": 1.9395, "step": 24150 }, { "epoch": 7.261871691880326, "grad_norm": 1.1836591958999634, "learning_rate": 3.709194184865314e-07, "loss": 1.9326, "step": 24175 }, { "epoch": 7.269379481211757, "grad_norm": 1.1955537796020508, "learning_rate": 3.6348891810441457e-07, "loss": 1.9385, "step": 24200 }, { "epoch": 7.269379481211757, "eval_loss": 1.9811633825302124, "eval_runtime": 244.8759, "eval_samples_per_second": 22.905, "eval_steps_per_second": 5.729, "step": 24200 }, { "epoch": 7.276887270543188, "grad_norm": 1.0566610097885132, "learning_rate": 3.5613175042894823e-07, "loss": 1.9263, "step": 24225 }, { "epoch": 7.28439505987462, "grad_norm": 1.196273922920227, "learning_rate": 3.4884799106024185e-07, "loss": 1.905, "step": 24250 }, { "epoch": 7.291902849206052, "grad_norm": 1.1962950229644775, "learning_rate": 3.4163771484408247e-07, "loss": 1.9178, "step": 24275 }, { "epoch": 7.299410638537482, "grad_norm": 1.1637682914733887, "learning_rate": 3.3450099587116533e-07, "loss": 1.9427, "step": 24300 }, { "epoch": 7.299410638537482, "eval_loss": 1.9810516834259033, "eval_runtime": 244.4733, "eval_samples_per_second": 22.943, "eval_steps_per_second": 5.739, "step": 24300 }, { "epoch": 7.306918427868914, "grad_norm": 1.2521005868911743, "learning_rate": 3.2743790747633285e-07, "loss": 1.9469, "step": 24325 }, { "epoch": 7.314426217200346, "grad_norm": 1.241564154624939, "learning_rate": 3.2044852223782337e-07, "loss": 1.9265, "step": 24350 }, { "epoch": 7.321934006531777, "grad_norm": 1.2285447120666504, "learning_rate": 3.135329119765204e-07, "loss": 1.9296, "step": 24375 }, { "epoch": 7.329441795863208, "grad_norm": 1.1928914785385132, "learning_rate": 3.0669114775521784e-07, "loss": 1.9409, "step": 24400 }, { "epoch": 7.329441795863208, "eval_loss": 1.9809722900390625, "eval_runtime": 244.5739, "eval_samples_per_second": 22.934, "eval_steps_per_second": 5.737, "step": 24400 }, { "epoch": 7.336949585194639, "grad_norm": 1.1230217218399048, "learning_rate": 2.9992329987789004e-07, "loss": 1.9087, "step": 24425 }, { "epoch": 7.344457374526071, "grad_norm": 1.2940632104873657, "learning_rate": 2.932294378889672e-07, "loss": 1.9574, "step": 24450 }, { "epoch": 7.3519651638575025, "grad_norm": 1.3414610624313354, "learning_rate": 2.8660963057262427e-07, "loss": 1.945, "step": 24475 }, { "epoch": 7.359472953188933, "grad_norm": 1.2023084163665771, "learning_rate": 2.800639459520693e-07, "loss": 1.9368, "step": 24500 }, { "epoch": 7.359472953188933, "eval_loss": 1.9809428453445435, "eval_runtime": 244.6252, "eval_samples_per_second": 22.929, "eval_steps_per_second": 5.735, "step": 24500 }, { "epoch": 7.366980742520365, "grad_norm": 1.3090476989746094, "learning_rate": 2.7359245128884935e-07, "loss": 1.9401, "step": 24525 }, { "epoch": 7.374488531851796, "grad_norm": 1.1625995635986328, "learning_rate": 2.6719521308215644e-07, "loss": 1.9421, "step": 24550 }, { "epoch": 7.381996321183228, "grad_norm": 1.1445448398590088, "learning_rate": 2.608722970681446e-07, "loss": 1.9201, "step": 24575 }, { "epoch": 7.389504110514659, "grad_norm": 1.2008908987045288, "learning_rate": 2.5462376821925453e-07, "loss": 1.9368, "step": 24600 }, { "epoch": 7.389504110514659, "eval_loss": 1.9809165000915527, "eval_runtime": 245.2653, "eval_samples_per_second": 22.869, "eval_steps_per_second": 5.72, "step": 24600 }, { "epoch": 7.39701189984609, "grad_norm": 1.1442121267318726, "learning_rate": 2.484496907435452e-07, "loss": 1.9356, "step": 24625 }, { "epoch": 7.404519689177522, "grad_norm": 1.194258689880371, "learning_rate": 2.42350128084039e-07, "loss": 1.957, "step": 24650 }, { "epoch": 7.4120274785089535, "grad_norm": 1.2677561044692993, "learning_rate": 2.3632514291806185e-07, "loss": 1.9405, "step": 24675 }, { "epoch": 7.419535267840384, "grad_norm": 1.1491544246673584, "learning_rate": 2.3037479715660337e-07, "loss": 1.921, "step": 24700 }, { "epoch": 7.419535267840384, "eval_loss": 1.9808403253555298, "eval_runtime": 245.1407, "eval_samples_per_second": 22.881, "eval_steps_per_second": 5.723, "step": 24700 }, { "epoch": 7.427043057171816, "grad_norm": 1.2121251821517944, "learning_rate": 2.2449915194368258e-07, "loss": 1.9255, "step": 24725 }, { "epoch": 7.434550846503247, "grad_norm": 1.2214640378952026, "learning_rate": 2.1869826765571505e-07, "loss": 1.9289, "step": 24750 }, { "epoch": 7.4420586358346785, "grad_norm": 1.3879413604736328, "learning_rate": 2.1297220390089662e-07, "loss": 1.9396, "step": 24775 }, { "epoch": 7.44956642516611, "grad_norm": 1.1950923204421997, "learning_rate": 2.0732101951858816e-07, "loss": 1.926, "step": 24800 }, { "epoch": 7.44956642516611, "eval_loss": 1.9808101654052734, "eval_runtime": 244.5077, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 24800 }, { "epoch": 7.457074214497541, "grad_norm": 1.1813709735870361, "learning_rate": 2.0174477257871277e-07, "loss": 1.9482, "step": 24825 }, { "epoch": 7.464582003828973, "grad_norm": 1.2802395820617676, "learning_rate": 1.9624352038115773e-07, "loss": 1.9456, "step": 24850 }, { "epoch": 7.472089793160404, "grad_norm": 1.1578987836837769, "learning_rate": 1.9103292580586406e-07, "loss": 1.9607, "step": 24875 }, { "epoch": 7.479597582491835, "grad_norm": 1.2001193761825562, "learning_rate": 1.856788265674736e-07, "loss": 1.8946, "step": 24900 }, { "epoch": 7.479597582491835, "eval_loss": 1.9809141159057617, "eval_runtime": 245.3503, "eval_samples_per_second": 22.861, "eval_steps_per_second": 5.718, "step": 24900 }, { "epoch": 7.487105371823267, "grad_norm": 1.0772101879119873, "learning_rate": 1.8039988716037763e-07, "loss": 1.9356, "step": 24925 }, { "epoch": 7.494613161154698, "grad_norm": 1.2263306379318237, "learning_rate": 1.7519616182942067e-07, "loss": 1.9384, "step": 24950 }, { "epoch": 7.5021209504861295, "grad_norm": 1.1009598970413208, "learning_rate": 1.7006770404656534e-07, "loss": 1.9524, "step": 24975 }, { "epoch": 7.509628739817561, "grad_norm": 1.192656397819519, "learning_rate": 1.6501456651034808e-07, "loss": 1.9367, "step": 25000 }, { "epoch": 7.509628739817561, "eval_loss": 1.9808765649795532, "eval_runtime": 244.5611, "eval_samples_per_second": 22.935, "eval_steps_per_second": 5.737, "step": 25000 }, { "epoch": 7.517136529148992, "grad_norm": 1.0909212827682495, "learning_rate": 1.6003680114533763e-07, "loss": 1.906, "step": 25025 }, { "epoch": 7.524644318480424, "grad_norm": 1.217670202255249, "learning_rate": 1.5513445910159823e-07, "loss": 1.9117, "step": 25050 }, { "epoch": 7.5321521078118545, "grad_norm": 1.1752556562423706, "learning_rate": 1.503075907541665e-07, "loss": 1.9262, "step": 25075 }, { "epoch": 7.539659897143286, "grad_norm": 1.1952420473098755, "learning_rate": 1.455562457025353e-07, "loss": 1.9484, "step": 25100 }, { "epoch": 7.539659897143286, "eval_loss": 1.9809269905090332, "eval_runtime": 244.6411, "eval_samples_per_second": 22.927, "eval_steps_per_second": 5.735, "step": 25100 }, { "epoch": 7.547167686474717, "grad_norm": 1.1993337869644165, "learning_rate": 1.4088047277013987e-07, "loss": 1.9473, "step": 25125 }, { "epoch": 7.554675475806149, "grad_norm": 1.2464344501495361, "learning_rate": 1.3628032000386008e-07, "loss": 1.9417, "step": 25150 }, { "epoch": 7.56218326513758, "grad_norm": 1.3131685256958008, "learning_rate": 1.3175583467352316e-07, "loss": 1.9431, "step": 25175 }, { "epoch": 7.569691054469011, "grad_norm": 1.255356788635254, "learning_rate": 1.2730706327142155e-07, "loss": 1.9323, "step": 25200 }, { "epoch": 7.569691054469011, "eval_loss": 1.9808063507080078, "eval_runtime": 244.7486, "eval_samples_per_second": 22.917, "eval_steps_per_second": 5.732, "step": 25200 }, { "epoch": 7.577198843800443, "grad_norm": 1.2939985990524292, "learning_rate": 1.2293405151183184e-07, "loss": 1.9484, "step": 25225 }, { "epoch": 7.584706633131875, "grad_norm": 1.2347224950790405, "learning_rate": 1.1863684433054994e-07, "loss": 1.9408, "step": 25250 }, { "epoch": 7.5922144224633055, "grad_norm": 1.142849087715149, "learning_rate": 1.1441548588442152e-07, "loss": 1.9449, "step": 25275 }, { "epoch": 7.599722211794737, "grad_norm": 1.2810004949569702, "learning_rate": 1.1027001955089572e-07, "loss": 1.9499, "step": 25300 }, { "epoch": 7.599722211794737, "eval_loss": 1.9808244705200195, "eval_runtime": 244.5894, "eval_samples_per_second": 22.932, "eval_steps_per_second": 5.736, "step": 25300 }, { "epoch": 7.607230001126169, "grad_norm": 1.2134476900100708, "learning_rate": 1.0620048792757464e-07, "loss": 1.9384, "step": 25325 }, { "epoch": 7.6147377904576, "grad_norm": 1.1853543519973755, "learning_rate": 1.0220693283177957e-07, "loss": 1.945, "step": 25350 }, { "epoch": 7.622245579789031, "grad_norm": 1.2892330884933472, "learning_rate": 9.82893953001171e-08, "loss": 1.9541, "step": 25375 }, { "epoch": 7.629753369120462, "grad_norm": 1.139492392539978, "learning_rate": 9.444791558806121e-08, "loss": 1.9462, "step": 25400 }, { "epoch": 7.629753369120462, "eval_loss": 1.9807994365692139, "eval_runtime": 244.5984, "eval_samples_per_second": 22.931, "eval_steps_per_second": 5.736, "step": 25400 }, { "epoch": 7.637261158451894, "grad_norm": 1.1218518018722534, "learning_rate": 9.068253316953684e-08, "loss": 1.926, "step": 25425 }, { "epoch": 7.644768947783325, "grad_norm": 1.1967252492904663, "learning_rate": 8.699328673651613e-08, "loss": 1.921, "step": 25450 }, { "epoch": 7.652276737114756, "grad_norm": 1.1184207201004028, "learning_rate": 8.338021419861868e-08, "loss": 1.9127, "step": 25475 }, { "epoch": 7.659784526446188, "grad_norm": 1.2853116989135742, "learning_rate": 7.984335268272441e-08, "loss": 1.9373, "step": 25500 }, { "epoch": 7.659784526446188, "eval_loss": 1.9808040857315063, "eval_runtime": 244.5622, "eval_samples_per_second": 22.935, "eval_steps_per_second": 5.737, "step": 25500 }, { "epoch": 7.667292315777619, "grad_norm": 1.1911348104476929, "learning_rate": 7.638273853259131e-08, "loss": 1.9449, "step": 25525 }, { "epoch": 7.674800105109051, "grad_norm": 1.2387864589691162, "learning_rate": 7.299840730847995e-08, "loss": 1.9314, "step": 25550 }, { "epoch": 7.682307894440482, "grad_norm": 1.2212029695510864, "learning_rate": 6.969039378679292e-08, "loss": 1.9205, "step": 25575 }, { "epoch": 7.689815683771913, "grad_norm": 1.2776437997817993, "learning_rate": 6.645873195971098e-08, "loss": 1.984, "step": 25600 }, { "epoch": 7.689815683771913, "eval_loss": 1.9808552265167236, "eval_runtime": 244.7299, "eval_samples_per_second": 22.919, "eval_steps_per_second": 5.733, "step": 25600 }, { "epoch": 7.697323473103345, "grad_norm": 1.2346168756484985, "learning_rate": 6.330345503484908e-08, "loss": 1.9367, "step": 25625 }, { "epoch": 7.704831262434777, "grad_norm": 1.2425425052642822, "learning_rate": 6.02245954349126e-08, "loss": 1.9449, "step": 25650 }, { "epoch": 7.712339051766207, "grad_norm": 1.1887537240982056, "learning_rate": 5.722218479736502e-08, "loss": 1.9207, "step": 25675 }, { "epoch": 7.719846841097639, "grad_norm": 1.1335868835449219, "learning_rate": 5.429625397410237e-08, "loss": 1.9374, "step": 25700 }, { "epoch": 7.719846841097639, "eval_loss": 1.9808275699615479, "eval_runtime": 244.9051, "eval_samples_per_second": 22.903, "eval_steps_per_second": 5.729, "step": 25700 }, { "epoch": 7.72735463042907, "grad_norm": 1.2654999494552612, "learning_rate": 5.144683303113684e-08, "loss": 1.9645, "step": 25725 }, { "epoch": 7.734862419760502, "grad_norm": 1.3081024885177612, "learning_rate": 4.8673951248286166e-08, "loss": 1.9205, "step": 25750 }, { "epoch": 7.742370209091932, "grad_norm": 1.2318024635314941, "learning_rate": 4.597763711887637e-08, "loss": 1.9425, "step": 25775 }, { "epoch": 7.749877998423364, "grad_norm": 1.2977827787399292, "learning_rate": 4.335791834944369e-08, "loss": 1.9496, "step": 25800 }, { "epoch": 7.749877998423364, "eval_loss": 1.9807723760604858, "eval_runtime": 244.5126, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 25800 }, { "epoch": 7.757385787754796, "grad_norm": 1.0694193840026855, "learning_rate": 4.081482185945479e-08, "loss": 1.9416, "step": 25825 }, { "epoch": 7.764893577086227, "grad_norm": 1.2503465414047241, "learning_rate": 3.8348373781026955e-08, "loss": 1.9512, "step": 25850 }, { "epoch": 7.772401366417658, "grad_norm": 1.2763334512710571, "learning_rate": 3.5958599458662537e-08, "loss": 1.9338, "step": 25875 }, { "epoch": 7.77990915574909, "grad_norm": 1.1357148885726929, "learning_rate": 3.3645523448984914e-08, "loss": 1.9579, "step": 25900 }, { "epoch": 7.77990915574909, "eval_loss": 1.9807769060134888, "eval_runtime": 244.8991, "eval_samples_per_second": 22.903, "eval_steps_per_second": 5.729, "step": 25900 }, { "epoch": 7.787416945080521, "grad_norm": 1.1911311149597168, "learning_rate": 3.149715032283562e-08, "loss": 1.9241, "step": 25925 }, { "epoch": 7.7949247344119525, "grad_norm": 1.2044905424118042, "learning_rate": 2.933447122186239e-08, "loss": 1.9325, "step": 25950 }, { "epoch": 7.802432523743384, "grad_norm": 1.355704426765442, "learning_rate": 2.724855850118585e-08, "loss": 1.9369, "step": 25975 }, { "epoch": 7.809940313074815, "grad_norm": 1.1674330234527588, "learning_rate": 2.5239433595037053e-08, "loss": 1.9114, "step": 26000 }, { "epoch": 7.809940313074815, "eval_loss": 1.9807677268981934, "eval_runtime": 244.6737, "eval_samples_per_second": 22.924, "eval_steps_per_second": 5.734, "step": 26000 }, { "epoch": 7.817448102406247, "grad_norm": 1.242080807685852, "learning_rate": 2.33071171485974e-08, "loss": 1.9247, "step": 26025 }, { "epoch": 7.824955891737678, "grad_norm": 1.2494958639144897, "learning_rate": 2.1451629017787133e-08, "loss": 1.9284, "step": 26050 }, { "epoch": 7.832463681069109, "grad_norm": 1.2800395488739014, "learning_rate": 1.9672988269061332e-08, "loss": 1.9365, "step": 26075 }, { "epoch": 7.83997147040054, "grad_norm": 1.2159162759780884, "learning_rate": 1.797121317921341e-08, "loss": 1.9213, "step": 26100 }, { "epoch": 7.83997147040054, "eval_loss": 1.9807840585708618, "eval_runtime": 245.0653, "eval_samples_per_second": 22.888, "eval_steps_per_second": 5.725, "step": 26100 }, { "epoch": 7.847479259731972, "grad_norm": 1.2391120195388794, "learning_rate": 1.6346321235187756e-08, "loss": 1.9321, "step": 26125 }, { "epoch": 7.8549870490634035, "grad_norm": 1.1283122301101685, "learning_rate": 1.4798329133900724e-08, "loss": 1.9741, "step": 26150 }, { "epoch": 7.862494838394834, "grad_norm": 1.1563724279403687, "learning_rate": 1.3327252782067423e-08, "loss": 1.9312, "step": 26175 }, { "epoch": 7.870002627726266, "grad_norm": 1.2058521509170532, "learning_rate": 1.1933107296039358e-08, "loss": 1.9255, "step": 26200 }, { "epoch": 7.870002627726266, "eval_loss": 1.980788230895996, "eval_runtime": 244.7317, "eval_samples_per_second": 22.919, "eval_steps_per_second": 5.733, "step": 26200 }, { "epoch": 7.877510417057698, "grad_norm": 1.1759904623031616, "learning_rate": 1.0615907001648717e-08, "loss": 1.9553, "step": 26225 }, { "epoch": 7.8850182063891285, "grad_norm": 1.2614035606384277, "learning_rate": 9.37566543406182e-09, "loss": 1.9662, "step": 26250 }, { "epoch": 7.89252599572056, "grad_norm": 1.3371256589889526, "learning_rate": 8.212395337640066e-09, "loss": 1.9287, "step": 26275 }, { "epoch": 7.900033785051991, "grad_norm": 1.3627214431762695, "learning_rate": 7.126108665805875e-09, "loss": 1.9213, "step": 26300 }, { "epoch": 7.900033785051991, "eval_loss": 1.980796217918396, "eval_runtime": 244.3911, "eval_samples_per_second": 22.951, "eval_steps_per_second": 5.741, "step": 26300 }, { "epoch": 7.907541574383423, "grad_norm": 1.236215353012085, "learning_rate": 6.11681658092611e-09, "loss": 1.9431, "step": 26325 }, { "epoch": 7.9150493637148545, "grad_norm": 1.1915570497512817, "learning_rate": 5.184529454191344e-09, "loss": 1.9467, "step": 26350 }, { "epoch": 7.922557153046285, "grad_norm": 1.1169012784957886, "learning_rate": 4.329256865511777e-09, "loss": 1.9403, "step": 26375 }, { "epoch": 7.930064942377717, "grad_norm": 1.4462562799453735, "learning_rate": 3.5510076034198093e-09, "loss": 1.9356, "step": 26400 }, { "epoch": 7.930064942377717, "eval_loss": 1.980790138244629, "eval_runtime": 244.7112, "eval_samples_per_second": 22.921, "eval_steps_per_second": 5.733, "step": 26400 }, { "epoch": 7.937572731709148, "grad_norm": 1.2058677673339844, "learning_rate": 2.8497896649767872e-09, "loss": 1.9233, "step": 26425 }, { "epoch": 7.9450805210405795, "grad_norm": 1.246536135673523, "learning_rate": 2.225610255694732e-09, "loss": 1.9439, "step": 26450 }, { "epoch": 7.952588310372011, "grad_norm": 1.4007676839828491, "learning_rate": 1.6784757894588998e-09, "loss": 1.947, "step": 26475 }, { "epoch": 7.960096099703442, "grad_norm": 1.2076547145843506, "learning_rate": 1.2083918884636668e-09, "loss": 1.9181, "step": 26500 }, { "epoch": 7.960096099703442, "eval_loss": 1.9807934761047363, "eval_runtime": 244.5038, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 26500 }, { "epoch": 7.967603889034874, "grad_norm": 1.1972334384918213, "learning_rate": 8.15363383154244e-10, "loss": 1.928, "step": 26525 }, { "epoch": 7.975111678366305, "grad_norm": 1.246110439300537, "learning_rate": 4.993943121767153e-10, "loss": 1.9361, "step": 26550 }, { "epoch": 7.982619467697736, "grad_norm": 1.2921074628829956, "learning_rate": 2.604879223364054e-10, "loss": 1.9193, "step": 26575 }, { "epoch": 7.990127257029168, "grad_norm": 1.2516605854034424, "learning_rate": 9.864666856707061e-11, "loss": 1.9276, "step": 26600 }, { "epoch": 7.990127257029168, "eval_loss": 1.9807960987091064, "eval_runtime": 244.5065, "eval_samples_per_second": 22.94, "eval_steps_per_second": 5.738, "step": 26600 }, { "epoch": 7.997635046360599, "grad_norm": 1.2723952531814575, "learning_rate": 1.3872213900922859e-11, "loss": 1.9274, "step": 26625 } ], "logging_steps": 25, "max_steps": 26632, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.055320138484023e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }