{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 726327, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002065185515614868, "grad_norm": 0.7036675810813904, "learning_rate": 0.0001996, "loss": 7.8213, "step": 500 }, { "epoch": 0.004130371031229736, "grad_norm": 1.0079172849655151, "learning_rate": 0.0003996, "loss": 6.7143, "step": 1000 }, { "epoch": 0.006195556546844603, "grad_norm": 1.0771256685256958, "learning_rate": 0.0005996, "loss": 6.5273, "step": 1500 }, { "epoch": 0.008260742062459471, "grad_norm": 1.1658340692520142, "learning_rate": 0.0007996, "loss": 6.3948, "step": 2000 }, { "epoch": 0.010325927578074339, "grad_norm": 1.4879825115203857, "learning_rate": 0.0009996, "loss": 5.9994, "step": 2500 }, { "epoch": 0.012391113093689206, "grad_norm": 1.6412945985794067, "learning_rate": 0.0009993106087504335, "loss": 4.7896, "step": 3000 }, { "epoch": 0.014456298609304074, "grad_norm": 1.4124268293380737, "learning_rate": 0.0009986198359552766, "loss": 3.8145, "step": 3500 }, { "epoch": 0.016521484124918943, "grad_norm": 1.2819844484329224, "learning_rate": 0.0009979290631601198, "loss": 3.4161, "step": 4000 }, { "epoch": 0.01858666964053381, "grad_norm": 1.155612826347351, "learning_rate": 0.0009972382903649629, "loss": 3.1921, "step": 4500 }, { "epoch": 0.020651855156148678, "grad_norm": 1.1647284030914307, "learning_rate": 0.0009965475175698062, "loss": 3.055, "step": 5000 }, { "epoch": 0.022717040671763545, "grad_norm": 1.1216390132904053, "learning_rate": 0.0009958567447746491, "loss": 2.9393, "step": 5500 }, { "epoch": 0.024782226187378412, "grad_norm": 1.1327152252197266, "learning_rate": 0.0009951659719794922, "loss": 2.8641, "step": 6000 }, { "epoch": 0.02684741170299328, "grad_norm": 1.0822185277938843, "learning_rate": 0.0009944751991843356, "loss": 2.7994, "step": 6500 }, { "epoch": 0.028912597218608147, "grad_norm": 1.0448203086853027, "learning_rate": 0.0009937844263891787, "loss": 2.7461, "step": 7000 }, { "epoch": 0.030977782734223015, "grad_norm": 1.0597904920578003, "learning_rate": 0.0009930936535940218, "loss": 2.7034, "step": 7500 }, { "epoch": 0.033042968249837885, "grad_norm": 1.0737932920455933, "learning_rate": 0.000992402880798865, "loss": 2.6603, "step": 8000 }, { "epoch": 0.03510815376545275, "grad_norm": 1.033523440361023, "learning_rate": 0.000991712108003708, "loss": 2.6265, "step": 8500 }, { "epoch": 0.03717333928106762, "grad_norm": 0.976208508014679, "learning_rate": 0.0009910213352085512, "loss": 2.5961, "step": 9000 }, { "epoch": 0.03923852479668249, "grad_norm": 0.9439292550086975, "learning_rate": 0.0009903305624133943, "loss": 2.576, "step": 9500 }, { "epoch": 0.041303710312297355, "grad_norm": 0.9609880447387695, "learning_rate": 0.0009896397896182376, "loss": 2.5466, "step": 10000 }, { "epoch": 0.04336889582791222, "grad_norm": 0.9652389883995056, "learning_rate": 0.0009889490168230807, "loss": 2.527, "step": 10500 }, { "epoch": 0.04543408134352709, "grad_norm": 1.0029548406600952, "learning_rate": 0.0009882582440279238, "loss": 2.5086, "step": 11000 }, { "epoch": 0.04749926685914196, "grad_norm": 0.9536625742912292, "learning_rate": 0.000987567471232767, "loss": 2.4902, "step": 11500 }, { "epoch": 0.049564452374756825, "grad_norm": 0.9976128339767456, "learning_rate": 0.00098687669843761, "loss": 2.4657, "step": 12000 }, { "epoch": 0.05162963789037169, "grad_norm": 1.0109055042266846, "learning_rate": 0.0009861859256424532, "loss": 2.4605, "step": 12500 }, { "epoch": 0.05369482340598656, "grad_norm": 0.9560060501098633, "learning_rate": 0.0009854951528472963, "loss": 2.4431, "step": 13000 }, { "epoch": 0.05576000892160143, "grad_norm": 0.9709720015525818, "learning_rate": 0.0009848043800521396, "loss": 2.4307, "step": 13500 }, { "epoch": 0.057825194437216294, "grad_norm": 0.9962353706359863, "learning_rate": 0.0009841136072569828, "loss": 2.4115, "step": 14000 }, { "epoch": 0.05989037995283116, "grad_norm": 0.9110284447669983, "learning_rate": 0.0009834228344618259, "loss": 2.4021, "step": 14500 }, { "epoch": 0.06195556546844603, "grad_norm": 0.9498186707496643, "learning_rate": 0.000982732061666669, "loss": 2.3856, "step": 15000 }, { "epoch": 0.0640207509840609, "grad_norm": 0.8862460851669312, "learning_rate": 0.0009820412888715121, "loss": 2.3762, "step": 15500 }, { "epoch": 0.06608593649967577, "grad_norm": 0.9397904276847839, "learning_rate": 0.0009813505160763552, "loss": 2.3679, "step": 16000 }, { "epoch": 0.06815112201529064, "grad_norm": 0.9054779410362244, "learning_rate": 0.0009806597432811984, "loss": 2.3561, "step": 16500 }, { "epoch": 0.0702163075309055, "grad_norm": 0.8556115627288818, "learning_rate": 0.0009799689704860417, "loss": 2.353, "step": 17000 }, { "epoch": 0.07228149304652037, "grad_norm": 0.9651133418083191, "learning_rate": 0.0009792781976908848, "loss": 2.3416, "step": 17500 }, { "epoch": 0.07434667856213524, "grad_norm": 0.9362500309944153, "learning_rate": 0.000978587424895728, "loss": 2.3328, "step": 18000 }, { "epoch": 0.07641186407775011, "grad_norm": 0.9050174951553345, "learning_rate": 0.000977896652100571, "loss": 2.3216, "step": 18500 }, { "epoch": 0.07847704959336498, "grad_norm": 0.8851823806762695, "learning_rate": 0.0009772058793054142, "loss": 2.3177, "step": 19000 }, { "epoch": 0.08054223510897984, "grad_norm": 0.8814013600349426, "learning_rate": 0.0009765151065102573, "loss": 2.3002, "step": 19500 }, { "epoch": 0.08260742062459471, "grad_norm": 0.9048078656196594, "learning_rate": 0.0009758243337151005, "loss": 2.305, "step": 20000 }, { "epoch": 0.08467260614020958, "grad_norm": 0.8821763396263123, "learning_rate": 0.0009751335609199436, "loss": 2.2925, "step": 20500 }, { "epoch": 0.08673779165582444, "grad_norm": 0.873921811580658, "learning_rate": 0.0009744427881247867, "loss": 2.2861, "step": 21000 }, { "epoch": 0.08880297717143931, "grad_norm": 0.8664683699607849, "learning_rate": 0.0009737520153296299, "loss": 2.2809, "step": 21500 }, { "epoch": 0.09086816268705418, "grad_norm": 0.9115278720855713, "learning_rate": 0.0009730612425344731, "loss": 2.2765, "step": 22000 }, { "epoch": 0.09293334820266905, "grad_norm": 0.875135064125061, "learning_rate": 0.0009723704697393162, "loss": 2.2699, "step": 22500 }, { "epoch": 0.09499853371828391, "grad_norm": 0.8888856172561646, "learning_rate": 0.0009716796969441593, "loss": 2.2637, "step": 23000 }, { "epoch": 0.09706371923389878, "grad_norm": 0.8921205401420593, "learning_rate": 0.0009709889241490025, "loss": 2.2591, "step": 23500 }, { "epoch": 0.09912890474951365, "grad_norm": 0.840370774269104, "learning_rate": 0.0009702981513538456, "loss": 2.25, "step": 24000 }, { "epoch": 0.10119409026512852, "grad_norm": 0.8678010702133179, "learning_rate": 0.0009696073785586888, "loss": 2.2472, "step": 24500 }, { "epoch": 0.10325927578074338, "grad_norm": 0.8795912265777588, "learning_rate": 0.0009689166057635319, "loss": 2.2403, "step": 25000 }, { "epoch": 0.10532446129635825, "grad_norm": 0.909457266330719, "learning_rate": 0.0009682258329683751, "loss": 2.2362, "step": 25500 }, { "epoch": 0.10738964681197312, "grad_norm": 0.8739911317825317, "learning_rate": 0.0009675350601732182, "loss": 2.2285, "step": 26000 }, { "epoch": 0.10945483232758799, "grad_norm": 0.8885407447814941, "learning_rate": 0.0009668442873780614, "loss": 2.2268, "step": 26500 }, { "epoch": 0.11152001784320285, "grad_norm": 0.8348733186721802, "learning_rate": 0.0009661535145829045, "loss": 2.2201, "step": 27000 }, { "epoch": 0.11358520335881772, "grad_norm": 0.8733665943145752, "learning_rate": 0.0009654627417877476, "loss": 2.219, "step": 27500 }, { "epoch": 0.11565038887443259, "grad_norm": 0.8849397897720337, "learning_rate": 0.0009647719689925908, "loss": 2.2115, "step": 28000 }, { "epoch": 0.11771557439004746, "grad_norm": 0.8752795457839966, "learning_rate": 0.0009640811961974339, "loss": 2.2051, "step": 28500 }, { "epoch": 0.11978075990566232, "grad_norm": 0.8557031750679016, "learning_rate": 0.0009633904234022772, "loss": 2.2029, "step": 29000 }, { "epoch": 0.12184594542127719, "grad_norm": 0.8175500631332397, "learning_rate": 0.0009626996506071203, "loss": 2.1967, "step": 29500 }, { "epoch": 0.12391113093689206, "grad_norm": 0.8393064737319946, "learning_rate": 0.0009620088778119633, "loss": 2.1949, "step": 30000 }, { "epoch": 0.12597631645250693, "grad_norm": 0.8515117764472961, "learning_rate": 0.0009613181050168065, "loss": 2.1909, "step": 30500 }, { "epoch": 0.1280415019681218, "grad_norm": 0.8967178463935852, "learning_rate": 0.0009606273322216496, "loss": 2.1858, "step": 31000 }, { "epoch": 0.13010668748373666, "grad_norm": 0.8990112543106079, "learning_rate": 0.0009599365594264929, "loss": 2.1814, "step": 31500 }, { "epoch": 0.13217187299935154, "grad_norm": 0.8051102161407471, "learning_rate": 0.000959245786631336, "loss": 2.1803, "step": 32000 }, { "epoch": 0.1342370585149664, "grad_norm": 0.8505108952522278, "learning_rate": 0.0009585550138361792, "loss": 2.1775, "step": 32500 }, { "epoch": 0.13630224403058128, "grad_norm": 0.8586075305938721, "learning_rate": 0.0009578642410410222, "loss": 2.1671, "step": 33000 }, { "epoch": 0.13836742954619613, "grad_norm": 0.830560028553009, "learning_rate": 0.0009571734682458653, "loss": 2.1697, "step": 33500 }, { "epoch": 0.140432615061811, "grad_norm": 0.8251802325248718, "learning_rate": 0.0009564826954507086, "loss": 2.1646, "step": 34000 }, { "epoch": 0.14249780057742586, "grad_norm": 0.8522030711174011, "learning_rate": 0.0009557919226555517, "loss": 2.1609, "step": 34500 }, { "epoch": 0.14456298609304075, "grad_norm": 0.8344951272010803, "learning_rate": 0.0009551011498603949, "loss": 2.1542, "step": 35000 }, { "epoch": 0.1466281716086556, "grad_norm": 0.8527629375457764, "learning_rate": 0.000954410377065238, "loss": 2.1584, "step": 35500 }, { "epoch": 0.14869335712427048, "grad_norm": 0.8409314155578613, "learning_rate": 0.0009537196042700811, "loss": 2.1472, "step": 36000 }, { "epoch": 0.15075854263988533, "grad_norm": 0.8568186163902283, "learning_rate": 0.0009530288314749243, "loss": 2.146, "step": 36500 }, { "epoch": 0.15282372815550022, "grad_norm": 0.8142380118370056, "learning_rate": 0.0009523380586797674, "loss": 2.1467, "step": 37000 }, { "epoch": 0.15488891367111507, "grad_norm": 0.8309258222579956, "learning_rate": 0.0009516472858846106, "loss": 2.142, "step": 37500 }, { "epoch": 0.15695409918672995, "grad_norm": 0.8471255302429199, "learning_rate": 0.0009509565130894537, "loss": 2.1425, "step": 38000 }, { "epoch": 0.1590192847023448, "grad_norm": 0.8846974968910217, "learning_rate": 0.0009502657402942969, "loss": 2.1377, "step": 38500 }, { "epoch": 0.16108447021795969, "grad_norm": 0.8476696014404297, "learning_rate": 0.00094957496749914, "loss": 2.1356, "step": 39000 }, { "epoch": 0.16314965573357454, "grad_norm": 0.8468635678291321, "learning_rate": 0.0009488841947039832, "loss": 2.1316, "step": 39500 }, { "epoch": 0.16521484124918942, "grad_norm": 0.8356343507766724, "learning_rate": 0.0009481934219088263, "loss": 2.1269, "step": 40000 }, { "epoch": 0.16728002676480427, "grad_norm": 0.7726144790649414, "learning_rate": 0.0009475026491136694, "loss": 2.1277, "step": 40500 }, { "epoch": 0.16934521228041916, "grad_norm": 0.8468815684318542, "learning_rate": 0.0009468118763185126, "loss": 2.1332, "step": 41000 }, { "epoch": 0.171410397796034, "grad_norm": 0.78179931640625, "learning_rate": 0.0009461211035233558, "loss": 2.1195, "step": 41500 }, { "epoch": 0.1734755833116489, "grad_norm": 0.8012422919273376, "learning_rate": 0.0009454303307281989, "loss": 2.115, "step": 42000 }, { "epoch": 0.17554076882726374, "grad_norm": 0.8458732962608337, "learning_rate": 0.000944739557933042, "loss": 2.1146, "step": 42500 }, { "epoch": 0.17760595434287862, "grad_norm": 0.8353042006492615, "learning_rate": 0.0009440487851378852, "loss": 2.1123, "step": 43000 }, { "epoch": 0.17967113985849348, "grad_norm": 0.8672284483909607, "learning_rate": 0.0009433580123427283, "loss": 2.1114, "step": 43500 }, { "epoch": 0.18173632537410836, "grad_norm": 0.7824869751930237, "learning_rate": 0.0009426672395475715, "loss": 2.1082, "step": 44000 }, { "epoch": 0.1838015108897232, "grad_norm": 0.7976692318916321, "learning_rate": 0.0009419764667524147, "loss": 2.1052, "step": 44500 }, { "epoch": 0.1858666964053381, "grad_norm": 0.876670777797699, "learning_rate": 0.0009412856939572577, "loss": 2.1033, "step": 45000 }, { "epoch": 0.18793188192095295, "grad_norm": 0.7947434186935425, "learning_rate": 0.0009405949211621009, "loss": 2.0971, "step": 45500 }, { "epoch": 0.18999706743656783, "grad_norm": 0.823627233505249, "learning_rate": 0.000939904148366944, "loss": 2.0984, "step": 46000 }, { "epoch": 0.19206225295218268, "grad_norm": 0.8043273091316223, "learning_rate": 0.0009392133755717873, "loss": 2.0958, "step": 46500 }, { "epoch": 0.19412743846779756, "grad_norm": 0.8782801032066345, "learning_rate": 0.0009385226027766304, "loss": 2.0914, "step": 47000 }, { "epoch": 0.19619262398341242, "grad_norm": 0.8043196201324463, "learning_rate": 0.0009378318299814735, "loss": 2.0888, "step": 47500 }, { "epoch": 0.1982578094990273, "grad_norm": 0.8064476251602173, "learning_rate": 0.0009371410571863166, "loss": 2.0847, "step": 48000 }, { "epoch": 0.20032299501464215, "grad_norm": 0.801071047782898, "learning_rate": 0.0009364502843911597, "loss": 2.0844, "step": 48500 }, { "epoch": 0.20238818053025703, "grad_norm": 0.8486244678497314, "learning_rate": 0.000935759511596003, "loss": 2.0853, "step": 49000 }, { "epoch": 0.2044533660458719, "grad_norm": 0.813061535358429, "learning_rate": 0.0009350687388008461, "loss": 2.0812, "step": 49500 }, { "epoch": 0.20651855156148677, "grad_norm": 0.8625230193138123, "learning_rate": 0.0009343779660056893, "loss": 2.0832, "step": 50000 }, { "epoch": 0.20858373707710165, "grad_norm": 0.8224324584007263, "learning_rate": 0.0009336871932105324, "loss": 2.0785, "step": 50500 }, { "epoch": 0.2106489225927165, "grad_norm": 0.8722664713859558, "learning_rate": 0.0009329964204153754, "loss": 2.074, "step": 51000 }, { "epoch": 0.21271410810833138, "grad_norm": 0.8052055239677429, "learning_rate": 0.0009323056476202187, "loss": 2.074, "step": 51500 }, { "epoch": 0.21477929362394624, "grad_norm": 0.8521301746368408, "learning_rate": 0.0009316148748250618, "loss": 2.0681, "step": 52000 }, { "epoch": 0.21684447913956112, "grad_norm": 0.846494197845459, "learning_rate": 0.000930924102029905, "loss": 2.073, "step": 52500 }, { "epoch": 0.21890966465517597, "grad_norm": 0.8026652336120605, "learning_rate": 0.0009302333292347481, "loss": 2.0685, "step": 53000 }, { "epoch": 0.22097485017079085, "grad_norm": 0.8246744871139526, "learning_rate": 0.0009295425564395913, "loss": 2.0653, "step": 53500 }, { "epoch": 0.2230400356864057, "grad_norm": 0.8326907157897949, "learning_rate": 0.0009288517836444344, "loss": 2.0643, "step": 54000 }, { "epoch": 0.2251052212020206, "grad_norm": 0.7792090177536011, "learning_rate": 0.0009281610108492775, "loss": 2.0622, "step": 54500 }, { "epoch": 0.22717040671763544, "grad_norm": 0.8691778779029846, "learning_rate": 0.0009274702380541207, "loss": 2.0624, "step": 55000 }, { "epoch": 0.22923559223325032, "grad_norm": 0.7907185554504395, "learning_rate": 0.0009267794652589638, "loss": 2.0571, "step": 55500 }, { "epoch": 0.23130077774886518, "grad_norm": 0.8440839052200317, "learning_rate": 0.000926088692463807, "loss": 2.0612, "step": 56000 }, { "epoch": 0.23336596326448006, "grad_norm": 0.8027564883232117, "learning_rate": 0.0009253979196686502, "loss": 2.054, "step": 56500 }, { "epoch": 0.2354311487800949, "grad_norm": 0.7806565165519714, "learning_rate": 0.0009247071468734933, "loss": 2.053, "step": 57000 }, { "epoch": 0.2374963342957098, "grad_norm": 0.8598223328590393, "learning_rate": 0.0009240163740783364, "loss": 2.0518, "step": 57500 }, { "epoch": 0.23956151981132465, "grad_norm": 0.8221333622932434, "learning_rate": 0.0009233256012831795, "loss": 2.052, "step": 58000 }, { "epoch": 0.24162670532693953, "grad_norm": 0.8474496603012085, "learning_rate": 0.0009226348284880227, "loss": 2.0435, "step": 58500 }, { "epoch": 0.24369189084255438, "grad_norm": 0.8255507349967957, "learning_rate": 0.0009219440556928659, "loss": 2.045, "step": 59000 }, { "epoch": 0.24575707635816926, "grad_norm": 0.7817030549049377, "learning_rate": 0.0009212532828977091, "loss": 2.0472, "step": 59500 }, { "epoch": 0.24782226187378412, "grad_norm": 0.7616594433784485, "learning_rate": 0.0009205625101025521, "loss": 2.0424, "step": 60000 }, { "epoch": 0.249887447389399, "grad_norm": 0.8131653070449829, "learning_rate": 0.0009198717373073953, "loss": 2.0438, "step": 60500 }, { "epoch": 0.25195263290501385, "grad_norm": 0.7939597368240356, "learning_rate": 0.0009191809645122384, "loss": 2.0392, "step": 61000 }, { "epoch": 0.2540178184206287, "grad_norm": 0.823221743106842, "learning_rate": 0.0009184901917170816, "loss": 2.0409, "step": 61500 }, { "epoch": 0.2560830039362436, "grad_norm": 0.8100286722183228, "learning_rate": 0.0009177994189219248, "loss": 2.0352, "step": 62000 }, { "epoch": 0.25814818945185847, "grad_norm": 0.84886634349823, "learning_rate": 0.0009171086461267679, "loss": 2.0395, "step": 62500 }, { "epoch": 0.2602133749674733, "grad_norm": 0.8171844482421875, "learning_rate": 0.000916417873331611, "loss": 2.0374, "step": 63000 }, { "epoch": 0.2622785604830882, "grad_norm": 0.8373914957046509, "learning_rate": 0.0009157271005364541, "loss": 2.0302, "step": 63500 }, { "epoch": 0.2643437459987031, "grad_norm": 0.8553788065910339, "learning_rate": 0.0009150363277412974, "loss": 2.0346, "step": 64000 }, { "epoch": 0.26640893151431794, "grad_norm": 0.8569718599319458, "learning_rate": 0.0009143455549461405, "loss": 2.0347, "step": 64500 }, { "epoch": 0.2684741170299328, "grad_norm": 0.8263908624649048, "learning_rate": 0.0009136547821509836, "loss": 2.0306, "step": 65000 }, { "epoch": 0.27053930254554764, "grad_norm": 0.8501819372177124, "learning_rate": 0.0009129640093558268, "loss": 2.0271, "step": 65500 }, { "epoch": 0.27260448806116255, "grad_norm": 0.8343943357467651, "learning_rate": 0.0009122732365606698, "loss": 2.0296, "step": 66000 }, { "epoch": 0.2746696735767774, "grad_norm": 0.8072646856307983, "learning_rate": 0.000911582463765513, "loss": 2.0238, "step": 66500 }, { "epoch": 0.27673485909239226, "grad_norm": 0.8142940998077393, "learning_rate": 0.0009108916909703562, "loss": 2.0289, "step": 67000 }, { "epoch": 0.2788000446080071, "grad_norm": 0.7751716375350952, "learning_rate": 0.0009102009181751994, "loss": 2.0262, "step": 67500 }, { "epoch": 0.280865230123622, "grad_norm": 0.7758037447929382, "learning_rate": 0.0009095101453800425, "loss": 2.0202, "step": 68000 }, { "epoch": 0.2829304156392369, "grad_norm": 0.8752540349960327, "learning_rate": 0.0009088193725848856, "loss": 2.0195, "step": 68500 }, { "epoch": 0.28499560115485173, "grad_norm": 0.8347713351249695, "learning_rate": 0.0009081285997897288, "loss": 2.0187, "step": 69000 }, { "epoch": 0.2870607866704666, "grad_norm": 0.8156507015228271, "learning_rate": 0.0009074378269945719, "loss": 2.0157, "step": 69500 }, { "epoch": 0.2891259721860815, "grad_norm": 0.7821555137634277, "learning_rate": 0.0009067470541994151, "loss": 2.0152, "step": 70000 }, { "epoch": 0.29119115770169635, "grad_norm": 0.84757399559021, "learning_rate": 0.0009060562814042582, "loss": 2.0157, "step": 70500 }, { "epoch": 0.2932563432173112, "grad_norm": 0.8818306922912598, "learning_rate": 0.0009053655086091014, "loss": 2.0121, "step": 71000 }, { "epoch": 0.29532152873292605, "grad_norm": 0.8257991671562195, "learning_rate": 0.0009046747358139446, "loss": 2.009, "step": 71500 }, { "epoch": 0.29738671424854096, "grad_norm": 0.821416437625885, "learning_rate": 0.0009039839630187876, "loss": 2.0094, "step": 72000 }, { "epoch": 0.2994518997641558, "grad_norm": 0.7886099815368652, "learning_rate": 0.0009032931902236308, "loss": 2.0067, "step": 72500 }, { "epoch": 0.30151708527977067, "grad_norm": 0.8650347590446472, "learning_rate": 0.0009026024174284739, "loss": 2.0046, "step": 73000 }, { "epoch": 0.3035822707953855, "grad_norm": 0.8249508738517761, "learning_rate": 0.0009019116446333171, "loss": 2.0044, "step": 73500 }, { "epoch": 0.30564745631100043, "grad_norm": 0.8648396730422974, "learning_rate": 0.0009012208718381603, "loss": 2.0061, "step": 74000 }, { "epoch": 0.3077126418266153, "grad_norm": 0.8078823089599609, "learning_rate": 0.0009005300990430035, "loss": 2.001, "step": 74500 }, { "epoch": 0.30977782734223014, "grad_norm": 0.8452419638633728, "learning_rate": 0.0008998393262478465, "loss": 1.9992, "step": 75000 }, { "epoch": 0.31184301285784505, "grad_norm": 0.7989551424980164, "learning_rate": 0.0008991485534526896, "loss": 2.007, "step": 75500 }, { "epoch": 0.3139081983734599, "grad_norm": 0.8734456300735474, "learning_rate": 0.0008984577806575328, "loss": 2.0004, "step": 76000 }, { "epoch": 0.31597338388907475, "grad_norm": 0.8965834975242615, "learning_rate": 0.000897767007862376, "loss": 2.0034, "step": 76500 }, { "epoch": 0.3180385694046896, "grad_norm": 0.7855513691902161, "learning_rate": 0.0008970762350672192, "loss": 1.9932, "step": 77000 }, { "epoch": 0.3201037549203045, "grad_norm": 0.825775682926178, "learning_rate": 0.0008963854622720623, "loss": 1.9979, "step": 77500 }, { "epoch": 0.32216894043591937, "grad_norm": 0.7757362127304077, "learning_rate": 0.0008956946894769054, "loss": 1.9974, "step": 78000 }, { "epoch": 0.3242341259515342, "grad_norm": 0.8657450675964355, "learning_rate": 0.0008950039166817485, "loss": 1.9976, "step": 78500 }, { "epoch": 0.3262993114671491, "grad_norm": 0.8072881102561951, "learning_rate": 0.0008943131438865916, "loss": 1.9934, "step": 79000 }, { "epoch": 0.328364496982764, "grad_norm": 0.7893191576004028, "learning_rate": 0.0008936223710914349, "loss": 1.9945, "step": 79500 }, { "epoch": 0.33042968249837884, "grad_norm": 0.8834479451179504, "learning_rate": 0.000892931598296278, "loss": 1.9914, "step": 80000 }, { "epoch": 0.3324948680139937, "grad_norm": 0.8713655471801758, "learning_rate": 0.0008922408255011212, "loss": 1.9946, "step": 80500 }, { "epoch": 0.33456005352960855, "grad_norm": 0.8255290389060974, "learning_rate": 0.0008915500527059643, "loss": 1.9914, "step": 81000 }, { "epoch": 0.33662523904522346, "grad_norm": 0.8153598308563232, "learning_rate": 0.0008908592799108073, "loss": 1.9861, "step": 81500 }, { "epoch": 0.3386904245608383, "grad_norm": 0.8533855080604553, "learning_rate": 0.0008901685071156506, "loss": 1.9887, "step": 82000 }, { "epoch": 0.34075561007645316, "grad_norm": 0.912350594997406, "learning_rate": 0.0008894777343204937, "loss": 1.9923, "step": 82500 }, { "epoch": 0.342820795592068, "grad_norm": 0.8206115365028381, "learning_rate": 0.0008887869615253369, "loss": 1.9869, "step": 83000 }, { "epoch": 0.3448859811076829, "grad_norm": 0.8313278555870056, "learning_rate": 0.00088809618873018, "loss": 1.9863, "step": 83500 }, { "epoch": 0.3469511666232978, "grad_norm": 0.9152906537055969, "learning_rate": 0.0008874054159350233, "loss": 1.9786, "step": 84000 }, { "epoch": 0.34901635213891263, "grad_norm": 0.8398587107658386, "learning_rate": 0.0008867146431398663, "loss": 1.986, "step": 84500 }, { "epoch": 0.3510815376545275, "grad_norm": 0.8084604144096375, "learning_rate": 0.0008860238703447094, "loss": 1.9837, "step": 85000 }, { "epoch": 0.3531467231701424, "grad_norm": 0.7918562889099121, "learning_rate": 0.0008853330975495526, "loss": 1.9789, "step": 85500 }, { "epoch": 0.35521190868575725, "grad_norm": 0.8110492825508118, "learning_rate": 0.0008846423247543957, "loss": 1.9754, "step": 86000 }, { "epoch": 0.3572770942013721, "grad_norm": 0.7786925435066223, "learning_rate": 0.000883951551959239, "loss": 1.9817, "step": 86500 }, { "epoch": 0.35934227971698696, "grad_norm": 0.8928225636482239, "learning_rate": 0.0008832607791640821, "loss": 1.9809, "step": 87000 }, { "epoch": 0.36140746523260187, "grad_norm": 0.821860134601593, "learning_rate": 0.0008825700063689252, "loss": 1.9759, "step": 87500 }, { "epoch": 0.3634726507482167, "grad_norm": 0.8514395952224731, "learning_rate": 0.0008818792335737683, "loss": 1.9739, "step": 88000 }, { "epoch": 0.3655378362638316, "grad_norm": 0.8256642818450928, "learning_rate": 0.0008811884607786114, "loss": 1.9743, "step": 88500 }, { "epoch": 0.3676030217794464, "grad_norm": 0.8043322563171387, "learning_rate": 0.0008804976879834547, "loss": 1.975, "step": 89000 }, { "epoch": 0.36966820729506134, "grad_norm": 0.8065923452377319, "learning_rate": 0.0008798069151882978, "loss": 1.9712, "step": 89500 }, { "epoch": 0.3717333928106762, "grad_norm": 0.8350073099136353, "learning_rate": 0.000879116142393141, "loss": 1.9741, "step": 90000 }, { "epoch": 0.37379857832629104, "grad_norm": 0.8081244230270386, "learning_rate": 0.000878425369597984, "loss": 1.977, "step": 90500 }, { "epoch": 0.3758637638419059, "grad_norm": 0.7285000681877136, "learning_rate": 0.0008777345968028272, "loss": 1.9688, "step": 91000 }, { "epoch": 0.3779289493575208, "grad_norm": 0.8110142350196838, "learning_rate": 0.0008770438240076704, "loss": 1.9675, "step": 91500 }, { "epoch": 0.37999413487313566, "grad_norm": 0.8193402886390686, "learning_rate": 0.0008763530512125135, "loss": 1.9682, "step": 92000 }, { "epoch": 0.3820593203887505, "grad_norm": 0.8382455110549927, "learning_rate": 0.0008756622784173567, "loss": 1.9652, "step": 92500 }, { "epoch": 0.38412450590436537, "grad_norm": 0.7900645732879639, "learning_rate": 0.0008749715056221998, "loss": 1.9654, "step": 93000 }, { "epoch": 0.3861896914199803, "grad_norm": 0.7835169434547424, "learning_rate": 0.0008742807328270429, "loss": 1.9676, "step": 93500 }, { "epoch": 0.38825487693559513, "grad_norm": 0.8066137433052063, "learning_rate": 0.000873589960031886, "loss": 1.9668, "step": 94000 }, { "epoch": 0.39032006245121, "grad_norm": 0.8150152564048767, "learning_rate": 0.0008728991872367293, "loss": 1.9683, "step": 94500 }, { "epoch": 0.39238524796682483, "grad_norm": 1.05111825466156, "learning_rate": 0.0008722084144415724, "loss": 1.9613, "step": 95000 }, { "epoch": 0.39445043348243974, "grad_norm": 0.8422666788101196, "learning_rate": 0.0008715176416464155, "loss": 1.9625, "step": 95500 }, { "epoch": 0.3965156189980546, "grad_norm": 0.8087729215621948, "learning_rate": 0.0008708268688512587, "loss": 1.9657, "step": 96000 }, { "epoch": 0.39858080451366945, "grad_norm": 0.8095026612281799, "learning_rate": 0.0008701360960561017, "loss": 1.9636, "step": 96500 }, { "epoch": 0.4006459900292843, "grad_norm": 0.7824914455413818, "learning_rate": 0.000869445323260945, "loss": 1.9655, "step": 97000 }, { "epoch": 0.4027111755448992, "grad_norm": 0.8077009320259094, "learning_rate": 0.0008687545504657881, "loss": 1.9577, "step": 97500 }, { "epoch": 0.40477636106051407, "grad_norm": 0.7984289526939392, "learning_rate": 0.0008680637776706313, "loss": 1.9575, "step": 98000 }, { "epoch": 0.4068415465761289, "grad_norm": 0.8378064036369324, "learning_rate": 0.0008673730048754744, "loss": 1.9587, "step": 98500 }, { "epoch": 0.4089067320917438, "grad_norm": 0.7952322959899902, "learning_rate": 0.0008666822320803175, "loss": 1.9572, "step": 99000 }, { "epoch": 0.4109719176073587, "grad_norm": 0.9045737385749817, "learning_rate": 0.0008659914592851607, "loss": 1.957, "step": 99500 }, { "epoch": 0.41303710312297354, "grad_norm": 0.8450877666473389, "learning_rate": 0.0008653006864900038, "loss": 1.9573, "step": 100000 }, { "epoch": 0.4151022886385884, "grad_norm": 0.8580604791641235, "learning_rate": 0.000864609913694847, "loss": 1.9561, "step": 100500 }, { "epoch": 0.4171674741542033, "grad_norm": 0.8783984780311584, "learning_rate": 0.0008639191408996901, "loss": 1.9551, "step": 101000 }, { "epoch": 0.41923265966981815, "grad_norm": 0.7707995772361755, "learning_rate": 0.0008632283681045334, "loss": 1.9551, "step": 101500 }, { "epoch": 0.421297845185433, "grad_norm": 0.7902424931526184, "learning_rate": 0.0008625375953093765, "loss": 1.9544, "step": 102000 }, { "epoch": 0.42336303070104786, "grad_norm": 0.850943922996521, "learning_rate": 0.0008618468225142195, "loss": 1.9556, "step": 102500 }, { "epoch": 0.42542821621666277, "grad_norm": 0.918465793132782, "learning_rate": 0.0008611560497190627, "loss": 1.9504, "step": 103000 }, { "epoch": 0.4274934017322776, "grad_norm": 0.8017387390136719, "learning_rate": 0.0008604652769239058, "loss": 1.9467, "step": 103500 }, { "epoch": 0.4295585872478925, "grad_norm": 0.8548043370246887, "learning_rate": 0.000859774504128749, "loss": 1.9488, "step": 104000 }, { "epoch": 0.43162377276350733, "grad_norm": 0.8529847264289856, "learning_rate": 0.0008590837313335922, "loss": 1.9472, "step": 104500 }, { "epoch": 0.43368895827912224, "grad_norm": 0.9331560730934143, "learning_rate": 0.0008583929585384354, "loss": 1.9444, "step": 105000 }, { "epoch": 0.4357541437947371, "grad_norm": 0.7767966985702515, "learning_rate": 0.0008577021857432784, "loss": 1.9437, "step": 105500 }, { "epoch": 0.43781932931035195, "grad_norm": 0.8031103610992432, "learning_rate": 0.0008570114129481215, "loss": 1.9481, "step": 106000 }, { "epoch": 0.4398845148259668, "grad_norm": 0.8879848122596741, "learning_rate": 0.0008563206401529647, "loss": 1.9439, "step": 106500 }, { "epoch": 0.4419497003415817, "grad_norm": 0.8233328461647034, "learning_rate": 0.0008556298673578079, "loss": 1.9485, "step": 107000 }, { "epoch": 0.44401488585719656, "grad_norm": 0.8277767300605774, "learning_rate": 0.0008549390945626511, "loss": 1.9475, "step": 107500 }, { "epoch": 0.4460800713728114, "grad_norm": 0.8291540741920471, "learning_rate": 0.0008542483217674942, "loss": 1.9503, "step": 108000 }, { "epoch": 0.44814525688842627, "grad_norm": 0.8007998466491699, "learning_rate": 0.0008535575489723373, "loss": 1.9441, "step": 108500 }, { "epoch": 0.4502104424040412, "grad_norm": 0.7802460193634033, "learning_rate": 0.0008528667761771804, "loss": 1.9422, "step": 109000 }, { "epoch": 0.45227562791965603, "grad_norm": 0.7900969982147217, "learning_rate": 0.0008521760033820236, "loss": 1.9418, "step": 109500 }, { "epoch": 0.4543408134352709, "grad_norm": 0.958767294883728, "learning_rate": 0.0008514852305868668, "loss": 1.9431, "step": 110000 }, { "epoch": 0.45640599895088574, "grad_norm": 0.8186129331588745, "learning_rate": 0.0008507944577917099, "loss": 1.9368, "step": 110500 }, { "epoch": 0.45847118446650065, "grad_norm": 0.7958455085754395, "learning_rate": 0.0008501036849965531, "loss": 1.9368, "step": 111000 }, { "epoch": 0.4605363699821155, "grad_norm": 1.3525958061218262, "learning_rate": 0.0008494129122013961, "loss": 1.936, "step": 111500 }, { "epoch": 0.46260155549773035, "grad_norm": 0.8453717827796936, "learning_rate": 0.0008487221394062394, "loss": 1.9438, "step": 112000 }, { "epoch": 0.4646667410133452, "grad_norm": 0.8021391034126282, "learning_rate": 0.0008480313666110825, "loss": 1.9428, "step": 112500 }, { "epoch": 0.4667319265289601, "grad_norm": 0.8905833959579468, "learning_rate": 0.0008473405938159256, "loss": 1.9416, "step": 113000 }, { "epoch": 0.46879711204457497, "grad_norm": 0.789579451084137, "learning_rate": 0.0008466498210207688, "loss": 1.9406, "step": 113500 }, { "epoch": 0.4708622975601898, "grad_norm": 0.8398124575614929, "learning_rate": 0.000845959048225612, "loss": 1.935, "step": 114000 }, { "epoch": 0.4729274830758047, "grad_norm": 0.8189172148704529, "learning_rate": 0.0008452682754304551, "loss": 1.9367, "step": 114500 }, { "epoch": 0.4749926685914196, "grad_norm": 0.7979219555854797, "learning_rate": 0.0008445775026352982, "loss": 1.9309, "step": 115000 }, { "epoch": 0.47705785410703444, "grad_norm": 0.9062512516975403, "learning_rate": 0.0008438867298401414, "loss": 1.9389, "step": 115500 }, { "epoch": 0.4791230396226493, "grad_norm": 0.9431639909744263, "learning_rate": 0.0008431959570449845, "loss": 1.9307, "step": 116000 }, { "epoch": 0.48118822513826415, "grad_norm": 0.8639684319496155, "learning_rate": 0.0008425051842498276, "loss": 1.9325, "step": 116500 }, { "epoch": 0.48325341065387906, "grad_norm": 0.8229732513427734, "learning_rate": 0.0008418144114546709, "loss": 1.9323, "step": 117000 }, { "epoch": 0.4853185961694939, "grad_norm": 0.789789080619812, "learning_rate": 0.0008411236386595139, "loss": 1.9317, "step": 117500 }, { "epoch": 0.48738378168510876, "grad_norm": 0.8473231196403503, "learning_rate": 0.0008404328658643571, "loss": 1.9332, "step": 118000 }, { "epoch": 0.4894489672007236, "grad_norm": 0.9255551099777222, "learning_rate": 0.0008397420930692002, "loss": 1.9309, "step": 118500 }, { "epoch": 0.4915141527163385, "grad_norm": 0.7924582958221436, "learning_rate": 0.0008390513202740435, "loss": 1.9277, "step": 119000 }, { "epoch": 0.4935793382319534, "grad_norm": 0.8907535672187805, "learning_rate": 0.0008383605474788866, "loss": 1.9294, "step": 119500 }, { "epoch": 0.49564452374756823, "grad_norm": 0.8191530108451843, "learning_rate": 0.0008376697746837297, "loss": 1.9306, "step": 120000 }, { "epoch": 0.4977097092631831, "grad_norm": 0.8925333023071289, "learning_rate": 0.0008369790018885728, "loss": 1.9229, "step": 120500 }, { "epoch": 0.499774894778798, "grad_norm": 0.8087531924247742, "learning_rate": 0.0008362882290934159, "loss": 1.9243, "step": 121000 }, { "epoch": 0.5018400802944128, "grad_norm": 0.8658357858657837, "learning_rate": 0.0008355974562982591, "loss": 1.9296, "step": 121500 }, { "epoch": 0.5039052658100277, "grad_norm": 0.8883163332939148, "learning_rate": 0.0008349066835031023, "loss": 1.9228, "step": 122000 }, { "epoch": 0.5059704513256426, "grad_norm": 0.9020292162895203, "learning_rate": 0.0008342159107079455, "loss": 1.9242, "step": 122500 }, { "epoch": 0.5080356368412574, "grad_norm": 0.7825981974601746, "learning_rate": 0.0008335251379127886, "loss": 1.9244, "step": 123000 }, { "epoch": 0.5101008223568723, "grad_norm": 0.7903372645378113, "learning_rate": 0.0008328343651176316, "loss": 1.9263, "step": 123500 }, { "epoch": 0.5121660078724872, "grad_norm": 0.8415020108222961, "learning_rate": 0.0008321435923224748, "loss": 1.9243, "step": 124000 }, { "epoch": 0.5142311933881021, "grad_norm": 0.8838851451873779, "learning_rate": 0.000831452819527318, "loss": 1.9229, "step": 124500 }, { "epoch": 0.5162963789037169, "grad_norm": 0.8412485718727112, "learning_rate": 0.0008307620467321612, "loss": 1.9198, "step": 125000 }, { "epoch": 0.5183615644193318, "grad_norm": 0.8944464921951294, "learning_rate": 0.0008300712739370043, "loss": 1.9242, "step": 125500 }, { "epoch": 0.5204267499349466, "grad_norm": 0.8970022797584534, "learning_rate": 0.0008293805011418475, "loss": 1.9157, "step": 126000 }, { "epoch": 0.5224919354505615, "grad_norm": 0.7767829895019531, "learning_rate": 0.0008286897283466905, "loss": 1.9146, "step": 126500 }, { "epoch": 0.5245571209661763, "grad_norm": 0.9366709589958191, "learning_rate": 0.0008279989555515337, "loss": 1.9219, "step": 127000 }, { "epoch": 0.5266223064817913, "grad_norm": 0.813752293586731, "learning_rate": 0.0008273081827563769, "loss": 1.9131, "step": 127500 }, { "epoch": 0.5286874919974062, "grad_norm": 0.7913943529129028, "learning_rate": 0.00082661740996122, "loss": 1.9151, "step": 128000 }, { "epoch": 0.530752677513021, "grad_norm": 0.7573590278625488, "learning_rate": 0.0008259266371660632, "loss": 1.9141, "step": 128500 }, { "epoch": 0.5328178630286359, "grad_norm": 0.8860184550285339, "learning_rate": 0.0008252358643709063, "loss": 1.9157, "step": 129000 }, { "epoch": 0.5348830485442507, "grad_norm": 0.7423400282859802, "learning_rate": 0.0008245450915757495, "loss": 1.9123, "step": 129500 }, { "epoch": 0.5369482340598656, "grad_norm": 0.7855700254440308, "learning_rate": 0.0008238543187805926, "loss": 1.9156, "step": 130000 }, { "epoch": 0.5390134195754804, "grad_norm": 0.7748924493789673, "learning_rate": 0.0008231635459854357, "loss": 1.9125, "step": 130500 }, { "epoch": 0.5410786050910953, "grad_norm": 0.823998212814331, "learning_rate": 0.0008224727731902789, "loss": 1.914, "step": 131000 }, { "epoch": 0.5431437906067103, "grad_norm": 0.837291955947876, "learning_rate": 0.000821782000395122, "loss": 1.9135, "step": 131500 }, { "epoch": 0.5452089761223251, "grad_norm": 0.8040900230407715, "learning_rate": 0.0008210912275999653, "loss": 1.9167, "step": 132000 }, { "epoch": 0.54727416163794, "grad_norm": 0.8205652236938477, "learning_rate": 0.0008204004548048083, "loss": 1.9111, "step": 132500 }, { "epoch": 0.5493393471535548, "grad_norm": 0.9085518717765808, "learning_rate": 0.0008197096820096515, "loss": 1.9082, "step": 133000 }, { "epoch": 0.5514045326691697, "grad_norm": 0.9547085165977478, "learning_rate": 0.0008190189092144946, "loss": 1.9066, "step": 133500 }, { "epoch": 0.5534697181847845, "grad_norm": 0.8351136445999146, "learning_rate": 0.0008183281364193377, "loss": 1.9152, "step": 134000 }, { "epoch": 0.5555349037003994, "grad_norm": 0.814534068107605, "learning_rate": 0.000817637363624181, "loss": 1.9093, "step": 134500 }, { "epoch": 0.5576000892160142, "grad_norm": 0.8208035826683044, "learning_rate": 0.0008169465908290241, "loss": 1.9107, "step": 135000 }, { "epoch": 0.5596652747316292, "grad_norm": 0.8544581532478333, "learning_rate": 0.0008162558180338672, "loss": 1.9092, "step": 135500 }, { "epoch": 0.561730460247244, "grad_norm": 0.8623299598693848, "learning_rate": 0.0008155650452387103, "loss": 1.9097, "step": 136000 }, { "epoch": 0.5637956457628589, "grad_norm": 0.8688506484031677, "learning_rate": 0.0008148742724435535, "loss": 1.9103, "step": 136500 }, { "epoch": 0.5658608312784738, "grad_norm": 0.8412228226661682, "learning_rate": 0.0008141834996483967, "loss": 1.9074, "step": 137000 }, { "epoch": 0.5679260167940886, "grad_norm": 0.8734971880912781, "learning_rate": 0.0008134927268532398, "loss": 1.906, "step": 137500 }, { "epoch": 0.5699912023097035, "grad_norm": 0.8894969820976257, "learning_rate": 0.000812801954058083, "loss": 1.9072, "step": 138000 }, { "epoch": 0.5720563878253183, "grad_norm": 0.7939966320991516, "learning_rate": 0.0008121111812629261, "loss": 1.908, "step": 138500 }, { "epoch": 0.5741215733409332, "grad_norm": 0.8480666875839233, "learning_rate": 0.0008114204084677692, "loss": 1.9031, "step": 139000 }, { "epoch": 0.5761867588565481, "grad_norm": 0.7555306553840637, "learning_rate": 0.0008107296356726124, "loss": 1.902, "step": 139500 }, { "epoch": 0.578251944372163, "grad_norm": 0.8896836638450623, "learning_rate": 0.0008100388628774556, "loss": 1.9075, "step": 140000 }, { "epoch": 0.5803171298877778, "grad_norm": 0.9097606539726257, "learning_rate": 0.0008093480900822987, "loss": 1.9072, "step": 140500 }, { "epoch": 0.5823823154033927, "grad_norm": 0.9234623312950134, "learning_rate": 0.0008086573172871418, "loss": 1.9053, "step": 141000 }, { "epoch": 0.5844475009190075, "grad_norm": 0.9804132580757141, "learning_rate": 0.000807966544491985, "loss": 1.8995, "step": 141500 }, { "epoch": 0.5865126864346224, "grad_norm": 0.8714466691017151, "learning_rate": 0.0008072757716968281, "loss": 1.9036, "step": 142000 }, { "epoch": 0.5885778719502373, "grad_norm": 0.8345698714256287, "learning_rate": 0.0008065849989016713, "loss": 1.8996, "step": 142500 }, { "epoch": 0.5906430574658521, "grad_norm": 0.8244128227233887, "learning_rate": 0.0008058942261065144, "loss": 1.9, "step": 143000 }, { "epoch": 0.5927082429814671, "grad_norm": 0.8433374166488647, "learning_rate": 0.0008052034533113576, "loss": 1.8983, "step": 143500 }, { "epoch": 0.5947734284970819, "grad_norm": 0.9245389699935913, "learning_rate": 0.0008045126805162007, "loss": 1.8985, "step": 144000 }, { "epoch": 0.5968386140126968, "grad_norm": 0.8123714923858643, "learning_rate": 0.0008038219077210439, "loss": 1.8955, "step": 144500 }, { "epoch": 0.5989037995283116, "grad_norm": 0.834078848361969, "learning_rate": 0.000803131134925887, "loss": 1.8985, "step": 145000 }, { "epoch": 0.6009689850439265, "grad_norm": 0.8230902552604675, "learning_rate": 0.0008024403621307301, "loss": 1.8993, "step": 145500 }, { "epoch": 0.6030341705595413, "grad_norm": 0.7516800761222839, "learning_rate": 0.0008017495893355733, "loss": 1.8954, "step": 146000 }, { "epoch": 0.6050993560751562, "grad_norm": 0.8156014084815979, "learning_rate": 0.0008010588165404164, "loss": 1.8934, "step": 146500 }, { "epoch": 0.607164541590771, "grad_norm": 0.8357443809509277, "learning_rate": 0.0008003680437452597, "loss": 1.8985, "step": 147000 }, { "epoch": 0.609229727106386, "grad_norm": 0.8833040595054626, "learning_rate": 0.0007996772709501028, "loss": 1.8951, "step": 147500 }, { "epoch": 0.6112949126220009, "grad_norm": 0.9052265286445618, "learning_rate": 0.0007989864981549458, "loss": 1.8919, "step": 148000 }, { "epoch": 0.6133600981376157, "grad_norm": 0.7939783334732056, "learning_rate": 0.000798295725359789, "loss": 1.8982, "step": 148500 }, { "epoch": 0.6154252836532306, "grad_norm": 0.8598021864891052, "learning_rate": 0.0007976049525646321, "loss": 1.8955, "step": 149000 }, { "epoch": 0.6174904691688454, "grad_norm": 0.7993877530097961, "learning_rate": 0.0007969141797694754, "loss": 1.894, "step": 149500 }, { "epoch": 0.6195556546844603, "grad_norm": 0.8220402002334595, "learning_rate": 0.0007962234069743185, "loss": 1.8963, "step": 150000 }, { "epoch": 0.6216208402000751, "grad_norm": 0.9298192262649536, "learning_rate": 0.0007955326341791617, "loss": 1.8902, "step": 150500 }, { "epoch": 0.6236860257156901, "grad_norm": 0.7912063002586365, "learning_rate": 0.0007948418613840047, "loss": 1.8918, "step": 151000 }, { "epoch": 0.625751211231305, "grad_norm": 0.907156765460968, "learning_rate": 0.0007941510885888478, "loss": 1.8929, "step": 151500 }, { "epoch": 0.6278163967469198, "grad_norm": 0.8619490265846252, "learning_rate": 0.0007934603157936911, "loss": 1.8881, "step": 152000 }, { "epoch": 0.6298815822625347, "grad_norm": 0.8170045018196106, "learning_rate": 0.0007927695429985342, "loss": 1.8879, "step": 152500 }, { "epoch": 0.6319467677781495, "grad_norm": 0.7822418212890625, "learning_rate": 0.0007920787702033774, "loss": 1.8956, "step": 153000 }, { "epoch": 0.6340119532937644, "grad_norm": 0.878753125667572, "learning_rate": 0.0007913879974082205, "loss": 1.8914, "step": 153500 }, { "epoch": 0.6360771388093792, "grad_norm": 0.8338424563407898, "learning_rate": 0.0007906972246130636, "loss": 1.8911, "step": 154000 }, { "epoch": 0.6381423243249941, "grad_norm": 0.8565462827682495, "learning_rate": 0.0007900064518179068, "loss": 1.8881, "step": 154500 }, { "epoch": 0.640207509840609, "grad_norm": 0.82133948802948, "learning_rate": 0.0007893156790227499, "loss": 1.8884, "step": 155000 }, { "epoch": 0.6422726953562239, "grad_norm": 0.9342901706695557, "learning_rate": 0.0007886249062275931, "loss": 1.8865, "step": 155500 }, { "epoch": 0.6443378808718387, "grad_norm": 0.8597960472106934, "learning_rate": 0.0007879341334324362, "loss": 1.8893, "step": 156000 }, { "epoch": 0.6464030663874536, "grad_norm": 0.816633939743042, "learning_rate": 0.0007872433606372795, "loss": 1.8831, "step": 156500 }, { "epoch": 0.6484682519030684, "grad_norm": 0.8402358293533325, "learning_rate": 0.0007865525878421225, "loss": 1.8844, "step": 157000 }, { "epoch": 0.6505334374186833, "grad_norm": 0.8066496253013611, "learning_rate": 0.0007858618150469657, "loss": 1.8879, "step": 157500 }, { "epoch": 0.6525986229342982, "grad_norm": 0.7855266332626343, "learning_rate": 0.0007851710422518088, "loss": 1.8883, "step": 158000 }, { "epoch": 0.654663808449913, "grad_norm": 0.8272327184677124, "learning_rate": 0.0007844802694566519, "loss": 1.8823, "step": 158500 }, { "epoch": 0.656728993965528, "grad_norm": 0.7959176898002625, "learning_rate": 0.0007837894966614951, "loss": 1.8862, "step": 159000 }, { "epoch": 0.6587941794811428, "grad_norm": 0.8315137028694153, "learning_rate": 0.0007830987238663383, "loss": 1.8839, "step": 159500 }, { "epoch": 0.6608593649967577, "grad_norm": 0.8382706046104431, "learning_rate": 0.0007824079510711814, "loss": 1.8838, "step": 160000 }, { "epoch": 0.6629245505123725, "grad_norm": 0.7986578941345215, "learning_rate": 0.0007817171782760245, "loss": 1.8825, "step": 160500 }, { "epoch": 0.6649897360279874, "grad_norm": 0.8452582359313965, "learning_rate": 0.0007810264054808677, "loss": 1.884, "step": 161000 }, { "epoch": 0.6670549215436022, "grad_norm": 0.86090487241745, "learning_rate": 0.0007803356326857108, "loss": 1.8837, "step": 161500 }, { "epoch": 0.6691201070592171, "grad_norm": 0.8608242273330688, "learning_rate": 0.000779644859890554, "loss": 1.8791, "step": 162000 }, { "epoch": 0.671185292574832, "grad_norm": 0.8503440618515015, "learning_rate": 0.0007789540870953972, "loss": 1.8771, "step": 162500 }, { "epoch": 0.6732504780904469, "grad_norm": 0.7802348136901855, "learning_rate": 0.0007782633143002402, "loss": 1.8848, "step": 163000 }, { "epoch": 0.6753156636060618, "grad_norm": 0.9252862930297852, "learning_rate": 0.0007775725415050834, "loss": 1.8791, "step": 163500 }, { "epoch": 0.6773808491216766, "grad_norm": 0.8752533793449402, "learning_rate": 0.0007768817687099265, "loss": 1.881, "step": 164000 }, { "epoch": 0.6794460346372915, "grad_norm": 0.9123765826225281, "learning_rate": 0.0007761909959147698, "loss": 1.8809, "step": 164500 }, { "epoch": 0.6815112201529063, "grad_norm": 0.8338991403579712, "learning_rate": 0.0007755002231196129, "loss": 1.8786, "step": 165000 }, { "epoch": 0.6835764056685212, "grad_norm": 0.8287580609321594, "learning_rate": 0.000774809450324456, "loss": 1.8797, "step": 165500 }, { "epoch": 0.685641591184136, "grad_norm": 0.8854800462722778, "learning_rate": 0.0007741186775292991, "loss": 1.8799, "step": 166000 }, { "epoch": 0.6877067766997509, "grad_norm": 0.8071344494819641, "learning_rate": 0.0007734279047341422, "loss": 1.8738, "step": 166500 }, { "epoch": 0.6897719622153659, "grad_norm": 0.8015414476394653, "learning_rate": 0.0007727371319389855, "loss": 1.8764, "step": 167000 }, { "epoch": 0.6918371477309807, "grad_norm": 0.8209612965583801, "learning_rate": 0.0007720463591438286, "loss": 1.8802, "step": 167500 }, { "epoch": 0.6939023332465956, "grad_norm": 0.8554903268814087, "learning_rate": 0.0007713555863486718, "loss": 1.8742, "step": 168000 }, { "epoch": 0.6959675187622104, "grad_norm": 0.8712402582168579, "learning_rate": 0.0007706648135535149, "loss": 1.8761, "step": 168500 }, { "epoch": 0.6980327042778253, "grad_norm": 0.8566715121269226, "learning_rate": 0.0007699740407583579, "loss": 1.8742, "step": 169000 }, { "epoch": 0.7000978897934401, "grad_norm": 0.8078393340110779, "learning_rate": 0.0007692832679632012, "loss": 1.8723, "step": 169500 }, { "epoch": 0.702163075309055, "grad_norm": 0.8996677994728088, "learning_rate": 0.0007685924951680443, "loss": 1.8729, "step": 170000 }, { "epoch": 0.7042282608246698, "grad_norm": 0.8081231117248535, "learning_rate": 0.0007679017223728875, "loss": 1.8749, "step": 170500 }, { "epoch": 0.7062934463402848, "grad_norm": 0.8042668104171753, "learning_rate": 0.0007672109495777306, "loss": 1.869, "step": 171000 }, { "epoch": 0.7083586318558996, "grad_norm": 0.8018625378608704, "learning_rate": 0.0007665201767825738, "loss": 1.87, "step": 171500 }, { "epoch": 0.7104238173715145, "grad_norm": 0.8580893874168396, "learning_rate": 0.0007658294039874169, "loss": 1.874, "step": 172000 }, { "epoch": 0.7124890028871294, "grad_norm": 0.8910616636276245, "learning_rate": 0.00076513863119226, "loss": 1.8749, "step": 172500 }, { "epoch": 0.7145541884027442, "grad_norm": 0.9036041498184204, "learning_rate": 0.0007644478583971032, "loss": 1.874, "step": 173000 }, { "epoch": 0.7166193739183591, "grad_norm": 0.8568321466445923, "learning_rate": 0.0007637570856019463, "loss": 1.8683, "step": 173500 }, { "epoch": 0.7186845594339739, "grad_norm": 0.7695671916007996, "learning_rate": 0.0007630663128067895, "loss": 1.8677, "step": 174000 }, { "epoch": 0.7207497449495888, "grad_norm": 0.9139420390129089, "learning_rate": 0.0007623755400116327, "loss": 1.8694, "step": 174500 }, { "epoch": 0.7228149304652037, "grad_norm": 0.8462100625038147, "learning_rate": 0.0007616847672164757, "loss": 1.8714, "step": 175000 }, { "epoch": 0.7248801159808186, "grad_norm": 0.8447960615158081, "learning_rate": 0.0007609939944213189, "loss": 1.8669, "step": 175500 }, { "epoch": 0.7269453014964334, "grad_norm": 0.810688316822052, "learning_rate": 0.000760303221626162, "loss": 1.8637, "step": 176000 }, { "epoch": 0.7290104870120483, "grad_norm": 0.7696130871772766, "learning_rate": 0.0007596124488310052, "loss": 1.8658, "step": 176500 }, { "epoch": 0.7310756725276631, "grad_norm": 0.8709802031517029, "learning_rate": 0.0007589216760358484, "loss": 1.8664, "step": 177000 }, { "epoch": 0.733140858043278, "grad_norm": 0.8365340828895569, "learning_rate": 0.0007582309032406916, "loss": 1.8656, "step": 177500 }, { "epoch": 0.7352060435588929, "grad_norm": 1.0223950147628784, "learning_rate": 0.0007575401304455346, "loss": 1.8694, "step": 178000 }, { "epoch": 0.7372712290745078, "grad_norm": 0.829572319984436, "learning_rate": 0.0007568493576503777, "loss": 1.8708, "step": 178500 }, { "epoch": 0.7393364145901227, "grad_norm": 0.80058354139328, "learning_rate": 0.0007561585848552209, "loss": 1.8687, "step": 179000 }, { "epoch": 0.7414016001057375, "grad_norm": 0.8351449370384216, "learning_rate": 0.0007554678120600641, "loss": 1.8639, "step": 179500 }, { "epoch": 0.7434667856213524, "grad_norm": 0.905135989189148, "learning_rate": 0.0007547770392649073, "loss": 1.8645, "step": 180000 }, { "epoch": 0.7455319711369672, "grad_norm": 0.8477722406387329, "learning_rate": 0.0007540862664697504, "loss": 1.8615, "step": 180500 }, { "epoch": 0.7475971566525821, "grad_norm": 0.9718809723854065, "learning_rate": 0.0007533954936745935, "loss": 1.8575, "step": 181000 }, { "epoch": 0.7496623421681969, "grad_norm": 0.9097675681114197, "learning_rate": 0.0007527047208794366, "loss": 1.8632, "step": 181500 }, { "epoch": 0.7517275276838118, "grad_norm": 0.9181948900222778, "learning_rate": 0.0007520139480842798, "loss": 1.864, "step": 182000 }, { "epoch": 0.7537927131994268, "grad_norm": 0.8058791160583496, "learning_rate": 0.000751323175289123, "loss": 1.8585, "step": 182500 }, { "epoch": 0.7558578987150416, "grad_norm": 0.818452775478363, "learning_rate": 0.0007506324024939661, "loss": 1.8621, "step": 183000 }, { "epoch": 0.7579230842306565, "grad_norm": 0.8789253830909729, "learning_rate": 0.0007499416296988093, "loss": 1.8567, "step": 183500 }, { "epoch": 0.7599882697462713, "grad_norm": 0.8578682541847229, "learning_rate": 0.0007492508569036523, "loss": 1.8615, "step": 184000 }, { "epoch": 0.7620534552618862, "grad_norm": 0.8071935176849365, "learning_rate": 0.0007485600841084956, "loss": 1.8589, "step": 184500 }, { "epoch": 0.764118640777501, "grad_norm": 0.8300654888153076, "learning_rate": 0.0007478693113133387, "loss": 1.8588, "step": 185000 }, { "epoch": 0.7661838262931159, "grad_norm": 0.8175327181816101, "learning_rate": 0.0007471785385181818, "loss": 1.8617, "step": 185500 }, { "epoch": 0.7682490118087307, "grad_norm": 0.8610235452651978, "learning_rate": 0.000746487765723025, "loss": 1.8561, "step": 186000 }, { "epoch": 0.7703141973243457, "grad_norm": 0.857377290725708, "learning_rate": 0.0007457969929278681, "loss": 1.856, "step": 186500 }, { "epoch": 0.7723793828399605, "grad_norm": 0.8002254366874695, "learning_rate": 0.0007451062201327113, "loss": 1.8609, "step": 187000 }, { "epoch": 0.7744445683555754, "grad_norm": 0.9635730385780334, "learning_rate": 0.0007444154473375544, "loss": 1.8592, "step": 187500 }, { "epoch": 0.7765097538711903, "grad_norm": 0.8251007795333862, "learning_rate": 0.0007437246745423976, "loss": 1.8623, "step": 188000 }, { "epoch": 0.7785749393868051, "grad_norm": 0.8280484676361084, "learning_rate": 0.0007430339017472407, "loss": 1.855, "step": 188500 }, { "epoch": 0.78064012490242, "grad_norm": 0.7635123133659363, "learning_rate": 0.0007423431289520838, "loss": 1.8559, "step": 189000 }, { "epoch": 0.7827053104180348, "grad_norm": 0.7589561939239502, "learning_rate": 0.0007416523561569271, "loss": 1.8576, "step": 189500 }, { "epoch": 0.7847704959336497, "grad_norm": 0.876846969127655, "learning_rate": 0.0007409615833617701, "loss": 1.8571, "step": 190000 }, { "epoch": 0.7868356814492646, "grad_norm": 0.9164223074913025, "learning_rate": 0.0007402708105666133, "loss": 1.8542, "step": 190500 }, { "epoch": 0.7889008669648795, "grad_norm": 0.9645445346832275, "learning_rate": 0.0007395800377714564, "loss": 1.8584, "step": 191000 }, { "epoch": 0.7909660524804943, "grad_norm": 0.8780491948127747, "learning_rate": 0.0007388892649762996, "loss": 1.862, "step": 191500 }, { "epoch": 0.7930312379961092, "grad_norm": 0.8747962117195129, "learning_rate": 0.0007381984921811428, "loss": 1.8544, "step": 192000 }, { "epoch": 0.795096423511724, "grad_norm": 0.8750070333480835, "learning_rate": 0.0007375077193859859, "loss": 1.8562, "step": 192500 }, { "epoch": 0.7971616090273389, "grad_norm": 0.7979694604873657, "learning_rate": 0.000736816946590829, "loss": 1.8559, "step": 193000 }, { "epoch": 0.7992267945429538, "grad_norm": 0.8153182864189148, "learning_rate": 0.0007361261737956721, "loss": 1.8497, "step": 193500 }, { "epoch": 0.8012919800585686, "grad_norm": 0.9015457034111023, "learning_rate": 0.0007354354010005153, "loss": 1.85, "step": 194000 }, { "epoch": 0.8033571655741836, "grad_norm": 0.845658540725708, "learning_rate": 0.0007347446282053585, "loss": 1.853, "step": 194500 }, { "epoch": 0.8054223510897984, "grad_norm": 0.839846670627594, "learning_rate": 0.0007340538554102017, "loss": 1.8511, "step": 195000 }, { "epoch": 0.8074875366054133, "grad_norm": 0.8285427689552307, "learning_rate": 0.0007333630826150448, "loss": 1.8512, "step": 195500 }, { "epoch": 0.8095527221210281, "grad_norm": 0.8489523530006409, "learning_rate": 0.0007326723098198878, "loss": 1.8538, "step": 196000 }, { "epoch": 0.811617907636643, "grad_norm": 0.8332532644271851, "learning_rate": 0.000731981537024731, "loss": 1.8528, "step": 196500 }, { "epoch": 0.8136830931522578, "grad_norm": 0.8185180425643921, "learning_rate": 0.0007312907642295742, "loss": 1.8472, "step": 197000 }, { "epoch": 0.8157482786678727, "grad_norm": 0.8716513514518738, "learning_rate": 0.0007305999914344174, "loss": 1.8487, "step": 197500 }, { "epoch": 0.8178134641834875, "grad_norm": 0.8488348126411438, "learning_rate": 0.0007299092186392605, "loss": 1.8487, "step": 198000 }, { "epoch": 0.8198786496991025, "grad_norm": 0.7853295207023621, "learning_rate": 0.0007292184458441037, "loss": 1.8467, "step": 198500 }, { "epoch": 0.8219438352147174, "grad_norm": 0.8092118501663208, "learning_rate": 0.0007285276730489468, "loss": 1.8454, "step": 199000 }, { "epoch": 0.8240090207303322, "grad_norm": 0.8414338231086731, "learning_rate": 0.0007278369002537899, "loss": 1.8456, "step": 199500 }, { "epoch": 0.8260742062459471, "grad_norm": 0.7936431765556335, "learning_rate": 0.0007271461274586331, "loss": 1.8455, "step": 200000 }, { "epoch": 0.8281393917615619, "grad_norm": 0.8624149560928345, "learning_rate": 0.0007264553546634762, "loss": 1.8417, "step": 200500 }, { "epoch": 0.8302045772771768, "grad_norm": 0.7787384986877441, "learning_rate": 0.0007257645818683194, "loss": 1.8469, "step": 201000 }, { "epoch": 0.8322697627927916, "grad_norm": 0.7881982922554016, "learning_rate": 0.0007250738090731625, "loss": 1.8469, "step": 201500 }, { "epoch": 0.8343349483084066, "grad_norm": 0.8017438650131226, "learning_rate": 0.0007243830362780058, "loss": 1.8477, "step": 202000 }, { "epoch": 0.8364001338240215, "grad_norm": 0.8839012980461121, "learning_rate": 0.0007236922634828488, "loss": 1.8433, "step": 202500 }, { "epoch": 0.8384653193396363, "grad_norm": 0.8032566905021667, "learning_rate": 0.0007230014906876919, "loss": 1.845, "step": 203000 }, { "epoch": 0.8405305048552512, "grad_norm": 0.9038089513778687, "learning_rate": 0.0007223107178925351, "loss": 1.8425, "step": 203500 }, { "epoch": 0.842595690370866, "grad_norm": 0.9411084651947021, "learning_rate": 0.0007216199450973782, "loss": 1.8414, "step": 204000 }, { "epoch": 0.8446608758864809, "grad_norm": 0.8129530549049377, "learning_rate": 0.0007209291723022215, "loss": 1.8413, "step": 204500 }, { "epoch": 0.8467260614020957, "grad_norm": 0.7938794493675232, "learning_rate": 0.0007202383995070646, "loss": 1.8419, "step": 205000 }, { "epoch": 0.8487912469177106, "grad_norm": 0.9388673305511475, "learning_rate": 0.0007195476267119077, "loss": 1.8433, "step": 205500 }, { "epoch": 0.8508564324333255, "grad_norm": 0.9263845682144165, "learning_rate": 0.0007188568539167508, "loss": 1.8407, "step": 206000 }, { "epoch": 0.8529216179489404, "grad_norm": 0.7958715558052063, "learning_rate": 0.0007181660811215939, "loss": 1.8464, "step": 206500 }, { "epoch": 0.8549868034645552, "grad_norm": 0.7796356081962585, "learning_rate": 0.0007174753083264372, "loss": 1.8444, "step": 207000 }, { "epoch": 0.8570519889801701, "grad_norm": 0.8039528727531433, "learning_rate": 0.0007167845355312803, "loss": 1.8474, "step": 207500 }, { "epoch": 0.859117174495785, "grad_norm": 0.8445290327072144, "learning_rate": 0.0007160937627361235, "loss": 1.8405, "step": 208000 }, { "epoch": 0.8611823600113998, "grad_norm": 0.8098761439323425, "learning_rate": 0.0007154029899409665, "loss": 1.8395, "step": 208500 }, { "epoch": 0.8632475455270147, "grad_norm": 0.8343963027000427, "learning_rate": 0.0007147122171458097, "loss": 1.8362, "step": 209000 }, { "epoch": 0.8653127310426295, "grad_norm": 0.8452053666114807, "learning_rate": 0.0007140214443506529, "loss": 1.8431, "step": 209500 }, { "epoch": 0.8673779165582445, "grad_norm": 0.8454539179801941, "learning_rate": 0.000713330671555496, "loss": 1.8327, "step": 210000 }, { "epoch": 0.8694431020738593, "grad_norm": 0.7599641680717468, "learning_rate": 0.0007126398987603392, "loss": 1.8335, "step": 210500 }, { "epoch": 0.8715082875894742, "grad_norm": 0.8617073893547058, "learning_rate": 0.0007119491259651823, "loss": 1.8381, "step": 211000 }, { "epoch": 0.873573473105089, "grad_norm": 0.8182563781738281, "learning_rate": 0.0007112583531700254, "loss": 1.8359, "step": 211500 }, { "epoch": 0.8756386586207039, "grad_norm": 0.8188121318817139, "learning_rate": 0.0007105675803748686, "loss": 1.8362, "step": 212000 }, { "epoch": 0.8777038441363187, "grad_norm": 0.7888435125350952, "learning_rate": 0.0007098768075797118, "loss": 1.8389, "step": 212500 }, { "epoch": 0.8797690296519336, "grad_norm": 0.8186080455780029, "learning_rate": 0.0007091860347845549, "loss": 1.8356, "step": 213000 }, { "epoch": 0.8818342151675485, "grad_norm": 0.7884934544563293, "learning_rate": 0.000708495261989398, "loss": 1.8356, "step": 213500 }, { "epoch": 0.8838994006831634, "grad_norm": 0.8210222125053406, "learning_rate": 0.0007078044891942412, "loss": 1.8348, "step": 214000 }, { "epoch": 0.8859645861987783, "grad_norm": 0.868903398513794, "learning_rate": 0.0007071137163990843, "loss": 1.8349, "step": 214500 }, { "epoch": 0.8880297717143931, "grad_norm": 0.8679877519607544, "learning_rate": 0.0007064229436039275, "loss": 1.834, "step": 215000 }, { "epoch": 0.890094957230008, "grad_norm": 0.8414639234542847, "learning_rate": 0.0007057321708087706, "loss": 1.8348, "step": 215500 }, { "epoch": 0.8921601427456228, "grad_norm": 0.8036888241767883, "learning_rate": 0.0007050413980136138, "loss": 1.8331, "step": 216000 }, { "epoch": 0.8942253282612377, "grad_norm": 0.833270251750946, "learning_rate": 0.0007043506252184569, "loss": 1.8317, "step": 216500 }, { "epoch": 0.8962905137768525, "grad_norm": 0.7350865602493286, "learning_rate": 0.0007036598524233001, "loss": 1.8305, "step": 217000 }, { "epoch": 0.8983556992924674, "grad_norm": 0.8501140475273132, "learning_rate": 0.0007029690796281432, "loss": 1.8278, "step": 217500 }, { "epoch": 0.9004208848080824, "grad_norm": 0.82123202085495, "learning_rate": 0.0007022783068329863, "loss": 1.8285, "step": 218000 }, { "epoch": 0.9024860703236972, "grad_norm": 0.8079880475997925, "learning_rate": 0.0007015875340378295, "loss": 1.833, "step": 218500 }, { "epoch": 0.9045512558393121, "grad_norm": 0.7871448397636414, "learning_rate": 0.0007008967612426726, "loss": 1.8338, "step": 219000 }, { "epoch": 0.9066164413549269, "grad_norm": 0.8511725664138794, "learning_rate": 0.0007002059884475159, "loss": 1.8325, "step": 219500 }, { "epoch": 0.9086816268705418, "grad_norm": 0.9022111296653748, "learning_rate": 0.000699515215652359, "loss": 1.8292, "step": 220000 }, { "epoch": 0.9107468123861566, "grad_norm": 0.8371003270149231, "learning_rate": 0.000698824442857202, "loss": 1.8287, "step": 220500 }, { "epoch": 0.9128119979017715, "grad_norm": 0.824407160282135, "learning_rate": 0.0006981336700620452, "loss": 1.8315, "step": 221000 }, { "epoch": 0.9148771834173863, "grad_norm": 0.847411572933197, "learning_rate": 0.0006974428972668883, "loss": 1.8333, "step": 221500 }, { "epoch": 0.9169423689330013, "grad_norm": 0.8592170476913452, "learning_rate": 0.0006967521244717316, "loss": 1.8234, "step": 222000 }, { "epoch": 0.9190075544486161, "grad_norm": 0.7863643169403076, "learning_rate": 0.0006960613516765747, "loss": 1.8364, "step": 222500 }, { "epoch": 0.921072739964231, "grad_norm": 0.7801703214645386, "learning_rate": 0.0006953705788814179, "loss": 1.8243, "step": 223000 }, { "epoch": 0.9231379254798459, "grad_norm": 0.8160432577133179, "learning_rate": 0.0006946798060862609, "loss": 1.8236, "step": 223500 }, { "epoch": 0.9252031109954607, "grad_norm": 0.825862467288971, "learning_rate": 0.000693989033291104, "loss": 1.827, "step": 224000 }, { "epoch": 0.9272682965110756, "grad_norm": 0.8575713634490967, "learning_rate": 0.0006932982604959473, "loss": 1.8225, "step": 224500 }, { "epoch": 0.9293334820266904, "grad_norm": 0.7798600792884827, "learning_rate": 0.0006926074877007904, "loss": 1.8276, "step": 225000 }, { "epoch": 0.9313986675423054, "grad_norm": 0.8042396903038025, "learning_rate": 0.0006919167149056336, "loss": 1.8274, "step": 225500 }, { "epoch": 0.9334638530579202, "grad_norm": 0.8900014758110046, "learning_rate": 0.0006912259421104767, "loss": 1.8242, "step": 226000 }, { "epoch": 0.9355290385735351, "grad_norm": 0.8842340111732483, "learning_rate": 0.0006905351693153198, "loss": 1.8222, "step": 226500 }, { "epoch": 0.9375942240891499, "grad_norm": 0.8076005578041077, "learning_rate": 0.000689844396520163, "loss": 1.824, "step": 227000 }, { "epoch": 0.9396594096047648, "grad_norm": 0.8478308916091919, "learning_rate": 0.0006891536237250061, "loss": 1.8213, "step": 227500 }, { "epoch": 0.9417245951203796, "grad_norm": 0.8478752374649048, "learning_rate": 0.0006884628509298493, "loss": 1.8271, "step": 228000 }, { "epoch": 0.9437897806359945, "grad_norm": 0.8306804299354553, "learning_rate": 0.0006877720781346924, "loss": 1.8257, "step": 228500 }, { "epoch": 0.9458549661516094, "grad_norm": 0.8503381013870239, "learning_rate": 0.0006870813053395356, "loss": 1.8196, "step": 229000 }, { "epoch": 0.9479201516672243, "grad_norm": 0.7972338795661926, "learning_rate": 0.0006863905325443787, "loss": 1.8219, "step": 229500 }, { "epoch": 0.9499853371828392, "grad_norm": 0.8305501341819763, "learning_rate": 0.0006856997597492219, "loss": 1.8204, "step": 230000 }, { "epoch": 0.952050522698454, "grad_norm": 0.8877650499343872, "learning_rate": 0.000685008986954065, "loss": 1.8227, "step": 230500 }, { "epoch": 0.9541157082140689, "grad_norm": 0.8762148022651672, "learning_rate": 0.0006843182141589081, "loss": 1.8224, "step": 231000 }, { "epoch": 0.9561808937296837, "grad_norm": 0.7984791398048401, "learning_rate": 0.0006836274413637513, "loss": 1.8188, "step": 231500 }, { "epoch": 0.9582460792452986, "grad_norm": 0.8119187355041504, "learning_rate": 0.0006829366685685945, "loss": 1.8212, "step": 232000 }, { "epoch": 0.9603112647609134, "grad_norm": 0.8037796020507812, "learning_rate": 0.0006822458957734376, "loss": 1.8187, "step": 232500 }, { "epoch": 0.9623764502765283, "grad_norm": 0.8950905799865723, "learning_rate": 0.0006815551229782807, "loss": 1.816, "step": 233000 }, { "epoch": 0.9644416357921433, "grad_norm": 0.8347873091697693, "learning_rate": 0.0006808643501831239, "loss": 1.8218, "step": 233500 }, { "epoch": 0.9665068213077581, "grad_norm": 0.8473377823829651, "learning_rate": 0.000680173577387967, "loss": 1.8195, "step": 234000 }, { "epoch": 0.968572006823373, "grad_norm": 0.7937746644020081, "learning_rate": 0.0006794828045928102, "loss": 1.8171, "step": 234500 }, { "epoch": 0.9706371923389878, "grad_norm": 0.7679367065429688, "learning_rate": 0.0006787920317976534, "loss": 1.8146, "step": 235000 }, { "epoch": 0.9727023778546027, "grad_norm": 0.8515623807907104, "learning_rate": 0.0006781012590024964, "loss": 1.8173, "step": 235500 }, { "epoch": 0.9747675633702175, "grad_norm": 0.8188038468360901, "learning_rate": 0.0006774104862073396, "loss": 1.817, "step": 236000 }, { "epoch": 0.9768327488858324, "grad_norm": 0.7979288697242737, "learning_rate": 0.0006767197134121827, "loss": 1.819, "step": 236500 }, { "epoch": 0.9788979344014472, "grad_norm": 0.8797492980957031, "learning_rate": 0.000676028940617026, "loss": 1.8148, "step": 237000 }, { "epoch": 0.9809631199170622, "grad_norm": 0.8223576545715332, "learning_rate": 0.0006753381678218691, "loss": 1.815, "step": 237500 }, { "epoch": 0.983028305432677, "grad_norm": 0.8249248266220093, "learning_rate": 0.0006746473950267122, "loss": 1.8174, "step": 238000 }, { "epoch": 0.9850934909482919, "grad_norm": 0.8677356243133545, "learning_rate": 0.0006739566222315553, "loss": 1.8209, "step": 238500 }, { "epoch": 0.9871586764639068, "grad_norm": 0.829744815826416, "learning_rate": 0.0006732658494363984, "loss": 1.8132, "step": 239000 }, { "epoch": 0.9892238619795216, "grad_norm": 0.8238321542739868, "learning_rate": 0.0006725750766412417, "loss": 1.8104, "step": 239500 }, { "epoch": 0.9912890474951365, "grad_norm": 0.8242679834365845, "learning_rate": 0.0006718843038460848, "loss": 1.8129, "step": 240000 }, { "epoch": 0.9933542330107513, "grad_norm": 0.7887668013572693, "learning_rate": 0.000671193531050928, "loss": 1.8156, "step": 240500 }, { "epoch": 0.9954194185263662, "grad_norm": 0.7950047850608826, "learning_rate": 0.0006705027582557711, "loss": 1.8158, "step": 241000 }, { "epoch": 0.9974846040419811, "grad_norm": 0.7892596125602722, "learning_rate": 0.0006698119854606141, "loss": 1.8122, "step": 241500 }, { "epoch": 0.999549789557596, "grad_norm": 0.8291964530944824, "learning_rate": 0.0006691212126654574, "loss": 1.8176, "step": 242000 }, { "epoch": 1.0016149750732108, "grad_norm": 0.7552099227905273, "learning_rate": 0.0006684304398703005, "loss": 1.8117, "step": 242500 }, { "epoch": 1.0036801605888257, "grad_norm": 0.9016017317771912, "learning_rate": 0.0006677396670751437, "loss": 1.8141, "step": 243000 }, { "epoch": 1.0057453461044406, "grad_norm": 0.7623195648193359, "learning_rate": 0.0006670488942799868, "loss": 1.8104, "step": 243500 }, { "epoch": 1.0078105316200554, "grad_norm": 0.8782749772071838, "learning_rate": 0.00066635812148483, "loss": 1.813, "step": 244000 }, { "epoch": 1.0098757171356703, "grad_norm": 0.800456702709198, "learning_rate": 0.000665667348689673, "loss": 1.8105, "step": 244500 }, { "epoch": 1.0119409026512851, "grad_norm": 0.855076014995575, "learning_rate": 0.0006649765758945162, "loss": 1.8081, "step": 245000 }, { "epoch": 1.0140060881669, "grad_norm": 0.8036173582077026, "learning_rate": 0.0006642858030993594, "loss": 1.8112, "step": 245500 }, { "epoch": 1.0160712736825148, "grad_norm": 0.8001554012298584, "learning_rate": 0.0006635950303042025, "loss": 1.8079, "step": 246000 }, { "epoch": 1.0181364591981297, "grad_norm": 0.8144285082817078, "learning_rate": 0.0006629042575090457, "loss": 1.8076, "step": 246500 }, { "epoch": 1.0202016447137445, "grad_norm": 0.8857467174530029, "learning_rate": 0.0006622134847138889, "loss": 1.8108, "step": 247000 }, { "epoch": 1.0222668302293596, "grad_norm": 0.7909874320030212, "learning_rate": 0.000661522711918732, "loss": 1.8068, "step": 247500 }, { "epoch": 1.0243320157449745, "grad_norm": 0.8089008331298828, "learning_rate": 0.0006608319391235751, "loss": 1.8093, "step": 248000 }, { "epoch": 1.0263972012605893, "grad_norm": 0.8550245761871338, "learning_rate": 0.0006601411663284182, "loss": 1.8087, "step": 248500 }, { "epoch": 1.0284623867762042, "grad_norm": 0.8594583868980408, "learning_rate": 0.0006594503935332614, "loss": 1.8062, "step": 249000 }, { "epoch": 1.030527572291819, "grad_norm": 0.8355042338371277, "learning_rate": 0.0006587596207381046, "loss": 1.809, "step": 249500 }, { "epoch": 1.0325927578074339, "grad_norm": 0.8276521563529968, "learning_rate": 0.0006580688479429478, "loss": 1.8138, "step": 250000 }, { "epoch": 1.0346579433230487, "grad_norm": 0.8123018741607666, "learning_rate": 0.0006573780751477908, "loss": 1.8017, "step": 250500 }, { "epoch": 1.0367231288386636, "grad_norm": 0.7968121767044067, "learning_rate": 0.000656687302352634, "loss": 1.8076, "step": 251000 }, { "epoch": 1.0387883143542784, "grad_norm": 0.941233217716217, "learning_rate": 0.0006559965295574771, "loss": 1.8059, "step": 251500 }, { "epoch": 1.0408534998698933, "grad_norm": 0.8153935074806213, "learning_rate": 0.0006553057567623202, "loss": 1.8065, "step": 252000 }, { "epoch": 1.0429186853855081, "grad_norm": 0.7739303112030029, "learning_rate": 0.0006546149839671635, "loss": 1.8043, "step": 252500 }, { "epoch": 1.044983870901123, "grad_norm": 0.8117313385009766, "learning_rate": 0.0006539242111720066, "loss": 1.8004, "step": 253000 }, { "epoch": 1.0470490564167378, "grad_norm": 0.8029870390892029, "learning_rate": 0.0006532334383768497, "loss": 1.8046, "step": 253500 }, { "epoch": 1.0491142419323527, "grad_norm": 0.8172849416732788, "learning_rate": 0.0006525426655816928, "loss": 1.8059, "step": 254000 }, { "epoch": 1.0511794274479676, "grad_norm": 0.7874976992607117, "learning_rate": 0.0006518518927865361, "loss": 1.8054, "step": 254500 }, { "epoch": 1.0532446129635824, "grad_norm": 0.7877236604690552, "learning_rate": 0.0006511611199913792, "loss": 1.8026, "step": 255000 }, { "epoch": 1.0553097984791975, "grad_norm": 0.8584260940551758, "learning_rate": 0.0006504703471962223, "loss": 1.8007, "step": 255500 }, { "epoch": 1.0573749839948123, "grad_norm": 0.8094419240951538, "learning_rate": 0.0006497795744010655, "loss": 1.7978, "step": 256000 }, { "epoch": 1.0594401695104272, "grad_norm": 0.7890325784683228, "learning_rate": 0.0006490888016059086, "loss": 1.8, "step": 256500 }, { "epoch": 1.061505355026042, "grad_norm": 0.8764976263046265, "learning_rate": 0.0006483980288107518, "loss": 1.8046, "step": 257000 }, { "epoch": 1.063570540541657, "grad_norm": 0.8675107359886169, "learning_rate": 0.0006477072560155949, "loss": 1.8036, "step": 257500 }, { "epoch": 1.0656357260572717, "grad_norm": 0.7634553909301758, "learning_rate": 0.0006470164832204381, "loss": 1.8018, "step": 258000 }, { "epoch": 1.0677009115728866, "grad_norm": 0.7381558418273926, "learning_rate": 0.0006463257104252812, "loss": 1.7998, "step": 258500 }, { "epoch": 1.0697660970885015, "grad_norm": 0.9076355695724487, "learning_rate": 0.0006456349376301243, "loss": 1.7948, "step": 259000 }, { "epoch": 1.0718312826041163, "grad_norm": 0.9148507118225098, "learning_rate": 0.0006449441648349676, "loss": 1.8018, "step": 259500 }, { "epoch": 1.0738964681197312, "grad_norm": 0.7980071902275085, "learning_rate": 0.0006442533920398106, "loss": 1.8023, "step": 260000 }, { "epoch": 1.075961653635346, "grad_norm": 0.8765013217926025, "learning_rate": 0.0006435626192446538, "loss": 1.8003, "step": 260500 }, { "epoch": 1.0780268391509609, "grad_norm": 0.8422530293464661, "learning_rate": 0.0006428718464494969, "loss": 1.802, "step": 261000 }, { "epoch": 1.0800920246665757, "grad_norm": 0.8139374852180481, "learning_rate": 0.0006421810736543401, "loss": 1.8038, "step": 261500 }, { "epoch": 1.0821572101821906, "grad_norm": 0.7891634702682495, "learning_rate": 0.0006414903008591833, "loss": 1.8047, "step": 262000 }, { "epoch": 1.0842223956978054, "grad_norm": 0.822912335395813, "learning_rate": 0.0006407995280640264, "loss": 1.7979, "step": 262500 }, { "epoch": 1.0862875812134205, "grad_norm": 0.8557060360908508, "learning_rate": 0.0006401087552688695, "loss": 1.7974, "step": 263000 }, { "epoch": 1.0883527667290354, "grad_norm": 0.8948346972465515, "learning_rate": 0.0006394179824737126, "loss": 1.8004, "step": 263500 }, { "epoch": 1.0904179522446502, "grad_norm": 0.795589029788971, "learning_rate": 0.0006387272096785558, "loss": 1.7988, "step": 264000 }, { "epoch": 1.092483137760265, "grad_norm": 0.7854675054550171, "learning_rate": 0.000638036436883399, "loss": 1.7999, "step": 264500 }, { "epoch": 1.09454832327588, "grad_norm": 0.8320429921150208, "learning_rate": 0.0006373456640882422, "loss": 1.7988, "step": 265000 }, { "epoch": 1.0966135087914948, "grad_norm": 0.7923471331596375, "learning_rate": 0.0006366548912930853, "loss": 1.7947, "step": 265500 }, { "epoch": 1.0986786943071096, "grad_norm": 0.7947016358375549, "learning_rate": 0.0006359641184979283, "loss": 1.7949, "step": 266000 }, { "epoch": 1.1007438798227245, "grad_norm": 0.8632909655570984, "learning_rate": 0.0006352733457027715, "loss": 1.7917, "step": 266500 }, { "epoch": 1.1028090653383393, "grad_norm": 0.8080165982246399, "learning_rate": 0.0006345825729076146, "loss": 1.7925, "step": 267000 }, { "epoch": 1.1048742508539542, "grad_norm": 0.8370658159255981, "learning_rate": 0.0006338918001124579, "loss": 1.7926, "step": 267500 }, { "epoch": 1.106939436369569, "grad_norm": 0.7986084818840027, "learning_rate": 0.000633201027317301, "loss": 1.7918, "step": 268000 }, { "epoch": 1.109004621885184, "grad_norm": 0.8623395562171936, "learning_rate": 0.0006325102545221442, "loss": 1.7916, "step": 268500 }, { "epoch": 1.1110698074007987, "grad_norm": 0.8417394757270813, "learning_rate": 0.0006318194817269872, "loss": 1.7924, "step": 269000 }, { "epoch": 1.1131349929164136, "grad_norm": 0.8161811232566833, "learning_rate": 0.0006311287089318303, "loss": 1.7914, "step": 269500 }, { "epoch": 1.1152001784320285, "grad_norm": 0.8812907338142395, "learning_rate": 0.0006304379361366736, "loss": 1.7906, "step": 270000 }, { "epoch": 1.1172653639476433, "grad_norm": 0.7669122219085693, "learning_rate": 0.0006297471633415167, "loss": 1.7917, "step": 270500 }, { "epoch": 1.1193305494632584, "grad_norm": 0.792958676815033, "learning_rate": 0.0006290563905463599, "loss": 1.7916, "step": 271000 }, { "epoch": 1.1213957349788732, "grad_norm": 0.8431819677352905, "learning_rate": 0.000628365617751203, "loss": 1.7913, "step": 271500 }, { "epoch": 1.123460920494488, "grad_norm": 0.8096106648445129, "learning_rate": 0.0006276748449560462, "loss": 1.7937, "step": 272000 }, { "epoch": 1.125526106010103, "grad_norm": 0.8288501501083374, "learning_rate": 0.0006269840721608893, "loss": 1.7928, "step": 272500 }, { "epoch": 1.1275912915257178, "grad_norm": 0.8212178349494934, "learning_rate": 0.0006262932993657324, "loss": 1.7892, "step": 273000 }, { "epoch": 1.1296564770413327, "grad_norm": 0.7889783978462219, "learning_rate": 0.0006256025265705756, "loss": 1.7865, "step": 273500 }, { "epoch": 1.1317216625569475, "grad_norm": 0.8126891255378723, "learning_rate": 0.0006249117537754187, "loss": 1.7884, "step": 274000 }, { "epoch": 1.1337868480725624, "grad_norm": 0.8215599060058594, "learning_rate": 0.000624220980980262, "loss": 1.788, "step": 274500 }, { "epoch": 1.1358520335881772, "grad_norm": 0.7967174053192139, "learning_rate": 0.000623530208185105, "loss": 1.7886, "step": 275000 }, { "epoch": 1.137917219103792, "grad_norm": 0.8354322910308838, "learning_rate": 0.0006228394353899482, "loss": 1.7841, "step": 275500 }, { "epoch": 1.139982404619407, "grad_norm": 0.7985238432884216, "learning_rate": 0.0006221486625947913, "loss": 1.7886, "step": 276000 }, { "epoch": 1.1420475901350218, "grad_norm": 0.8069713115692139, "learning_rate": 0.0006214578897996344, "loss": 1.7893, "step": 276500 }, { "epoch": 1.1441127756506366, "grad_norm": 0.8244253396987915, "learning_rate": 0.0006207671170044777, "loss": 1.7846, "step": 277000 }, { "epoch": 1.1461779611662515, "grad_norm": 0.7911844253540039, "learning_rate": 0.0006200763442093208, "loss": 1.782, "step": 277500 }, { "epoch": 1.1482431466818666, "grad_norm": 0.8204144239425659, "learning_rate": 0.0006193855714141639, "loss": 1.7877, "step": 278000 }, { "epoch": 1.1503083321974814, "grad_norm": 0.7840794920921326, "learning_rate": 0.000618694798619007, "loss": 1.7855, "step": 278500 }, { "epoch": 1.1523735177130963, "grad_norm": 0.7567317485809326, "learning_rate": 0.0006180040258238502, "loss": 1.7815, "step": 279000 }, { "epoch": 1.1544387032287111, "grad_norm": 0.8889859914779663, "learning_rate": 0.0006173132530286934, "loss": 1.7844, "step": 279500 }, { "epoch": 1.156503888744326, "grad_norm": 0.7965997457504272, "learning_rate": 0.0006166224802335365, "loss": 1.7823, "step": 280000 }, { "epoch": 1.1585690742599408, "grad_norm": 0.7915734052658081, "learning_rate": 0.0006159317074383797, "loss": 1.7829, "step": 280500 }, { "epoch": 1.1606342597755557, "grad_norm": 0.8453460335731506, "learning_rate": 0.0006152409346432227, "loss": 1.7854, "step": 281000 }, { "epoch": 1.1626994452911705, "grad_norm": 0.8347840905189514, "learning_rate": 0.0006145501618480659, "loss": 1.7777, "step": 281500 }, { "epoch": 1.1647646308067854, "grad_norm": 0.806870698928833, "learning_rate": 0.000613859389052909, "loss": 1.7814, "step": 282000 }, { "epoch": 1.1668298163224002, "grad_norm": 0.7722708582878113, "learning_rate": 0.0006131686162577523, "loss": 1.7816, "step": 282500 }, { "epoch": 1.168895001838015, "grad_norm": 0.7900815606117249, "learning_rate": 0.0006124778434625954, "loss": 1.78, "step": 283000 }, { "epoch": 1.17096018735363, "grad_norm": 0.8070388436317444, "learning_rate": 0.0006117870706674385, "loss": 1.7794, "step": 283500 }, { "epoch": 1.1730253728692448, "grad_norm": 0.8343568444252014, "learning_rate": 0.0006110962978722816, "loss": 1.7817, "step": 284000 }, { "epoch": 1.1750905583848597, "grad_norm": 0.7810460329055786, "learning_rate": 0.0006104055250771247, "loss": 1.7822, "step": 284500 }, { "epoch": 1.1771557439004745, "grad_norm": 0.8281691670417786, "learning_rate": 0.000609714752281968, "loss": 1.7773, "step": 285000 }, { "epoch": 1.1792209294160894, "grad_norm": 0.7959678769111633, "learning_rate": 0.0006090239794868111, "loss": 1.7755, "step": 285500 }, { "epoch": 1.1812861149317042, "grad_norm": 0.7893877625465393, "learning_rate": 0.0006083332066916543, "loss": 1.7788, "step": 286000 }, { "epoch": 1.183351300447319, "grad_norm": 0.794282078742981, "learning_rate": 0.0006076424338964974, "loss": 1.7785, "step": 286500 }, { "epoch": 1.1854164859629341, "grad_norm": 0.833561360836029, "learning_rate": 0.0006069516611013404, "loss": 1.778, "step": 287000 }, { "epoch": 1.187481671478549, "grad_norm": 0.7725043296813965, "learning_rate": 0.0006062608883061837, "loss": 1.7763, "step": 287500 }, { "epoch": 1.1895468569941638, "grad_norm": 0.8378251194953918, "learning_rate": 0.0006055701155110268, "loss": 1.7785, "step": 288000 }, { "epoch": 1.1916120425097787, "grad_norm": 0.8435170650482178, "learning_rate": 0.00060487934271587, "loss": 1.7808, "step": 288500 }, { "epoch": 1.1936772280253936, "grad_norm": 0.7910299301147461, "learning_rate": 0.0006041885699207131, "loss": 1.7791, "step": 289000 }, { "epoch": 1.1957424135410084, "grad_norm": 0.7965072989463806, "learning_rate": 0.0006034977971255562, "loss": 1.7762, "step": 289500 }, { "epoch": 1.1978075990566233, "grad_norm": 0.7592757344245911, "learning_rate": 0.0006028070243303994, "loss": 1.7724, "step": 290000 }, { "epoch": 1.1998727845722381, "grad_norm": 0.7980614304542542, "learning_rate": 0.0006021162515352425, "loss": 1.7773, "step": 290500 }, { "epoch": 1.201937970087853, "grad_norm": 0.8618481755256653, "learning_rate": 0.0006014254787400857, "loss": 1.7773, "step": 291000 }, { "epoch": 1.2040031556034678, "grad_norm": 0.7855138778686523, "learning_rate": 0.0006007347059449288, "loss": 1.7775, "step": 291500 }, { "epoch": 1.2060683411190827, "grad_norm": 0.9088487029075623, "learning_rate": 0.000600043933149772, "loss": 1.7732, "step": 292000 }, { "epoch": 1.2081335266346975, "grad_norm": 0.7684744000434875, "learning_rate": 0.0005993531603546152, "loss": 1.7712, "step": 292500 }, { "epoch": 1.2101987121503124, "grad_norm": 0.8111701607704163, "learning_rate": 0.0005986623875594582, "loss": 1.7728, "step": 293000 }, { "epoch": 1.2122638976659272, "grad_norm": 0.7883111834526062, "learning_rate": 0.0005979716147643014, "loss": 1.7744, "step": 293500 }, { "epoch": 1.2143290831815423, "grad_norm": 0.888268768787384, "learning_rate": 0.0005972808419691445, "loss": 1.7758, "step": 294000 }, { "epoch": 1.2163942686971572, "grad_norm": 1.260141372680664, "learning_rate": 0.0005965900691739878, "loss": 1.7733, "step": 294500 }, { "epoch": 1.218459454212772, "grad_norm": 0.7965800166130066, "learning_rate": 0.0005958992963788309, "loss": 1.7681, "step": 295000 }, { "epoch": 1.2205246397283869, "grad_norm": 0.8069186806678772, "learning_rate": 0.0005952085235836741, "loss": 1.7709, "step": 295500 }, { "epoch": 1.2225898252440017, "grad_norm": 0.7815278172492981, "learning_rate": 0.0005945177507885171, "loss": 1.7746, "step": 296000 }, { "epoch": 1.2246550107596166, "grad_norm": 0.8087014555931091, "learning_rate": 0.0005938269779933602, "loss": 1.7769, "step": 296500 }, { "epoch": 1.2267201962752314, "grad_norm": 0.8358011245727539, "learning_rate": 0.0005931362051982034, "loss": 1.7748, "step": 297000 }, { "epoch": 1.2287853817908463, "grad_norm": 0.7773184776306152, "learning_rate": 0.0005924454324030466, "loss": 1.7687, "step": 297500 }, { "epoch": 1.2308505673064611, "grad_norm": 0.8666139245033264, "learning_rate": 0.0005917546596078898, "loss": 1.771, "step": 298000 }, { "epoch": 1.232915752822076, "grad_norm": 0.7656075954437256, "learning_rate": 0.0005910638868127329, "loss": 1.7668, "step": 298500 }, { "epoch": 1.2349809383376908, "grad_norm": 0.78355473279953, "learning_rate": 0.000590373114017576, "loss": 1.7773, "step": 299000 }, { "epoch": 1.2370461238533057, "grad_norm": 0.7689515948295593, "learning_rate": 0.0005896823412224191, "loss": 1.7722, "step": 299500 }, { "epoch": 1.2391113093689206, "grad_norm": 0.7110136151313782, "learning_rate": 0.0005889915684272623, "loss": 1.7741, "step": 300000 }, { "epoch": 1.2411764948845354, "grad_norm": 0.7835440635681152, "learning_rate": 0.0005883007956321055, "loss": 1.7699, "step": 300500 }, { "epoch": 1.2432416804001503, "grad_norm": 0.8212382793426514, "learning_rate": 0.0005876100228369486, "loss": 1.7723, "step": 301000 }, { "epoch": 1.2453068659157651, "grad_norm": 1.0025386810302734, "learning_rate": 0.0005869192500417918, "loss": 1.774, "step": 301500 }, { "epoch": 1.24737205143138, "grad_norm": 0.8511669635772705, "learning_rate": 0.0005862284772466348, "loss": 1.769, "step": 302000 }, { "epoch": 1.249437236946995, "grad_norm": 0.8234755992889404, "learning_rate": 0.0005855377044514781, "loss": 1.7718, "step": 302500 }, { "epoch": 1.25150242246261, "grad_norm": 0.7945066094398499, "learning_rate": 0.0005848469316563212, "loss": 1.7678, "step": 303000 }, { "epoch": 1.2535676079782248, "grad_norm": 0.7915132641792297, "learning_rate": 0.0005841561588611643, "loss": 1.7661, "step": 303500 }, { "epoch": 1.2556327934938396, "grad_norm": 0.7837565541267395, "learning_rate": 0.0005834653860660075, "loss": 1.7655, "step": 304000 }, { "epoch": 1.2576979790094545, "grad_norm": 0.8219539523124695, "learning_rate": 0.0005827746132708506, "loss": 1.768, "step": 304500 }, { "epoch": 1.2597631645250693, "grad_norm": 0.7561802268028259, "learning_rate": 0.0005820838404756938, "loss": 1.7652, "step": 305000 }, { "epoch": 1.2618283500406842, "grad_norm": 0.7804844975471497, "learning_rate": 0.0005813930676805369, "loss": 1.767, "step": 305500 }, { "epoch": 1.263893535556299, "grad_norm": 0.7776834964752197, "learning_rate": 0.0005807022948853801, "loss": 1.7619, "step": 306000 }, { "epoch": 1.2659587210719139, "grad_norm": 0.7807164192199707, "learning_rate": 0.0005800115220902232, "loss": 1.7677, "step": 306500 }, { "epoch": 1.2680239065875287, "grad_norm": 0.7830272316932678, "learning_rate": 0.0005793207492950663, "loss": 1.7638, "step": 307000 }, { "epoch": 1.2700890921031436, "grad_norm": 0.7787231206893921, "learning_rate": 0.0005786299764999096, "loss": 1.7605, "step": 307500 }, { "epoch": 1.2721542776187584, "grad_norm": 0.7798328399658203, "learning_rate": 0.0005779392037047526, "loss": 1.7617, "step": 308000 }, { "epoch": 1.2742194631343733, "grad_norm": 0.8115136027336121, "learning_rate": 0.0005772484309095958, "loss": 1.7629, "step": 308500 }, { "epoch": 1.2762846486499884, "grad_norm": 0.7370808720588684, "learning_rate": 0.0005765576581144389, "loss": 1.7584, "step": 309000 }, { "epoch": 1.2783498341656032, "grad_norm": 0.7543977499008179, "learning_rate": 0.0005758668853192822, "loss": 1.7643, "step": 309500 }, { "epoch": 1.280415019681218, "grad_norm": 0.8601275086402893, "learning_rate": 0.0005751761125241253, "loss": 1.7606, "step": 310000 }, { "epoch": 1.282480205196833, "grad_norm": 0.7618570327758789, "learning_rate": 0.0005744853397289684, "loss": 1.7604, "step": 310500 }, { "epoch": 1.2845453907124478, "grad_norm": 0.7939981818199158, "learning_rate": 0.0005737945669338115, "loss": 1.7632, "step": 311000 }, { "epoch": 1.2866105762280626, "grad_norm": 0.7793112397193909, "learning_rate": 0.0005731037941386546, "loss": 1.7631, "step": 311500 }, { "epoch": 1.2886757617436775, "grad_norm": 0.7892638444900513, "learning_rate": 0.0005724130213434978, "loss": 1.7576, "step": 312000 }, { "epoch": 1.2907409472592923, "grad_norm": 0.8487522602081299, "learning_rate": 0.000571722248548341, "loss": 1.761, "step": 312500 }, { "epoch": 1.2928061327749072, "grad_norm": 0.7883718609809875, "learning_rate": 0.0005710314757531842, "loss": 1.7571, "step": 313000 }, { "epoch": 1.294871318290522, "grad_norm": 0.7739648818969727, "learning_rate": 0.0005703407029580273, "loss": 1.7595, "step": 313500 }, { "epoch": 1.296936503806137, "grad_norm": 0.7544906139373779, "learning_rate": 0.0005696499301628703, "loss": 1.7563, "step": 314000 }, { "epoch": 1.2990016893217518, "grad_norm": 0.7876557111740112, "learning_rate": 0.0005689591573677135, "loss": 1.7623, "step": 314500 }, { "epoch": 1.3010668748373666, "grad_norm": 0.8152881860733032, "learning_rate": 0.0005682683845725567, "loss": 1.7561, "step": 315000 }, { "epoch": 1.3031320603529815, "grad_norm": 0.8232436180114746, "learning_rate": 0.0005675776117773999, "loss": 1.7547, "step": 315500 }, { "epoch": 1.3051972458685963, "grad_norm": 0.8717594742774963, "learning_rate": 0.000566886838982243, "loss": 1.7589, "step": 316000 }, { "epoch": 1.3072624313842112, "grad_norm": 0.8005387187004089, "learning_rate": 0.0005661960661870862, "loss": 1.7531, "step": 316500 }, { "epoch": 1.309327616899826, "grad_norm": 0.7955303192138672, "learning_rate": 0.0005655052933919294, "loss": 1.7581, "step": 317000 }, { "epoch": 1.3113928024154409, "grad_norm": 0.785753071308136, "learning_rate": 0.0005648145205967724, "loss": 1.7584, "step": 317500 }, { "epoch": 1.3134579879310557, "grad_norm": 0.775891125202179, "learning_rate": 0.0005641237478016156, "loss": 1.755, "step": 318000 }, { "epoch": 1.3155231734466706, "grad_norm": 0.8091910481452942, "learning_rate": 0.0005634329750064587, "loss": 1.7562, "step": 318500 }, { "epoch": 1.3175883589622857, "grad_norm": 0.7912936806678772, "learning_rate": 0.0005627422022113019, "loss": 1.7593, "step": 319000 }, { "epoch": 1.3196535444779005, "grad_norm": 0.7949129343032837, "learning_rate": 0.000562051429416145, "loss": 1.7555, "step": 319500 }, { "epoch": 1.3217187299935154, "grad_norm": 0.784271240234375, "learning_rate": 0.0005613606566209883, "loss": 1.7578, "step": 320000 }, { "epoch": 1.3237839155091302, "grad_norm": 0.8425039052963257, "learning_rate": 0.0005606698838258313, "loss": 1.7578, "step": 320500 }, { "epoch": 1.325849101024745, "grad_norm": 0.9427282214164734, "learning_rate": 0.0005599791110306744, "loss": 1.7553, "step": 321000 }, { "epoch": 1.32791428654036, "grad_norm": 2.0400569438934326, "learning_rate": 0.0005592883382355176, "loss": 1.7582, "step": 321500 }, { "epoch": 1.3299794720559748, "grad_norm": 0.8407232761383057, "learning_rate": 0.0005585975654403607, "loss": 1.7528, "step": 322000 }, { "epoch": 1.3320446575715896, "grad_norm": 0.8112275004386902, "learning_rate": 0.000557906792645204, "loss": 1.7532, "step": 322500 }, { "epoch": 1.3341098430872045, "grad_norm": 0.7587988376617432, "learning_rate": 0.0005572160198500471, "loss": 1.7578, "step": 323000 }, { "epoch": 1.3361750286028193, "grad_norm": 0.7508676052093506, "learning_rate": 0.0005565252470548902, "loss": 1.7538, "step": 323500 }, { "epoch": 1.3382402141184342, "grad_norm": 0.7507205009460449, "learning_rate": 0.0005558344742597333, "loss": 1.7516, "step": 324000 }, { "epoch": 1.340305399634049, "grad_norm": 0.8090864419937134, "learning_rate": 0.0005551437014645764, "loss": 1.7537, "step": 324500 }, { "epoch": 1.3423705851496641, "grad_norm": 0.7598680853843689, "learning_rate": 0.0005544529286694197, "loss": 1.7528, "step": 325000 }, { "epoch": 1.344435770665279, "grad_norm": 0.795917272567749, "learning_rate": 0.0005537621558742628, "loss": 1.7503, "step": 325500 }, { "epoch": 1.3465009561808938, "grad_norm": 0.8306310772895813, "learning_rate": 0.000553071383079106, "loss": 1.7519, "step": 326000 }, { "epoch": 1.3485661416965087, "grad_norm": 0.7626925110816956, "learning_rate": 0.000552380610283949, "loss": 1.7504, "step": 326500 }, { "epoch": 1.3506313272121235, "grad_norm": 0.8428457975387573, "learning_rate": 0.0005516898374887922, "loss": 1.75, "step": 327000 }, { "epoch": 1.3526965127277384, "grad_norm": 0.7658423185348511, "learning_rate": 0.0005509990646936354, "loss": 1.7544, "step": 327500 }, { "epoch": 1.3547616982433532, "grad_norm": 0.7995271682739258, "learning_rate": 0.0005503082918984785, "loss": 1.7523, "step": 328000 }, { "epoch": 1.356826883758968, "grad_norm": 0.7842050790786743, "learning_rate": 0.0005496175191033217, "loss": 1.75, "step": 328500 }, { "epoch": 1.358892069274583, "grad_norm": 0.7905313968658447, "learning_rate": 0.0005489267463081648, "loss": 1.7459, "step": 329000 }, { "epoch": 1.3609572547901978, "grad_norm": 0.7925072312355042, "learning_rate": 0.0005482359735130079, "loss": 1.7489, "step": 329500 }, { "epoch": 1.3630224403058127, "grad_norm": 0.786297082901001, "learning_rate": 0.0005475452007178511, "loss": 1.7511, "step": 330000 }, { "epoch": 1.3650876258214275, "grad_norm": 0.8276360630989075, "learning_rate": 0.0005468544279226943, "loss": 1.7501, "step": 330500 }, { "epoch": 1.3671528113370424, "grad_norm": 0.7593994736671448, "learning_rate": 0.0005461636551275374, "loss": 1.7495, "step": 331000 }, { "epoch": 1.3692179968526572, "grad_norm": 0.7812342047691345, "learning_rate": 0.0005454728823323805, "loss": 1.7444, "step": 331500 }, { "epoch": 1.371283182368272, "grad_norm": 0.8850775957107544, "learning_rate": 0.0005447821095372237, "loss": 1.7459, "step": 332000 }, { "epoch": 1.373348367883887, "grad_norm": 0.7758823037147522, "learning_rate": 0.0005440913367420668, "loss": 1.7466, "step": 332500 }, { "epoch": 1.3754135533995018, "grad_norm": 0.7858127951622009, "learning_rate": 0.00054340056394691, "loss": 1.7484, "step": 333000 }, { "epoch": 1.3774787389151166, "grad_norm": 0.7900636792182922, "learning_rate": 0.0005427097911517531, "loss": 1.7463, "step": 333500 }, { "epoch": 1.3795439244307315, "grad_norm": 0.8080687522888184, "learning_rate": 0.0005420190183565963, "loss": 1.7465, "step": 334000 }, { "epoch": 1.3816091099463466, "grad_norm": 0.8253558278083801, "learning_rate": 0.0005413282455614394, "loss": 1.7429, "step": 334500 }, { "epoch": 1.3836742954619614, "grad_norm": 0.8424259424209595, "learning_rate": 0.0005406374727662826, "loss": 1.748, "step": 335000 }, { "epoch": 1.3857394809775763, "grad_norm": 0.7918978929519653, "learning_rate": 0.0005399466999711257, "loss": 1.7424, "step": 335500 }, { "epoch": 1.3878046664931911, "grad_norm": 0.7710541486740112, "learning_rate": 0.0005392559271759688, "loss": 1.7429, "step": 336000 }, { "epoch": 1.389869852008806, "grad_norm": 0.8952863216400146, "learning_rate": 0.000538565154380812, "loss": 1.7379, "step": 336500 }, { "epoch": 1.3919350375244208, "grad_norm": 0.7974414229393005, "learning_rate": 0.0005378743815856551, "loss": 1.7442, "step": 337000 }, { "epoch": 1.3940002230400357, "grad_norm": 0.8121086955070496, "learning_rate": 0.0005371836087904984, "loss": 1.7417, "step": 337500 }, { "epoch": 1.3960654085556505, "grad_norm": 0.7721625566482544, "learning_rate": 0.0005364928359953415, "loss": 1.7399, "step": 338000 }, { "epoch": 1.3981305940712654, "grad_norm": 0.7423729300498962, "learning_rate": 0.0005358020632001845, "loss": 1.7451, "step": 338500 }, { "epoch": 1.4001957795868802, "grad_norm": 0.8138153553009033, "learning_rate": 0.0005351112904050277, "loss": 1.7405, "step": 339000 }, { "epoch": 1.402260965102495, "grad_norm": 0.8146694302558899, "learning_rate": 0.0005344205176098708, "loss": 1.7431, "step": 339500 }, { "epoch": 1.40432615061811, "grad_norm": 0.826502799987793, "learning_rate": 0.0005337297448147141, "loss": 1.7389, "step": 340000 }, { "epoch": 1.406391336133725, "grad_norm": 0.7904347777366638, "learning_rate": 0.0005330389720195572, "loss": 1.7387, "step": 340500 }, { "epoch": 1.4084565216493399, "grad_norm": 0.7897937297821045, "learning_rate": 0.0005323481992244004, "loss": 1.7428, "step": 341000 }, { "epoch": 1.4105217071649547, "grad_norm": 0.8036805391311646, "learning_rate": 0.0005316574264292434, "loss": 1.7417, "step": 341500 }, { "epoch": 1.4125868926805696, "grad_norm": 0.7628007531166077, "learning_rate": 0.0005309666536340865, "loss": 1.7404, "step": 342000 }, { "epoch": 1.4146520781961844, "grad_norm": 0.8156040906906128, "learning_rate": 0.0005302758808389298, "loss": 1.7374, "step": 342500 }, { "epoch": 1.4167172637117993, "grad_norm": 0.8283891081809998, "learning_rate": 0.0005295851080437729, "loss": 1.7373, "step": 343000 }, { "epoch": 1.4187824492274141, "grad_norm": 0.8151522278785706, "learning_rate": 0.0005288943352486161, "loss": 1.7356, "step": 343500 }, { "epoch": 1.420847634743029, "grad_norm": 0.8706732988357544, "learning_rate": 0.0005282035624534592, "loss": 1.7403, "step": 344000 }, { "epoch": 1.4229128202586439, "grad_norm": 0.8165752291679382, "learning_rate": 0.0005275127896583023, "loss": 1.7405, "step": 344500 }, { "epoch": 1.4249780057742587, "grad_norm": 0.8452313542366028, "learning_rate": 0.0005268220168631455, "loss": 1.7385, "step": 345000 }, { "epoch": 1.4270431912898736, "grad_norm": 0.8291791081428528, "learning_rate": 0.0005261312440679886, "loss": 1.7381, "step": 345500 }, { "epoch": 1.4291083768054884, "grad_norm": 0.8294808864593506, "learning_rate": 0.0005254404712728318, "loss": 1.7398, "step": 346000 }, { "epoch": 1.4311735623211033, "grad_norm": 0.8119639754295349, "learning_rate": 0.0005247496984776749, "loss": 1.7386, "step": 346500 }, { "epoch": 1.4332387478367181, "grad_norm": 0.7947481870651245, "learning_rate": 0.0005240589256825181, "loss": 1.7361, "step": 347000 }, { "epoch": 1.435303933352333, "grad_norm": 0.8204724192619324, "learning_rate": 0.0005233681528873612, "loss": 1.7405, "step": 347500 }, { "epoch": 1.4373691188679478, "grad_norm": 0.7718450427055359, "learning_rate": 0.0005226773800922044, "loss": 1.7398, "step": 348000 }, { "epoch": 1.4394343043835627, "grad_norm": 0.7919915318489075, "learning_rate": 0.0005219866072970475, "loss": 1.7334, "step": 348500 }, { "epoch": 1.4414994898991775, "grad_norm": 0.8244622945785522, "learning_rate": 0.0005212958345018906, "loss": 1.7363, "step": 349000 }, { "epoch": 1.4435646754147924, "grad_norm": 0.8124867677688599, "learning_rate": 0.0005206050617067338, "loss": 1.7368, "step": 349500 }, { "epoch": 1.4456298609304072, "grad_norm": 0.8139218091964722, "learning_rate": 0.000519914288911577, "loss": 1.7344, "step": 350000 }, { "epoch": 1.4476950464460223, "grad_norm": 0.7997359037399292, "learning_rate": 0.0005192235161164201, "loss": 1.7296, "step": 350500 }, { "epoch": 1.4497602319616372, "grad_norm": 0.8655456304550171, "learning_rate": 0.0005185327433212632, "loss": 1.7353, "step": 351000 }, { "epoch": 1.451825417477252, "grad_norm": 0.8099657893180847, "learning_rate": 0.0005178419705261064, "loss": 1.7356, "step": 351500 }, { "epoch": 1.4538906029928669, "grad_norm": 0.7905128002166748, "learning_rate": 0.0005171511977309495, "loss": 1.7331, "step": 352000 }, { "epoch": 1.4559557885084817, "grad_norm": 0.7679085731506348, "learning_rate": 0.0005164604249357927, "loss": 1.7347, "step": 352500 }, { "epoch": 1.4580209740240966, "grad_norm": 0.7984927892684937, "learning_rate": 0.0005157696521406359, "loss": 1.7331, "step": 353000 }, { "epoch": 1.4600861595397114, "grad_norm": 0.8509982824325562, "learning_rate": 0.0005150788793454789, "loss": 1.7281, "step": 353500 }, { "epoch": 1.4621513450553263, "grad_norm": 0.756581723690033, "learning_rate": 0.0005143881065503221, "loss": 1.7272, "step": 354000 }, { "epoch": 1.4642165305709411, "grad_norm": 0.808980405330658, "learning_rate": 0.0005136973337551652, "loss": 1.7321, "step": 354500 }, { "epoch": 1.466281716086556, "grad_norm": 0.8383910059928894, "learning_rate": 0.0005130065609600085, "loss": 1.7337, "step": 355000 }, { "epoch": 1.4683469016021709, "grad_norm": 0.7818363308906555, "learning_rate": 0.0005123157881648516, "loss": 1.7327, "step": 355500 }, { "epoch": 1.470412087117786, "grad_norm": 0.7779876589775085, "learning_rate": 0.0005116250153696947, "loss": 1.7317, "step": 356000 }, { "epoch": 1.4724772726334008, "grad_norm": 0.7729701399803162, "learning_rate": 0.0005109342425745378, "loss": 1.7382, "step": 356500 }, { "epoch": 1.4745424581490156, "grad_norm": 0.7971392273902893, "learning_rate": 0.0005102434697793809, "loss": 1.7274, "step": 357000 }, { "epoch": 1.4766076436646305, "grad_norm": 0.8112899661064148, "learning_rate": 0.0005095526969842242, "loss": 1.7297, "step": 357500 }, { "epoch": 1.4786728291802453, "grad_norm": 0.7968249917030334, "learning_rate": 0.0005088619241890673, "loss": 1.7293, "step": 358000 }, { "epoch": 1.4807380146958602, "grad_norm": 0.7626878619194031, "learning_rate": 0.0005081711513939105, "loss": 1.7308, "step": 358500 }, { "epoch": 1.482803200211475, "grad_norm": 0.7603055834770203, "learning_rate": 0.0005074803785987536, "loss": 1.7285, "step": 359000 }, { "epoch": 1.48486838572709, "grad_norm": 0.7844238877296448, "learning_rate": 0.0005067896058035966, "loss": 1.7296, "step": 359500 }, { "epoch": 1.4869335712427048, "grad_norm": 0.7728045582771301, "learning_rate": 0.0005060988330084399, "loss": 1.7287, "step": 360000 }, { "epoch": 1.4889987567583196, "grad_norm": 0.7845308184623718, "learning_rate": 0.000505408060213283, "loss": 1.7249, "step": 360500 }, { "epoch": 1.4910639422739345, "grad_norm": 0.8352622985839844, "learning_rate": 0.0005047172874181262, "loss": 1.7259, "step": 361000 }, { "epoch": 1.4931291277895493, "grad_norm": 0.8270286917686462, "learning_rate": 0.0005040265146229693, "loss": 1.7291, "step": 361500 }, { "epoch": 1.4951943133051642, "grad_norm": 0.7802717089653015, "learning_rate": 0.0005033357418278125, "loss": 1.7243, "step": 362000 }, { "epoch": 1.497259498820779, "grad_norm": 0.7886295914649963, "learning_rate": 0.0005026449690326556, "loss": 1.7298, "step": 362500 }, { "epoch": 1.4993246843363939, "grad_norm": 0.8236453533172607, "learning_rate": 0.0005019541962374987, "loss": 1.725, "step": 363000 }, { "epoch": 1.5013898698520087, "grad_norm": 0.7767708897590637, "learning_rate": 0.0005012634234423419, "loss": 1.7302, "step": 363500 }, { "epoch": 1.5034550553676236, "grad_norm": 0.7658302783966064, "learning_rate": 0.000500572650647185, "loss": 1.7227, "step": 364000 }, { "epoch": 1.5055202408832384, "grad_norm": 0.7607765793800354, "learning_rate": 0.0004998818778520282, "loss": 1.7292, "step": 364500 }, { "epoch": 1.5075854263988533, "grad_norm": 0.7231427431106567, "learning_rate": 0.0004991911050568714, "loss": 1.7257, "step": 365000 }, { "epoch": 1.5096506119144681, "grad_norm": 0.965886116027832, "learning_rate": 0.0004985003322617145, "loss": 1.7238, "step": 365500 }, { "epoch": 1.511715797430083, "grad_norm": 0.8113678097724915, "learning_rate": 0.0004978095594665576, "loss": 1.7213, "step": 366000 }, { "epoch": 1.5137809829456979, "grad_norm": 0.8098276853561401, "learning_rate": 0.0004971187866714007, "loss": 1.7289, "step": 366500 }, { "epoch": 1.515846168461313, "grad_norm": 1.9579529762268066, "learning_rate": 0.0004964280138762439, "loss": 1.7273, "step": 367000 }, { "epoch": 1.5179113539769278, "grad_norm": 0.8172611594200134, "learning_rate": 0.0004957372410810871, "loss": 1.728, "step": 367500 }, { "epoch": 1.5199765394925426, "grad_norm": 0.7897489666938782, "learning_rate": 0.0004950464682859302, "loss": 1.722, "step": 368000 }, { "epoch": 1.5220417250081575, "grad_norm": 0.804604172706604, "learning_rate": 0.0004943556954907734, "loss": 1.7228, "step": 368500 }, { "epoch": 1.5241069105237723, "grad_norm": 0.8362699151039124, "learning_rate": 0.0004936649226956165, "loss": 1.7204, "step": 369000 }, { "epoch": 1.5261720960393872, "grad_norm": 0.7928584814071655, "learning_rate": 0.0004929741499004596, "loss": 1.7232, "step": 369500 }, { "epoch": 1.528237281555002, "grad_norm": 0.8171131610870361, "learning_rate": 0.0004922833771053028, "loss": 1.721, "step": 370000 }, { "epoch": 1.530302467070617, "grad_norm": 0.7682649493217468, "learning_rate": 0.000491592604310146, "loss": 1.7181, "step": 370500 }, { "epoch": 1.532367652586232, "grad_norm": 0.7589514255523682, "learning_rate": 0.0004909018315149891, "loss": 1.7207, "step": 371000 }, { "epoch": 1.5344328381018468, "grad_norm": 0.7927723526954651, "learning_rate": 0.0004902110587198322, "loss": 1.7172, "step": 371500 }, { "epoch": 1.5364980236174617, "grad_norm": 0.7993720173835754, "learning_rate": 0.0004895202859246754, "loss": 1.7239, "step": 372000 }, { "epoch": 1.5385632091330765, "grad_norm": 0.75545734167099, "learning_rate": 0.0004888295131295186, "loss": 1.7193, "step": 372500 }, { "epoch": 1.5406283946486914, "grad_norm": 0.8410167694091797, "learning_rate": 0.0004881387403343617, "loss": 1.7212, "step": 373000 }, { "epoch": 1.5426935801643062, "grad_norm": 0.7745389938354492, "learning_rate": 0.00048744796753920485, "loss": 1.7178, "step": 373500 }, { "epoch": 1.544758765679921, "grad_norm": 1.1876429319381714, "learning_rate": 0.00048675719474404797, "loss": 1.7166, "step": 374000 }, { "epoch": 1.546823951195536, "grad_norm": 0.8236234188079834, "learning_rate": 0.0004860664219488911, "loss": 1.7244, "step": 374500 }, { "epoch": 1.5488891367111508, "grad_norm": 0.8301746845245361, "learning_rate": 0.00048537564915373426, "loss": 1.7154, "step": 375000 }, { "epoch": 1.5509543222267657, "grad_norm": 0.8341511487960815, "learning_rate": 0.0004846848763585774, "loss": 1.7201, "step": 375500 }, { "epoch": 1.5530195077423805, "grad_norm": 0.772774338722229, "learning_rate": 0.00048399410356342055, "loss": 1.7194, "step": 376000 }, { "epoch": 1.5550846932579954, "grad_norm": 0.7931101322174072, "learning_rate": 0.0004833033307682637, "loss": 1.7218, "step": 376500 }, { "epoch": 1.5571498787736102, "grad_norm": 0.7973618507385254, "learning_rate": 0.00048261255797310684, "loss": 1.7132, "step": 377000 }, { "epoch": 1.559215064289225, "grad_norm": 0.7944709062576294, "learning_rate": 0.00048192178517795, "loss": 1.7152, "step": 377500 }, { "epoch": 1.56128024980484, "grad_norm": 0.7873803377151489, "learning_rate": 0.00048123101238279313, "loss": 1.7136, "step": 378000 }, { "epoch": 1.5633454353204548, "grad_norm": 0.8176526427268982, "learning_rate": 0.00048054023958763625, "loss": 1.7146, "step": 378500 }, { "epoch": 1.5654106208360696, "grad_norm": 0.80870121717453, "learning_rate": 0.0004798494667924794, "loss": 1.7152, "step": 379000 }, { "epoch": 1.5674758063516845, "grad_norm": 0.8075997829437256, "learning_rate": 0.0004791586939973226, "loss": 1.7156, "step": 379500 }, { "epoch": 1.5695409918672993, "grad_norm": 0.7649165391921997, "learning_rate": 0.0004784679212021657, "loss": 1.7128, "step": 380000 }, { "epoch": 1.5716061773829142, "grad_norm": 0.7735922932624817, "learning_rate": 0.0004777771484070089, "loss": 1.7127, "step": 380500 }, { "epoch": 1.573671362898529, "grad_norm": 0.8073831796646118, "learning_rate": 0.00047708637561185205, "loss": 1.7157, "step": 381000 }, { "epoch": 1.575736548414144, "grad_norm": 0.7841485738754272, "learning_rate": 0.0004763956028166951, "loss": 1.7164, "step": 381500 }, { "epoch": 1.5778017339297588, "grad_norm": 0.7511780261993408, "learning_rate": 0.0004757048300215383, "loss": 1.7146, "step": 382000 }, { "epoch": 1.5798669194453738, "grad_norm": 0.7717761993408203, "learning_rate": 0.00047501405722638146, "loss": 1.7141, "step": 382500 }, { "epoch": 1.5819321049609887, "grad_norm": 0.8251765966415405, "learning_rate": 0.0004743232844312246, "loss": 1.7146, "step": 383000 }, { "epoch": 1.5839972904766035, "grad_norm": 0.8129590749740601, "learning_rate": 0.00047363251163606775, "loss": 1.7151, "step": 383500 }, { "epoch": 1.5860624759922184, "grad_norm": 0.7942067384719849, "learning_rate": 0.0004729417388409109, "loss": 1.7145, "step": 384000 }, { "epoch": 1.5881276615078332, "grad_norm": 0.8091747760772705, "learning_rate": 0.00047225096604575404, "loss": 1.7102, "step": 384500 }, { "epoch": 1.590192847023448, "grad_norm": 0.8157942295074463, "learning_rate": 0.00047156019325059716, "loss": 1.7139, "step": 385000 }, { "epoch": 1.592258032539063, "grad_norm": 0.7791504859924316, "learning_rate": 0.00047086942045544033, "loss": 1.7138, "step": 385500 }, { "epoch": 1.5943232180546778, "grad_norm": 0.8184142708778381, "learning_rate": 0.00047017864766028345, "loss": 1.7119, "step": 386000 }, { "epoch": 1.5963884035702929, "grad_norm": 0.7216043472290039, "learning_rate": 0.0004694878748651266, "loss": 1.7062, "step": 386500 }, { "epoch": 1.5984535890859077, "grad_norm": 0.7634962797164917, "learning_rate": 0.0004687971020699698, "loss": 1.7109, "step": 387000 }, { "epoch": 1.6005187746015226, "grad_norm": 0.7682668566703796, "learning_rate": 0.0004681063292748129, "loss": 1.7175, "step": 387500 }, { "epoch": 1.6025839601171374, "grad_norm": 0.7891648411750793, "learning_rate": 0.0004674155564796561, "loss": 1.7076, "step": 388000 }, { "epoch": 1.6046491456327523, "grad_norm": 0.7735166549682617, "learning_rate": 0.0004667247836844992, "loss": 1.7122, "step": 388500 }, { "epoch": 1.6067143311483671, "grad_norm": 0.7342345714569092, "learning_rate": 0.0004660340108893423, "loss": 1.7093, "step": 389000 }, { "epoch": 1.608779516663982, "grad_norm": 0.7745596170425415, "learning_rate": 0.0004653432380941855, "loss": 1.7072, "step": 389500 }, { "epoch": 1.6108447021795969, "grad_norm": 0.7656903266906738, "learning_rate": 0.00046465246529902866, "loss": 1.7114, "step": 390000 }, { "epoch": 1.6129098876952117, "grad_norm": 0.807043731212616, "learning_rate": 0.0004639616925038718, "loss": 1.7085, "step": 390500 }, { "epoch": 1.6149750732108266, "grad_norm": 0.7980780601501465, "learning_rate": 0.00046327091970871495, "loss": 1.7054, "step": 391000 }, { "epoch": 1.6170402587264414, "grad_norm": 0.7772185802459717, "learning_rate": 0.0004625801469135581, "loss": 1.7077, "step": 391500 }, { "epoch": 1.6191054442420563, "grad_norm": 0.7955535054206848, "learning_rate": 0.0004618893741184012, "loss": 1.7059, "step": 392000 }, { "epoch": 1.6211706297576711, "grad_norm": 0.7842792868614197, "learning_rate": 0.00046119860132324436, "loss": 1.7032, "step": 392500 }, { "epoch": 1.623235815273286, "grad_norm": 0.7722345590591431, "learning_rate": 0.00046050782852808753, "loss": 1.7076, "step": 393000 }, { "epoch": 1.6253010007889008, "grad_norm": 0.7836925983428955, "learning_rate": 0.00045981705573293065, "loss": 1.701, "step": 393500 }, { "epoch": 1.6273661863045157, "grad_norm": 0.8407610058784485, "learning_rate": 0.0004591262829377738, "loss": 1.7054, "step": 394000 }, { "epoch": 1.6294313718201305, "grad_norm": 0.7842757701873779, "learning_rate": 0.000458435510142617, "loss": 1.7085, "step": 394500 }, { "epoch": 1.6314965573357454, "grad_norm": 0.7749829292297363, "learning_rate": 0.0004577447373474601, "loss": 1.7082, "step": 395000 }, { "epoch": 1.6335617428513602, "grad_norm": 0.7778738141059875, "learning_rate": 0.0004570539645523032, "loss": 1.7071, "step": 395500 }, { "epoch": 1.635626928366975, "grad_norm": 0.7654650211334229, "learning_rate": 0.0004563631917571464, "loss": 1.7093, "step": 396000 }, { "epoch": 1.63769211388259, "grad_norm": 0.7864561676979065, "learning_rate": 0.0004556724189619895, "loss": 1.7035, "step": 396500 }, { "epoch": 1.6397572993982048, "grad_norm": 0.7672191262245178, "learning_rate": 0.0004549816461668327, "loss": 1.7052, "step": 397000 }, { "epoch": 1.6418224849138197, "grad_norm": 0.7847920656204224, "learning_rate": 0.00045429087337167586, "loss": 1.7033, "step": 397500 }, { "epoch": 1.6438876704294345, "grad_norm": 0.7824931144714355, "learning_rate": 0.000453600100576519, "loss": 1.7042, "step": 398000 }, { "epoch": 1.6459528559450496, "grad_norm": 0.7992446422576904, "learning_rate": 0.00045290932778136215, "loss": 1.7051, "step": 398500 }, { "epoch": 1.6480180414606644, "grad_norm": 0.7504148483276367, "learning_rate": 0.00045221855498620527, "loss": 1.7036, "step": 399000 }, { "epoch": 1.6500832269762793, "grad_norm": 0.8227455019950867, "learning_rate": 0.0004515277821910484, "loss": 1.6998, "step": 399500 }, { "epoch": 1.6521484124918941, "grad_norm": 0.7897786498069763, "learning_rate": 0.00045083700939589156, "loss": 1.7047, "step": 400000 }, { "epoch": 1.654213598007509, "grad_norm": 0.7825984954833984, "learning_rate": 0.00045014623660073473, "loss": 1.7043, "step": 400500 }, { "epoch": 1.6562787835231239, "grad_norm": 0.8071085214614868, "learning_rate": 0.00044945546380557785, "loss": 1.7035, "step": 401000 }, { "epoch": 1.6583439690387387, "grad_norm": 0.9406007528305054, "learning_rate": 0.000448764691010421, "loss": 1.7011, "step": 401500 }, { "epoch": 1.6604091545543538, "grad_norm": 0.7797788381576538, "learning_rate": 0.0004480739182152642, "loss": 1.7024, "step": 402000 }, { "epoch": 1.6624743400699686, "grad_norm": 0.8257543444633484, "learning_rate": 0.00044738314542010725, "loss": 1.6958, "step": 402500 }, { "epoch": 1.6645395255855835, "grad_norm": 0.8066025972366333, "learning_rate": 0.0004466923726249504, "loss": 1.6998, "step": 403000 }, { "epoch": 1.6666047111011983, "grad_norm": 0.8041613698005676, "learning_rate": 0.0004460015998297936, "loss": 1.7053, "step": 403500 }, { "epoch": 1.6686698966168132, "grad_norm": 0.8146698474884033, "learning_rate": 0.0004453108270346367, "loss": 1.6969, "step": 404000 }, { "epoch": 1.670735082132428, "grad_norm": 0.7349113821983337, "learning_rate": 0.0004446200542394799, "loss": 1.6997, "step": 404500 }, { "epoch": 1.672800267648043, "grad_norm": 0.7776924967765808, "learning_rate": 0.00044392928144432306, "loss": 1.6976, "step": 405000 }, { "epoch": 1.6748654531636578, "grad_norm": 0.8118670582771301, "learning_rate": 0.0004432385086491662, "loss": 1.7039, "step": 405500 }, { "epoch": 1.6769306386792726, "grad_norm": 0.7525516152381897, "learning_rate": 0.0004425477358540093, "loss": 1.7017, "step": 406000 }, { "epoch": 1.6789958241948875, "grad_norm": 0.7723379135131836, "learning_rate": 0.00044185696305885247, "loss": 1.6997, "step": 406500 }, { "epoch": 1.6810610097105023, "grad_norm": 0.7601300477981567, "learning_rate": 0.00044116619026369564, "loss": 1.6931, "step": 407000 }, { "epoch": 1.6831261952261172, "grad_norm": 0.788893461227417, "learning_rate": 0.00044047541746853876, "loss": 1.6941, "step": 407500 }, { "epoch": 1.685191380741732, "grad_norm": 0.8101310133934021, "learning_rate": 0.00043978464467338193, "loss": 1.697, "step": 408000 }, { "epoch": 1.6872565662573469, "grad_norm": 0.7839348912239075, "learning_rate": 0.0004390938718782251, "loss": 1.7037, "step": 408500 }, { "epoch": 1.6893217517729617, "grad_norm": 1.0003387928009033, "learning_rate": 0.0004384030990830682, "loss": 1.6995, "step": 409000 }, { "epoch": 1.6913869372885766, "grad_norm": 0.7542647123336792, "learning_rate": 0.00043771232628791134, "loss": 1.6982, "step": 409500 }, { "epoch": 1.6934521228041914, "grad_norm": 0.8054424524307251, "learning_rate": 0.0004370215534927545, "loss": 1.6971, "step": 410000 }, { "epoch": 1.6955173083198063, "grad_norm": 0.7765061259269714, "learning_rate": 0.0004363307806975976, "loss": 1.6951, "step": 410500 }, { "epoch": 1.6975824938354211, "grad_norm": 0.7774503231048584, "learning_rate": 0.0004356400079024408, "loss": 1.6947, "step": 411000 }, { "epoch": 1.699647679351036, "grad_norm": 0.8000075817108154, "learning_rate": 0.00043494923510728397, "loss": 1.6937, "step": 411500 }, { "epoch": 1.7017128648666509, "grad_norm": 0.8032427430152893, "learning_rate": 0.0004342584623121271, "loss": 1.6921, "step": 412000 }, { "epoch": 1.7037780503822657, "grad_norm": 0.7988405227661133, "learning_rate": 0.00043356768951697026, "loss": 1.6968, "step": 412500 }, { "epoch": 1.7058432358978806, "grad_norm": 0.7719324231147766, "learning_rate": 0.0004328769167218134, "loss": 1.6912, "step": 413000 }, { "epoch": 1.7079084214134954, "grad_norm": 0.7580344080924988, "learning_rate": 0.0004321861439266565, "loss": 1.6951, "step": 413500 }, { "epoch": 1.7099736069291105, "grad_norm": 0.8045200705528259, "learning_rate": 0.00043149537113149967, "loss": 1.6921, "step": 414000 }, { "epoch": 1.7120387924447253, "grad_norm": 0.7698059678077698, "learning_rate": 0.00043080459833634284, "loss": 1.6929, "step": 414500 }, { "epoch": 1.7141039779603402, "grad_norm": 0.8124533891677856, "learning_rate": 0.00043011382554118596, "loss": 1.6918, "step": 415000 }, { "epoch": 1.716169163475955, "grad_norm": 0.7770412564277649, "learning_rate": 0.00042942305274602913, "loss": 1.6903, "step": 415500 }, { "epoch": 1.71823434899157, "grad_norm": 0.7901027202606201, "learning_rate": 0.0004287322799508723, "loss": 1.6932, "step": 416000 }, { "epoch": 1.7202995345071848, "grad_norm": 0.7586656212806702, "learning_rate": 0.00042804150715571536, "loss": 1.6932, "step": 416500 }, { "epoch": 1.7223647200227996, "grad_norm": 0.7596163153648376, "learning_rate": 0.00042735073436055853, "loss": 1.6979, "step": 417000 }, { "epoch": 1.7244299055384145, "grad_norm": 0.7645015716552734, "learning_rate": 0.0004266599615654017, "loss": 1.6929, "step": 417500 }, { "epoch": 1.7264950910540295, "grad_norm": 0.8256881237030029, "learning_rate": 0.0004259691887702448, "loss": 1.691, "step": 418000 }, { "epoch": 1.7285602765696444, "grad_norm": 0.78524249792099, "learning_rate": 0.000425278415975088, "loss": 1.69, "step": 418500 }, { "epoch": 1.7306254620852592, "grad_norm": 0.814737856388092, "learning_rate": 0.00042458764317993117, "loss": 1.6926, "step": 419000 }, { "epoch": 1.732690647600874, "grad_norm": 0.7561067342758179, "learning_rate": 0.00042389687038477423, "loss": 1.6928, "step": 419500 }, { "epoch": 1.734755833116489, "grad_norm": 0.7771661281585693, "learning_rate": 0.0004232060975896174, "loss": 1.6863, "step": 420000 }, { "epoch": 1.7368210186321038, "grad_norm": 0.7177093625068665, "learning_rate": 0.0004225153247944606, "loss": 1.6931, "step": 420500 }, { "epoch": 1.7388862041477187, "grad_norm": 0.8142688870429993, "learning_rate": 0.0004218245519993037, "loss": 1.6895, "step": 421000 }, { "epoch": 1.7409513896633335, "grad_norm": 0.8166112899780273, "learning_rate": 0.00042113377920414687, "loss": 1.69, "step": 421500 }, { "epoch": 1.7430165751789484, "grad_norm": 0.7927871942520142, "learning_rate": 0.00042044300640899004, "loss": 1.69, "step": 422000 }, { "epoch": 1.7450817606945632, "grad_norm": 0.8192989230155945, "learning_rate": 0.00041975223361383316, "loss": 1.6834, "step": 422500 }, { "epoch": 1.747146946210178, "grad_norm": 0.825117290019989, "learning_rate": 0.00041906146081867627, "loss": 1.6875, "step": 423000 }, { "epoch": 1.749212131725793, "grad_norm": 0.8357008695602417, "learning_rate": 0.00041837068802351944, "loss": 1.6869, "step": 423500 }, { "epoch": 1.7512773172414078, "grad_norm": 0.8047915101051331, "learning_rate": 0.00041767991522836256, "loss": 1.6864, "step": 424000 }, { "epoch": 1.7533425027570226, "grad_norm": 0.8068717122077942, "learning_rate": 0.00041698914243320573, "loss": 1.6871, "step": 424500 }, { "epoch": 1.7554076882726375, "grad_norm": 0.7879107594490051, "learning_rate": 0.0004162983696380489, "loss": 1.6826, "step": 425000 }, { "epoch": 1.7574728737882523, "grad_norm": 0.7748578190803528, "learning_rate": 0.000415607596842892, "loss": 1.6831, "step": 425500 }, { "epoch": 1.7595380593038672, "grad_norm": 0.7206512093544006, "learning_rate": 0.0004149168240477352, "loss": 1.6884, "step": 426000 }, { "epoch": 1.761603244819482, "grad_norm": 0.7805559039115906, "learning_rate": 0.0004142260512525783, "loss": 1.6863, "step": 426500 }, { "epoch": 1.763668430335097, "grad_norm": 0.8146787285804749, "learning_rate": 0.00041353527845742143, "loss": 1.6838, "step": 427000 }, { "epoch": 1.7657336158507118, "grad_norm": 0.7216916680335999, "learning_rate": 0.0004128445056622646, "loss": 1.6863, "step": 427500 }, { "epoch": 1.7677988013663266, "grad_norm": 0.7865545153617859, "learning_rate": 0.0004121537328671078, "loss": 1.6838, "step": 428000 }, { "epoch": 1.7698639868819415, "grad_norm": 0.7617883682250977, "learning_rate": 0.0004114629600719509, "loss": 1.684, "step": 428500 }, { "epoch": 1.7719291723975563, "grad_norm": 0.8186792135238647, "learning_rate": 0.00041077218727679407, "loss": 1.6828, "step": 429000 }, { "epoch": 1.7739943579131714, "grad_norm": 0.7898605465888977, "learning_rate": 0.00041008141448163724, "loss": 1.68, "step": 429500 }, { "epoch": 1.7760595434287862, "grad_norm": 0.7490332126617432, "learning_rate": 0.0004093906416864803, "loss": 1.6822, "step": 430000 }, { "epoch": 1.778124728944401, "grad_norm": 0.7616461515426636, "learning_rate": 0.00040869986889132347, "loss": 1.6866, "step": 430500 }, { "epoch": 1.780189914460016, "grad_norm": 0.7681095004081726, "learning_rate": 0.00040800909609616664, "loss": 1.6811, "step": 431000 }, { "epoch": 1.7822550999756308, "grad_norm": 0.7684192657470703, "learning_rate": 0.00040731832330100976, "loss": 1.6862, "step": 431500 }, { "epoch": 1.7843202854912457, "grad_norm": 0.7826496362686157, "learning_rate": 0.00040662755050585293, "loss": 1.6859, "step": 432000 }, { "epoch": 1.7863854710068605, "grad_norm": 0.7974809408187866, "learning_rate": 0.0004059367777106961, "loss": 1.6833, "step": 432500 }, { "epoch": 1.7884506565224754, "grad_norm": 0.8294712901115417, "learning_rate": 0.0004052460049155392, "loss": 1.6832, "step": 433000 }, { "epoch": 1.7905158420380904, "grad_norm": 0.8153785467147827, "learning_rate": 0.00040455523212038234, "loss": 1.6811, "step": 433500 }, { "epoch": 1.7925810275537053, "grad_norm": 0.80795818567276, "learning_rate": 0.0004038644593252255, "loss": 1.6788, "step": 434000 }, { "epoch": 1.7946462130693202, "grad_norm": 0.7648016214370728, "learning_rate": 0.00040317368653006863, "loss": 1.686, "step": 434500 }, { "epoch": 1.796711398584935, "grad_norm": 0.7882753610610962, "learning_rate": 0.0004024829137349118, "loss": 1.6792, "step": 435000 }, { "epoch": 1.7987765841005499, "grad_norm": 0.7577452659606934, "learning_rate": 0.000401792140939755, "loss": 1.6803, "step": 435500 }, { "epoch": 1.8008417696161647, "grad_norm": 0.7712865471839905, "learning_rate": 0.0004011013681445981, "loss": 1.6813, "step": 436000 }, { "epoch": 1.8029069551317796, "grad_norm": 0.7820202708244324, "learning_rate": 0.00040041059534944126, "loss": 1.6819, "step": 436500 }, { "epoch": 1.8049721406473944, "grad_norm": 0.7566621899604797, "learning_rate": 0.0003997198225542844, "loss": 1.68, "step": 437000 }, { "epoch": 1.8070373261630093, "grad_norm": 0.7587839365005493, "learning_rate": 0.0003990290497591275, "loss": 1.6833, "step": 437500 }, { "epoch": 1.8091025116786241, "grad_norm": 0.798997700214386, "learning_rate": 0.00039833827696397067, "loss": 1.6797, "step": 438000 }, { "epoch": 1.811167697194239, "grad_norm": 0.7913112044334412, "learning_rate": 0.00039764750416881384, "loss": 1.6792, "step": 438500 }, { "epoch": 1.8132328827098538, "grad_norm": 0.7663547992706299, "learning_rate": 0.00039695673137365696, "loss": 1.6807, "step": 439000 }, { "epoch": 1.8152980682254687, "grad_norm": 0.77425217628479, "learning_rate": 0.00039626595857850013, "loss": 1.6759, "step": 439500 }, { "epoch": 1.8173632537410835, "grad_norm": 0.807633101940155, "learning_rate": 0.0003955751857833433, "loss": 1.6777, "step": 440000 }, { "epoch": 1.8194284392566984, "grad_norm": 0.7748910188674927, "learning_rate": 0.00039488441298818637, "loss": 1.6794, "step": 440500 }, { "epoch": 1.8214936247723132, "grad_norm": 0.8132478594779968, "learning_rate": 0.00039419364019302954, "loss": 1.6777, "step": 441000 }, { "epoch": 1.823558810287928, "grad_norm": 0.7609587907791138, "learning_rate": 0.0003935028673978727, "loss": 1.6775, "step": 441500 }, { "epoch": 1.825623995803543, "grad_norm": 0.8203696608543396, "learning_rate": 0.00039281209460271583, "loss": 1.675, "step": 442000 }, { "epoch": 1.8276891813191578, "grad_norm": 0.7865495681762695, "learning_rate": 0.000392121321807559, "loss": 1.6783, "step": 442500 }, { "epoch": 1.8297543668347727, "grad_norm": 0.7632693648338318, "learning_rate": 0.0003914305490124022, "loss": 1.6771, "step": 443000 }, { "epoch": 1.8318195523503875, "grad_norm": 0.790891706943512, "learning_rate": 0.0003907397762172453, "loss": 1.6787, "step": 443500 }, { "epoch": 1.8338847378660024, "grad_norm": 0.7918925881385803, "learning_rate": 0.0003900490034220884, "loss": 1.6749, "step": 444000 }, { "epoch": 1.8359499233816172, "grad_norm": 0.8381515741348267, "learning_rate": 0.0003893582306269316, "loss": 1.6741, "step": 444500 }, { "epoch": 1.838015108897232, "grad_norm": 0.8085419535636902, "learning_rate": 0.0003886674578317747, "loss": 1.6697, "step": 445000 }, { "epoch": 1.8400802944128472, "grad_norm": 0.7606683969497681, "learning_rate": 0.00038797668503661787, "loss": 1.6745, "step": 445500 }, { "epoch": 1.842145479928462, "grad_norm": 0.7622495889663696, "learning_rate": 0.00038728591224146104, "loss": 1.6722, "step": 446000 }, { "epoch": 1.8442106654440769, "grad_norm": 0.8180463910102844, "learning_rate": 0.00038659513944630416, "loss": 1.6708, "step": 446500 }, { "epoch": 1.8462758509596917, "grad_norm": 0.7783413529396057, "learning_rate": 0.00038590436665114733, "loss": 1.6741, "step": 447000 }, { "epoch": 1.8483410364753066, "grad_norm": 0.7698727250099182, "learning_rate": 0.00038521359385599045, "loss": 1.6744, "step": 447500 }, { "epoch": 1.8504062219909214, "grad_norm": 0.7889679670333862, "learning_rate": 0.00038452282106083357, "loss": 1.6744, "step": 448000 }, { "epoch": 1.8524714075065363, "grad_norm": 0.8463781476020813, "learning_rate": 0.00038383204826567674, "loss": 1.6693, "step": 448500 }, { "epoch": 1.8545365930221511, "grad_norm": 0.7730614542961121, "learning_rate": 0.0003831412754705199, "loss": 1.6748, "step": 449000 }, { "epoch": 1.8566017785377662, "grad_norm": 0.7694717049598694, "learning_rate": 0.00038245050267536303, "loss": 1.6723, "step": 449500 }, { "epoch": 1.858666964053381, "grad_norm": 0.7720078229904175, "learning_rate": 0.0003817597298802062, "loss": 1.6712, "step": 450000 }, { "epoch": 1.860732149568996, "grad_norm": 0.7817273139953613, "learning_rate": 0.0003810689570850494, "loss": 1.6703, "step": 450500 }, { "epoch": 1.8627973350846108, "grad_norm": 0.7825304269790649, "learning_rate": 0.00038037818428989244, "loss": 1.6688, "step": 451000 }, { "epoch": 1.8648625206002256, "grad_norm": 0.7758463621139526, "learning_rate": 0.0003796874114947356, "loss": 1.6657, "step": 451500 }, { "epoch": 1.8669277061158405, "grad_norm": 0.7757241129875183, "learning_rate": 0.0003789966386995788, "loss": 1.6734, "step": 452000 }, { "epoch": 1.8689928916314553, "grad_norm": 0.8086944222450256, "learning_rate": 0.0003783058659044219, "loss": 1.669, "step": 452500 }, { "epoch": 1.8710580771470702, "grad_norm": 0.7736507058143616, "learning_rate": 0.00037761509310926507, "loss": 1.6729, "step": 453000 }, { "epoch": 1.873123262662685, "grad_norm": 0.7895172834396362, "learning_rate": 0.00037692432031410824, "loss": 1.6681, "step": 453500 }, { "epoch": 1.8751884481782999, "grad_norm": 0.7610639929771423, "learning_rate": 0.00037623354751895136, "loss": 1.6624, "step": 454000 }, { "epoch": 1.8772536336939147, "grad_norm": 0.7881196737289429, "learning_rate": 0.0003755427747237945, "loss": 1.6697, "step": 454500 }, { "epoch": 1.8793188192095296, "grad_norm": 0.7839071154594421, "learning_rate": 0.00037485200192863765, "loss": 1.6713, "step": 455000 }, { "epoch": 1.8813840047251444, "grad_norm": 0.7790060043334961, "learning_rate": 0.00037416122913348077, "loss": 1.6683, "step": 455500 }, { "epoch": 1.8834491902407593, "grad_norm": 0.757612943649292, "learning_rate": 0.00037347045633832394, "loss": 1.6662, "step": 456000 }, { "epoch": 1.8855143757563742, "grad_norm": 0.7868499755859375, "learning_rate": 0.0003727796835431671, "loss": 1.6666, "step": 456500 }, { "epoch": 1.887579561271989, "grad_norm": 0.8040853142738342, "learning_rate": 0.00037208891074801023, "loss": 1.6637, "step": 457000 }, { "epoch": 1.8896447467876039, "grad_norm": 0.7756462693214417, "learning_rate": 0.0003713981379528534, "loss": 1.6678, "step": 457500 }, { "epoch": 1.8917099323032187, "grad_norm": 0.781300961971283, "learning_rate": 0.0003707073651576965, "loss": 1.6656, "step": 458000 }, { "epoch": 1.8937751178188336, "grad_norm": 0.7810469269752502, "learning_rate": 0.00037001659236253964, "loss": 1.6617, "step": 458500 }, { "epoch": 1.8958403033344484, "grad_norm": 0.7562840580940247, "learning_rate": 0.0003693258195673828, "loss": 1.6635, "step": 459000 }, { "epoch": 1.8979054888500633, "grad_norm": 0.7803590893745422, "learning_rate": 0.000368635046772226, "loss": 1.6689, "step": 459500 }, { "epoch": 1.8999706743656781, "grad_norm": 0.8209202885627747, "learning_rate": 0.0003679442739770691, "loss": 1.6632, "step": 460000 }, { "epoch": 1.902035859881293, "grad_norm": 0.7608214020729065, "learning_rate": 0.00036725350118191227, "loss": 1.6607, "step": 460500 }, { "epoch": 1.904101045396908, "grad_norm": 0.796277642250061, "learning_rate": 0.00036656272838675544, "loss": 1.6653, "step": 461000 }, { "epoch": 1.906166230912523, "grad_norm": 0.796653687953949, "learning_rate": 0.0003658719555915985, "loss": 1.6601, "step": 461500 }, { "epoch": 1.9082314164281378, "grad_norm": 0.7833842039108276, "learning_rate": 0.0003651811827964417, "loss": 1.662, "step": 462000 }, { "epoch": 1.9102966019437526, "grad_norm": 0.7710606455802917, "learning_rate": 0.00036449041000128485, "loss": 1.6616, "step": 462500 }, { "epoch": 1.9123617874593675, "grad_norm": 0.7609611749649048, "learning_rate": 0.00036379963720612797, "loss": 1.6612, "step": 463000 }, { "epoch": 1.9144269729749823, "grad_norm": 0.7709171175956726, "learning_rate": 0.00036310886441097114, "loss": 1.6657, "step": 463500 }, { "epoch": 1.9164921584905972, "grad_norm": 0.7778812646865845, "learning_rate": 0.0003624180916158143, "loss": 1.6663, "step": 464000 }, { "epoch": 1.918557344006212, "grad_norm": 0.7948848605155945, "learning_rate": 0.0003617273188206574, "loss": 1.6607, "step": 464500 }, { "epoch": 1.920622529521827, "grad_norm": 0.7880497574806213, "learning_rate": 0.00036103654602550055, "loss": 1.6615, "step": 465000 }, { "epoch": 1.922687715037442, "grad_norm": 0.7933222055435181, "learning_rate": 0.0003603457732303437, "loss": 1.6622, "step": 465500 }, { "epoch": 1.9247529005530568, "grad_norm": 0.7489884495735168, "learning_rate": 0.00035965500043518684, "loss": 1.6594, "step": 466000 }, { "epoch": 1.9268180860686717, "grad_norm": 0.7909550666809082, "learning_rate": 0.00035896422764003, "loss": 1.6606, "step": 466500 }, { "epoch": 1.9288832715842865, "grad_norm": 0.8264633417129517, "learning_rate": 0.0003582734548448732, "loss": 1.6586, "step": 467000 }, { "epoch": 1.9309484570999014, "grad_norm": 0.8184587359428406, "learning_rate": 0.00035758268204971635, "loss": 1.6621, "step": 467500 }, { "epoch": 1.9330136426155162, "grad_norm": 0.78268963098526, "learning_rate": 0.0003568919092545594, "loss": 1.6603, "step": 468000 }, { "epoch": 1.935078828131131, "grad_norm": 0.7832273244857788, "learning_rate": 0.0003562011364594026, "loss": 1.661, "step": 468500 }, { "epoch": 1.937144013646746, "grad_norm": 0.7547221183776855, "learning_rate": 0.00035551036366424576, "loss": 1.6607, "step": 469000 }, { "epoch": 1.9392091991623608, "grad_norm": 0.8514434099197388, "learning_rate": 0.0003548195908690889, "loss": 1.6612, "step": 469500 }, { "epoch": 1.9412743846779756, "grad_norm": 0.7895204424858093, "learning_rate": 0.00035412881807393205, "loss": 1.6577, "step": 470000 }, { "epoch": 1.9433395701935905, "grad_norm": 0.8596895933151245, "learning_rate": 0.0003534380452787752, "loss": 1.6572, "step": 470500 }, { "epoch": 1.9454047557092053, "grad_norm": 0.7693920731544495, "learning_rate": 0.00035274727248361834, "loss": 1.6587, "step": 471000 }, { "epoch": 1.9474699412248202, "grad_norm": 0.8171895742416382, "learning_rate": 0.00035205649968846146, "loss": 1.6628, "step": 471500 }, { "epoch": 1.949535126740435, "grad_norm": 0.7534123659133911, "learning_rate": 0.00035136572689330463, "loss": 1.6561, "step": 472000 }, { "epoch": 1.95160031225605, "grad_norm": 0.7739940881729126, "learning_rate": 0.00035067495409814775, "loss": 1.6567, "step": 472500 }, { "epoch": 1.9536654977716648, "grad_norm": 0.7825185656547546, "learning_rate": 0.0003499841813029909, "loss": 1.6544, "step": 473000 }, { "epoch": 1.9557306832872796, "grad_norm": 0.7564761638641357, "learning_rate": 0.0003492934085078341, "loss": 1.6577, "step": 473500 }, { "epoch": 1.9577958688028945, "grad_norm": 0.808772087097168, "learning_rate": 0.0003486026357126772, "loss": 1.655, "step": 474000 }, { "epoch": 1.9598610543185093, "grad_norm": 0.8151499629020691, "learning_rate": 0.0003479118629175204, "loss": 1.6579, "step": 474500 }, { "epoch": 1.9619262398341242, "grad_norm": 0.8655403852462769, "learning_rate": 0.0003472210901223635, "loss": 1.6532, "step": 475000 }, { "epoch": 1.963991425349739, "grad_norm": 0.7786942720413208, "learning_rate": 0.0003465303173272066, "loss": 1.653, "step": 475500 }, { "epoch": 1.9660566108653539, "grad_norm": 0.8005113005638123, "learning_rate": 0.0003458395445320498, "loss": 1.6538, "step": 476000 }, { "epoch": 1.968121796380969, "grad_norm": 0.7797335386276245, "learning_rate": 0.00034514877173689296, "loss": 1.6567, "step": 476500 }, { "epoch": 1.9701869818965838, "grad_norm": 0.7935357689857483, "learning_rate": 0.0003444579989417361, "loss": 1.6551, "step": 477000 }, { "epoch": 1.9722521674121987, "grad_norm": 0.7659555077552795, "learning_rate": 0.00034376722614657925, "loss": 1.654, "step": 477500 }, { "epoch": 1.9743173529278135, "grad_norm": 0.7984480857849121, "learning_rate": 0.0003430764533514224, "loss": 1.6513, "step": 478000 }, { "epoch": 1.9763825384434284, "grad_norm": 0.7759101986885071, "learning_rate": 0.0003423856805562655, "loss": 1.6517, "step": 478500 }, { "epoch": 1.9784477239590432, "grad_norm": 0.7922109961509705, "learning_rate": 0.00034169490776110866, "loss": 1.6551, "step": 479000 }, { "epoch": 1.980512909474658, "grad_norm": 0.7864669561386108, "learning_rate": 0.00034100413496595183, "loss": 1.6521, "step": 479500 }, { "epoch": 1.982578094990273, "grad_norm": 0.7987329959869385, "learning_rate": 0.00034031336217079495, "loss": 1.6531, "step": 480000 }, { "epoch": 1.984643280505888, "grad_norm": 0.7777888774871826, "learning_rate": 0.0003396225893756381, "loss": 1.6509, "step": 480500 }, { "epoch": 1.9867084660215029, "grad_norm": 0.7795775532722473, "learning_rate": 0.0003389318165804813, "loss": 1.6518, "step": 481000 }, { "epoch": 1.9887736515371177, "grad_norm": 0.7711332440376282, "learning_rate": 0.0003382410437853244, "loss": 1.6519, "step": 481500 }, { "epoch": 1.9908388370527326, "grad_norm": 0.8026793003082275, "learning_rate": 0.0003375502709901675, "loss": 1.6509, "step": 482000 }, { "epoch": 1.9929040225683474, "grad_norm": 0.7959824204444885, "learning_rate": 0.0003368594981950107, "loss": 1.6511, "step": 482500 }, { "epoch": 1.9949692080839623, "grad_norm": 0.7960503697395325, "learning_rate": 0.0003361687253998538, "loss": 1.6534, "step": 483000 }, { "epoch": 1.9970343935995771, "grad_norm": 0.8475084900856018, "learning_rate": 0.000335477952604697, "loss": 1.6517, "step": 483500 }, { "epoch": 1.999099579115192, "grad_norm": 0.7885191440582275, "learning_rate": 0.00033478717980954016, "loss": 1.6531, "step": 484000 }, { "epoch": 2.001164764630807, "grad_norm": 0.7712221145629883, "learning_rate": 0.0003340964070143833, "loss": 1.6502, "step": 484500 }, { "epoch": 2.0032299501464217, "grad_norm": 0.7648369669914246, "learning_rate": 0.00033340563421922645, "loss": 1.6441, "step": 485000 }, { "epoch": 2.0052951356620365, "grad_norm": 0.8174281120300293, "learning_rate": 0.00033271486142406957, "loss": 1.6481, "step": 485500 }, { "epoch": 2.0073603211776514, "grad_norm": 0.7810222506523132, "learning_rate": 0.0003320240886289127, "loss": 1.6484, "step": 486000 }, { "epoch": 2.0094255066932663, "grad_norm": 0.8257454633712769, "learning_rate": 0.00033133331583375586, "loss": 1.6465, "step": 486500 }, { "epoch": 2.011490692208881, "grad_norm": 0.7819936871528625, "learning_rate": 0.00033064254303859903, "loss": 1.6459, "step": 487000 }, { "epoch": 2.013555877724496, "grad_norm": 0.7899196743965149, "learning_rate": 0.00032995177024344215, "loss": 1.6459, "step": 487500 }, { "epoch": 2.015621063240111, "grad_norm": 0.8132250905036926, "learning_rate": 0.0003292609974482853, "loss": 1.6488, "step": 488000 }, { "epoch": 2.0176862487557257, "grad_norm": 0.803816020488739, "learning_rate": 0.0003285702246531285, "loss": 1.6426, "step": 488500 }, { "epoch": 2.0197514342713405, "grad_norm": 0.7602670788764954, "learning_rate": 0.00032787945185797155, "loss": 1.6462, "step": 489000 }, { "epoch": 2.0218166197869554, "grad_norm": 0.7873088121414185, "learning_rate": 0.0003271886790628147, "loss": 1.6463, "step": 489500 }, { "epoch": 2.0238818053025702, "grad_norm": 0.81231290102005, "learning_rate": 0.0003264979062676579, "loss": 1.6477, "step": 490000 }, { "epoch": 2.025946990818185, "grad_norm": 0.8037064671516418, "learning_rate": 0.000325807133472501, "loss": 1.6454, "step": 490500 }, { "epoch": 2.0280121763338, "grad_norm": 0.8113204836845398, "learning_rate": 0.0003251163606773442, "loss": 1.6449, "step": 491000 }, { "epoch": 2.030077361849415, "grad_norm": 0.7967438101768494, "learning_rate": 0.00032442558788218736, "loss": 1.6413, "step": 491500 }, { "epoch": 2.0321425473650296, "grad_norm": 0.7982317805290222, "learning_rate": 0.0003237348150870305, "loss": 1.6461, "step": 492000 }, { "epoch": 2.0342077328806445, "grad_norm": 0.786389172077179, "learning_rate": 0.0003230440422918736, "loss": 1.6492, "step": 492500 }, { "epoch": 2.0362729183962593, "grad_norm": 0.8528838157653809, "learning_rate": 0.00032235326949671677, "loss": 1.6433, "step": 493000 }, { "epoch": 2.038338103911874, "grad_norm": 0.7775473594665527, "learning_rate": 0.0003216624967015599, "loss": 1.6445, "step": 493500 }, { "epoch": 2.040403289427489, "grad_norm": 0.7589669227600098, "learning_rate": 0.00032097172390640306, "loss": 1.6424, "step": 494000 }, { "epoch": 2.0424684749431044, "grad_norm": 0.7403915524482727, "learning_rate": 0.00032028095111124623, "loss": 1.6414, "step": 494500 }, { "epoch": 2.044533660458719, "grad_norm": 0.7815344333648682, "learning_rate": 0.00031959017831608935, "loss": 1.6398, "step": 495000 }, { "epoch": 2.046598845974334, "grad_norm": 0.7826516628265381, "learning_rate": 0.0003188994055209325, "loss": 1.6404, "step": 495500 }, { "epoch": 2.048664031489949, "grad_norm": 0.8382503986358643, "learning_rate": 0.00031820863272577564, "loss": 1.6477, "step": 496000 }, { "epoch": 2.0507292170055638, "grad_norm": 0.8345251679420471, "learning_rate": 0.00031751785993061875, "loss": 1.6395, "step": 496500 }, { "epoch": 2.0527944025211786, "grad_norm": 0.7702645659446716, "learning_rate": 0.0003168270871354619, "loss": 1.6393, "step": 497000 }, { "epoch": 2.0548595880367935, "grad_norm": 0.7861506938934326, "learning_rate": 0.0003161363143403051, "loss": 1.6431, "step": 497500 }, { "epoch": 2.0569247735524083, "grad_norm": 0.8483462929725647, "learning_rate": 0.0003154455415451482, "loss": 1.638, "step": 498000 }, { "epoch": 2.058989959068023, "grad_norm": 0.7427666783332825, "learning_rate": 0.0003147547687499914, "loss": 1.6398, "step": 498500 }, { "epoch": 2.061055144583638, "grad_norm": 0.8200947046279907, "learning_rate": 0.00031406399595483456, "loss": 1.6442, "step": 499000 }, { "epoch": 2.063120330099253, "grad_norm": 0.7826699018478394, "learning_rate": 0.0003133732231596776, "loss": 1.6373, "step": 499500 }, { "epoch": 2.0651855156148677, "grad_norm": 0.8340067267417908, "learning_rate": 0.0003126824503645208, "loss": 1.6423, "step": 500000 }, { "epoch": 2.0672507011304826, "grad_norm": 0.8408244252204895, "learning_rate": 0.00031199167756936397, "loss": 1.6385, "step": 500500 }, { "epoch": 2.0693158866460974, "grad_norm": 0.7903205752372742, "learning_rate": 0.0003113009047742071, "loss": 1.64, "step": 501000 }, { "epoch": 2.0713810721617123, "grad_norm": 0.8002933859825134, "learning_rate": 0.00031061013197905026, "loss": 1.6406, "step": 501500 }, { "epoch": 2.073446257677327, "grad_norm": 0.7864850759506226, "learning_rate": 0.00030991935918389343, "loss": 1.6357, "step": 502000 }, { "epoch": 2.075511443192942, "grad_norm": 0.8161391615867615, "learning_rate": 0.00030922858638873655, "loss": 1.6361, "step": 502500 }, { "epoch": 2.077576628708557, "grad_norm": 0.8277705311775208, "learning_rate": 0.00030853781359357966, "loss": 1.6375, "step": 503000 }, { "epoch": 2.0796418142241717, "grad_norm": 0.7988829016685486, "learning_rate": 0.00030784704079842284, "loss": 1.6327, "step": 503500 }, { "epoch": 2.0817069997397866, "grad_norm": 0.7771642208099365, "learning_rate": 0.00030715626800326595, "loss": 1.6331, "step": 504000 }, { "epoch": 2.0837721852554014, "grad_norm": 0.7471011281013489, "learning_rate": 0.0003064654952081091, "loss": 1.6352, "step": 504500 }, { "epoch": 2.0858373707710163, "grad_norm": 0.7738475203514099, "learning_rate": 0.0003057747224129523, "loss": 1.6401, "step": 505000 }, { "epoch": 2.087902556286631, "grad_norm": 0.7593071460723877, "learning_rate": 0.0003050839496177954, "loss": 1.6394, "step": 505500 }, { "epoch": 2.089967741802246, "grad_norm": 0.7778981328010559, "learning_rate": 0.0003043931768226386, "loss": 1.6379, "step": 506000 }, { "epoch": 2.092032927317861, "grad_norm": 0.7668618559837341, "learning_rate": 0.0003037024040274817, "loss": 1.6374, "step": 506500 }, { "epoch": 2.0940981128334757, "grad_norm": 0.7854458689689636, "learning_rate": 0.0003030116312323248, "loss": 1.6314, "step": 507000 }, { "epoch": 2.0961632983490905, "grad_norm": 0.7660508751869202, "learning_rate": 0.000302320858437168, "loss": 1.6326, "step": 507500 }, { "epoch": 2.0982284838647054, "grad_norm": 0.759593665599823, "learning_rate": 0.00030163008564201117, "loss": 1.6352, "step": 508000 }, { "epoch": 2.1002936693803202, "grad_norm": 0.7907975912094116, "learning_rate": 0.0003009393128468543, "loss": 1.6301, "step": 508500 }, { "epoch": 2.102358854895935, "grad_norm": 0.8606127500534058, "learning_rate": 0.00030024854005169746, "loss": 1.6329, "step": 509000 }, { "epoch": 2.10442404041155, "grad_norm": 0.788470447063446, "learning_rate": 0.00029955776725654063, "loss": 1.6336, "step": 509500 }, { "epoch": 2.106489225927165, "grad_norm": 0.7975521087646484, "learning_rate": 0.0002988669944613837, "loss": 1.6354, "step": 510000 }, { "epoch": 2.10855441144278, "grad_norm": 0.8134068250656128, "learning_rate": 0.00029817622166622686, "loss": 1.6354, "step": 510500 }, { "epoch": 2.110619596958395, "grad_norm": 0.8084931373596191, "learning_rate": 0.00029748544887107004, "loss": 1.6348, "step": 511000 }, { "epoch": 2.11268478247401, "grad_norm": 0.8037887811660767, "learning_rate": 0.00029679467607591315, "loss": 1.6315, "step": 511500 }, { "epoch": 2.1147499679896247, "grad_norm": 0.7990454435348511, "learning_rate": 0.0002961039032807563, "loss": 1.6297, "step": 512000 }, { "epoch": 2.1168151535052395, "grad_norm": 0.7971472144126892, "learning_rate": 0.0002954131304855995, "loss": 1.6312, "step": 512500 }, { "epoch": 2.1188803390208544, "grad_norm": 0.8105595707893372, "learning_rate": 0.0002947223576904426, "loss": 1.6291, "step": 513000 }, { "epoch": 2.1209455245364692, "grad_norm": 0.8046666979789734, "learning_rate": 0.00029403158489528573, "loss": 1.6323, "step": 513500 }, { "epoch": 2.123010710052084, "grad_norm": 0.8228232860565186, "learning_rate": 0.0002933408121001289, "loss": 1.6309, "step": 514000 }, { "epoch": 2.125075895567699, "grad_norm": 0.7555162906646729, "learning_rate": 0.000292650039304972, "loss": 1.6305, "step": 514500 }, { "epoch": 2.127141081083314, "grad_norm": 0.7698606848716736, "learning_rate": 0.0002919592665098152, "loss": 1.6326, "step": 515000 }, { "epoch": 2.1292062665989286, "grad_norm": 0.7718132138252258, "learning_rate": 0.00029126849371465837, "loss": 1.6291, "step": 515500 }, { "epoch": 2.1312714521145435, "grad_norm": 0.7855656147003174, "learning_rate": 0.0002905777209195015, "loss": 1.6283, "step": 516000 }, { "epoch": 2.1333366376301584, "grad_norm": 0.8064797520637512, "learning_rate": 0.00028988694812434466, "loss": 1.628, "step": 516500 }, { "epoch": 2.135401823145773, "grad_norm": 0.7986974716186523, "learning_rate": 0.0002891961753291878, "loss": 1.6288, "step": 517000 }, { "epoch": 2.137467008661388, "grad_norm": 0.8309503197669983, "learning_rate": 0.0002885054025340309, "loss": 1.6316, "step": 517500 }, { "epoch": 2.139532194177003, "grad_norm": 0.8169652819633484, "learning_rate": 0.00028781462973887406, "loss": 1.6266, "step": 518000 }, { "epoch": 2.1415973796926178, "grad_norm": 0.7754685282707214, "learning_rate": 0.00028712385694371724, "loss": 1.6307, "step": 518500 }, { "epoch": 2.1436625652082326, "grad_norm": 0.7740616798400879, "learning_rate": 0.00028643308414856035, "loss": 1.6287, "step": 519000 }, { "epoch": 2.1457277507238475, "grad_norm": 0.7874515056610107, "learning_rate": 0.0002857423113534035, "loss": 1.6254, "step": 519500 }, { "epoch": 2.1477929362394623, "grad_norm": 0.8042634725570679, "learning_rate": 0.0002850515385582467, "loss": 1.6248, "step": 520000 }, { "epoch": 2.149858121755077, "grad_norm": 0.8339025974273682, "learning_rate": 0.00028436076576308976, "loss": 1.6293, "step": 520500 }, { "epoch": 2.151923307270692, "grad_norm": 0.822348415851593, "learning_rate": 0.00028366999296793293, "loss": 1.6297, "step": 521000 }, { "epoch": 2.153988492786307, "grad_norm": 0.7726020812988281, "learning_rate": 0.0002829792201727761, "loss": 1.6291, "step": 521500 }, { "epoch": 2.1560536783019217, "grad_norm": 0.7853801846504211, "learning_rate": 0.0002822884473776192, "loss": 1.6255, "step": 522000 }, { "epoch": 2.1581188638175366, "grad_norm": 0.7884477376937866, "learning_rate": 0.0002815976745824624, "loss": 1.627, "step": 522500 }, { "epoch": 2.1601840493331514, "grad_norm": 0.8253931999206543, "learning_rate": 0.00028090690178730557, "loss": 1.6259, "step": 523000 }, { "epoch": 2.1622492348487663, "grad_norm": 0.7904614210128784, "learning_rate": 0.00028021612899214863, "loss": 1.6269, "step": 523500 }, { "epoch": 2.164314420364381, "grad_norm": 0.7632104158401489, "learning_rate": 0.0002795253561969918, "loss": 1.6276, "step": 524000 }, { "epoch": 2.166379605879996, "grad_norm": 0.8433115482330322, "learning_rate": 0.000278834583401835, "loss": 1.624, "step": 524500 }, { "epoch": 2.168444791395611, "grad_norm": 0.8692212104797363, "learning_rate": 0.0002781438106066781, "loss": 1.6217, "step": 525000 }, { "epoch": 2.1705099769112257, "grad_norm": 0.7796012759208679, "learning_rate": 0.00027745303781152126, "loss": 1.622, "step": 525500 }, { "epoch": 2.172575162426841, "grad_norm": 0.8118318319320679, "learning_rate": 0.00027676226501636443, "loss": 1.6234, "step": 526000 }, { "epoch": 2.174640347942456, "grad_norm": 0.7940993309020996, "learning_rate": 0.00027607149222120755, "loss": 1.626, "step": 526500 }, { "epoch": 2.1767055334580707, "grad_norm": 0.797366738319397, "learning_rate": 0.00027538071942605067, "loss": 1.6189, "step": 527000 }, { "epoch": 2.1787707189736856, "grad_norm": 0.7807763814926147, "learning_rate": 0.00027468994663089384, "loss": 1.6204, "step": 527500 }, { "epoch": 2.1808359044893004, "grad_norm": 0.8015199303627014, "learning_rate": 0.000273999173835737, "loss": 1.6214, "step": 528000 }, { "epoch": 2.1829010900049153, "grad_norm": 0.8279714584350586, "learning_rate": 0.00027330840104058013, "loss": 1.6238, "step": 528500 }, { "epoch": 2.18496627552053, "grad_norm": 0.7654675245285034, "learning_rate": 0.0002726176282454233, "loss": 1.6285, "step": 529000 }, { "epoch": 2.187031461036145, "grad_norm": 0.7837437391281128, "learning_rate": 0.0002719268554502665, "loss": 1.6214, "step": 529500 }, { "epoch": 2.18909664655176, "grad_norm": 1.069981336593628, "learning_rate": 0.0002712360826551096, "loss": 1.6179, "step": 530000 }, { "epoch": 2.1911618320673747, "grad_norm": 0.7750839591026306, "learning_rate": 0.0002705453098599527, "loss": 1.6207, "step": 530500 }, { "epoch": 2.1932270175829895, "grad_norm": 0.7411586046218872, "learning_rate": 0.0002698545370647959, "loss": 1.6213, "step": 531000 }, { "epoch": 2.1952922030986044, "grad_norm": 0.8239914774894714, "learning_rate": 0.000269163764269639, "loss": 1.6162, "step": 531500 }, { "epoch": 2.1973573886142193, "grad_norm": 0.7895837426185608, "learning_rate": 0.0002684729914744822, "loss": 1.6175, "step": 532000 }, { "epoch": 2.199422574129834, "grad_norm": 0.7678940892219543, "learning_rate": 0.00026778221867932534, "loss": 1.6182, "step": 532500 }, { "epoch": 2.201487759645449, "grad_norm": 0.7663738131523132, "learning_rate": 0.00026709144588416846, "loss": 1.6161, "step": 533000 }, { "epoch": 2.203552945161064, "grad_norm": 0.8070668578147888, "learning_rate": 0.00026640067308901163, "loss": 1.6155, "step": 533500 }, { "epoch": 2.2056181306766787, "grad_norm": 0.8500379323959351, "learning_rate": 0.00026570990029385475, "loss": 1.6189, "step": 534000 }, { "epoch": 2.2076833161922935, "grad_norm": 0.8292637467384338, "learning_rate": 0.00026501912749869787, "loss": 1.6153, "step": 534500 }, { "epoch": 2.2097485017079084, "grad_norm": 0.7907617688179016, "learning_rate": 0.00026432835470354104, "loss": 1.6174, "step": 535000 }, { "epoch": 2.2118136872235232, "grad_norm": 0.7643933892250061, "learning_rate": 0.0002636375819083842, "loss": 1.6175, "step": 535500 }, { "epoch": 2.213878872739138, "grad_norm": 0.7963258624076843, "learning_rate": 0.00026294680911322733, "loss": 1.6184, "step": 536000 }, { "epoch": 2.215944058254753, "grad_norm": 0.7595391273498535, "learning_rate": 0.0002622560363180705, "loss": 1.6167, "step": 536500 }, { "epoch": 2.218009243770368, "grad_norm": 0.8099820613861084, "learning_rate": 0.0002615652635229137, "loss": 1.6172, "step": 537000 }, { "epoch": 2.2200744292859826, "grad_norm": 0.82416170835495, "learning_rate": 0.00026087449072775674, "loss": 1.6141, "step": 537500 }, { "epoch": 2.2221396148015975, "grad_norm": 0.8243468999862671, "learning_rate": 0.0002601837179325999, "loss": 1.616, "step": 538000 }, { "epoch": 2.2242048003172123, "grad_norm": 0.8235235214233398, "learning_rate": 0.0002594929451374431, "loss": 1.6142, "step": 538500 }, { "epoch": 2.226269985832827, "grad_norm": 0.8147215843200684, "learning_rate": 0.0002588021723422862, "loss": 1.6113, "step": 539000 }, { "epoch": 2.228335171348442, "grad_norm": 0.8038352131843567, "learning_rate": 0.00025811139954712937, "loss": 1.6198, "step": 539500 }, { "epoch": 2.230400356864057, "grad_norm": 0.7971067428588867, "learning_rate": 0.00025742062675197254, "loss": 1.6171, "step": 540000 }, { "epoch": 2.2324655423796718, "grad_norm": 0.8829773664474487, "learning_rate": 0.00025672985395681566, "loss": 1.6124, "step": 540500 }, { "epoch": 2.2345307278952866, "grad_norm": 0.8199840784072876, "learning_rate": 0.0002560390811616588, "loss": 1.6113, "step": 541000 }, { "epoch": 2.236595913410902, "grad_norm": 0.8040071725845337, "learning_rate": 0.00025534830836650195, "loss": 1.617, "step": 541500 }, { "epoch": 2.2386610989265168, "grad_norm": 0.7963501811027527, "learning_rate": 0.00025465753557134507, "loss": 1.6092, "step": 542000 }, { "epoch": 2.2407262844421316, "grad_norm": 0.8022527694702148, "learning_rate": 0.00025396676277618824, "loss": 1.6121, "step": 542500 }, { "epoch": 2.2427914699577465, "grad_norm": 0.7954930663108826, "learning_rate": 0.0002532759899810314, "loss": 1.6151, "step": 543000 }, { "epoch": 2.2448566554733613, "grad_norm": 0.7801050543785095, "learning_rate": 0.00025258521718587453, "loss": 1.6093, "step": 543500 }, { "epoch": 2.246921840988976, "grad_norm": 0.7806600332260132, "learning_rate": 0.0002518944443907177, "loss": 1.6089, "step": 544000 }, { "epoch": 2.248987026504591, "grad_norm": 0.7561779022216797, "learning_rate": 0.0002512036715955608, "loss": 1.6132, "step": 544500 }, { "epoch": 2.251052212020206, "grad_norm": 0.8682865500450134, "learning_rate": 0.00025051289880040394, "loss": 1.6124, "step": 545000 }, { "epoch": 2.2531173975358207, "grad_norm": 0.7335362434387207, "learning_rate": 0.0002498221260052471, "loss": 1.6115, "step": 545500 }, { "epoch": 2.2551825830514356, "grad_norm": 0.8360188603401184, "learning_rate": 0.0002491313532100903, "loss": 1.6157, "step": 546000 }, { "epoch": 2.2572477685670504, "grad_norm": 0.8048787713050842, "learning_rate": 0.0002484405804149334, "loss": 1.6138, "step": 546500 }, { "epoch": 2.2593129540826653, "grad_norm": 0.7759965658187866, "learning_rate": 0.00024774980761977657, "loss": 1.6112, "step": 547000 }, { "epoch": 2.26137813959828, "grad_norm": 0.8284432888031006, "learning_rate": 0.0002470590348246197, "loss": 1.6077, "step": 547500 }, { "epoch": 2.263443325113895, "grad_norm": 0.8492142558097839, "learning_rate": 0.00024636826202946286, "loss": 1.6059, "step": 548000 }, { "epoch": 2.26550851062951, "grad_norm": 0.921442449092865, "learning_rate": 0.000245677489234306, "loss": 1.6078, "step": 548500 }, { "epoch": 2.2675736961451247, "grad_norm": 0.7907894253730774, "learning_rate": 0.00024498671643914915, "loss": 1.6118, "step": 549000 }, { "epoch": 2.2696388816607396, "grad_norm": 0.7716451287269592, "learning_rate": 0.0002442959436439923, "loss": 1.6066, "step": 549500 }, { "epoch": 2.2717040671763544, "grad_norm": 0.757423460483551, "learning_rate": 0.00024360517084883544, "loss": 1.6098, "step": 550000 }, { "epoch": 2.2737692526919693, "grad_norm": 0.8127204179763794, "learning_rate": 0.00024291439805367856, "loss": 1.6089, "step": 550500 }, { "epoch": 2.275834438207584, "grad_norm": 0.7709484100341797, "learning_rate": 0.00024222362525852173, "loss": 1.6081, "step": 551000 }, { "epoch": 2.277899623723199, "grad_norm": 0.7922874093055725, "learning_rate": 0.00024153285246336488, "loss": 1.6064, "step": 551500 }, { "epoch": 2.279964809238814, "grad_norm": 0.789162814617157, "learning_rate": 0.00024084207966820802, "loss": 1.6052, "step": 552000 }, { "epoch": 2.2820299947544287, "grad_norm": 0.8289847373962402, "learning_rate": 0.00024015130687305117, "loss": 1.604, "step": 552500 }, { "epoch": 2.2840951802700435, "grad_norm": 0.8505263924598694, "learning_rate": 0.0002394605340778943, "loss": 1.6046, "step": 553000 }, { "epoch": 2.2861603657856584, "grad_norm": 0.7633844614028931, "learning_rate": 0.00023876976128273746, "loss": 1.6098, "step": 553500 }, { "epoch": 2.2882255513012733, "grad_norm": 0.775978147983551, "learning_rate": 0.0002380789884875806, "loss": 1.606, "step": 554000 }, { "epoch": 2.290290736816888, "grad_norm": 0.8002934455871582, "learning_rate": 0.00023738821569242374, "loss": 1.5989, "step": 554500 }, { "epoch": 2.292355922332503, "grad_norm": 0.8203332424163818, "learning_rate": 0.0002366974428972669, "loss": 1.6076, "step": 555000 }, { "epoch": 2.294421107848118, "grad_norm": 0.7718694806098938, "learning_rate": 0.00023600667010211006, "loss": 1.6023, "step": 555500 }, { "epoch": 2.296486293363733, "grad_norm": 0.8252015709877014, "learning_rate": 0.00023531589730695318, "loss": 1.6046, "step": 556000 }, { "epoch": 2.2985514788793475, "grad_norm": 0.868835985660553, "learning_rate": 0.00023462512451179632, "loss": 1.6013, "step": 556500 }, { "epoch": 2.300616664394963, "grad_norm": 0.8472076058387756, "learning_rate": 0.0002339343517166395, "loss": 1.6006, "step": 557000 }, { "epoch": 2.3026818499105772, "grad_norm": 0.7968847751617432, "learning_rate": 0.00023324357892148261, "loss": 1.6055, "step": 557500 }, { "epoch": 2.3047470354261925, "grad_norm": 0.8021098375320435, "learning_rate": 0.00023255280612632576, "loss": 1.6016, "step": 558000 }, { "epoch": 2.3068122209418074, "grad_norm": 0.852824866771698, "learning_rate": 0.00023186203333116893, "loss": 1.6028, "step": 558500 }, { "epoch": 2.3088774064574222, "grad_norm": 0.8099557161331177, "learning_rate": 0.00023117126053601208, "loss": 1.6022, "step": 559000 }, { "epoch": 2.310942591973037, "grad_norm": 0.7851099371910095, "learning_rate": 0.0002304804877408552, "loss": 1.6012, "step": 559500 }, { "epoch": 2.313007777488652, "grad_norm": 0.7841119170188904, "learning_rate": 0.00022978971494569837, "loss": 1.6055, "step": 560000 }, { "epoch": 2.315072963004267, "grad_norm": 0.7725875973701477, "learning_rate": 0.0002290989421505415, "loss": 1.6012, "step": 560500 }, { "epoch": 2.3171381485198816, "grad_norm": 0.8066521286964417, "learning_rate": 0.00022840816935538463, "loss": 1.6021, "step": 561000 }, { "epoch": 2.3192033340354965, "grad_norm": 0.804887056350708, "learning_rate": 0.0002277173965602278, "loss": 1.6006, "step": 561500 }, { "epoch": 2.3212685195511114, "grad_norm": 0.7885397672653198, "learning_rate": 0.00022702662376507094, "loss": 1.6035, "step": 562000 }, { "epoch": 2.323333705066726, "grad_norm": 0.7402700781822205, "learning_rate": 0.00022633585096991406, "loss": 1.5988, "step": 562500 }, { "epoch": 2.325398890582341, "grad_norm": 0.801807701587677, "learning_rate": 0.00022564507817475723, "loss": 1.6025, "step": 563000 }, { "epoch": 2.327464076097956, "grad_norm": 0.7947646379470825, "learning_rate": 0.00022495430537960038, "loss": 1.6002, "step": 563500 }, { "epoch": 2.3295292616135708, "grad_norm": 0.8268435001373291, "learning_rate": 0.00022426353258444352, "loss": 1.5977, "step": 564000 }, { "epoch": 2.3315944471291856, "grad_norm": 0.8092913627624512, "learning_rate": 0.00022357275978928667, "loss": 1.5996, "step": 564500 }, { "epoch": 2.3336596326448005, "grad_norm": 0.7848919630050659, "learning_rate": 0.00022288198699412981, "loss": 1.5995, "step": 565000 }, { "epoch": 2.3357248181604153, "grad_norm": 0.802832305431366, "learning_rate": 0.00022219121419897296, "loss": 1.5979, "step": 565500 }, { "epoch": 2.33779000367603, "grad_norm": 0.8020511865615845, "learning_rate": 0.0002215004414038161, "loss": 1.5981, "step": 566000 }, { "epoch": 2.339855189191645, "grad_norm": 0.8132838606834412, "learning_rate": 0.00022080966860865925, "loss": 1.5959, "step": 566500 }, { "epoch": 2.34192037470726, "grad_norm": 0.8069867491722107, "learning_rate": 0.0002201188958135024, "loss": 1.599, "step": 567000 }, { "epoch": 2.3439855602228747, "grad_norm": 0.8337593078613281, "learning_rate": 0.00021942812301834556, "loss": 1.5981, "step": 567500 }, { "epoch": 2.3460507457384896, "grad_norm": 0.7885046601295471, "learning_rate": 0.00021873735022318868, "loss": 1.597, "step": 568000 }, { "epoch": 2.3481159312541044, "grad_norm": 0.8003047108650208, "learning_rate": 0.00021804657742803183, "loss": 1.5969, "step": 568500 }, { "epoch": 2.3501811167697193, "grad_norm": 0.7714529037475586, "learning_rate": 0.000217355804632875, "loss": 1.5951, "step": 569000 }, { "epoch": 2.352246302285334, "grad_norm": 0.8057835102081299, "learning_rate": 0.00021666503183771812, "loss": 1.598, "step": 569500 }, { "epoch": 2.354311487800949, "grad_norm": 0.830685019493103, "learning_rate": 0.00021597425904256126, "loss": 1.5947, "step": 570000 }, { "epoch": 2.356376673316564, "grad_norm": 0.7966949939727783, "learning_rate": 0.00021528348624740443, "loss": 1.5933, "step": 570500 }, { "epoch": 2.3584418588321787, "grad_norm": 0.8312224745750427, "learning_rate": 0.00021459271345224758, "loss": 1.5941, "step": 571000 }, { "epoch": 2.3605070443477936, "grad_norm": 0.8126243948936462, "learning_rate": 0.0002139019406570907, "loss": 1.595, "step": 571500 }, { "epoch": 2.3625722298634084, "grad_norm": 0.7867225408554077, "learning_rate": 0.00021321116786193387, "loss": 1.5941, "step": 572000 }, { "epoch": 2.3646374153790237, "grad_norm": 0.8437660336494446, "learning_rate": 0.000212520395066777, "loss": 1.5937, "step": 572500 }, { "epoch": 2.366702600894638, "grad_norm": 0.7851312160491943, "learning_rate": 0.00021182962227162016, "loss": 1.5942, "step": 573000 }, { "epoch": 2.3687677864102534, "grad_norm": 0.8472355008125305, "learning_rate": 0.0002111388494764633, "loss": 1.5937, "step": 573500 }, { "epoch": 2.3708329719258683, "grad_norm": 0.7966650128364563, "learning_rate": 0.00021044807668130645, "loss": 1.5916, "step": 574000 }, { "epoch": 2.372898157441483, "grad_norm": 0.8345617651939392, "learning_rate": 0.00020975730388614962, "loss": 1.5868, "step": 574500 }, { "epoch": 2.374963342957098, "grad_norm": 0.82713383436203, "learning_rate": 0.00020906653109099274, "loss": 1.5982, "step": 575000 }, { "epoch": 2.377028528472713, "grad_norm": 0.8211519718170166, "learning_rate": 0.00020837575829583588, "loss": 1.5888, "step": 575500 }, { "epoch": 2.3790937139883277, "grad_norm": 0.8414788842201233, "learning_rate": 0.00020768498550067905, "loss": 1.5898, "step": 576000 }, { "epoch": 2.3811588995039425, "grad_norm": 0.7635331749916077, "learning_rate": 0.00020699421270552217, "loss": 1.5873, "step": 576500 }, { "epoch": 2.3832240850195574, "grad_norm": 0.8028623461723328, "learning_rate": 0.00020630343991036532, "loss": 1.5903, "step": 577000 }, { "epoch": 2.3852892705351723, "grad_norm": 0.8185293674468994, "learning_rate": 0.0002056126671152085, "loss": 1.5917, "step": 577500 }, { "epoch": 2.387354456050787, "grad_norm": 0.800356924533844, "learning_rate": 0.00020492189432005163, "loss": 1.5915, "step": 578000 }, { "epoch": 2.389419641566402, "grad_norm": 0.7916369438171387, "learning_rate": 0.00020423112152489475, "loss": 1.5909, "step": 578500 }, { "epoch": 2.391484827082017, "grad_norm": 0.830033540725708, "learning_rate": 0.00020354034872973792, "loss": 1.5881, "step": 579000 }, { "epoch": 2.3935500125976317, "grad_norm": 0.7948420643806458, "learning_rate": 0.00020284957593458107, "loss": 1.5897, "step": 579500 }, { "epoch": 2.3956151981132465, "grad_norm": 0.818466067314148, "learning_rate": 0.00020215880313942419, "loss": 1.5884, "step": 580000 }, { "epoch": 2.3976803836288614, "grad_norm": 0.8161965608596802, "learning_rate": 0.00020146803034426736, "loss": 1.5906, "step": 580500 }, { "epoch": 2.3997455691444762, "grad_norm": 0.8100621104240417, "learning_rate": 0.0002007772575491105, "loss": 1.5867, "step": 581000 }, { "epoch": 2.401810754660091, "grad_norm": 0.8225206136703491, "learning_rate": 0.00020008648475395365, "loss": 1.5912, "step": 581500 }, { "epoch": 2.403875940175706, "grad_norm": 0.8299617767333984, "learning_rate": 0.0001993957119587968, "loss": 1.592, "step": 582000 }, { "epoch": 2.405941125691321, "grad_norm": 0.7852752208709717, "learning_rate": 0.00019870493916363994, "loss": 1.5859, "step": 582500 }, { "epoch": 2.4080063112069356, "grad_norm": 0.8510515689849854, "learning_rate": 0.00019801416636848308, "loss": 1.5868, "step": 583000 }, { "epoch": 2.4100714967225505, "grad_norm": 0.8003944158554077, "learning_rate": 0.00019732339357332623, "loss": 1.5839, "step": 583500 }, { "epoch": 2.4121366822381654, "grad_norm": 0.8351225852966309, "learning_rate": 0.00019663262077816937, "loss": 1.5853, "step": 584000 }, { "epoch": 2.41420186775378, "grad_norm": 0.8417115211486816, "learning_rate": 0.00019594184798301252, "loss": 1.5808, "step": 584500 }, { "epoch": 2.416267053269395, "grad_norm": 0.822975754737854, "learning_rate": 0.0001952510751878557, "loss": 1.5828, "step": 585000 }, { "epoch": 2.41833223878501, "grad_norm": 0.8236469030380249, "learning_rate": 0.0001945603023926988, "loss": 1.5831, "step": 585500 }, { "epoch": 2.4203974243006248, "grad_norm": 0.8697351217269897, "learning_rate": 0.00019386952959754195, "loss": 1.5833, "step": 586000 }, { "epoch": 2.4224626098162396, "grad_norm": 0.7966268658638, "learning_rate": 0.00019317875680238512, "loss": 1.5835, "step": 586500 }, { "epoch": 2.4245277953318545, "grad_norm": 0.8148783445358276, "learning_rate": 0.00019248798400722824, "loss": 1.5855, "step": 587000 }, { "epoch": 2.4265929808474693, "grad_norm": 0.8134833574295044, "learning_rate": 0.00019179721121207139, "loss": 1.5843, "step": 587500 }, { "epoch": 2.4286581663630846, "grad_norm": 0.7940511703491211, "learning_rate": 0.00019110643841691456, "loss": 1.5831, "step": 588000 }, { "epoch": 2.430723351878699, "grad_norm": 0.7859951257705688, "learning_rate": 0.00019041566562175768, "loss": 1.5884, "step": 588500 }, { "epoch": 2.4327885373943143, "grad_norm": 0.7890865802764893, "learning_rate": 0.00018972489282660082, "loss": 1.5807, "step": 589000 }, { "epoch": 2.434853722909929, "grad_norm": 0.7785663604736328, "learning_rate": 0.000189034120031444, "loss": 1.581, "step": 589500 }, { "epoch": 2.436918908425544, "grad_norm": 0.8008002638816833, "learning_rate": 0.00018834334723628714, "loss": 1.5823, "step": 590000 }, { "epoch": 2.438984093941159, "grad_norm": 0.8359131813049316, "learning_rate": 0.00018765257444113025, "loss": 1.5773, "step": 590500 }, { "epoch": 2.4410492794567737, "grad_norm": 0.8443474173545837, "learning_rate": 0.00018696180164597343, "loss": 1.5841, "step": 591000 }, { "epoch": 2.4431144649723886, "grad_norm": 0.7927765846252441, "learning_rate": 0.00018627102885081657, "loss": 1.5777, "step": 591500 }, { "epoch": 2.4451796504880035, "grad_norm": 0.7933915853500366, "learning_rate": 0.0001855802560556597, "loss": 1.5801, "step": 592000 }, { "epoch": 2.4472448360036183, "grad_norm": 0.798565149307251, "learning_rate": 0.00018488948326050286, "loss": 1.5797, "step": 592500 }, { "epoch": 2.449310021519233, "grad_norm": 0.8151854276657104, "learning_rate": 0.000184198710465346, "loss": 1.5776, "step": 593000 }, { "epoch": 2.451375207034848, "grad_norm": 0.7885642051696777, "learning_rate": 0.00018350793767018915, "loss": 1.5798, "step": 593500 }, { "epoch": 2.453440392550463, "grad_norm": 0.8265528082847595, "learning_rate": 0.0001828171648750323, "loss": 1.5794, "step": 594000 }, { "epoch": 2.4555055780660777, "grad_norm": 0.8994278311729431, "learning_rate": 0.00018212639207987544, "loss": 1.5758, "step": 594500 }, { "epoch": 2.4575707635816926, "grad_norm": 0.7827315330505371, "learning_rate": 0.00018143561928471859, "loss": 1.5732, "step": 595000 }, { "epoch": 2.4596359490973074, "grad_norm": 0.7778897285461426, "learning_rate": 0.00018074484648956173, "loss": 1.5796, "step": 595500 }, { "epoch": 2.4617011346129223, "grad_norm": 0.7877337336540222, "learning_rate": 0.00018005407369440487, "loss": 1.5756, "step": 596000 }, { "epoch": 2.463766320128537, "grad_norm": 0.7807685136795044, "learning_rate": 0.00017936330089924802, "loss": 1.5787, "step": 596500 }, { "epoch": 2.465831505644152, "grad_norm": 0.825579047203064, "learning_rate": 0.0001786725281040912, "loss": 1.5794, "step": 597000 }, { "epoch": 2.467896691159767, "grad_norm": 0.8047968149185181, "learning_rate": 0.0001779817553089343, "loss": 1.5809, "step": 597500 }, { "epoch": 2.4699618766753817, "grad_norm": 0.8542481660842896, "learning_rate": 0.00017729098251377745, "loss": 1.5746, "step": 598000 }, { "epoch": 2.4720270621909965, "grad_norm": 0.8317158222198486, "learning_rate": 0.00017660020971862063, "loss": 1.5809, "step": 598500 }, { "epoch": 2.4740922477066114, "grad_norm": 0.8227892518043518, "learning_rate": 0.00017590943692346374, "loss": 1.5785, "step": 599000 }, { "epoch": 2.4761574332222263, "grad_norm": 0.8336827158927917, "learning_rate": 0.0001752186641283069, "loss": 1.5746, "step": 599500 }, { "epoch": 2.478222618737841, "grad_norm": 0.809407651424408, "learning_rate": 0.00017452789133315006, "loss": 1.5778, "step": 600000 }, { "epoch": 2.480287804253456, "grad_norm": 0.799867570400238, "learning_rate": 0.0001738371185379932, "loss": 1.5762, "step": 600500 }, { "epoch": 2.482352989769071, "grad_norm": 0.826615571975708, "learning_rate": 0.00017314634574283632, "loss": 1.5717, "step": 601000 }, { "epoch": 2.4844181752846857, "grad_norm": 0.7937526702880859, "learning_rate": 0.0001724555729476795, "loss": 1.579, "step": 601500 }, { "epoch": 2.4864833608003005, "grad_norm": 0.8167052865028381, "learning_rate": 0.00017176480015252264, "loss": 1.5727, "step": 602000 }, { "epoch": 2.4885485463159154, "grad_norm": 0.8457524180412292, "learning_rate": 0.00017107402735736579, "loss": 1.5684, "step": 602500 }, { "epoch": 2.4906137318315302, "grad_norm": 0.8600340485572815, "learning_rate": 0.00017038325456220893, "loss": 1.5767, "step": 603000 }, { "epoch": 2.4926789173471455, "grad_norm": 0.786114513874054, "learning_rate": 0.00016969248176705207, "loss": 1.5696, "step": 603500 }, { "epoch": 2.49474410286276, "grad_norm": 0.8081954717636108, "learning_rate": 0.00016900170897189525, "loss": 1.5735, "step": 604000 }, { "epoch": 2.4968092883783752, "grad_norm": 0.8113991618156433, "learning_rate": 0.00016831093617673836, "loss": 1.5746, "step": 604500 }, { "epoch": 2.49887447389399, "grad_norm": 0.8515011668205261, "learning_rate": 0.0001676201633815815, "loss": 1.568, "step": 605000 }, { "epoch": 2.500939659409605, "grad_norm": 0.7948423624038696, "learning_rate": 0.00016692939058642468, "loss": 1.5727, "step": 605500 }, { "epoch": 2.50300484492522, "grad_norm": 0.8115394711494446, "learning_rate": 0.0001662386177912678, "loss": 1.5704, "step": 606000 }, { "epoch": 2.5050700304408346, "grad_norm": 0.8036853671073914, "learning_rate": 0.00016554784499611094, "loss": 1.5684, "step": 606500 }, { "epoch": 2.5071352159564495, "grad_norm": 0.7892432808876038, "learning_rate": 0.00016485707220095412, "loss": 1.569, "step": 607000 }, { "epoch": 2.5092004014720644, "grad_norm": 0.7984645366668701, "learning_rate": 0.00016416629940579726, "loss": 1.5679, "step": 607500 }, { "epoch": 2.511265586987679, "grad_norm": 0.7996472120285034, "learning_rate": 0.00016347552661064038, "loss": 1.5691, "step": 608000 }, { "epoch": 2.513330772503294, "grad_norm": 0.8775748610496521, "learning_rate": 0.00016278475381548355, "loss": 1.5707, "step": 608500 }, { "epoch": 2.515395958018909, "grad_norm": 0.8051262497901917, "learning_rate": 0.0001620939810203267, "loss": 1.5739, "step": 609000 }, { "epoch": 2.5174611435345238, "grad_norm": 0.8654427528381348, "learning_rate": 0.0001614032082251698, "loss": 1.5697, "step": 609500 }, { "epoch": 2.5195263290501386, "grad_norm": 0.8159758448600769, "learning_rate": 0.00016071243543001298, "loss": 1.57, "step": 610000 }, { "epoch": 2.5215915145657535, "grad_norm": 0.8165413737297058, "learning_rate": 0.00016002166263485613, "loss": 1.569, "step": 610500 }, { "epoch": 2.5236567000813683, "grad_norm": 0.7978746891021729, "learning_rate": 0.00015933088983969927, "loss": 1.5659, "step": 611000 }, { "epoch": 2.525721885596983, "grad_norm": 0.781399130821228, "learning_rate": 0.00015864011704454242, "loss": 1.5707, "step": 611500 }, { "epoch": 2.527787071112598, "grad_norm": 0.8478353023529053, "learning_rate": 0.00015794934424938556, "loss": 1.5704, "step": 612000 }, { "epoch": 2.529852256628213, "grad_norm": 0.846371054649353, "learning_rate": 0.0001572585714542287, "loss": 1.5638, "step": 612500 }, { "epoch": 2.5319174421438277, "grad_norm": 0.8290744423866272, "learning_rate": 0.00015656779865907185, "loss": 1.5702, "step": 613000 }, { "epoch": 2.5339826276594426, "grad_norm": 0.8195119500160217, "learning_rate": 0.000155877025863915, "loss": 1.5677, "step": 613500 }, { "epoch": 2.5360478131750575, "grad_norm": 0.8459944128990173, "learning_rate": 0.00015518625306875814, "loss": 1.5662, "step": 614000 }, { "epoch": 2.5381129986906723, "grad_norm": 0.7994758486747742, "learning_rate": 0.00015449548027360132, "loss": 1.5676, "step": 614500 }, { "epoch": 2.540178184206287, "grad_norm": 0.7963876724243164, "learning_rate": 0.00015380470747844443, "loss": 1.5661, "step": 615000 }, { "epoch": 2.542243369721902, "grad_norm": 0.8234278559684753, "learning_rate": 0.00015311393468328758, "loss": 1.5635, "step": 615500 }, { "epoch": 2.544308555237517, "grad_norm": 0.7948046922683716, "learning_rate": 0.00015242316188813075, "loss": 1.5631, "step": 616000 }, { "epoch": 2.5463737407531317, "grad_norm": 0.7982361912727356, "learning_rate": 0.00015173238909297387, "loss": 1.5685, "step": 616500 }, { "epoch": 2.5484389262687466, "grad_norm": 0.7927718758583069, "learning_rate": 0.000151041616297817, "loss": 1.5661, "step": 617000 }, { "epoch": 2.5505041117843614, "grad_norm": 0.8640558123588562, "learning_rate": 0.00015035084350266018, "loss": 1.5673, "step": 617500 }, { "epoch": 2.5525692972999767, "grad_norm": 0.8167000412940979, "learning_rate": 0.0001496600707075033, "loss": 1.5666, "step": 618000 }, { "epoch": 2.554634482815591, "grad_norm": 0.8331367373466492, "learning_rate": 0.00014896929791234645, "loss": 1.5656, "step": 618500 }, { "epoch": 2.5566996683312064, "grad_norm": 0.8466469645500183, "learning_rate": 0.00014827852511718962, "loss": 1.562, "step": 619000 }, { "epoch": 2.558764853846821, "grad_norm": 0.7808212637901306, "learning_rate": 0.00014758775232203276, "loss": 1.5605, "step": 619500 }, { "epoch": 2.560830039362436, "grad_norm": 0.8436982035636902, "learning_rate": 0.00014689697952687588, "loss": 1.5621, "step": 620000 }, { "epoch": 2.5628952248780505, "grad_norm": 0.8526425361633301, "learning_rate": 0.00014620620673171905, "loss": 1.566, "step": 620500 }, { "epoch": 2.564960410393666, "grad_norm": 0.8892133831977844, "learning_rate": 0.0001455154339365622, "loss": 1.5623, "step": 621000 }, { "epoch": 2.5670255959092803, "grad_norm": 0.8048965930938721, "learning_rate": 0.00014482466114140532, "loss": 1.5617, "step": 621500 }, { "epoch": 2.5690907814248956, "grad_norm": 0.8180302977561951, "learning_rate": 0.0001441338883462485, "loss": 1.5605, "step": 622000 }, { "epoch": 2.5711559669405104, "grad_norm": 0.795669674873352, "learning_rate": 0.00014344311555109163, "loss": 1.5615, "step": 622500 }, { "epoch": 2.5732211524561253, "grad_norm": 0.8272981643676758, "learning_rate": 0.00014275234275593478, "loss": 1.5606, "step": 623000 }, { "epoch": 2.57528633797174, "grad_norm": 0.8385244607925415, "learning_rate": 0.00014206156996077792, "loss": 1.5628, "step": 623500 }, { "epoch": 2.577351523487355, "grad_norm": 0.8457437753677368, "learning_rate": 0.00014137079716562107, "loss": 1.5553, "step": 624000 }, { "epoch": 2.57941670900297, "grad_norm": 0.8497530221939087, "learning_rate": 0.0001406800243704642, "loss": 1.5521, "step": 624500 }, { "epoch": 2.5814818945185847, "grad_norm": 0.8231092691421509, "learning_rate": 0.00013998925157530736, "loss": 1.5613, "step": 625000 }, { "epoch": 2.5835470800341995, "grad_norm": 0.783505380153656, "learning_rate": 0.0001392984787801505, "loss": 1.5577, "step": 625500 }, { "epoch": 2.5856122655498144, "grad_norm": 0.8594375848770142, "learning_rate": 0.00013860770598499365, "loss": 1.5603, "step": 626000 }, { "epoch": 2.5876774510654292, "grad_norm": 0.824301540851593, "learning_rate": 0.00013791693318983682, "loss": 1.5592, "step": 626500 }, { "epoch": 2.589742636581044, "grad_norm": 0.7970808744430542, "learning_rate": 0.00013722616039467994, "loss": 1.5577, "step": 627000 }, { "epoch": 2.591807822096659, "grad_norm": 0.7681635022163391, "learning_rate": 0.00013653538759952308, "loss": 1.556, "step": 627500 }, { "epoch": 2.593873007612274, "grad_norm": 0.820792555809021, "learning_rate": 0.00013584461480436625, "loss": 1.5567, "step": 628000 }, { "epoch": 2.5959381931278886, "grad_norm": 0.8436790704727173, "learning_rate": 0.00013515384200920937, "loss": 1.5562, "step": 628500 }, { "epoch": 2.5980033786435035, "grad_norm": 0.806010901927948, "learning_rate": 0.00013446306921405252, "loss": 1.558, "step": 629000 }, { "epoch": 2.6000685641591184, "grad_norm": 0.8049686551094055, "learning_rate": 0.0001337722964188957, "loss": 1.5593, "step": 629500 }, { "epoch": 2.602133749674733, "grad_norm": 0.8346471786499023, "learning_rate": 0.00013308152362373883, "loss": 1.5551, "step": 630000 }, { "epoch": 2.604198935190348, "grad_norm": 0.8366252779960632, "learning_rate": 0.00013239075082858195, "loss": 1.5571, "step": 630500 }, { "epoch": 2.606264120705963, "grad_norm": 0.8249139785766602, "learning_rate": 0.00013169997803342512, "loss": 1.554, "step": 631000 }, { "epoch": 2.6083293062215778, "grad_norm": 0.8431522250175476, "learning_rate": 0.00013100920523826827, "loss": 1.557, "step": 631500 }, { "epoch": 2.6103944917371926, "grad_norm": 0.8180191516876221, "learning_rate": 0.00013031843244311138, "loss": 1.553, "step": 632000 }, { "epoch": 2.6124596772528075, "grad_norm": 0.7824527025222778, "learning_rate": 0.00012962765964795456, "loss": 1.558, "step": 632500 }, { "epoch": 2.6145248627684223, "grad_norm": 0.839433491230011, "learning_rate": 0.0001289368868527977, "loss": 1.5525, "step": 633000 }, { "epoch": 2.6165900482840376, "grad_norm": 0.9019516110420227, "learning_rate": 0.00012824611405764087, "loss": 1.5569, "step": 633500 }, { "epoch": 2.618655233799652, "grad_norm": 0.8029139637947083, "learning_rate": 0.000127555341262484, "loss": 1.5552, "step": 634000 }, { "epoch": 2.6207204193152673, "grad_norm": 0.8322605490684509, "learning_rate": 0.00012686456846732714, "loss": 1.5566, "step": 634500 }, { "epoch": 2.6227856048308817, "grad_norm": 0.8417773842811584, "learning_rate": 0.0001261737956721703, "loss": 1.551, "step": 635000 }, { "epoch": 2.624850790346497, "grad_norm": 0.8202713131904602, "learning_rate": 0.00012548302287701343, "loss": 1.5507, "step": 635500 }, { "epoch": 2.6269159758621115, "grad_norm": 0.839905858039856, "learning_rate": 0.00012479225008185657, "loss": 1.5495, "step": 636000 }, { "epoch": 2.6289811613777267, "grad_norm": 0.8542851805686951, "learning_rate": 0.00012410147728669972, "loss": 1.5504, "step": 636500 }, { "epoch": 2.631046346893341, "grad_norm": 0.8227192163467407, "learning_rate": 0.0001234107044915429, "loss": 1.5531, "step": 637000 }, { "epoch": 2.6331115324089565, "grad_norm": 0.8212194442749023, "learning_rate": 0.000122719931696386, "loss": 1.5523, "step": 637500 }, { "epoch": 2.6351767179245713, "grad_norm": 0.8629603981971741, "learning_rate": 0.00012202915890122916, "loss": 1.5479, "step": 638000 }, { "epoch": 2.637241903440186, "grad_norm": 0.8459728956222534, "learning_rate": 0.00012133838610607231, "loss": 1.5481, "step": 638500 }, { "epoch": 2.639307088955801, "grad_norm": 0.8557335734367371, "learning_rate": 0.00012064761331091545, "loss": 1.5487, "step": 639000 }, { "epoch": 2.641372274471416, "grad_norm": 0.8298543691635132, "learning_rate": 0.0001199568405157586, "loss": 1.5479, "step": 639500 }, { "epoch": 2.6434374599870307, "grad_norm": 0.8238996863365173, "learning_rate": 0.00011926606772060176, "loss": 1.5507, "step": 640000 }, { "epoch": 2.6455026455026456, "grad_norm": 0.7995360493659973, "learning_rate": 0.0001185752949254449, "loss": 1.5443, "step": 640500 }, { "epoch": 2.6475678310182604, "grad_norm": 0.8611718416213989, "learning_rate": 0.00011788452213028803, "loss": 1.5476, "step": 641000 }, { "epoch": 2.6496330165338753, "grad_norm": 0.8229385614395142, "learning_rate": 0.00011719374933513119, "loss": 1.545, "step": 641500 }, { "epoch": 2.65169820204949, "grad_norm": 0.8134409785270691, "learning_rate": 0.00011650297653997434, "loss": 1.5482, "step": 642000 }, { "epoch": 2.653763387565105, "grad_norm": 0.8563694953918457, "learning_rate": 0.00011581220374481748, "loss": 1.5457, "step": 642500 }, { "epoch": 2.65582857308072, "grad_norm": 0.8361693620681763, "learning_rate": 0.00011512143094966063, "loss": 1.5462, "step": 643000 }, { "epoch": 2.6578937585963347, "grad_norm": 0.8493614792823792, "learning_rate": 0.00011443065815450378, "loss": 1.5463, "step": 643500 }, { "epoch": 2.6599589441119496, "grad_norm": 0.7997604012489319, "learning_rate": 0.00011373988535934692, "loss": 1.547, "step": 644000 }, { "epoch": 2.6620241296275644, "grad_norm": 0.8045528531074524, "learning_rate": 0.00011304911256419006, "loss": 1.5491, "step": 644500 }, { "epoch": 2.6640893151431793, "grad_norm": 0.8172311186790466, "learning_rate": 0.00011235833976903322, "loss": 1.5486, "step": 645000 }, { "epoch": 2.666154500658794, "grad_norm": 0.8630313873291016, "learning_rate": 0.00011166756697387635, "loss": 1.5513, "step": 645500 }, { "epoch": 2.668219686174409, "grad_norm": 0.8246090412139893, "learning_rate": 0.00011097679417871951, "loss": 1.5461, "step": 646000 }, { "epoch": 2.670284871690024, "grad_norm": 0.8191748857498169, "learning_rate": 0.00011028602138356265, "loss": 1.545, "step": 646500 }, { "epoch": 2.6723500572056387, "grad_norm": 1.1739202737808228, "learning_rate": 0.0001095952485884058, "loss": 1.5455, "step": 647000 }, { "epoch": 2.6744152427212535, "grad_norm": 0.8145565390586853, "learning_rate": 0.00010890447579324894, "loss": 1.5408, "step": 647500 }, { "epoch": 2.6764804282368684, "grad_norm": 0.8613256216049194, "learning_rate": 0.00010821370299809209, "loss": 1.5439, "step": 648000 }, { "epoch": 2.6785456137524832, "grad_norm": 0.8024303317070007, "learning_rate": 0.00010752293020293523, "loss": 1.5438, "step": 648500 }, { "epoch": 2.680610799268098, "grad_norm": 0.8254972100257874, "learning_rate": 0.00010683215740777838, "loss": 1.5458, "step": 649000 }, { "epoch": 2.682675984783713, "grad_norm": 0.815696120262146, "learning_rate": 0.00010614138461262154, "loss": 1.542, "step": 649500 }, { "epoch": 2.6847411702993282, "grad_norm": 0.8715610504150391, "learning_rate": 0.00010545061181746467, "loss": 1.5415, "step": 650000 }, { "epoch": 2.6868063558149426, "grad_norm": 0.8358045220375061, "learning_rate": 0.00010475983902230781, "loss": 1.5419, "step": 650500 }, { "epoch": 2.688871541330558, "grad_norm": 0.7865080237388611, "learning_rate": 0.00010406906622715097, "loss": 1.5429, "step": 651000 }, { "epoch": 2.6909367268461724, "grad_norm": 0.8054898381233215, "learning_rate": 0.0001033782934319941, "loss": 1.5433, "step": 651500 }, { "epoch": 2.6930019123617877, "grad_norm": 0.8930450081825256, "learning_rate": 0.00010268752063683726, "loss": 1.5352, "step": 652000 }, { "epoch": 2.695067097877402, "grad_norm": 0.8042411208152771, "learning_rate": 0.0001019967478416804, "loss": 1.5413, "step": 652500 }, { "epoch": 2.6971322833930174, "grad_norm": 0.8400362133979797, "learning_rate": 0.00010130597504652355, "loss": 1.5423, "step": 653000 }, { "epoch": 2.699197468908632, "grad_norm": 0.8137294054031372, "learning_rate": 0.0001006152022513667, "loss": 1.5432, "step": 653500 }, { "epoch": 2.701262654424247, "grad_norm": 0.8344128727912903, "learning_rate": 9.992442945620984e-05, "loss": 1.5398, "step": 654000 }, { "epoch": 2.703327839939862, "grad_norm": 0.849104642868042, "learning_rate": 9.923365666105298e-05, "loss": 1.54, "step": 654500 }, { "epoch": 2.7053930254554768, "grad_norm": 0.8286527991294861, "learning_rate": 9.854288386589613e-05, "loss": 1.5412, "step": 655000 }, { "epoch": 2.7074582109710916, "grad_norm": 0.8378123641014099, "learning_rate": 9.785211107073929e-05, "loss": 1.5361, "step": 655500 }, { "epoch": 2.7095233964867065, "grad_norm": 0.8808925151824951, "learning_rate": 9.716133827558243e-05, "loss": 1.5382, "step": 656000 }, { "epoch": 2.7115885820023213, "grad_norm": 0.8783825039863586, "learning_rate": 9.647056548042558e-05, "loss": 1.5361, "step": 656500 }, { "epoch": 2.713653767517936, "grad_norm": 0.8051160573959351, "learning_rate": 9.577979268526872e-05, "loss": 1.5358, "step": 657000 }, { "epoch": 2.715718953033551, "grad_norm": 0.896801233291626, "learning_rate": 9.508901989011187e-05, "loss": 1.5368, "step": 657500 }, { "epoch": 2.717784138549166, "grad_norm": 0.8218420743942261, "learning_rate": 9.439824709495501e-05, "loss": 1.537, "step": 658000 }, { "epoch": 2.7198493240647807, "grad_norm": 0.8470411896705627, "learning_rate": 9.370747429979816e-05, "loss": 1.5322, "step": 658500 }, { "epoch": 2.7219145095803956, "grad_norm": 0.8505502939224243, "learning_rate": 9.301670150464131e-05, "loss": 1.5317, "step": 659000 }, { "epoch": 2.7239796950960105, "grad_norm": 0.8617528080940247, "learning_rate": 9.232592870948445e-05, "loss": 1.5375, "step": 659500 }, { "epoch": 2.7260448806116253, "grad_norm": 0.8441663384437561, "learning_rate": 9.16351559143276e-05, "loss": 1.5366, "step": 660000 }, { "epoch": 2.72811006612724, "grad_norm": 0.8294611573219299, "learning_rate": 9.094438311917075e-05, "loss": 1.5373, "step": 660500 }, { "epoch": 2.730175251642855, "grad_norm": 0.8215169906616211, "learning_rate": 9.025361032401388e-05, "loss": 1.5327, "step": 661000 }, { "epoch": 2.73224043715847, "grad_norm": 0.8766931891441345, "learning_rate": 8.956283752885704e-05, "loss": 1.5339, "step": 661500 }, { "epoch": 2.7343056226740847, "grad_norm": 0.8456342220306396, "learning_rate": 8.887206473370018e-05, "loss": 1.5341, "step": 662000 }, { "epoch": 2.7363708081896996, "grad_norm": 0.8384252786636353, "learning_rate": 8.818129193854333e-05, "loss": 1.5338, "step": 662500 }, { "epoch": 2.7384359937053144, "grad_norm": 0.8584861159324646, "learning_rate": 8.749051914338647e-05, "loss": 1.5301, "step": 663000 }, { "epoch": 2.7405011792209293, "grad_norm": 0.8463834524154663, "learning_rate": 8.679974634822962e-05, "loss": 1.531, "step": 663500 }, { "epoch": 2.742566364736544, "grad_norm": 0.84855055809021, "learning_rate": 8.610897355307276e-05, "loss": 1.5332, "step": 664000 }, { "epoch": 2.744631550252159, "grad_norm": 0.8267730474472046, "learning_rate": 8.541820075791591e-05, "loss": 1.5337, "step": 664500 }, { "epoch": 2.746696735767774, "grad_norm": 0.8398123383522034, "learning_rate": 8.472742796275907e-05, "loss": 1.5327, "step": 665000 }, { "epoch": 2.748761921283389, "grad_norm": 0.8413114547729492, "learning_rate": 8.40366551676022e-05, "loss": 1.5355, "step": 665500 }, { "epoch": 2.7508271067990036, "grad_norm": 0.8241723775863647, "learning_rate": 8.334588237244536e-05, "loss": 1.533, "step": 666000 }, { "epoch": 2.752892292314619, "grad_norm": 0.8695456981658936, "learning_rate": 8.26551095772885e-05, "loss": 1.5347, "step": 666500 }, { "epoch": 2.7549574778302333, "grad_norm": 0.8351263403892517, "learning_rate": 8.196433678213163e-05, "loss": 1.53, "step": 667000 }, { "epoch": 2.7570226633458486, "grad_norm": 0.8227745294570923, "learning_rate": 8.127356398697479e-05, "loss": 1.53, "step": 667500 }, { "epoch": 2.759087848861463, "grad_norm": 0.8654522895812988, "learning_rate": 8.058279119181794e-05, "loss": 1.532, "step": 668000 }, { "epoch": 2.7611530343770783, "grad_norm": 0.819057822227478, "learning_rate": 7.989201839666108e-05, "loss": 1.5297, "step": 668500 }, { "epoch": 2.763218219892693, "grad_norm": 0.8575501441955566, "learning_rate": 7.920124560150422e-05, "loss": 1.5275, "step": 669000 }, { "epoch": 2.765283405408308, "grad_norm": 0.8428553938865662, "learning_rate": 7.851047280634738e-05, "loss": 1.5321, "step": 669500 }, { "epoch": 2.767348590923923, "grad_norm": 0.8702006936073303, "learning_rate": 7.781970001119051e-05, "loss": 1.5291, "step": 670000 }, { "epoch": 2.7694137764395377, "grad_norm": 0.8024266958236694, "learning_rate": 7.712892721603366e-05, "loss": 1.529, "step": 670500 }, { "epoch": 2.7714789619551525, "grad_norm": 0.862339437007904, "learning_rate": 7.643815442087682e-05, "loss": 1.5337, "step": 671000 }, { "epoch": 2.7735441474707674, "grad_norm": 0.8829432725906372, "learning_rate": 7.574738162571996e-05, "loss": 1.5243, "step": 671500 }, { "epoch": 2.7756093329863822, "grad_norm": 0.8032020926475525, "learning_rate": 7.505660883056311e-05, "loss": 1.525, "step": 672000 }, { "epoch": 2.777674518501997, "grad_norm": 0.8329365849494934, "learning_rate": 7.436583603540625e-05, "loss": 1.532, "step": 672500 }, { "epoch": 2.779739704017612, "grad_norm": 0.865728497505188, "learning_rate": 7.367506324024941e-05, "loss": 1.5243, "step": 673000 }, { "epoch": 2.781804889533227, "grad_norm": 0.8427261114120483, "learning_rate": 7.298429044509254e-05, "loss": 1.5197, "step": 673500 }, { "epoch": 2.7838700750488417, "grad_norm": 0.8444133400917053, "learning_rate": 7.229351764993569e-05, "loss": 1.5314, "step": 674000 }, { "epoch": 2.7859352605644565, "grad_norm": 0.8255510330200195, "learning_rate": 7.160274485477885e-05, "loss": 1.5275, "step": 674500 }, { "epoch": 2.7880004460800714, "grad_norm": 0.794021487236023, "learning_rate": 7.091197205962198e-05, "loss": 1.5237, "step": 675000 }, { "epoch": 2.790065631595686, "grad_norm": 0.8648783564567566, "learning_rate": 7.022119926446513e-05, "loss": 1.5221, "step": 675500 }, { "epoch": 2.792130817111301, "grad_norm": 0.8662870526313782, "learning_rate": 6.953042646930828e-05, "loss": 1.5239, "step": 676000 }, { "epoch": 2.794196002626916, "grad_norm": 0.8716167211532593, "learning_rate": 6.883965367415141e-05, "loss": 1.5284, "step": 676500 }, { "epoch": 2.7962611881425308, "grad_norm": 0.8369839191436768, "learning_rate": 6.814888087899457e-05, "loss": 1.5206, "step": 677000 }, { "epoch": 2.7983263736581456, "grad_norm": 0.8716705441474915, "learning_rate": 6.745810808383771e-05, "loss": 1.5179, "step": 677500 }, { "epoch": 2.8003915591737605, "grad_norm": 0.8210489153862, "learning_rate": 6.676733528868086e-05, "loss": 1.5286, "step": 678000 }, { "epoch": 2.8024567446893753, "grad_norm": 0.8834524750709534, "learning_rate": 6.6076562493524e-05, "loss": 1.5271, "step": 678500 }, { "epoch": 2.80452193020499, "grad_norm": 0.858285665512085, "learning_rate": 6.538578969836716e-05, "loss": 1.5232, "step": 679000 }, { "epoch": 2.806587115720605, "grad_norm": 0.8696337342262268, "learning_rate": 6.46950169032103e-05, "loss": 1.524, "step": 679500 }, { "epoch": 2.80865230123622, "grad_norm": 0.8471727967262268, "learning_rate": 6.400424410805344e-05, "loss": 1.523, "step": 680000 }, { "epoch": 2.8107174867518347, "grad_norm": 0.8594076633453369, "learning_rate": 6.33134713128966e-05, "loss": 1.5166, "step": 680500 }, { "epoch": 2.81278267226745, "grad_norm": 0.856606662273407, "learning_rate": 6.262269851773973e-05, "loss": 1.523, "step": 681000 }, { "epoch": 2.8148478577830645, "grad_norm": 0.8609211444854736, "learning_rate": 6.193192572258289e-05, "loss": 1.5209, "step": 681500 }, { "epoch": 2.8169130432986798, "grad_norm": 0.8398802280426025, "learning_rate": 6.124115292742603e-05, "loss": 1.5271, "step": 682000 }, { "epoch": 2.818978228814294, "grad_norm": 0.9304519295692444, "learning_rate": 6.0550380132269176e-05, "loss": 1.5205, "step": 682500 }, { "epoch": 2.8210434143299095, "grad_norm": 0.8197703957557678, "learning_rate": 5.985960733711232e-05, "loss": 1.524, "step": 683000 }, { "epoch": 2.823108599845524, "grad_norm": 0.831089973449707, "learning_rate": 5.916883454195547e-05, "loss": 1.5204, "step": 683500 }, { "epoch": 2.825173785361139, "grad_norm": 0.8130340576171875, "learning_rate": 5.847806174679862e-05, "loss": 1.5151, "step": 684000 }, { "epoch": 2.827238970876754, "grad_norm": 0.8501649498939514, "learning_rate": 5.7787288951641755e-05, "loss": 1.5213, "step": 684500 }, { "epoch": 2.829304156392369, "grad_norm": 0.827510416507721, "learning_rate": 5.709651615648491e-05, "loss": 1.5202, "step": 685000 }, { "epoch": 2.8313693419079837, "grad_norm": 0.8375749588012695, "learning_rate": 5.640574336132805e-05, "loss": 1.5226, "step": 685500 }, { "epoch": 2.8334345274235986, "grad_norm": 0.8179614543914795, "learning_rate": 5.57149705661712e-05, "loss": 1.5174, "step": 686000 }, { "epoch": 2.8354997129392134, "grad_norm": 0.8485569953918457, "learning_rate": 5.502419777101435e-05, "loss": 1.5197, "step": 686500 }, { "epoch": 2.8375648984548283, "grad_norm": 0.8839040398597717, "learning_rate": 5.433342497585749e-05, "loss": 1.5206, "step": 687000 }, { "epoch": 2.839630083970443, "grad_norm": 0.8560023307800293, "learning_rate": 5.364265218070064e-05, "loss": 1.5177, "step": 687500 }, { "epoch": 2.841695269486058, "grad_norm": 0.8139906525611877, "learning_rate": 5.295187938554378e-05, "loss": 1.5154, "step": 688000 }, { "epoch": 2.843760455001673, "grad_norm": 0.9361693859100342, "learning_rate": 5.226110659038693e-05, "loss": 1.5131, "step": 688500 }, { "epoch": 2.8458256405172877, "grad_norm": 0.8294958472251892, "learning_rate": 5.157033379523008e-05, "loss": 1.5194, "step": 689000 }, { "epoch": 2.8478908260329026, "grad_norm": 0.8591476082801819, "learning_rate": 5.0879561000073224e-05, "loss": 1.5179, "step": 689500 }, { "epoch": 2.8499560115485174, "grad_norm": 0.8466942310333252, "learning_rate": 5.018878820491637e-05, "loss": 1.5122, "step": 690000 }, { "epoch": 2.8520211970641323, "grad_norm": 0.9315714240074158, "learning_rate": 4.949801540975952e-05, "loss": 1.5193, "step": 690500 }, { "epoch": 2.854086382579747, "grad_norm": 0.8646622896194458, "learning_rate": 4.8807242614602665e-05, "loss": 1.5155, "step": 691000 }, { "epoch": 2.856151568095362, "grad_norm": 0.8958275318145752, "learning_rate": 4.8116469819445804e-05, "loss": 1.5172, "step": 691500 }, { "epoch": 2.858216753610977, "grad_norm": 0.8623936176300049, "learning_rate": 4.7425697024288955e-05, "loss": 1.5138, "step": 692000 }, { "epoch": 2.8602819391265917, "grad_norm": 0.8689021468162537, "learning_rate": 4.67349242291321e-05, "loss": 1.5144, "step": 692500 }, { "epoch": 2.8623471246422065, "grad_norm": 0.8967764973640442, "learning_rate": 4.6044151433975245e-05, "loss": 1.5156, "step": 693000 }, { "epoch": 2.8644123101578214, "grad_norm": 0.8540061116218567, "learning_rate": 4.5353378638818396e-05, "loss": 1.5161, "step": 693500 }, { "epoch": 2.8664774956734362, "grad_norm": 0.8717928528785706, "learning_rate": 4.466260584366154e-05, "loss": 1.5097, "step": 694000 }, { "epoch": 2.868542681189051, "grad_norm": 0.861867368221283, "learning_rate": 4.397183304850468e-05, "loss": 1.5117, "step": 694500 }, { "epoch": 2.870607866704666, "grad_norm": 0.8746508955955505, "learning_rate": 4.328106025334783e-05, "loss": 1.5116, "step": 695000 }, { "epoch": 2.872673052220281, "grad_norm": 0.856505274772644, "learning_rate": 4.2590287458190976e-05, "loss": 1.5105, "step": 695500 }, { "epoch": 2.8747382377358957, "grad_norm": 0.8690941333770752, "learning_rate": 4.189951466303412e-05, "loss": 1.511, "step": 696000 }, { "epoch": 2.876803423251511, "grad_norm": 0.8394379019737244, "learning_rate": 4.120874186787727e-05, "loss": 1.511, "step": 696500 }, { "epoch": 2.8788686087671254, "grad_norm": 0.847400426864624, "learning_rate": 4.051796907272042e-05, "loss": 1.5152, "step": 697000 }, { "epoch": 2.8809337942827407, "grad_norm": 0.8548203706741333, "learning_rate": 3.982719627756357e-05, "loss": 1.5103, "step": 697500 }, { "epoch": 2.882998979798355, "grad_norm": 0.9266785979270935, "learning_rate": 3.913642348240671e-05, "loss": 1.5172, "step": 698000 }, { "epoch": 2.8850641653139704, "grad_norm": 0.8905568718910217, "learning_rate": 3.844565068724985e-05, "loss": 1.5147, "step": 698500 }, { "epoch": 2.8871293508295848, "grad_norm": 0.8947970271110535, "learning_rate": 3.7754877892093e-05, "loss": 1.5116, "step": 699000 }, { "epoch": 2.8891945363452, "grad_norm": 0.8671281337738037, "learning_rate": 3.706410509693615e-05, "loss": 1.5089, "step": 699500 }, { "epoch": 2.8912597218608145, "grad_norm": 0.8655187487602234, "learning_rate": 3.637333230177929e-05, "loss": 1.5079, "step": 700000 }, { "epoch": 2.8933249073764298, "grad_norm": 0.8781392574310303, "learning_rate": 3.5682559506622444e-05, "loss": 1.5051, "step": 700500 }, { "epoch": 2.8953900928920446, "grad_norm": 0.8239871859550476, "learning_rate": 3.499178671146558e-05, "loss": 1.5135, "step": 701000 }, { "epoch": 2.8974552784076595, "grad_norm": 0.8702250719070435, "learning_rate": 3.430101391630873e-05, "loss": 1.5101, "step": 701500 }, { "epoch": 2.8995204639232743, "grad_norm": 0.8681339621543884, "learning_rate": 3.361024112115188e-05, "loss": 1.5098, "step": 702000 }, { "epoch": 2.901585649438889, "grad_norm": 0.8929154276847839, "learning_rate": 3.2919468325995024e-05, "loss": 1.5115, "step": 702500 }, { "epoch": 2.903650834954504, "grad_norm": 0.8695405125617981, "learning_rate": 3.222869553083817e-05, "loss": 1.5073, "step": 703000 }, { "epoch": 2.905716020470119, "grad_norm": 0.8858229517936707, "learning_rate": 3.153792273568132e-05, "loss": 1.5081, "step": 703500 }, { "epoch": 2.9077812059857338, "grad_norm": 0.8298658132553101, "learning_rate": 3.0847149940524465e-05, "loss": 1.5109, "step": 704000 }, { "epoch": 2.9098463915013486, "grad_norm": 0.9026769399642944, "learning_rate": 3.015637714536761e-05, "loss": 1.5036, "step": 704500 }, { "epoch": 2.9119115770169635, "grad_norm": 0.8433796763420105, "learning_rate": 2.9465604350210755e-05, "loss": 1.5103, "step": 705000 }, { "epoch": 2.9139767625325783, "grad_norm": 0.8475963473320007, "learning_rate": 2.87748315550539e-05, "loss": 1.5084, "step": 705500 }, { "epoch": 2.916041948048193, "grad_norm": 0.8807883262634277, "learning_rate": 2.8084058759897048e-05, "loss": 1.5089, "step": 706000 }, { "epoch": 2.918107133563808, "grad_norm": 0.9054199457168579, "learning_rate": 2.7393285964740193e-05, "loss": 1.5123, "step": 706500 }, { "epoch": 2.920172319079423, "grad_norm": 0.8661481738090515, "learning_rate": 2.670251316958334e-05, "loss": 1.5088, "step": 707000 }, { "epoch": 2.9222375045950377, "grad_norm": 0.8456491231918335, "learning_rate": 2.6011740374426486e-05, "loss": 1.5078, "step": 707500 }, { "epoch": 2.9243026901106526, "grad_norm": 0.8700172305107117, "learning_rate": 2.5320967579269634e-05, "loss": 1.5056, "step": 708000 }, { "epoch": 2.9263678756262674, "grad_norm": 0.882483184337616, "learning_rate": 2.463019478411278e-05, "loss": 1.5118, "step": 708500 }, { "epoch": 2.9284330611418823, "grad_norm": 0.8397735357284546, "learning_rate": 2.3939421988955924e-05, "loss": 1.5078, "step": 709000 }, { "epoch": 2.930498246657497, "grad_norm": 0.8614588379859924, "learning_rate": 2.3248649193799072e-05, "loss": 1.504, "step": 709500 }, { "epoch": 2.932563432173112, "grad_norm": 0.8456758260726929, "learning_rate": 2.2557876398642217e-05, "loss": 1.5068, "step": 710000 }, { "epoch": 2.934628617688727, "grad_norm": 0.8835407495498657, "learning_rate": 2.1867103603485365e-05, "loss": 1.503, "step": 710500 }, { "epoch": 2.9366938032043417, "grad_norm": 0.8269529938697815, "learning_rate": 2.117633080832851e-05, "loss": 1.5039, "step": 711000 }, { "epoch": 2.9387589887199566, "grad_norm": 0.9135294556617737, "learning_rate": 2.0485558013171655e-05, "loss": 1.5067, "step": 711500 }, { "epoch": 2.940824174235572, "grad_norm": 0.8736814856529236, "learning_rate": 1.9794785218014803e-05, "loss": 1.5025, "step": 712000 }, { "epoch": 2.9428893597511863, "grad_norm": 0.8207076191902161, "learning_rate": 1.9104012422857948e-05, "loss": 1.503, "step": 712500 }, { "epoch": 2.9449545452668016, "grad_norm": 0.8992505669593811, "learning_rate": 1.8413239627701093e-05, "loss": 1.5057, "step": 713000 }, { "epoch": 2.947019730782416, "grad_norm": 0.8630014657974243, "learning_rate": 1.772246683254424e-05, "loss": 1.5026, "step": 713500 }, { "epoch": 2.9490849162980313, "grad_norm": 0.8466277122497559, "learning_rate": 1.703169403738739e-05, "loss": 1.5022, "step": 714000 }, { "epoch": 2.9511501018136457, "grad_norm": 0.8246403932571411, "learning_rate": 1.6340921242230534e-05, "loss": 1.5025, "step": 714500 }, { "epoch": 2.953215287329261, "grad_norm": 0.8537036776542664, "learning_rate": 1.565014844707368e-05, "loss": 1.5004, "step": 715000 }, { "epoch": 2.9552804728448754, "grad_norm": 0.8644038438796997, "learning_rate": 1.4959375651916825e-05, "loss": 1.5003, "step": 715500 }, { "epoch": 2.9573456583604907, "grad_norm": 0.8385940790176392, "learning_rate": 1.4268602856759972e-05, "loss": 1.4993, "step": 716000 }, { "epoch": 2.9594108438761055, "grad_norm": 0.8472567796707153, "learning_rate": 1.3577830061603118e-05, "loss": 1.503, "step": 716500 }, { "epoch": 2.9614760293917204, "grad_norm": 0.8817070126533508, "learning_rate": 1.2887057266446265e-05, "loss": 1.5039, "step": 717000 }, { "epoch": 2.9635412149073352, "grad_norm": 0.8786518573760986, "learning_rate": 1.219628447128941e-05, "loss": 1.506, "step": 717500 }, { "epoch": 2.96560640042295, "grad_norm": 0.8719050884246826, "learning_rate": 1.1505511676132556e-05, "loss": 1.5004, "step": 718000 }, { "epoch": 2.967671585938565, "grad_norm": 0.9109290242195129, "learning_rate": 1.0814738880975703e-05, "loss": 1.5021, "step": 718500 }, { "epoch": 2.96973677145418, "grad_norm": 0.8234292268753052, "learning_rate": 1.012396608581885e-05, "loss": 1.5025, "step": 719000 }, { "epoch": 2.9718019569697947, "grad_norm": 0.9141399264335632, "learning_rate": 9.433193290661996e-06, "loss": 1.4982, "step": 719500 }, { "epoch": 2.9738671424854095, "grad_norm": 0.8994991183280945, "learning_rate": 8.74242049550514e-06, "loss": 1.5012, "step": 720000 }, { "epoch": 2.9759323280010244, "grad_norm": 0.8629069328308105, "learning_rate": 8.051647700348289e-06, "loss": 1.5005, "step": 720500 }, { "epoch": 2.977997513516639, "grad_norm": 0.8604488968849182, "learning_rate": 7.360874905191434e-06, "loss": 1.497, "step": 721000 }, { "epoch": 2.980062699032254, "grad_norm": 0.8444788455963135, "learning_rate": 6.67010211003458e-06, "loss": 1.5015, "step": 721500 }, { "epoch": 2.982127884547869, "grad_norm": 0.844616711139679, "learning_rate": 5.979329314877727e-06, "loss": 1.5037, "step": 722000 }, { "epoch": 2.9841930700634838, "grad_norm": 0.8340693712234497, "learning_rate": 5.288556519720873e-06, "loss": 1.5002, "step": 722500 }, { "epoch": 2.9862582555790986, "grad_norm": 0.8410211205482483, "learning_rate": 4.597783724564018e-06, "loss": 1.4972, "step": 723000 }, { "epoch": 2.9883234410947135, "grad_norm": 0.8680119514465332, "learning_rate": 3.907010929407165e-06, "loss": 1.4977, "step": 723500 }, { "epoch": 2.9903886266103283, "grad_norm": 0.8596481084823608, "learning_rate": 3.2162381342503112e-06, "loss": 1.5007, "step": 724000 }, { "epoch": 2.992453812125943, "grad_norm": 0.7909371256828308, "learning_rate": 2.5254653390934573e-06, "loss": 1.4953, "step": 724500 }, { "epoch": 2.994518997641558, "grad_norm": 0.8666454553604126, "learning_rate": 1.8346925439366037e-06, "loss": 1.5011, "step": 725000 }, { "epoch": 2.996584183157173, "grad_norm": 0.8664350509643555, "learning_rate": 1.1439197487797498e-06, "loss": 1.5007, "step": 725500 }, { "epoch": 2.9986493686727878, "grad_norm": 0.8779242634773254, "learning_rate": 4.531469536228961e-07, "loss": 1.4985, "step": 726000 }, { "epoch": 3.0, "step": 726327, "total_flos": 1.546067484574894e+18, "train_loss": 1.7818369276394814, "train_runtime": 122016.1779, "train_samples_per_second": 380.973, "train_steps_per_second": 5.953 } ], "logging_steps": 500, "max_steps": 726327, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.546067484574894e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }