{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 13705, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001459321415541773, "grad_norm": 2.6745453947710165, "learning_rate": 4.611650485436894e-06, "loss": 1.0637, "step": 20 }, { "epoch": 0.002918642831083546, "grad_norm": 0.8636258229268736, "learning_rate": 9.466019417475729e-06, "loss": 0.3278, "step": 40 }, { "epoch": 0.004377964246625319, "grad_norm": 0.6429542529341992, "learning_rate": 1.4320388349514562e-05, "loss": 0.1701, "step": 60 }, { "epoch": 0.005837285662167092, "grad_norm": 0.4870228029221957, "learning_rate": 1.91747572815534e-05, "loss": 0.1413, "step": 80 }, { "epoch": 0.0072966070777088655, "grad_norm": 0.4961345702002848, "learning_rate": 2.4029126213592234e-05, "loss": 0.1396, "step": 100 }, { "epoch": 0.008755928493250638, "grad_norm": 0.40834540818199705, "learning_rate": 2.8883495145631068e-05, "loss": 0.1266, "step": 120 }, { "epoch": 0.010215249908792412, "grad_norm": 0.6018679639107668, "learning_rate": 3.373786407766991e-05, "loss": 0.1103, "step": 140 }, { "epoch": 0.011674571324334184, "grad_norm": 0.5960255000220145, "learning_rate": 3.859223300970874e-05, "loss": 0.1325, "step": 160 }, { "epoch": 0.013133892739875957, "grad_norm": 0.4212109164544222, "learning_rate": 4.344660194174757e-05, "loss": 0.1237, "step": 180 }, { "epoch": 0.014593214155417731, "grad_norm": 0.6565890138369451, "learning_rate": 4.830097087378641e-05, "loss": 0.1233, "step": 200 }, { "epoch": 0.016052535570959505, "grad_norm": 0.3857563740550121, "learning_rate": 5.3155339805825244e-05, "loss": 0.1185, "step": 220 }, { "epoch": 0.017511856986501276, "grad_norm": 0.35890344101071686, "learning_rate": 5.800970873786408e-05, "loss": 0.1415, "step": 240 }, { "epoch": 0.01897117840204305, "grad_norm": 0.2979671831060535, "learning_rate": 6.286407766990293e-05, "loss": 0.1362, "step": 260 }, { "epoch": 0.020430499817584824, "grad_norm": 0.38285277593806, "learning_rate": 6.771844660194175e-05, "loss": 0.1302, "step": 280 }, { "epoch": 0.021889821233126595, "grad_norm": 0.5932557337111825, "learning_rate": 7.25728155339806e-05, "loss": 0.1286, "step": 300 }, { "epoch": 0.02334914264866837, "grad_norm": 0.39643282763459103, "learning_rate": 7.742718446601942e-05, "loss": 0.1494, "step": 320 }, { "epoch": 0.024808464064210143, "grad_norm": 0.4853895916428549, "learning_rate": 8.228155339805825e-05, "loss": 0.1304, "step": 340 }, { "epoch": 0.026267785479751914, "grad_norm": 0.48381251386974544, "learning_rate": 8.713592233009709e-05, "loss": 0.1394, "step": 360 }, { "epoch": 0.027727106895293688, "grad_norm": 0.4904529766502894, "learning_rate": 9.199029126213593e-05, "loss": 0.1509, "step": 380 }, { "epoch": 0.029186428310835462, "grad_norm": 0.7958367491650168, "learning_rate": 9.684466019417477e-05, "loss": 0.1395, "step": 400 }, { "epoch": 0.030645749726377236, "grad_norm": 0.624215442246557, "learning_rate": 9.999993157895145e-05, "loss": 0.1745, "step": 420 }, { "epoch": 0.03210507114191901, "grad_norm": 0.6815596480986827, "learning_rate": 9.999898206558094e-05, "loss": 0.1433, "step": 440 }, { "epoch": 0.03356439255746078, "grad_norm": 0.7939990598700336, "learning_rate": 9.999691549843376e-05, "loss": 0.1609, "step": 460 }, { "epoch": 0.03502371397300255, "grad_norm": 0.4965525216601077, "learning_rate": 9.999373192368015e-05, "loss": 0.1523, "step": 480 }, { "epoch": 0.036483035388544326, "grad_norm": 1.8640551410419721, "learning_rate": 9.998943141244607e-05, "loss": 0.1809, "step": 500 }, { "epoch": 0.0379423568040861, "grad_norm": 0.6022579601783719, "learning_rate": 9.99840140608115e-05, "loss": 0.1956, "step": 520 }, { "epoch": 0.039401678219627874, "grad_norm": 0.40549268604230454, "learning_rate": 9.997747998980835e-05, "loss": 0.1648, "step": 540 }, { "epoch": 0.04086099963516965, "grad_norm": 0.49940827687014067, "learning_rate": 9.996982934541781e-05, "loss": 0.1475, "step": 560 }, { "epoch": 0.04232032105071142, "grad_norm": 0.5149466244422736, "learning_rate": 9.996106229856695e-05, "loss": 0.1518, "step": 580 }, { "epoch": 0.04377964246625319, "grad_norm": 0.29222877668330227, "learning_rate": 9.995117904512503e-05, "loss": 0.1682, "step": 600 }, { "epoch": 0.045238963881794964, "grad_norm": 0.5286646588687645, "learning_rate": 9.994017980589906e-05, "loss": 0.1421, "step": 620 }, { "epoch": 0.04669828529733674, "grad_norm": 1.1957202323680398, "learning_rate": 9.992806482662887e-05, "loss": 0.1699, "step": 640 }, { "epoch": 0.04815760671287851, "grad_norm": 0.5883304358400666, "learning_rate": 9.991483437798162e-05, "loss": 0.1338, "step": 660 }, { "epoch": 0.049616928128420286, "grad_norm": 0.6301249373045459, "learning_rate": 9.99004887555458e-05, "loss": 0.1547, "step": 680 }, { "epoch": 0.05107624954396206, "grad_norm": 0.4012564183658102, "learning_rate": 9.988502827982458e-05, "loss": 0.1367, "step": 700 }, { "epoch": 0.05253557095950383, "grad_norm": 0.3168597769245346, "learning_rate": 9.986845329622862e-05, "loss": 0.1475, "step": 720 }, { "epoch": 0.0539948923750456, "grad_norm": 0.38246057665835786, "learning_rate": 9.985076417506844e-05, "loss": 0.1397, "step": 740 }, { "epoch": 0.055454213790587376, "grad_norm": 0.3864484630630238, "learning_rate": 9.983196131154607e-05, "loss": 0.1368, "step": 760 }, { "epoch": 0.05691353520612915, "grad_norm": 0.34381534736001396, "learning_rate": 9.981204512574626e-05, "loss": 0.1202, "step": 780 }, { "epoch": 0.058372856621670924, "grad_norm": 0.5385327614437169, "learning_rate": 9.979101606262708e-05, "loss": 0.1444, "step": 800 }, { "epoch": 0.0598321780372127, "grad_norm": 0.3291873594168373, "learning_rate": 9.976887459200999e-05, "loss": 0.1344, "step": 820 }, { "epoch": 0.06129149945275447, "grad_norm": 0.35530404400780635, "learning_rate": 9.97456212085693e-05, "loss": 0.1455, "step": 840 }, { "epoch": 0.06275082086829624, "grad_norm": 0.25357381945588353, "learning_rate": 9.972125643182121e-05, "loss": 0.1405, "step": 860 }, { "epoch": 0.06421014228383802, "grad_norm": 0.33587236054655184, "learning_rate": 9.969578080611211e-05, "loss": 0.1273, "step": 880 }, { "epoch": 0.06566946369937979, "grad_norm": 0.2636319125685315, "learning_rate": 9.966919490060646e-05, "loss": 0.1155, "step": 900 }, { "epoch": 0.06712878511492155, "grad_norm": 0.4749750626433427, "learning_rate": 9.96414993092741e-05, "loss": 0.1577, "step": 920 }, { "epoch": 0.06858810653046334, "grad_norm": 0.26401350478445823, "learning_rate": 9.961269465087691e-05, "loss": 0.1313, "step": 940 }, { "epoch": 0.0700474279460051, "grad_norm": 0.3330162979528586, "learning_rate": 9.958278156895502e-05, "loss": 0.135, "step": 960 }, { "epoch": 0.07150674936154688, "grad_norm": 0.5511486073921424, "learning_rate": 9.955176073181249e-05, "loss": 0.1274, "step": 980 }, { "epoch": 0.07296607077708865, "grad_norm": 0.7818317017171568, "learning_rate": 9.951963283250227e-05, "loss": 0.1565, "step": 1000 }, { "epoch": 0.07442539219263043, "grad_norm": 0.42949055730605057, "learning_rate": 9.948639858881083e-05, "loss": 0.1487, "step": 1020 }, { "epoch": 0.0758847136081722, "grad_norm": 0.41682813706150323, "learning_rate": 9.945205874324201e-05, "loss": 0.143, "step": 1040 }, { "epoch": 0.07734403502371397, "grad_norm": 1.059389264065714, "learning_rate": 9.941661406300052e-05, "loss": 0.1312, "step": 1060 }, { "epoch": 0.07880335643925575, "grad_norm": 0.3764528008769406, "learning_rate": 9.938006533997475e-05, "loss": 0.1475, "step": 1080 }, { "epoch": 0.08026267785479752, "grad_norm": 0.3433293556468226, "learning_rate": 9.934241339071912e-05, "loss": 0.1379, "step": 1100 }, { "epoch": 0.0817219992703393, "grad_norm": 0.4669143421528186, "learning_rate": 9.930365905643578e-05, "loss": 0.1528, "step": 1120 }, { "epoch": 0.08318132068588106, "grad_norm": 0.3165847851959828, "learning_rate": 9.92638032029559e-05, "loss": 0.1424, "step": 1140 }, { "epoch": 0.08464064210142284, "grad_norm": 0.4112176255009246, "learning_rate": 9.922284672072021e-05, "loss": 0.1466, "step": 1160 }, { "epoch": 0.08609996351696461, "grad_norm": 0.24922324308271643, "learning_rate": 9.918079052475922e-05, "loss": 0.1151, "step": 1180 }, { "epoch": 0.08755928493250638, "grad_norm": 0.29440735283548447, "learning_rate": 9.913763555467269e-05, "loss": 0.1502, "step": 1200 }, { "epoch": 0.08901860634804816, "grad_norm": 0.31114703090098295, "learning_rate": 9.909338277460872e-05, "loss": 0.1163, "step": 1220 }, { "epoch": 0.09047792776358993, "grad_norm": 0.26314581527572667, "learning_rate": 9.904803317324211e-05, "loss": 0.1124, "step": 1240 }, { "epoch": 0.09193724917913171, "grad_norm": 0.37910467700641326, "learning_rate": 9.90015877637524e-05, "loss": 0.1624, "step": 1260 }, { "epoch": 0.09339657059467348, "grad_norm": 0.32917497252554034, "learning_rate": 9.895404758380109e-05, "loss": 0.1417, "step": 1280 }, { "epoch": 0.09485589201021526, "grad_norm": 0.4253161624681656, "learning_rate": 9.890541369550854e-05, "loss": 0.1243, "step": 1300 }, { "epoch": 0.09631521342575702, "grad_norm": 0.2579086839658108, "learning_rate": 9.885568718543025e-05, "loss": 0.1386, "step": 1320 }, { "epoch": 0.09777453484129879, "grad_norm": 0.22942493216872414, "learning_rate": 9.88048691645326e-05, "loss": 0.13, "step": 1340 }, { "epoch": 0.09923385625684057, "grad_norm": 0.45710318102883435, "learning_rate": 9.87529607681679e-05, "loss": 0.1777, "step": 1360 }, { "epoch": 0.10069317767238234, "grad_norm": 0.27950949145967136, "learning_rate": 9.869996315604915e-05, "loss": 0.1397, "step": 1380 }, { "epoch": 0.10215249908792412, "grad_norm": 0.35374191781286285, "learning_rate": 9.864587751222415e-05, "loss": 0.1269, "step": 1400 }, { "epoch": 0.10361182050346589, "grad_norm": 0.45122040063810864, "learning_rate": 9.859070504504894e-05, "loss": 0.1479, "step": 1420 }, { "epoch": 0.10507114191900765, "grad_norm": 0.3679594319404153, "learning_rate": 9.85344469871609e-05, "loss": 0.1333, "step": 1440 }, { "epoch": 0.10653046333454944, "grad_norm": 0.2773979819671957, "learning_rate": 9.847710459545109e-05, "loss": 0.1293, "step": 1460 }, { "epoch": 0.1079897847500912, "grad_norm": 0.18675042963035182, "learning_rate": 9.841867915103632e-05, "loss": 0.1262, "step": 1480 }, { "epoch": 0.10944910616563298, "grad_norm": 0.30565129208355624, "learning_rate": 9.835917195923044e-05, "loss": 0.1197, "step": 1500 }, { "epoch": 0.11090842758117475, "grad_norm": 0.2626135808770724, "learning_rate": 9.829858434951516e-05, "loss": 0.1132, "step": 1520 }, { "epoch": 0.11236774899671653, "grad_norm": 0.36381732806040473, "learning_rate": 9.823691767551042e-05, "loss": 0.1397, "step": 1540 }, { "epoch": 0.1138270704122583, "grad_norm": 0.2531250566483984, "learning_rate": 9.817417331494409e-05, "loss": 0.0946, "step": 1560 }, { "epoch": 0.11528639182780007, "grad_norm": 0.31305747632958897, "learning_rate": 9.81103526696212e-05, "loss": 0.1154, "step": 1580 }, { "epoch": 0.11674571324334185, "grad_norm": 0.3203482106895159, "learning_rate": 9.804545716539265e-05, "loss": 0.1263, "step": 1600 }, { "epoch": 0.11820503465888361, "grad_norm": 0.21193993774401784, "learning_rate": 9.797948825212331e-05, "loss": 0.1282, "step": 1620 }, { "epoch": 0.1196643560744254, "grad_norm": 0.328003998882712, "learning_rate": 9.791244740365965e-05, "loss": 0.1217, "step": 1640 }, { "epoch": 0.12112367748996716, "grad_norm": 0.25049879501157474, "learning_rate": 9.784433611779684e-05, "loss": 0.1395, "step": 1660 }, { "epoch": 0.12258299890550894, "grad_norm": 0.2889080458455597, "learning_rate": 9.777515591624522e-05, "loss": 0.1281, "step": 1680 }, { "epoch": 0.12404232032105071, "grad_norm": 0.32303024299802585, "learning_rate": 9.77049083445964e-05, "loss": 0.1279, "step": 1700 }, { "epoch": 0.12550164173659248, "grad_norm": 0.4839798792492212, "learning_rate": 9.76335949722886e-05, "loss": 0.1077, "step": 1720 }, { "epoch": 0.12696096315213426, "grad_norm": 0.3053946282511456, "learning_rate": 9.756121739257173e-05, "loss": 0.1306, "step": 1740 }, { "epoch": 0.12842028456767604, "grad_norm": 0.37752644019410203, "learning_rate": 9.748777722247164e-05, "loss": 0.1219, "step": 1760 }, { "epoch": 0.1298796059832178, "grad_norm": 0.3582486729875512, "learning_rate": 9.741327610275417e-05, "loss": 0.1098, "step": 1780 }, { "epoch": 0.13133892739875958, "grad_norm": 0.32665426597686364, "learning_rate": 9.73377156978883e-05, "loss": 0.1131, "step": 1800 }, { "epoch": 0.13279824881430136, "grad_norm": 0.29447834111629645, "learning_rate": 9.726109769600915e-05, "loss": 0.1408, "step": 1820 }, { "epoch": 0.1342575702298431, "grad_norm": 0.3757371639048124, "learning_rate": 9.718342380888013e-05, "loss": 0.1181, "step": 1840 }, { "epoch": 0.1357168916453849, "grad_norm": 0.2721970485578736, "learning_rate": 9.710469577185473e-05, "loss": 0.1397, "step": 1860 }, { "epoch": 0.13717621306092667, "grad_norm": 0.25541942141740964, "learning_rate": 9.702491534383779e-05, "loss": 0.123, "step": 1880 }, { "epoch": 0.13863553447646845, "grad_norm": 0.28833999806064703, "learning_rate": 9.69440843072462e-05, "loss": 0.117, "step": 1900 }, { "epoch": 0.1400948558920102, "grad_norm": 0.269673320995455, "learning_rate": 9.686220446796896e-05, "loss": 0.1137, "step": 1920 }, { "epoch": 0.141554177307552, "grad_norm": 0.48330990807963714, "learning_rate": 9.677927765532701e-05, "loss": 0.1528, "step": 1940 }, { "epoch": 0.14301349872309377, "grad_norm": 0.3363262147438343, "learning_rate": 9.669530572203227e-05, "loss": 0.1104, "step": 1960 }, { "epoch": 0.14447282013863552, "grad_norm": 0.25580892613717404, "learning_rate": 9.661029054414622e-05, "loss": 0.1193, "step": 1980 }, { "epoch": 0.1459321415541773, "grad_norm": 0.45388106676996987, "learning_rate": 9.652423402103805e-05, "loss": 0.1592, "step": 2000 }, { "epoch": 0.14739146296971908, "grad_norm": 0.3709494593344652, "learning_rate": 9.643713807534219e-05, "loss": 0.1073, "step": 2020 }, { "epoch": 0.14885078438526086, "grad_norm": 0.6788345381577189, "learning_rate": 9.634900465291534e-05, "loss": 0.1315, "step": 2040 }, { "epoch": 0.15031010580080262, "grad_norm": 0.2869088910525234, "learning_rate": 9.625983572279304e-05, "loss": 0.1184, "step": 2060 }, { "epoch": 0.1517694272163444, "grad_norm": 0.2964111500353762, "learning_rate": 9.616963327714566e-05, "loss": 0.1115, "step": 2080 }, { "epoch": 0.15322874863188618, "grad_norm": 0.3152701415407213, "learning_rate": 9.607839933123386e-05, "loss": 0.117, "step": 2100 }, { "epoch": 0.15468807004742793, "grad_norm": 0.30792427867208466, "learning_rate": 9.598613592336364e-05, "loss": 0.1219, "step": 2120 }, { "epoch": 0.15614739146296971, "grad_norm": 0.28660782708863575, "learning_rate": 9.589284511484071e-05, "loss": 0.1436, "step": 2140 }, { "epoch": 0.1576067128785115, "grad_norm": 0.32617953316288606, "learning_rate": 9.579852898992452e-05, "loss": 0.1287, "step": 2160 }, { "epoch": 0.15906603429405328, "grad_norm": 0.41379725314108085, "learning_rate": 9.570318965578163e-05, "loss": 0.1097, "step": 2180 }, { "epoch": 0.16052535570959503, "grad_norm": 0.22219925652341765, "learning_rate": 9.560682924243866e-05, "loss": 0.1171, "step": 2200 }, { "epoch": 0.1619846771251368, "grad_norm": 0.2835465449542872, "learning_rate": 9.550944990273473e-05, "loss": 0.1275, "step": 2220 }, { "epoch": 0.1634439985406786, "grad_norm": 0.3971492169748388, "learning_rate": 9.54110538122733e-05, "loss": 0.1029, "step": 2240 }, { "epoch": 0.16490331995622035, "grad_norm": 0.27416529849431764, "learning_rate": 9.531164316937362e-05, "loss": 0.1209, "step": 2260 }, { "epoch": 0.16636264137176213, "grad_norm": 0.2653777493304375, "learning_rate": 9.52112201950216e-05, "loss": 0.1132, "step": 2280 }, { "epoch": 0.1678219627873039, "grad_norm": 0.5642093807940471, "learning_rate": 9.510978713282017e-05, "loss": 0.1299, "step": 2300 }, { "epoch": 0.1692812842028457, "grad_norm": 0.24584617694789176, "learning_rate": 9.500734624893914e-05, "loss": 0.1251, "step": 2320 }, { "epoch": 0.17074060561838744, "grad_norm": 0.272116000365995, "learning_rate": 9.490389983206466e-05, "loss": 0.1281, "step": 2340 }, { "epoch": 0.17219992703392922, "grad_norm": 0.3358227142033562, "learning_rate": 9.4799450193348e-05, "loss": 0.1296, "step": 2360 }, { "epoch": 0.173659248449471, "grad_norm": 0.2051814222925398, "learning_rate": 9.469399966635391e-05, "loss": 0.1191, "step": 2380 }, { "epoch": 0.17511856986501276, "grad_norm": 0.20408049257855926, "learning_rate": 9.458755060700856e-05, "loss": 0.1141, "step": 2400 }, { "epoch": 0.17657789128055454, "grad_norm": 0.1849829651088512, "learning_rate": 9.448010539354685e-05, "loss": 0.1127, "step": 2420 }, { "epoch": 0.17803721269609632, "grad_norm": 0.18080713942428248, "learning_rate": 9.437166642645926e-05, "loss": 0.1394, "step": 2440 }, { "epoch": 0.1794965341116381, "grad_norm": 0.27541852629660835, "learning_rate": 9.426223612843828e-05, "loss": 0.1214, "step": 2460 }, { "epoch": 0.18095585552717985, "grad_norm": 0.37092708297153004, "learning_rate": 9.415181694432423e-05, "loss": 0.146, "step": 2480 }, { "epoch": 0.18241517694272164, "grad_norm": 0.36083890038936484, "learning_rate": 9.404041134105066e-05, "loss": 0.1248, "step": 2500 }, { "epoch": 0.18387449835826342, "grad_norm": 0.2776438139983535, "learning_rate": 9.392802180758926e-05, "loss": 0.1368, "step": 2520 }, { "epoch": 0.18533381977380517, "grad_norm": 0.34820547586785217, "learning_rate": 9.38146508548942e-05, "loss": 0.1155, "step": 2540 }, { "epoch": 0.18679314118934695, "grad_norm": 0.294238763398288, "learning_rate": 9.370030101584605e-05, "loss": 0.1172, "step": 2560 }, { "epoch": 0.18825246260488873, "grad_norm": 0.22876093865750505, "learning_rate": 9.358497484519524e-05, "loss": 0.1241, "step": 2580 }, { "epoch": 0.1897117840204305, "grad_norm": 0.2856249418184077, "learning_rate": 9.34686749195049e-05, "loss": 0.1251, "step": 2600 }, { "epoch": 0.19117110543597227, "grad_norm": 0.2919772530638231, "learning_rate": 9.335140383709333e-05, "loss": 0.12, "step": 2620 }, { "epoch": 0.19263042685151405, "grad_norm": 0.3294893864056268, "learning_rate": 9.323316421797602e-05, "loss": 0.1097, "step": 2640 }, { "epoch": 0.19408974826705583, "grad_norm": 0.3920752736984575, "learning_rate": 9.311395870380698e-05, "loss": 0.1151, "step": 2660 }, { "epoch": 0.19554906968259758, "grad_norm": 0.1668745397084369, "learning_rate": 9.299378995781984e-05, "loss": 0.1191, "step": 2680 }, { "epoch": 0.19700839109813936, "grad_norm": 0.19167495599752757, "learning_rate": 9.28726606647683e-05, "loss": 0.1413, "step": 2700 }, { "epoch": 0.19846771251368114, "grad_norm": 0.39783090766324053, "learning_rate": 9.275057353086611e-05, "loss": 0.149, "step": 2720 }, { "epoch": 0.19992703392922292, "grad_norm": 0.2810998132604279, "learning_rate": 9.262753128372672e-05, "loss": 0.1194, "step": 2740 }, { "epoch": 0.20138635534476468, "grad_norm": 0.2235431056863839, "learning_rate": 9.25035366723022e-05, "loss": 0.1339, "step": 2760 }, { "epoch": 0.20284567676030646, "grad_norm": 0.21528060380233013, "learning_rate": 9.237859246682193e-05, "loss": 0.1254, "step": 2780 }, { "epoch": 0.20430499817584824, "grad_norm": 0.5942022277992831, "learning_rate": 9.22527014587307e-05, "loss": 0.1279, "step": 2800 }, { "epoch": 0.20576431959139, "grad_norm": 0.22522986172233075, "learning_rate": 9.212586646062626e-05, "loss": 0.1016, "step": 2820 }, { "epoch": 0.20722364100693177, "grad_norm": 0.38913350777355465, "learning_rate": 9.19980903061966e-05, "loss": 0.1321, "step": 2840 }, { "epoch": 0.20868296242247356, "grad_norm": 0.2761445042724322, "learning_rate": 9.186937585015654e-05, "loss": 0.1006, "step": 2860 }, { "epoch": 0.2101422838380153, "grad_norm": 0.2904489675134637, "learning_rate": 9.173972596818399e-05, "loss": 0.1391, "step": 2880 }, { "epoch": 0.2116016052535571, "grad_norm": 0.3863861717225745, "learning_rate": 9.160914355685577e-05, "loss": 0.1338, "step": 2900 }, { "epoch": 0.21306092666909887, "grad_norm": 0.23375929184834016, "learning_rate": 9.147763153358276e-05, "loss": 0.1271, "step": 2920 }, { "epoch": 0.21452024808464065, "grad_norm": 0.2103193670388651, "learning_rate": 9.134519283654483e-05, "loss": 0.1115, "step": 2940 }, { "epoch": 0.2159795695001824, "grad_norm": 0.253417546073443, "learning_rate": 9.121183042462517e-05, "loss": 0.0965, "step": 2960 }, { "epoch": 0.2174388909157242, "grad_norm": 0.36352688754174645, "learning_rate": 9.107754727734414e-05, "loss": 0.1257, "step": 2980 }, { "epoch": 0.21889821233126597, "grad_norm": 0.33374450352895074, "learning_rate": 9.094234639479273e-05, "loss": 0.12, "step": 3000 }, { "epoch": 0.22035753374680772, "grad_norm": 0.24900650250532352, "learning_rate": 9.080623079756561e-05, "loss": 0.1071, "step": 3020 }, { "epoch": 0.2218168551623495, "grad_norm": 0.21315684544430627, "learning_rate": 9.066920352669353e-05, "loss": 0.1382, "step": 3040 }, { "epoch": 0.22327617657789128, "grad_norm": 0.2263034759808627, "learning_rate": 9.053126764357537e-05, "loss": 0.145, "step": 3060 }, { "epoch": 0.22473549799343306, "grad_norm": 0.17408999521123467, "learning_rate": 9.03924262299099e-05, "loss": 0.1125, "step": 3080 }, { "epoch": 0.22619481940897482, "grad_norm": 0.26382605132819903, "learning_rate": 9.025268238762678e-05, "loss": 0.1345, "step": 3100 }, { "epoch": 0.2276541408245166, "grad_norm": 0.428881304462081, "learning_rate": 9.011203923881728e-05, "loss": 0.1223, "step": 3120 }, { "epoch": 0.22911346224005838, "grad_norm": 0.23176671690985062, "learning_rate": 8.997049992566462e-05, "loss": 0.1259, "step": 3140 }, { "epoch": 0.23057278365560013, "grad_norm": 0.27711123572562557, "learning_rate": 8.982806761037363e-05, "loss": 0.1228, "step": 3160 }, { "epoch": 0.23203210507114191, "grad_norm": 0.28590819940098056, "learning_rate": 8.968474547510022e-05, "loss": 0.1312, "step": 3180 }, { "epoch": 0.2334914264866837, "grad_norm": 0.29608561342126305, "learning_rate": 8.954053672188022e-05, "loss": 0.1123, "step": 3200 }, { "epoch": 0.23495074790222548, "grad_norm": 0.22352582010864266, "learning_rate": 8.93954445725579e-05, "loss": 0.102, "step": 3220 }, { "epoch": 0.23641006931776723, "grad_norm": 0.17310790477532703, "learning_rate": 8.924947226871392e-05, "loss": 0.1326, "step": 3240 }, { "epoch": 0.237869390733309, "grad_norm": 0.21667547494734962, "learning_rate": 8.91026230715929e-05, "loss": 0.1367, "step": 3260 }, { "epoch": 0.2393287121488508, "grad_norm": 0.20281398075174428, "learning_rate": 8.895490026203067e-05, "loss": 0.1289, "step": 3280 }, { "epoch": 0.24078803356439255, "grad_norm": 0.2869894200024577, "learning_rate": 8.880630714038087e-05, "loss": 0.1356, "step": 3300 }, { "epoch": 0.24224735497993433, "grad_norm": 0.25752530118391737, "learning_rate": 8.865684702644121e-05, "loss": 0.1265, "step": 3320 }, { "epoch": 0.2437066763954761, "grad_norm": 0.3193768272298343, "learning_rate": 8.85065232593794e-05, "loss": 0.112, "step": 3340 }, { "epoch": 0.2451659978110179, "grad_norm": 0.36091443188401096, "learning_rate": 8.835533919765844e-05, "loss": 0.095, "step": 3360 }, { "epoch": 0.24662531922655964, "grad_norm": 1.3698974484227704, "learning_rate": 8.820329821896163e-05, "loss": 0.1168, "step": 3380 }, { "epoch": 0.24808464064210142, "grad_norm": 0.2487563159790042, "learning_rate": 8.805040372011712e-05, "loss": 0.113, "step": 3400 }, { "epoch": 0.2495439620576432, "grad_norm": 0.3698970509982561, "learning_rate": 8.789665911702199e-05, "loss": 0.1215, "step": 3420 }, { "epoch": 0.25100328347318496, "grad_norm": 0.1876341055377898, "learning_rate": 8.774206784456597e-05, "loss": 0.1209, "step": 3440 }, { "epoch": 0.25246260488872674, "grad_norm": 0.24295665772880776, "learning_rate": 8.758663335655469e-05, "loss": 0.1229, "step": 3460 }, { "epoch": 0.2539219263042685, "grad_norm": 0.2689776977679599, "learning_rate": 8.743035912563244e-05, "loss": 0.0832, "step": 3480 }, { "epoch": 0.2553812477198103, "grad_norm": 0.379412893920958, "learning_rate": 8.727324864320472e-05, "loss": 0.1106, "step": 3500 }, { "epoch": 0.2568405691353521, "grad_norm": 0.31695262624414844, "learning_rate": 8.711530541936017e-05, "loss": 0.1023, "step": 3520 }, { "epoch": 0.2582998905508938, "grad_norm": 0.21783049074759805, "learning_rate": 8.695653298279208e-05, "loss": 0.1009, "step": 3540 }, { "epoch": 0.2597592119664356, "grad_norm": 0.2703353850483838, "learning_rate": 8.67969348807197e-05, "loss": 0.1211, "step": 3560 }, { "epoch": 0.26121853338197737, "grad_norm": 0.1932384187079349, "learning_rate": 8.663651467880885e-05, "loss": 0.1039, "step": 3580 }, { "epoch": 0.26267785479751915, "grad_norm": 0.28285556612386037, "learning_rate": 8.647527596109237e-05, "loss": 0.1158, "step": 3600 }, { "epoch": 0.26413717621306093, "grad_norm": 0.32813759107134893, "learning_rate": 8.631322232988994e-05, "loss": 0.1311, "step": 3620 }, { "epoch": 0.2655964976286027, "grad_norm": 0.2312850774172843, "learning_rate": 8.615035740572773e-05, "loss": 0.1129, "step": 3640 }, { "epoch": 0.2670558190441445, "grad_norm": 0.3303464994517715, "learning_rate": 8.598668482725732e-05, "loss": 0.1278, "step": 3660 }, { "epoch": 0.2685151404596862, "grad_norm": 0.24992627997941425, "learning_rate": 8.582220825117467e-05, "loss": 0.0928, "step": 3680 }, { "epoch": 0.269974461875228, "grad_norm": 0.4135400436284091, "learning_rate": 8.565693135213815e-05, "loss": 0.1032, "step": 3700 }, { "epoch": 0.2714337832907698, "grad_norm": 0.16234055427123434, "learning_rate": 8.549085782268663e-05, "loss": 0.1187, "step": 3720 }, { "epoch": 0.27289310470631156, "grad_norm": 0.21509892954974083, "learning_rate": 8.532399137315693e-05, "loss": 0.1312, "step": 3740 }, { "epoch": 0.27435242612185334, "grad_norm": 0.484277155110313, "learning_rate": 8.51563357316009e-05, "loss": 0.0971, "step": 3760 }, { "epoch": 0.2758117475373951, "grad_norm": 0.4137856953789829, "learning_rate": 8.498789464370212e-05, "loss": 0.1153, "step": 3780 }, { "epoch": 0.2772710689529369, "grad_norm": 0.5336023621351729, "learning_rate": 8.48186718726923e-05, "loss": 0.1133, "step": 3800 }, { "epoch": 0.27873039036847863, "grad_norm": 0.4245926967265952, "learning_rate": 8.464867119926711e-05, "loss": 0.1188, "step": 3820 }, { "epoch": 0.2801897117840204, "grad_norm": 0.37902596075543216, "learning_rate": 8.447789642150176e-05, "loss": 0.1054, "step": 3840 }, { "epoch": 0.2816490331995622, "grad_norm": 0.31818209527759106, "learning_rate": 8.430635135476615e-05, "loss": 0.1362, "step": 3860 }, { "epoch": 0.283108354615104, "grad_norm": 0.22795920895858368, "learning_rate": 8.413403983163958e-05, "loss": 0.111, "step": 3880 }, { "epoch": 0.28456767603064576, "grad_norm": 0.37896391196946616, "learning_rate": 8.396096570182519e-05, "loss": 0.1027, "step": 3900 }, { "epoch": 0.28602699744618754, "grad_norm": 0.3346173686783235, "learning_rate": 8.378713283206389e-05, "loss": 0.1245, "step": 3920 }, { "epoch": 0.2874863188617293, "grad_norm": 0.1912037431397104, "learning_rate": 8.361254510604804e-05, "loss": 0.1106, "step": 3940 }, { "epoch": 0.28894564027727104, "grad_norm": 0.2345929112818003, "learning_rate": 8.343720642433462e-05, "loss": 0.0864, "step": 3960 }, { "epoch": 0.2904049616928128, "grad_norm": 0.2699123096192621, "learning_rate": 8.326112070425811e-05, "loss": 0.1085, "step": 3980 }, { "epoch": 0.2918642831083546, "grad_norm": 0.23429437678203588, "learning_rate": 8.308429187984297e-05, "loss": 0.1356, "step": 4000 }, { "epoch": 0.2933236045238964, "grad_norm": 0.28130939764083146, "learning_rate": 8.290672390171576e-05, "loss": 0.1102, "step": 4020 }, { "epoch": 0.29478292593943817, "grad_norm": 0.31157607453363195, "learning_rate": 8.272842073701688e-05, "loss": 0.1004, "step": 4040 }, { "epoch": 0.29624224735497995, "grad_norm": 0.34115856403247896, "learning_rate": 8.254938636931184e-05, "loss": 0.0911, "step": 4060 }, { "epoch": 0.29770156877052173, "grad_norm": 0.23348388763532596, "learning_rate": 8.236962479850247e-05, "loss": 0.0934, "step": 4080 }, { "epoch": 0.29916089018606346, "grad_norm": 0.25310795043145723, "learning_rate": 8.218914004073734e-05, "loss": 0.107, "step": 4100 }, { "epoch": 0.30062021160160524, "grad_norm": 0.2859341390716333, "learning_rate": 8.200793612832213e-05, "loss": 0.1034, "step": 4120 }, { "epoch": 0.302079533017147, "grad_norm": 0.1589307537613677, "learning_rate": 8.182601710962958e-05, "loss": 0.1024, "step": 4140 }, { "epoch": 0.3035388544326888, "grad_norm": 0.21346275027988693, "learning_rate": 8.164338704900894e-05, "loss": 0.1008, "step": 4160 }, { "epoch": 0.3049981758482306, "grad_norm": 0.23819056236182787, "learning_rate": 8.14600500266953e-05, "loss": 0.1106, "step": 4180 }, { "epoch": 0.30645749726377236, "grad_norm": 0.28549063766432503, "learning_rate": 8.127601013871829e-05, "loss": 0.1127, "step": 4200 }, { "epoch": 0.30791681867931414, "grad_norm": 0.42956976973150385, "learning_rate": 8.109127149681066e-05, "loss": 0.119, "step": 4220 }, { "epoch": 0.30937614009485587, "grad_norm": 0.4318716465775294, "learning_rate": 8.090583822831637e-05, "loss": 0.1213, "step": 4240 }, { "epoch": 0.31083546151039765, "grad_norm": 0.3083984081079968, "learning_rate": 8.071971447609847e-05, "loss": 0.1161, "step": 4260 }, { "epoch": 0.31229478292593943, "grad_norm": 0.38276763388234036, "learning_rate": 8.053290439844639e-05, "loss": 0.1277, "step": 4280 }, { "epoch": 0.3137541043414812, "grad_norm": 0.1908229423823102, "learning_rate": 8.034541216898315e-05, "loss": 0.0972, "step": 4300 }, { "epoch": 0.315213425757023, "grad_norm": 0.1542948433528475, "learning_rate": 8.01572419765721e-05, "loss": 0.0921, "step": 4320 }, { "epoch": 0.3166727471725648, "grad_norm": 0.21419210109077533, "learning_rate": 7.996839802522331e-05, "loss": 0.1182, "step": 4340 }, { "epoch": 0.31813206858810655, "grad_norm": 0.3984295641541688, "learning_rate": 7.977888453399967e-05, "loss": 0.1277, "step": 4360 }, { "epoch": 0.3195913900036483, "grad_norm": 0.23812270948662653, "learning_rate": 7.958870573692258e-05, "loss": 0.1189, "step": 4380 }, { "epoch": 0.32105071141919006, "grad_norm": 0.2698116268855808, "learning_rate": 7.939786588287743e-05, "loss": 0.1028, "step": 4400 }, { "epoch": 0.32251003283473184, "grad_norm": 0.23000489317809045, "learning_rate": 7.92063692355186e-05, "loss": 0.1179, "step": 4420 }, { "epoch": 0.3239693542502736, "grad_norm": 0.2804136973700269, "learning_rate": 7.901422007317426e-05, "loss": 0.1039, "step": 4440 }, { "epoch": 0.3254286756658154, "grad_norm": 0.24908274716197615, "learning_rate": 7.882142268875075e-05, "loss": 0.1189, "step": 4460 }, { "epoch": 0.3268879970813572, "grad_norm": 0.23252680742999735, "learning_rate": 7.862798138963672e-05, "loss": 0.1131, "step": 4480 }, { "epoch": 0.32834731849689897, "grad_norm": 0.1786470217408672, "learning_rate": 7.843390049760679e-05, "loss": 0.1073, "step": 4500 }, { "epoch": 0.3298066399124407, "grad_norm": 0.26097955557070734, "learning_rate": 7.823918434872515e-05, "loss": 0.1315, "step": 4520 }, { "epoch": 0.33126596132798247, "grad_norm": 0.32002806565738656, "learning_rate": 7.80438372932485e-05, "loss": 0.1045, "step": 4540 }, { "epoch": 0.33272528274352425, "grad_norm": 0.2131844778500901, "learning_rate": 7.784786369552905e-05, "loss": 0.0941, "step": 4560 }, { "epoch": 0.33418460415906603, "grad_norm": 0.1798301571622198, "learning_rate": 7.765126793391691e-05, "loss": 0.1088, "step": 4580 }, { "epoch": 0.3356439255746078, "grad_norm": 0.17051341313737456, "learning_rate": 7.74540544006622e-05, "loss": 0.1042, "step": 4600 }, { "epoch": 0.3371032469901496, "grad_norm": 0.22572869425970987, "learning_rate": 7.725622750181712e-05, "loss": 0.1007, "step": 4620 }, { "epoch": 0.3385625684056914, "grad_norm": 0.1968135681484681, "learning_rate": 7.70577916571373e-05, "loss": 0.0884, "step": 4640 }, { "epoch": 0.3400218898212331, "grad_norm": 0.2868939162865515, "learning_rate": 7.68587512999832e-05, "loss": 0.0973, "step": 4660 }, { "epoch": 0.3414812112367749, "grad_norm": 0.1585501010853745, "learning_rate": 7.665911087722103e-05, "loss": 0.1008, "step": 4680 }, { "epoch": 0.34294053265231667, "grad_norm": 0.4516898386815449, "learning_rate": 7.645887484912334e-05, "loss": 0.1146, "step": 4700 }, { "epoch": 0.34439985406785845, "grad_norm": 0.3086424574036419, "learning_rate": 7.625804768926944e-05, "loss": 0.1184, "step": 4720 }, { "epoch": 0.3458591754834002, "grad_norm": 0.20584040402606343, "learning_rate": 7.605663388444541e-05, "loss": 0.1147, "step": 4740 }, { "epoch": 0.347318496898942, "grad_norm": 0.1966429839833748, "learning_rate": 7.585463793454393e-05, "loss": 0.0878, "step": 4760 }, { "epoch": 0.3487778183144838, "grad_norm": 0.2445766881016643, "learning_rate": 7.56520643524636e-05, "loss": 0.13, "step": 4780 }, { "epoch": 0.3502371397300255, "grad_norm": 0.21158701765416985, "learning_rate": 7.544891766400827e-05, "loss": 0.0956, "step": 4800 }, { "epoch": 0.3516964611455673, "grad_norm": 0.4238803079261724, "learning_rate": 7.524520240778587e-05, "loss": 0.1174, "step": 4820 }, { "epoch": 0.3531557825611091, "grad_norm": 0.3095728288400587, "learning_rate": 7.504092313510697e-05, "loss": 0.1155, "step": 4840 }, { "epoch": 0.35461510397665086, "grad_norm": 0.4293980639357039, "learning_rate": 7.483608440988316e-05, "loss": 0.1079, "step": 4860 }, { "epoch": 0.35607442539219264, "grad_norm": 0.16320882425277555, "learning_rate": 7.463069080852503e-05, "loss": 0.1044, "step": 4880 }, { "epoch": 0.3575337468077344, "grad_norm": 0.1566741416723225, "learning_rate": 7.442474691983996e-05, "loss": 0.1043, "step": 4900 }, { "epoch": 0.3589930682232762, "grad_norm": 0.20253896848392447, "learning_rate": 7.421825734492963e-05, "loss": 0.1061, "step": 4920 }, { "epoch": 0.3604523896388179, "grad_norm": 0.23451042679193784, "learning_rate": 7.40112266970871e-05, "loss": 0.0984, "step": 4940 }, { "epoch": 0.3619117110543597, "grad_norm": 0.31761969552147234, "learning_rate": 7.380365960169391e-05, "loss": 0.0982, "step": 4960 }, { "epoch": 0.3633710324699015, "grad_norm": 0.19087054811891263, "learning_rate": 7.35955606961166e-05, "loss": 0.0834, "step": 4980 }, { "epoch": 0.36483035388544327, "grad_norm": 0.26333137912323057, "learning_rate": 7.338693462960324e-05, "loss": 0.115, "step": 5000 }, { "epoch": 0.36628967530098505, "grad_norm": 0.35946333892955823, "learning_rate": 7.317778606317937e-05, "loss": 0.109, "step": 5020 }, { "epoch": 0.36774899671652683, "grad_norm": 0.29217079786710687, "learning_rate": 7.296811966954411e-05, "loss": 0.1061, "step": 5040 }, { "epoch": 0.3692083181320686, "grad_norm": 0.16099974509802562, "learning_rate": 7.27579401329655e-05, "loss": 0.1023, "step": 5060 }, { "epoch": 0.37066763954761034, "grad_norm": 0.14270002573644924, "learning_rate": 7.254725214917607e-05, "loss": 0.1363, "step": 5080 }, { "epoch": 0.3721269609631521, "grad_norm": 0.2765615965792803, "learning_rate": 7.233606042526781e-05, "loss": 0.1226, "step": 5100 }, { "epoch": 0.3735862823786939, "grad_norm": 0.352531222497615, "learning_rate": 7.212436967958703e-05, "loss": 0.116, "step": 5120 }, { "epoch": 0.3750456037942357, "grad_norm": 0.3179852115996289, "learning_rate": 7.191218464162897e-05, "loss": 0.1106, "step": 5140 }, { "epoch": 0.37650492520977746, "grad_norm": 0.21044459839378338, "learning_rate": 7.169951005193207e-05, "loss": 0.0794, "step": 5160 }, { "epoch": 0.37796424662531924, "grad_norm": 0.29435428319089024, "learning_rate": 7.148635066197216e-05, "loss": 0.1098, "step": 5180 }, { "epoch": 0.379423568040861, "grad_norm": 0.3678808726276575, "learning_rate": 7.127271123405622e-05, "loss": 0.1201, "step": 5200 }, { "epoch": 0.38088288945640275, "grad_norm": 0.3444852709706006, "learning_rate": 7.105859654121602e-05, "loss": 0.1186, "step": 5220 }, { "epoch": 0.38234221087194453, "grad_norm": 0.1407926086621674, "learning_rate": 7.084401136710149e-05, "loss": 0.1187, "step": 5240 }, { "epoch": 0.3838015322874863, "grad_norm": 0.1677280774460152, "learning_rate": 7.062896050587377e-05, "loss": 0.1027, "step": 5260 }, { "epoch": 0.3852608537030281, "grad_norm": 0.2039504177031652, "learning_rate": 7.041344876209827e-05, "loss": 0.0913, "step": 5280 }, { "epoch": 0.3867201751185699, "grad_norm": 0.23188710135621454, "learning_rate": 7.019748095063712e-05, "loss": 0.1141, "step": 5300 }, { "epoch": 0.38817949653411166, "grad_norm": 0.24724368926384838, "learning_rate": 6.998106189654176e-05, "loss": 0.1037, "step": 5320 }, { "epoch": 0.38963881794965344, "grad_norm": 0.32541077271508595, "learning_rate": 6.976419643494504e-05, "loss": 0.1199, "step": 5340 }, { "epoch": 0.39109813936519516, "grad_norm": 0.2385801144209165, "learning_rate": 6.954688941095327e-05, "loss": 0.0933, "step": 5360 }, { "epoch": 0.39255746078073694, "grad_norm": 0.20684187377778415, "learning_rate": 6.932914567953792e-05, "loss": 0.1046, "step": 5380 }, { "epoch": 0.3940167821962787, "grad_norm": 0.22717043667652123, "learning_rate": 6.91109701054272e-05, "loss": 0.0973, "step": 5400 }, { "epoch": 0.3954761036118205, "grad_norm": 0.1784316153067281, "learning_rate": 6.889236756299732e-05, "loss": 0.122, "step": 5420 }, { "epoch": 0.3969354250273623, "grad_norm": 0.2992073258265761, "learning_rate": 6.867334293616361e-05, "loss": 0.1064, "step": 5440 }, { "epoch": 0.39839474644290407, "grad_norm": 0.35809119189138666, "learning_rate": 6.845390111827142e-05, "loss": 0.1215, "step": 5460 }, { "epoch": 0.39985406785844585, "grad_norm": 0.24770942776501734, "learning_rate": 6.823404701198683e-05, "loss": 0.1015, "step": 5480 }, { "epoch": 0.4013133892739876, "grad_norm": 0.2676532625134847, "learning_rate": 6.801378552918697e-05, "loss": 0.1024, "step": 5500 }, { "epoch": 0.40277271068952936, "grad_norm": 0.17695706700427993, "learning_rate": 6.779312159085051e-05, "loss": 0.0866, "step": 5520 }, { "epoch": 0.40423203210507114, "grad_norm": 0.22513044418817704, "learning_rate": 6.757206012694751e-05, "loss": 0.0898, "step": 5540 }, { "epoch": 0.4056913535206129, "grad_norm": 0.24221056958067821, "learning_rate": 6.735060607632937e-05, "loss": 0.0923, "step": 5560 }, { "epoch": 0.4071506749361547, "grad_norm": 0.2516010085453918, "learning_rate": 6.71287643866185e-05, "loss": 0.0821, "step": 5580 }, { "epoch": 0.4086099963516965, "grad_norm": 0.16792435621352286, "learning_rate": 6.690654001409773e-05, "loss": 0.1064, "step": 5600 }, { "epoch": 0.41006931776723826, "grad_norm": 0.24148115931630454, "learning_rate": 6.668393792359967e-05, "loss": 0.1002, "step": 5620 }, { "epoch": 0.41152863918278, "grad_norm": 0.20454443340824457, "learning_rate": 6.646096308839564e-05, "loss": 0.0955, "step": 5640 }, { "epoch": 0.41298796059832177, "grad_norm": 0.22632163953996756, "learning_rate": 6.623762049008475e-05, "loss": 0.1067, "step": 5660 }, { "epoch": 0.41444728201386355, "grad_norm": 0.18189039381318844, "learning_rate": 6.60139151184824e-05, "loss": 0.0999, "step": 5680 }, { "epoch": 0.41590660342940533, "grad_norm": 0.2179009200010762, "learning_rate": 6.578985197150893e-05, "loss": 0.0897, "step": 5700 }, { "epoch": 0.4173659248449471, "grad_norm": 0.25728882549010346, "learning_rate": 6.5565436055078e-05, "loss": 0.092, "step": 5720 }, { "epoch": 0.4188252462604889, "grad_norm": 0.21132545096135683, "learning_rate": 6.53406723829846e-05, "loss": 0.1093, "step": 5740 }, { "epoch": 0.4202845676760306, "grad_norm": 0.3259354513604496, "learning_rate": 6.511556597679313e-05, "loss": 0.0939, "step": 5760 }, { "epoch": 0.4217438890915724, "grad_norm": 0.14428503360326553, "learning_rate": 6.48901218657252e-05, "loss": 0.113, "step": 5780 }, { "epoch": 0.4232032105071142, "grad_norm": 0.20596362998777773, "learning_rate": 6.466434508654729e-05, "loss": 0.1221, "step": 5800 }, { "epoch": 0.42466253192265596, "grad_norm": 0.2076732092116106, "learning_rate": 6.443824068345814e-05, "loss": 0.0981, "step": 5820 }, { "epoch": 0.42612185333819774, "grad_norm": 0.23210274197312583, "learning_rate": 6.421181370797616e-05, "loss": 0.1091, "step": 5840 }, { "epoch": 0.4275811747537395, "grad_norm": 0.34773891076479957, "learning_rate": 6.39850692188265e-05, "loss": 0.1152, "step": 5860 }, { "epoch": 0.4290404961692813, "grad_norm": 0.25768381087645986, "learning_rate": 6.375801228182804e-05, "loss": 0.0833, "step": 5880 }, { "epoch": 0.43049981758482303, "grad_norm": 0.20272718271445447, "learning_rate": 6.353064796978025e-05, "loss": 0.0821, "step": 5900 }, { "epoch": 0.4319591390003648, "grad_norm": 0.3769192263755196, "learning_rate": 6.330298136234981e-05, "loss": 0.1047, "step": 5920 }, { "epoch": 0.4334184604159066, "grad_norm": 0.18218184597043124, "learning_rate": 6.307501754595712e-05, "loss": 0.1114, "step": 5940 }, { "epoch": 0.4348777818314484, "grad_norm": 0.20670653864614483, "learning_rate": 6.284676161366276e-05, "loss": 0.0885, "step": 5960 }, { "epoch": 0.43633710324699015, "grad_norm": 0.21541995858015078, "learning_rate": 6.261821866505353e-05, "loss": 0.1153, "step": 5980 }, { "epoch": 0.43779642466253194, "grad_norm": 0.2073422866021902, "learning_rate": 6.23893938061287e-05, "loss": 0.0958, "step": 6000 }, { "epoch": 0.4392557460780737, "grad_norm": 0.3614983325797712, "learning_rate": 6.216029214918576e-05, "loss": 0.1039, "step": 6020 }, { "epoch": 0.44071506749361544, "grad_norm": 0.25937791265177973, "learning_rate": 6.193091881270639e-05, "loss": 0.0884, "step": 6040 }, { "epoch": 0.4421743889091572, "grad_norm": 0.27236848714431067, "learning_rate": 6.17012789212419e-05, "loss": 0.104, "step": 6060 }, { "epoch": 0.443633710324699, "grad_norm": 0.2193972961735453, "learning_rate": 6.147137760529893e-05, "loss": 0.1051, "step": 6080 }, { "epoch": 0.4450930317402408, "grad_norm": 0.3288936349913114, "learning_rate": 6.124122000122474e-05, "loss": 0.1194, "step": 6100 }, { "epoch": 0.44655235315578257, "grad_norm": 0.28030638179884143, "learning_rate": 6.101081125109238e-05, "loss": 0.0987, "step": 6120 }, { "epoch": 0.44801167457132435, "grad_norm": 0.28963057861922986, "learning_rate": 6.0780156502585974e-05, "loss": 0.104, "step": 6140 }, { "epoch": 0.44947099598686613, "grad_norm": 0.2107381404732794, "learning_rate": 6.054926090888559e-05, "loss": 0.0944, "step": 6160 }, { "epoch": 0.45093031740240785, "grad_norm": 0.22426758617758438, "learning_rate": 6.031812962855212e-05, "loss": 0.1088, "step": 6180 }, { "epoch": 0.45238963881794964, "grad_norm": 0.19433617911544113, "learning_rate": 6.008676782541214e-05, "loss": 0.0934, "step": 6200 }, { "epoch": 0.4538489602334914, "grad_norm": 0.16786745480860363, "learning_rate": 5.985518066844235e-05, "loss": 0.1065, "step": 6220 }, { "epoch": 0.4553082816490332, "grad_norm": 0.28967266828058946, "learning_rate": 5.9623373331654296e-05, "loss": 0.1104, "step": 6240 }, { "epoch": 0.456767603064575, "grad_norm": 0.2857631028326485, "learning_rate": 5.9391350993978586e-05, "loss": 0.1059, "step": 6260 }, { "epoch": 0.45822692448011676, "grad_norm": 0.16887743736632202, "learning_rate": 5.915911883914937e-05, "loss": 0.0921, "step": 6280 }, { "epoch": 0.45968624589565854, "grad_norm": 0.45462588569274826, "learning_rate": 5.892668205558838e-05, "loss": 0.1062, "step": 6300 }, { "epoch": 0.46114556731120027, "grad_norm": 0.1874149476062009, "learning_rate": 5.869404583628906e-05, "loss": 0.0877, "step": 6320 }, { "epoch": 0.46260488872674205, "grad_norm": 0.2035453150986139, "learning_rate": 5.846121537870059e-05, "loss": 0.0826, "step": 6340 }, { "epoch": 0.46406421014228383, "grad_norm": 0.2009650369528326, "learning_rate": 5.822819588461167e-05, "loss": 0.0988, "step": 6360 }, { "epoch": 0.4655235315578256, "grad_norm": 0.33321536680847463, "learning_rate": 5.799499256003447e-05, "loss": 0.0827, "step": 6380 }, { "epoch": 0.4669828529733674, "grad_norm": 0.20073386151251446, "learning_rate": 5.77616106150881e-05, "loss": 0.0831, "step": 6400 }, { "epoch": 0.46844217438890917, "grad_norm": 0.32718978675294735, "learning_rate": 5.7528055263882394e-05, "loss": 0.1012, "step": 6420 }, { "epoch": 0.46990149580445095, "grad_norm": 0.21478649269135933, "learning_rate": 5.729433172440133e-05, "loss": 0.1003, "step": 6440 }, { "epoch": 0.4713608172199927, "grad_norm": 0.35055435747379776, "learning_rate": 5.706044521838645e-05, "loss": 0.1186, "step": 6460 }, { "epoch": 0.47282013863553446, "grad_norm": 0.17583973159572328, "learning_rate": 5.682640097122024e-05, "loss": 0.1025, "step": 6480 }, { "epoch": 0.47427946005107624, "grad_norm": 0.19007944644812688, "learning_rate": 5.659220421180935e-05, "loss": 0.0897, "step": 6500 }, { "epoch": 0.475738781466618, "grad_norm": 0.20795427294782404, "learning_rate": 5.635786017246782e-05, "loss": 0.1066, "step": 6520 }, { "epoch": 0.4771981028821598, "grad_norm": 0.18309256793902484, "learning_rate": 5.612337408880011e-05, "loss": 0.0845, "step": 6540 }, { "epoch": 0.4786574242977016, "grad_norm": 0.3124216891183694, "learning_rate": 5.5888751199584156e-05, "loss": 0.1096, "step": 6560 }, { "epoch": 0.48011674571324336, "grad_norm": 0.18822513299788007, "learning_rate": 5.56539967466544e-05, "loss": 0.0957, "step": 6580 }, { "epoch": 0.4815760671287851, "grad_norm": 0.26561985090282353, "learning_rate": 5.541911597478458e-05, "loss": 0.0862, "step": 6600 }, { "epoch": 0.48303538854432687, "grad_norm": 0.2720185239193176, "learning_rate": 5.5184114131570574e-05, "loss": 0.0968, "step": 6620 }, { "epoch": 0.48449470995986865, "grad_norm": 0.23325174950351915, "learning_rate": 5.494899646731322e-05, "loss": 0.0987, "step": 6640 }, { "epoch": 0.48595403137541043, "grad_norm": 0.5924963942808815, "learning_rate": 5.4713768234900956e-05, "loss": 0.0865, "step": 6660 }, { "epoch": 0.4874133527909522, "grad_norm": 0.2626819591515115, "learning_rate": 5.447843468969247e-05, "loss": 0.0933, "step": 6680 }, { "epoch": 0.488872674206494, "grad_norm": 0.30178159697329016, "learning_rate": 5.4243001089399305e-05, "loss": 0.094, "step": 6700 }, { "epoch": 0.4903319956220358, "grad_norm": 0.32298125922899934, "learning_rate": 5.400747269396842e-05, "loss": 0.0892, "step": 6720 }, { "epoch": 0.4917913170375775, "grad_norm": 0.29452417657729046, "learning_rate": 5.37718547654646e-05, "loss": 0.1021, "step": 6740 }, { "epoch": 0.4932506384531193, "grad_norm": 0.15177550207237628, "learning_rate": 5.353615256795297e-05, "loss": 0.0992, "step": 6760 }, { "epoch": 0.49470995986866106, "grad_norm": 0.3145374189570284, "learning_rate": 5.3300371367381306e-05, "loss": 0.0978, "step": 6780 }, { "epoch": 0.49616928128420285, "grad_norm": 0.28863235717217767, "learning_rate": 5.306451643146247e-05, "loss": 0.0908, "step": 6800 }, { "epoch": 0.4976286026997446, "grad_norm": 0.20954696658671135, "learning_rate": 5.2828593029556705e-05, "loss": 0.1084, "step": 6820 }, { "epoch": 0.4990879241152864, "grad_norm": 0.2199618710390789, "learning_rate": 5.2592606432553846e-05, "loss": 0.0972, "step": 6840 }, { "epoch": 0.5005472455308282, "grad_norm": 0.20271409907110083, "learning_rate": 5.235656191275561e-05, "loss": 0.0999, "step": 6860 }, { "epoch": 0.5020065669463699, "grad_norm": 0.29349205157450786, "learning_rate": 5.21204647437578e-05, "loss": 0.0931, "step": 6880 }, { "epoch": 0.5034658883619118, "grad_norm": 0.33489812333335206, "learning_rate": 5.1884320200332517e-05, "loss": 0.0996, "step": 6900 }, { "epoch": 0.5049252097774535, "grad_norm": 0.19799483926382305, "learning_rate": 5.164813355831023e-05, "loss": 0.1108, "step": 6920 }, { "epoch": 0.5063845311929952, "grad_norm": 0.3720539958012567, "learning_rate": 5.141191009446198e-05, "loss": 0.1104, "step": 6940 }, { "epoch": 0.507843852608537, "grad_norm": 0.2954464123920451, "learning_rate": 5.1175655086381466e-05, "loss": 0.11, "step": 6960 }, { "epoch": 0.5093031740240788, "grad_norm": 0.1642741168702313, "learning_rate": 5.093937381236712e-05, "loss": 0.1031, "step": 6980 }, { "epoch": 0.5107624954396206, "grad_norm": 0.20935330232427551, "learning_rate": 5.0703071551304214e-05, "loss": 0.0978, "step": 7000 }, { "epoch": 0.5122218168551623, "grad_norm": 0.1993779996730668, "learning_rate": 5.04667535825469e-05, "loss": 0.0972, "step": 7020 }, { "epoch": 0.5136811382707042, "grad_norm": 0.19115582036939682, "learning_rate": 5.023042518580022e-05, "loss": 0.0847, "step": 7040 }, { "epoch": 0.5151404596862459, "grad_norm": 0.27120517340424255, "learning_rate": 4.999409164100226e-05, "loss": 0.1042, "step": 7060 }, { "epoch": 0.5165997811017876, "grad_norm": 0.10934691309913587, "learning_rate": 4.9757758228206084e-05, "loss": 0.081, "step": 7080 }, { "epoch": 0.5180591025173295, "grad_norm": 0.24394002533972173, "learning_rate": 4.952143022746181e-05, "loss": 0.094, "step": 7100 }, { "epoch": 0.5195184239328712, "grad_norm": 0.16029024193022853, "learning_rate": 4.928511291869865e-05, "loss": 0.0826, "step": 7120 }, { "epoch": 0.520977745348413, "grad_norm": 0.23900126446494024, "learning_rate": 4.9048811581606934e-05, "loss": 0.0961, "step": 7140 }, { "epoch": 0.5224370667639547, "grad_norm": 0.19889558694631473, "learning_rate": 4.8812531495520155e-05, "loss": 0.1087, "step": 7160 }, { "epoch": 0.5238963881794966, "grad_norm": 0.23105375198475658, "learning_rate": 4.857627793929705e-05, "loss": 0.0869, "step": 7180 }, { "epoch": 0.5253557095950383, "grad_norm": 0.22834103771419306, "learning_rate": 4.8340056191203615e-05, "loss": 0.0899, "step": 7200 }, { "epoch": 0.52681503101058, "grad_norm": 0.14832148410977408, "learning_rate": 4.810387152879521e-05, "loss": 0.0824, "step": 7220 }, { "epoch": 0.5282743524261219, "grad_norm": 0.14992555758737747, "learning_rate": 4.786772922879863e-05, "loss": 0.0887, "step": 7240 }, { "epoch": 0.5297336738416636, "grad_norm": 0.2316964809656849, "learning_rate": 4.763163456699427e-05, "loss": 0.1093, "step": 7260 }, { "epoch": 0.5311929952572054, "grad_norm": 0.1330630758475358, "learning_rate": 4.739559281809818e-05, "loss": 0.1009, "step": 7280 }, { "epoch": 0.5326523166727472, "grad_norm": 0.18333429681957206, "learning_rate": 4.715960925564427e-05, "loss": 0.1004, "step": 7300 }, { "epoch": 0.534111638088289, "grad_norm": 0.270128715362079, "learning_rate": 4.6923689151866444e-05, "loss": 0.1018, "step": 7320 }, { "epoch": 0.5355709595038307, "grad_norm": 0.23984397395024673, "learning_rate": 4.6687837777580886e-05, "loss": 0.0887, "step": 7340 }, { "epoch": 0.5370302809193724, "grad_norm": 0.4475338081181222, "learning_rate": 4.645206040206824e-05, "loss": 0.1036, "step": 7360 }, { "epoch": 0.5384896023349143, "grad_norm": 0.18351540680366502, "learning_rate": 4.621636229295591e-05, "loss": 0.0868, "step": 7380 }, { "epoch": 0.539948923750456, "grad_norm": 0.32433205707718027, "learning_rate": 4.5980748716100346e-05, "loss": 0.112, "step": 7400 }, { "epoch": 0.5414082451659978, "grad_norm": 0.26936021946381955, "learning_rate": 4.574522493546944e-05, "loss": 0.0752, "step": 7420 }, { "epoch": 0.5428675665815396, "grad_norm": 0.23965825882528494, "learning_rate": 4.550979621302488e-05, "loss": 0.0987, "step": 7440 }, { "epoch": 0.5443268879970814, "grad_norm": 0.15591543995056015, "learning_rate": 4.527446780860464e-05, "loss": 0.1019, "step": 7460 }, { "epoch": 0.5457862094126231, "grad_norm": 0.1448561831202914, "learning_rate": 4.5039244979805403e-05, "loss": 0.0764, "step": 7480 }, { "epoch": 0.5472455308281649, "grad_norm": 0.13626905980903883, "learning_rate": 4.480413298186516e-05, "loss": 0.0774, "step": 7500 }, { "epoch": 0.5487048522437067, "grad_norm": 0.28395775526753664, "learning_rate": 4.456913706754573e-05, "loss": 0.0746, "step": 7520 }, { "epoch": 0.5501641736592484, "grad_norm": 0.3285254794237905, "learning_rate": 4.4334262487015474e-05, "loss": 0.0835, "step": 7540 }, { "epoch": 0.5516234950747902, "grad_norm": 0.2496952670799407, "learning_rate": 4.4099514487732e-05, "loss": 0.0935, "step": 7560 }, { "epoch": 0.553082816490332, "grad_norm": 0.19932665975830546, "learning_rate": 4.386489831432483e-05, "loss": 0.0921, "step": 7580 }, { "epoch": 0.5545421379058738, "grad_norm": 0.18400201820649104, "learning_rate": 4.3630419208478356e-05, "loss": 0.0919, "step": 7600 }, { "epoch": 0.5560014593214155, "grad_norm": 0.22155855058445367, "learning_rate": 4.339608240881462e-05, "loss": 0.0764, "step": 7620 }, { "epoch": 0.5574607807369573, "grad_norm": 0.19580138253597132, "learning_rate": 4.316189315077636e-05, "loss": 0.0897, "step": 7640 }, { "epoch": 0.5589201021524991, "grad_norm": 0.19401483076890932, "learning_rate": 4.2927856666510005e-05, "loss": 0.0757, "step": 7660 }, { "epoch": 0.5603794235680408, "grad_norm": 0.20987182850074507, "learning_rate": 4.269397818474878e-05, "loss": 0.0882, "step": 7680 }, { "epoch": 0.5618387449835827, "grad_norm": 0.35798764065337424, "learning_rate": 4.246026293069588e-05, "loss": 0.0966, "step": 7700 }, { "epoch": 0.5632980663991244, "grad_norm": 0.257029410117375, "learning_rate": 4.222671612590775e-05, "loss": 0.0947, "step": 7720 }, { "epoch": 0.5647573878146662, "grad_norm": 0.1521057062552542, "learning_rate": 4.1993342988177434e-05, "loss": 0.0804, "step": 7740 }, { "epoch": 0.566216709230208, "grad_norm": 0.1742108319695617, "learning_rate": 4.176014873141798e-05, "loss": 0.0866, "step": 7760 }, { "epoch": 0.5676760306457497, "grad_norm": 0.13073108248641252, "learning_rate": 4.152713856554595e-05, "loss": 0.0878, "step": 7780 }, { "epoch": 0.5691353520612915, "grad_norm": 0.24385879986625963, "learning_rate": 4.129431769636505e-05, "loss": 0.0854, "step": 7800 }, { "epoch": 0.5705946734768332, "grad_norm": 0.22983093362986062, "learning_rate": 4.106169132544979e-05, "loss": 0.0882, "step": 7820 }, { "epoch": 0.5720539948923751, "grad_norm": 0.2362534084443868, "learning_rate": 4.082926465002932e-05, "loss": 0.0841, "step": 7840 }, { "epoch": 0.5735133163079168, "grad_norm": 0.19860411744987813, "learning_rate": 4.0597042862871257e-05, "loss": 0.0911, "step": 7860 }, { "epoch": 0.5749726377234586, "grad_norm": 0.14469692761467015, "learning_rate": 4.0365031152165724e-05, "loss": 0.0705, "step": 7880 }, { "epoch": 0.5764319591390004, "grad_norm": 0.3026702131043989, "learning_rate": 4.0133234701409386e-05, "loss": 0.1141, "step": 7900 }, { "epoch": 0.5778912805545421, "grad_norm": 0.20135466725204315, "learning_rate": 3.99016586892897e-05, "loss": 0.0981, "step": 7920 }, { "epoch": 0.5793506019700839, "grad_norm": 0.2627174484653081, "learning_rate": 3.967030828956918e-05, "loss": 0.0886, "step": 7940 }, { "epoch": 0.5808099233856256, "grad_norm": 0.13106325907585273, "learning_rate": 3.943918867096981e-05, "loss": 0.098, "step": 7960 }, { "epoch": 0.5822692448011675, "grad_norm": 0.17654659766268968, "learning_rate": 3.9208304997057566e-05, "loss": 0.093, "step": 7980 }, { "epoch": 0.5837285662167092, "grad_norm": 0.20015436956986157, "learning_rate": 3.897766242612706e-05, "loss": 0.0874, "step": 8000 }, { "epoch": 0.585187887632251, "grad_norm": 0.17037302015456898, "learning_rate": 3.874726611108628e-05, "loss": 0.0913, "step": 8020 }, { "epoch": 0.5866472090477928, "grad_norm": 0.1687960234761524, "learning_rate": 3.8517121199341535e-05, "loss": 0.0786, "step": 8040 }, { "epoch": 0.5881065304633345, "grad_norm": 0.29620459255170106, "learning_rate": 3.8287232832682335e-05, "loss": 0.0905, "step": 8060 }, { "epoch": 0.5895658518788763, "grad_norm": 0.1947016220691186, "learning_rate": 3.805760614716662e-05, "loss": 0.0852, "step": 8080 }, { "epoch": 0.5910251732944181, "grad_norm": 0.14437887795334717, "learning_rate": 3.782824627300593e-05, "loss": 0.0931, "step": 8100 }, { "epoch": 0.5924844947099599, "grad_norm": 0.21909261220655682, "learning_rate": 3.759915833445092e-05, "loss": 0.0878, "step": 8120 }, { "epoch": 0.5939438161255016, "grad_norm": 0.14839231943289036, "learning_rate": 3.737034744967669e-05, "loss": 0.0962, "step": 8140 }, { "epoch": 0.5954031375410435, "grad_norm": 0.23104939520938056, "learning_rate": 3.714181873066857e-05, "loss": 0.0912, "step": 8160 }, { "epoch": 0.5968624589565852, "grad_norm": 0.25933402155627416, "learning_rate": 3.691357728310789e-05, "loss": 0.081, "step": 8180 }, { "epoch": 0.5983217803721269, "grad_norm": 0.24053747298935832, "learning_rate": 3.668562820625785e-05, "loss": 0.0855, "step": 8200 }, { "epoch": 0.5997811017876687, "grad_norm": 0.23989767806744336, "learning_rate": 3.6457976592849754e-05, "loss": 0.0983, "step": 8220 }, { "epoch": 0.6012404232032105, "grad_norm": 0.3322203458314159, "learning_rate": 3.6230627528968964e-05, "loss": 0.1073, "step": 8240 }, { "epoch": 0.6026997446187523, "grad_norm": 0.2388704578332255, "learning_rate": 3.6003586093941534e-05, "loss": 0.0839, "step": 8260 }, { "epoch": 0.604159066034294, "grad_norm": 0.29568333505671857, "learning_rate": 3.577685736022056e-05, "loss": 0.0986, "step": 8280 }, { "epoch": 0.6056183874498359, "grad_norm": 0.2764304009893813, "learning_rate": 3.555044639327293e-05, "loss": 0.0914, "step": 8300 }, { "epoch": 0.6070777088653776, "grad_norm": 0.1994542234940755, "learning_rate": 3.532435825146618e-05, "loss": 0.0722, "step": 8320 }, { "epoch": 0.6085370302809193, "grad_norm": 0.1850674654066651, "learning_rate": 3.509859798595537e-05, "loss": 0.1007, "step": 8340 }, { "epoch": 0.6099963516964612, "grad_norm": 0.2602392735140372, "learning_rate": 3.487317064057033e-05, "loss": 0.0795, "step": 8360 }, { "epoch": 0.6114556731120029, "grad_norm": 0.236171327726822, "learning_rate": 3.464808125170295e-05, "loss": 0.0868, "step": 8380 }, { "epoch": 0.6129149945275447, "grad_norm": 0.24511982789375833, "learning_rate": 3.442333484819462e-05, "loss": 0.1099, "step": 8400 }, { "epoch": 0.6143743159430864, "grad_norm": 0.18929289550803868, "learning_rate": 3.4198936451224006e-05, "loss": 0.0639, "step": 8420 }, { "epoch": 0.6158336373586283, "grad_norm": 0.29573231890882484, "learning_rate": 3.397489107419466e-05, "loss": 0.086, "step": 8440 }, { "epoch": 0.61729295877417, "grad_norm": 0.15110985532433188, "learning_rate": 3.3751203722623185e-05, "loss": 0.0826, "step": 8460 }, { "epoch": 0.6187522801897117, "grad_norm": 0.2240730141612783, "learning_rate": 3.352787939402734e-05, "loss": 0.1002, "step": 8480 }, { "epoch": 0.6202116016052536, "grad_norm": 0.17617940816910643, "learning_rate": 3.330492307781442e-05, "loss": 0.0814, "step": 8500 }, { "epoch": 0.6216709230207953, "grad_norm": 0.20532201236991618, "learning_rate": 3.3082339755169724e-05, "loss": 0.0866, "step": 8520 }, { "epoch": 0.6231302444363371, "grad_norm": 0.2625908408503322, "learning_rate": 3.286013439894532e-05, "loss": 0.0824, "step": 8540 }, { "epoch": 0.6245895658518789, "grad_norm": 0.1370998701397795, "learning_rate": 3.2638311973548904e-05, "loss": 0.0775, "step": 8560 }, { "epoch": 0.6260488872674207, "grad_norm": 0.16525999384073028, "learning_rate": 3.241687743483293e-05, "loss": 0.0859, "step": 8580 }, { "epoch": 0.6275082086829624, "grad_norm": 0.1019124816557733, "learning_rate": 3.2195835729983914e-05, "loss": 0.0758, "step": 8600 }, { "epoch": 0.6289675300985041, "grad_norm": 0.16025292416956477, "learning_rate": 3.1975191797411786e-05, "loss": 0.0768, "step": 8620 }, { "epoch": 0.630426851514046, "grad_norm": 0.16545554798031012, "learning_rate": 3.1754950566639685e-05, "loss": 0.0736, "step": 8640 }, { "epoch": 0.6318861729295877, "grad_norm": 0.17355211388140368, "learning_rate": 3.153511695819374e-05, "loss": 0.0735, "step": 8660 }, { "epoch": 0.6333454943451295, "grad_norm": 0.4006051455854273, "learning_rate": 3.131569588349319e-05, "loss": 0.0765, "step": 8680 }, { "epoch": 0.6348048157606713, "grad_norm": 0.24052078851822786, "learning_rate": 3.1096692244740664e-05, "loss": 0.1022, "step": 8700 }, { "epoch": 0.6362641371762131, "grad_norm": 0.1533414015506768, "learning_rate": 3.08781109348126e-05, "loss": 0.0809, "step": 8720 }, { "epoch": 0.6377234585917548, "grad_norm": 0.1340242757509518, "learning_rate": 3.0659956837149985e-05, "loss": 0.0781, "step": 8740 }, { "epoch": 0.6391827800072966, "grad_norm": 0.1691744273675545, "learning_rate": 3.0442234825649185e-05, "loss": 0.0905, "step": 8760 }, { "epoch": 0.6406421014228384, "grad_norm": 0.23981672728497094, "learning_rate": 3.0224949764553144e-05, "loss": 0.0892, "step": 8780 }, { "epoch": 0.6421014228383801, "grad_norm": 0.24286106288672432, "learning_rate": 3.000810650834269e-05, "loss": 0.0817, "step": 8800 }, { "epoch": 0.643560744253922, "grad_norm": 0.1455899222141873, "learning_rate": 2.979170990162799e-05, "loss": 0.0836, "step": 8820 }, { "epoch": 0.6450200656694637, "grad_norm": 0.22817756184035107, "learning_rate": 2.9575764779040427e-05, "loss": 0.0789, "step": 8840 }, { "epoch": 0.6464793870850055, "grad_norm": 0.24718132089424502, "learning_rate": 2.9360275965124484e-05, "loss": 0.0966, "step": 8860 }, { "epoch": 0.6479387085005472, "grad_norm": 0.2830882130796971, "learning_rate": 2.914524827423006e-05, "loss": 0.0844, "step": 8880 }, { "epoch": 0.649398029916089, "grad_norm": 0.13154886405230762, "learning_rate": 2.8930686510404848e-05, "loss": 0.0882, "step": 8900 }, { "epoch": 0.6508573513316308, "grad_norm": 0.1628509299026824, "learning_rate": 2.871659546728701e-05, "loss": 0.1051, "step": 8920 }, { "epoch": 0.6523166727471725, "grad_norm": 0.2870676908057582, "learning_rate": 2.8502979927998096e-05, "loss": 0.0856, "step": 8940 }, { "epoch": 0.6537759941627144, "grad_norm": 0.25963714561124496, "learning_rate": 2.8289844665036136e-05, "loss": 0.0961, "step": 8960 }, { "epoch": 0.6552353155782561, "grad_norm": 0.2751758771997854, "learning_rate": 2.8077194440169117e-05, "loss": 0.0788, "step": 8980 }, { "epoch": 0.6566946369937979, "grad_norm": 0.30269194089469403, "learning_rate": 2.7865034004328496e-05, "loss": 0.0832, "step": 9000 }, { "epoch": 0.6581539584093397, "grad_norm": 0.21562618811033948, "learning_rate": 2.7653368097503085e-05, "loss": 0.0885, "step": 9020 }, { "epoch": 0.6596132798248814, "grad_norm": 0.2128702249563008, "learning_rate": 2.7442201448633165e-05, "loss": 0.0847, "step": 9040 }, { "epoch": 0.6610726012404232, "grad_norm": 0.19016118127632922, "learning_rate": 2.7231538775504846e-05, "loss": 0.0836, "step": 9060 }, { "epoch": 0.6625319226559649, "grad_norm": 0.1497778616340849, "learning_rate": 2.7021384784644632e-05, "loss": 0.0754, "step": 9080 }, { "epoch": 0.6639912440715068, "grad_norm": 0.23437427731049273, "learning_rate": 2.6811744171214303e-05, "loss": 0.0734, "step": 9100 }, { "epoch": 0.6654505654870485, "grad_norm": 0.3171739171707465, "learning_rate": 2.6602621618905988e-05, "loss": 0.0907, "step": 9120 }, { "epoch": 0.6669098869025903, "grad_norm": 0.34300590939562675, "learning_rate": 2.639402179983754e-05, "loss": 0.0913, "step": 9140 }, { "epoch": 0.6683692083181321, "grad_norm": 0.2917703835582748, "learning_rate": 2.6185949374448136e-05, "loss": 0.0789, "step": 9160 }, { "epoch": 0.6698285297336738, "grad_norm": 0.22378572737202726, "learning_rate": 2.5978408991394233e-05, "loss": 0.0815, "step": 9180 }, { "epoch": 0.6712878511492156, "grad_norm": 0.12173835938546967, "learning_rate": 2.5771405287445576e-05, "loss": 0.0758, "step": 9200 }, { "epoch": 0.6727471725647574, "grad_norm": 0.14662571413630113, "learning_rate": 2.5564942887381705e-05, "loss": 0.0714, "step": 9220 }, { "epoch": 0.6742064939802992, "grad_norm": 0.27764203523675757, "learning_rate": 2.535902640388861e-05, "loss": 0.089, "step": 9240 }, { "epoch": 0.6756658153958409, "grad_norm": 0.19946437768799996, "learning_rate": 2.5153660437455634e-05, "loss": 0.0703, "step": 9260 }, { "epoch": 0.6771251368113828, "grad_norm": 0.20290178502096412, "learning_rate": 2.494884957627282e-05, "loss": 0.0821, "step": 9280 }, { "epoch": 0.6785844582269245, "grad_norm": 0.2154294699026174, "learning_rate": 2.4744598396128183e-05, "loss": 0.0974, "step": 9300 }, { "epoch": 0.6800437796424662, "grad_norm": 0.2511874205987635, "learning_rate": 2.4540911460305694e-05, "loss": 0.0825, "step": 9320 }, { "epoch": 0.681503101058008, "grad_norm": 0.1810086992559282, "learning_rate": 2.4337793319483186e-05, "loss": 0.0874, "step": 9340 }, { "epoch": 0.6829624224735498, "grad_norm": 0.16642882654866892, "learning_rate": 2.4135248511630824e-05, "loss": 0.0736, "step": 9360 }, { "epoch": 0.6844217438890916, "grad_norm": 0.18730698333216458, "learning_rate": 2.3933281561909566e-05, "loss": 0.0682, "step": 9380 }, { "epoch": 0.6858810653046333, "grad_norm": 0.1991805058045345, "learning_rate": 2.373189698257014e-05, "loss": 0.0763, "step": 9400 }, { "epoch": 0.6873403867201752, "grad_norm": 0.1692253705188508, "learning_rate": 2.353109927285226e-05, "loss": 0.0825, "step": 9420 }, { "epoch": 0.6887997081357169, "grad_norm": 0.24581712588330082, "learning_rate": 2.333089291888403e-05, "loss": 0.072, "step": 9440 }, { "epoch": 0.6902590295512586, "grad_norm": 0.15001533830747885, "learning_rate": 2.3131282393581822e-05, "loss": 0.0835, "step": 9460 }, { "epoch": 0.6917183509668005, "grad_norm": 0.2572418200751706, "learning_rate": 2.293227215655026e-05, "loss": 0.0777, "step": 9480 }, { "epoch": 0.6931776723823422, "grad_norm": 0.23284739021368367, "learning_rate": 2.273386665398256e-05, "loss": 0.0909, "step": 9500 }, { "epoch": 0.694636993797884, "grad_norm": 0.18808290689449877, "learning_rate": 2.253607031856131e-05, "loss": 0.0806, "step": 9520 }, { "epoch": 0.6960963152134257, "grad_norm": 0.21599851137275142, "learning_rate": 2.2338887569359313e-05, "loss": 0.0818, "step": 9540 }, { "epoch": 0.6975556366289676, "grad_norm": 0.16231435724106183, "learning_rate": 2.2142322811740994e-05, "loss": 0.0651, "step": 9560 }, { "epoch": 0.6990149580445093, "grad_norm": 0.16358563863117298, "learning_rate": 2.194638043726384e-05, "loss": 0.0941, "step": 9580 }, { "epoch": 0.700474279460051, "grad_norm": 0.18696897549337352, "learning_rate": 2.175106482358037e-05, "loss": 0.077, "step": 9600 }, { "epoch": 0.7019336008755929, "grad_norm": 0.27938821625111243, "learning_rate": 2.1556380334340287e-05, "loss": 0.0691, "step": 9620 }, { "epoch": 0.7033929222911346, "grad_norm": 0.276170709188811, "learning_rate": 2.136233131909301e-05, "loss": 0.0891, "step": 9640 }, { "epoch": 0.7048522437066764, "grad_norm": 0.2066704076900185, "learning_rate": 2.116892211319054e-05, "loss": 0.08, "step": 9660 }, { "epoch": 0.7063115651222182, "grad_norm": 0.34640850265903106, "learning_rate": 2.0976157037690537e-05, "loss": 0.097, "step": 9680 }, { "epoch": 0.70777088653776, "grad_norm": 0.19562017264957848, "learning_rate": 2.078404039925974e-05, "loss": 0.077, "step": 9700 }, { "epoch": 0.7092302079533017, "grad_norm": 0.15661479322837638, "learning_rate": 2.0592576490077886e-05, "loss": 0.0709, "step": 9720 }, { "epoch": 0.7106895293688434, "grad_norm": 0.2717282580952972, "learning_rate": 2.040176958774171e-05, "loss": 0.0787, "step": 9740 }, { "epoch": 0.7121488507843853, "grad_norm": 0.27436935761606934, "learning_rate": 2.021162395516944e-05, "loss": 0.0742, "step": 9760 }, { "epoch": 0.713608172199927, "grad_norm": 0.14755232060666437, "learning_rate": 2.002214384050549e-05, "loss": 0.0805, "step": 9780 }, { "epoch": 0.7150674936154688, "grad_norm": 0.24785156225144722, "learning_rate": 1.98333334770256e-05, "loss": 0.0774, "step": 9800 }, { "epoch": 0.7165268150310106, "grad_norm": 0.276700390822216, "learning_rate": 1.9645197083042217e-05, "loss": 0.081, "step": 9820 }, { "epoch": 0.7179861364465524, "grad_norm": 0.18486086426882578, "learning_rate": 1.9457738861810344e-05, "loss": 0.0663, "step": 9840 }, { "epoch": 0.7194454578620941, "grad_norm": 0.20602502291890026, "learning_rate": 1.9270963001433506e-05, "loss": 0.0826, "step": 9860 }, { "epoch": 0.7209047792776359, "grad_norm": 0.17095661359091108, "learning_rate": 1.9084873674770258e-05, "loss": 0.0764, "step": 9880 }, { "epoch": 0.7223641006931777, "grad_norm": 0.1780004922244762, "learning_rate": 1.889947503934097e-05, "loss": 0.0849, "step": 9900 }, { "epoch": 0.7238234221087194, "grad_norm": 0.17930810952796022, "learning_rate": 1.871477123723483e-05, "loss": 0.0848, "step": 9920 }, { "epoch": 0.7252827435242613, "grad_norm": 0.11631516474736948, "learning_rate": 1.853076639501749e-05, "loss": 0.0726, "step": 9940 }, { "epoch": 0.726742064939803, "grad_norm": 0.25877277599531223, "learning_rate": 1.8347464623638716e-05, "loss": 0.0799, "step": 9960 }, { "epoch": 0.7282013863553448, "grad_norm": 0.2888977002420134, "learning_rate": 1.8164870018340595e-05, "loss": 0.0808, "step": 9980 }, { "epoch": 0.7296607077708865, "grad_norm": 0.24770955121408422, "learning_rate": 1.798298665856605e-05, "loss": 0.0933, "step": 10000 }, { "epoch": 0.7311200291864283, "grad_norm": 0.17145855088561357, "learning_rate": 1.780181860786767e-05, "loss": 0.0666, "step": 10020 }, { "epoch": 0.7325793506019701, "grad_norm": 0.1888311975799085, "learning_rate": 1.7621369913816998e-05, "loss": 0.0688, "step": 10040 }, { "epoch": 0.7340386720175118, "grad_norm": 0.21899758481758985, "learning_rate": 1.7441644607913997e-05, "loss": 0.0819, "step": 10060 }, { "epoch": 0.7354979934330537, "grad_norm": 0.15837265511931634, "learning_rate": 1.7262646705497054e-05, "loss": 0.0773, "step": 10080 }, { "epoch": 0.7369573148485954, "grad_norm": 0.16230722759639937, "learning_rate": 1.708438020565325e-05, "loss": 0.083, "step": 10100 }, { "epoch": 0.7384166362641372, "grad_norm": 0.1889470693308101, "learning_rate": 1.690684909112896e-05, "loss": 0.0648, "step": 10120 }, { "epoch": 0.739875957679679, "grad_norm": 0.35825558366398547, "learning_rate": 1.6730057328241032e-05, "loss": 0.0914, "step": 10140 }, { "epoch": 0.7413352790952207, "grad_norm": 0.17595763950131643, "learning_rate": 1.6554008866787978e-05, "loss": 0.0626, "step": 10160 }, { "epoch": 0.7427946005107625, "grad_norm": 0.2191406674992623, "learning_rate": 1.6378707639961847e-05, "loss": 0.118, "step": 10180 }, { "epoch": 0.7442539219263042, "grad_norm": 0.1977690967160498, "learning_rate": 1.620415756426032e-05, "loss": 0.0825, "step": 10200 }, { "epoch": 0.7457132433418461, "grad_norm": 0.16945524182505178, "learning_rate": 1.6030362539399235e-05, "loss": 0.0721, "step": 10220 }, { "epoch": 0.7471725647573878, "grad_norm": 0.2137107271036752, "learning_rate": 1.5857326448225413e-05, "loss": 0.0933, "step": 10240 }, { "epoch": 0.7486318861729296, "grad_norm": 0.14388503061137067, "learning_rate": 1.5685053156629936e-05, "loss": 0.0697, "step": 10260 }, { "epoch": 0.7500912075884714, "grad_norm": 0.16396133345953567, "learning_rate": 1.551354651346178e-05, "loss": 0.072, "step": 10280 }, { "epoch": 0.7515505290040131, "grad_norm": 0.20758612199072954, "learning_rate": 1.534281035044183e-05, "loss": 0.0782, "step": 10300 }, { "epoch": 0.7530098504195549, "grad_norm": 0.2677007162576603, "learning_rate": 1.5172848482077251e-05, "loss": 0.088, "step": 10320 }, { "epoch": 0.7544691718350967, "grad_norm": 0.22981080747403948, "learning_rate": 1.5003664705576292e-05, "loss": 0.0675, "step": 10340 }, { "epoch": 0.7559284932506385, "grad_norm": 0.14595975992209873, "learning_rate": 1.4835262800763433e-05, "loss": 0.0598, "step": 10360 }, { "epoch": 0.7573878146661802, "grad_norm": 0.2442624180440446, "learning_rate": 1.4667646529994955e-05, "loss": 0.0803, "step": 10380 }, { "epoch": 0.758847136081722, "grad_norm": 0.1769769305897374, "learning_rate": 1.4500819638074836e-05, "loss": 0.0717, "step": 10400 }, { "epoch": 0.7603064574972638, "grad_norm": 0.22752965532855682, "learning_rate": 1.4334785852171189e-05, "loss": 0.0773, "step": 10420 }, { "epoch": 0.7617657789128055, "grad_norm": 0.17198904935541695, "learning_rate": 1.4169548881732863e-05, "loss": 0.0679, "step": 10440 }, { "epoch": 0.7632251003283473, "grad_norm": 0.18227879189264046, "learning_rate": 1.4005112418406658e-05, "loss": 0.0779, "step": 10460 }, { "epoch": 0.7646844217438891, "grad_norm": 0.17918974376239177, "learning_rate": 1.3841480135954815e-05, "loss": 0.0755, "step": 10480 }, { "epoch": 0.7661437431594309, "grad_norm": 0.1808913214191542, "learning_rate": 1.3678655690172937e-05, "loss": 0.073, "step": 10500 }, { "epoch": 0.7676030645749726, "grad_norm": 0.19182604410775567, "learning_rate": 1.351664271880833e-05, "loss": 0.076, "step": 10520 }, { "epoch": 0.7690623859905145, "grad_norm": 0.2258827316579577, "learning_rate": 1.335544484147872e-05, "loss": 0.0736, "step": 10540 }, { "epoch": 0.7705217074060562, "grad_norm": 0.18920980773130505, "learning_rate": 1.3195065659591377e-05, "loss": 0.0979, "step": 10560 }, { "epoch": 0.7719810288215979, "grad_norm": 0.16526779110459391, "learning_rate": 1.303550875626266e-05, "loss": 0.0822, "step": 10580 }, { "epoch": 0.7734403502371398, "grad_norm": 0.11874076858796079, "learning_rate": 1.2876777696237957e-05, "loss": 0.0784, "step": 10600 }, { "epoch": 0.7748996716526815, "grad_norm": 0.13406374720218356, "learning_rate": 1.271887602581211e-05, "loss": 0.0607, "step": 10620 }, { "epoch": 0.7763589930682233, "grad_norm": 0.16665764155851395, "learning_rate": 1.2561807272750053e-05, "loss": 0.0775, "step": 10640 }, { "epoch": 0.777818314483765, "grad_norm": 0.1644277062999908, "learning_rate": 1.2405574946208116e-05, "loss": 0.0778, "step": 10660 }, { "epoch": 0.7792776358993069, "grad_norm": 0.36519522425925205, "learning_rate": 1.2250182536655563e-05, "loss": 0.0693, "step": 10680 }, { "epoch": 0.7807369573148486, "grad_norm": 0.16841701778065343, "learning_rate": 1.2095633515796639e-05, "loss": 0.0789, "step": 10700 }, { "epoch": 0.7821962787303903, "grad_norm": 0.23811108549083682, "learning_rate": 1.1941931336492984e-05, "loss": 0.07, "step": 10720 }, { "epoch": 0.7836556001459322, "grad_norm": 0.18785323479769722, "learning_rate": 1.1789079432686501e-05, "loss": 0.0679, "step": 10740 }, { "epoch": 0.7851149215614739, "grad_norm": 0.2187740182681998, "learning_rate": 1.1637081219322648e-05, "loss": 0.0783, "step": 10760 }, { "epoch": 0.7865742429770157, "grad_norm": 0.2083550955615201, "learning_rate": 1.1485940092274117e-05, "loss": 0.0847, "step": 10780 }, { "epoch": 0.7880335643925575, "grad_norm": 0.22098387990816473, "learning_rate": 1.1335659428265012e-05, "loss": 0.0741, "step": 10800 }, { "epoch": 0.7894928858080993, "grad_norm": 0.25484065655198496, "learning_rate": 1.1186242584795331e-05, "loss": 0.0743, "step": 10820 }, { "epoch": 0.790952207223641, "grad_norm": 0.3490731944764208, "learning_rate": 1.1037692900066038e-05, "loss": 0.0847, "step": 10840 }, { "epoch": 0.7924115286391827, "grad_norm": 0.16585089936476097, "learning_rate": 1.0890013692904411e-05, "loss": 0.0615, "step": 10860 }, { "epoch": 0.7938708500547246, "grad_norm": 0.11879388400453102, "learning_rate": 1.0743208262689958e-05, "loss": 0.0866, "step": 10880 }, { "epoch": 0.7953301714702663, "grad_norm": 0.16974280932555014, "learning_rate": 1.0597279889280649e-05, "loss": 0.0711, "step": 10900 }, { "epoch": 0.7967894928858081, "grad_norm": 0.2879906296131935, "learning_rate": 1.0452231832939669e-05, "loss": 0.087, "step": 10920 }, { "epoch": 0.7982488143013499, "grad_norm": 0.30448023595487583, "learning_rate": 1.0308067334262578e-05, "loss": 0.079, "step": 10940 }, { "epoch": 0.7997081357168917, "grad_norm": 0.1346240174177682, "learning_rate": 1.0164789614104909e-05, "loss": 0.0663, "step": 10960 }, { "epoch": 0.8011674571324334, "grad_norm": 0.11374165016723602, "learning_rate": 1.002240187351018e-05, "loss": 0.0716, "step": 10980 }, { "epoch": 0.8026267785479752, "grad_norm": 0.1995760830889922, "learning_rate": 9.880907293638447e-06, "loss": 0.0779, "step": 11000 }, { "epoch": 0.804086099963517, "grad_norm": 0.16050456710211813, "learning_rate": 9.740309035695156e-06, "loss": 0.0754, "step": 11020 }, { "epoch": 0.8055454213790587, "grad_norm": 0.20346144604531705, "learning_rate": 9.600610240860557e-06, "loss": 0.0744, "step": 11040 }, { "epoch": 0.8070047427946005, "grad_norm": 0.16473468373629052, "learning_rate": 9.461814030219518e-06, "loss": 0.066, "step": 11060 }, { "epoch": 0.8084640642101423, "grad_norm": 0.2408642242586706, "learning_rate": 9.323923504691795e-06, "loss": 0.0873, "step": 11080 }, { "epoch": 0.8099233856256841, "grad_norm": 0.18983244827766566, "learning_rate": 9.186941744962752e-06, "loss": 0.0727, "step": 11100 }, { "epoch": 0.8113827070412258, "grad_norm": 0.13201659836858892, "learning_rate": 9.050871811414535e-06, "loss": 0.0771, "step": 11120 }, { "epoch": 0.8128420284567676, "grad_norm": 0.28031452690082004, "learning_rate": 8.915716744057706e-06, "loss": 0.0854, "step": 11140 }, { "epoch": 0.8143013498723094, "grad_norm": 0.32592701500229143, "learning_rate": 8.781479562463285e-06, "loss": 0.0929, "step": 11160 }, { "epoch": 0.8157606712878511, "grad_norm": 0.2502727337909842, "learning_rate": 8.648163265695369e-06, "loss": 0.0823, "step": 11180 }, { "epoch": 0.817219992703393, "grad_norm": 0.15230810954172375, "learning_rate": 8.515770832244047e-06, "loss": 0.0713, "step": 11200 }, { "epoch": 0.8186793141189347, "grad_norm": 0.16309233407580462, "learning_rate": 8.384305219958889e-06, "loss": 0.0596, "step": 11220 }, { "epoch": 0.8201386355344765, "grad_norm": 0.23482748758834446, "learning_rate": 8.25376936598286e-06, "loss": 0.0655, "step": 11240 }, { "epoch": 0.8215979569500182, "grad_norm": 0.16819291222882218, "learning_rate": 8.1241661866867e-06, "loss": 0.0767, "step": 11260 }, { "epoch": 0.82305727836556, "grad_norm": 0.155955940914946, "learning_rate": 7.995498577603816e-06, "loss": 0.07, "step": 11280 }, { "epoch": 0.8245165997811018, "grad_norm": 0.16190605473805825, "learning_rate": 7.867769413365461e-06, "loss": 0.0695, "step": 11300 }, { "epoch": 0.8259759211966435, "grad_norm": 0.20401149032189506, "learning_rate": 7.740981547636656e-06, "loss": 0.0725, "step": 11320 }, { "epoch": 0.8274352426121854, "grad_norm": 0.18866486867252952, "learning_rate": 7.615137813052353e-06, "loss": 0.0765, "step": 11340 }, { "epoch": 0.8288945640277271, "grad_norm": 0.22917668312380834, "learning_rate": 7.490241021154154e-06, "loss": 0.0731, "step": 11360 }, { "epoch": 0.8303538854432689, "grad_norm": 0.15573965400489234, "learning_rate": 7.366293962327564e-06, "loss": 0.078, "step": 11380 }, { "epoch": 0.8318132068588107, "grad_norm": 0.20293931138222282, "learning_rate": 7.243299405739539e-06, "loss": 0.0653, "step": 11400 }, { "epoch": 0.8332725282743524, "grad_norm": 0.3734354194730838, "learning_rate": 7.1212600992767165e-06, "loss": 0.0729, "step": 11420 }, { "epoch": 0.8347318496898942, "grad_norm": 0.18655222782248948, "learning_rate": 7.0001787694839504e-06, "loss": 0.0697, "step": 11440 }, { "epoch": 0.836191171105436, "grad_norm": 0.20224114343218869, "learning_rate": 6.880058121503452e-06, "loss": 0.0672, "step": 11460 }, { "epoch": 0.8376504925209778, "grad_norm": 0.23796394228026443, "learning_rate": 6.760900839014356e-06, "loss": 0.0822, "step": 11480 }, { "epoch": 0.8391098139365195, "grad_norm": 0.20702446741177674, "learning_rate": 6.642709584172674e-06, "loss": 0.0709, "step": 11500 }, { "epoch": 0.8405691353520612, "grad_norm": 0.21404136231074256, "learning_rate": 6.525486997551933e-06, "loss": 0.0647, "step": 11520 }, { "epoch": 0.8420284567676031, "grad_norm": 0.19107414563165565, "learning_rate": 6.409235698084093e-06, "loss": 0.0704, "step": 11540 }, { "epoch": 0.8434877781831448, "grad_norm": 0.14301208487157718, "learning_rate": 6.293958283001122e-06, "loss": 0.0638, "step": 11560 }, { "epoch": 0.8449470995986866, "grad_norm": 0.16690480624670154, "learning_rate": 6.179657327776872e-06, "loss": 0.0718, "step": 11580 }, { "epoch": 0.8464064210142284, "grad_norm": 0.14535518790707497, "learning_rate": 6.066335386069616e-06, "loss": 0.064, "step": 11600 }, { "epoch": 0.8478657424297702, "grad_norm": 0.25689609440186906, "learning_rate": 5.953994989664952e-06, "loss": 0.0739, "step": 11620 }, { "epoch": 0.8493250638453119, "grad_norm": 0.2874364243397915, "learning_rate": 5.842638648419252e-06, "loss": 0.0798, "step": 11640 }, { "epoch": 0.8507843852608536, "grad_norm": 0.32559338716955627, "learning_rate": 5.7322688502036145e-06, "loss": 0.0795, "step": 11660 }, { "epoch": 0.8522437066763955, "grad_norm": 0.26451174523619075, "learning_rate": 5.622888060848225e-06, "loss": 0.0638, "step": 11680 }, { "epoch": 0.8537030280919372, "grad_norm": 0.15011807077671308, "learning_rate": 5.51449872408733e-06, "loss": 0.0799, "step": 11700 }, { "epoch": 0.855162349507479, "grad_norm": 0.15800610605357415, "learning_rate": 5.407103261504565e-06, "loss": 0.0633, "step": 11720 }, { "epoch": 0.8566216709230208, "grad_norm": 0.2885871145585813, "learning_rate": 5.300704072478918e-06, "loss": 0.0814, "step": 11740 }, { "epoch": 0.8580809923385626, "grad_norm": 0.19478064630131137, "learning_rate": 5.195303534131124e-06, "loss": 0.0708, "step": 11760 }, { "epoch": 0.8595403137541043, "grad_norm": 0.1669746056578264, "learning_rate": 5.090904001270502e-06, "loss": 0.0662, "step": 11780 }, { "epoch": 0.8609996351696461, "grad_norm": 0.19509662064642885, "learning_rate": 4.987507806342395e-06, "loss": 0.0604, "step": 11800 }, { "epoch": 0.8624589565851879, "grad_norm": 0.13302036796781258, "learning_rate": 4.885117259376021e-06, "loss": 0.0665, "step": 11820 }, { "epoch": 0.8639182780007296, "grad_norm": 0.2705229793863414, "learning_rate": 4.783734647932891e-06, "loss": 0.0812, "step": 11840 }, { "epoch": 0.8653775994162715, "grad_norm": 0.21217950595702748, "learning_rate": 4.683362237055716e-06, "loss": 0.0851, "step": 11860 }, { "epoch": 0.8668369208318132, "grad_norm": 0.10430228014392347, "learning_rate": 4.584002269217758e-06, "loss": 0.0797, "step": 11880 }, { "epoch": 0.868296242247355, "grad_norm": 0.18383288154386884, "learning_rate": 4.485656964272761e-06, "loss": 0.0687, "step": 11900 }, { "epoch": 0.8697555636628967, "grad_norm": 0.15062393283221726, "learning_rate": 4.388328519405321e-06, "loss": 0.0726, "step": 11920 }, { "epoch": 0.8712148850784385, "grad_norm": 0.18392906928261096, "learning_rate": 4.292019109081863e-06, "loss": 0.0728, "step": 11940 }, { "epoch": 0.8726742064939803, "grad_norm": 0.19301702379177835, "learning_rate": 4.196730885002003e-06, "loss": 0.0743, "step": 11960 }, { "epoch": 0.874133527909522, "grad_norm": 0.28144373325731764, "learning_rate": 4.102465976050495e-06, "loss": 0.0765, "step": 11980 }, { "epoch": 0.8755928493250639, "grad_norm": 0.33870680329228003, "learning_rate": 4.009226488249656e-06, "loss": 0.0741, "step": 12000 }, { "epoch": 0.8770521707406056, "grad_norm": 0.14028718131513995, "learning_rate": 3.917014504712341e-06, "loss": 0.0826, "step": 12020 }, { "epoch": 0.8785114921561474, "grad_norm": 0.16242741860000037, "learning_rate": 3.825832085595382e-06, "loss": 0.0827, "step": 12040 }, { "epoch": 0.8799708135716892, "grad_norm": 0.2236973430421186, "learning_rate": 3.73568126805357e-06, "loss": 0.0738, "step": 12060 }, { "epoch": 0.8814301349872309, "grad_norm": 0.18121546816554587, "learning_rate": 3.6465640661941305e-06, "loss": 0.0759, "step": 12080 }, { "epoch": 0.8828894564027727, "grad_norm": 0.1671473845979495, "learning_rate": 3.5584824710317433e-06, "loss": 0.0707, "step": 12100 }, { "epoch": 0.8843487778183144, "grad_norm": 0.287452266941361, "learning_rate": 3.4714384504440145e-06, "loss": 0.0702, "step": 12120 }, { "epoch": 0.8858080992338563, "grad_norm": 0.2603122182092065, "learning_rate": 3.3854339491276034e-06, "loss": 0.0763, "step": 12140 }, { "epoch": 0.887267420649398, "grad_norm": 0.23937881451583296, "learning_rate": 3.30047088855468e-06, "loss": 0.0688, "step": 12160 }, { "epoch": 0.8887267420649398, "grad_norm": 0.3034105099595713, "learning_rate": 3.2165511669300374e-06, "loss": 0.07, "step": 12180 }, { "epoch": 0.8901860634804816, "grad_norm": 0.23546891436663464, "learning_rate": 3.1336766591486986e-06, "loss": 0.0764, "step": 12200 }, { "epoch": 0.8916453848960233, "grad_norm": 0.22874430917476343, "learning_rate": 3.051849216753977e-06, "loss": 0.0813, "step": 12220 }, { "epoch": 0.8931047063115651, "grad_norm": 0.18731436997461934, "learning_rate": 2.971070667896181e-06, "loss": 0.0748, "step": 12240 }, { "epoch": 0.8945640277271069, "grad_norm": 0.28697155444258127, "learning_rate": 2.8913428172917088e-06, "loss": 0.0714, "step": 12260 }, { "epoch": 0.8960233491426487, "grad_norm": 0.1665356610579465, "learning_rate": 2.812667446182754e-06, "loss": 0.0619, "step": 12280 }, { "epoch": 0.8974826705581904, "grad_norm": 0.3780123004550475, "learning_rate": 2.735046312297512e-06, "loss": 0.0897, "step": 12300 }, { "epoch": 0.8989419919737323, "grad_norm": 0.17553586444633024, "learning_rate": 2.658481149810904e-06, "loss": 0.0795, "step": 12320 }, { "epoch": 0.900401313389274, "grad_norm": 0.22562570429434525, "learning_rate": 2.5829736693058324e-06, "loss": 0.0791, "step": 12340 }, { "epoch": 0.9018606348048157, "grad_norm": 0.18563882080258232, "learning_rate": 2.508525557734964e-06, "loss": 0.0844, "step": 12360 }, { "epoch": 0.9033199562203575, "grad_norm": 0.16316948443096166, "learning_rate": 2.4351384783830476e-06, "loss": 0.078, "step": 12380 }, { "epoch": 0.9047792776358993, "grad_norm": 0.33268252514205654, "learning_rate": 2.3628140708297387e-06, "loss": 0.0804, "step": 12400 }, { "epoch": 0.9062385990514411, "grad_norm": 0.22218951884610635, "learning_rate": 2.2915539509130056e-06, "loss": 0.0743, "step": 12420 }, { "epoch": 0.9076979204669828, "grad_norm": 0.2798071798612865, "learning_rate": 2.221359710692961e-06, "loss": 0.0797, "step": 12440 }, { "epoch": 0.9091572418825247, "grad_norm": 0.18122090589025527, "learning_rate": 2.1522329184163693e-06, "loss": 0.0829, "step": 12460 }, { "epoch": 0.9106165632980664, "grad_norm": 0.16423755657097971, "learning_rate": 2.084175118481552e-06, "loss": 0.0711, "step": 12480 }, { "epoch": 0.9120758847136081, "grad_norm": 0.26153582906662814, "learning_rate": 2.0171878314039216e-06, "loss": 0.1026, "step": 12500 }, { "epoch": 0.91353520612915, "grad_norm": 0.14423016740050856, "learning_rate": 1.951272553781974e-06, "loss": 0.0568, "step": 12520 }, { "epoch": 0.9149945275446917, "grad_norm": 0.1767728721541835, "learning_rate": 1.8864307582639018e-06, "loss": 0.0709, "step": 12540 }, { "epoch": 0.9164538489602335, "grad_norm": 0.2557552892882785, "learning_rate": 1.8226638935146368e-06, "loss": 0.0655, "step": 12560 }, { "epoch": 0.9179131703757752, "grad_norm": 0.16552993833577492, "learning_rate": 1.759973384183533e-06, "loss": 0.0778, "step": 12580 }, { "epoch": 0.9193724917913171, "grad_norm": 0.20666952535042302, "learning_rate": 1.6983606308724975e-06, "loss": 0.0594, "step": 12600 }, { "epoch": 0.9208318132068588, "grad_norm": 0.17169473621194706, "learning_rate": 1.6378270101047476e-06, "loss": 0.0615, "step": 12620 }, { "epoch": 0.9222911346224005, "grad_norm": 0.19249775498672067, "learning_rate": 1.5783738742940035e-06, "loss": 0.0768, "step": 12640 }, { "epoch": 0.9237504560379424, "grad_norm": 0.23939912325403057, "learning_rate": 1.5200025517143002e-06, "loss": 0.073, "step": 12660 }, { "epoch": 0.9252097774534841, "grad_norm": 0.1470790842962273, "learning_rate": 1.4627143464703175e-06, "loss": 0.0643, "step": 12680 }, { "epoch": 0.9266690988690259, "grad_norm": 0.14616263478217778, "learning_rate": 1.4065105384682365e-06, "loss": 0.0748, "step": 12700 }, { "epoch": 0.9281284202845677, "grad_norm": 0.19447868042741956, "learning_rate": 1.3513923833871344e-06, "loss": 0.0785, "step": 12720 }, { "epoch": 0.9295877417001095, "grad_norm": 0.2699774519408524, "learning_rate": 1.2973611126509465e-06, "loss": 0.0573, "step": 12740 }, { "epoch": 0.9310470631156512, "grad_norm": 0.2363625074829158, "learning_rate": 1.2444179334009598e-06, "loss": 0.0774, "step": 12760 }, { "epoch": 0.9325063845311929, "grad_norm": 0.21819478694246727, "learning_rate": 1.1925640284688067e-06, "loss": 0.0646, "step": 12780 }, { "epoch": 0.9339657059467348, "grad_norm": 0.21027646446608875, "learning_rate": 1.1418005563500977e-06, "loss": 0.0831, "step": 12800 }, { "epoch": 0.9354250273622765, "grad_norm": 0.16671790982554727, "learning_rate": 1.0921286511784757e-06, "loss": 0.0604, "step": 12820 }, { "epoch": 0.9368843487778183, "grad_norm": 0.21424601887889774, "learning_rate": 1.0435494227003183e-06, "loss": 0.0794, "step": 12840 }, { "epoch": 0.9383436701933601, "grad_norm": 0.2758342830425722, "learning_rate": 9.960639562499374e-07, "loss": 0.0558, "step": 12860 }, { "epoch": 0.9398029916089019, "grad_norm": 0.2067849866378926, "learning_rate": 9.496733127253243e-07, "loss": 0.0708, "step": 12880 }, { "epoch": 0.9412623130244436, "grad_norm": 0.25852268360753444, "learning_rate": 9.043785285644534e-07, "loss": 0.0658, "step": 12900 }, { "epoch": 0.9427216344399854, "grad_norm": 0.13562651531215247, "learning_rate": 8.601806157221171e-07, "loss": 0.0543, "step": 12920 }, { "epoch": 0.9441809558555272, "grad_norm": 0.1276801534608157, "learning_rate": 8.170805616473265e-07, "loss": 0.0589, "step": 12940 }, { "epoch": 0.9456402772710689, "grad_norm": 0.21656400304795456, "learning_rate": 7.750793292612469e-07, "loss": 0.0653, "step": 12960 }, { "epoch": 0.9470995986866108, "grad_norm": 0.15236754016229437, "learning_rate": 7.341778569356916e-07, "loss": 0.0861, "step": 12980 }, { "epoch": 0.9485589201021525, "grad_norm": 0.13042448104911, "learning_rate": 6.943770584721565e-07, "loss": 0.0558, "step": 13000 }, { "epoch": 0.9500182415176943, "grad_norm": 0.16814342944607893, "learning_rate": 6.556778230813743e-07, "loss": 0.0693, "step": 13020 }, { "epoch": 0.951477562933236, "grad_norm": 0.24541824676137683, "learning_rate": 6.180810153634919e-07, "loss": 0.0654, "step": 13040 }, { "epoch": 0.9529368843487778, "grad_norm": 0.23375974468426802, "learning_rate": 5.815874752887362e-07, "loss": 0.0774, "step": 13060 }, { "epoch": 0.9543962057643196, "grad_norm": 0.19559616963339815, "learning_rate": 5.461980181786397e-07, "loss": 0.079, "step": 13080 }, { "epoch": 0.9558555271798613, "grad_norm": 0.27441559379098057, "learning_rate": 5.119134346878273e-07, "loss": 0.0865, "step": 13100 }, { "epoch": 0.9573148485954032, "grad_norm": 0.27156969288522603, "learning_rate": 4.7873449078637e-07, "loss": 0.0629, "step": 13120 }, { "epoch": 0.9587741700109449, "grad_norm": 0.28696180329193466, "learning_rate": 4.466619277426476e-07, "loss": 0.0631, "step": 13140 }, { "epoch": 0.9602334914264867, "grad_norm": 0.15271798864372269, "learning_rate": 4.1569646210680156e-07, "loss": 0.063, "step": 13160 }, { "epoch": 0.9616928128420285, "grad_norm": 0.23490129701109025, "learning_rate": 3.858387856947254e-07, "loss": 0.0731, "step": 13180 }, { "epoch": 0.9631521342575702, "grad_norm": 0.22618295012189688, "learning_rate": 3.570895655725992e-07, "loss": 0.0702, "step": 13200 }, { "epoch": 0.964611455673112, "grad_norm": 0.20700633424973702, "learning_rate": 3.2944944404200153e-07, "loss": 0.0818, "step": 13220 }, { "epoch": 0.9660707770886537, "grad_norm": 0.15624758185553145, "learning_rate": 3.0291903862554873e-07, "loss": 0.0711, "step": 13240 }, { "epoch": 0.9675300985041956, "grad_norm": 0.16367727608879512, "learning_rate": 2.774989420530949e-07, "loss": 0.0682, "step": 13260 }, { "epoch": 0.9689894199197373, "grad_norm": 0.152753645721427, "learning_rate": 2.531897222485036e-07, "loss": 0.0678, "step": 13280 }, { "epoch": 0.9704487413352791, "grad_norm": 0.12746459313433045, "learning_rate": 2.2999192231694667e-07, "loss": 0.0673, "step": 13300 }, { "epoch": 0.9719080627508209, "grad_norm": 0.18022389515283288, "learning_rate": 2.0790606053276984e-07, "loss": 0.0733, "step": 13320 }, { "epoch": 0.9733673841663626, "grad_norm": 0.2043683161630788, "learning_rate": 1.8693263032793506e-07, "loss": 0.0664, "step": 13340 }, { "epoch": 0.9748267055819044, "grad_norm": 0.21315387218220452, "learning_rate": 1.6707210028095722e-07, "loss": 0.0766, "step": 13360 }, { "epoch": 0.9762860269974462, "grad_norm": 0.10838765239735793, "learning_rate": 1.4832491410649018e-07, "loss": 0.0668, "step": 13380 }, { "epoch": 0.977745348412988, "grad_norm": 0.20493249133697297, "learning_rate": 1.3069149064534603e-07, "loss": 0.0668, "step": 13400 }, { "epoch": 0.9792046698285297, "grad_norm": 0.12533859768480493, "learning_rate": 1.1417222385520232e-07, "loss": 0.0624, "step": 13420 }, { "epoch": 0.9806639912440716, "grad_norm": 0.13640407728401674, "learning_rate": 9.876748280175374e-08, "loss": 0.0648, "step": 13440 }, { "epoch": 0.9821233126596133, "grad_norm": 0.26255640137890773, "learning_rate": 8.447761165049084e-08, "loss": 0.0748, "step": 13460 }, { "epoch": 0.983582634075155, "grad_norm": 0.2045065387998203, "learning_rate": 7.130292965901176e-08, "loss": 0.0736, "step": 13480 }, { "epoch": 0.9850419554906968, "grad_norm": 0.2589541202073395, "learning_rate": 5.924373116986126e-08, "loss": 0.0804, "step": 13500 }, { "epoch": 0.9865012769062386, "grad_norm": 0.23813447695351278, "learning_rate": 4.830028560399713e-08, "loss": 0.0717, "step": 13520 }, { "epoch": 0.9879605983217804, "grad_norm": 0.22908649033900605, "learning_rate": 3.84728374547394e-08, "loss": 0.0615, "step": 13540 }, { "epoch": 0.9894199197373221, "grad_norm": 0.134381505549619, "learning_rate": 2.9761606282319164e-08, "loss": 0.0696, "step": 13560 }, { "epoch": 0.990879241152864, "grad_norm": 0.20686107675298593, "learning_rate": 2.2166786708976983e-08, "loss": 0.0608, "step": 13580 }, { "epoch": 0.9923385625684057, "grad_norm": 0.1470958235212741, "learning_rate": 1.5688548414594107e-08, "loss": 0.0722, "step": 13600 }, { "epoch": 0.9937978839839474, "grad_norm": 0.28992636074658135, "learning_rate": 1.0327036132939949e-08, "loss": 0.079, "step": 13620 }, { "epoch": 0.9952572053994893, "grad_norm": 0.1518264410524196, "learning_rate": 6.082369648396924e-09, "loss": 0.0716, "step": 13640 }, { "epoch": 0.996716526815031, "grad_norm": 0.1355086227621071, "learning_rate": 2.9546437933070104e-09, "loss": 0.0631, "step": 13660 }, { "epoch": 0.9981758482305728, "grad_norm": 0.1903490129846117, "learning_rate": 9.439284458623299e-10, "loss": 0.0774, "step": 13680 }, { "epoch": 0.9996351696461145, "grad_norm": 0.1922983216458204, "learning_rate": 5.02685285175275e-11, "loss": 0.0681, "step": 13700 }, { "epoch": 1.0, "step": 13705, "total_flos": 336607794167808.0, "train_loss": 0.10042130719436776, "train_runtime": 24357.1331, "train_samples_per_second": 4.501, "train_steps_per_second": 0.563 } ], "logging_steps": 20, "max_steps": 13705, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 336607794167808.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }