{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 10190, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009813542688910697, "grad_norm": 1.582729697227478, "learning_rate": 4.9955839057899906e-05, "loss": 0.6655, "step": 10 }, { "epoch": 0.019627085377821395, "grad_norm": 2.231220006942749, "learning_rate": 4.990677134445535e-05, "loss": 0.6564, "step": 20 }, { "epoch": 0.029440628066732092, "grad_norm": 2.0203068256378174, "learning_rate": 4.98577036310108e-05, "loss": 0.5558, "step": 30 }, { "epoch": 0.03925417075564279, "grad_norm": 1.8266687393188477, "learning_rate": 4.980863591756624e-05, "loss": 0.3794, "step": 40 }, { "epoch": 0.04906771344455348, "grad_norm": 0.9987813830375671, "learning_rate": 4.9759568204121696e-05, "loss": 0.3129, "step": 50 }, { "epoch": 0.058881256133464184, "grad_norm": 6.515292644500732, "learning_rate": 4.971050049067714e-05, "loss": 0.2824, "step": 60 }, { "epoch": 0.06869479882237488, "grad_norm": 12.534858703613281, "learning_rate": 4.966143277723259e-05, "loss": 0.2215, "step": 70 }, { "epoch": 0.07850834151128558, "grad_norm": 1.54276704788208, "learning_rate": 4.961236506378803e-05, "loss": 0.0981, "step": 80 }, { "epoch": 0.08832188420019627, "grad_norm": 1.2632302045822144, "learning_rate": 4.956329735034347e-05, "loss": 0.0815, "step": 90 }, { "epoch": 0.09813542688910697, "grad_norm": 8.447662353515625, "learning_rate": 4.951422963689892e-05, "loss": 0.076, "step": 100 }, { "epoch": 0.10794896957801767, "grad_norm": 0.18015748262405396, "learning_rate": 4.946516192345437e-05, "loss": 0.0877, "step": 110 }, { "epoch": 0.11776251226692837, "grad_norm": 0.57831871509552, "learning_rate": 4.941609421000982e-05, "loss": 0.2572, "step": 120 }, { "epoch": 0.12757605495583907, "grad_norm": 0.19590969383716583, "learning_rate": 4.936702649656526e-05, "loss": 0.1026, "step": 130 }, { "epoch": 0.13738959764474976, "grad_norm": 0.1655094027519226, "learning_rate": 4.931795878312071e-05, "loss": 0.0515, "step": 140 }, { "epoch": 0.14720314033366044, "grad_norm": 0.09949669986963272, "learning_rate": 4.926889106967615e-05, "loss": 0.1725, "step": 150 }, { "epoch": 0.15701668302257116, "grad_norm": 0.10112312436103821, "learning_rate": 4.92198233562316e-05, "loss": 0.279, "step": 160 }, { "epoch": 0.16683022571148184, "grad_norm": 0.08742330223321915, "learning_rate": 4.917075564278705e-05, "loss": 0.1379, "step": 170 }, { "epoch": 0.17664376840039253, "grad_norm": 0.08159990608692169, "learning_rate": 4.91216879293425e-05, "loss": 0.0306, "step": 180 }, { "epoch": 0.18645731108930325, "grad_norm": 0.06950151175260544, "learning_rate": 4.907262021589794e-05, "loss": 0.0121, "step": 190 }, { "epoch": 0.19627085377821393, "grad_norm": 10.549564361572266, "learning_rate": 4.902355250245339e-05, "loss": 0.1062, "step": 200 }, { "epoch": 0.20608439646712462, "grad_norm": 0.06549222767353058, "learning_rate": 4.897448478900883e-05, "loss": 0.0633, "step": 210 }, { "epoch": 0.21589793915603533, "grad_norm": 15.829712867736816, "learning_rate": 4.892541707556428e-05, "loss": 0.0802, "step": 220 }, { "epoch": 0.22571148184494602, "grad_norm": 1.9238972663879395, "learning_rate": 4.887634936211973e-05, "loss": 0.0688, "step": 230 }, { "epoch": 0.23552502453385674, "grad_norm": 0.07546406239271164, "learning_rate": 4.882728164867517e-05, "loss": 0.0553, "step": 240 }, { "epoch": 0.24533856722276742, "grad_norm": 0.3736225366592407, "learning_rate": 4.877821393523062e-05, "loss": 0.0661, "step": 250 }, { "epoch": 0.25515210991167814, "grad_norm": 0.2823021709918976, "learning_rate": 4.872914622178606e-05, "loss": 0.0617, "step": 260 }, { "epoch": 0.2649656526005888, "grad_norm": 4.213985919952393, "learning_rate": 4.868007850834151e-05, "loss": 0.0628, "step": 270 }, { "epoch": 0.2747791952894995, "grad_norm": 0.09498825669288635, "learning_rate": 4.863101079489696e-05, "loss": 0.2317, "step": 280 }, { "epoch": 0.2845927379784102, "grad_norm": 0.05033993721008301, "learning_rate": 4.858194308145241e-05, "loss": 0.008, "step": 290 }, { "epoch": 0.2944062806673209, "grad_norm": 0.05174524709582329, "learning_rate": 4.853287536800785e-05, "loss": 0.0062, "step": 300 }, { "epoch": 0.3042198233562316, "grad_norm": 0.05778587609529495, "learning_rate": 4.84838076545633e-05, "loss": 0.0728, "step": 310 }, { "epoch": 0.3140333660451423, "grad_norm": 0.04121818020939827, "learning_rate": 4.8434739941118744e-05, "loss": 0.1023, "step": 320 }, { "epoch": 0.323846908734053, "grad_norm": 0.049691397696733475, "learning_rate": 4.838567222767419e-05, "loss": 0.1547, "step": 330 }, { "epoch": 0.3336604514229637, "grad_norm": 18.06907081604004, "learning_rate": 4.833660451422964e-05, "loss": 0.1047, "step": 340 }, { "epoch": 0.3434739941118744, "grad_norm": 4.178352355957031, "learning_rate": 4.828753680078509e-05, "loss": 0.0453, "step": 350 }, { "epoch": 0.35328753680078506, "grad_norm": 0.04574347659945488, "learning_rate": 4.823846908734053e-05, "loss": 0.0051, "step": 360 }, { "epoch": 0.3631010794896958, "grad_norm": 0.0428372398018837, "learning_rate": 4.818940137389598e-05, "loss": 0.0606, "step": 370 }, { "epoch": 0.3729146221786065, "grad_norm": 1.1302443742752075, "learning_rate": 4.8140333660451424e-05, "loss": 0.1306, "step": 380 }, { "epoch": 0.38272816486751715, "grad_norm": 0.05571115016937256, "learning_rate": 4.8091265947006866e-05, "loss": 0.1686, "step": 390 }, { "epoch": 0.39254170755642787, "grad_norm": 7.147359371185303, "learning_rate": 4.804219823356232e-05, "loss": 0.2399, "step": 400 }, { "epoch": 0.4023552502453386, "grad_norm": 15.144064903259277, "learning_rate": 4.7993130520117764e-05, "loss": 0.1326, "step": 410 }, { "epoch": 0.41216879293424924, "grad_norm": 36.820003509521484, "learning_rate": 4.794406280667321e-05, "loss": 0.0448, "step": 420 }, { "epoch": 0.42198233562315995, "grad_norm": 0.14495636522769928, "learning_rate": 4.7894995093228655e-05, "loss": 0.1117, "step": 430 }, { "epoch": 0.43179587831207067, "grad_norm": 0.046849966049194336, "learning_rate": 4.7845927379784104e-05, "loss": 0.0092, "step": 440 }, { "epoch": 0.44160942100098133, "grad_norm": 11.208320617675781, "learning_rate": 4.7796859666339546e-05, "loss": 0.0612, "step": 450 }, { "epoch": 0.45142296368989204, "grad_norm": 0.039328742772340775, "learning_rate": 4.7747791952895e-05, "loss": 0.0114, "step": 460 }, { "epoch": 0.46123650637880276, "grad_norm": 0.06963516771793365, "learning_rate": 4.7698724239450444e-05, "loss": 0.0859, "step": 470 }, { "epoch": 0.47105004906771347, "grad_norm": 0.0437813363969326, "learning_rate": 4.764965652600589e-05, "loss": 0.0734, "step": 480 }, { "epoch": 0.48086359175662413, "grad_norm": 0.039823539555072784, "learning_rate": 4.7600588812561336e-05, "loss": 0.0085, "step": 490 }, { "epoch": 0.49067713444553485, "grad_norm": 0.041767820715904236, "learning_rate": 4.7551521099116785e-05, "loss": 0.192, "step": 500 }, { "epoch": 0.5004906771344455, "grad_norm": 0.06061721593141556, "learning_rate": 4.750245338567223e-05, "loss": 0.0775, "step": 510 }, { "epoch": 0.5103042198233563, "grad_norm": 0.06972894817590714, "learning_rate": 4.7453385672227676e-05, "loss": 0.0186, "step": 520 }, { "epoch": 0.5201177625122669, "grad_norm": 0.07340658456087112, "learning_rate": 4.7404317958783125e-05, "loss": 0.1116, "step": 530 }, { "epoch": 0.5299313052011776, "grad_norm": 0.03600945696234703, "learning_rate": 4.735525024533857e-05, "loss": 0.0052, "step": 540 }, { "epoch": 0.5397448478900884, "grad_norm": 0.029176561161875725, "learning_rate": 4.7306182531894016e-05, "loss": 0.0042, "step": 550 }, { "epoch": 0.549558390578999, "grad_norm": 0.029673976823687553, "learning_rate": 4.725711481844946e-05, "loss": 0.0649, "step": 560 }, { "epoch": 0.5593719332679097, "grad_norm": 0.028555743396282196, "learning_rate": 4.720804710500491e-05, "loss": 0.0639, "step": 570 }, { "epoch": 0.5691854759568205, "grad_norm": 5.0781378746032715, "learning_rate": 4.7158979391560356e-05, "loss": 0.1382, "step": 580 }, { "epoch": 0.5789990186457311, "grad_norm": 0.05270172283053398, "learning_rate": 4.7109911678115805e-05, "loss": 0.042, "step": 590 }, { "epoch": 0.5888125613346418, "grad_norm": 0.03829975053668022, "learning_rate": 4.706084396467125e-05, "loss": 0.1485, "step": 600 }, { "epoch": 0.5986261040235525, "grad_norm": 0.03773980960249901, "learning_rate": 4.7011776251226696e-05, "loss": 0.0435, "step": 610 }, { "epoch": 0.6084396467124632, "grad_norm": 0.06550557911396027, "learning_rate": 4.696270853778214e-05, "loss": 0.1656, "step": 620 }, { "epoch": 0.6182531894013739, "grad_norm": 0.033752694725990295, "learning_rate": 4.691364082433759e-05, "loss": 0.0613, "step": 630 }, { "epoch": 0.6280667320902846, "grad_norm": 0.10115873068571091, "learning_rate": 4.6864573110893036e-05, "loss": 0.0622, "step": 640 }, { "epoch": 0.6378802747791953, "grad_norm": 0.10119258612394333, "learning_rate": 4.6815505397448485e-05, "loss": 0.0049, "step": 650 }, { "epoch": 0.647693817468106, "grad_norm": 0.029145225882530212, "learning_rate": 4.676643768400393e-05, "loss": 0.0446, "step": 660 }, { "epoch": 0.6575073601570167, "grad_norm": 0.026971256360411644, "learning_rate": 4.671736997055937e-05, "loss": 0.0035, "step": 670 }, { "epoch": 0.6673209028459274, "grad_norm": 0.05692035332322121, "learning_rate": 4.666830225711482e-05, "loss": 0.0051, "step": 680 }, { "epoch": 0.677134445534838, "grad_norm": 0.023497162386775017, "learning_rate": 4.661923454367027e-05, "loss": 0.0031, "step": 690 }, { "epoch": 0.6869479882237488, "grad_norm": 44.54225540161133, "learning_rate": 4.657016683022572e-05, "loss": 0.0392, "step": 700 }, { "epoch": 0.6967615309126595, "grad_norm": 79.16250610351562, "learning_rate": 4.652109911678116e-05, "loss": 0.034, "step": 710 }, { "epoch": 0.7065750736015701, "grad_norm": 7.570616722106934, "learning_rate": 4.647203140333661e-05, "loss": 0.5111, "step": 720 }, { "epoch": 0.7163886162904809, "grad_norm": 0.05000855773687363, "learning_rate": 4.642296368989205e-05, "loss": 0.0211, "step": 730 }, { "epoch": 0.7262021589793916, "grad_norm": 0.04305935651063919, "learning_rate": 4.63738959764475e-05, "loss": 0.1129, "step": 740 }, { "epoch": 0.7360157016683022, "grad_norm": 0.03774946555495262, "learning_rate": 4.632482826300295e-05, "loss": 0.0963, "step": 750 }, { "epoch": 0.745829244357213, "grad_norm": 0.04534028843045235, "learning_rate": 4.62757605495584e-05, "loss": 0.027, "step": 760 }, { "epoch": 0.7556427870461236, "grad_norm": 0.03019464947283268, "learning_rate": 4.622669283611384e-05, "loss": 0.0148, "step": 770 }, { "epoch": 0.7654563297350343, "grad_norm": 0.028817512094974518, "learning_rate": 4.617762512266929e-05, "loss": 0.1937, "step": 780 }, { "epoch": 0.7752698724239451, "grad_norm": 13.481318473815918, "learning_rate": 4.612855740922473e-05, "loss": 0.0143, "step": 790 }, { "epoch": 0.7850834151128557, "grad_norm": 0.029670100659132004, "learning_rate": 4.607948969578018e-05, "loss": 0.132, "step": 800 }, { "epoch": 0.7948969578017664, "grad_norm": 0.02657695673406124, "learning_rate": 4.603042198233563e-05, "loss": 0.0469, "step": 810 }, { "epoch": 0.8047105004906772, "grad_norm": 17.418657302856445, "learning_rate": 4.598135426889107e-05, "loss": 0.2565, "step": 820 }, { "epoch": 0.8145240431795878, "grad_norm": 32.446815490722656, "learning_rate": 4.593228655544652e-05, "loss": 0.1315, "step": 830 }, { "epoch": 0.8243375858684985, "grad_norm": 0.07365565747022629, "learning_rate": 4.588321884200196e-05, "loss": 0.0099, "step": 840 }, { "epoch": 0.8341511285574092, "grad_norm": 0.027613624930381775, "learning_rate": 4.583415112855741e-05, "loss": 0.1194, "step": 850 }, { "epoch": 0.8439646712463199, "grad_norm": 0.0382981114089489, "learning_rate": 4.578508341511285e-05, "loss": 0.0584, "step": 860 }, { "epoch": 0.8537782139352306, "grad_norm": 0.03215600922703743, "learning_rate": 4.573601570166831e-05, "loss": 0.0699, "step": 870 }, { "epoch": 0.8635917566241413, "grad_norm": 0.09949254989624023, "learning_rate": 4.568694798822375e-05, "loss": 0.0052, "step": 880 }, { "epoch": 0.873405299313052, "grad_norm": 0.024233952164649963, "learning_rate": 4.56378802747792e-05, "loss": 0.0739, "step": 890 }, { "epoch": 0.8832188420019627, "grad_norm": 0.04114871844649315, "learning_rate": 4.558881256133464e-05, "loss": 0.0629, "step": 900 }, { "epoch": 0.8930323846908734, "grad_norm": 0.02779584936797619, "learning_rate": 4.553974484789009e-05, "loss": 0.0137, "step": 910 }, { "epoch": 0.9028459273797841, "grad_norm": 0.02605428174138069, "learning_rate": 4.549067713444553e-05, "loss": 0.0669, "step": 920 }, { "epoch": 0.9126594700686947, "grad_norm": 0.02386913076043129, "learning_rate": 4.544160942100099e-05, "loss": 0.1172, "step": 930 }, { "epoch": 0.9224730127576055, "grad_norm": 7.601992130279541, "learning_rate": 4.539254170755643e-05, "loss": 0.0611, "step": 940 }, { "epoch": 0.9322865554465162, "grad_norm": 37.712459564208984, "learning_rate": 4.534347399411188e-05, "loss": 0.0191, "step": 950 }, { "epoch": 0.9421000981354269, "grad_norm": 0.02080320380628109, "learning_rate": 4.529440628066732e-05, "loss": 0.0026, "step": 960 }, { "epoch": 0.9519136408243376, "grad_norm": 56.210121154785156, "learning_rate": 4.5245338567222765e-05, "loss": 0.0201, "step": 970 }, { "epoch": 0.9617271835132483, "grad_norm": 0.020656289532780647, "learning_rate": 4.5196270853778214e-05, "loss": 0.0601, "step": 980 }, { "epoch": 0.971540726202159, "grad_norm": 2.9532973766326904, "learning_rate": 4.514720314033366e-05, "loss": 0.2589, "step": 990 }, { "epoch": 0.9813542688910697, "grad_norm": 0.6973829865455627, "learning_rate": 4.509813542688911e-05, "loss": 0.1333, "step": 1000 }, { "epoch": 0.9911678115799804, "grad_norm": 0.07507435232400894, "learning_rate": 4.5049067713444554e-05, "loss": 0.033, "step": 1010 }, { "epoch": 1.000981354268891, "grad_norm": 0.025814848020672798, "learning_rate": 4.5e-05, "loss": 0.0752, "step": 1020 }, { "epoch": 1.0107948969578018, "grad_norm": 0.02597714029252529, "learning_rate": 4.4950932286555445e-05, "loss": 0.073, "step": 1030 }, { "epoch": 1.0206084396467126, "grad_norm": 0.045735057443380356, "learning_rate": 4.4901864573110894e-05, "loss": 0.0649, "step": 1040 }, { "epoch": 1.030421982335623, "grad_norm": 0.022413084283471107, "learning_rate": 4.485279685966634e-05, "loss": 0.1111, "step": 1050 }, { "epoch": 1.0402355250245339, "grad_norm": 0.07954522222280502, "learning_rate": 4.480372914622179e-05, "loss": 0.1623, "step": 1060 }, { "epoch": 1.0500490677134446, "grad_norm": 0.042877595871686935, "learning_rate": 4.4754661432777234e-05, "loss": 0.0954, "step": 1070 }, { "epoch": 1.0598626104023552, "grad_norm": 0.3953896462917328, "learning_rate": 4.470559371933268e-05, "loss": 0.007, "step": 1080 }, { "epoch": 1.069676153091266, "grad_norm": 0.023617839440703392, "learning_rate": 4.4656526005888125e-05, "loss": 0.0037, "step": 1090 }, { "epoch": 1.0794896957801767, "grad_norm": 0.018734032288193703, "learning_rate": 4.4607458292443574e-05, "loss": 0.0046, "step": 1100 }, { "epoch": 1.0893032384690873, "grad_norm": 0.0192360058426857, "learning_rate": 4.455839057899902e-05, "loss": 0.1574, "step": 1110 }, { "epoch": 1.099116781157998, "grad_norm": 0.028046630322933197, "learning_rate": 4.4509322865554466e-05, "loss": 0.0033, "step": 1120 }, { "epoch": 1.1089303238469088, "grad_norm": 0.01861531473696232, "learning_rate": 4.4460255152109915e-05, "loss": 0.0025, "step": 1130 }, { "epoch": 1.1187438665358194, "grad_norm": 0.018167071044445038, "learning_rate": 4.441118743866536e-05, "loss": 0.0023, "step": 1140 }, { "epoch": 1.1285574092247301, "grad_norm": 0.07722876965999603, "learning_rate": 4.4362119725220806e-05, "loss": 0.058, "step": 1150 }, { "epoch": 1.138370951913641, "grad_norm": 0.018998106941580772, "learning_rate": 4.4313052011776255e-05, "loss": 0.0022, "step": 1160 }, { "epoch": 1.1481844946025515, "grad_norm": 0.01866711676120758, "learning_rate": 4.4263984298331704e-05, "loss": 0.0025, "step": 1170 }, { "epoch": 1.1579980372914622, "grad_norm": 0.017769252881407738, "learning_rate": 4.4214916584887146e-05, "loss": 0.0022, "step": 1180 }, { "epoch": 1.167811579980373, "grad_norm": 0.019811883568763733, "learning_rate": 4.4165848871442595e-05, "loss": 0.0019, "step": 1190 }, { "epoch": 1.1776251226692835, "grad_norm": 0.02496548742055893, "learning_rate": 4.411678115799804e-05, "loss": 0.0021, "step": 1200 }, { "epoch": 1.1874386653581943, "grad_norm": 0.01511597540229559, "learning_rate": 4.4067713444553486e-05, "loss": 0.0019, "step": 1210 }, { "epoch": 1.197252208047105, "grad_norm": 0.01455361396074295, "learning_rate": 4.4018645731108935e-05, "loss": 0.0019, "step": 1220 }, { "epoch": 1.2070657507360156, "grad_norm": 0.0400017648935318, "learning_rate": 4.3969578017664384e-05, "loss": 0.0018, "step": 1230 }, { "epoch": 1.2168792934249264, "grad_norm": 0.016889173537492752, "learning_rate": 4.3920510304219826e-05, "loss": 0.1328, "step": 1240 }, { "epoch": 1.2266928361138372, "grad_norm": 0.07678048312664032, "learning_rate": 4.3871442590775275e-05, "loss": 0.002, "step": 1250 }, { "epoch": 1.2365063788027477, "grad_norm": 0.022459661588072777, "learning_rate": 4.382237487733072e-05, "loss": 0.0419, "step": 1260 }, { "epoch": 1.2463199214916585, "grad_norm": 0.015639062970876694, "learning_rate": 4.377330716388616e-05, "loss": 0.0021, "step": 1270 }, { "epoch": 1.2561334641805693, "grad_norm": 0.014097293838858604, "learning_rate": 4.372423945044161e-05, "loss": 0.013, "step": 1280 }, { "epoch": 1.2659470068694798, "grad_norm": 0.014198847115039825, "learning_rate": 4.367517173699706e-05, "loss": 0.0018, "step": 1290 }, { "epoch": 1.2757605495583906, "grad_norm": 0.020636072382330894, "learning_rate": 4.3626104023552507e-05, "loss": 0.002, "step": 1300 }, { "epoch": 1.2855740922473013, "grad_norm": 0.013957252725958824, "learning_rate": 4.357703631010795e-05, "loss": 0.0016, "step": 1310 }, { "epoch": 1.295387634936212, "grad_norm": 0.8039536476135254, "learning_rate": 4.35279685966634e-05, "loss": 0.0174, "step": 1320 }, { "epoch": 1.3052011776251227, "grad_norm": 0.034514885395765305, "learning_rate": 4.347890088321884e-05, "loss": 0.0018, "step": 1330 }, { "epoch": 1.3150147203140334, "grad_norm": 0.0127074820920825, "learning_rate": 4.342983316977429e-05, "loss": 0.2055, "step": 1340 }, { "epoch": 1.324828263002944, "grad_norm": 0.09654640406370163, "learning_rate": 4.338076545632974e-05, "loss": 0.0653, "step": 1350 }, { "epoch": 1.3346418056918548, "grad_norm": 0.018491486087441444, "learning_rate": 4.333169774288519e-05, "loss": 0.0019, "step": 1360 }, { "epoch": 1.3444553483807655, "grad_norm": 0.014405413530766964, "learning_rate": 4.328263002944063e-05, "loss": 0.0752, "step": 1370 }, { "epoch": 1.354268891069676, "grad_norm": 0.3947644531726837, "learning_rate": 4.323356231599608e-05, "loss": 0.0651, "step": 1380 }, { "epoch": 1.3640824337585868, "grad_norm": 0.027137773111462593, "learning_rate": 4.318449460255152e-05, "loss": 0.0833, "step": 1390 }, { "epoch": 1.3738959764474976, "grad_norm": 0.03568737953901291, "learning_rate": 4.313542688910697e-05, "loss": 0.002, "step": 1400 }, { "epoch": 1.3837095191364082, "grad_norm": 0.14877857267856598, "learning_rate": 4.308635917566242e-05, "loss": 0.0691, "step": 1410 }, { "epoch": 1.393523061825319, "grad_norm": 0.018405767157673836, "learning_rate": 4.303729146221786e-05, "loss": 0.0029, "step": 1420 }, { "epoch": 1.4033366045142297, "grad_norm": 0.5927426815032959, "learning_rate": 4.298822374877331e-05, "loss": 0.0024, "step": 1430 }, { "epoch": 1.4131501472031402, "grad_norm": 0.018540162593126297, "learning_rate": 4.293915603532875e-05, "loss": 0.0125, "step": 1440 }, { "epoch": 1.422963689892051, "grad_norm": 27.07039451599121, "learning_rate": 4.28900883218842e-05, "loss": 0.125, "step": 1450 }, { "epoch": 1.4327772325809618, "grad_norm": 0.020999347791075706, "learning_rate": 4.284102060843965e-05, "loss": 0.0787, "step": 1460 }, { "epoch": 1.4425907752698723, "grad_norm": 0.09069288522005081, "learning_rate": 4.27919528949951e-05, "loss": 0.0228, "step": 1470 }, { "epoch": 1.452404317958783, "grad_norm": 0.014280487783253193, "learning_rate": 4.274288518155054e-05, "loss": 0.0406, "step": 1480 }, { "epoch": 1.4622178606476939, "grad_norm": 0.014194531366229057, "learning_rate": 4.269381746810599e-05, "loss": 0.0024, "step": 1490 }, { "epoch": 1.4720314033366044, "grad_norm": 0.019226528704166412, "learning_rate": 4.264474975466143e-05, "loss": 0.0019, "step": 1500 }, { "epoch": 1.4818449460255152, "grad_norm": 0.015254977159202099, "learning_rate": 4.259568204121688e-05, "loss": 0.0022, "step": 1510 }, { "epoch": 1.491658488714426, "grad_norm": 5.0018768310546875, "learning_rate": 4.254661432777233e-05, "loss": 0.1376, "step": 1520 }, { "epoch": 1.5014720314033365, "grad_norm": 0.032981228083372116, "learning_rate": 4.249754661432778e-05, "loss": 0.0174, "step": 1530 }, { "epoch": 1.5112855740922473, "grad_norm": 0.011964638717472553, "learning_rate": 4.244847890088322e-05, "loss": 0.0028, "step": 1540 }, { "epoch": 1.521099116781158, "grad_norm": 0.011394723318517208, "learning_rate": 4.239941118743867e-05, "loss": 0.0514, "step": 1550 }, { "epoch": 1.5309126594700686, "grad_norm": 0.01083845179527998, "learning_rate": 4.235034347399411e-05, "loss": 0.0428, "step": 1560 }, { "epoch": 1.5407262021589794, "grad_norm": 0.017966322600841522, "learning_rate": 4.230127576054956e-05, "loss": 0.0015, "step": 1570 }, { "epoch": 1.5505397448478901, "grad_norm": 0.029729802161455154, "learning_rate": 4.225220804710501e-05, "loss": 0.0021, "step": 1580 }, { "epoch": 1.5603532875368007, "grad_norm": 0.01271316409111023, "learning_rate": 4.220314033366045e-05, "loss": 0.1416, "step": 1590 }, { "epoch": 1.5701668302257115, "grad_norm": 0.01406879723072052, "learning_rate": 4.21540726202159e-05, "loss": 0.0022, "step": 1600 }, { "epoch": 1.5799803729146222, "grad_norm": 0.01311685424298048, "learning_rate": 4.2105004906771344e-05, "loss": 0.0505, "step": 1610 }, { "epoch": 1.5897939156035328, "grad_norm": 0.015997188165783882, "learning_rate": 4.205593719332679e-05, "loss": 0.1032, "step": 1620 }, { "epoch": 1.5996074582924436, "grad_norm": 0.021411443129181862, "learning_rate": 4.2006869479882235e-05, "loss": 0.0024, "step": 1630 }, { "epoch": 1.6094210009813543, "grad_norm": 0.011407027952373028, "learning_rate": 4.195780176643769e-05, "loss": 0.0021, "step": 1640 }, { "epoch": 1.6192345436702649, "grad_norm": 0.03794229030609131, "learning_rate": 4.190873405299313e-05, "loss": 0.0521, "step": 1650 }, { "epoch": 1.6290480863591756, "grad_norm": 0.012096689082682133, "learning_rate": 4.185966633954858e-05, "loss": 0.0017, "step": 1660 }, { "epoch": 1.6388616290480864, "grad_norm": 0.013807197101414204, "learning_rate": 4.1810598626104024e-05, "loss": 0.0014, "step": 1670 }, { "epoch": 1.648675171736997, "grad_norm": 0.010036585852503777, "learning_rate": 4.176153091265947e-05, "loss": 0.0016, "step": 1680 }, { "epoch": 1.6584887144259077, "grad_norm": 0.009630713611841202, "learning_rate": 4.1712463199214915e-05, "loss": 0.0017, "step": 1690 }, { "epoch": 1.6683022571148185, "grad_norm": 0.009277078323066235, "learning_rate": 4.1663395485770364e-05, "loss": 0.0421, "step": 1700 }, { "epoch": 1.678115799803729, "grad_norm": 0.01374200638383627, "learning_rate": 4.161432777232581e-05, "loss": 0.0755, "step": 1710 }, { "epoch": 1.6879293424926398, "grad_norm": 0.010488603264093399, "learning_rate": 4.1565260058881255e-05, "loss": 0.1944, "step": 1720 }, { "epoch": 1.6977428851815506, "grad_norm": 1.660660982131958, "learning_rate": 4.1516192345436704e-05, "loss": 0.0907, "step": 1730 }, { "epoch": 1.7075564278704611, "grad_norm": 0.1264234334230423, "learning_rate": 4.1467124631992147e-05, "loss": 0.0785, "step": 1740 }, { "epoch": 1.717369970559372, "grad_norm": 0.1920449286699295, "learning_rate": 4.1418056918547595e-05, "loss": 0.1021, "step": 1750 }, { "epoch": 1.7271835132482827, "grad_norm": 0.012796717695891857, "learning_rate": 4.1368989205103044e-05, "loss": 0.0559, "step": 1760 }, { "epoch": 1.7369970559371932, "grad_norm": 27.7369384765625, "learning_rate": 4.1319921491658493e-05, "loss": 0.0922, "step": 1770 }, { "epoch": 1.746810598626104, "grad_norm": 6.674612522125244, "learning_rate": 4.1270853778213936e-05, "loss": 0.0059, "step": 1780 }, { "epoch": 1.7566241413150148, "grad_norm": 0.016583973541855812, "learning_rate": 4.1221786064769385e-05, "loss": 0.0692, "step": 1790 }, { "epoch": 1.7664376840039253, "grad_norm": 9.963221549987793, "learning_rate": 4.117271835132483e-05, "loss": 0.1275, "step": 1800 }, { "epoch": 1.776251226692836, "grad_norm": 0.011029438115656376, "learning_rate": 4.1123650637880276e-05, "loss": 0.0016, "step": 1810 }, { "epoch": 1.7860647693817469, "grad_norm": 0.019155096262693405, "learning_rate": 4.1074582924435725e-05, "loss": 0.0013, "step": 1820 }, { "epoch": 1.7958783120706574, "grad_norm": 0.011651580221951008, "learning_rate": 4.1025515210991174e-05, "loss": 0.0022, "step": 1830 }, { "epoch": 1.8056918547595682, "grad_norm": 0.08313179016113281, "learning_rate": 4.0976447497546616e-05, "loss": 0.0016, "step": 1840 }, { "epoch": 1.815505397448479, "grad_norm": 0.0103986244648695, "learning_rate": 4.092737978410206e-05, "loss": 0.0791, "step": 1850 }, { "epoch": 1.8253189401373895, "grad_norm": 0.009013223461806774, "learning_rate": 4.087831207065751e-05, "loss": 0.0582, "step": 1860 }, { "epoch": 1.8351324828263003, "grad_norm": 0.010367879644036293, "learning_rate": 4.0829244357212956e-05, "loss": 0.0111, "step": 1870 }, { "epoch": 1.844946025515211, "grad_norm": 0.03960138186812401, "learning_rate": 4.0780176643768405e-05, "loss": 0.0027, "step": 1880 }, { "epoch": 1.8547595682041216, "grad_norm": 8.385934829711914, "learning_rate": 4.073110893032385e-05, "loss": 0.0666, "step": 1890 }, { "epoch": 1.8645731108930323, "grad_norm": 0.008534184657037258, "learning_rate": 4.0682041216879296e-05, "loss": 0.0012, "step": 1900 }, { "epoch": 1.8743866535819431, "grad_norm": 0.009065428748726845, "learning_rate": 4.063297350343474e-05, "loss": 0.0017, "step": 1910 }, { "epoch": 1.8842001962708537, "grad_norm": 0.6313449144363403, "learning_rate": 4.058390578999019e-05, "loss": 0.13, "step": 1920 }, { "epoch": 1.8940137389597644, "grad_norm": 7.004214286804199, "learning_rate": 4.0534838076545636e-05, "loss": 0.1427, "step": 1930 }, { "epoch": 1.9038272816486752, "grad_norm": 0.045783668756484985, "learning_rate": 4.0485770363101085e-05, "loss": 0.1027, "step": 1940 }, { "epoch": 1.9136408243375858, "grad_norm": 0.28262466192245483, "learning_rate": 4.043670264965653e-05, "loss": 0.0067, "step": 1950 }, { "epoch": 1.9234543670264965, "grad_norm": 0.010344293899834156, "learning_rate": 4.038763493621198e-05, "loss": 0.0525, "step": 1960 }, { "epoch": 1.9332679097154073, "grad_norm": 0.06860088557004929, "learning_rate": 4.033856722276742e-05, "loss": 0.1299, "step": 1970 }, { "epoch": 1.9430814524043178, "grad_norm": 1.1787910461425781, "learning_rate": 4.028949950932287e-05, "loss": 0.123, "step": 1980 }, { "epoch": 1.9528949950932286, "grad_norm": 0.008300206623971462, "learning_rate": 4.024043179587832e-05, "loss": 0.0547, "step": 1990 }, { "epoch": 1.9627085377821394, "grad_norm": 19.000707626342773, "learning_rate": 4.019136408243376e-05, "loss": 0.1152, "step": 2000 }, { "epoch": 1.97252208047105, "grad_norm": 0.17480367422103882, "learning_rate": 4.014229636898921e-05, "loss": 0.0875, "step": 2010 }, { "epoch": 1.9823356231599607, "grad_norm": 0.014936638996005058, "learning_rate": 4.009322865554465e-05, "loss": 0.0463, "step": 2020 }, { "epoch": 1.9921491658488715, "grad_norm": 0.04136461392045021, "learning_rate": 4.00441609421001e-05, "loss": 0.0205, "step": 2030 }, { "epoch": 2.001962708537782, "grad_norm": 0.008648752234876156, "learning_rate": 3.999509322865554e-05, "loss": 0.0015, "step": 2040 }, { "epoch": 2.011776251226693, "grad_norm": 0.03223758190870285, "learning_rate": 3.9946025515211e-05, "loss": 0.0015, "step": 2050 }, { "epoch": 2.0215897939156036, "grad_norm": 0.07674010843038559, "learning_rate": 3.989695780176644e-05, "loss": 0.0437, "step": 2060 }, { "epoch": 2.031403336604514, "grad_norm": 0.00833066739141941, "learning_rate": 3.984789008832189e-05, "loss": 0.0073, "step": 2070 }, { "epoch": 2.041216879293425, "grad_norm": 0.04498624801635742, "learning_rate": 3.979882237487733e-05, "loss": 0.001, "step": 2080 }, { "epoch": 2.0510304219823356, "grad_norm": 0.00966518186032772, "learning_rate": 3.974975466143278e-05, "loss": 0.0009, "step": 2090 }, { "epoch": 2.060843964671246, "grad_norm": 0.00706259673461318, "learning_rate": 3.970068694798822e-05, "loss": 0.0399, "step": 2100 }, { "epoch": 2.070657507360157, "grad_norm": 0.8026844263076782, "learning_rate": 3.965161923454368e-05, "loss": 0.0022, "step": 2110 }, { "epoch": 2.0804710500490677, "grad_norm": 0.006883909460157156, "learning_rate": 3.960255152109912e-05, "loss": 0.0477, "step": 2120 }, { "epoch": 2.0902845927379783, "grad_norm": 0.007603227626532316, "learning_rate": 3.955348380765457e-05, "loss": 0.001, "step": 2130 }, { "epoch": 2.1000981354268893, "grad_norm": 0.007116556167602539, "learning_rate": 3.950441609421001e-05, "loss": 0.1037, "step": 2140 }, { "epoch": 2.1099116781158, "grad_norm": 0.01041839923709631, "learning_rate": 3.945534838076545e-05, "loss": 0.002, "step": 2150 }, { "epoch": 2.1197252208047104, "grad_norm": 0.007161868270486593, "learning_rate": 3.94062806673209e-05, "loss": 0.0019, "step": 2160 }, { "epoch": 2.1295387634936214, "grad_norm": 5.720839977264404, "learning_rate": 3.935721295387635e-05, "loss": 0.0342, "step": 2170 }, { "epoch": 2.139352306182532, "grad_norm": 0.006590616423636675, "learning_rate": 3.93081452404318e-05, "loss": 0.0009, "step": 2180 }, { "epoch": 2.1491658488714425, "grad_norm": 0.006484217941761017, "learning_rate": 3.925907752698724e-05, "loss": 0.0009, "step": 2190 }, { "epoch": 2.1589793915603535, "grad_norm": 0.006214112509042025, "learning_rate": 3.921000981354269e-05, "loss": 0.0007, "step": 2200 }, { "epoch": 2.168792934249264, "grad_norm": 0.006126193795353174, "learning_rate": 3.9160942100098133e-05, "loss": 0.0007, "step": 2210 }, { "epoch": 2.1786064769381746, "grad_norm": 0.006230359897017479, "learning_rate": 3.911187438665358e-05, "loss": 0.0008, "step": 2220 }, { "epoch": 2.1884200196270855, "grad_norm": 0.0061959754675626755, "learning_rate": 3.906280667320903e-05, "loss": 0.0008, "step": 2230 }, { "epoch": 2.198233562315996, "grad_norm": 0.006188957951962948, "learning_rate": 3.901373895976448e-05, "loss": 0.0007, "step": 2240 }, { "epoch": 2.2080471050049066, "grad_norm": 0.005858860444277525, "learning_rate": 3.896467124631992e-05, "loss": 0.0705, "step": 2250 }, { "epoch": 2.2178606476938176, "grad_norm": 0.006062773987650871, "learning_rate": 3.891560353287537e-05, "loss": 0.0526, "step": 2260 }, { "epoch": 2.227674190382728, "grad_norm": 0.005856741685420275, "learning_rate": 3.8866535819430814e-05, "loss": 0.0007, "step": 2270 }, { "epoch": 2.2374877330716387, "grad_norm": 0.009888865053653717, "learning_rate": 3.881746810598626e-05, "loss": 0.0467, "step": 2280 }, { "epoch": 2.2473012757605497, "grad_norm": 31.81001853942871, "learning_rate": 3.876840039254171e-05, "loss": 0.008, "step": 2290 }, { "epoch": 2.2571148184494603, "grad_norm": 0.006301484536379576, "learning_rate": 3.8719332679097154e-05, "loss": 0.0007, "step": 2300 }, { "epoch": 2.266928361138371, "grad_norm": 0.006280009169131517, "learning_rate": 3.86702649656526e-05, "loss": 0.001, "step": 2310 }, { "epoch": 2.276741903827282, "grad_norm": 1.7603585720062256, "learning_rate": 3.8621197252208045e-05, "loss": 0.1061, "step": 2320 }, { "epoch": 2.2865554465161924, "grad_norm": 0.006645340472459793, "learning_rate": 3.8572129538763494e-05, "loss": 0.0007, "step": 2330 }, { "epoch": 2.296368989205103, "grad_norm": 0.008996882475912571, "learning_rate": 3.852306182531894e-05, "loss": 0.0857, "step": 2340 }, { "epoch": 2.306182531894014, "grad_norm": 16.020790100097656, "learning_rate": 3.847399411187439e-05, "loss": 0.0797, "step": 2350 }, { "epoch": 2.3159960745829244, "grad_norm": 30.399795532226562, "learning_rate": 3.8424926398429834e-05, "loss": 0.0316, "step": 2360 }, { "epoch": 2.325809617271835, "grad_norm": 0.019369609653949738, "learning_rate": 3.837585868498528e-05, "loss": 0.0244, "step": 2370 }, { "epoch": 2.335623159960746, "grad_norm": 0.009330210275948048, "learning_rate": 3.8326790971540725e-05, "loss": 0.0308, "step": 2380 }, { "epoch": 2.3454367026496565, "grad_norm": 0.016002874821424484, "learning_rate": 3.8277723258096174e-05, "loss": 0.0056, "step": 2390 }, { "epoch": 2.355250245338567, "grad_norm": 0.008927385322749615, "learning_rate": 3.8228655544651623e-05, "loss": 0.001, "step": 2400 }, { "epoch": 2.365063788027478, "grad_norm": 0.010494639165699482, "learning_rate": 3.817958783120707e-05, "loss": 0.0014, "step": 2410 }, { "epoch": 2.3748773307163886, "grad_norm": 0.007917719893157482, "learning_rate": 3.8130520117762515e-05, "loss": 0.0009, "step": 2420 }, { "epoch": 2.384690873405299, "grad_norm": 0.005997834727168083, "learning_rate": 3.8081452404317964e-05, "loss": 0.001, "step": 2430 }, { "epoch": 2.39450441609421, "grad_norm": 0.006399065256118774, "learning_rate": 3.8032384690873406e-05, "loss": 0.0008, "step": 2440 }, { "epoch": 2.4043179587831207, "grad_norm": 0.010832864791154861, "learning_rate": 3.798331697742885e-05, "loss": 0.0016, "step": 2450 }, { "epoch": 2.4141315014720313, "grad_norm": 0.007472364231944084, "learning_rate": 3.7934249263984304e-05, "loss": 0.0007, "step": 2460 }, { "epoch": 2.4239450441609423, "grad_norm": 0.005750945303589106, "learning_rate": 3.7885181550539746e-05, "loss": 0.0007, "step": 2470 }, { "epoch": 2.433758586849853, "grad_norm": 0.00842629000544548, "learning_rate": 3.7836113837095195e-05, "loss": 0.0009, "step": 2480 }, { "epoch": 2.4435721295387633, "grad_norm": 1.4052761793136597, "learning_rate": 3.778704612365064e-05, "loss": 0.1304, "step": 2490 }, { "epoch": 2.4533856722276743, "grad_norm": 0.007391956634819508, "learning_rate": 3.7737978410206086e-05, "loss": 0.0007, "step": 2500 }, { "epoch": 2.463199214916585, "grad_norm": 20.43938446044922, "learning_rate": 3.768891069676153e-05, "loss": 0.1268, "step": 2510 }, { "epoch": 2.4730127576054954, "grad_norm": 0.05450147017836571, "learning_rate": 3.7639842983316984e-05, "loss": 0.0139, "step": 2520 }, { "epoch": 2.4828263002944064, "grad_norm": 0.03355271369218826, "learning_rate": 3.7590775269872426e-05, "loss": 0.0147, "step": 2530 }, { "epoch": 2.492639842983317, "grad_norm": 0.007103215903043747, "learning_rate": 3.7541707556427875e-05, "loss": 0.1271, "step": 2540 }, { "epoch": 2.5024533856722275, "grad_norm": 0.007379031740128994, "learning_rate": 3.749263984298332e-05, "loss": 0.0009, "step": 2550 }, { "epoch": 2.5122669283611385, "grad_norm": 0.2919241786003113, "learning_rate": 3.7443572129538766e-05, "loss": 0.0025, "step": 2560 }, { "epoch": 2.522080471050049, "grad_norm": 0.01704682782292366, "learning_rate": 3.739450441609421e-05, "loss": 0.0007, "step": 2570 }, { "epoch": 2.5318940137389596, "grad_norm": 0.005663587246090174, "learning_rate": 3.734543670264966e-05, "loss": 0.001, "step": 2580 }, { "epoch": 2.5417075564278706, "grad_norm": 0.006800634786486626, "learning_rate": 3.7296368989205107e-05, "loss": 0.0006, "step": 2590 }, { "epoch": 2.551521099116781, "grad_norm": 0.0049642156809568405, "learning_rate": 3.724730127576055e-05, "loss": 0.0007, "step": 2600 }, { "epoch": 2.5613346418056917, "grad_norm": 0.0051542771980166435, "learning_rate": 3.7198233562316e-05, "loss": 0.0006, "step": 2610 }, { "epoch": 2.5711481844946027, "grad_norm": 0.005128064192831516, "learning_rate": 3.714916584887144e-05, "loss": 0.0006, "step": 2620 }, { "epoch": 2.5809617271835132, "grad_norm": 0.005018030758947134, "learning_rate": 3.710009813542689e-05, "loss": 0.0007, "step": 2630 }, { "epoch": 2.590775269872424, "grad_norm": 0.004934113007038832, "learning_rate": 3.705103042198234e-05, "loss": 0.0006, "step": 2640 }, { "epoch": 2.600588812561335, "grad_norm": 0.004958492703735828, "learning_rate": 3.700196270853779e-05, "loss": 0.0006, "step": 2650 }, { "epoch": 2.6104023552502453, "grad_norm": 0.0050879898481070995, "learning_rate": 3.695289499509323e-05, "loss": 0.0006, "step": 2660 }, { "epoch": 2.620215897939156, "grad_norm": 0.004783379379659891, "learning_rate": 3.690382728164868e-05, "loss": 0.0006, "step": 2670 }, { "epoch": 2.630029440628067, "grad_norm": 0.004975931718945503, "learning_rate": 3.685475956820412e-05, "loss": 0.0006, "step": 2680 }, { "epoch": 2.6398429833169774, "grad_norm": 0.006240040063858032, "learning_rate": 3.680569185475957e-05, "loss": 0.0006, "step": 2690 }, { "epoch": 2.649656526005888, "grad_norm": 0.0050759222358465195, "learning_rate": 3.675662414131502e-05, "loss": 0.0006, "step": 2700 }, { "epoch": 2.659470068694799, "grad_norm": 0.004622638691216707, "learning_rate": 3.670755642787047e-05, "loss": 0.0005, "step": 2710 }, { "epoch": 2.6692836113837095, "grad_norm": 0.005237213335931301, "learning_rate": 3.665848871442591e-05, "loss": 0.0526, "step": 2720 }, { "epoch": 2.67909715407262, "grad_norm": 0.15502117574214935, "learning_rate": 3.660942100098136e-05, "loss": 0.0014, "step": 2730 }, { "epoch": 2.688910696761531, "grad_norm": 0.004649411886930466, "learning_rate": 3.65603532875368e-05, "loss": 0.0327, "step": 2740 }, { "epoch": 2.6987242394504416, "grad_norm": 0.004374220035970211, "learning_rate": 3.651128557409225e-05, "loss": 0.0019, "step": 2750 }, { "epoch": 2.708537782139352, "grad_norm": 7.425215244293213, "learning_rate": 3.64622178606477e-05, "loss": 0.0813, "step": 2760 }, { "epoch": 2.718351324828263, "grad_norm": 0.004420330747961998, "learning_rate": 3.641315014720314e-05, "loss": 0.0006, "step": 2770 }, { "epoch": 2.7281648675171737, "grad_norm": 0.004426442552357912, "learning_rate": 3.636408243375859e-05, "loss": 0.0008, "step": 2780 }, { "epoch": 2.7379784102060842, "grad_norm": 0.005173469893634319, "learning_rate": 3.631501472031403e-05, "loss": 0.0006, "step": 2790 }, { "epoch": 2.7477919528949952, "grad_norm": 0.0050672367215156555, "learning_rate": 3.626594700686948e-05, "loss": 0.0853, "step": 2800 }, { "epoch": 2.7576054955839058, "grad_norm": 0.005417036823928356, "learning_rate": 3.621687929342493e-05, "loss": 0.001, "step": 2810 }, { "epoch": 2.7674190382728163, "grad_norm": 0.005575211253017187, "learning_rate": 3.616781157998038e-05, "loss": 0.0007, "step": 2820 }, { "epoch": 2.7772325809617273, "grad_norm": 0.0057277195155620575, "learning_rate": 3.611874386653582e-05, "loss": 0.0006, "step": 2830 }, { "epoch": 2.787046123650638, "grad_norm": 0.005416057072579861, "learning_rate": 3.606967615309127e-05, "loss": 0.0006, "step": 2840 }, { "epoch": 2.7968596663395484, "grad_norm": 0.004573365673422813, "learning_rate": 3.602060843964671e-05, "loss": 0.0011, "step": 2850 }, { "epoch": 2.8066732090284594, "grad_norm": 0.0056626503355801105, "learning_rate": 3.597154072620216e-05, "loss": 0.001, "step": 2860 }, { "epoch": 2.81648675171737, "grad_norm": 0.006272735074162483, "learning_rate": 3.592247301275761e-05, "loss": 0.0005, "step": 2870 }, { "epoch": 2.8263002944062805, "grad_norm": 0.004290241748094559, "learning_rate": 3.587340529931306e-05, "loss": 0.0006, "step": 2880 }, { "epoch": 2.8361138370951915, "grad_norm": 0.0073272231966257095, "learning_rate": 3.58243375858685e-05, "loss": 0.059, "step": 2890 }, { "epoch": 2.845927379784102, "grad_norm": 0.0045128497295081615, "learning_rate": 3.5775269872423944e-05, "loss": 0.0773, "step": 2900 }, { "epoch": 2.8557409224730126, "grad_norm": 0.005028576590120792, "learning_rate": 3.572620215897939e-05, "loss": 0.0008, "step": 2910 }, { "epoch": 2.8655544651619236, "grad_norm": 0.004786403849720955, "learning_rate": 3.5677134445534835e-05, "loss": 0.0855, "step": 2920 }, { "epoch": 2.875368007850834, "grad_norm": 0.02878345362842083, "learning_rate": 3.5628066732090284e-05, "loss": 0.0006, "step": 2930 }, { "epoch": 2.8851815505397447, "grad_norm": 32.582359313964844, "learning_rate": 3.557899901864573e-05, "loss": 0.0652, "step": 2940 }, { "epoch": 2.8949950932286557, "grad_norm": 0.06951310485601425, "learning_rate": 3.552993130520118e-05, "loss": 0.033, "step": 2950 }, { "epoch": 2.904808635917566, "grad_norm": 0.00533737288787961, "learning_rate": 3.5480863591756624e-05, "loss": 0.0057, "step": 2960 }, { "epoch": 2.9146221786064768, "grad_norm": 0.005290019791573286, "learning_rate": 3.543179587831207e-05, "loss": 0.095, "step": 2970 }, { "epoch": 2.9244357212953878, "grad_norm": 0.0044818902388215065, "learning_rate": 3.5382728164867515e-05, "loss": 0.0009, "step": 2980 }, { "epoch": 2.9342492639842983, "grad_norm": 0.005349620245397091, "learning_rate": 3.5333660451422964e-05, "loss": 0.0829, "step": 2990 }, { "epoch": 2.944062806673209, "grad_norm": 0.011460080742835999, "learning_rate": 3.528459273797841e-05, "loss": 0.0529, "step": 3000 }, { "epoch": 2.95387634936212, "grad_norm": 0.0047313557006418705, "learning_rate": 3.523552502453386e-05, "loss": 0.0037, "step": 3010 }, { "epoch": 2.9636898920510304, "grad_norm": 0.01534937135875225, "learning_rate": 3.5186457311089304e-05, "loss": 0.1236, "step": 3020 }, { "epoch": 2.973503434739941, "grad_norm": 0.007522872183471918, "learning_rate": 3.5137389597644747e-05, "loss": 0.0115, "step": 3030 }, { "epoch": 2.983316977428852, "grad_norm": 0.024374373257160187, "learning_rate": 3.5088321884200196e-05, "loss": 0.0008, "step": 3040 }, { "epoch": 2.9931305201177625, "grad_norm": 0.08516921103000641, "learning_rate": 3.5039254170755645e-05, "loss": 0.0956, "step": 3050 }, { "epoch": 3.002944062806673, "grad_norm": 0.005535255651921034, "learning_rate": 3.4990186457311094e-05, "loss": 0.0519, "step": 3060 }, { "epoch": 3.012757605495584, "grad_norm": 0.015444884076714516, "learning_rate": 3.4941118743866536e-05, "loss": 0.0013, "step": 3070 }, { "epoch": 3.0225711481844946, "grad_norm": 0.00661628320813179, "learning_rate": 3.4892051030421985e-05, "loss": 0.0008, "step": 3080 }, { "epoch": 3.032384690873405, "grad_norm": 0.01968499645590782, "learning_rate": 3.484298331697743e-05, "loss": 0.0041, "step": 3090 }, { "epoch": 3.042198233562316, "grad_norm": 0.004277428146451712, "learning_rate": 3.4793915603532876e-05, "loss": 0.0104, "step": 3100 }, { "epoch": 3.0520117762512267, "grad_norm": 0.007642016280442476, "learning_rate": 3.4744847890088325e-05, "loss": 0.0006, "step": 3110 }, { "epoch": 3.061825318940137, "grad_norm": 0.004083346109837294, "learning_rate": 3.4695780176643774e-05, "loss": 0.128, "step": 3120 }, { "epoch": 3.071638861629048, "grad_norm": 0.01271857414394617, "learning_rate": 3.4646712463199216e-05, "loss": 0.0659, "step": 3130 }, { "epoch": 3.0814524043179587, "grad_norm": 0.009639259427785873, "learning_rate": 3.4597644749754665e-05, "loss": 0.0424, "step": 3140 }, { "epoch": 3.0912659470068693, "grad_norm": 0.023669827729463577, "learning_rate": 3.454857703631011e-05, "loss": 0.0309, "step": 3150 }, { "epoch": 3.1010794896957803, "grad_norm": 0.004919820465147495, "learning_rate": 3.4499509322865556e-05, "loss": 0.0146, "step": 3160 }, { "epoch": 3.110893032384691, "grad_norm": 0.003851011861115694, "learning_rate": 3.4450441609421005e-05, "loss": 0.0006, "step": 3170 }, { "epoch": 3.1207065750736014, "grad_norm": 0.005380318965762854, "learning_rate": 3.440137389597645e-05, "loss": 0.0177, "step": 3180 }, { "epoch": 3.1305201177625124, "grad_norm": 0.00603041285648942, "learning_rate": 3.4352306182531896e-05, "loss": 0.0006, "step": 3190 }, { "epoch": 3.140333660451423, "grad_norm": 0.003694745246320963, "learning_rate": 3.430323846908734e-05, "loss": 0.0007, "step": 3200 }, { "epoch": 3.1501472031403335, "grad_norm": 0.009091987274587154, "learning_rate": 3.425417075564279e-05, "loss": 0.0046, "step": 3210 }, { "epoch": 3.1599607458292445, "grad_norm": 44.486915588378906, "learning_rate": 3.4205103042198237e-05, "loss": 0.0156, "step": 3220 }, { "epoch": 3.169774288518155, "grad_norm": 0.003716795239597559, "learning_rate": 3.4156035328753686e-05, "loss": 0.0006, "step": 3230 }, { "epoch": 3.1795878312070656, "grad_norm": 0.010979007929563522, "learning_rate": 3.410696761530913e-05, "loss": 0.0006, "step": 3240 }, { "epoch": 3.1894013738959766, "grad_norm": 0.0035946909338235855, "learning_rate": 3.405789990186458e-05, "loss": 0.0526, "step": 3250 }, { "epoch": 3.199214916584887, "grad_norm": 0.0067933835089206696, "learning_rate": 3.400883218842002e-05, "loss": 0.0004, "step": 3260 }, { "epoch": 3.2090284592737977, "grad_norm": 0.0035232524387538433, "learning_rate": 3.395976447497547e-05, "loss": 0.0009, "step": 3270 }, { "epoch": 3.2188420019627086, "grad_norm": 0.02211836725473404, "learning_rate": 3.391069676153091e-05, "loss": 0.0004, "step": 3280 }, { "epoch": 3.228655544651619, "grad_norm": 0.0037303888238966465, "learning_rate": 3.3861629048086366e-05, "loss": 0.0006, "step": 3290 }, { "epoch": 3.2384690873405297, "grad_norm": 0.007376148831099272, "learning_rate": 3.381256133464181e-05, "loss": 0.0004, "step": 3300 }, { "epoch": 3.2482826300294407, "grad_norm": 0.003410831792280078, "learning_rate": 3.376349362119726e-05, "loss": 0.0004, "step": 3310 }, { "epoch": 3.2580961727183513, "grad_norm": 0.0033686254173517227, "learning_rate": 3.37144259077527e-05, "loss": 0.0004, "step": 3320 }, { "epoch": 3.267909715407262, "grad_norm": 0.0036628427915275097, "learning_rate": 3.366535819430814e-05, "loss": 0.0004, "step": 3330 }, { "epoch": 3.277723258096173, "grad_norm": 0.0034903271589428186, "learning_rate": 3.361629048086359e-05, "loss": 0.0676, "step": 3340 }, { "epoch": 3.2875368007850834, "grad_norm": 0.007418110966682434, "learning_rate": 3.356722276741904e-05, "loss": 0.0998, "step": 3350 }, { "epoch": 3.297350343473994, "grad_norm": 0.003807367756962776, "learning_rate": 3.351815505397449e-05, "loss": 0.0004, "step": 3360 }, { "epoch": 3.307163886162905, "grad_norm": 0.006307406350970268, "learning_rate": 3.346908734052993e-05, "loss": 0.0546, "step": 3370 }, { "epoch": 3.3169774288518155, "grad_norm": 0.004091127309948206, "learning_rate": 3.342001962708538e-05, "loss": 0.0688, "step": 3380 }, { "epoch": 3.326790971540726, "grad_norm": 0.008122970350086689, "learning_rate": 3.337095191364082e-05, "loss": 0.0535, "step": 3390 }, { "epoch": 3.336604514229637, "grad_norm": 0.2856459617614746, "learning_rate": 3.332188420019627e-05, "loss": 0.0479, "step": 3400 }, { "epoch": 3.3464180569185475, "grad_norm": 0.011355056427419186, "learning_rate": 3.327281648675172e-05, "loss": 0.062, "step": 3410 }, { "epoch": 3.356231599607458, "grad_norm": 0.010982933454215527, "learning_rate": 3.322374877330717e-05, "loss": 0.119, "step": 3420 }, { "epoch": 3.366045142296369, "grad_norm": 0.14039351046085358, "learning_rate": 3.317468105986261e-05, "loss": 0.0129, "step": 3430 }, { "epoch": 3.3758586849852796, "grad_norm": 0.005223874468356371, "learning_rate": 3.312561334641806e-05, "loss": 0.0417, "step": 3440 }, { "epoch": 3.38567222767419, "grad_norm": 0.0041849189437925816, "learning_rate": 3.30765456329735e-05, "loss": 0.036, "step": 3450 }, { "epoch": 3.395485770363101, "grad_norm": 0.004221642389893532, "learning_rate": 3.302747791952895e-05, "loss": 0.0021, "step": 3460 }, { "epoch": 3.4052993130520117, "grad_norm": 57.141910552978516, "learning_rate": 3.29784102060844e-05, "loss": 0.0982, "step": 3470 }, { "epoch": 3.4151128557409223, "grad_norm": 0.009060889482498169, "learning_rate": 3.292934249263984e-05, "loss": 0.0006, "step": 3480 }, { "epoch": 3.4249263984298333, "grad_norm": 0.003756599733605981, "learning_rate": 3.288027477919529e-05, "loss": 0.0033, "step": 3490 }, { "epoch": 3.434739941118744, "grad_norm": 0.0041136653162539005, "learning_rate": 3.2831207065750733e-05, "loss": 0.0015, "step": 3500 }, { "epoch": 3.4445534838076544, "grad_norm": 0.003665735013782978, "learning_rate": 3.278213935230618e-05, "loss": 0.0614, "step": 3510 }, { "epoch": 3.4543670264965654, "grad_norm": 0.003554809372872114, "learning_rate": 3.273307163886163e-05, "loss": 0.001, "step": 3520 }, { "epoch": 3.464180569185476, "grad_norm": 0.0034583976957947016, "learning_rate": 3.268400392541708e-05, "loss": 0.0008, "step": 3530 }, { "epoch": 3.4739941118743864, "grad_norm": 0.003728943644091487, "learning_rate": 3.263493621197252e-05, "loss": 0.0004, "step": 3540 }, { "epoch": 3.4838076545632974, "grad_norm": 0.003582128556445241, "learning_rate": 3.258586849852797e-05, "loss": 0.0004, "step": 3550 }, { "epoch": 3.493621197252208, "grad_norm": 0.0033694806043058634, "learning_rate": 3.2536800785083414e-05, "loss": 0.0004, "step": 3560 }, { "epoch": 3.5034347399411185, "grad_norm": 0.10974390059709549, "learning_rate": 3.248773307163886e-05, "loss": 0.0004, "step": 3570 }, { "epoch": 3.5132482826300295, "grad_norm": 0.003454001620411873, "learning_rate": 3.243866535819431e-05, "loss": 0.0004, "step": 3580 }, { "epoch": 3.52306182531894, "grad_norm": 0.0036948120687156916, "learning_rate": 3.238959764474976e-05, "loss": 0.0657, "step": 3590 }, { "epoch": 3.5328753680078506, "grad_norm": 0.0033204422798007727, "learning_rate": 3.23405299313052e-05, "loss": 0.0604, "step": 3600 }, { "epoch": 3.5426889106967616, "grad_norm": 0.00640474446117878, "learning_rate": 3.229146221786065e-05, "loss": 0.0013, "step": 3610 }, { "epoch": 3.552502453385672, "grad_norm": 0.0037864702753722668, "learning_rate": 3.2242394504416094e-05, "loss": 0.0388, "step": 3620 }, { "epoch": 3.5623159960745827, "grad_norm": 0.0036447476595640182, "learning_rate": 3.2193326790971536e-05, "loss": 0.0015, "step": 3630 }, { "epoch": 3.5721295387634937, "grad_norm": 0.004586994647979736, "learning_rate": 3.214425907752699e-05, "loss": 0.0008, "step": 3640 }, { "epoch": 3.5819430814524043, "grad_norm": 0.006420999765396118, "learning_rate": 3.2095191364082434e-05, "loss": 0.081, "step": 3650 }, { "epoch": 3.591756624141315, "grad_norm": 0.037869326770305634, "learning_rate": 3.204612365063788e-05, "loss": 0.0027, "step": 3660 }, { "epoch": 3.601570166830226, "grad_norm": 0.0033209428656846285, "learning_rate": 3.1997055937193325e-05, "loss": 0.0004, "step": 3670 }, { "epoch": 3.6113837095191363, "grad_norm": 0.0032525446731597185, "learning_rate": 3.1947988223748774e-05, "loss": 0.0004, "step": 3680 }, { "epoch": 3.621197252208047, "grad_norm": 0.0034604640677571297, "learning_rate": 3.189892051030422e-05, "loss": 0.0004, "step": 3690 }, { "epoch": 3.631010794896958, "grad_norm": 0.0048661488108336926, "learning_rate": 3.184985279685967e-05, "loss": 0.0006, "step": 3700 }, { "epoch": 3.6408243375858684, "grad_norm": 0.003736069891601801, "learning_rate": 3.1800785083415115e-05, "loss": 0.0004, "step": 3710 }, { "epoch": 3.650637880274779, "grad_norm": 0.0031651423778384924, "learning_rate": 3.1751717369970564e-05, "loss": 0.0004, "step": 3720 }, { "epoch": 3.66045142296369, "grad_norm": 0.0032348737586289644, "learning_rate": 3.1702649656526006e-05, "loss": 0.0004, "step": 3730 }, { "epoch": 3.6702649656526005, "grad_norm": 0.003265490522608161, "learning_rate": 3.1653581943081455e-05, "loss": 0.0004, "step": 3740 }, { "epoch": 3.680078508341511, "grad_norm": 0.18621397018432617, "learning_rate": 3.16045142296369e-05, "loss": 0.0432, "step": 3750 }, { "epoch": 3.689892051030422, "grad_norm": 49.72319793701172, "learning_rate": 3.155544651619235e-05, "loss": 0.0419, "step": 3760 }, { "epoch": 3.6997055937193326, "grad_norm": 0.003202399704605341, "learning_rate": 3.1506378802747795e-05, "loss": 0.0005, "step": 3770 }, { "epoch": 3.709519136408243, "grad_norm": 0.003484070301055908, "learning_rate": 3.145731108930324e-05, "loss": 0.0004, "step": 3780 }, { "epoch": 3.719332679097154, "grad_norm": 0.003013091627508402, "learning_rate": 3.1408243375858686e-05, "loss": 0.0004, "step": 3790 }, { "epoch": 3.7291462217860647, "grad_norm": 0.0030194155406206846, "learning_rate": 3.135917566241413e-05, "loss": 0.0008, "step": 3800 }, { "epoch": 3.7389597644749752, "grad_norm": 0.0030365772545337677, "learning_rate": 3.131010794896958e-05, "loss": 0.0004, "step": 3810 }, { "epoch": 3.7487733071638862, "grad_norm": 0.002989945001900196, "learning_rate": 3.1261040235525026e-05, "loss": 0.0006, "step": 3820 }, { "epoch": 3.758586849852797, "grad_norm": 0.0031468605156987906, "learning_rate": 3.1211972522080475e-05, "loss": 0.0005, "step": 3830 }, { "epoch": 3.7684003925417073, "grad_norm": 0.004800264723598957, "learning_rate": 3.116290480863592e-05, "loss": 0.0108, "step": 3840 }, { "epoch": 3.7782139352306183, "grad_norm": 0.0044929636642336845, "learning_rate": 3.1113837095191366e-05, "loss": 0.0009, "step": 3850 }, { "epoch": 3.788027477919529, "grad_norm": 0.0028576962649822235, "learning_rate": 3.106476938174681e-05, "loss": 0.0003, "step": 3860 }, { "epoch": 3.7978410206084394, "grad_norm": 0.0031541618518531322, "learning_rate": 3.101570166830226e-05, "loss": 0.0003, "step": 3870 }, { "epoch": 3.8076545632973504, "grad_norm": 0.0027680331841111183, "learning_rate": 3.096663395485771e-05, "loss": 0.0004, "step": 3880 }, { "epoch": 3.817468105986261, "grad_norm": 0.0027752595487982035, "learning_rate": 3.0917566241413156e-05, "loss": 0.0003, "step": 3890 }, { "epoch": 3.8272816486751715, "grad_norm": 0.0027524100150913, "learning_rate": 3.08684985279686e-05, "loss": 0.0003, "step": 3900 }, { "epoch": 3.8370951913640825, "grad_norm": 0.0028699261602014303, "learning_rate": 3.081943081452405e-05, "loss": 0.0005, "step": 3910 }, { "epoch": 3.846908734052993, "grad_norm": 0.002797529799863696, "learning_rate": 3.077036310107949e-05, "loss": 0.0003, "step": 3920 }, { "epoch": 3.8567222767419036, "grad_norm": 0.002745892619714141, "learning_rate": 3.072129538763494e-05, "loss": 0.0003, "step": 3930 }, { "epoch": 3.8665358194308146, "grad_norm": 0.0030794497579336166, "learning_rate": 3.067222767419039e-05, "loss": 0.1156, "step": 3940 }, { "epoch": 3.876349362119725, "grad_norm": 0.007240855600684881, "learning_rate": 3.062315996074583e-05, "loss": 0.0018, "step": 3950 }, { "epoch": 3.8861629048086357, "grad_norm": 0.0031701885163784027, "learning_rate": 3.057409224730128e-05, "loss": 0.0008, "step": 3960 }, { "epoch": 3.8959764474975467, "grad_norm": 0.007171397563070059, "learning_rate": 3.052502453385672e-05, "loss": 0.0545, "step": 3970 }, { "epoch": 3.9057899901864572, "grad_norm": 0.0026376626919955015, "learning_rate": 3.0475956820412173e-05, "loss": 0.1456, "step": 3980 }, { "epoch": 3.9156035328753678, "grad_norm": 0.004865538328886032, "learning_rate": 3.0426889106967615e-05, "loss": 0.054, "step": 3990 }, { "epoch": 3.9254170755642788, "grad_norm": 0.06169740855693817, "learning_rate": 3.0377821393523064e-05, "loss": 0.1127, "step": 4000 }, { "epoch": 3.9352306182531893, "grad_norm": 0.01491067185997963, "learning_rate": 3.032875368007851e-05, "loss": 0.0013, "step": 4010 }, { "epoch": 3.9450441609421, "grad_norm": 0.003037821501493454, "learning_rate": 3.027968596663396e-05, "loss": 0.1189, "step": 4020 }, { "epoch": 3.954857703631011, "grad_norm": 0.05605999380350113, "learning_rate": 3.02306182531894e-05, "loss": 0.0008, "step": 4030 }, { "epoch": 3.9646712463199214, "grad_norm": 0.0034519529435783625, "learning_rate": 3.0181550539744853e-05, "loss": 0.0003, "step": 4040 }, { "epoch": 3.974484789008832, "grad_norm": 0.0033396417275071144, "learning_rate": 3.0132482826300295e-05, "loss": 0.0431, "step": 4050 }, { "epoch": 3.984298331697743, "grad_norm": 0.002848528092727065, "learning_rate": 3.0083415112855744e-05, "loss": 0.0003, "step": 4060 }, { "epoch": 3.9941118743866535, "grad_norm": 0.09806457161903381, "learning_rate": 3.003434739941119e-05, "loss": 0.0576, "step": 4070 }, { "epoch": 4.003925417075564, "grad_norm": 0.009162220172584057, "learning_rate": 2.9985279685966632e-05, "loss": 0.0003, "step": 4080 }, { "epoch": 4.013738959764475, "grad_norm": 0.039267465472221375, "learning_rate": 2.993621197252208e-05, "loss": 0.0004, "step": 4090 }, { "epoch": 4.023552502453386, "grad_norm": 0.002605535788461566, "learning_rate": 2.9887144259077527e-05, "loss": 0.0004, "step": 4100 }, { "epoch": 4.033366045142296, "grad_norm": 0.003241546219214797, "learning_rate": 2.9838076545632976e-05, "loss": 0.0004, "step": 4110 }, { "epoch": 4.043179587831207, "grad_norm": 11.987616539001465, "learning_rate": 2.978900883218842e-05, "loss": 0.0393, "step": 4120 }, { "epoch": 4.052993130520118, "grad_norm": 0.002549890661612153, "learning_rate": 2.973994111874387e-05, "loss": 0.0011, "step": 4130 }, { "epoch": 4.062806673209028, "grad_norm": 0.002623894950374961, "learning_rate": 2.9690873405299312e-05, "loss": 0.0008, "step": 4140 }, { "epoch": 4.072620215897939, "grad_norm": 0.0023546249140053988, "learning_rate": 2.964180569185476e-05, "loss": 0.0026, "step": 4150 }, { "epoch": 4.08243375858685, "grad_norm": 0.0023659905418753624, "learning_rate": 2.9592737978410207e-05, "loss": 0.078, "step": 4160 }, { "epoch": 4.09224730127576, "grad_norm": 0.002533614169806242, "learning_rate": 2.9543670264965656e-05, "loss": 0.0745, "step": 4170 }, { "epoch": 4.102060843964671, "grad_norm": 0.012038661167025566, "learning_rate": 2.94946025515211e-05, "loss": 0.0551, "step": 4180 }, { "epoch": 4.111874386653582, "grad_norm": 26.8253173828125, "learning_rate": 2.944553483807655e-05, "loss": 0.0928, "step": 4190 }, { "epoch": 4.121687929342492, "grad_norm": 0.03977564349770546, "learning_rate": 2.9396467124631993e-05, "loss": 0.0528, "step": 4200 }, { "epoch": 4.131501472031403, "grad_norm": 0.0031746893655508757, "learning_rate": 2.934739941118744e-05, "loss": 0.0106, "step": 4210 }, { "epoch": 4.141315014720314, "grad_norm": 0.0031474102288484573, "learning_rate": 2.9298331697742887e-05, "loss": 0.0008, "step": 4220 }, { "epoch": 4.1511285574092245, "grad_norm": 0.04280337691307068, "learning_rate": 2.924926398429833e-05, "loss": 0.0004, "step": 4230 }, { "epoch": 4.1609421000981355, "grad_norm": 0.002831744961440563, "learning_rate": 2.920019627085378e-05, "loss": 0.0004, "step": 4240 }, { "epoch": 4.1707556427870465, "grad_norm": 0.002495395252481103, "learning_rate": 2.9151128557409224e-05, "loss": 0.0009, "step": 4250 }, { "epoch": 4.180569185475957, "grad_norm": 0.0024046385660767555, "learning_rate": 2.9102060843964673e-05, "loss": 0.0007, "step": 4260 }, { "epoch": 4.190382728164868, "grad_norm": 0.0030680035706609488, "learning_rate": 2.905299313052012e-05, "loss": 0.0005, "step": 4270 }, { "epoch": 4.200196270853779, "grad_norm": 0.0061622122302651405, "learning_rate": 2.9003925417075568e-05, "loss": 0.0018, "step": 4280 }, { "epoch": 4.210009813542689, "grad_norm": 0.0022845251951366663, "learning_rate": 2.895485770363101e-05, "loss": 0.0366, "step": 4290 }, { "epoch": 4.2198233562316, "grad_norm": 0.011359172873198986, "learning_rate": 2.890578999018646e-05, "loss": 0.0005, "step": 4300 }, { "epoch": 4.229636898920511, "grad_norm": 0.002846726682037115, "learning_rate": 2.8856722276741904e-05, "loss": 0.0004, "step": 4310 }, { "epoch": 4.239450441609421, "grad_norm": 0.002284892601892352, "learning_rate": 2.8807654563297353e-05, "loss": 0.0984, "step": 4320 }, { "epoch": 4.249263984298332, "grad_norm": 0.002528236713260412, "learning_rate": 2.87585868498528e-05, "loss": 0.0007, "step": 4330 }, { "epoch": 4.259077526987243, "grad_norm": 0.003352473024278879, "learning_rate": 2.8709519136408248e-05, "loss": 0.0004, "step": 4340 }, { "epoch": 4.268891069676153, "grad_norm": 0.004708737134933472, "learning_rate": 2.866045142296369e-05, "loss": 0.0013, "step": 4350 }, { "epoch": 4.278704612365064, "grad_norm": 0.358195036649704, "learning_rate": 2.8611383709519136e-05, "loss": 0.0282, "step": 4360 }, { "epoch": 4.288518155053975, "grad_norm": 0.002740907482802868, "learning_rate": 2.8562315996074585e-05, "loss": 0.1356, "step": 4370 }, { "epoch": 4.298331697742885, "grad_norm": 0.002787757897749543, "learning_rate": 2.8513248282630027e-05, "loss": 0.0028, "step": 4380 }, { "epoch": 4.308145240431796, "grad_norm": 8.950927734375, "learning_rate": 2.846418056918548e-05, "loss": 0.0024, "step": 4390 }, { "epoch": 4.317958783120707, "grad_norm": 0.0048212092369794846, "learning_rate": 2.841511285574092e-05, "loss": 0.0006, "step": 4400 }, { "epoch": 4.327772325809617, "grad_norm": 0.0025500452611595392, "learning_rate": 2.836604514229637e-05, "loss": 0.0006, "step": 4410 }, { "epoch": 4.337585868498528, "grad_norm": 0.0027642964851111174, "learning_rate": 2.8316977428851816e-05, "loss": 0.0436, "step": 4420 }, { "epoch": 4.347399411187439, "grad_norm": 0.0026419861242175102, "learning_rate": 2.8267909715407265e-05, "loss": 0.0004, "step": 4430 }, { "epoch": 4.357212953876349, "grad_norm": 0.004611722193658352, "learning_rate": 2.8218842001962707e-05, "loss": 0.0823, "step": 4440 }, { "epoch": 4.36702649656526, "grad_norm": 0.0055962237529456615, "learning_rate": 2.816977428851816e-05, "loss": 0.0005, "step": 4450 }, { "epoch": 4.376840039254171, "grad_norm": 0.004676250275224447, "learning_rate": 2.8120706575073602e-05, "loss": 0.0004, "step": 4460 }, { "epoch": 4.386653581943081, "grad_norm": 0.0034532281570136547, "learning_rate": 2.807163886162905e-05, "loss": 0.0006, "step": 4470 }, { "epoch": 4.396467124631992, "grad_norm": 0.016467634588479996, "learning_rate": 2.8022571148184496e-05, "loss": 0.0005, "step": 4480 }, { "epoch": 4.406280667320903, "grad_norm": 0.011575533077120781, "learning_rate": 2.7973503434739945e-05, "loss": 0.0004, "step": 4490 }, { "epoch": 4.416094210009813, "grad_norm": 0.002654923591762781, "learning_rate": 2.7924435721295388e-05, "loss": 0.0003, "step": 4500 }, { "epoch": 4.425907752698724, "grad_norm": 0.00244724890217185, "learning_rate": 2.7875368007850833e-05, "loss": 0.0003, "step": 4510 }, { "epoch": 4.435721295387635, "grad_norm": 0.00237859645858407, "learning_rate": 2.7826300294406282e-05, "loss": 0.0004, "step": 4520 }, { "epoch": 4.445534838076545, "grad_norm": 0.002822286682203412, "learning_rate": 2.7777232580961728e-05, "loss": 0.0003, "step": 4530 }, { "epoch": 4.455348380765456, "grad_norm": 0.0033684764057397842, "learning_rate": 2.7728164867517177e-05, "loss": 0.0568, "step": 4540 }, { "epoch": 4.465161923454367, "grad_norm": 0.003250374225899577, "learning_rate": 2.767909715407262e-05, "loss": 0.0004, "step": 4550 }, { "epoch": 4.4749754661432775, "grad_norm": 0.0025315198581665754, "learning_rate": 2.7630029440628068e-05, "loss": 0.0008, "step": 4560 }, { "epoch": 4.4847890088321885, "grad_norm": 0.010016725398600101, "learning_rate": 2.7580961727183514e-05, "loss": 0.0572, "step": 4570 }, { "epoch": 4.494602551521099, "grad_norm": 0.0029501202516257763, "learning_rate": 2.7531894013738963e-05, "loss": 0.0003, "step": 4580 }, { "epoch": 4.5044160942100095, "grad_norm": 0.002199607901275158, "learning_rate": 2.7482826300294405e-05, "loss": 0.0003, "step": 4590 }, { "epoch": 4.5142296368989205, "grad_norm": 0.002721391385421157, "learning_rate": 2.7433758586849857e-05, "loss": 0.0319, "step": 4600 }, { "epoch": 4.5240431795878315, "grad_norm": 0.0022027925588190556, "learning_rate": 2.73846908734053e-05, "loss": 0.001, "step": 4610 }, { "epoch": 4.533856722276742, "grad_norm": 0.002053765347227454, "learning_rate": 2.7335623159960748e-05, "loss": 0.0539, "step": 4620 }, { "epoch": 4.543670264965653, "grad_norm": 0.019470343366265297, "learning_rate": 2.7286555446516194e-05, "loss": 0.07, "step": 4630 }, { "epoch": 4.553483807654564, "grad_norm": 0.0020952706690877676, "learning_rate": 2.7237487733071643e-05, "loss": 0.0896, "step": 4640 }, { "epoch": 4.563297350343474, "grad_norm": 0.0032566250301897526, "learning_rate": 2.7188420019627085e-05, "loss": 0.0004, "step": 4650 }, { "epoch": 4.573110893032385, "grad_norm": 0.01735255867242813, "learning_rate": 2.713935230618253e-05, "loss": 0.0763, "step": 4660 }, { "epoch": 4.582924435721296, "grad_norm": 0.005380355753004551, "learning_rate": 2.709028459273798e-05, "loss": 0.0004, "step": 4670 }, { "epoch": 4.592737978410206, "grad_norm": 0.021537847816944122, "learning_rate": 2.7041216879293425e-05, "loss": 0.008, "step": 4680 }, { "epoch": 4.602551521099117, "grad_norm": 0.004185757599771023, "learning_rate": 2.6992149165848874e-05, "loss": 0.0004, "step": 4690 }, { "epoch": 4.612365063788028, "grad_norm": 0.012351655401289463, "learning_rate": 2.6943081452404316e-05, "loss": 0.0016, "step": 4700 }, { "epoch": 4.622178606476938, "grad_norm": 0.002798211993649602, "learning_rate": 2.6894013738959765e-05, "loss": 0.0403, "step": 4710 }, { "epoch": 4.631992149165849, "grad_norm": 0.002490241779014468, "learning_rate": 2.684494602551521e-05, "loss": 0.0003, "step": 4720 }, { "epoch": 4.64180569185476, "grad_norm": 0.020930418744683266, "learning_rate": 2.679587831207066e-05, "loss": 0.0029, "step": 4730 }, { "epoch": 4.65161923454367, "grad_norm": 37.52565383911133, "learning_rate": 2.6746810598626106e-05, "loss": 0.0143, "step": 4740 }, { "epoch": 4.661432777232581, "grad_norm": 0.002027578419074416, "learning_rate": 2.6697742885181555e-05, "loss": 0.0002, "step": 4750 }, { "epoch": 4.671246319921492, "grad_norm": 0.0019966133404523134, "learning_rate": 2.6648675171736997e-05, "loss": 0.0002, "step": 4760 }, { "epoch": 4.681059862610402, "grad_norm": 0.001950482139363885, "learning_rate": 2.6599607458292446e-05, "loss": 0.0002, "step": 4770 }, { "epoch": 4.690873405299313, "grad_norm": 0.0020267153158783913, "learning_rate": 2.655053974484789e-05, "loss": 0.0826, "step": 4780 }, { "epoch": 4.700686947988224, "grad_norm": 17.92084503173828, "learning_rate": 2.650147203140334e-05, "loss": 0.0432, "step": 4790 }, { "epoch": 4.710500490677134, "grad_norm": 0.002029955852776766, "learning_rate": 2.6452404317958786e-05, "loss": 0.0004, "step": 4800 }, { "epoch": 4.720314033366045, "grad_norm": 0.002043253742158413, "learning_rate": 2.6403336604514228e-05, "loss": 0.0004, "step": 4810 }, { "epoch": 4.730127576054956, "grad_norm": 8.624307632446289, "learning_rate": 2.6354268891069677e-05, "loss": 0.0568, "step": 4820 }, { "epoch": 4.739941118743866, "grad_norm": 0.0022134315222501755, "learning_rate": 2.6305201177625123e-05, "loss": 0.0003, "step": 4830 }, { "epoch": 4.749754661432777, "grad_norm": 0.003354401560500264, "learning_rate": 2.625613346418057e-05, "loss": 0.0004, "step": 4840 }, { "epoch": 4.759568204121688, "grad_norm": 0.025983460247516632, "learning_rate": 2.6207065750736014e-05, "loss": 0.0021, "step": 4850 }, { "epoch": 4.769381746810598, "grad_norm": 0.0028674264904111624, "learning_rate": 2.6157998037291466e-05, "loss": 0.0003, "step": 4860 }, { "epoch": 4.779195289499509, "grad_norm": 0.0024552124086767435, "learning_rate": 2.610893032384691e-05, "loss": 0.0161, "step": 4870 }, { "epoch": 4.78900883218842, "grad_norm": 0.01599975675344467, "learning_rate": 2.6059862610402357e-05, "loss": 0.0003, "step": 4880 }, { "epoch": 4.79882237487733, "grad_norm": 0.05640334263443947, "learning_rate": 2.6010794896957803e-05, "loss": 0.0003, "step": 4890 }, { "epoch": 4.808635917566241, "grad_norm": 0.1503908485174179, "learning_rate": 2.5961727183513252e-05, "loss": 0.0426, "step": 4900 }, { "epoch": 4.818449460255152, "grad_norm": 0.0021854902151972055, "learning_rate": 2.5912659470068694e-05, "loss": 0.0003, "step": 4910 }, { "epoch": 4.8282630029440625, "grad_norm": 0.0022083704825490713, "learning_rate": 2.5863591756624143e-05, "loss": 0.0002, "step": 4920 }, { "epoch": 4.8380765456329735, "grad_norm": 0.0018174449214711785, "learning_rate": 2.581452404317959e-05, "loss": 0.0441, "step": 4930 }, { "epoch": 4.8478900883218845, "grad_norm": 0.0019975032191723585, "learning_rate": 2.5765456329735038e-05, "loss": 0.0008, "step": 4940 }, { "epoch": 4.857703631010795, "grad_norm": 0.0022116098552942276, "learning_rate": 2.5716388616290483e-05, "loss": 0.0002, "step": 4950 }, { "epoch": 4.867517173699706, "grad_norm": 0.0019927374087274075, "learning_rate": 2.5667320902845926e-05, "loss": 0.0005, "step": 4960 }, { "epoch": 4.877330716388617, "grad_norm": 0.003586186794564128, "learning_rate": 2.5618253189401375e-05, "loss": 0.0357, "step": 4970 }, { "epoch": 4.887144259077527, "grad_norm": 0.006166779901832342, "learning_rate": 2.556918547595682e-05, "loss": 0.0003, "step": 4980 }, { "epoch": 4.896957801766438, "grad_norm": 0.03852635622024536, "learning_rate": 2.552011776251227e-05, "loss": 0.0005, "step": 4990 }, { "epoch": 4.906771344455349, "grad_norm": 0.001826342660933733, "learning_rate": 2.547105004906771e-05, "loss": 0.0007, "step": 5000 }, { "epoch": 4.916584887144259, "grad_norm": 0.0018040341092273593, "learning_rate": 2.5421982335623164e-05, "loss": 0.0002, "step": 5010 }, { "epoch": 4.92639842983317, "grad_norm": 0.0018869714112952352, "learning_rate": 2.5372914622178606e-05, "loss": 0.0002, "step": 5020 }, { "epoch": 4.936211972522081, "grad_norm": 0.0017143889563158154, "learning_rate": 2.5323846908734055e-05, "loss": 0.0002, "step": 5030 }, { "epoch": 4.946025515210991, "grad_norm": 0.0018076589331030846, "learning_rate": 2.52747791952895e-05, "loss": 0.0002, "step": 5040 }, { "epoch": 4.955839057899902, "grad_norm": 0.002003490924835205, "learning_rate": 2.522571148184495e-05, "loss": 0.0002, "step": 5050 }, { "epoch": 4.965652600588813, "grad_norm": 0.001990032149478793, "learning_rate": 2.517664376840039e-05, "loss": 0.0002, "step": 5060 }, { "epoch": 4.975466143277723, "grad_norm": 0.0017091418849304318, "learning_rate": 2.5127576054955844e-05, "loss": 0.0002, "step": 5070 }, { "epoch": 4.985279685966634, "grad_norm": 0.0019396455027163029, "learning_rate": 2.5078508341511286e-05, "loss": 0.0002, "step": 5080 }, { "epoch": 4.995093228655545, "grad_norm": 0.0016776375705376267, "learning_rate": 2.5029440628066735e-05, "loss": 0.0002, "step": 5090 }, { "epoch": 5.004906771344455, "grad_norm": 0.0017573200166225433, "learning_rate": 2.498037291462218e-05, "loss": 0.0442, "step": 5100 }, { "epoch": 5.014720314033366, "grad_norm": 0.0017642441671341658, "learning_rate": 2.4931305201177626e-05, "loss": 0.0002, "step": 5110 }, { "epoch": 5.024533856722277, "grad_norm": 0.0016604288248345256, "learning_rate": 2.4882237487733072e-05, "loss": 0.0016, "step": 5120 }, { "epoch": 5.034347399411187, "grad_norm": 0.001731898752041161, "learning_rate": 2.483316977428852e-05, "loss": 0.0002, "step": 5130 }, { "epoch": 5.044160942100098, "grad_norm": 0.0016634787898510695, "learning_rate": 2.4784102060843967e-05, "loss": 0.0002, "step": 5140 }, { "epoch": 5.053974484789009, "grad_norm": 0.0016294183442369103, "learning_rate": 2.4735034347399412e-05, "loss": 0.0002, "step": 5150 }, { "epoch": 5.063788027477919, "grad_norm": 0.0017350780544802547, "learning_rate": 2.468596663395486e-05, "loss": 0.0006, "step": 5160 }, { "epoch": 5.07360157016683, "grad_norm": 0.0015964311314746737, "learning_rate": 2.4636898920510303e-05, "loss": 0.0033, "step": 5170 }, { "epoch": 5.083415112855741, "grad_norm": 0.0018725765403360128, "learning_rate": 2.4587831207065752e-05, "loss": 0.0002, "step": 5180 }, { "epoch": 5.093228655544651, "grad_norm": 0.001561222830787301, "learning_rate": 2.4538763493621198e-05, "loss": 0.0002, "step": 5190 }, { "epoch": 5.103042198233562, "grad_norm": 0.0913846343755722, "learning_rate": 2.4489695780176643e-05, "loss": 0.0003, "step": 5200 }, { "epoch": 5.112855740922473, "grad_norm": 0.001611059415154159, "learning_rate": 2.4440628066732092e-05, "loss": 0.0002, "step": 5210 }, { "epoch": 5.122669283611383, "grad_norm": 0.0015166820958256721, "learning_rate": 2.4391560353287538e-05, "loss": 0.0002, "step": 5220 }, { "epoch": 5.132482826300294, "grad_norm": 0.0015743138501420617, "learning_rate": 2.4342492639842984e-05, "loss": 0.0002, "step": 5230 }, { "epoch": 5.142296368989205, "grad_norm": 0.0015384262660518289, "learning_rate": 2.429342492639843e-05, "loss": 0.0002, "step": 5240 }, { "epoch": 5.1521099116781155, "grad_norm": 0.004025735892355442, "learning_rate": 2.4244357212953878e-05, "loss": 0.1963, "step": 5250 }, { "epoch": 5.1619234543670265, "grad_norm": 0.052797187119722366, "learning_rate": 2.4195289499509324e-05, "loss": 0.001, "step": 5260 }, { "epoch": 5.1717369970559375, "grad_norm": 0.0028745972085744143, "learning_rate": 2.414622178606477e-05, "loss": 0.0236, "step": 5270 }, { "epoch": 5.181550539744848, "grad_norm": 0.005021668970584869, "learning_rate": 2.409715407262022e-05, "loss": 0.0002, "step": 5280 }, { "epoch": 5.191364082433759, "grad_norm": 0.08027222752571106, "learning_rate": 2.4048086359175664e-05, "loss": 0.0006, "step": 5290 }, { "epoch": 5.20117762512267, "grad_norm": 0.001573985326103866, "learning_rate": 2.399901864573111e-05, "loss": 0.0002, "step": 5300 }, { "epoch": 5.21099116781158, "grad_norm": 0.0015114896232262254, "learning_rate": 2.394995093228656e-05, "loss": 0.0002, "step": 5310 }, { "epoch": 5.220804710500491, "grad_norm": 0.0018572395201772451, "learning_rate": 2.3900883218842e-05, "loss": 0.0003, "step": 5320 }, { "epoch": 5.230618253189402, "grad_norm": 0.0015180202899500728, "learning_rate": 2.385181550539745e-05, "loss": 0.0003, "step": 5330 }, { "epoch": 5.240431795878312, "grad_norm": 0.0016390462405979633, "learning_rate": 2.3802747791952895e-05, "loss": 0.0002, "step": 5340 }, { "epoch": 5.250245338567223, "grad_norm": 0.0015287548303604126, "learning_rate": 2.375368007850834e-05, "loss": 0.0002, "step": 5350 }, { "epoch": 5.260058881256134, "grad_norm": 0.0014907275326550007, "learning_rate": 2.370461236506379e-05, "loss": 0.0002, "step": 5360 }, { "epoch": 5.269872423945044, "grad_norm": 0.001557844690978527, "learning_rate": 2.3655544651619236e-05, "loss": 0.0002, "step": 5370 }, { "epoch": 5.279685966633955, "grad_norm": 0.0018678128253668547, "learning_rate": 2.360647693817468e-05, "loss": 0.0002, "step": 5380 }, { "epoch": 5.289499509322866, "grad_norm": 0.0015175668522715569, "learning_rate": 2.355740922473013e-05, "loss": 0.0002, "step": 5390 }, { "epoch": 5.299313052011776, "grad_norm": 0.0014625848270952702, "learning_rate": 2.3508341511285576e-05, "loss": 0.0002, "step": 5400 }, { "epoch": 5.309126594700687, "grad_norm": 0.00429932726547122, "learning_rate": 2.345927379784102e-05, "loss": 0.0607, "step": 5410 }, { "epoch": 5.318940137389598, "grad_norm": 0.0014821887016296387, "learning_rate": 2.341020608439647e-05, "loss": 0.0033, "step": 5420 }, { "epoch": 5.328753680078508, "grad_norm": 0.001476548844948411, "learning_rate": 2.3361138370951916e-05, "loss": 0.0002, "step": 5430 }, { "epoch": 5.338567222767419, "grad_norm": 0.0014416587073355913, "learning_rate": 2.331207065750736e-05, "loss": 0.0002, "step": 5440 }, { "epoch": 5.34838076545633, "grad_norm": 0.001489553484134376, "learning_rate": 2.326300294406281e-05, "loss": 0.0698, "step": 5450 }, { "epoch": 5.35819430814524, "grad_norm": 0.004528726451098919, "learning_rate": 2.3213935230618256e-05, "loss": 0.0628, "step": 5460 }, { "epoch": 5.368007850834151, "grad_norm": 0.0017940645338967443, "learning_rate": 2.3164867517173698e-05, "loss": 0.0002, "step": 5470 }, { "epoch": 5.377821393523062, "grad_norm": 0.0015537918079644442, "learning_rate": 2.3115799803729147e-05, "loss": 0.0002, "step": 5480 }, { "epoch": 5.387634936211972, "grad_norm": 0.0015587827656418085, "learning_rate": 2.3066732090284593e-05, "loss": 0.0002, "step": 5490 }, { "epoch": 5.397448478900883, "grad_norm": 0.0015445965109393, "learning_rate": 2.301766437684004e-05, "loss": 0.0002, "step": 5500 }, { "epoch": 5.407262021589794, "grad_norm": 0.00230443780310452, "learning_rate": 2.2968596663395487e-05, "loss": 0.0002, "step": 5510 }, { "epoch": 5.417075564278704, "grad_norm": 0.001530683832243085, "learning_rate": 2.2919528949950933e-05, "loss": 0.0002, "step": 5520 }, { "epoch": 5.426889106967615, "grad_norm": 0.006643714848905802, "learning_rate": 2.287046123650638e-05, "loss": 0.0003, "step": 5530 }, { "epoch": 5.436702649656526, "grad_norm": 0.0021695613395422697, "learning_rate": 2.2821393523061828e-05, "loss": 0.0002, "step": 5540 }, { "epoch": 5.446516192345436, "grad_norm": 0.0014126679161563516, "learning_rate": 2.2772325809617273e-05, "loss": 0.0655, "step": 5550 }, { "epoch": 5.456329735034347, "grad_norm": 0.01729333959519863, "learning_rate": 2.272325809617272e-05, "loss": 0.0002, "step": 5560 }, { "epoch": 5.466143277723258, "grad_norm": 0.0014916701475158334, "learning_rate": 2.2674190382728168e-05, "loss": 0.0002, "step": 5570 }, { "epoch": 5.4759568204121685, "grad_norm": 0.001467019901610911, "learning_rate": 2.2625122669283613e-05, "loss": 0.0002, "step": 5580 }, { "epoch": 5.4857703631010795, "grad_norm": 0.0014575383393093944, "learning_rate": 2.257605495583906e-05, "loss": 0.0002, "step": 5590 }, { "epoch": 5.4955839057899905, "grad_norm": 0.0014117214595898986, "learning_rate": 2.2526987242394508e-05, "loss": 0.0002, "step": 5600 }, { "epoch": 5.505397448478901, "grad_norm": 0.0014430248411372304, "learning_rate": 2.2477919528949953e-05, "loss": 0.0002, "step": 5610 }, { "epoch": 5.5152109911678115, "grad_norm": 0.001443715300410986, "learning_rate": 2.2428851815505396e-05, "loss": 0.0002, "step": 5620 }, { "epoch": 5.5250245338567225, "grad_norm": 0.0013962725643068552, "learning_rate": 2.2379784102060845e-05, "loss": 0.0767, "step": 5630 }, { "epoch": 5.534838076545633, "grad_norm": 0.0016859096940606833, "learning_rate": 2.233071638861629e-05, "loss": 0.0314, "step": 5640 }, { "epoch": 5.544651619234544, "grad_norm": 0.0021301463712006807, "learning_rate": 2.2281648675171736e-05, "loss": 0.0968, "step": 5650 }, { "epoch": 5.554465161923455, "grad_norm": 0.003654947504401207, "learning_rate": 2.2232580961727185e-05, "loss": 0.0051, "step": 5660 }, { "epoch": 5.564278704612365, "grad_norm": 0.003530005691573024, "learning_rate": 2.218351324828263e-05, "loss": 0.0053, "step": 5670 }, { "epoch": 5.574092247301276, "grad_norm": 0.004440093878656626, "learning_rate": 2.2134445534838076e-05, "loss": 0.0012, "step": 5680 }, { "epoch": 5.583905789990187, "grad_norm": 0.0015052916714921594, "learning_rate": 2.2085377821393525e-05, "loss": 0.0013, "step": 5690 }, { "epoch": 5.593719332679097, "grad_norm": 0.0014009432634338737, "learning_rate": 2.203631010794897e-05, "loss": 0.0492, "step": 5700 }, { "epoch": 5.603532875368008, "grad_norm": 0.0015393829671666026, "learning_rate": 2.1987242394504416e-05, "loss": 0.0005, "step": 5710 }, { "epoch": 5.613346418056919, "grad_norm": 0.0039021980483084917, "learning_rate": 2.1938174681059865e-05, "loss": 0.0002, "step": 5720 }, { "epoch": 5.623159960745829, "grad_norm": 0.0014669959200546145, "learning_rate": 2.188910696761531e-05, "loss": 0.0004, "step": 5730 }, { "epoch": 5.63297350343474, "grad_norm": 0.0015139420283958316, "learning_rate": 2.1840039254170756e-05, "loss": 0.0303, "step": 5740 }, { "epoch": 5.642787046123651, "grad_norm": 0.001543746329843998, "learning_rate": 2.1790971540726205e-05, "loss": 0.0009, "step": 5750 }, { "epoch": 5.652600588812561, "grad_norm": 0.2851181924343109, "learning_rate": 2.174190382728165e-05, "loss": 0.0021, "step": 5760 }, { "epoch": 5.662414131501472, "grad_norm": 0.001427607610821724, "learning_rate": 2.1692836113837096e-05, "loss": 0.0002, "step": 5770 }, { "epoch": 5.672227674190383, "grad_norm": 0.0017000263324007392, "learning_rate": 2.1643768400392542e-05, "loss": 0.0101, "step": 5780 }, { "epoch": 5.682041216879293, "grad_norm": 0.02387947216629982, "learning_rate": 2.1594700686947988e-05, "loss": 0.0002, "step": 5790 }, { "epoch": 5.691854759568204, "grad_norm": 0.0013223286950960755, "learning_rate": 2.1545632973503437e-05, "loss": 0.0433, "step": 5800 }, { "epoch": 5.701668302257115, "grad_norm": 0.0013629156164824963, "learning_rate": 2.1496565260058882e-05, "loss": 0.0002, "step": 5810 }, { "epoch": 5.711481844946025, "grad_norm": 0.0015034314710646868, "learning_rate": 2.1447497546614328e-05, "loss": 0.0009, "step": 5820 }, { "epoch": 5.721295387634936, "grad_norm": 0.001305502257309854, "learning_rate": 2.1398429833169777e-05, "loss": 0.0002, "step": 5830 }, { "epoch": 5.731108930323847, "grad_norm": 0.0013675469672307372, "learning_rate": 2.1349362119725222e-05, "loss": 0.0002, "step": 5840 }, { "epoch": 5.740922473012757, "grad_norm": 0.0012498252326622605, "learning_rate": 2.1300294406280668e-05, "loss": 0.0285, "step": 5850 }, { "epoch": 5.750736015701668, "grad_norm": 0.001314906869083643, "learning_rate": 2.1251226692836117e-05, "loss": 0.0002, "step": 5860 }, { "epoch": 5.760549558390579, "grad_norm": 0.004441590514034033, "learning_rate": 2.1202158979391563e-05, "loss": 0.1261, "step": 5870 }, { "epoch": 5.770363101079489, "grad_norm": 0.018667880445718765, "learning_rate": 2.1153091265947008e-05, "loss": 0.0004, "step": 5880 }, { "epoch": 5.7801766437684, "grad_norm": 21.129253387451172, "learning_rate": 2.1104023552502454e-05, "loss": 0.0915, "step": 5890 }, { "epoch": 5.789990186457311, "grad_norm": 0.0012923305621370673, "learning_rate": 2.1054955839057903e-05, "loss": 0.0008, "step": 5900 }, { "epoch": 5.799803729146221, "grad_norm": 0.1054319515824318, "learning_rate": 2.1005888125613345e-05, "loss": 0.0003, "step": 5910 }, { "epoch": 5.809617271835132, "grad_norm": 0.002641110448166728, "learning_rate": 2.0956820412168794e-05, "loss": 0.0021, "step": 5920 }, { "epoch": 5.819430814524043, "grad_norm": 0.0012492777314037085, "learning_rate": 2.090775269872424e-05, "loss": 0.0002, "step": 5930 }, { "epoch": 5.8292443572129535, "grad_norm": 0.0012710640439763665, "learning_rate": 2.0858684985279685e-05, "loss": 0.0005, "step": 5940 }, { "epoch": 5.8390578999018645, "grad_norm": 0.0014566375175490975, "learning_rate": 2.0809617271835134e-05, "loss": 0.0004, "step": 5950 }, { "epoch": 5.8488714425907755, "grad_norm": 0.0022309215273708105, "learning_rate": 2.076054955839058e-05, "loss": 0.0755, "step": 5960 }, { "epoch": 5.858684985279686, "grad_norm": 0.00341408746317029, "learning_rate": 2.0711481844946025e-05, "loss": 0.014, "step": 5970 }, { "epoch": 5.868498527968597, "grad_norm": 0.001304444158449769, "learning_rate": 2.0662414131501474e-05, "loss": 0.0015, "step": 5980 }, { "epoch": 5.878312070657508, "grad_norm": 0.0012671782169491053, "learning_rate": 2.061334641805692e-05, "loss": 0.0002, "step": 5990 }, { "epoch": 5.888125613346418, "grad_norm": 0.0035885085817426443, "learning_rate": 2.0564278704612365e-05, "loss": 0.0002, "step": 6000 }, { "epoch": 5.897939156035329, "grad_norm": 0.0014621804002672434, "learning_rate": 2.0515210991167814e-05, "loss": 0.0003, "step": 6010 }, { "epoch": 5.90775269872424, "grad_norm": 0.001226249267347157, "learning_rate": 2.046614327772326e-05, "loss": 0.0002, "step": 6020 }, { "epoch": 5.91756624141315, "grad_norm": 0.0012753872433677316, "learning_rate": 2.0417075564278706e-05, "loss": 0.0002, "step": 6030 }, { "epoch": 5.927379784102061, "grad_norm": 0.0011854572221636772, "learning_rate": 2.0368007850834155e-05, "loss": 0.0002, "step": 6040 }, { "epoch": 5.937193326790972, "grad_norm": 0.0012309462763369083, "learning_rate": 2.03189401373896e-05, "loss": 0.0001, "step": 6050 }, { "epoch": 5.947006869479882, "grad_norm": 0.001222968683578074, "learning_rate": 2.0269872423945042e-05, "loss": 0.0001, "step": 6060 }, { "epoch": 5.956820412168793, "grad_norm": 0.004279905930161476, "learning_rate": 2.022080471050049e-05, "loss": 0.0002, "step": 6070 }, { "epoch": 5.966633954857704, "grad_norm": 0.0012088885996490717, "learning_rate": 2.0171736997055937e-05, "loss": 0.0001, "step": 6080 }, { "epoch": 5.976447497546614, "grad_norm": 0.0017938670935109258, "learning_rate": 2.0122669283611383e-05, "loss": 0.0001, "step": 6090 }, { "epoch": 5.986261040235525, "grad_norm": 0.033533725887537, "learning_rate": 2.007360157016683e-05, "loss": 0.0002, "step": 6100 }, { "epoch": 5.996074582924436, "grad_norm": 0.0012293298495933414, "learning_rate": 2.0024533856722277e-05, "loss": 0.0001, "step": 6110 }, { "epoch": 6.005888125613346, "grad_norm": 0.001815044553950429, "learning_rate": 1.9975466143277723e-05, "loss": 0.0002, "step": 6120 }, { "epoch": 6.015701668302257, "grad_norm": 0.001358096138574183, "learning_rate": 1.9926398429833172e-05, "loss": 0.0002, "step": 6130 }, { "epoch": 6.025515210991168, "grad_norm": 0.015642492100596428, "learning_rate": 1.9877330716388617e-05, "loss": 0.0001, "step": 6140 }, { "epoch": 6.035328753680078, "grad_norm": 0.001149074058048427, "learning_rate": 1.9828263002944063e-05, "loss": 0.0001, "step": 6150 }, { "epoch": 6.045142296368989, "grad_norm": 0.0011097900569438934, "learning_rate": 1.9779195289499512e-05, "loss": 0.0001, "step": 6160 }, { "epoch": 6.0549558390579, "grad_norm": 0.0014940439723432064, "learning_rate": 1.9730127576054957e-05, "loss": 0.0001, "step": 6170 }, { "epoch": 6.06476938174681, "grad_norm": 0.03362993523478508, "learning_rate": 1.9681059862610403e-05, "loss": 0.0004, "step": 6180 }, { "epoch": 6.074582924435721, "grad_norm": 0.16991779208183289, "learning_rate": 1.9631992149165852e-05, "loss": 0.0003, "step": 6190 }, { "epoch": 6.084396467124632, "grad_norm": 0.0011343214428052306, "learning_rate": 1.9582924435721298e-05, "loss": 0.0021, "step": 6200 }, { "epoch": 6.094210009813542, "grad_norm": 0.0011410163715481758, "learning_rate": 1.9533856722276743e-05, "loss": 0.0001, "step": 6210 }, { "epoch": 6.104023552502453, "grad_norm": 0.0011109898332506418, "learning_rate": 1.948478900883219e-05, "loss": 0.0001, "step": 6220 }, { "epoch": 6.113837095191364, "grad_norm": 0.001067674602381885, "learning_rate": 1.9435721295387634e-05, "loss": 0.0001, "step": 6230 }, { "epoch": 6.123650637880274, "grad_norm": 0.0010826255893334746, "learning_rate": 1.938665358194308e-05, "loss": 0.0001, "step": 6240 }, { "epoch": 6.133464180569185, "grad_norm": 0.0010834899730980396, "learning_rate": 1.933758586849853e-05, "loss": 0.0541, "step": 6250 }, { "epoch": 6.143277723258096, "grad_norm": 0.007662464864552021, "learning_rate": 1.9288518155053975e-05, "loss": 0.0002, "step": 6260 }, { "epoch": 6.1530912659470065, "grad_norm": 0.0014158189296722412, "learning_rate": 1.923945044160942e-05, "loss": 0.1787, "step": 6270 }, { "epoch": 6.1629048086359175, "grad_norm": 0.0036792519968003035, "learning_rate": 1.919038272816487e-05, "loss": 0.0462, "step": 6280 }, { "epoch": 6.1727183513248285, "grad_norm": 0.1517615020275116, "learning_rate": 1.9141315014720315e-05, "loss": 0.0004, "step": 6290 }, { "epoch": 6.182531894013739, "grad_norm": 0.002872257027775049, "learning_rate": 1.909224730127576e-05, "loss": 0.0003, "step": 6300 }, { "epoch": 6.19234543670265, "grad_norm": 0.0014831377193331718, "learning_rate": 1.904317958783121e-05, "loss": 0.0002, "step": 6310 }, { "epoch": 6.202158979391561, "grad_norm": 0.0015966458013281226, "learning_rate": 1.8994111874386655e-05, "loss": 0.0006, "step": 6320 }, { "epoch": 6.211972522080471, "grad_norm": 0.001315574860200286, "learning_rate": 1.89450441609421e-05, "loss": 0.0002, "step": 6330 }, { "epoch": 6.221786064769382, "grad_norm": 0.003673387225717306, "learning_rate": 1.889597644749755e-05, "loss": 0.0002, "step": 6340 }, { "epoch": 6.231599607458293, "grad_norm": 0.0022277962416410446, "learning_rate": 1.8846908734052995e-05, "loss": 0.0002, "step": 6350 }, { "epoch": 6.241413150147203, "grad_norm": 0.0013255071826279163, "learning_rate": 1.879784102060844e-05, "loss": 0.0002, "step": 6360 }, { "epoch": 6.251226692836114, "grad_norm": 0.0024367074947804213, "learning_rate": 1.8748773307163886e-05, "loss": 0.0002, "step": 6370 }, { "epoch": 6.261040235525025, "grad_norm": 0.0018190988339483738, "learning_rate": 1.8699705593719332e-05, "loss": 0.0002, "step": 6380 }, { "epoch": 6.270853778213935, "grad_norm": 0.002138520823791623, "learning_rate": 1.865063788027478e-05, "loss": 0.0002, "step": 6390 }, { "epoch": 6.280667320902846, "grad_norm": 0.0013069864362478256, "learning_rate": 1.8601570166830226e-05, "loss": 0.0002, "step": 6400 }, { "epoch": 6.290480863591757, "grad_norm": 0.0013102535158395767, "learning_rate": 1.8552502453385672e-05, "loss": 0.0003, "step": 6410 }, { "epoch": 6.300294406280667, "grad_norm": 0.004578573163598776, "learning_rate": 1.850343473994112e-05, "loss": 0.0002, "step": 6420 }, { "epoch": 6.310107948969578, "grad_norm": 0.00831854809075594, "learning_rate": 1.8454367026496567e-05, "loss": 0.0002, "step": 6430 }, { "epoch": 6.319921491658489, "grad_norm": 0.0014605351025238633, "learning_rate": 1.8405299313052012e-05, "loss": 0.0002, "step": 6440 }, { "epoch": 6.329735034347399, "grad_norm": 0.0013795517152175307, "learning_rate": 1.835623159960746e-05, "loss": 0.0002, "step": 6450 }, { "epoch": 6.33954857703631, "grad_norm": 0.0015935307601466775, "learning_rate": 1.8307163886162907e-05, "loss": 0.0002, "step": 6460 }, { "epoch": 6.349362119725221, "grad_norm": 0.0013198903761804104, "learning_rate": 1.8258096172718352e-05, "loss": 0.0003, "step": 6470 }, { "epoch": 6.359175662414131, "grad_norm": 0.002860839944332838, "learning_rate": 1.82090284592738e-05, "loss": 0.0002, "step": 6480 }, { "epoch": 6.368989205103042, "grad_norm": 0.0013555525802075863, "learning_rate": 1.8159960745829247e-05, "loss": 0.0002, "step": 6490 }, { "epoch": 6.378802747791953, "grad_norm": 0.0020145312882959843, "learning_rate": 1.811089303238469e-05, "loss": 0.0002, "step": 6500 }, { "epoch": 6.388616290480863, "grad_norm": 0.00473778136074543, "learning_rate": 1.8061825318940138e-05, "loss": 0.0001, "step": 6510 }, { "epoch": 6.398429833169774, "grad_norm": 0.0017492013284936547, "learning_rate": 1.8012757605495584e-05, "loss": 0.0002, "step": 6520 }, { "epoch": 6.408243375858685, "grad_norm": 0.0012156120501458645, "learning_rate": 1.796368989205103e-05, "loss": 0.0001, "step": 6530 }, { "epoch": 6.418056918547595, "grad_norm": 0.001362017123028636, "learning_rate": 1.7914622178606478e-05, "loss": 0.0005, "step": 6540 }, { "epoch": 6.427870461236506, "grad_norm": 0.0011874830815941095, "learning_rate": 1.7865554465161924e-05, "loss": 0.0002, "step": 6550 }, { "epoch": 6.437684003925417, "grad_norm": 0.0020989649929106236, "learning_rate": 1.781648675171737e-05, "loss": 0.0002, "step": 6560 }, { "epoch": 6.447497546614327, "grad_norm": 0.001271673827432096, "learning_rate": 1.776741903827282e-05, "loss": 0.0858, "step": 6570 }, { "epoch": 6.457311089303238, "grad_norm": 0.001192873460240662, "learning_rate": 1.7718351324828264e-05, "loss": 0.0001, "step": 6580 }, { "epoch": 6.467124631992149, "grad_norm": 0.011513526551425457, "learning_rate": 1.766928361138371e-05, "loss": 0.1562, "step": 6590 }, { "epoch": 6.4769381746810595, "grad_norm": 0.001225059386342764, "learning_rate": 1.762021589793916e-05, "loss": 0.0002, "step": 6600 }, { "epoch": 6.4867517173699705, "grad_norm": 0.0013161891838535666, "learning_rate": 1.7571148184494604e-05, "loss": 0.0074, "step": 6610 }, { "epoch": 6.4965652600588815, "grad_norm": 0.001231314497999847, "learning_rate": 1.752208047105005e-05, "loss": 0.0001, "step": 6620 }, { "epoch": 6.506378802747792, "grad_norm": 0.0012088754447177052, "learning_rate": 1.74730127576055e-05, "loss": 0.0285, "step": 6630 }, { "epoch": 6.516192345436703, "grad_norm": 0.0013558064820244908, "learning_rate": 1.7423945044160944e-05, "loss": 0.0004, "step": 6640 }, { "epoch": 6.5260058881256136, "grad_norm": 0.0016369909280911088, "learning_rate": 1.7374877330716387e-05, "loss": 0.0016, "step": 6650 }, { "epoch": 6.535819430814524, "grad_norm": 0.035988856106996536, "learning_rate": 1.7325809617271836e-05, "loss": 0.0002, "step": 6660 }, { "epoch": 6.545632973503435, "grad_norm": 0.0011288542300462723, "learning_rate": 1.727674190382728e-05, "loss": 0.0213, "step": 6670 }, { "epoch": 6.555446516192346, "grad_norm": 0.0014625934418290854, "learning_rate": 1.7227674190382727e-05, "loss": 0.0002, "step": 6680 }, { "epoch": 6.565260058881256, "grad_norm": 0.0011535960948094726, "learning_rate": 1.7178606476938176e-05, "loss": 0.0001, "step": 6690 }, { "epoch": 6.575073601570167, "grad_norm": 0.0011100315023213625, "learning_rate": 1.712953876349362e-05, "loss": 0.0002, "step": 6700 }, { "epoch": 6.584887144259078, "grad_norm": 0.0011173097882419825, "learning_rate": 1.7080471050049067e-05, "loss": 0.0001, "step": 6710 }, { "epoch": 6.594700686947988, "grad_norm": 0.0011760563356801867, "learning_rate": 1.7031403336604516e-05, "loss": 0.0002, "step": 6720 }, { "epoch": 6.604514229636899, "grad_norm": 0.0012068103533238173, "learning_rate": 1.698233562315996e-05, "loss": 0.0001, "step": 6730 }, { "epoch": 6.61432777232581, "grad_norm": 0.0010894141159951687, "learning_rate": 1.6933267909715407e-05, "loss": 0.0001, "step": 6740 }, { "epoch": 6.62414131501472, "grad_norm": 0.0014370041899383068, "learning_rate": 1.6884200196270856e-05, "loss": 0.0001, "step": 6750 }, { "epoch": 6.633954857703631, "grad_norm": 0.002420579083263874, "learning_rate": 1.68351324828263e-05, "loss": 0.0001, "step": 6760 }, { "epoch": 6.643768400392542, "grad_norm": 0.001223103143274784, "learning_rate": 1.6786064769381747e-05, "loss": 0.0001, "step": 6770 }, { "epoch": 6.653581943081452, "grad_norm": 0.0010998743819072843, "learning_rate": 1.6736997055937196e-05, "loss": 0.0314, "step": 6780 }, { "epoch": 6.663395485770363, "grad_norm": 0.00108517415355891, "learning_rate": 1.6687929342492642e-05, "loss": 0.0001, "step": 6790 }, { "epoch": 6.673209028459274, "grad_norm": 0.0011395640904083848, "learning_rate": 1.6638861629048087e-05, "loss": 0.0001, "step": 6800 }, { "epoch": 6.683022571148184, "grad_norm": 0.001564236357808113, "learning_rate": 1.6589793915603533e-05, "loss": 0.0962, "step": 6810 }, { "epoch": 6.692836113837095, "grad_norm": 0.0016074421582743526, "learning_rate": 1.654072620215898e-05, "loss": 0.0002, "step": 6820 }, { "epoch": 6.702649656526006, "grad_norm": 0.0012334993807598948, "learning_rate": 1.6491658488714428e-05, "loss": 0.0023, "step": 6830 }, { "epoch": 6.712463199214916, "grad_norm": 0.0011435603955760598, "learning_rate": 1.6442590775269873e-05, "loss": 0.0001, "step": 6840 }, { "epoch": 6.722276741903827, "grad_norm": 0.0016410372918471694, "learning_rate": 1.639352306182532e-05, "loss": 0.0298, "step": 6850 }, { "epoch": 6.732090284592738, "grad_norm": 0.0012846958125010133, "learning_rate": 1.6344455348380768e-05, "loss": 0.0001, "step": 6860 }, { "epoch": 6.741903827281648, "grad_norm": 0.0011800202773883939, "learning_rate": 1.6295387634936213e-05, "loss": 0.0002, "step": 6870 }, { "epoch": 6.751717369970559, "grad_norm": 0.0015586729859933257, "learning_rate": 1.624631992149166e-05, "loss": 0.0003, "step": 6880 }, { "epoch": 6.76153091265947, "grad_norm": 0.001090590376406908, "learning_rate": 1.6197252208047105e-05, "loss": 0.0002, "step": 6890 }, { "epoch": 6.77134445534838, "grad_norm": 0.0011874845949932933, "learning_rate": 1.6148184494602554e-05, "loss": 0.0001, "step": 6900 }, { "epoch": 6.781157998037291, "grad_norm": 0.0011030100286006927, "learning_rate": 1.6099116781158e-05, "loss": 0.0001, "step": 6910 }, { "epoch": 6.790971540726202, "grad_norm": 0.0012315625790506601, "learning_rate": 1.6050049067713445e-05, "loss": 0.0001, "step": 6920 }, { "epoch": 6.8007850834151125, "grad_norm": 0.0011062839766964316, "learning_rate": 1.6000981354268894e-05, "loss": 0.0001, "step": 6930 }, { "epoch": 6.8105986261040234, "grad_norm": 0.0011281865881755948, "learning_rate": 1.595191364082434e-05, "loss": 0.0001, "step": 6940 }, { "epoch": 6.820412168792934, "grad_norm": 0.001074342057108879, "learning_rate": 1.5902845927379785e-05, "loss": 0.0001, "step": 6950 }, { "epoch": 6.8302257114818445, "grad_norm": 0.0011061643017455935, "learning_rate": 1.585377821393523e-05, "loss": 0.0001, "step": 6960 }, { "epoch": 6.8400392541707555, "grad_norm": 0.002780759707093239, "learning_rate": 1.5804710500490676e-05, "loss": 0.0001, "step": 6970 }, { "epoch": 6.8498527968596665, "grad_norm": 0.0010947277769446373, "learning_rate": 1.5755642787046125e-05, "loss": 0.0001, "step": 6980 }, { "epoch": 6.859666339548577, "grad_norm": 0.001039006281644106, "learning_rate": 1.570657507360157e-05, "loss": 0.0001, "step": 6990 }, { "epoch": 6.869479882237488, "grad_norm": 0.0011975034140050411, "learning_rate": 1.5657507360157016e-05, "loss": 0.0001, "step": 7000 }, { "epoch": 6.879293424926399, "grad_norm": 0.0010505706304684281, "learning_rate": 1.5608439646712465e-05, "loss": 0.0001, "step": 7010 }, { "epoch": 6.889106967615309, "grad_norm": 0.001015416462905705, "learning_rate": 1.555937193326791e-05, "loss": 0.0001, "step": 7020 }, { "epoch": 6.89892051030422, "grad_norm": 0.001166634145192802, "learning_rate": 1.5510304219823356e-05, "loss": 0.0001, "step": 7030 }, { "epoch": 6.908734052993131, "grad_norm": 0.005132897291332483, "learning_rate": 1.5461236506378805e-05, "loss": 0.0001, "step": 7040 }, { "epoch": 6.918547595682041, "grad_norm": 0.001034508110024035, "learning_rate": 1.541216879293425e-05, "loss": 0.0001, "step": 7050 }, { "epoch": 6.928361138370952, "grad_norm": 0.0013660124968737364, "learning_rate": 1.5363101079489697e-05, "loss": 0.0001, "step": 7060 }, { "epoch": 6.938174681059863, "grad_norm": 0.001023141318000853, "learning_rate": 1.5314033366045146e-05, "loss": 0.0001, "step": 7070 }, { "epoch": 6.947988223748773, "grad_norm": 0.0009852561634033918, "learning_rate": 1.526496565260059e-05, "loss": 0.0006, "step": 7080 }, { "epoch": 6.957801766437684, "grad_norm": 0.0028536063618957996, "learning_rate": 1.5215897939156035e-05, "loss": 0.0001, "step": 7090 }, { "epoch": 6.967615309126595, "grad_norm": 0.0010083414381369948, "learning_rate": 1.516683022571148e-05, "loss": 0.0001, "step": 7100 }, { "epoch": 6.977428851815505, "grad_norm": 0.0009895939147099853, "learning_rate": 1.5117762512266928e-05, "loss": 0.0003, "step": 7110 }, { "epoch": 6.987242394504416, "grad_norm": 0.0009826653404161334, "learning_rate": 1.5068694798822375e-05, "loss": 0.0002, "step": 7120 }, { "epoch": 6.997055937193327, "grad_norm": 0.0010616799117997289, "learning_rate": 1.501962708537782e-05, "loss": 0.0001, "step": 7130 }, { "epoch": 7.006869479882237, "grad_norm": 0.000987286097370088, "learning_rate": 1.4970559371933268e-05, "loss": 0.0001, "step": 7140 }, { "epoch": 7.016683022571148, "grad_norm": 0.0009880246361717582, "learning_rate": 1.4921491658488715e-05, "loss": 0.0001, "step": 7150 }, { "epoch": 7.026496565260059, "grad_norm": 0.0010042705107480288, "learning_rate": 1.4872423945044161e-05, "loss": 0.0001, "step": 7160 }, { "epoch": 7.036310107948969, "grad_norm": 0.0010432158596813679, "learning_rate": 1.4823356231599608e-05, "loss": 0.0001, "step": 7170 }, { "epoch": 7.04612365063788, "grad_norm": 0.0010717209661379457, "learning_rate": 1.4774288518155056e-05, "loss": 0.0047, "step": 7180 }, { "epoch": 7.055937193326791, "grad_norm": 0.001039078924804926, "learning_rate": 1.4725220804710501e-05, "loss": 0.0001, "step": 7190 }, { "epoch": 7.065750736015701, "grad_norm": 0.0010033833095803857, "learning_rate": 1.4676153091265948e-05, "loss": 0.0002, "step": 7200 }, { "epoch": 7.075564278704612, "grad_norm": 0.003389047458767891, "learning_rate": 1.4627085377821396e-05, "loss": 0.0001, "step": 7210 }, { "epoch": 7.085377821393523, "grad_norm": 0.0009605743689462543, "learning_rate": 1.4578017664376841e-05, "loss": 0.0002, "step": 7220 }, { "epoch": 7.095191364082433, "grad_norm": 0.0016811139648780227, "learning_rate": 1.4528949950932289e-05, "loss": 0.0001, "step": 7230 }, { "epoch": 7.105004906771344, "grad_norm": 0.0009887183550745249, "learning_rate": 1.4479882237487732e-05, "loss": 0.0146, "step": 7240 }, { "epoch": 7.114818449460255, "grad_norm": 0.0009474638500250876, "learning_rate": 1.443081452404318e-05, "loss": 0.0001, "step": 7250 }, { "epoch": 7.124631992149165, "grad_norm": 0.0017863448010757565, "learning_rate": 1.4381746810598625e-05, "loss": 0.0001, "step": 7260 }, { "epoch": 7.134445534838076, "grad_norm": 0.0009406275930814445, "learning_rate": 1.4332679097154073e-05, "loss": 0.0069, "step": 7270 }, { "epoch": 7.144259077526987, "grad_norm": 0.0010239857947453856, "learning_rate": 1.428361138370952e-05, "loss": 0.1679, "step": 7280 }, { "epoch": 7.1540726202158975, "grad_norm": 0.0018188258400186896, "learning_rate": 1.4234543670264966e-05, "loss": 0.0003, "step": 7290 }, { "epoch": 7.1638861629048085, "grad_norm": 0.0012613933067768812, "learning_rate": 1.4185475956820413e-05, "loss": 0.0001, "step": 7300 }, { "epoch": 7.1736997055937195, "grad_norm": 0.019094325602054596, "learning_rate": 1.413640824337586e-05, "loss": 0.1222, "step": 7310 }, { "epoch": 7.18351324828263, "grad_norm": 0.013140466995537281, "learning_rate": 1.4087340529931306e-05, "loss": 0.0002, "step": 7320 }, { "epoch": 7.193326790971541, "grad_norm": 0.001887351623736322, "learning_rate": 1.4038272816486753e-05, "loss": 0.0028, "step": 7330 }, { "epoch": 7.203140333660452, "grad_norm": 0.008172539062798023, "learning_rate": 1.39892051030422e-05, "loss": 0.0002, "step": 7340 }, { "epoch": 7.212953876349362, "grad_norm": 0.017021648585796356, "learning_rate": 1.3940137389597646e-05, "loss": 0.0002, "step": 7350 }, { "epoch": 7.222767419038273, "grad_norm": 0.0010052472352981567, "learning_rate": 1.3891069676153093e-05, "loss": 0.0015, "step": 7360 }, { "epoch": 7.232580961727184, "grad_norm": 0.001076782587915659, "learning_rate": 1.3842001962708539e-05, "loss": 0.0048, "step": 7370 }, { "epoch": 7.242394504416094, "grad_norm": 0.05915454775094986, "learning_rate": 1.3792934249263986e-05, "loss": 0.0002, "step": 7380 }, { "epoch": 7.252208047105005, "grad_norm": 0.0009720100206322968, "learning_rate": 1.374386653581943e-05, "loss": 0.0002, "step": 7390 }, { "epoch": 7.262021589793916, "grad_norm": 0.019856898114085197, "learning_rate": 1.3694798822374877e-05, "loss": 0.0002, "step": 7400 }, { "epoch": 7.271835132482826, "grad_norm": 0.0022591969463974237, "learning_rate": 1.3645731108930323e-05, "loss": 0.0004, "step": 7410 }, { "epoch": 7.281648675171737, "grad_norm": 0.0010053004371002316, "learning_rate": 1.359666339548577e-05, "loss": 0.0001, "step": 7420 }, { "epoch": 7.291462217860648, "grad_norm": 0.0015725187258794904, "learning_rate": 1.3547595682041217e-05, "loss": 0.0002, "step": 7430 }, { "epoch": 7.301275760549558, "grad_norm": 0.0009938733419403434, "learning_rate": 1.3498527968596663e-05, "loss": 0.0001, "step": 7440 }, { "epoch": 7.311089303238469, "grad_norm": 0.0009750658646225929, "learning_rate": 1.344946025515211e-05, "loss": 0.0001, "step": 7450 }, { "epoch": 7.32090284592738, "grad_norm": 0.0026528111193329096, "learning_rate": 1.3400392541707558e-05, "loss": 0.0001, "step": 7460 }, { "epoch": 7.33071638861629, "grad_norm": 0.0010182139230892062, "learning_rate": 1.3351324828263003e-05, "loss": 0.0001, "step": 7470 }, { "epoch": 7.340529931305201, "grad_norm": 0.0009615565068088472, "learning_rate": 1.330225711481845e-05, "loss": 0.0001, "step": 7480 }, { "epoch": 7.350343473994112, "grad_norm": 0.000971368863247335, "learning_rate": 1.3253189401373898e-05, "loss": 0.0004, "step": 7490 }, { "epoch": 7.360157016683022, "grad_norm": 0.027576476335525513, "learning_rate": 1.3204121687929343e-05, "loss": 0.0002, "step": 7500 }, { "epoch": 7.369970559371933, "grad_norm": 0.0009151269332505763, "learning_rate": 1.315505397448479e-05, "loss": 0.0003, "step": 7510 }, { "epoch": 7.379784102060844, "grad_norm": 0.0013021818595007062, "learning_rate": 1.3105986261040238e-05, "loss": 0.0001, "step": 7520 }, { "epoch": 7.389597644749754, "grad_norm": 0.001062211929820478, "learning_rate": 1.3056918547595683e-05, "loss": 0.0001, "step": 7530 }, { "epoch": 7.399411187438665, "grad_norm": 114.82591247558594, "learning_rate": 1.3007850834151127e-05, "loss": 0.029, "step": 7540 }, { "epoch": 7.409224730127576, "grad_norm": 0.0009047266212292016, "learning_rate": 1.2958783120706575e-05, "loss": 0.0001, "step": 7550 }, { "epoch": 7.419038272816486, "grad_norm": 0.0017496274085715413, "learning_rate": 1.2909715407262022e-05, "loss": 0.0001, "step": 7560 }, { "epoch": 7.428851815505397, "grad_norm": 0.0009102231124415994, "learning_rate": 1.2860647693817468e-05, "loss": 0.0001, "step": 7570 }, { "epoch": 7.438665358194308, "grad_norm": 0.0017243401380255818, "learning_rate": 1.2811579980372915e-05, "loss": 0.0003, "step": 7580 }, { "epoch": 7.448478900883218, "grad_norm": 0.05692388117313385, "learning_rate": 1.2762512266928362e-05, "loss": 0.0001, "step": 7590 }, { "epoch": 7.458292443572129, "grad_norm": 0.0009312523761764169, "learning_rate": 1.2713444553483808e-05, "loss": 0.0915, "step": 7600 }, { "epoch": 7.46810598626104, "grad_norm": 0.0009480075677856803, "learning_rate": 1.2664376840039255e-05, "loss": 0.0001, "step": 7610 }, { "epoch": 7.4779195289499505, "grad_norm": 0.0009222645312547684, "learning_rate": 1.2615309126594702e-05, "loss": 0.0002, "step": 7620 }, { "epoch": 7.4877330716388615, "grad_norm": 0.0009319439996033907, "learning_rate": 1.2566241413150148e-05, "loss": 0.0001, "step": 7630 }, { "epoch": 7.4975466143277725, "grad_norm": 0.0008977550896815956, "learning_rate": 1.2517173699705595e-05, "loss": 0.0001, "step": 7640 }, { "epoch": 7.507360157016683, "grad_norm": 0.0010047757532447577, "learning_rate": 1.246810598626104e-05, "loss": 0.0001, "step": 7650 }, { "epoch": 7.517173699705594, "grad_norm": 0.0038417112082242966, "learning_rate": 1.2419038272816486e-05, "loss": 0.0001, "step": 7660 }, { "epoch": 7.526987242394505, "grad_norm": 0.0010750400833785534, "learning_rate": 1.2369970559371934e-05, "loss": 0.0001, "step": 7670 }, { "epoch": 7.536800785083415, "grad_norm": 0.0008918473613448441, "learning_rate": 1.2320902845927381e-05, "loss": 0.0001, "step": 7680 }, { "epoch": 7.546614327772326, "grad_norm": 0.0010516536422073841, "learning_rate": 1.2271835132482827e-05, "loss": 0.0001, "step": 7690 }, { "epoch": 7.556427870461237, "grad_norm": 0.0009324781713075936, "learning_rate": 1.2222767419038274e-05, "loss": 0.0001, "step": 7700 }, { "epoch": 7.566241413150147, "grad_norm": 0.0009400816052220762, "learning_rate": 1.2173699705593721e-05, "loss": 0.0001, "step": 7710 }, { "epoch": 7.576054955839058, "grad_norm": 0.0008609534706920385, "learning_rate": 1.2124631992149165e-05, "loss": 0.0001, "step": 7720 }, { "epoch": 7.585868498527969, "grad_norm": 0.0009011939982883632, "learning_rate": 1.2075564278704612e-05, "loss": 0.0001, "step": 7730 }, { "epoch": 7.595682041216879, "grad_norm": 0.0008951441268436611, "learning_rate": 1.202649656526006e-05, "loss": 0.0001, "step": 7740 }, { "epoch": 7.60549558390579, "grad_norm": 0.0008742365753278136, "learning_rate": 1.1977428851815505e-05, "loss": 0.0001, "step": 7750 }, { "epoch": 7.615309126594701, "grad_norm": 0.17169933021068573, "learning_rate": 1.1928361138370952e-05, "loss": 0.0002, "step": 7760 }, { "epoch": 7.625122669283611, "grad_norm": 0.0008738868637010455, "learning_rate": 1.18792934249264e-05, "loss": 0.0001, "step": 7770 }, { "epoch": 7.634936211972522, "grad_norm": 0.0011609562207013369, "learning_rate": 1.1830225711481845e-05, "loss": 0.0001, "step": 7780 }, { "epoch": 7.644749754661433, "grad_norm": 0.0009239889914169908, "learning_rate": 1.1781157998037293e-05, "loss": 0.0001, "step": 7790 }, { "epoch": 7.654563297350343, "grad_norm": 0.0009010569774545729, "learning_rate": 1.1732090284592738e-05, "loss": 0.0001, "step": 7800 }, { "epoch": 7.664376840039254, "grad_norm": 0.000879693659953773, "learning_rate": 1.1683022571148185e-05, "loss": 0.0001, "step": 7810 }, { "epoch": 7.674190382728165, "grad_norm": 0.0008639395819045603, "learning_rate": 1.1633954857703631e-05, "loss": 0.0001, "step": 7820 }, { "epoch": 7.684003925417075, "grad_norm": 0.0008466942235827446, "learning_rate": 1.1584887144259078e-05, "loss": 0.0001, "step": 7830 }, { "epoch": 7.693817468105986, "grad_norm": 0.0008819219656288624, "learning_rate": 1.1535819430814526e-05, "loss": 0.0001, "step": 7840 }, { "epoch": 7.703631010794897, "grad_norm": 0.009510258212685585, "learning_rate": 1.1486751717369971e-05, "loss": 0.0001, "step": 7850 }, { "epoch": 7.713444553483807, "grad_norm": 0.0008892007754184306, "learning_rate": 1.1437684003925419e-05, "loss": 0.0001, "step": 7860 }, { "epoch": 7.723258096172718, "grad_norm": 0.0009460031287744641, "learning_rate": 1.1388616290480864e-05, "loss": 0.0001, "step": 7870 }, { "epoch": 7.733071638861629, "grad_norm": 0.0008965510060079396, "learning_rate": 1.133954857703631e-05, "loss": 0.0001, "step": 7880 }, { "epoch": 7.742885181550539, "grad_norm": 0.05526250973343849, "learning_rate": 1.1290480863591757e-05, "loss": 0.0001, "step": 7890 }, { "epoch": 7.75269872423945, "grad_norm": 0.000836291816085577, "learning_rate": 1.1241413150147204e-05, "loss": 0.0001, "step": 7900 }, { "epoch": 7.762512266928361, "grad_norm": 0.0008228803635574877, "learning_rate": 1.119234543670265e-05, "loss": 0.0001, "step": 7910 }, { "epoch": 7.772325809617271, "grad_norm": 0.0009072457323782146, "learning_rate": 1.1143277723258097e-05, "loss": 0.0001, "step": 7920 }, { "epoch": 7.782139352306182, "grad_norm": 0.0010595405474305153, "learning_rate": 1.1094210009813544e-05, "loss": 0.0001, "step": 7930 }, { "epoch": 7.791952894995093, "grad_norm": 0.0008154577808454633, "learning_rate": 1.1045142296368988e-05, "loss": 0.0001, "step": 7940 }, { "epoch": 7.8017664376840035, "grad_norm": 0.0009557644953019917, "learning_rate": 1.0996074582924436e-05, "loss": 0.0001, "step": 7950 }, { "epoch": 7.8115799803729145, "grad_norm": 0.0008630304364487529, "learning_rate": 1.0947006869479883e-05, "loss": 0.0001, "step": 7960 }, { "epoch": 7.8213935230618254, "grad_norm": 0.004290347453206778, "learning_rate": 1.0897939156035329e-05, "loss": 0.0001, "step": 7970 }, { "epoch": 7.8312070657507356, "grad_norm": 0.0008026896975934505, "learning_rate": 1.0848871442590776e-05, "loss": 0.0001, "step": 7980 }, { "epoch": 7.8410206084396465, "grad_norm": 0.0008485147845931351, "learning_rate": 1.0799803729146223e-05, "loss": 0.0001, "step": 7990 }, { "epoch": 7.8508341511285575, "grad_norm": 0.0009684371179901063, "learning_rate": 1.0750736015701669e-05, "loss": 0.0001, "step": 8000 }, { "epoch": 7.860647693817468, "grad_norm": 0.00081270607188344, "learning_rate": 1.0701668302257116e-05, "loss": 0.0001, "step": 8010 }, { "epoch": 7.870461236506379, "grad_norm": 0.0008527148747816682, "learning_rate": 1.0652600588812562e-05, "loss": 0.0001, "step": 8020 }, { "epoch": 7.88027477919529, "grad_norm": 0.0011228329967707396, "learning_rate": 1.0603532875368007e-05, "loss": 0.0001, "step": 8030 }, { "epoch": 7.8900883218842, "grad_norm": 0.0011605530744418502, "learning_rate": 1.0554465161923454e-05, "loss": 0.0001, "step": 8040 }, { "epoch": 7.899901864573111, "grad_norm": 0.0008033498888835311, "learning_rate": 1.0505397448478902e-05, "loss": 0.0001, "step": 8050 }, { "epoch": 7.909715407262022, "grad_norm": 0.0008764792000874877, "learning_rate": 1.0456329735034347e-05, "loss": 0.0955, "step": 8060 }, { "epoch": 7.919528949950932, "grad_norm": 0.04982365667819977, "learning_rate": 1.0407262021589795e-05, "loss": 0.0002, "step": 8070 }, { "epoch": 7.929342492639843, "grad_norm": 0.0008406474371440709, "learning_rate": 1.0358194308145242e-05, "loss": 0.0002, "step": 8080 }, { "epoch": 7.939156035328754, "grad_norm": 0.000985965714789927, "learning_rate": 1.0309126594700687e-05, "loss": 0.0002, "step": 8090 }, { "epoch": 7.948969578017664, "grad_norm": 0.0008393987664021552, "learning_rate": 1.0260058881256133e-05, "loss": 0.0002, "step": 8100 }, { "epoch": 7.958783120706575, "grad_norm": 0.0008538268739357591, "learning_rate": 1.021099116781158e-05, "loss": 0.0001, "step": 8110 }, { "epoch": 7.968596663395486, "grad_norm": 0.0054728141985833645, "learning_rate": 1.0161923454367028e-05, "loss": 0.0001, "step": 8120 }, { "epoch": 7.978410206084396, "grad_norm": 0.0009096296853385866, "learning_rate": 1.0112855740922473e-05, "loss": 0.0001, "step": 8130 }, { "epoch": 7.988223748773307, "grad_norm": 0.0008633875986561179, "learning_rate": 1.006378802747792e-05, "loss": 0.0001, "step": 8140 }, { "epoch": 7.998037291462218, "grad_norm": 0.0009331282926723361, "learning_rate": 1.0014720314033368e-05, "loss": 0.0001, "step": 8150 }, { "epoch": 8.007850834151128, "grad_norm": 0.0007973794708959758, "learning_rate": 9.965652600588813e-06, "loss": 0.0001, "step": 8160 }, { "epoch": 8.01766437684004, "grad_norm": 0.0007764511392451823, "learning_rate": 9.916584887144259e-06, "loss": 0.0001, "step": 8170 }, { "epoch": 8.02747791952895, "grad_norm": 0.0008435621275566518, "learning_rate": 9.867517173699706e-06, "loss": 0.0001, "step": 8180 }, { "epoch": 8.03729146221786, "grad_norm": 0.0008471392211504281, "learning_rate": 9.818449460255152e-06, "loss": 0.0001, "step": 8190 }, { "epoch": 8.047105004906772, "grad_norm": 0.0015691117150709033, "learning_rate": 9.7693817468106e-06, "loss": 0.0001, "step": 8200 }, { "epoch": 8.056918547595682, "grad_norm": 0.0007785743218846619, "learning_rate": 9.720314033366046e-06, "loss": 0.0002, "step": 8210 }, { "epoch": 8.066732090284592, "grad_norm": 0.001200017984956503, "learning_rate": 9.671246319921492e-06, "loss": 0.0001, "step": 8220 }, { "epoch": 8.076545632973504, "grad_norm": 0.0007911358843557537, "learning_rate": 9.62217860647694e-06, "loss": 0.0001, "step": 8230 }, { "epoch": 8.086359175662414, "grad_norm": 0.0007746540359221399, "learning_rate": 9.573110893032385e-06, "loss": 0.0002, "step": 8240 }, { "epoch": 8.096172718351324, "grad_norm": 0.0007667599711567163, "learning_rate": 9.52404317958783e-06, "loss": 0.0001, "step": 8250 }, { "epoch": 8.105986261040236, "grad_norm": 0.0008504064753651619, "learning_rate": 9.474975466143278e-06, "loss": 0.0001, "step": 8260 }, { "epoch": 8.115799803729146, "grad_norm": 0.0007812583935447037, "learning_rate": 9.425907752698725e-06, "loss": 0.0001, "step": 8270 }, { "epoch": 8.125613346418056, "grad_norm": 0.0013848438393324614, "learning_rate": 9.37684003925417e-06, "loss": 0.0001, "step": 8280 }, { "epoch": 8.135426889106968, "grad_norm": 0.0008914385107345879, "learning_rate": 9.327772325809618e-06, "loss": 0.0001, "step": 8290 }, { "epoch": 8.145240431795878, "grad_norm": 0.0007613406050950289, "learning_rate": 9.278704612365065e-06, "loss": 0.0004, "step": 8300 }, { "epoch": 8.155053974484789, "grad_norm": 0.0007612567278556526, "learning_rate": 9.229636898920511e-06, "loss": 0.0786, "step": 8310 }, { "epoch": 8.1648675171737, "grad_norm": 0.0008015862549655139, "learning_rate": 9.180569185475956e-06, "loss": 0.0001, "step": 8320 }, { "epoch": 8.17468105986261, "grad_norm": 0.0007999239605851471, "learning_rate": 9.131501472031404e-06, "loss": 0.0001, "step": 8330 }, { "epoch": 8.18449460255152, "grad_norm": 0.0008124898886308074, "learning_rate": 9.082433758586851e-06, "loss": 0.0001, "step": 8340 }, { "epoch": 8.194308145240432, "grad_norm": 0.0007925584213808179, "learning_rate": 9.033366045142297e-06, "loss": 0.0001, "step": 8350 }, { "epoch": 8.204121687929343, "grad_norm": 0.0007474345620721579, "learning_rate": 8.984298331697744e-06, "loss": 0.0001, "step": 8360 }, { "epoch": 8.213935230618253, "grad_norm": 0.00099629582837224, "learning_rate": 8.93523061825319e-06, "loss": 0.0001, "step": 8370 }, { "epoch": 8.223748773307165, "grad_norm": 0.0007709822966717184, "learning_rate": 8.886162904808637e-06, "loss": 0.0001, "step": 8380 }, { "epoch": 8.233562315996075, "grad_norm": 0.0008605083567090333, "learning_rate": 8.837095191364082e-06, "loss": 0.0001, "step": 8390 }, { "epoch": 8.243375858684985, "grad_norm": 0.0007616875227540731, "learning_rate": 8.78802747791953e-06, "loss": 0.0001, "step": 8400 }, { "epoch": 8.253189401373897, "grad_norm": 0.0008648928487673402, "learning_rate": 8.738959764474975e-06, "loss": 0.0001, "step": 8410 }, { "epoch": 8.263002944062807, "grad_norm": 0.0007865010411478579, "learning_rate": 8.689892051030423e-06, "loss": 0.0001, "step": 8420 }, { "epoch": 8.272816486751717, "grad_norm": 0.0007759992149658501, "learning_rate": 8.64082433758587e-06, "loss": 0.0001, "step": 8430 }, { "epoch": 8.282630029440629, "grad_norm": 0.0007434001890942454, "learning_rate": 8.591756624141315e-06, "loss": 0.0001, "step": 8440 }, { "epoch": 8.292443572129539, "grad_norm": 0.0007561213569715619, "learning_rate": 8.542688910696763e-06, "loss": 0.0001, "step": 8450 }, { "epoch": 8.302257114818449, "grad_norm": 0.0008792446460574865, "learning_rate": 8.493621197252208e-06, "loss": 0.0001, "step": 8460 }, { "epoch": 8.31207065750736, "grad_norm": 0.0008201678283512592, "learning_rate": 8.444553483807654e-06, "loss": 0.0001, "step": 8470 }, { "epoch": 8.321884200196271, "grad_norm": 0.0007656855159439147, "learning_rate": 8.395485770363101e-06, "loss": 0.0001, "step": 8480 }, { "epoch": 8.331697742885181, "grad_norm": 0.0009525881614536047, "learning_rate": 8.346418056918548e-06, "loss": 0.0001, "step": 8490 }, { "epoch": 8.341511285574093, "grad_norm": 0.0007570137386210263, "learning_rate": 8.297350343473994e-06, "loss": 0.0001, "step": 8500 }, { "epoch": 8.351324828263003, "grad_norm": 0.002343561267480254, "learning_rate": 8.248282630029441e-06, "loss": 0.0001, "step": 8510 }, { "epoch": 8.361138370951913, "grad_norm": 0.000724265119060874, "learning_rate": 8.199214916584889e-06, "loss": 0.0001, "step": 8520 }, { "epoch": 8.370951913640825, "grad_norm": 0.0007559550576843321, "learning_rate": 8.150147203140333e-06, "loss": 0.0001, "step": 8530 }, { "epoch": 8.380765456329735, "grad_norm": 0.0007767178467474878, "learning_rate": 8.10107948969578e-06, "loss": 0.0001, "step": 8540 }, { "epoch": 8.390578999018645, "grad_norm": 0.0014819581992924213, "learning_rate": 8.052011776251227e-06, "loss": 0.095, "step": 8550 }, { "epoch": 8.400392541707557, "grad_norm": 0.0008053283672779799, "learning_rate": 8.002944062806673e-06, "loss": 0.0001, "step": 8560 }, { "epoch": 8.410206084396467, "grad_norm": 0.000741046154871583, "learning_rate": 7.95387634936212e-06, "loss": 0.0001, "step": 8570 }, { "epoch": 8.420019627085377, "grad_norm": 0.0009256862103939056, "learning_rate": 7.904808635917567e-06, "loss": 0.0001, "step": 8580 }, { "epoch": 8.42983316977429, "grad_norm": 0.0007935376488603652, "learning_rate": 7.855740922473013e-06, "loss": 0.0001, "step": 8590 }, { "epoch": 8.4396467124632, "grad_norm": 0.010961124673485756, "learning_rate": 7.80667320902846e-06, "loss": 0.0002, "step": 8600 }, { "epoch": 8.44946025515211, "grad_norm": 0.006000032182782888, "learning_rate": 7.757605495583906e-06, "loss": 0.0001, "step": 8610 }, { "epoch": 8.459273797841021, "grad_norm": 0.008318673819303513, "learning_rate": 7.708537782139353e-06, "loss": 0.0001, "step": 8620 }, { "epoch": 8.469087340529931, "grad_norm": 0.0007991963066160679, "learning_rate": 7.659470068694799e-06, "loss": 0.0616, "step": 8630 }, { "epoch": 8.478900883218841, "grad_norm": 0.0011054244823753834, "learning_rate": 7.610402355250246e-06, "loss": 0.0002, "step": 8640 }, { "epoch": 8.488714425907753, "grad_norm": 0.0007553680334240198, "learning_rate": 7.561334641805692e-06, "loss": 0.0001, "step": 8650 }, { "epoch": 8.498527968596663, "grad_norm": 0.0007292833179235458, "learning_rate": 7.512266928361139e-06, "loss": 0.0001, "step": 8660 }, { "epoch": 8.508341511285574, "grad_norm": 0.0007289135828614235, "learning_rate": 7.463199214916586e-06, "loss": 0.0001, "step": 8670 }, { "epoch": 8.518155053974485, "grad_norm": 0.0007795288693159819, "learning_rate": 7.414131501472031e-06, "loss": 0.0001, "step": 8680 }, { "epoch": 8.527968596663396, "grad_norm": 0.0007076899637468159, "learning_rate": 7.365063788027478e-06, "loss": 0.0001, "step": 8690 }, { "epoch": 8.537782139352306, "grad_norm": 0.0007375687710009515, "learning_rate": 7.3159960745829246e-06, "loss": 0.0001, "step": 8700 }, { "epoch": 8.547595682041218, "grad_norm": 0.0007277546101249754, "learning_rate": 7.266928361138371e-06, "loss": 0.0001, "step": 8710 }, { "epoch": 8.557409224730128, "grad_norm": 0.0007561793318018317, "learning_rate": 7.217860647693818e-06, "loss": 0.0001, "step": 8720 }, { "epoch": 8.567222767419038, "grad_norm": 0.000729912135284394, "learning_rate": 7.168792934249265e-06, "loss": 0.0001, "step": 8730 }, { "epoch": 8.57703631010795, "grad_norm": 0.0007444035727530718, "learning_rate": 7.119725220804711e-06, "loss": 0.0004, "step": 8740 }, { "epoch": 8.58684985279686, "grad_norm": 0.002724673831835389, "learning_rate": 7.0706575073601584e-06, "loss": 0.0001, "step": 8750 }, { "epoch": 8.59666339548577, "grad_norm": 0.0009111511171795428, "learning_rate": 7.021589793915603e-06, "loss": 0.0001, "step": 8760 }, { "epoch": 8.606476938174682, "grad_norm": 0.0007101638475432992, "learning_rate": 6.97252208047105e-06, "loss": 0.0638, "step": 8770 }, { "epoch": 8.616290480863592, "grad_norm": 0.0007315074326470494, "learning_rate": 6.923454367026497e-06, "loss": 0.0001, "step": 8780 }, { "epoch": 8.626104023552502, "grad_norm": 0.0007471499848179519, "learning_rate": 6.874386653581943e-06, "loss": 0.0001, "step": 8790 }, { "epoch": 8.635917566241414, "grad_norm": 0.001743357628583908, "learning_rate": 6.82531894013739e-06, "loss": 0.0001, "step": 8800 }, { "epoch": 8.645731108930324, "grad_norm": 0.0007213126518763602, "learning_rate": 6.776251226692837e-06, "loss": 0.0001, "step": 8810 }, { "epoch": 8.655544651619234, "grad_norm": 0.006596927065402269, "learning_rate": 6.7271835132482835e-06, "loss": 0.0001, "step": 8820 }, { "epoch": 8.665358194308146, "grad_norm": 0.0007276834803633392, "learning_rate": 6.678115799803729e-06, "loss": 0.0001, "step": 8830 }, { "epoch": 8.675171736997056, "grad_norm": 0.0007477464969269931, "learning_rate": 6.6290480863591756e-06, "loss": 0.0001, "step": 8840 }, { "epoch": 8.684985279685966, "grad_norm": 0.0008145422907546163, "learning_rate": 6.579980372914622e-06, "loss": 0.0758, "step": 8850 }, { "epoch": 8.694798822374878, "grad_norm": 0.0007996530621312559, "learning_rate": 6.530912659470069e-06, "loss": 0.0001, "step": 8860 }, { "epoch": 8.704612365063788, "grad_norm": 0.0007228550384752452, "learning_rate": 6.481844946025516e-06, "loss": 0.0003, "step": 8870 }, { "epoch": 8.714425907752698, "grad_norm": 0.0007497305050492287, "learning_rate": 6.432777232580962e-06, "loss": 0.0001, "step": 8880 }, { "epoch": 8.72423945044161, "grad_norm": 0.0018314624903723598, "learning_rate": 6.3837095191364094e-06, "loss": 0.0001, "step": 8890 }, { "epoch": 8.73405299313052, "grad_norm": 0.005474488250911236, "learning_rate": 6.334641805691854e-06, "loss": 0.0001, "step": 8900 }, { "epoch": 8.74386653581943, "grad_norm": 0.0007096781046129763, "learning_rate": 6.2855740922473015e-06, "loss": 0.0004, "step": 8910 }, { "epoch": 8.753680078508342, "grad_norm": 0.0007362039759755135, "learning_rate": 6.236506378802748e-06, "loss": 0.0001, "step": 8920 }, { "epoch": 8.763493621197252, "grad_norm": 0.0007442686473950744, "learning_rate": 6.187438665358194e-06, "loss": 0.0698, "step": 8930 }, { "epoch": 8.773307163886162, "grad_norm": 0.002618088386952877, "learning_rate": 6.138370951913641e-06, "loss": 0.0001, "step": 8940 }, { "epoch": 8.783120706575074, "grad_norm": 0.0007100084330886602, "learning_rate": 6.089303238469088e-06, "loss": 0.0001, "step": 8950 }, { "epoch": 8.792934249263984, "grad_norm": 0.0009615476010367274, "learning_rate": 6.040235525024534e-06, "loss": 0.0001, "step": 8960 }, { "epoch": 8.802747791952894, "grad_norm": 0.0011131309438496828, "learning_rate": 5.991167811579981e-06, "loss": 0.0001, "step": 8970 }, { "epoch": 8.812561334641806, "grad_norm": 0.000781961134634912, "learning_rate": 5.942100098135427e-06, "loss": 0.0001, "step": 8980 }, { "epoch": 8.822374877330716, "grad_norm": 0.0007456222083419561, "learning_rate": 5.893032384690874e-06, "loss": 0.0001, "step": 8990 }, { "epoch": 8.832188420019627, "grad_norm": 0.0007512273732572794, "learning_rate": 5.84396467124632e-06, "loss": 0.0001, "step": 9000 }, { "epoch": 8.842001962708538, "grad_norm": 0.0007723625167272985, "learning_rate": 5.794896957801767e-06, "loss": 0.0001, "step": 9010 }, { "epoch": 8.851815505397449, "grad_norm": 0.0006950558163225651, "learning_rate": 5.745829244357213e-06, "loss": 0.0001, "step": 9020 }, { "epoch": 8.861629048086359, "grad_norm": 0.0006956023280508816, "learning_rate": 5.69676153091266e-06, "loss": 0.0001, "step": 9030 }, { "epoch": 8.87144259077527, "grad_norm": 0.0006997225573286414, "learning_rate": 5.647693817468106e-06, "loss": 0.0001, "step": 9040 }, { "epoch": 8.88125613346418, "grad_norm": 0.0007857059827074409, "learning_rate": 5.5986261040235525e-06, "loss": 0.0001, "step": 9050 }, { "epoch": 8.89106967615309, "grad_norm": 0.0020336457528173923, "learning_rate": 5.549558390579e-06, "loss": 0.0001, "step": 9060 }, { "epoch": 8.900883218842003, "grad_norm": 0.0007115107146091759, "learning_rate": 5.500490677134445e-06, "loss": 0.0001, "step": 9070 }, { "epoch": 8.910696761530913, "grad_norm": 0.0007492152508348227, "learning_rate": 5.451422963689893e-06, "loss": 0.0001, "step": 9080 }, { "epoch": 8.920510304219823, "grad_norm": 0.0029001296497881413, "learning_rate": 5.402355250245339e-06, "loss": 0.0001, "step": 9090 }, { "epoch": 8.930323846908735, "grad_norm": 0.0006878664717078209, "learning_rate": 5.3532875368007855e-06, "loss": 0.0002, "step": 9100 }, { "epoch": 8.940137389597645, "grad_norm": 0.0007307238993234932, "learning_rate": 5.304219823356232e-06, "loss": 0.0001, "step": 9110 }, { "epoch": 8.949950932286555, "grad_norm": 0.0007018332253210247, "learning_rate": 5.255152109911678e-06, "loss": 0.0001, "step": 9120 }, { "epoch": 8.959764474975467, "grad_norm": 0.01293295156210661, "learning_rate": 5.206084396467125e-06, "loss": 0.0001, "step": 9130 }, { "epoch": 8.969578017664377, "grad_norm": 0.0006952588446438313, "learning_rate": 5.157016683022571e-06, "loss": 0.0001, "step": 9140 }, { "epoch": 8.979391560353287, "grad_norm": 0.0006813241052441299, "learning_rate": 5.107948969578018e-06, "loss": 0.0002, "step": 9150 }, { "epoch": 8.989205103042199, "grad_norm": 0.0007726841140538454, "learning_rate": 5.058881256133464e-06, "loss": 0.0001, "step": 9160 }, { "epoch": 8.999018645731109, "grad_norm": 0.0008112427312880754, "learning_rate": 5.0098135426889115e-06, "loss": 0.0001, "step": 9170 }, { "epoch": 9.008832188420019, "grad_norm": 0.0007123980321921408, "learning_rate": 4.960745829244357e-06, "loss": 0.0001, "step": 9180 }, { "epoch": 9.018645731108931, "grad_norm": 0.0006635936442762613, "learning_rate": 4.9116781157998035e-06, "loss": 0.0001, "step": 9190 }, { "epoch": 9.028459273797841, "grad_norm": 0.0006985082291066647, "learning_rate": 4.862610402355251e-06, "loss": 0.0001, "step": 9200 }, { "epoch": 9.038272816486751, "grad_norm": 0.0009679241920821369, "learning_rate": 4.813542688910697e-06, "loss": 0.0001, "step": 9210 }, { "epoch": 9.048086359175663, "grad_norm": 0.0007227755268104374, "learning_rate": 4.764474975466144e-06, "loss": 0.0001, "step": 9220 }, { "epoch": 9.057899901864573, "grad_norm": 0.0006677210330963135, "learning_rate": 4.71540726202159e-06, "loss": 0.0001, "step": 9230 }, { "epoch": 9.067713444553483, "grad_norm": 0.0007603775011375546, "learning_rate": 4.6663395485770365e-06, "loss": 0.0001, "step": 9240 }, { "epoch": 9.077526987242395, "grad_norm": 0.0007459863554686308, "learning_rate": 4.617271835132483e-06, "loss": 0.0001, "step": 9250 }, { "epoch": 9.087340529931305, "grad_norm": 0.001274227281101048, "learning_rate": 4.568204121687929e-06, "loss": 0.0001, "step": 9260 }, { "epoch": 9.097154072620215, "grad_norm": 0.0008047525770962238, "learning_rate": 4.519136408243376e-06, "loss": 0.0001, "step": 9270 }, { "epoch": 9.106967615309127, "grad_norm": 0.0009299516095779836, "learning_rate": 4.470068694798823e-06, "loss": 0.0001, "step": 9280 }, { "epoch": 9.116781157998037, "grad_norm": 0.0007023366051726043, "learning_rate": 4.421000981354269e-06, "loss": 0.0001, "step": 9290 }, { "epoch": 9.126594700686947, "grad_norm": 0.0008517011883668602, "learning_rate": 4.371933267909715e-06, "loss": 0.0001, "step": 9300 }, { "epoch": 9.13640824337586, "grad_norm": 0.0007383475895039737, "learning_rate": 4.3228655544651625e-06, "loss": 0.0001, "step": 9310 }, { "epoch": 9.14622178606477, "grad_norm": 0.0006836687098257244, "learning_rate": 4.273797841020609e-06, "loss": 0.0011, "step": 9320 }, { "epoch": 9.15603532875368, "grad_norm": 0.0007958838832564652, "learning_rate": 4.224730127576055e-06, "loss": 0.0001, "step": 9330 }, { "epoch": 9.165848871442591, "grad_norm": 0.0051173255778849125, "learning_rate": 4.175662414131502e-06, "loss": 0.0736, "step": 9340 }, { "epoch": 9.175662414131502, "grad_norm": 0.0006751357577741146, "learning_rate": 4.126594700686948e-06, "loss": 0.0001, "step": 9350 }, { "epoch": 9.185475956820412, "grad_norm": 0.0008489376050420105, "learning_rate": 4.077526987242395e-06, "loss": 0.0001, "step": 9360 }, { "epoch": 9.195289499509324, "grad_norm": 0.0006518946029245853, "learning_rate": 4.028459273797841e-06, "loss": 0.0001, "step": 9370 }, { "epoch": 9.205103042198234, "grad_norm": 0.0006742589175701141, "learning_rate": 3.9793915603532875e-06, "loss": 0.0002, "step": 9380 }, { "epoch": 9.214916584887144, "grad_norm": 0.0006998268072493374, "learning_rate": 3.930323846908735e-06, "loss": 0.0001, "step": 9390 }, { "epoch": 9.224730127576056, "grad_norm": 0.0006446267361752689, "learning_rate": 3.8812561334641804e-06, "loss": 0.0001, "step": 9400 }, { "epoch": 9.234543670264966, "grad_norm": 0.0006532249972224236, "learning_rate": 3.832188420019627e-06, "loss": 0.0001, "step": 9410 }, { "epoch": 9.244357212953876, "grad_norm": 0.0023807811085134745, "learning_rate": 3.7831207065750737e-06, "loss": 0.0279, "step": 9420 }, { "epoch": 9.254170755642788, "grad_norm": 39.921600341796875, "learning_rate": 3.7340529931305206e-06, "loss": 0.0554, "step": 9430 }, { "epoch": 9.263984298331698, "grad_norm": 0.08718841522932053, "learning_rate": 3.6849852796859666e-06, "loss": 0.0003, "step": 9440 }, { "epoch": 9.273797841020608, "grad_norm": 0.0006705551641061902, "learning_rate": 3.6359175662414135e-06, "loss": 0.0001, "step": 9450 }, { "epoch": 9.28361138370952, "grad_norm": 0.0007212602067738771, "learning_rate": 3.58684985279686e-06, "loss": 0.0001, "step": 9460 }, { "epoch": 9.29342492639843, "grad_norm": 0.0006956434808671474, "learning_rate": 3.5377821393523068e-06, "loss": 0.0001, "step": 9470 }, { "epoch": 9.30323846908734, "grad_norm": 0.0007320587756112218, "learning_rate": 3.488714425907753e-06, "loss": 0.0001, "step": 9480 }, { "epoch": 9.313052011776252, "grad_norm": 0.0006620934000238776, "learning_rate": 3.4396467124631992e-06, "loss": 0.0001, "step": 9490 }, { "epoch": 9.322865554465162, "grad_norm": 0.0008825812255963683, "learning_rate": 3.390578999018646e-06, "loss": 0.0001, "step": 9500 }, { "epoch": 9.332679097154072, "grad_norm": 0.000674651877488941, "learning_rate": 3.341511285574092e-06, "loss": 0.0001, "step": 9510 }, { "epoch": 9.342492639842984, "grad_norm": 0.0006929274532012641, "learning_rate": 3.292443572129539e-06, "loss": 0.0001, "step": 9520 }, { "epoch": 9.352306182531894, "grad_norm": 0.0007789958617649972, "learning_rate": 3.2433758586849854e-06, "loss": 0.0001, "step": 9530 }, { "epoch": 9.362119725220804, "grad_norm": 0.0006808873731642962, "learning_rate": 3.1943081452404323e-06, "loss": 0.0001, "step": 9540 }, { "epoch": 9.371933267909716, "grad_norm": 0.0006374814547598362, "learning_rate": 3.1452404317958783e-06, "loss": 0.0001, "step": 9550 }, { "epoch": 9.381746810598626, "grad_norm": 0.0006496473215520382, "learning_rate": 3.0961727183513247e-06, "loss": 0.0921, "step": 9560 }, { "epoch": 9.391560353287536, "grad_norm": 0.0006751060136593878, "learning_rate": 3.0471050049067716e-06, "loss": 0.0001, "step": 9570 }, { "epoch": 9.401373895976448, "grad_norm": 0.0006818071124143898, "learning_rate": 2.998037291462218e-06, "loss": 0.0001, "step": 9580 }, { "epoch": 9.411187438665358, "grad_norm": 0.008655051700770855, "learning_rate": 2.9489695780176645e-06, "loss": 0.0182, "step": 9590 }, { "epoch": 9.421000981354268, "grad_norm": 0.0007353053661063313, "learning_rate": 2.899901864573111e-06, "loss": 0.0001, "step": 9600 }, { "epoch": 9.43081452404318, "grad_norm": 0.0007057326729409397, "learning_rate": 2.8508341511285574e-06, "loss": 0.0001, "step": 9610 }, { "epoch": 9.44062806673209, "grad_norm": 0.0007384234922938049, "learning_rate": 2.8017664376840042e-06, "loss": 0.0004, "step": 9620 }, { "epoch": 9.450441609421, "grad_norm": 0.0007162457914091647, "learning_rate": 2.7526987242394502e-06, "loss": 0.0001, "step": 9630 }, { "epoch": 9.460255152109912, "grad_norm": 0.002138860058039427, "learning_rate": 2.703631010794897e-06, "loss": 0.0001, "step": 9640 }, { "epoch": 9.470068694798822, "grad_norm": 0.0006910859956406057, "learning_rate": 2.6545632973503435e-06, "loss": 0.0704, "step": 9650 }, { "epoch": 9.479882237487733, "grad_norm": 0.0006951667019166052, "learning_rate": 2.6054955839057904e-06, "loss": 0.0001, "step": 9660 }, { "epoch": 9.489695780176644, "grad_norm": 0.0007034169393591583, "learning_rate": 2.5564278704612364e-06, "loss": 0.0001, "step": 9670 }, { "epoch": 9.499509322865554, "grad_norm": 0.0009360564290545881, "learning_rate": 2.5073601570166833e-06, "loss": 0.0001, "step": 9680 }, { "epoch": 9.509322865554465, "grad_norm": 0.0009853884112089872, "learning_rate": 2.4582924435721297e-06, "loss": 0.0001, "step": 9690 }, { "epoch": 9.519136408243376, "grad_norm": 0.0009145635995082557, "learning_rate": 2.409224730127576e-06, "loss": 0.0001, "step": 9700 }, { "epoch": 9.528949950932287, "grad_norm": 0.000657937373034656, "learning_rate": 2.3601570166830226e-06, "loss": 0.0001, "step": 9710 }, { "epoch": 9.538763493621197, "grad_norm": 0.0009716741042211652, "learning_rate": 2.3110893032384695e-06, "loss": 0.0001, "step": 9720 }, { "epoch": 9.548577036310109, "grad_norm": 0.0008224455523304641, "learning_rate": 2.262021589793916e-06, "loss": 0.0001, "step": 9730 }, { "epoch": 9.558390578999019, "grad_norm": 0.0007166486466303468, "learning_rate": 2.212953876349362e-06, "loss": 0.0001, "step": 9740 }, { "epoch": 9.568204121687929, "grad_norm": 0.0006854226812720299, "learning_rate": 2.1638861629048088e-06, "loss": 0.0001, "step": 9750 }, { "epoch": 9.57801766437684, "grad_norm": 0.0013223073910921812, "learning_rate": 2.1148184494602552e-06, "loss": 0.0001, "step": 9760 }, { "epoch": 9.58783120706575, "grad_norm": 0.0006512215477414429, "learning_rate": 2.0657507360157017e-06, "loss": 0.0001, "step": 9770 }, { "epoch": 9.59764474975466, "grad_norm": 0.0006538184825330973, "learning_rate": 2.016683022571148e-06, "loss": 0.0001, "step": 9780 }, { "epoch": 9.607458292443573, "grad_norm": 0.0006954250857234001, "learning_rate": 1.967615309126595e-06, "loss": 0.0001, "step": 9790 }, { "epoch": 9.617271835132483, "grad_norm": 0.0006559567409567535, "learning_rate": 1.9185475956820414e-06, "loss": 0.0001, "step": 9800 }, { "epoch": 9.627085377821393, "grad_norm": 0.0012906268239021301, "learning_rate": 1.8694798822374878e-06, "loss": 0.0001, "step": 9810 }, { "epoch": 9.636898920510305, "grad_norm": 0.0006794478395022452, "learning_rate": 1.8204121687929343e-06, "loss": 0.0001, "step": 9820 }, { "epoch": 9.646712463199215, "grad_norm": 0.0007485067471861839, "learning_rate": 1.771344455348381e-06, "loss": 0.0001, "step": 9830 }, { "epoch": 9.656526005888125, "grad_norm": 0.0007018350879661739, "learning_rate": 1.7222767419038274e-06, "loss": 0.0001, "step": 9840 }, { "epoch": 9.666339548577037, "grad_norm": 0.000663910701405257, "learning_rate": 1.6732090284592738e-06, "loss": 0.0001, "step": 9850 }, { "epoch": 9.676153091265947, "grad_norm": 0.000718809780664742, "learning_rate": 1.6241413150147205e-06, "loss": 0.0001, "step": 9860 }, { "epoch": 9.685966633954857, "grad_norm": 0.0008578874403610826, "learning_rate": 1.5750736015701667e-06, "loss": 0.0001, "step": 9870 }, { "epoch": 9.695780176643769, "grad_norm": 0.0007033746223896742, "learning_rate": 1.5260058881256136e-06, "loss": 0.0001, "step": 9880 }, { "epoch": 9.70559371933268, "grad_norm": 0.00067708152346313, "learning_rate": 1.47693817468106e-06, "loss": 0.0001, "step": 9890 }, { "epoch": 9.71540726202159, "grad_norm": 0.0006639899802394211, "learning_rate": 1.4278704612365064e-06, "loss": 0.0001, "step": 9900 }, { "epoch": 9.725220804710501, "grad_norm": 0.0006598685868084431, "learning_rate": 1.3788027477919529e-06, "loss": 0.0001, "step": 9910 }, { "epoch": 9.735034347399411, "grad_norm": 0.01395090576261282, "learning_rate": 1.3297350343473993e-06, "loss": 0.0001, "step": 9920 }, { "epoch": 9.744847890088321, "grad_norm": 0.0008143746526911855, "learning_rate": 1.280667320902846e-06, "loss": 0.0001, "step": 9930 }, { "epoch": 9.754661432777233, "grad_norm": 0.0010220261756330729, "learning_rate": 1.2315996074582924e-06, "loss": 0.0001, "step": 9940 }, { "epoch": 9.764474975466143, "grad_norm": 0.003531807102262974, "learning_rate": 1.182531894013739e-06, "loss": 0.0001, "step": 9950 }, { "epoch": 9.774288518155053, "grad_norm": 0.0006864424794912338, "learning_rate": 1.1334641805691855e-06, "loss": 0.0001, "step": 9960 }, { "epoch": 9.784102060843965, "grad_norm": 0.0008860233356244862, "learning_rate": 1.0843964671246322e-06, "loss": 0.0001, "step": 9970 }, { "epoch": 9.793915603532875, "grad_norm": 0.001267165644094348, "learning_rate": 1.0353287536800786e-06, "loss": 0.0001, "step": 9980 }, { "epoch": 9.803729146221785, "grad_norm": 0.0006668745772913098, "learning_rate": 9.86261040235525e-07, "loss": 0.0001, "step": 9990 }, { "epoch": 9.813542688910697, "grad_norm": 0.000662625883705914, "learning_rate": 9.371933267909717e-07, "loss": 0.0001, "step": 10000 }, { "epoch": 9.823356231599607, "grad_norm": 0.0006619680789299309, "learning_rate": 8.881256133464181e-07, "loss": 0.0001, "step": 10010 }, { "epoch": 9.833169774288518, "grad_norm": 0.000696695176884532, "learning_rate": 8.390578999018647e-07, "loss": 0.0001, "step": 10020 }, { "epoch": 9.84298331697743, "grad_norm": 0.0006725791026838124, "learning_rate": 7.89990186457311e-07, "loss": 0.0001, "step": 10030 }, { "epoch": 9.85279685966634, "grad_norm": 0.0006575717707164586, "learning_rate": 7.409224730127577e-07, "loss": 0.0001, "step": 10040 }, { "epoch": 9.86261040235525, "grad_norm": 0.0006885197362862527, "learning_rate": 6.918547595682042e-07, "loss": 0.0001, "step": 10050 }, { "epoch": 9.872423945044162, "grad_norm": 0.0018992492696270347, "learning_rate": 6.427870461236506e-07, "loss": 0.0185, "step": 10060 }, { "epoch": 9.882237487733072, "grad_norm": 0.000654397183097899, "learning_rate": 5.937193326790972e-07, "loss": 0.0001, "step": 10070 }, { "epoch": 9.892051030421982, "grad_norm": 0.0006650349241681397, "learning_rate": 5.446516192345437e-07, "loss": 0.0001, "step": 10080 }, { "epoch": 9.901864573110894, "grad_norm": 0.0006798275862820446, "learning_rate": 4.955839057899902e-07, "loss": 0.0001, "step": 10090 }, { "epoch": 9.911678115799804, "grad_norm": 0.0006500816671177745, "learning_rate": 4.4651619234543677e-07, "loss": 0.0001, "step": 10100 }, { "epoch": 9.921491658488714, "grad_norm": 0.0008393925963900983, "learning_rate": 3.9744847890088327e-07, "loss": 0.0001, "step": 10110 }, { "epoch": 9.931305201177626, "grad_norm": 0.0007067256956361234, "learning_rate": 3.4838076545632976e-07, "loss": 0.0001, "step": 10120 }, { "epoch": 9.941118743866536, "grad_norm": 0.0007005089428275824, "learning_rate": 2.9931305201177625e-07, "loss": 0.0002, "step": 10130 }, { "epoch": 9.950932286555446, "grad_norm": 0.0006531529943458736, "learning_rate": 2.502453385672228e-07, "loss": 0.0001, "step": 10140 }, { "epoch": 9.960745829244358, "grad_norm": 0.0009129344252869487, "learning_rate": 2.0117762512266932e-07, "loss": 0.0001, "step": 10150 }, { "epoch": 9.970559371933268, "grad_norm": 0.0006892773672007024, "learning_rate": 1.521099116781158e-07, "loss": 0.0001, "step": 10160 }, { "epoch": 9.980372914622178, "grad_norm": 0.0006506032077595592, "learning_rate": 1.0304219823356231e-07, "loss": 0.0001, "step": 10170 }, { "epoch": 9.99018645731109, "grad_norm": 0.0006497541908174753, "learning_rate": 5.3974484789008834e-08, "loss": 0.0001, "step": 10180 }, { "epoch": 10.0, "grad_norm": 0.0008801518124528229, "learning_rate": 4.906771344455348e-09, "loss": 0.0001, "step": 10190 }, { "epoch": 10.0, "step": 10190, "total_flos": 6.31327239390081e+18, "train_loss": 0.024946213553118556, "train_runtime": 4093.664, "train_samples_per_second": 19.901, "train_steps_per_second": 2.489 } ], "logging_steps": 10, "max_steps": 10190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.31327239390081e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }