{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045495905368516835, "grad_norm": 10.08474414591373, "learning_rate": 5e-06, "loss": 0.4268, "step": 1 }, { "epoch": 0.0009099181073703367, "grad_norm": 7.187634396883529, "learning_rate": 4.999999897855645e-06, "loss": 0.4238, "step": 2 }, { "epoch": 0.001364877161055505, "grad_norm": 3.8327630883917294, "learning_rate": 4.9999995914225884e-06, "loss": 0.2838, "step": 3 }, { "epoch": 0.0018198362147406734, "grad_norm": 4.248807424602059, "learning_rate": 4.999999080700855e-06, "loss": 0.236, "step": 4 }, { "epoch": 0.0022747952684258415, "grad_norm": 4.089663323785212, "learning_rate": 4.999998365690486e-06, "loss": 0.2601, "step": 5 }, { "epoch": 0.00272975432211101, "grad_norm": 3.9876649053708864, "learning_rate": 4.999997446391542e-06, "loss": 0.2326, "step": 6 }, { "epoch": 0.0031847133757961785, "grad_norm": 2.9111466473566785, "learning_rate": 4.999996322804095e-06, "loss": 0.2269, "step": 7 }, { "epoch": 0.003639672429481347, "grad_norm": 2.5524867538991827, "learning_rate": 4.999994994928239e-06, "loss": 0.2052, "step": 8 }, { "epoch": 0.004094631483166515, "grad_norm": 2.5545174637937094, "learning_rate": 4.999993462764082e-06, "loss": 0.2696, "step": 9 }, { "epoch": 0.004549590536851683, "grad_norm": 2.4518559406151006, "learning_rate": 4.999991726311749e-06, "loss": 0.1618, "step": 10 }, { "epoch": 0.005004549590536852, "grad_norm": 3.405646599606387, "learning_rate": 4.999989785571382e-06, "loss": 0.2355, "step": 11 }, { "epoch": 0.00545950864422202, "grad_norm": 3.4944361203618186, "learning_rate": 4.999987640543139e-06, "loss": 0.2585, "step": 12 }, { "epoch": 0.005914467697907188, "grad_norm": 2.5057214280143674, "learning_rate": 4.999985291227196e-06, "loss": 0.2235, "step": 13 }, { "epoch": 0.006369426751592357, "grad_norm": 3.617819326198201, "learning_rate": 4.999982737623746e-06, "loss": 0.3207, "step": 14 }, { "epoch": 0.006824385805277525, "grad_norm": 2.953536091708363, "learning_rate": 4.999979979732995e-06, "loss": 0.2543, "step": 15 }, { "epoch": 0.007279344858962694, "grad_norm": 2.6415876340824465, "learning_rate": 4.999977017555171e-06, "loss": 0.174, "step": 16 }, { "epoch": 0.0077343039126478615, "grad_norm": 2.62032982183088, "learning_rate": 4.999973851090514e-06, "loss": 0.2526, "step": 17 }, { "epoch": 0.00818926296633303, "grad_norm": 2.208495589846344, "learning_rate": 4.999970480339284e-06, "loss": 0.2381, "step": 18 }, { "epoch": 0.008644222020018199, "grad_norm": 4.827328107147866, "learning_rate": 4.9999669053017564e-06, "loss": 0.2259, "step": 19 }, { "epoch": 0.009099181073703366, "grad_norm": 3.293711347019613, "learning_rate": 4.9999631259782235e-06, "loss": 0.1889, "step": 20 }, { "epoch": 0.009554140127388535, "grad_norm": 2.9449987435140708, "learning_rate": 4.999959142368993e-06, "loss": 0.1916, "step": 21 }, { "epoch": 0.010009099181073703, "grad_norm": 2.4684804441032533, "learning_rate": 4.999954954474391e-06, "loss": 0.2267, "step": 22 }, { "epoch": 0.010464058234758872, "grad_norm": 2.420072565048825, "learning_rate": 4.9999505622947594e-06, "loss": 0.1781, "step": 23 }, { "epoch": 0.01091901728844404, "grad_norm": 3.190045330917334, "learning_rate": 4.999945965830458e-06, "loss": 0.204, "step": 24 }, { "epoch": 0.011373976342129208, "grad_norm": 3.144753224980832, "learning_rate": 4.999941165081863e-06, "loss": 0.1837, "step": 25 }, { "epoch": 0.011828935395814377, "grad_norm": 2.2772166419161026, "learning_rate": 4.999936160049364e-06, "loss": 0.203, "step": 26 }, { "epoch": 0.012283894449499545, "grad_norm": 2.842182064416549, "learning_rate": 4.999930950733373e-06, "loss": 0.2594, "step": 27 }, { "epoch": 0.012738853503184714, "grad_norm": 2.689259909233601, "learning_rate": 4.999925537134312e-06, "loss": 0.1829, "step": 28 }, { "epoch": 0.013193812556869881, "grad_norm": 2.6543387078431233, "learning_rate": 4.9999199192526286e-06, "loss": 0.209, "step": 29 }, { "epoch": 0.01364877161055505, "grad_norm": 2.660710953873218, "learning_rate": 4.9999140970887775e-06, "loss": 0.2084, "step": 30 }, { "epoch": 0.014103730664240218, "grad_norm": 3.1124474906382065, "learning_rate": 4.999908070643236e-06, "loss": 0.2088, "step": 31 }, { "epoch": 0.014558689717925387, "grad_norm": 2.750714892828661, "learning_rate": 4.999901839916495e-06, "loss": 0.1738, "step": 32 }, { "epoch": 0.015013648771610554, "grad_norm": 2.6053321715737314, "learning_rate": 4.999895404909067e-06, "loss": 0.1723, "step": 33 }, { "epoch": 0.015468607825295723, "grad_norm": 2.8576481166567587, "learning_rate": 4.999888765621476e-06, "loss": 0.1729, "step": 34 }, { "epoch": 0.01592356687898089, "grad_norm": 2.773654545068012, "learning_rate": 4.999881922054264e-06, "loss": 0.1453, "step": 35 }, { "epoch": 0.01637852593266606, "grad_norm": 2.037109443657936, "learning_rate": 4.999874874207991e-06, "loss": 0.1197, "step": 36 }, { "epoch": 0.01683348498635123, "grad_norm": 2.6994551736744268, "learning_rate": 4.999867622083232e-06, "loss": 0.2238, "step": 37 }, { "epoch": 0.017288444040036398, "grad_norm": 2.634969731102202, "learning_rate": 4.99986016568058e-06, "loss": 0.2118, "step": 38 }, { "epoch": 0.017743403093721567, "grad_norm": 2.955393409573457, "learning_rate": 4.999852505000646e-06, "loss": 0.2215, "step": 39 }, { "epoch": 0.018198362147406732, "grad_norm": 2.0111122791563285, "learning_rate": 4.999844640044053e-06, "loss": 0.1216, "step": 40 }, { "epoch": 0.0186533212010919, "grad_norm": 2.7660608350268077, "learning_rate": 4.999836570811445e-06, "loss": 0.1948, "step": 41 }, { "epoch": 0.01910828025477707, "grad_norm": 2.581238704515564, "learning_rate": 4.999828297303483e-06, "loss": 0.2053, "step": 42 }, { "epoch": 0.019563239308462238, "grad_norm": 2.921825171868496, "learning_rate": 4.9998198195208405e-06, "loss": 0.2124, "step": 43 }, { "epoch": 0.020018198362147407, "grad_norm": 2.5257433259743145, "learning_rate": 4.999811137464212e-06, "loss": 0.1754, "step": 44 }, { "epoch": 0.020473157415832575, "grad_norm": 2.4051206013490947, "learning_rate": 4.999802251134307e-06, "loss": 0.2384, "step": 45 }, { "epoch": 0.020928116469517744, "grad_norm": 2.824019582183984, "learning_rate": 4.99979316053185e-06, "loss": 0.1845, "step": 46 }, { "epoch": 0.021383075523202913, "grad_norm": 2.4758052686748395, "learning_rate": 4.999783865657585e-06, "loss": 0.2639, "step": 47 }, { "epoch": 0.02183803457688808, "grad_norm": 3.3028306393170053, "learning_rate": 4.999774366512272e-06, "loss": 0.221, "step": 48 }, { "epoch": 0.022292993630573247, "grad_norm": 3.108709580219038, "learning_rate": 4.9997646630966865e-06, "loss": 0.2205, "step": 49 }, { "epoch": 0.022747952684258416, "grad_norm": 2.076369424843288, "learning_rate": 4.999754755411621e-06, "loss": 0.1336, "step": 50 }, { "epoch": 0.023202911737943584, "grad_norm": 2.7444959299225715, "learning_rate": 4.9997446434578865e-06, "loss": 0.1836, "step": 51 }, { "epoch": 0.023657870791628753, "grad_norm": 3.2836031890921418, "learning_rate": 4.999734327236307e-06, "loss": 0.1877, "step": 52 }, { "epoch": 0.024112829845313922, "grad_norm": 1.951056721435438, "learning_rate": 4.999723806747728e-06, "loss": 0.1151, "step": 53 }, { "epoch": 0.02456778889899909, "grad_norm": 2.6138639966442203, "learning_rate": 4.99971308199301e-06, "loss": 0.1363, "step": 54 }, { "epoch": 0.02502274795268426, "grad_norm": 2.444124379430723, "learning_rate": 4.999702152973025e-06, "loss": 0.1482, "step": 55 }, { "epoch": 0.025477707006369428, "grad_norm": 2.4597235759126987, "learning_rate": 4.9996910196886694e-06, "loss": 0.133, "step": 56 }, { "epoch": 0.025932666060054597, "grad_norm": 2.6784146485916343, "learning_rate": 4.999679682140852e-06, "loss": 0.1174, "step": 57 }, { "epoch": 0.026387625113739762, "grad_norm": 2.7424790633709564, "learning_rate": 4.999668140330499e-06, "loss": 0.252, "step": 58 }, { "epoch": 0.02684258416742493, "grad_norm": 3.348265074283292, "learning_rate": 4.999656394258555e-06, "loss": 0.1925, "step": 59 }, { "epoch": 0.0272975432211101, "grad_norm": 2.1154638113016193, "learning_rate": 4.999644443925978e-06, "loss": 0.1836, "step": 60 }, { "epoch": 0.027752502274795268, "grad_norm": 2.4179191653959484, "learning_rate": 4.999632289333746e-06, "loss": 0.153, "step": 61 }, { "epoch": 0.028207461328480437, "grad_norm": 3.9087207564649495, "learning_rate": 4.999619930482852e-06, "loss": 0.17, "step": 62 }, { "epoch": 0.028662420382165606, "grad_norm": 3.9984836138839994, "learning_rate": 4.999607367374304e-06, "loss": 0.2311, "step": 63 }, { "epoch": 0.029117379435850774, "grad_norm": 3.296600637312694, "learning_rate": 4.999594600009131e-06, "loss": 0.1665, "step": 64 }, { "epoch": 0.029572338489535943, "grad_norm": 3.086306216989983, "learning_rate": 4.999581628388375e-06, "loss": 0.212, "step": 65 }, { "epoch": 0.03002729754322111, "grad_norm": 2.48917207768275, "learning_rate": 4.999568452513097e-06, "loss": 0.236, "step": 66 }, { "epoch": 0.030482256596906277, "grad_norm": 2.42340749830043, "learning_rate": 4.9995550723843726e-06, "loss": 0.1917, "step": 67 }, { "epoch": 0.030937215650591446, "grad_norm": 3.0972614391682396, "learning_rate": 4.999541488003295e-06, "loss": 0.1765, "step": 68 }, { "epoch": 0.03139217470427662, "grad_norm": 2.3696589048498193, "learning_rate": 4.999527699370975e-06, "loss": 0.1814, "step": 69 }, { "epoch": 0.03184713375796178, "grad_norm": 2.875746597678631, "learning_rate": 4.99951370648854e-06, "loss": 0.1878, "step": 70 }, { "epoch": 0.03230209281164695, "grad_norm": 2.4253311315699606, "learning_rate": 4.999499509357132e-06, "loss": 0.15, "step": 71 }, { "epoch": 0.03275705186533212, "grad_norm": 2.766432808739805, "learning_rate": 4.999485107977912e-06, "loss": 0.1889, "step": 72 }, { "epoch": 0.033212010919017286, "grad_norm": 2.625328870617005, "learning_rate": 4.999470502352057e-06, "loss": 0.1719, "step": 73 }, { "epoch": 0.03366696997270246, "grad_norm": 2.982643055808138, "learning_rate": 4.999455692480759e-06, "loss": 0.2113, "step": 74 }, { "epoch": 0.034121929026387623, "grad_norm": 2.242621960634031, "learning_rate": 4.999440678365229e-06, "loss": 0.1721, "step": 75 }, { "epoch": 0.034576888080072796, "grad_norm": 2.4926186894362976, "learning_rate": 4.999425460006695e-06, "loss": 0.173, "step": 76 }, { "epoch": 0.03503184713375796, "grad_norm": 2.3671699591796305, "learning_rate": 4.9994100374063995e-06, "loss": 0.1687, "step": 77 }, { "epoch": 0.03548680618744313, "grad_norm": 3.4429608280507216, "learning_rate": 4.9993944105656035e-06, "loss": 0.2649, "step": 78 }, { "epoch": 0.0359417652411283, "grad_norm": 2.0807531109765987, "learning_rate": 4.999378579485582e-06, "loss": 0.1476, "step": 79 }, { "epoch": 0.036396724294813464, "grad_norm": 2.5883097677868334, "learning_rate": 4.999362544167632e-06, "loss": 0.162, "step": 80 }, { "epoch": 0.036851683348498636, "grad_norm": 1.9494729618347428, "learning_rate": 4.99934630461306e-06, "loss": 0.1869, "step": 81 }, { "epoch": 0.0373066424021838, "grad_norm": 3.2846426885249205, "learning_rate": 4.999329860823197e-06, "loss": 0.203, "step": 82 }, { "epoch": 0.03776160145586897, "grad_norm": 2.6587615060855616, "learning_rate": 4.999313212799383e-06, "loss": 0.1773, "step": 83 }, { "epoch": 0.03821656050955414, "grad_norm": 3.210244688238914, "learning_rate": 4.99929636054298e-06, "loss": 0.2184, "step": 84 }, { "epoch": 0.03867151956323931, "grad_norm": 2.2958732125888224, "learning_rate": 4.999279304055366e-06, "loss": 0.2084, "step": 85 }, { "epoch": 0.039126478616924476, "grad_norm": 2.3139948703024857, "learning_rate": 4.999262043337933e-06, "loss": 0.1973, "step": 86 }, { "epoch": 0.03958143767060965, "grad_norm": 2.6677501256903002, "learning_rate": 4.999244578392094e-06, "loss": 0.1808, "step": 87 }, { "epoch": 0.040036396724294813, "grad_norm": 2.1844571391295524, "learning_rate": 4.9992269092192736e-06, "loss": 0.1761, "step": 88 }, { "epoch": 0.04049135577797998, "grad_norm": 2.4616623603088947, "learning_rate": 4.9992090358209166e-06, "loss": 0.1731, "step": 89 }, { "epoch": 0.04094631483166515, "grad_norm": 2.337094817685032, "learning_rate": 4.9991909581984835e-06, "loss": 0.1714, "step": 90 }, { "epoch": 0.041401273885350316, "grad_norm": 2.769205118473802, "learning_rate": 4.999172676353451e-06, "loss": 0.1286, "step": 91 }, { "epoch": 0.04185623293903549, "grad_norm": 1.993822184781022, "learning_rate": 4.999154190287314e-06, "loss": 0.1722, "step": 92 }, { "epoch": 0.042311191992720654, "grad_norm": 2.4020441009943716, "learning_rate": 4.999135500001583e-06, "loss": 0.2235, "step": 93 }, { "epoch": 0.042766151046405826, "grad_norm": 2.0794454896454013, "learning_rate": 4.9991166054977844e-06, "loss": 0.1424, "step": 94 }, { "epoch": 0.04322111010009099, "grad_norm": 2.5362620116303636, "learning_rate": 4.999097506777463e-06, "loss": 0.1878, "step": 95 }, { "epoch": 0.04367606915377616, "grad_norm": 2.3575608544869393, "learning_rate": 4.999078203842179e-06, "loss": 0.2241, "step": 96 }, { "epoch": 0.04413102820746133, "grad_norm": 2.0445052328297217, "learning_rate": 4.999058696693511e-06, "loss": 0.1196, "step": 97 }, { "epoch": 0.044585987261146494, "grad_norm": 2.7989157148193615, "learning_rate": 4.99903898533305e-06, "loss": 0.186, "step": 98 }, { "epoch": 0.045040946314831666, "grad_norm": 2.6048410678209177, "learning_rate": 4.99901906976241e-06, "loss": 0.1675, "step": 99 }, { "epoch": 0.04549590536851683, "grad_norm": 2.232255651321915, "learning_rate": 4.998998949983217e-06, "loss": 0.1379, "step": 100 }, { "epoch": 0.045950864422202004, "grad_norm": 2.8190134265237203, "learning_rate": 4.998978625997115e-06, "loss": 0.2079, "step": 101 }, { "epoch": 0.04640582347588717, "grad_norm": 2.3706098438086003, "learning_rate": 4.998958097805765e-06, "loss": 0.141, "step": 102 }, { "epoch": 0.04686078252957234, "grad_norm": 2.44520778150716, "learning_rate": 4.9989373654108445e-06, "loss": 0.164, "step": 103 }, { "epoch": 0.047315741583257506, "grad_norm": 3.5342837078815115, "learning_rate": 4.9989164288140465e-06, "loss": 0.1548, "step": 104 }, { "epoch": 0.04777070063694268, "grad_norm": 2.0458160494053836, "learning_rate": 4.998895288017085e-06, "loss": 0.179, "step": 105 }, { "epoch": 0.048225659690627844, "grad_norm": 2.205598400099282, "learning_rate": 4.998873943021684e-06, "loss": 0.1614, "step": 106 }, { "epoch": 0.04868061874431301, "grad_norm": 2.511554629528065, "learning_rate": 4.998852393829589e-06, "loss": 0.1659, "step": 107 }, { "epoch": 0.04913557779799818, "grad_norm": 3.219796004043862, "learning_rate": 4.9988306404425625e-06, "loss": 0.2276, "step": 108 }, { "epoch": 0.049590536851683346, "grad_norm": 1.752131198173806, "learning_rate": 4.99880868286238e-06, "loss": 0.1742, "step": 109 }, { "epoch": 0.05004549590536852, "grad_norm": 3.361908404370123, "learning_rate": 4.998786521090836e-06, "loss": 0.1724, "step": 110 }, { "epoch": 0.050500454959053684, "grad_norm": 2.360660279895669, "learning_rate": 4.9987641551297426e-06, "loss": 0.1999, "step": 111 }, { "epoch": 0.050955414012738856, "grad_norm": 2.307324595436377, "learning_rate": 4.998741584980926e-06, "loss": 0.2101, "step": 112 }, { "epoch": 0.05141037306642402, "grad_norm": 2.6034298836542247, "learning_rate": 4.9987188106462314e-06, "loss": 0.167, "step": 113 }, { "epoch": 0.051865332120109194, "grad_norm": 1.5842459657245014, "learning_rate": 4.99869583212752e-06, "loss": 0.1538, "step": 114 }, { "epoch": 0.05232029117379436, "grad_norm": 2.627805184680893, "learning_rate": 4.9986726494266694e-06, "loss": 0.2522, "step": 115 }, { "epoch": 0.052775250227479524, "grad_norm": 2.5410809044474907, "learning_rate": 4.998649262545574e-06, "loss": 0.1776, "step": 116 }, { "epoch": 0.053230209281164696, "grad_norm": 2.076630177156468, "learning_rate": 4.998625671486144e-06, "loss": 0.1828, "step": 117 }, { "epoch": 0.05368516833484986, "grad_norm": 2.5484627386038343, "learning_rate": 4.998601876250308e-06, "loss": 0.1781, "step": 118 }, { "epoch": 0.054140127388535034, "grad_norm": 2.0245969343413983, "learning_rate": 4.998577876840011e-06, "loss": 0.1157, "step": 119 }, { "epoch": 0.0545950864422202, "grad_norm": 2.1240696181789143, "learning_rate": 4.9985536732572124e-06, "loss": 0.2097, "step": 120 }, { "epoch": 0.05505004549590537, "grad_norm": 2.4280518543324776, "learning_rate": 4.998529265503891e-06, "loss": 0.1631, "step": 121 }, { "epoch": 0.055505004549590536, "grad_norm": 2.203499108228096, "learning_rate": 4.9985046535820416e-06, "loss": 0.2094, "step": 122 }, { "epoch": 0.05595996360327571, "grad_norm": 1.7616968616285278, "learning_rate": 4.998479837493675e-06, "loss": 0.1265, "step": 123 }, { "epoch": 0.056414922656960874, "grad_norm": 2.790115396130319, "learning_rate": 4.9984548172408195e-06, "loss": 0.162, "step": 124 }, { "epoch": 0.05686988171064604, "grad_norm": 2.7234581680187087, "learning_rate": 4.998429592825519e-06, "loss": 0.1901, "step": 125 }, { "epoch": 0.05732484076433121, "grad_norm": 2.7369239231742375, "learning_rate": 4.998404164249835e-06, "loss": 0.2289, "step": 126 }, { "epoch": 0.05777979981801638, "grad_norm": 2.145081624481222, "learning_rate": 4.998378531515845e-06, "loss": 0.1267, "step": 127 }, { "epoch": 0.05823475887170155, "grad_norm": 3.4112888898442586, "learning_rate": 4.998352694625645e-06, "loss": 0.1536, "step": 128 }, { "epoch": 0.058689717925386714, "grad_norm": 1.8616422473229426, "learning_rate": 4.998326653581343e-06, "loss": 0.1342, "step": 129 }, { "epoch": 0.059144676979071886, "grad_norm": 2.107533644057457, "learning_rate": 4.998300408385072e-06, "loss": 0.1774, "step": 130 }, { "epoch": 0.05959963603275705, "grad_norm": 3.079768243729869, "learning_rate": 4.998273959038972e-06, "loss": 0.228, "step": 131 }, { "epoch": 0.06005459508644222, "grad_norm": 1.7403897659478, "learning_rate": 4.998247305545207e-06, "loss": 0.1257, "step": 132 }, { "epoch": 0.06050955414012739, "grad_norm": 1.663929944748691, "learning_rate": 4.998220447905953e-06, "loss": 0.1857, "step": 133 }, { "epoch": 0.060964513193812554, "grad_norm": 2.604082553460826, "learning_rate": 4.998193386123408e-06, "loss": 0.1724, "step": 134 }, { "epoch": 0.061419472247497726, "grad_norm": 2.662434521006077, "learning_rate": 4.99816612019978e-06, "loss": 0.1858, "step": 135 }, { "epoch": 0.06187443130118289, "grad_norm": 2.747586314783755, "learning_rate": 4.998138650137298e-06, "loss": 0.1764, "step": 136 }, { "epoch": 0.062329390354868064, "grad_norm": 2.299433423879838, "learning_rate": 4.998110975938208e-06, "loss": 0.2321, "step": 137 }, { "epoch": 0.06278434940855324, "grad_norm": 2.527715242455789, "learning_rate": 4.998083097604769e-06, "loss": 0.2159, "step": 138 }, { "epoch": 0.0632393084622384, "grad_norm": 2.5218619075726285, "learning_rate": 4.998055015139261e-06, "loss": 0.1608, "step": 139 }, { "epoch": 0.06369426751592357, "grad_norm": 3.0047644164754495, "learning_rate": 4.998026728543979e-06, "loss": 0.2065, "step": 140 }, { "epoch": 0.06414922656960874, "grad_norm": 2.178572369709547, "learning_rate": 4.997998237821233e-06, "loss": 0.1865, "step": 141 }, { "epoch": 0.0646041856232939, "grad_norm": 1.5759272732327654, "learning_rate": 4.997969542973352e-06, "loss": 0.141, "step": 142 }, { "epoch": 0.06505914467697907, "grad_norm": 2.0811820514545554, "learning_rate": 4.997940644002681e-06, "loss": 0.1676, "step": 143 }, { "epoch": 0.06551410373066424, "grad_norm": 3.4671123551644403, "learning_rate": 4.997911540911581e-06, "loss": 0.2163, "step": 144 }, { "epoch": 0.06596906278434941, "grad_norm": 2.2842746412883312, "learning_rate": 4.99788223370243e-06, "loss": 0.1677, "step": 145 }, { "epoch": 0.06642402183803457, "grad_norm": 2.3367815299616734, "learning_rate": 4.9978527223776245e-06, "loss": 0.1811, "step": 146 }, { "epoch": 0.06687898089171974, "grad_norm": 2.088943555321838, "learning_rate": 4.9978230069395735e-06, "loss": 0.1627, "step": 147 }, { "epoch": 0.06733393994540492, "grad_norm": 2.5972570174963474, "learning_rate": 4.9977930873907065e-06, "loss": 0.1415, "step": 148 }, { "epoch": 0.06778889899909009, "grad_norm": 2.3401595363726595, "learning_rate": 4.997762963733468e-06, "loss": 0.148, "step": 149 }, { "epoch": 0.06824385805277525, "grad_norm": 2.894021920414895, "learning_rate": 4.997732635970321e-06, "loss": 0.2262, "step": 150 }, { "epoch": 0.06869881710646042, "grad_norm": 1.7373422038949267, "learning_rate": 4.9977021041037425e-06, "loss": 0.1697, "step": 151 }, { "epoch": 0.06915377616014559, "grad_norm": 2.5175987385537697, "learning_rate": 4.9976713681362265e-06, "loss": 0.2353, "step": 152 }, { "epoch": 0.06960873521383075, "grad_norm": 2.4396682297474563, "learning_rate": 4.997640428070286e-06, "loss": 0.2143, "step": 153 }, { "epoch": 0.07006369426751592, "grad_norm": 2.2947939267715087, "learning_rate": 4.99760928390845e-06, "loss": 0.1369, "step": 154 }, { "epoch": 0.0705186533212011, "grad_norm": 2.4758802729165326, "learning_rate": 4.997577935653262e-06, "loss": 0.1498, "step": 155 }, { "epoch": 0.07097361237488627, "grad_norm": 2.283530414912182, "learning_rate": 4.9975463833072835e-06, "loss": 0.1558, "step": 156 }, { "epoch": 0.07142857142857142, "grad_norm": 1.785546461501872, "learning_rate": 4.997514626873093e-06, "loss": 0.1548, "step": 157 }, { "epoch": 0.0718835304822566, "grad_norm": 2.5778925367686645, "learning_rate": 4.997482666353287e-06, "loss": 0.1568, "step": 158 }, { "epoch": 0.07233848953594177, "grad_norm": 2.14376664899083, "learning_rate": 4.997450501750476e-06, "loss": 0.169, "step": 159 }, { "epoch": 0.07279344858962693, "grad_norm": 1.7889496418860382, "learning_rate": 4.997418133067288e-06, "loss": 0.1178, "step": 160 }, { "epoch": 0.0732484076433121, "grad_norm": 2.734023407734539, "learning_rate": 4.997385560306368e-06, "loss": 0.2024, "step": 161 }, { "epoch": 0.07370336669699727, "grad_norm": 2.438529690680932, "learning_rate": 4.997352783470379e-06, "loss": 0.1877, "step": 162 }, { "epoch": 0.07415832575068244, "grad_norm": 2.358353345441234, "learning_rate": 4.997319802561997e-06, "loss": 0.1349, "step": 163 }, { "epoch": 0.0746132848043676, "grad_norm": 2.1448042331352677, "learning_rate": 4.9972866175839196e-06, "loss": 0.1268, "step": 164 }, { "epoch": 0.07506824385805277, "grad_norm": 2.279102892849676, "learning_rate": 4.9972532285388575e-06, "loss": 0.1799, "step": 165 }, { "epoch": 0.07552320291173795, "grad_norm": 2.5140889210625543, "learning_rate": 4.997219635429538e-06, "loss": 0.1876, "step": 166 }, { "epoch": 0.07597816196542312, "grad_norm": 2.6687467871063664, "learning_rate": 4.997185838258709e-06, "loss": 0.1787, "step": 167 }, { "epoch": 0.07643312101910828, "grad_norm": 3.3415050416363354, "learning_rate": 4.997151837029129e-06, "loss": 0.1799, "step": 168 }, { "epoch": 0.07688808007279345, "grad_norm": 1.9269629920973084, "learning_rate": 4.997117631743579e-06, "loss": 0.1397, "step": 169 }, { "epoch": 0.07734303912647862, "grad_norm": 3.00621227688512, "learning_rate": 4.997083222404852e-06, "loss": 0.1967, "step": 170 }, { "epoch": 0.07779799818016378, "grad_norm": 2.2615169475731327, "learning_rate": 4.997048609015762e-06, "loss": 0.1288, "step": 171 }, { "epoch": 0.07825295723384895, "grad_norm": 2.4342779650863724, "learning_rate": 4.997013791579136e-06, "loss": 0.186, "step": 172 }, { "epoch": 0.07870791628753412, "grad_norm": 2.4576007392784542, "learning_rate": 4.996978770097819e-06, "loss": 0.1577, "step": 173 }, { "epoch": 0.0791628753412193, "grad_norm": 2.4106466164039766, "learning_rate": 4.996943544574673e-06, "loss": 0.1886, "step": 174 }, { "epoch": 0.07961783439490445, "grad_norm": 2.5961861603572225, "learning_rate": 4.996908115012576e-06, "loss": 0.1621, "step": 175 }, { "epoch": 0.08007279344858963, "grad_norm": 2.833499016976519, "learning_rate": 4.996872481414425e-06, "loss": 0.1818, "step": 176 }, { "epoch": 0.0805277525022748, "grad_norm": 3.5757833649912834, "learning_rate": 4.9968366437831305e-06, "loss": 0.2517, "step": 177 }, { "epoch": 0.08098271155595996, "grad_norm": 1.5552303076468192, "learning_rate": 4.99680060212162e-06, "loss": 0.1245, "step": 178 }, { "epoch": 0.08143767060964513, "grad_norm": 2.2202920086611213, "learning_rate": 4.996764356432841e-06, "loss": 0.2174, "step": 179 }, { "epoch": 0.0818926296633303, "grad_norm": 2.1293059669722196, "learning_rate": 4.996727906719754e-06, "loss": 0.1605, "step": 180 }, { "epoch": 0.08234758871701547, "grad_norm": 2.212380091830394, "learning_rate": 4.9966912529853365e-06, "loss": 0.125, "step": 181 }, { "epoch": 0.08280254777070063, "grad_norm": 2.1098748731042507, "learning_rate": 4.996654395232585e-06, "loss": 0.17, "step": 182 }, { "epoch": 0.0832575068243858, "grad_norm": 2.3315908475718183, "learning_rate": 4.996617333464512e-06, "loss": 0.1678, "step": 183 }, { "epoch": 0.08371246587807098, "grad_norm": 2.100678357161413, "learning_rate": 4.996580067684145e-06, "loss": 0.1512, "step": 184 }, { "epoch": 0.08416742493175614, "grad_norm": 1.6542642571071706, "learning_rate": 4.996542597894528e-06, "loss": 0.1875, "step": 185 }, { "epoch": 0.08462238398544131, "grad_norm": 1.500567296289452, "learning_rate": 4.996504924098726e-06, "loss": 0.1579, "step": 186 }, { "epoch": 0.08507734303912648, "grad_norm": 1.5859042172394868, "learning_rate": 4.9964670462998145e-06, "loss": 0.146, "step": 187 }, { "epoch": 0.08553230209281165, "grad_norm": 1.7178165607526288, "learning_rate": 4.99642896450089e-06, "loss": 0.2372, "step": 188 }, { "epoch": 0.08598726114649681, "grad_norm": 2.8492778772061484, "learning_rate": 4.9963906787050656e-06, "loss": 0.2504, "step": 189 }, { "epoch": 0.08644222020018198, "grad_norm": 1.9406179967433874, "learning_rate": 4.996352188915467e-06, "loss": 0.1733, "step": 190 }, { "epoch": 0.08689717925386715, "grad_norm": 2.811015878830941, "learning_rate": 4.996313495135242e-06, "loss": 0.2133, "step": 191 }, { "epoch": 0.08735213830755233, "grad_norm": 2.222839682156962, "learning_rate": 4.9962745973675505e-06, "loss": 0.2113, "step": 192 }, { "epoch": 0.08780709736123748, "grad_norm": 2.6159522481523343, "learning_rate": 4.996235495615572e-06, "loss": 0.1622, "step": 193 }, { "epoch": 0.08826205641492266, "grad_norm": 2.3708185697184847, "learning_rate": 4.996196189882503e-06, "loss": 0.1685, "step": 194 }, { "epoch": 0.08871701546860783, "grad_norm": 3.228308382699869, "learning_rate": 4.996156680171552e-06, "loss": 0.2332, "step": 195 }, { "epoch": 0.08917197452229299, "grad_norm": 2.351705904801359, "learning_rate": 4.996116966485951e-06, "loss": 0.1816, "step": 196 }, { "epoch": 0.08962693357597816, "grad_norm": 2.320092450855665, "learning_rate": 4.996077048828944e-06, "loss": 0.2321, "step": 197 }, { "epoch": 0.09008189262966333, "grad_norm": 1.960036016410063, "learning_rate": 4.996036927203793e-06, "loss": 0.1745, "step": 198 }, { "epoch": 0.0905368516833485, "grad_norm": 2.3679323522005573, "learning_rate": 4.995996601613775e-06, "loss": 0.1927, "step": 199 }, { "epoch": 0.09099181073703366, "grad_norm": 2.1775512973195723, "learning_rate": 4.9959560720621875e-06, "loss": 0.1576, "step": 200 }, { "epoch": 0.09144676979071883, "grad_norm": 2.286317354363178, "learning_rate": 4.995915338552341e-06, "loss": 0.2184, "step": 201 }, { "epoch": 0.09190172884440401, "grad_norm": 2.0945800180559275, "learning_rate": 4.995874401087565e-06, "loss": 0.1572, "step": 202 }, { "epoch": 0.09235668789808917, "grad_norm": 2.741714725855865, "learning_rate": 4.9958332596712035e-06, "loss": 0.2087, "step": 203 }, { "epoch": 0.09281164695177434, "grad_norm": 3.0871074584367864, "learning_rate": 4.99579191430662e-06, "loss": 0.1968, "step": 204 }, { "epoch": 0.09326660600545951, "grad_norm": 1.9723075192584005, "learning_rate": 4.995750364997192e-06, "loss": 0.1507, "step": 205 }, { "epoch": 0.09372156505914468, "grad_norm": 1.8988997770559113, "learning_rate": 4.995708611746314e-06, "loss": 0.1288, "step": 206 }, { "epoch": 0.09417652411282984, "grad_norm": 2.420700916830186, "learning_rate": 4.995666654557399e-06, "loss": 0.1988, "step": 207 }, { "epoch": 0.09463148316651501, "grad_norm": 2.370720479747693, "learning_rate": 4.995624493433876e-06, "loss": 0.2215, "step": 208 }, { "epoch": 0.09508644222020018, "grad_norm": 2.2764445558307607, "learning_rate": 4.995582128379189e-06, "loss": 0.1984, "step": 209 }, { "epoch": 0.09554140127388536, "grad_norm": 2.382102062046725, "learning_rate": 4.9955395593968e-06, "loss": 0.2535, "step": 210 }, { "epoch": 0.09599636032757052, "grad_norm": 2.833827673252778, "learning_rate": 4.99549678649019e-06, "loss": 0.1998, "step": 211 }, { "epoch": 0.09645131938125569, "grad_norm": 2.486472694935685, "learning_rate": 4.99545380966285e-06, "loss": 0.2118, "step": 212 }, { "epoch": 0.09690627843494086, "grad_norm": 3.0088319794179883, "learning_rate": 4.995410628918294e-06, "loss": 0.1584, "step": 213 }, { "epoch": 0.09736123748862602, "grad_norm": 1.975326638907469, "learning_rate": 4.995367244260052e-06, "loss": 0.1832, "step": 214 }, { "epoch": 0.09781619654231119, "grad_norm": 1.9912128526989044, "learning_rate": 4.995323655691667e-06, "loss": 0.1346, "step": 215 }, { "epoch": 0.09827115559599636, "grad_norm": 2.603090937917312, "learning_rate": 4.995279863216702e-06, "loss": 0.2124, "step": 216 }, { "epoch": 0.09872611464968153, "grad_norm": 2.053886430988171, "learning_rate": 4.995235866838735e-06, "loss": 0.1567, "step": 217 }, { "epoch": 0.09918107370336669, "grad_norm": 2.039621450617981, "learning_rate": 4.995191666561361e-06, "loss": 0.1694, "step": 218 }, { "epoch": 0.09963603275705187, "grad_norm": 2.0601930905500394, "learning_rate": 4.995147262388192e-06, "loss": 0.1264, "step": 219 }, { "epoch": 0.10009099181073704, "grad_norm": 3.3199244613439802, "learning_rate": 4.995102654322858e-06, "loss": 0.2204, "step": 220 }, { "epoch": 0.1005459508644222, "grad_norm": 2.1212806825874906, "learning_rate": 4.995057842369002e-06, "loss": 0.1122, "step": 221 }, { "epoch": 0.10100090991810737, "grad_norm": 2.157454599738766, "learning_rate": 4.995012826530287e-06, "loss": 0.1977, "step": 222 }, { "epoch": 0.10145586897179254, "grad_norm": 1.9698536511203952, "learning_rate": 4.99496760681039e-06, "loss": 0.1934, "step": 223 }, { "epoch": 0.10191082802547771, "grad_norm": 1.9533190562259675, "learning_rate": 4.994922183213009e-06, "loss": 0.1686, "step": 224 }, { "epoch": 0.10236578707916287, "grad_norm": 1.8311151598660917, "learning_rate": 4.9948765557418535e-06, "loss": 0.1376, "step": 225 }, { "epoch": 0.10282074613284804, "grad_norm": 2.6814547442935766, "learning_rate": 4.994830724400653e-06, "loss": 0.2536, "step": 226 }, { "epoch": 0.10327570518653321, "grad_norm": 2.065521179879655, "learning_rate": 4.994784689193151e-06, "loss": 0.1594, "step": 227 }, { "epoch": 0.10373066424021839, "grad_norm": 2.082741947039302, "learning_rate": 4.994738450123111e-06, "loss": 0.1792, "step": 228 }, { "epoch": 0.10418562329390355, "grad_norm": 2.1268040832192896, "learning_rate": 4.994692007194312e-06, "loss": 0.1746, "step": 229 }, { "epoch": 0.10464058234758872, "grad_norm": 1.6028966765046104, "learning_rate": 4.994645360410547e-06, "loss": 0.1442, "step": 230 }, { "epoch": 0.10509554140127389, "grad_norm": 2.051519186273431, "learning_rate": 4.99459850977563e-06, "loss": 0.1501, "step": 231 }, { "epoch": 0.10555050045495905, "grad_norm": 2.0348997381654774, "learning_rate": 4.994551455293388e-06, "loss": 0.1544, "step": 232 }, { "epoch": 0.10600545950864422, "grad_norm": 2.1087346651931758, "learning_rate": 4.9945041969676654e-06, "loss": 0.1768, "step": 233 }, { "epoch": 0.10646041856232939, "grad_norm": 2.2918772612100704, "learning_rate": 4.994456734802325e-06, "loss": 0.1361, "step": 234 }, { "epoch": 0.10691537761601456, "grad_norm": 1.6027315868889764, "learning_rate": 4.994409068801247e-06, "loss": 0.1905, "step": 235 }, { "epoch": 0.10737033666969972, "grad_norm": 1.3896946472755238, "learning_rate": 4.994361198968323e-06, "loss": 0.1282, "step": 236 }, { "epoch": 0.1078252957233849, "grad_norm": 2.8336860099519687, "learning_rate": 4.994313125307466e-06, "loss": 0.1795, "step": 237 }, { "epoch": 0.10828025477707007, "grad_norm": 2.3591551410924034, "learning_rate": 4.994264847822605e-06, "loss": 0.2012, "step": 238 }, { "epoch": 0.10873521383075523, "grad_norm": 1.963795078441063, "learning_rate": 4.994216366517684e-06, "loss": 0.122, "step": 239 }, { "epoch": 0.1091901728844404, "grad_norm": 2.2161995153888356, "learning_rate": 4.994167681396667e-06, "loss": 0.2013, "step": 240 }, { "epoch": 0.10964513193812557, "grad_norm": 2.116594401017286, "learning_rate": 4.994118792463529e-06, "loss": 0.1678, "step": 241 }, { "epoch": 0.11010009099181074, "grad_norm": 2.004374732998671, "learning_rate": 4.994069699722267e-06, "loss": 0.1937, "step": 242 }, { "epoch": 0.1105550500454959, "grad_norm": 1.8488901498313728, "learning_rate": 4.994020403176893e-06, "loss": 0.1668, "step": 243 }, { "epoch": 0.11101000909918107, "grad_norm": 1.9972157818131948, "learning_rate": 4.9939709028314345e-06, "loss": 0.1589, "step": 244 }, { "epoch": 0.11146496815286625, "grad_norm": 2.748474268313726, "learning_rate": 4.993921198689935e-06, "loss": 0.1244, "step": 245 }, { "epoch": 0.11191992720655142, "grad_norm": 2.2905102593877893, "learning_rate": 4.993871290756459e-06, "loss": 0.1828, "step": 246 }, { "epoch": 0.11237488626023658, "grad_norm": 2.4243824405880825, "learning_rate": 4.9938211790350835e-06, "loss": 0.1534, "step": 247 }, { "epoch": 0.11282984531392175, "grad_norm": 2.7563047154810767, "learning_rate": 4.993770863529902e-06, "loss": 0.2186, "step": 248 }, { "epoch": 0.11328480436760692, "grad_norm": 2.0782876036120044, "learning_rate": 4.993720344245029e-06, "loss": 0.1519, "step": 249 }, { "epoch": 0.11373976342129208, "grad_norm": 2.1737696697985065, "learning_rate": 4.99366962118459e-06, "loss": 0.1705, "step": 250 }, { "epoch": 0.11419472247497725, "grad_norm": 2.117835290775163, "learning_rate": 4.99361869435273e-06, "loss": 0.1279, "step": 251 }, { "epoch": 0.11464968152866242, "grad_norm": 2.2816195263684906, "learning_rate": 4.993567563753613e-06, "loss": 0.1498, "step": 252 }, { "epoch": 0.1151046405823476, "grad_norm": 2.303960194203604, "learning_rate": 4.993516229391414e-06, "loss": 0.1505, "step": 253 }, { "epoch": 0.11555959963603275, "grad_norm": 2.932533158282557, "learning_rate": 4.993464691270331e-06, "loss": 0.1672, "step": 254 }, { "epoch": 0.11601455868971793, "grad_norm": 2.050977411803408, "learning_rate": 4.993412949394572e-06, "loss": 0.1511, "step": 255 }, { "epoch": 0.1164695177434031, "grad_norm": 1.9367899744301398, "learning_rate": 4.993361003768369e-06, "loss": 0.1203, "step": 256 }, { "epoch": 0.11692447679708826, "grad_norm": 2.3417493914717027, "learning_rate": 4.993308854395963e-06, "loss": 0.1782, "step": 257 }, { "epoch": 0.11737943585077343, "grad_norm": 2.2791020802299498, "learning_rate": 4.993256501281618e-06, "loss": 0.1643, "step": 258 }, { "epoch": 0.1178343949044586, "grad_norm": 2.051233293233244, "learning_rate": 4.993203944429611e-06, "loss": 0.1761, "step": 259 }, { "epoch": 0.11828935395814377, "grad_norm": 2.554462221777923, "learning_rate": 4.993151183844236e-06, "loss": 0.1654, "step": 260 }, { "epoch": 0.11874431301182893, "grad_norm": 1.8796649091666686, "learning_rate": 4.9930982195298065e-06, "loss": 0.1826, "step": 261 }, { "epoch": 0.1191992720655141, "grad_norm": 2.1843940505934336, "learning_rate": 4.9930450514906484e-06, "loss": 0.1755, "step": 262 }, { "epoch": 0.11965423111919928, "grad_norm": 2.600288448730721, "learning_rate": 4.9929916797311075e-06, "loss": 0.1758, "step": 263 }, { "epoch": 0.12010919017288443, "grad_norm": 2.0789865508427714, "learning_rate": 4.992938104255545e-06, "loss": 0.1571, "step": 264 }, { "epoch": 0.1205641492265696, "grad_norm": 2.6999799828889546, "learning_rate": 4.992884325068339e-06, "loss": 0.2177, "step": 265 }, { "epoch": 0.12101910828025478, "grad_norm": 2.1928099848185756, "learning_rate": 4.992830342173882e-06, "loss": 0.1831, "step": 266 }, { "epoch": 0.12147406733393995, "grad_norm": 1.6337451712782205, "learning_rate": 4.992776155576589e-06, "loss": 0.1506, "step": 267 }, { "epoch": 0.12192902638762511, "grad_norm": 1.2235042033062622, "learning_rate": 4.992721765280884e-06, "loss": 0.1214, "step": 268 }, { "epoch": 0.12238398544131028, "grad_norm": 2.8845660466122873, "learning_rate": 4.992667171291215e-06, "loss": 0.2148, "step": 269 }, { "epoch": 0.12283894449499545, "grad_norm": 2.7398139900638476, "learning_rate": 4.992612373612042e-06, "loss": 0.1661, "step": 270 }, { "epoch": 0.12329390354868063, "grad_norm": 3.738889974273454, "learning_rate": 4.99255737224784e-06, "loss": 0.2297, "step": 271 }, { "epoch": 0.12374886260236578, "grad_norm": 1.5329721181759282, "learning_rate": 4.9925021672031075e-06, "loss": 0.1486, "step": 272 }, { "epoch": 0.12420382165605096, "grad_norm": 2.3823467276559875, "learning_rate": 4.992446758482353e-06, "loss": 0.1552, "step": 273 }, { "epoch": 0.12465878070973613, "grad_norm": 2.1454290127697924, "learning_rate": 4.992391146090106e-06, "loss": 0.1736, "step": 274 }, { "epoch": 0.1251137397634213, "grad_norm": 1.4949223744659494, "learning_rate": 4.99233533003091e-06, "loss": 0.1373, "step": 275 }, { "epoch": 0.12556869881710647, "grad_norm": 1.5553413773794396, "learning_rate": 4.992279310309326e-06, "loss": 0.1835, "step": 276 }, { "epoch": 0.12602365787079162, "grad_norm": 2.969806225573073, "learning_rate": 4.9922230869299316e-06, "loss": 0.2793, "step": 277 }, { "epoch": 0.1264786169244768, "grad_norm": 2.3168611268442763, "learning_rate": 4.992166659897321e-06, "loss": 0.1922, "step": 278 }, { "epoch": 0.12693357597816196, "grad_norm": 2.3995795142770455, "learning_rate": 4.992110029216106e-06, "loss": 0.1955, "step": 279 }, { "epoch": 0.12738853503184713, "grad_norm": 1.6975631974230885, "learning_rate": 4.992053194890914e-06, "loss": 0.1112, "step": 280 }, { "epoch": 0.1278434940855323, "grad_norm": 2.087297197910066, "learning_rate": 4.991996156926388e-06, "loss": 0.1333, "step": 281 }, { "epoch": 0.12829845313921748, "grad_norm": 2.6326611217122475, "learning_rate": 4.9919389153271904e-06, "loss": 0.2017, "step": 282 }, { "epoch": 0.12875341219290265, "grad_norm": 1.4167548054089978, "learning_rate": 4.991881470097998e-06, "loss": 0.2074, "step": 283 }, { "epoch": 0.1292083712465878, "grad_norm": 2.325650637419427, "learning_rate": 4.991823821243505e-06, "loss": 0.1777, "step": 284 }, { "epoch": 0.12966333030027297, "grad_norm": 2.7279251785825, "learning_rate": 4.991765968768422e-06, "loss": 0.1801, "step": 285 }, { "epoch": 0.13011828935395814, "grad_norm": 2.9061020144564087, "learning_rate": 4.991707912677477e-06, "loss": 0.1702, "step": 286 }, { "epoch": 0.1305732484076433, "grad_norm": 1.8358268112205725, "learning_rate": 4.991649652975414e-06, "loss": 0.1433, "step": 287 }, { "epoch": 0.13102820746132848, "grad_norm": 2.5332736723438636, "learning_rate": 4.991591189666994e-06, "loss": 0.2469, "step": 288 }, { "epoch": 0.13148316651501366, "grad_norm": 2.1606263891645527, "learning_rate": 4.991532522756993e-06, "loss": 0.18, "step": 289 }, { "epoch": 0.13193812556869883, "grad_norm": 1.995831189895407, "learning_rate": 4.991473652250207e-06, "loss": 0.1577, "step": 290 }, { "epoch": 0.13239308462238397, "grad_norm": 2.4955613558163754, "learning_rate": 4.991414578151445e-06, "loss": 0.1544, "step": 291 }, { "epoch": 0.13284804367606914, "grad_norm": 2.2942486381281326, "learning_rate": 4.991355300465535e-06, "loss": 0.1794, "step": 292 }, { "epoch": 0.13330300272975432, "grad_norm": 2.6074492667183486, "learning_rate": 4.99129581919732e-06, "loss": 0.2319, "step": 293 }, { "epoch": 0.1337579617834395, "grad_norm": 2.563328131279355, "learning_rate": 4.9912361343516616e-06, "loss": 0.1498, "step": 294 }, { "epoch": 0.13421292083712466, "grad_norm": 2.2818975551142535, "learning_rate": 4.991176245933437e-06, "loss": 0.1996, "step": 295 }, { "epoch": 0.13466787989080983, "grad_norm": 2.3084476659986874, "learning_rate": 4.9911161539475385e-06, "loss": 0.1837, "step": 296 }, { "epoch": 0.135122838944495, "grad_norm": 2.271697592195805, "learning_rate": 4.991055858398879e-06, "loss": 0.1839, "step": 297 }, { "epoch": 0.13557779799818018, "grad_norm": 2.7071752536725993, "learning_rate": 4.990995359292384e-06, "loss": 0.2051, "step": 298 }, { "epoch": 0.13603275705186532, "grad_norm": 2.1654433443615444, "learning_rate": 4.990934656632997e-06, "loss": 0.1845, "step": 299 }, { "epoch": 0.1364877161055505, "grad_norm": 2.56820477539861, "learning_rate": 4.990873750425679e-06, "loss": 0.1987, "step": 300 }, { "epoch": 0.13694267515923567, "grad_norm": 1.8972328280195017, "learning_rate": 4.990812640675406e-06, "loss": 0.1352, "step": 301 }, { "epoch": 0.13739763421292084, "grad_norm": 2.160948607003053, "learning_rate": 4.990751327387174e-06, "loss": 0.1788, "step": 302 }, { "epoch": 0.137852593266606, "grad_norm": 2.2034240871386026, "learning_rate": 4.99068981056599e-06, "loss": 0.14, "step": 303 }, { "epoch": 0.13830755232029118, "grad_norm": 2.273981179049363, "learning_rate": 4.990628090216885e-06, "loss": 0.1914, "step": 304 }, { "epoch": 0.13876251137397635, "grad_norm": 2.0189718711860096, "learning_rate": 4.990566166344898e-06, "loss": 0.1455, "step": 305 }, { "epoch": 0.1392174704276615, "grad_norm": 2.596979330537977, "learning_rate": 4.990504038955092e-06, "loss": 0.1503, "step": 306 }, { "epoch": 0.13967242948134667, "grad_norm": 2.694293011033057, "learning_rate": 4.990441708052542e-06, "loss": 0.1582, "step": 307 }, { "epoch": 0.14012738853503184, "grad_norm": 2.00968932243832, "learning_rate": 4.9903791736423435e-06, "loss": 0.1531, "step": 308 }, { "epoch": 0.14058234758871702, "grad_norm": 1.7247039385783955, "learning_rate": 4.9903164357296044e-06, "loss": 0.1258, "step": 309 }, { "epoch": 0.1410373066424022, "grad_norm": 1.4795211673422664, "learning_rate": 4.990253494319453e-06, "loss": 0.1918, "step": 310 }, { "epoch": 0.14149226569608736, "grad_norm": 2.4289846785611573, "learning_rate": 4.990190349417032e-06, "loss": 0.264, "step": 311 }, { "epoch": 0.14194722474977253, "grad_norm": 2.1742573666821245, "learning_rate": 4.990127001027501e-06, "loss": 0.1382, "step": 312 }, { "epoch": 0.14240218380345768, "grad_norm": 2.051388070470128, "learning_rate": 4.990063449156037e-06, "loss": 0.234, "step": 313 }, { "epoch": 0.14285714285714285, "grad_norm": 2.3613735603207435, "learning_rate": 4.989999693807832e-06, "loss": 0.1963, "step": 314 }, { "epoch": 0.14331210191082802, "grad_norm": 3.162328527546947, "learning_rate": 4.989935734988098e-06, "loss": 0.1913, "step": 315 }, { "epoch": 0.1437670609645132, "grad_norm": 2.8669333432356967, "learning_rate": 4.98987157270206e-06, "loss": 0.15, "step": 316 }, { "epoch": 0.14422202001819837, "grad_norm": 2.383827835780797, "learning_rate": 4.989807206954961e-06, "loss": 0.2103, "step": 317 }, { "epoch": 0.14467697907188354, "grad_norm": 1.6341024017470744, "learning_rate": 4.9897426377520605e-06, "loss": 0.1393, "step": 318 }, { "epoch": 0.1451319381255687, "grad_norm": 2.146073254076934, "learning_rate": 4.989677865098636e-06, "loss": 0.1836, "step": 319 }, { "epoch": 0.14558689717925385, "grad_norm": 1.6889700199846902, "learning_rate": 4.989612888999978e-06, "loss": 0.1257, "step": 320 }, { "epoch": 0.14604185623293903, "grad_norm": 1.6032091805420865, "learning_rate": 4.9895477094614e-06, "loss": 0.1578, "step": 321 }, { "epoch": 0.1464968152866242, "grad_norm": 1.8161786006418608, "learning_rate": 4.989482326488225e-06, "loss": 0.1492, "step": 322 }, { "epoch": 0.14695177434030937, "grad_norm": 1.9978970628488169, "learning_rate": 4.989416740085796e-06, "loss": 0.1637, "step": 323 }, { "epoch": 0.14740673339399454, "grad_norm": 2.7066161025891335, "learning_rate": 4.9893509502594735e-06, "loss": 0.1963, "step": 324 }, { "epoch": 0.14786169244767972, "grad_norm": 2.420242793982077, "learning_rate": 4.9892849570146335e-06, "loss": 0.1877, "step": 325 }, { "epoch": 0.1483166515013649, "grad_norm": 2.153067326288121, "learning_rate": 4.989218760356668e-06, "loss": 0.1635, "step": 326 }, { "epoch": 0.14877161055505003, "grad_norm": 2.0543349130585216, "learning_rate": 4.989152360290987e-06, "loss": 0.1744, "step": 327 }, { "epoch": 0.1492265696087352, "grad_norm": 2.1211312409383716, "learning_rate": 4.989085756823015e-06, "loss": 0.2104, "step": 328 }, { "epoch": 0.14968152866242038, "grad_norm": 1.9888085672791085, "learning_rate": 4.989018949958197e-06, "loss": 0.1876, "step": 329 }, { "epoch": 0.15013648771610555, "grad_norm": 1.7510207885281333, "learning_rate": 4.98895193970199e-06, "loss": 0.1251, "step": 330 }, { "epoch": 0.15059144676979072, "grad_norm": 2.132384994640236, "learning_rate": 4.9888847260598705e-06, "loss": 0.154, "step": 331 }, { "epoch": 0.1510464058234759, "grad_norm": 2.323691709571053, "learning_rate": 4.98881730903733e-06, "loss": 0.1599, "step": 332 }, { "epoch": 0.15150136487716107, "grad_norm": 1.7667120167873211, "learning_rate": 4.98874968863988e-06, "loss": 0.1706, "step": 333 }, { "epoch": 0.15195632393084624, "grad_norm": 2.2465388060545424, "learning_rate": 4.988681864873044e-06, "loss": 0.152, "step": 334 }, { "epoch": 0.15241128298453138, "grad_norm": 2.150731238347554, "learning_rate": 4.988613837742364e-06, "loss": 0.1784, "step": 335 }, { "epoch": 0.15286624203821655, "grad_norm": 2.6552266788081913, "learning_rate": 4.9885456072534015e-06, "loss": 0.1692, "step": 336 }, { "epoch": 0.15332120109190173, "grad_norm": 2.6431963904654867, "learning_rate": 4.988477173411728e-06, "loss": 0.2313, "step": 337 }, { "epoch": 0.1537761601455869, "grad_norm": 1.6862589720746106, "learning_rate": 4.988408536222939e-06, "loss": 0.1569, "step": 338 }, { "epoch": 0.15423111919927207, "grad_norm": 2.4287850849792343, "learning_rate": 4.9883396956926416e-06, "loss": 0.2077, "step": 339 }, { "epoch": 0.15468607825295724, "grad_norm": 2.1310532776354556, "learning_rate": 4.988270651826462e-06, "loss": 0.1603, "step": 340 }, { "epoch": 0.15514103730664242, "grad_norm": 2.426464258613891, "learning_rate": 4.988201404630041e-06, "loss": 0.1804, "step": 341 }, { "epoch": 0.15559599636032756, "grad_norm": 2.2461225244692966, "learning_rate": 4.988131954109038e-06, "loss": 0.1749, "step": 342 }, { "epoch": 0.15605095541401273, "grad_norm": 1.7543756867291544, "learning_rate": 4.988062300269128e-06, "loss": 0.2141, "step": 343 }, { "epoch": 0.1565059144676979, "grad_norm": 1.8842714079345257, "learning_rate": 4.987992443116003e-06, "loss": 0.1509, "step": 344 }, { "epoch": 0.15696087352138308, "grad_norm": 2.5046760683256917, "learning_rate": 4.987922382655372e-06, "loss": 0.1555, "step": 345 }, { "epoch": 0.15741583257506825, "grad_norm": 2.3171833195987186, "learning_rate": 4.987852118892958e-06, "loss": 0.259, "step": 346 }, { "epoch": 0.15787079162875342, "grad_norm": 1.7971407845013883, "learning_rate": 4.987781651834503e-06, "loss": 0.2111, "step": 347 }, { "epoch": 0.1583257506824386, "grad_norm": 2.229282526599637, "learning_rate": 4.987710981485768e-06, "loss": 0.1639, "step": 348 }, { "epoch": 0.15878070973612374, "grad_norm": 2.090625191317677, "learning_rate": 4.987640107852525e-06, "loss": 0.2123, "step": 349 }, { "epoch": 0.1592356687898089, "grad_norm": 2.117001720390773, "learning_rate": 4.987569030940567e-06, "loss": 0.1762, "step": 350 }, { "epoch": 0.15969062784349408, "grad_norm": 1.7897158962626623, "learning_rate": 4.987497750755702e-06, "loss": 0.0935, "step": 351 }, { "epoch": 0.16014558689717925, "grad_norm": 2.0946360877045906, "learning_rate": 4.987426267303753e-06, "loss": 0.2049, "step": 352 }, { "epoch": 0.16060054595086443, "grad_norm": 2.07614941330386, "learning_rate": 4.987354580590563e-06, "loss": 0.1858, "step": 353 }, { "epoch": 0.1610555050045496, "grad_norm": 1.6797770286484157, "learning_rate": 4.987282690621991e-06, "loss": 0.1652, "step": 354 }, { "epoch": 0.16151046405823477, "grad_norm": 1.6413851962480772, "learning_rate": 4.987210597403907e-06, "loss": 0.156, "step": 355 }, { "epoch": 0.16196542311191992, "grad_norm": 2.5143144976994285, "learning_rate": 4.987138300942208e-06, "loss": 0.1804, "step": 356 }, { "epoch": 0.1624203821656051, "grad_norm": 2.128297430906798, "learning_rate": 4.987065801242798e-06, "loss": 0.1634, "step": 357 }, { "epoch": 0.16287534121929026, "grad_norm": 2.039358127433988, "learning_rate": 4.986993098311601e-06, "loss": 0.172, "step": 358 }, { "epoch": 0.16333030027297543, "grad_norm": 2.2470477292441906, "learning_rate": 4.986920192154561e-06, "loss": 0.1419, "step": 359 }, { "epoch": 0.1637852593266606, "grad_norm": 1.8708576936226033, "learning_rate": 4.986847082777632e-06, "loss": 0.165, "step": 360 }, { "epoch": 0.16424021838034578, "grad_norm": 2.2426713628374406, "learning_rate": 4.986773770186791e-06, "loss": 0.2113, "step": 361 }, { "epoch": 0.16469517743403095, "grad_norm": 2.1231842278965716, "learning_rate": 4.986700254388027e-06, "loss": 0.2583, "step": 362 }, { "epoch": 0.1651501364877161, "grad_norm": 1.9962414368551604, "learning_rate": 4.986626535387349e-06, "loss": 0.2146, "step": 363 }, { "epoch": 0.16560509554140126, "grad_norm": 2.7738560722941656, "learning_rate": 4.9865526131907795e-06, "loss": 0.1913, "step": 364 }, { "epoch": 0.16606005459508644, "grad_norm": 1.8910905183030835, "learning_rate": 4.9864784878043595e-06, "loss": 0.2243, "step": 365 }, { "epoch": 0.1665150136487716, "grad_norm": 2.943803252646498, "learning_rate": 4.986404159234146e-06, "loss": 0.2169, "step": 366 }, { "epoch": 0.16696997270245678, "grad_norm": 2.067283855325497, "learning_rate": 4.986329627486213e-06, "loss": 0.1392, "step": 367 }, { "epoch": 0.16742493175614195, "grad_norm": 1.7900649282380081, "learning_rate": 4.986254892566652e-06, "loss": 0.1929, "step": 368 }, { "epoch": 0.16787989080982713, "grad_norm": 2.05364008592912, "learning_rate": 4.9861799544815684e-06, "loss": 0.1539, "step": 369 }, { "epoch": 0.16833484986351227, "grad_norm": 1.8722252354131819, "learning_rate": 4.986104813237086e-06, "loss": 0.1584, "step": 370 }, { "epoch": 0.16878980891719744, "grad_norm": 2.127812745723865, "learning_rate": 4.986029468839346e-06, "loss": 0.1618, "step": 371 }, { "epoch": 0.16924476797088261, "grad_norm": 2.4926065420888643, "learning_rate": 4.985953921294505e-06, "loss": 0.2601, "step": 372 }, { "epoch": 0.1696997270245678, "grad_norm": 2.973425717527041, "learning_rate": 4.985878170608736e-06, "loss": 0.1919, "step": 373 }, { "epoch": 0.17015468607825296, "grad_norm": 2.1354583522718604, "learning_rate": 4.985802216788228e-06, "loss": 0.1904, "step": 374 }, { "epoch": 0.17060964513193813, "grad_norm": 2.4618549416407634, "learning_rate": 4.98572605983919e-06, "loss": 0.2137, "step": 375 }, { "epoch": 0.1710646041856233, "grad_norm": 1.3365138469487268, "learning_rate": 4.985649699767842e-06, "loss": 0.1069, "step": 376 }, { "epoch": 0.17151956323930848, "grad_norm": 1.9602605162416638, "learning_rate": 4.985573136580427e-06, "loss": 0.1723, "step": 377 }, { "epoch": 0.17197452229299362, "grad_norm": 1.6915428216688142, "learning_rate": 4.9854963702832e-06, "loss": 0.1673, "step": 378 }, { "epoch": 0.1724294813466788, "grad_norm": 2.0131015516091875, "learning_rate": 4.985419400882433e-06, "loss": 0.2159, "step": 379 }, { "epoch": 0.17288444040036396, "grad_norm": 1.8436996177818286, "learning_rate": 4.985342228384418e-06, "loss": 0.1777, "step": 380 }, { "epoch": 0.17333939945404914, "grad_norm": 3.2955423815059257, "learning_rate": 4.985264852795459e-06, "loss": 0.2759, "step": 381 }, { "epoch": 0.1737943585077343, "grad_norm": 2.386347589584829, "learning_rate": 4.98518727412188e-06, "loss": 0.1958, "step": 382 }, { "epoch": 0.17424931756141948, "grad_norm": 2.5771465793014294, "learning_rate": 4.98510949237002e-06, "loss": 0.1861, "step": 383 }, { "epoch": 0.17470427661510465, "grad_norm": 2.420697255730561, "learning_rate": 4.985031507546234e-06, "loss": 0.1538, "step": 384 }, { "epoch": 0.1751592356687898, "grad_norm": 2.6016330527075895, "learning_rate": 4.984953319656896e-06, "loss": 0.1981, "step": 385 }, { "epoch": 0.17561419472247497, "grad_norm": 2.671850671096213, "learning_rate": 4.984874928708395e-06, "loss": 0.1802, "step": 386 }, { "epoch": 0.17606915377616014, "grad_norm": 2.329893515854394, "learning_rate": 4.984796334707136e-06, "loss": 0.1916, "step": 387 }, { "epoch": 0.17652411282984531, "grad_norm": 2.900381887848387, "learning_rate": 4.984717537659542e-06, "loss": 0.1851, "step": 388 }, { "epoch": 0.1769790718835305, "grad_norm": 2.8920348384518295, "learning_rate": 4.984638537572052e-06, "loss": 0.1614, "step": 389 }, { "epoch": 0.17743403093721566, "grad_norm": 1.7590905699687769, "learning_rate": 4.984559334451121e-06, "loss": 0.1182, "step": 390 }, { "epoch": 0.17788898999090083, "grad_norm": 1.992998204932115, "learning_rate": 4.984479928303221e-06, "loss": 0.1097, "step": 391 }, { "epoch": 0.17834394904458598, "grad_norm": 1.7032225308271054, "learning_rate": 4.984400319134841e-06, "loss": 0.1166, "step": 392 }, { "epoch": 0.17879890809827115, "grad_norm": 2.170562253873519, "learning_rate": 4.984320506952487e-06, "loss": 0.2253, "step": 393 }, { "epoch": 0.17925386715195632, "grad_norm": 2.237592089222373, "learning_rate": 4.9842404917626796e-06, "loss": 0.1949, "step": 394 }, { "epoch": 0.1797088262056415, "grad_norm": 2.0106916989450587, "learning_rate": 4.984160273571959e-06, "loss": 0.1681, "step": 395 }, { "epoch": 0.18016378525932666, "grad_norm": 1.5887484417784243, "learning_rate": 4.9840798523868785e-06, "loss": 0.1987, "step": 396 }, { "epoch": 0.18061874431301184, "grad_norm": 2.1863186231198677, "learning_rate": 4.983999228214011e-06, "loss": 0.1688, "step": 397 }, { "epoch": 0.181073703366697, "grad_norm": 1.73818173181658, "learning_rate": 4.983918401059943e-06, "loss": 0.1667, "step": 398 }, { "epoch": 0.18152866242038215, "grad_norm": 2.507383020515962, "learning_rate": 4.983837370931282e-06, "loss": 0.1969, "step": 399 }, { "epoch": 0.18198362147406733, "grad_norm": 2.0632014051403793, "learning_rate": 4.983756137834647e-06, "loss": 0.183, "step": 400 }, { "epoch": 0.1824385805277525, "grad_norm": 2.830188740520148, "learning_rate": 4.9836747017766765e-06, "loss": 0.2093, "step": 401 }, { "epoch": 0.18289353958143767, "grad_norm": 2.5110616036547038, "learning_rate": 4.983593062764027e-06, "loss": 0.2322, "step": 402 }, { "epoch": 0.18334849863512284, "grad_norm": 3.686743248745681, "learning_rate": 4.983511220803367e-06, "loss": 0.2445, "step": 403 }, { "epoch": 0.18380345768880801, "grad_norm": 1.679708381839253, "learning_rate": 4.983429175901386e-06, "loss": 0.1796, "step": 404 }, { "epoch": 0.1842584167424932, "grad_norm": 2.1827593155516722, "learning_rate": 4.983346928064788e-06, "loss": 0.1674, "step": 405 }, { "epoch": 0.18471337579617833, "grad_norm": 1.60561536399989, "learning_rate": 4.9832644773002935e-06, "loss": 0.1696, "step": 406 }, { "epoch": 0.1851683348498635, "grad_norm": 2.3818871014331418, "learning_rate": 4.98318182361464e-06, "loss": 0.231, "step": 407 }, { "epoch": 0.18562329390354868, "grad_norm": 2.466498074147868, "learning_rate": 4.9830989670145825e-06, "loss": 0.2363, "step": 408 }, { "epoch": 0.18607825295723385, "grad_norm": 2.3360214493938485, "learning_rate": 4.9830159075068905e-06, "loss": 0.2211, "step": 409 }, { "epoch": 0.18653321201091902, "grad_norm": 1.8065829881072444, "learning_rate": 4.9829326450983514e-06, "loss": 0.1743, "step": 410 }, { "epoch": 0.1869881710646042, "grad_norm": 2.69540573324766, "learning_rate": 4.98284917979577e-06, "loss": 0.1876, "step": 411 }, { "epoch": 0.18744313011828936, "grad_norm": 1.8906354216406325, "learning_rate": 4.9827655116059656e-06, "loss": 0.1592, "step": 412 }, { "epoch": 0.18789808917197454, "grad_norm": 1.743151148777257, "learning_rate": 4.9826816405357755e-06, "loss": 0.1746, "step": 413 }, { "epoch": 0.18835304822565968, "grad_norm": 1.5963849264202556, "learning_rate": 4.982597566592054e-06, "loss": 0.1244, "step": 414 }, { "epoch": 0.18880800727934485, "grad_norm": 2.7157092244830205, "learning_rate": 4.982513289781671e-06, "loss": 0.2332, "step": 415 }, { "epoch": 0.18926296633303002, "grad_norm": 1.9931400703765212, "learning_rate": 4.982428810111512e-06, "loss": 0.2113, "step": 416 }, { "epoch": 0.1897179253867152, "grad_norm": 1.3604077425808516, "learning_rate": 4.9823441275884814e-06, "loss": 0.1305, "step": 417 }, { "epoch": 0.19017288444040037, "grad_norm": 2.2607598123619517, "learning_rate": 4.982259242219499e-06, "loss": 0.1723, "step": 418 }, { "epoch": 0.19062784349408554, "grad_norm": 1.867118589561207, "learning_rate": 4.9821741540115006e-06, "loss": 0.1355, "step": 419 }, { "epoch": 0.1910828025477707, "grad_norm": 2.11150758750875, "learning_rate": 4.982088862971441e-06, "loss": 0.2181, "step": 420 }, { "epoch": 0.19153776160145586, "grad_norm": 2.922634212063935, "learning_rate": 4.982003369106287e-06, "loss": 0.1935, "step": 421 }, { "epoch": 0.19199272065514103, "grad_norm": 1.8213621057521336, "learning_rate": 4.981917672423028e-06, "loss": 0.1159, "step": 422 }, { "epoch": 0.1924476797088262, "grad_norm": 1.9973203363112062, "learning_rate": 4.981831772928664e-06, "loss": 0.1644, "step": 423 }, { "epoch": 0.19290263876251137, "grad_norm": 1.6435298569620178, "learning_rate": 4.981745670630216e-06, "loss": 0.1676, "step": 424 }, { "epoch": 0.19335759781619655, "grad_norm": 1.7090737346215599, "learning_rate": 4.981659365534718e-06, "loss": 0.1947, "step": 425 }, { "epoch": 0.19381255686988172, "grad_norm": 2.8644071628055365, "learning_rate": 4.981572857649225e-06, "loss": 0.2412, "step": 426 }, { "epoch": 0.1942675159235669, "grad_norm": 1.5071870677678134, "learning_rate": 4.981486146980804e-06, "loss": 0.1247, "step": 427 }, { "epoch": 0.19472247497725204, "grad_norm": 2.5523639597283436, "learning_rate": 4.9813992335365415e-06, "loss": 0.1636, "step": 428 }, { "epoch": 0.1951774340309372, "grad_norm": 1.6766352791010617, "learning_rate": 4.98131211732354e-06, "loss": 0.1659, "step": 429 }, { "epoch": 0.19563239308462238, "grad_norm": 2.6626571731411985, "learning_rate": 4.981224798348917e-06, "loss": 0.1777, "step": 430 }, { "epoch": 0.19608735213830755, "grad_norm": 1.7748484056177547, "learning_rate": 4.981137276619809e-06, "loss": 0.2038, "step": 431 }, { "epoch": 0.19654231119199272, "grad_norm": 1.6726970249923665, "learning_rate": 4.9810495521433675e-06, "loss": 0.167, "step": 432 }, { "epoch": 0.1969972702456779, "grad_norm": 2.3836088959731407, "learning_rate": 4.9809616249267616e-06, "loss": 0.1967, "step": 433 }, { "epoch": 0.19745222929936307, "grad_norm": 1.9478244630239012, "learning_rate": 4.980873494977174e-06, "loss": 0.2259, "step": 434 }, { "epoch": 0.1979071883530482, "grad_norm": 2.601912538074716, "learning_rate": 4.98078516230181e-06, "loss": 0.196, "step": 435 }, { "epoch": 0.19836214740673339, "grad_norm": 1.8252963162031037, "learning_rate": 4.980696626907884e-06, "loss": 0.1551, "step": 436 }, { "epoch": 0.19881710646041856, "grad_norm": 1.7882792458437706, "learning_rate": 4.980607888802633e-06, "loss": 0.1547, "step": 437 }, { "epoch": 0.19927206551410373, "grad_norm": 1.8674433444840757, "learning_rate": 4.980518947993307e-06, "loss": 0.1625, "step": 438 }, { "epoch": 0.1997270245677889, "grad_norm": 2.050135562104488, "learning_rate": 4.980429804487176e-06, "loss": 0.1706, "step": 439 }, { "epoch": 0.20018198362147407, "grad_norm": 3.040028729336044, "learning_rate": 4.980340458291521e-06, "loss": 0.2235, "step": 440 }, { "epoch": 0.20063694267515925, "grad_norm": 1.755025572252995, "learning_rate": 4.980250909413646e-06, "loss": 0.1451, "step": 441 }, { "epoch": 0.2010919017288444, "grad_norm": 2.636610646301175, "learning_rate": 4.980161157860867e-06, "loss": 0.1869, "step": 442 }, { "epoch": 0.20154686078252956, "grad_norm": 2.5942914069340715, "learning_rate": 4.980071203640519e-06, "loss": 0.1633, "step": 443 }, { "epoch": 0.20200181983621474, "grad_norm": 1.5184266230548011, "learning_rate": 4.979981046759952e-06, "loss": 0.1441, "step": 444 }, { "epoch": 0.2024567788898999, "grad_norm": 1.8681142182661066, "learning_rate": 4.979890687226533e-06, "loss": 0.1596, "step": 445 }, { "epoch": 0.20291173794358508, "grad_norm": 2.48564323404002, "learning_rate": 4.979800125047647e-06, "loss": 0.1481, "step": 446 }, { "epoch": 0.20336669699727025, "grad_norm": 2.3390506413519514, "learning_rate": 4.979709360230692e-06, "loss": 0.1889, "step": 447 }, { "epoch": 0.20382165605095542, "grad_norm": 2.017468095007692, "learning_rate": 4.979618392783087e-06, "loss": 0.1417, "step": 448 }, { "epoch": 0.20427661510464057, "grad_norm": 1.729598330112352, "learning_rate": 4.979527222712266e-06, "loss": 0.142, "step": 449 }, { "epoch": 0.20473157415832574, "grad_norm": 2.1368144580931747, "learning_rate": 4.9794358500256765e-06, "loss": 0.1636, "step": 450 }, { "epoch": 0.2051865332120109, "grad_norm": 1.9994448136168699, "learning_rate": 4.979344274730786e-06, "loss": 0.1604, "step": 451 }, { "epoch": 0.20564149226569609, "grad_norm": 3.428795563882251, "learning_rate": 4.979252496835079e-06, "loss": 0.2394, "step": 452 }, { "epoch": 0.20609645131938126, "grad_norm": 2.6996852974810768, "learning_rate": 4.979160516346054e-06, "loss": 0.2375, "step": 453 }, { "epoch": 0.20655141037306643, "grad_norm": 1.9797680166732188, "learning_rate": 4.979068333271227e-06, "loss": 0.1842, "step": 454 }, { "epoch": 0.2070063694267516, "grad_norm": 3.003957390141276, "learning_rate": 4.978975947618131e-06, "loss": 0.193, "step": 455 }, { "epoch": 0.20746132848043677, "grad_norm": 2.00845771414247, "learning_rate": 4.978883359394316e-06, "loss": 0.198, "step": 456 }, { "epoch": 0.20791628753412192, "grad_norm": 2.0203437551682186, "learning_rate": 4.978790568607347e-06, "loss": 0.1643, "step": 457 }, { "epoch": 0.2083712465878071, "grad_norm": 2.112746362210305, "learning_rate": 4.9786975752648076e-06, "loss": 0.2327, "step": 458 }, { "epoch": 0.20882620564149226, "grad_norm": 1.9220582393008747, "learning_rate": 4.978604379374295e-06, "loss": 0.1549, "step": 459 }, { "epoch": 0.20928116469517744, "grad_norm": 2.1402572457657545, "learning_rate": 4.978510980943427e-06, "loss": 0.139, "step": 460 }, { "epoch": 0.2097361237488626, "grad_norm": 2.4018554173698914, "learning_rate": 4.978417379979834e-06, "loss": 0.2455, "step": 461 }, { "epoch": 0.21019108280254778, "grad_norm": 1.951258020011642, "learning_rate": 4.978323576491165e-06, "loss": 0.1552, "step": 462 }, { "epoch": 0.21064604185623295, "grad_norm": 2.1010768496853323, "learning_rate": 4.978229570485085e-06, "loss": 0.2383, "step": 463 }, { "epoch": 0.2111010009099181, "grad_norm": 1.5821441832613072, "learning_rate": 4.978135361969276e-06, "loss": 0.1851, "step": 464 }, { "epoch": 0.21155595996360327, "grad_norm": 1.6009355908322205, "learning_rate": 4.9780409509514375e-06, "loss": 0.175, "step": 465 }, { "epoch": 0.21201091901728844, "grad_norm": 1.8650365534886528, "learning_rate": 4.977946337439282e-06, "loss": 0.2302, "step": 466 }, { "epoch": 0.2124658780709736, "grad_norm": 1.6321720020750403, "learning_rate": 4.9778515214405436e-06, "loss": 0.1919, "step": 467 }, { "epoch": 0.21292083712465878, "grad_norm": 1.5102194582450883, "learning_rate": 4.977756502962967e-06, "loss": 0.1206, "step": 468 }, { "epoch": 0.21337579617834396, "grad_norm": 2.069100224324352, "learning_rate": 4.97766128201432e-06, "loss": 0.1429, "step": 469 }, { "epoch": 0.21383075523202913, "grad_norm": 1.8931152672148568, "learning_rate": 4.977565858602381e-06, "loss": 0.1634, "step": 470 }, { "epoch": 0.21428571428571427, "grad_norm": 1.9388931474803874, "learning_rate": 4.977470232734949e-06, "loss": 0.1138, "step": 471 }, { "epoch": 0.21474067333939945, "grad_norm": 2.52659442383892, "learning_rate": 4.977374404419838e-06, "loss": 0.2011, "step": 472 }, { "epoch": 0.21519563239308462, "grad_norm": 1.9831728669000206, "learning_rate": 4.977278373664877e-06, "loss": 0.1475, "step": 473 }, { "epoch": 0.2156505914467698, "grad_norm": 1.8342304339485977, "learning_rate": 4.977182140477916e-06, "loss": 0.1801, "step": 474 }, { "epoch": 0.21610555050045496, "grad_norm": 1.9321185937866436, "learning_rate": 4.977085704866817e-06, "loss": 0.1787, "step": 475 }, { "epoch": 0.21656050955414013, "grad_norm": 1.8230541452731504, "learning_rate": 4.97698906683946e-06, "loss": 0.202, "step": 476 }, { "epoch": 0.2170154686078253, "grad_norm": 2.4982489548908062, "learning_rate": 4.9768922264037435e-06, "loss": 0.2283, "step": 477 }, { "epoch": 0.21747042766151045, "grad_norm": 2.134742327126813, "learning_rate": 4.976795183567579e-06, "loss": 0.1544, "step": 478 }, { "epoch": 0.21792538671519562, "grad_norm": 2.9581764452635184, "learning_rate": 4.976697938338898e-06, "loss": 0.1674, "step": 479 }, { "epoch": 0.2183803457688808, "grad_norm": 1.712602080023381, "learning_rate": 4.976600490725645e-06, "loss": 0.1568, "step": 480 }, { "epoch": 0.21883530482256597, "grad_norm": 1.7418610812844693, "learning_rate": 4.976502840735785e-06, "loss": 0.1945, "step": 481 }, { "epoch": 0.21929026387625114, "grad_norm": 2.138071978494717, "learning_rate": 4.976404988377297e-06, "loss": 0.1512, "step": 482 }, { "epoch": 0.2197452229299363, "grad_norm": 2.346885929916554, "learning_rate": 4.976306933658176e-06, "loss": 0.2262, "step": 483 }, { "epoch": 0.22020018198362148, "grad_norm": 2.020074510485992, "learning_rate": 4.976208676586435e-06, "loss": 0.2141, "step": 484 }, { "epoch": 0.22065514103730663, "grad_norm": 1.8763221281396283, "learning_rate": 4.976110217170104e-06, "loss": 0.1491, "step": 485 }, { "epoch": 0.2211101000909918, "grad_norm": 2.235721601006219, "learning_rate": 4.976011555417228e-06, "loss": 0.2058, "step": 486 }, { "epoch": 0.22156505914467697, "grad_norm": 1.315034818762656, "learning_rate": 4.975912691335869e-06, "loss": 0.1244, "step": 487 }, { "epoch": 0.22202001819836215, "grad_norm": 2.1199398350029757, "learning_rate": 4.975813624934106e-06, "loss": 0.1412, "step": 488 }, { "epoch": 0.22247497725204732, "grad_norm": 1.8709221572870474, "learning_rate": 4.975714356220035e-06, "loss": 0.1527, "step": 489 }, { "epoch": 0.2229299363057325, "grad_norm": 2.2421419230230657, "learning_rate": 4.975614885201766e-06, "loss": 0.1608, "step": 490 }, { "epoch": 0.22338489535941766, "grad_norm": 2.3078261939110454, "learning_rate": 4.975515211887429e-06, "loss": 0.1465, "step": 491 }, { "epoch": 0.22383985441310283, "grad_norm": 1.5895485837834087, "learning_rate": 4.9754153362851684e-06, "loss": 0.1197, "step": 492 }, { "epoch": 0.22429481346678798, "grad_norm": 1.7459488111256227, "learning_rate": 4.975315258403145e-06, "loss": 0.1528, "step": 493 }, { "epoch": 0.22474977252047315, "grad_norm": 1.7723162295113712, "learning_rate": 4.975214978249537e-06, "loss": 0.192, "step": 494 }, { "epoch": 0.22520473157415832, "grad_norm": 2.1669137038937905, "learning_rate": 4.975114495832539e-06, "loss": 0.2359, "step": 495 }, { "epoch": 0.2256596906278435, "grad_norm": 2.0603228355359535, "learning_rate": 4.975013811160362e-06, "loss": 0.1745, "step": 496 }, { "epoch": 0.22611464968152867, "grad_norm": 2.043894775326392, "learning_rate": 4.974912924241233e-06, "loss": 0.1624, "step": 497 }, { "epoch": 0.22656960873521384, "grad_norm": 1.6841728525009554, "learning_rate": 4.974811835083397e-06, "loss": 0.2189, "step": 498 }, { "epoch": 0.227024567788899, "grad_norm": 2.6366675854172335, "learning_rate": 4.974710543695114e-06, "loss": 0.2328, "step": 499 }, { "epoch": 0.22747952684258416, "grad_norm": 2.4052804548672304, "learning_rate": 4.974609050084661e-06, "loss": 0.1886, "step": 500 }, { "epoch": 0.22793448589626933, "grad_norm": 2.0535117318370633, "learning_rate": 4.974507354260332e-06, "loss": 0.2303, "step": 501 }, { "epoch": 0.2283894449499545, "grad_norm": 2.0269029978513555, "learning_rate": 4.974405456230436e-06, "loss": 0.1671, "step": 502 }, { "epoch": 0.22884440400363967, "grad_norm": 2.7642802872985293, "learning_rate": 4.974303356003301e-06, "loss": 0.1344, "step": 503 }, { "epoch": 0.22929936305732485, "grad_norm": 1.7887955204908959, "learning_rate": 4.974201053587268e-06, "loss": 0.1681, "step": 504 }, { "epoch": 0.22975432211101002, "grad_norm": 1.9742201804444028, "learning_rate": 4.9740985489907005e-06, "loss": 0.138, "step": 505 }, { "epoch": 0.2302092811646952, "grad_norm": 2.166941374479256, "learning_rate": 4.973995842221971e-06, "loss": 0.1868, "step": 506 }, { "epoch": 0.23066424021838033, "grad_norm": 2.225119335059734, "learning_rate": 4.973892933289476e-06, "loss": 0.1567, "step": 507 }, { "epoch": 0.2311191992720655, "grad_norm": 1.8892762650773542, "learning_rate": 4.97378982220162e-06, "loss": 0.1488, "step": 508 }, { "epoch": 0.23157415832575068, "grad_norm": 1.8158100523332013, "learning_rate": 4.973686508966832e-06, "loss": 0.1301, "step": 509 }, { "epoch": 0.23202911737943585, "grad_norm": 2.0245407202628836, "learning_rate": 4.973582993593554e-06, "loss": 0.1695, "step": 510 }, { "epoch": 0.23248407643312102, "grad_norm": 2.7034498126253674, "learning_rate": 4.973479276090244e-06, "loss": 0.1737, "step": 511 }, { "epoch": 0.2329390354868062, "grad_norm": 2.065622568041038, "learning_rate": 4.973375356465378e-06, "loss": 0.149, "step": 512 }, { "epoch": 0.23339399454049137, "grad_norm": 1.9812676900095911, "learning_rate": 4.973271234727447e-06, "loss": 0.173, "step": 513 }, { "epoch": 0.2338489535941765, "grad_norm": 1.5726806580344541, "learning_rate": 4.97316691088496e-06, "loss": 0.1254, "step": 514 }, { "epoch": 0.23430391264786168, "grad_norm": 2.191785122658953, "learning_rate": 4.973062384946442e-06, "loss": 0.2233, "step": 515 }, { "epoch": 0.23475887170154686, "grad_norm": 1.035062440323858, "learning_rate": 4.9729576569204345e-06, "loss": 0.1013, "step": 516 }, { "epoch": 0.23521383075523203, "grad_norm": 1.6618268618936451, "learning_rate": 4.972852726815495e-06, "loss": 0.1611, "step": 517 }, { "epoch": 0.2356687898089172, "grad_norm": 1.3381515796606562, "learning_rate": 4.972747594640197e-06, "loss": 0.1669, "step": 518 }, { "epoch": 0.23612374886260237, "grad_norm": 2.0887228759944327, "learning_rate": 4.9726422604031335e-06, "loss": 0.1718, "step": 519 }, { "epoch": 0.23657870791628755, "grad_norm": 1.424194176219749, "learning_rate": 4.97253672411291e-06, "loss": 0.1771, "step": 520 }, { "epoch": 0.2370336669699727, "grad_norm": 1.5373795467776654, "learning_rate": 4.972430985778152e-06, "loss": 0.1118, "step": 521 }, { "epoch": 0.23748862602365786, "grad_norm": 2.6972031210443506, "learning_rate": 4.972325045407499e-06, "loss": 0.1702, "step": 522 }, { "epoch": 0.23794358507734303, "grad_norm": 3.1350549460340957, "learning_rate": 4.972218903009608e-06, "loss": 0.2161, "step": 523 }, { "epoch": 0.2383985441310282, "grad_norm": 2.1422204131037628, "learning_rate": 4.972112558593153e-06, "loss": 0.1902, "step": 524 }, { "epoch": 0.23885350318471338, "grad_norm": 2.041726060026698, "learning_rate": 4.972006012166823e-06, "loss": 0.2079, "step": 525 }, { "epoch": 0.23930846223839855, "grad_norm": 1.7346734861898188, "learning_rate": 4.971899263739326e-06, "loss": 0.1394, "step": 526 }, { "epoch": 0.23976342129208372, "grad_norm": 1.959916622945104, "learning_rate": 4.971792313319384e-06, "loss": 0.1901, "step": 527 }, { "epoch": 0.24021838034576887, "grad_norm": 1.6780700319385458, "learning_rate": 4.971685160915737e-06, "loss": 0.1623, "step": 528 }, { "epoch": 0.24067333939945404, "grad_norm": 2.08830134651656, "learning_rate": 4.971577806537139e-06, "loss": 0.1607, "step": 529 }, { "epoch": 0.2411282984531392, "grad_norm": 2.205231289993063, "learning_rate": 4.971470250192366e-06, "loss": 0.1851, "step": 530 }, { "epoch": 0.24158325750682438, "grad_norm": 2.911292420170041, "learning_rate": 4.9713624918902045e-06, "loss": 0.2235, "step": 531 }, { "epoch": 0.24203821656050956, "grad_norm": 2.1164751998531344, "learning_rate": 4.971254531639461e-06, "loss": 0.1556, "step": 532 }, { "epoch": 0.24249317561419473, "grad_norm": 2.740398833115599, "learning_rate": 4.971146369448957e-06, "loss": 0.206, "step": 533 }, { "epoch": 0.2429481346678799, "grad_norm": 1.797962382814168, "learning_rate": 4.971038005327532e-06, "loss": 0.161, "step": 534 }, { "epoch": 0.24340309372156507, "grad_norm": 1.995555524717142, "learning_rate": 4.970929439284039e-06, "loss": 0.1808, "step": 535 }, { "epoch": 0.24385805277525022, "grad_norm": 2.1172122131281927, "learning_rate": 4.970820671327351e-06, "loss": 0.189, "step": 536 }, { "epoch": 0.2443130118289354, "grad_norm": 1.8090573461125563, "learning_rate": 4.9707117014663565e-06, "loss": 0.1522, "step": 537 }, { "epoch": 0.24476797088262056, "grad_norm": 1.8419040286839186, "learning_rate": 4.97060252970996e-06, "loss": 0.2046, "step": 538 }, { "epoch": 0.24522292993630573, "grad_norm": 2.268977185876009, "learning_rate": 4.970493156067081e-06, "loss": 0.2247, "step": 539 }, { "epoch": 0.2456778889899909, "grad_norm": 2.1193932268543314, "learning_rate": 4.970383580546658e-06, "loss": 0.159, "step": 540 }, { "epoch": 0.24613284804367608, "grad_norm": 2.173218123449192, "learning_rate": 4.970273803157645e-06, "loss": 0.1851, "step": 541 }, { "epoch": 0.24658780709736125, "grad_norm": 1.9062873437813912, "learning_rate": 4.970163823909013e-06, "loss": 0.1431, "step": 542 }, { "epoch": 0.2470427661510464, "grad_norm": 2.2598849919184936, "learning_rate": 4.970053642809748e-06, "loss": 0.1831, "step": 543 }, { "epoch": 0.24749772520473157, "grad_norm": 2.181038873894579, "learning_rate": 4.969943259868853e-06, "loss": 0.1924, "step": 544 }, { "epoch": 0.24795268425841674, "grad_norm": 1.8247639377537164, "learning_rate": 4.969832675095351e-06, "loss": 0.151, "step": 545 }, { "epoch": 0.2484076433121019, "grad_norm": 1.9978374370947616, "learning_rate": 4.969721888498275e-06, "loss": 0.2343, "step": 546 }, { "epoch": 0.24886260236578708, "grad_norm": 2.0040249698932953, "learning_rate": 4.96961090008668e-06, "loss": 0.144, "step": 547 }, { "epoch": 0.24931756141947226, "grad_norm": 1.58491785029609, "learning_rate": 4.969499709869635e-06, "loss": 0.2297, "step": 548 }, { "epoch": 0.24977252047315743, "grad_norm": 1.9099928105281807, "learning_rate": 4.969388317856225e-06, "loss": 0.1643, "step": 549 }, { "epoch": 0.2502274795268426, "grad_norm": 2.506622826362881, "learning_rate": 4.969276724055554e-06, "loss": 0.2302, "step": 550 }, { "epoch": 0.25068243858052774, "grad_norm": 1.886779327578952, "learning_rate": 4.969164928476741e-06, "loss": 0.1305, "step": 551 }, { "epoch": 0.25113739763421294, "grad_norm": 2.193853436337964, "learning_rate": 4.969052931128919e-06, "loss": 0.1942, "step": 552 }, { "epoch": 0.2515923566878981, "grad_norm": 1.696380624819296, "learning_rate": 4.968940732021243e-06, "loss": 0.1757, "step": 553 }, { "epoch": 0.25204731574158323, "grad_norm": 1.9308212907452063, "learning_rate": 4.9688283311628795e-06, "loss": 0.1953, "step": 554 }, { "epoch": 0.25250227479526843, "grad_norm": 2.2015952320833927, "learning_rate": 4.968715728563014e-06, "loss": 0.2188, "step": 555 }, { "epoch": 0.2529572338489536, "grad_norm": 1.8518723960249535, "learning_rate": 4.968602924230847e-06, "loss": 0.1439, "step": 556 }, { "epoch": 0.2534121929026388, "grad_norm": 3.211322079508386, "learning_rate": 4.968489918175598e-06, "loss": 0.1758, "step": 557 }, { "epoch": 0.2538671519563239, "grad_norm": 2.949982147696011, "learning_rate": 4.9683767104065014e-06, "loss": 0.1802, "step": 558 }, { "epoch": 0.2543221110100091, "grad_norm": 2.2092600896288697, "learning_rate": 4.968263300932806e-06, "loss": 0.1898, "step": 559 }, { "epoch": 0.25477707006369427, "grad_norm": 1.7931135921014567, "learning_rate": 4.968149689763781e-06, "loss": 0.1544, "step": 560 }, { "epoch": 0.2552320291173794, "grad_norm": 1.7030840422806155, "learning_rate": 4.968035876908708e-06, "loss": 0.1639, "step": 561 }, { "epoch": 0.2556869881710646, "grad_norm": 1.8718848217622976, "learning_rate": 4.967921862376889e-06, "loss": 0.2434, "step": 562 }, { "epoch": 0.25614194722474976, "grad_norm": 2.2371670340279235, "learning_rate": 4.9678076461776415e-06, "loss": 0.2335, "step": 563 }, { "epoch": 0.25659690627843496, "grad_norm": 1.8393455682211606, "learning_rate": 4.9676932283202965e-06, "loss": 0.1499, "step": 564 }, { "epoch": 0.2570518653321201, "grad_norm": 2.4142531578801387, "learning_rate": 4.967578608814205e-06, "loss": 0.1949, "step": 565 }, { "epoch": 0.2575068243858053, "grad_norm": 2.0642965255799735, "learning_rate": 4.9674637876687345e-06, "loss": 0.1858, "step": 566 }, { "epoch": 0.25796178343949044, "grad_norm": 1.2532956879082058, "learning_rate": 4.967348764893265e-06, "loss": 0.1256, "step": 567 }, { "epoch": 0.2584167424931756, "grad_norm": 2.1919850476807574, "learning_rate": 4.967233540497197e-06, "loss": 0.1554, "step": 568 }, { "epoch": 0.2588717015468608, "grad_norm": 2.1554599148015186, "learning_rate": 4.967118114489946e-06, "loss": 0.2131, "step": 569 }, { "epoch": 0.25932666060054593, "grad_norm": 1.7423629235975449, "learning_rate": 4.967002486880944e-06, "loss": 0.1488, "step": 570 }, { "epoch": 0.25978161965423113, "grad_norm": 2.7181048243188, "learning_rate": 4.966886657679641e-06, "loss": 0.2501, "step": 571 }, { "epoch": 0.2602365787079163, "grad_norm": 1.6717232797306434, "learning_rate": 4.966770626895499e-06, "loss": 0.1664, "step": 572 }, { "epoch": 0.2606915377616015, "grad_norm": 2.1767030645167162, "learning_rate": 4.966654394538002e-06, "loss": 0.1921, "step": 573 }, { "epoch": 0.2611464968152866, "grad_norm": 1.2471699088039891, "learning_rate": 4.966537960616646e-06, "loss": 0.0848, "step": 574 }, { "epoch": 0.26160145586897177, "grad_norm": 2.0523431055962402, "learning_rate": 4.9664213251409486e-06, "loss": 0.2032, "step": 575 }, { "epoch": 0.26205641492265697, "grad_norm": 1.9891959124678449, "learning_rate": 4.9663044881204375e-06, "loss": 0.1962, "step": 576 }, { "epoch": 0.2625113739763421, "grad_norm": 2.0537761706631947, "learning_rate": 4.9661874495646615e-06, "loss": 0.1484, "step": 577 }, { "epoch": 0.2629663330300273, "grad_norm": 1.7414302230897167, "learning_rate": 4.9660702094831845e-06, "loss": 0.1959, "step": 578 }, { "epoch": 0.26342129208371245, "grad_norm": 2.975109707839724, "learning_rate": 4.965952767885587e-06, "loss": 0.215, "step": 579 }, { "epoch": 0.26387625113739765, "grad_norm": 3.1187687651037126, "learning_rate": 4.965835124781465e-06, "loss": 0.2326, "step": 580 }, { "epoch": 0.2643312101910828, "grad_norm": 1.7844067959067744, "learning_rate": 4.965717280180432e-06, "loss": 0.1616, "step": 581 }, { "epoch": 0.26478616924476794, "grad_norm": 1.981807539010698, "learning_rate": 4.965599234092118e-06, "loss": 0.1275, "step": 582 }, { "epoch": 0.26524112829845314, "grad_norm": 2.3418573353915964, "learning_rate": 4.96548098652617e-06, "loss": 0.2029, "step": 583 }, { "epoch": 0.2656960873521383, "grad_norm": 1.9501727944201128, "learning_rate": 4.965362537492249e-06, "loss": 0.1839, "step": 584 }, { "epoch": 0.2661510464058235, "grad_norm": 1.735679302563917, "learning_rate": 4.9652438870000356e-06, "loss": 0.185, "step": 585 }, { "epoch": 0.26660600545950863, "grad_norm": 1.3821743738209817, "learning_rate": 4.965125035059224e-06, "loss": 0.117, "step": 586 }, { "epoch": 0.26706096451319383, "grad_norm": 2.0524973617804196, "learning_rate": 4.965005981679527e-06, "loss": 0.1563, "step": 587 }, { "epoch": 0.267515923566879, "grad_norm": 2.2596791906895395, "learning_rate": 4.964886726870673e-06, "loss": 0.2165, "step": 588 }, { "epoch": 0.2679708826205642, "grad_norm": 1.890432704603994, "learning_rate": 4.964767270642407e-06, "loss": 0.1884, "step": 589 }, { "epoch": 0.2684258416742493, "grad_norm": 1.6149961858038402, "learning_rate": 4.964647613004491e-06, "loss": 0.1353, "step": 590 }, { "epoch": 0.26888080072793447, "grad_norm": 1.7116103543510561, "learning_rate": 4.964527753966702e-06, "loss": 0.1403, "step": 591 }, { "epoch": 0.26933575978161967, "grad_norm": 2.400216438390535, "learning_rate": 4.964407693538834e-06, "loss": 0.1712, "step": 592 }, { "epoch": 0.2697907188353048, "grad_norm": 2.3569276822171012, "learning_rate": 4.9642874317307e-06, "loss": 0.2541, "step": 593 }, { "epoch": 0.27024567788899, "grad_norm": 1.3583233690609127, "learning_rate": 4.964166968552124e-06, "loss": 0.1881, "step": 594 }, { "epoch": 0.27070063694267515, "grad_norm": 2.041956563972623, "learning_rate": 4.9640463040129525e-06, "loss": 0.2013, "step": 595 }, { "epoch": 0.27115559599636035, "grad_norm": 2.1339742915351083, "learning_rate": 4.963925438123044e-06, "loss": 0.1486, "step": 596 }, { "epoch": 0.2716105550500455, "grad_norm": 2.3589739110244947, "learning_rate": 4.963804370892276e-06, "loss": 0.1671, "step": 597 }, { "epoch": 0.27206551410373064, "grad_norm": 2.041024711316621, "learning_rate": 4.9636831023305405e-06, "loss": 0.1773, "step": 598 }, { "epoch": 0.27252047315741584, "grad_norm": 1.6966086145560721, "learning_rate": 4.963561632447748e-06, "loss": 0.1536, "step": 599 }, { "epoch": 0.272975432211101, "grad_norm": 1.7956646862639238, "learning_rate": 4.9634399612538255e-06, "loss": 0.1665, "step": 600 }, { "epoch": 0.2734303912647862, "grad_norm": 2.4039450245635816, "learning_rate": 4.963318088758714e-06, "loss": 0.186, "step": 601 }, { "epoch": 0.27388535031847133, "grad_norm": 2.573374996121704, "learning_rate": 4.963196014972371e-06, "loss": 0.181, "step": 602 }, { "epoch": 0.27434030937215653, "grad_norm": 2.3031446562333158, "learning_rate": 4.963073739904775e-06, "loss": 0.1896, "step": 603 }, { "epoch": 0.2747952684258417, "grad_norm": 2.9296704327439533, "learning_rate": 4.962951263565915e-06, "loss": 0.2168, "step": 604 }, { "epoch": 0.2752502274795268, "grad_norm": 2.3617995527569557, "learning_rate": 4.962828585965801e-06, "loss": 0.1815, "step": 605 }, { "epoch": 0.275705186533212, "grad_norm": 2.1546354601106956, "learning_rate": 4.962705707114457e-06, "loss": 0.1658, "step": 606 }, { "epoch": 0.27616014558689717, "grad_norm": 1.9872717123396686, "learning_rate": 4.962582627021923e-06, "loss": 0.1885, "step": 607 }, { "epoch": 0.27661510464058237, "grad_norm": 2.3902452238732077, "learning_rate": 4.962459345698258e-06, "loss": 0.1934, "step": 608 }, { "epoch": 0.2770700636942675, "grad_norm": 2.6613012891469334, "learning_rate": 4.962335863153537e-06, "loss": 0.2002, "step": 609 }, { "epoch": 0.2775250227479527, "grad_norm": 1.5351443788779375, "learning_rate": 4.962212179397847e-06, "loss": 0.1524, "step": 610 }, { "epoch": 0.27797998180163785, "grad_norm": 1.8149311646504362, "learning_rate": 4.962088294441299e-06, "loss": 0.1091, "step": 611 }, { "epoch": 0.278434940855323, "grad_norm": 1.6923849341814876, "learning_rate": 4.9619642082940135e-06, "loss": 0.2258, "step": 612 }, { "epoch": 0.2788898999090082, "grad_norm": 2.300540388195039, "learning_rate": 4.9618399209661305e-06, "loss": 0.1544, "step": 613 }, { "epoch": 0.27934485896269334, "grad_norm": 2.2841254960366375, "learning_rate": 4.961715432467807e-06, "loss": 0.1537, "step": 614 }, { "epoch": 0.27979981801637854, "grad_norm": 2.1565671846973764, "learning_rate": 4.961590742809216e-06, "loss": 0.1818, "step": 615 }, { "epoch": 0.2802547770700637, "grad_norm": 1.4848634903553593, "learning_rate": 4.961465852000545e-06, "loss": 0.1379, "step": 616 }, { "epoch": 0.2807097361237489, "grad_norm": 2.886386939634882, "learning_rate": 4.961340760052001e-06, "loss": 0.2137, "step": 617 }, { "epoch": 0.28116469517743403, "grad_norm": 2.12342498143493, "learning_rate": 4.961215466973806e-06, "loss": 0.1609, "step": 618 }, { "epoch": 0.2816196542311192, "grad_norm": 1.6272561537794945, "learning_rate": 4.961089972776197e-06, "loss": 0.1704, "step": 619 }, { "epoch": 0.2820746132848044, "grad_norm": 2.177134514236334, "learning_rate": 4.9609642774694285e-06, "loss": 0.1844, "step": 620 }, { "epoch": 0.2825295723384895, "grad_norm": 2.0060823387879396, "learning_rate": 4.960838381063774e-06, "loss": 0.1639, "step": 621 }, { "epoch": 0.2829845313921747, "grad_norm": 2.0396430448753047, "learning_rate": 4.960712283569521e-06, "loss": 0.1832, "step": 622 }, { "epoch": 0.28343949044585987, "grad_norm": 2.1577816713540345, "learning_rate": 4.960585984996971e-06, "loss": 0.1795, "step": 623 }, { "epoch": 0.28389444949954507, "grad_norm": 2.1362683979802997, "learning_rate": 4.960459485356447e-06, "loss": 0.2442, "step": 624 }, { "epoch": 0.2843494085532302, "grad_norm": 1.7854499328292173, "learning_rate": 4.960332784658285e-06, "loss": 0.1461, "step": 625 }, { "epoch": 0.28480436760691535, "grad_norm": 2.1713858060672218, "learning_rate": 4.960205882912839e-06, "loss": 0.1743, "step": 626 }, { "epoch": 0.28525932666060055, "grad_norm": 2.143444693552156, "learning_rate": 4.9600787801304785e-06, "loss": 0.2084, "step": 627 }, { "epoch": 0.2857142857142857, "grad_norm": 1.8522682250986475, "learning_rate": 4.959951476321589e-06, "loss": 0.1946, "step": 628 }, { "epoch": 0.2861692447679709, "grad_norm": 1.5982375639062243, "learning_rate": 4.959823971496575e-06, "loss": 0.1772, "step": 629 }, { "epoch": 0.28662420382165604, "grad_norm": 1.8898991951503732, "learning_rate": 4.959696265665853e-06, "loss": 0.1804, "step": 630 }, { "epoch": 0.28707916287534124, "grad_norm": 1.9040168480447408, "learning_rate": 4.959568358839862e-06, "loss": 0.1258, "step": 631 }, { "epoch": 0.2875341219290264, "grad_norm": 1.8463510477056075, "learning_rate": 4.95944025102905e-06, "loss": 0.1414, "step": 632 }, { "epoch": 0.28798908098271153, "grad_norm": 2.3179780847953055, "learning_rate": 4.959311942243888e-06, "loss": 0.2031, "step": 633 }, { "epoch": 0.28844404003639673, "grad_norm": 1.724174452868963, "learning_rate": 4.95918343249486e-06, "loss": 0.1377, "step": 634 }, { "epoch": 0.2888989990900819, "grad_norm": 1.7281757474887716, "learning_rate": 4.959054721792469e-06, "loss": 0.2074, "step": 635 }, { "epoch": 0.2893539581437671, "grad_norm": 1.749321520269807, "learning_rate": 4.958925810147231e-06, "loss": 0.104, "step": 636 }, { "epoch": 0.2898089171974522, "grad_norm": 1.8727315308914843, "learning_rate": 4.958796697569679e-06, "loss": 0.1325, "step": 637 }, { "epoch": 0.2902638762511374, "grad_norm": 2.800322102970211, "learning_rate": 4.958667384070365e-06, "loss": 0.1583, "step": 638 }, { "epoch": 0.29071883530482256, "grad_norm": 1.7822844611072806, "learning_rate": 4.958537869659855e-06, "loss": 0.2057, "step": 639 }, { "epoch": 0.2911737943585077, "grad_norm": 2.745456907200946, "learning_rate": 4.958408154348734e-06, "loss": 0.1605, "step": 640 }, { "epoch": 0.2916287534121929, "grad_norm": 2.233718920040976, "learning_rate": 4.9582782381476e-06, "loss": 0.1996, "step": 641 }, { "epoch": 0.29208371246587805, "grad_norm": 2.2702620107271567, "learning_rate": 4.958148121067071e-06, "loss": 0.2927, "step": 642 }, { "epoch": 0.29253867151956325, "grad_norm": 2.150177934476292, "learning_rate": 4.9580178031177775e-06, "loss": 0.1949, "step": 643 }, { "epoch": 0.2929936305732484, "grad_norm": 1.4333466510228, "learning_rate": 4.9578872843103694e-06, "loss": 0.1481, "step": 644 }, { "epoch": 0.2934485896269336, "grad_norm": 1.8148623461294702, "learning_rate": 4.957756564655513e-06, "loss": 0.1736, "step": 645 }, { "epoch": 0.29390354868061874, "grad_norm": 1.8574102016300988, "learning_rate": 4.957625644163888e-06, "loss": 0.1893, "step": 646 }, { "epoch": 0.2943585077343039, "grad_norm": 2.0598318825039694, "learning_rate": 4.957494522846194e-06, "loss": 0.1511, "step": 647 }, { "epoch": 0.2948134667879891, "grad_norm": 1.8631745332908447, "learning_rate": 4.957363200713146e-06, "loss": 0.2403, "step": 648 }, { "epoch": 0.29526842584167423, "grad_norm": 1.934970676847201, "learning_rate": 4.957231677775475e-06, "loss": 0.1782, "step": 649 }, { "epoch": 0.29572338489535943, "grad_norm": 2.162311103918465, "learning_rate": 4.957099954043928e-06, "loss": 0.1894, "step": 650 }, { "epoch": 0.2961783439490446, "grad_norm": 1.3750044807559711, "learning_rate": 4.956968029529269e-06, "loss": 0.1948, "step": 651 }, { "epoch": 0.2966333030027298, "grad_norm": 1.7571318861097756, "learning_rate": 4.956835904242277e-06, "loss": 0.1715, "step": 652 }, { "epoch": 0.2970882620564149, "grad_norm": 1.964585802559125, "learning_rate": 4.9567035781937516e-06, "loss": 0.1103, "step": 653 }, { "epoch": 0.29754322111010006, "grad_norm": 1.9039563589608381, "learning_rate": 4.9565710513945024e-06, "loss": 0.1668, "step": 654 }, { "epoch": 0.29799818016378526, "grad_norm": 1.837562224402912, "learning_rate": 4.956438323855362e-06, "loss": 0.129, "step": 655 }, { "epoch": 0.2984531392174704, "grad_norm": 1.7630804326653742, "learning_rate": 4.956305395587174e-06, "loss": 0.1906, "step": 656 }, { "epoch": 0.2989080982711556, "grad_norm": 1.910058814511253, "learning_rate": 4.956172266600802e-06, "loss": 0.124, "step": 657 }, { "epoch": 0.29936305732484075, "grad_norm": 2.2105167684195757, "learning_rate": 4.956038936907125e-06, "loss": 0.1593, "step": 658 }, { "epoch": 0.29981801637852595, "grad_norm": 2.253935685217962, "learning_rate": 4.955905406517036e-06, "loss": 0.1581, "step": 659 }, { "epoch": 0.3002729754322111, "grad_norm": 2.5313373580598424, "learning_rate": 4.95577167544145e-06, "loss": 0.1813, "step": 660 }, { "epoch": 0.30072793448589624, "grad_norm": 2.406722714489674, "learning_rate": 4.955637743691291e-06, "loss": 0.1633, "step": 661 }, { "epoch": 0.30118289353958144, "grad_norm": 2.4238606966439487, "learning_rate": 4.955503611277506e-06, "loss": 0.1917, "step": 662 }, { "epoch": 0.3016378525932666, "grad_norm": 2.6124988273739893, "learning_rate": 4.955369278211055e-06, "loss": 0.2094, "step": 663 }, { "epoch": 0.3020928116469518, "grad_norm": 2.976761995472576, "learning_rate": 4.955234744502914e-06, "loss": 0.1909, "step": 664 }, { "epoch": 0.30254777070063693, "grad_norm": 2.0362637594213053, "learning_rate": 4.955100010164079e-06, "loss": 0.1968, "step": 665 }, { "epoch": 0.30300272975432213, "grad_norm": 1.8717270849356715, "learning_rate": 4.954965075205557e-06, "loss": 0.1612, "step": 666 }, { "epoch": 0.3034576888080073, "grad_norm": 2.4021794148968953, "learning_rate": 4.9548299396383755e-06, "loss": 0.2181, "step": 667 }, { "epoch": 0.3039126478616925, "grad_norm": 2.1388957119580367, "learning_rate": 4.954694603473578e-06, "loss": 0.1692, "step": 668 }, { "epoch": 0.3043676069153776, "grad_norm": 2.1096028848377855, "learning_rate": 4.954559066722222e-06, "loss": 0.204, "step": 669 }, { "epoch": 0.30482256596906276, "grad_norm": 1.9629095047383018, "learning_rate": 4.954423329395385e-06, "loss": 0.1997, "step": 670 }, { "epoch": 0.30527752502274796, "grad_norm": 1.9442418917085225, "learning_rate": 4.954287391504156e-06, "loss": 0.1944, "step": 671 }, { "epoch": 0.3057324840764331, "grad_norm": 2.229272182184504, "learning_rate": 4.9541512530596455e-06, "loss": 0.2029, "step": 672 }, { "epoch": 0.3061874431301183, "grad_norm": 2.080623617831735, "learning_rate": 4.954014914072978e-06, "loss": 0.1881, "step": 673 }, { "epoch": 0.30664240218380345, "grad_norm": 1.3909729404333016, "learning_rate": 4.9538783745552934e-06, "loss": 0.1446, "step": 674 }, { "epoch": 0.30709736123748865, "grad_norm": 2.5204656795127303, "learning_rate": 4.95374163451775e-06, "loss": 0.2251, "step": 675 }, { "epoch": 0.3075523202911738, "grad_norm": 2.8855471273631585, "learning_rate": 4.953604693971521e-06, "loss": 0.1832, "step": 676 }, { "epoch": 0.30800727934485894, "grad_norm": 2.415452060739297, "learning_rate": 4.953467552927798e-06, "loss": 0.188, "step": 677 }, { "epoch": 0.30846223839854414, "grad_norm": 3.3704774970598215, "learning_rate": 4.9533302113977845e-06, "loss": 0.2644, "step": 678 }, { "epoch": 0.3089171974522293, "grad_norm": 3.0964762790397233, "learning_rate": 4.9531926693927055e-06, "loss": 0.1891, "step": 679 }, { "epoch": 0.3093721565059145, "grad_norm": 2.3617921935041646, "learning_rate": 4.953054926923801e-06, "loss": 0.1791, "step": 680 }, { "epoch": 0.30982711555959963, "grad_norm": 2.1015907363587836, "learning_rate": 4.952916984002325e-06, "loss": 0.154, "step": 681 }, { "epoch": 0.31028207461328483, "grad_norm": 2.5909443467360944, "learning_rate": 4.95277884063955e-06, "loss": 0.1758, "step": 682 }, { "epoch": 0.31073703366697, "grad_norm": 1.9161503782177982, "learning_rate": 4.952640496846766e-06, "loss": 0.1883, "step": 683 }, { "epoch": 0.3111919927206551, "grad_norm": 2.2723462143890187, "learning_rate": 4.952501952635276e-06, "loss": 0.1813, "step": 684 }, { "epoch": 0.3116469517743403, "grad_norm": 1.5779544920569608, "learning_rate": 4.952363208016402e-06, "loss": 0.183, "step": 685 }, { "epoch": 0.31210191082802546, "grad_norm": 2.3768180703064834, "learning_rate": 4.952224263001482e-06, "loss": 0.139, "step": 686 }, { "epoch": 0.31255686988171066, "grad_norm": 1.7932474239157794, "learning_rate": 4.952085117601868e-06, "loss": 0.1698, "step": 687 }, { "epoch": 0.3130118289353958, "grad_norm": 2.1109045834120157, "learning_rate": 4.951945771828933e-06, "loss": 0.2482, "step": 688 }, { "epoch": 0.313466787989081, "grad_norm": 1.6399625432585407, "learning_rate": 4.951806225694061e-06, "loss": 0.1809, "step": 689 }, { "epoch": 0.31392174704276615, "grad_norm": 2.610023079079643, "learning_rate": 4.951666479208658e-06, "loss": 0.1964, "step": 690 }, { "epoch": 0.3143767060964513, "grad_norm": 2.574945774612913, "learning_rate": 4.951526532384141e-06, "loss": 0.1827, "step": 691 }, { "epoch": 0.3148316651501365, "grad_norm": 1.8594925752682625, "learning_rate": 4.951386385231946e-06, "loss": 0.1674, "step": 692 }, { "epoch": 0.31528662420382164, "grad_norm": 1.6516261883969883, "learning_rate": 4.951246037763528e-06, "loss": 0.1342, "step": 693 }, { "epoch": 0.31574158325750684, "grad_norm": 1.8608275979712807, "learning_rate": 4.9511054899903524e-06, "loss": 0.1657, "step": 694 }, { "epoch": 0.316196542311192, "grad_norm": 2.3555359764575545, "learning_rate": 4.950964741923905e-06, "loss": 0.2022, "step": 695 }, { "epoch": 0.3166515013648772, "grad_norm": 1.782390866267192, "learning_rate": 4.950823793575688e-06, "loss": 0.1517, "step": 696 }, { "epoch": 0.31710646041856233, "grad_norm": 2.001725151610439, "learning_rate": 4.950682644957218e-06, "loss": 0.1745, "step": 697 }, { "epoch": 0.3175614194722475, "grad_norm": 2.6801559375906585, "learning_rate": 4.9505412960800295e-06, "loss": 0.2196, "step": 698 }, { "epoch": 0.3180163785259327, "grad_norm": 2.0435969601142583, "learning_rate": 4.950399746955673e-06, "loss": 0.1823, "step": 699 }, { "epoch": 0.3184713375796178, "grad_norm": 3.135001392998494, "learning_rate": 4.950257997595716e-06, "loss": 0.1932, "step": 700 }, { "epoch": 0.318926296633303, "grad_norm": 2.3774677479838484, "learning_rate": 4.950116048011739e-06, "loss": 0.1905, "step": 701 }, { "epoch": 0.31938125568698816, "grad_norm": 1.8516165333723722, "learning_rate": 4.949973898215344e-06, "loss": 0.1503, "step": 702 }, { "epoch": 0.31983621474067336, "grad_norm": 2.343561651154435, "learning_rate": 4.949831548218146e-06, "loss": 0.1441, "step": 703 }, { "epoch": 0.3202911737943585, "grad_norm": 1.8104402427163653, "learning_rate": 4.949688998031777e-06, "loss": 0.1558, "step": 704 }, { "epoch": 0.32074613284804365, "grad_norm": 2.144991489680201, "learning_rate": 4.949546247667886e-06, "loss": 0.1305, "step": 705 }, { "epoch": 0.32120109190172885, "grad_norm": 1.8279214675219737, "learning_rate": 4.949403297138137e-06, "loss": 0.1336, "step": 706 }, { "epoch": 0.321656050955414, "grad_norm": 2.3674168986503767, "learning_rate": 4.949260146454212e-06, "loss": 0.1764, "step": 707 }, { "epoch": 0.3221110100090992, "grad_norm": 1.6483989227538907, "learning_rate": 4.94911679562781e-06, "loss": 0.159, "step": 708 }, { "epoch": 0.32256596906278434, "grad_norm": 2.038187279529794, "learning_rate": 4.948973244670643e-06, "loss": 0.1485, "step": 709 }, { "epoch": 0.32302092811646954, "grad_norm": 2.41476196989692, "learning_rate": 4.948829493594441e-06, "loss": 0.2091, "step": 710 }, { "epoch": 0.3234758871701547, "grad_norm": 2.222757795496577, "learning_rate": 4.9486855424109524e-06, "loss": 0.1503, "step": 711 }, { "epoch": 0.32393084622383983, "grad_norm": 1.850862512986181, "learning_rate": 4.948541391131939e-06, "loss": 0.1505, "step": 712 }, { "epoch": 0.32438580527752503, "grad_norm": 2.3940666777003137, "learning_rate": 4.948397039769181e-06, "loss": 0.1578, "step": 713 }, { "epoch": 0.3248407643312102, "grad_norm": 2.0487809609035113, "learning_rate": 4.948252488334474e-06, "loss": 0.1327, "step": 714 }, { "epoch": 0.3252957233848954, "grad_norm": 1.4541195656219779, "learning_rate": 4.948107736839629e-06, "loss": 0.1994, "step": 715 }, { "epoch": 0.3257506824385805, "grad_norm": 1.6302160419859526, "learning_rate": 4.947962785296476e-06, "loss": 0.1665, "step": 716 }, { "epoch": 0.3262056414922657, "grad_norm": 2.761516841692211, "learning_rate": 4.9478176337168594e-06, "loss": 0.1622, "step": 717 }, { "epoch": 0.32666060054595086, "grad_norm": 2.2365611293446865, "learning_rate": 4.9476722821126386e-06, "loss": 0.2191, "step": 718 }, { "epoch": 0.327115559599636, "grad_norm": 2.267629869433733, "learning_rate": 4.9475267304956945e-06, "loss": 0.1608, "step": 719 }, { "epoch": 0.3275705186533212, "grad_norm": 2.8370903035030812, "learning_rate": 4.947380978877917e-06, "loss": 0.2059, "step": 720 }, { "epoch": 0.32802547770700635, "grad_norm": 1.7629045012494435, "learning_rate": 4.947235027271219e-06, "loss": 0.1644, "step": 721 }, { "epoch": 0.32848043676069155, "grad_norm": 1.7514209523720954, "learning_rate": 4.9470888756875265e-06, "loss": 0.1443, "step": 722 }, { "epoch": 0.3289353958143767, "grad_norm": 1.996409560436198, "learning_rate": 4.946942524138782e-06, "loss": 0.1589, "step": 723 }, { "epoch": 0.3293903548680619, "grad_norm": 1.9499597954033492, "learning_rate": 4.946795972636944e-06, "loss": 0.1856, "step": 724 }, { "epoch": 0.32984531392174704, "grad_norm": 1.6935756093459424, "learning_rate": 4.94664922119399e-06, "loss": 0.1866, "step": 725 }, { "epoch": 0.3303002729754322, "grad_norm": 2.2750870343308818, "learning_rate": 4.94650226982191e-06, "loss": 0.1894, "step": 726 }, { "epoch": 0.3307552320291174, "grad_norm": 1.7773678651655342, "learning_rate": 4.9463551185327115e-06, "loss": 0.2623, "step": 727 }, { "epoch": 0.33121019108280253, "grad_norm": 2.3870710697996302, "learning_rate": 4.946207767338422e-06, "loss": 0.1708, "step": 728 }, { "epoch": 0.33166515013648773, "grad_norm": 1.8969974183881673, "learning_rate": 4.9460602162510805e-06, "loss": 0.1758, "step": 729 }, { "epoch": 0.3321201091901729, "grad_norm": 1.9352911073022974, "learning_rate": 4.945912465282744e-06, "loss": 0.1199, "step": 730 }, { "epoch": 0.3325750682438581, "grad_norm": 1.8878423547131853, "learning_rate": 4.945764514445487e-06, "loss": 0.2117, "step": 731 }, { "epoch": 0.3330300272975432, "grad_norm": 2.575730274178936, "learning_rate": 4.9456163637513986e-06, "loss": 0.2044, "step": 732 }, { "epoch": 0.33348498635122836, "grad_norm": 2.7338168638267066, "learning_rate": 4.945468013212585e-06, "loss": 0.2238, "step": 733 }, { "epoch": 0.33393994540491356, "grad_norm": 2.1060940314978702, "learning_rate": 4.945319462841169e-06, "loss": 0.1727, "step": 734 }, { "epoch": 0.3343949044585987, "grad_norm": 1.8942361555213085, "learning_rate": 4.94517071264929e-06, "loss": 0.2168, "step": 735 }, { "epoch": 0.3348498635122839, "grad_norm": 2.455108985215525, "learning_rate": 4.945021762649102e-06, "loss": 0.1525, "step": 736 }, { "epoch": 0.33530482256596905, "grad_norm": 1.8066289984722876, "learning_rate": 4.9448726128527776e-06, "loss": 0.2014, "step": 737 }, { "epoch": 0.33575978161965425, "grad_norm": 2.142750327891088, "learning_rate": 4.944723263272504e-06, "loss": 0.2155, "step": 738 }, { "epoch": 0.3362147406733394, "grad_norm": 2.0611633591265814, "learning_rate": 4.944573713920485e-06, "loss": 0.19, "step": 739 }, { "epoch": 0.33666969972702454, "grad_norm": 1.5473212219148849, "learning_rate": 4.944423964808943e-06, "loss": 0.1829, "step": 740 }, { "epoch": 0.33712465878070974, "grad_norm": 1.7792548638263834, "learning_rate": 4.944274015950113e-06, "loss": 0.1563, "step": 741 }, { "epoch": 0.3375796178343949, "grad_norm": 2.27825782486859, "learning_rate": 4.944123867356249e-06, "loss": 0.1462, "step": 742 }, { "epoch": 0.3380345768880801, "grad_norm": 2.544197436295867, "learning_rate": 4.943973519039619e-06, "loss": 0.23, "step": 743 }, { "epoch": 0.33848953594176523, "grad_norm": 2.1742558484011836, "learning_rate": 4.943822971012511e-06, "loss": 0.1382, "step": 744 }, { "epoch": 0.33894449499545043, "grad_norm": 1.986842417086239, "learning_rate": 4.943672223287226e-06, "loss": 0.1751, "step": 745 }, { "epoch": 0.3393994540491356, "grad_norm": 2.0458092345288144, "learning_rate": 4.9435212758760815e-06, "loss": 0.2008, "step": 746 }, { "epoch": 0.3398544131028208, "grad_norm": 1.3986293648043162, "learning_rate": 4.943370128791413e-06, "loss": 0.1209, "step": 747 }, { "epoch": 0.3403093721565059, "grad_norm": 1.7739101505934052, "learning_rate": 4.943218782045574e-06, "loss": 0.1651, "step": 748 }, { "epoch": 0.34076433121019106, "grad_norm": 2.0878587765611867, "learning_rate": 4.943067235650927e-06, "loss": 0.1705, "step": 749 }, { "epoch": 0.34121929026387626, "grad_norm": 1.7446405914839491, "learning_rate": 4.942915489619859e-06, "loss": 0.1604, "step": 750 }, { "epoch": 0.3416742493175614, "grad_norm": 2.165396057344333, "learning_rate": 4.9427635439647704e-06, "loss": 0.1923, "step": 751 }, { "epoch": 0.3421292083712466, "grad_norm": 1.7166625815039147, "learning_rate": 4.942611398698075e-06, "loss": 0.145, "step": 752 }, { "epoch": 0.34258416742493175, "grad_norm": 1.3978926196223211, "learning_rate": 4.942459053832208e-06, "loss": 0.1246, "step": 753 }, { "epoch": 0.34303912647861695, "grad_norm": 1.5203589407780953, "learning_rate": 4.942306509379617e-06, "loss": 0.1472, "step": 754 }, { "epoch": 0.3434940855323021, "grad_norm": 1.6513608457469287, "learning_rate": 4.942153765352767e-06, "loss": 0.1408, "step": 755 }, { "epoch": 0.34394904458598724, "grad_norm": 1.8035254782552455, "learning_rate": 4.94200082176414e-06, "loss": 0.1474, "step": 756 }, { "epoch": 0.34440400363967244, "grad_norm": 2.1335404521767414, "learning_rate": 4.941847678626234e-06, "loss": 0.1755, "step": 757 }, { "epoch": 0.3448589626933576, "grad_norm": 1.9408426816261404, "learning_rate": 4.941694335951563e-06, "loss": 0.2154, "step": 758 }, { "epoch": 0.3453139217470428, "grad_norm": 1.749049542240047, "learning_rate": 4.9415407937526575e-06, "loss": 0.1482, "step": 759 }, { "epoch": 0.34576888080072793, "grad_norm": 2.2747218478213598, "learning_rate": 4.9413870520420635e-06, "loss": 0.2213, "step": 760 }, { "epoch": 0.34622383985441313, "grad_norm": 1.9679998520100659, "learning_rate": 4.941233110832346e-06, "loss": 0.1482, "step": 761 }, { "epoch": 0.3466787989080983, "grad_norm": 2.7634133318079135, "learning_rate": 4.941078970136082e-06, "loss": 0.1649, "step": 762 }, { "epoch": 0.3471337579617834, "grad_norm": 1.4323163769051608, "learning_rate": 4.940924629965869e-06, "loss": 0.152, "step": 763 }, { "epoch": 0.3475887170154686, "grad_norm": 2.269381697045094, "learning_rate": 4.940770090334319e-06, "loss": 0.1446, "step": 764 }, { "epoch": 0.34804367606915376, "grad_norm": 1.8723783038369444, "learning_rate": 4.940615351254059e-06, "loss": 0.1142, "step": 765 }, { "epoch": 0.34849863512283896, "grad_norm": 1.8076648915776874, "learning_rate": 4.940460412737734e-06, "loss": 0.1944, "step": 766 }, { "epoch": 0.3489535941765241, "grad_norm": 2.080159914413928, "learning_rate": 4.940305274798005e-06, "loss": 0.1582, "step": 767 }, { "epoch": 0.3494085532302093, "grad_norm": 2.330746693235809, "learning_rate": 4.940149937447549e-06, "loss": 0.2007, "step": 768 }, { "epoch": 0.34986351228389445, "grad_norm": 1.536246049438816, "learning_rate": 4.939994400699061e-06, "loss": 0.1408, "step": 769 }, { "epoch": 0.3503184713375796, "grad_norm": 2.2894795215614994, "learning_rate": 4.939838664565248e-06, "loss": 0.1837, "step": 770 }, { "epoch": 0.3507734303912648, "grad_norm": 1.6850122967374852, "learning_rate": 4.939682729058839e-06, "loss": 0.1289, "step": 771 }, { "epoch": 0.35122838944494994, "grad_norm": 1.446339812351698, "learning_rate": 4.939526594192574e-06, "loss": 0.1329, "step": 772 }, { "epoch": 0.35168334849863514, "grad_norm": 1.776973239663882, "learning_rate": 4.939370259979213e-06, "loss": 0.1178, "step": 773 }, { "epoch": 0.3521383075523203, "grad_norm": 2.818513132709455, "learning_rate": 4.9392137264315295e-06, "loss": 0.2364, "step": 774 }, { "epoch": 0.3525932666060055, "grad_norm": 1.6041796316256967, "learning_rate": 4.939056993562316e-06, "loss": 0.2278, "step": 775 }, { "epoch": 0.35304822565969063, "grad_norm": 2.268295214561187, "learning_rate": 4.9389000613843805e-06, "loss": 0.1499, "step": 776 }, { "epoch": 0.3535031847133758, "grad_norm": 2.531973358561036, "learning_rate": 4.938742929910546e-06, "loss": 0.1743, "step": 777 }, { "epoch": 0.353958143767061, "grad_norm": 1.3677758044070074, "learning_rate": 4.938585599153652e-06, "loss": 0.1351, "step": 778 }, { "epoch": 0.3544131028207461, "grad_norm": 2.4047975606277947, "learning_rate": 4.938428069126555e-06, "loss": 0.1951, "step": 779 }, { "epoch": 0.3548680618744313, "grad_norm": 1.6598587480853697, "learning_rate": 4.9382703398421285e-06, "loss": 0.1602, "step": 780 }, { "epoch": 0.35532302092811646, "grad_norm": 2.501614606596268, "learning_rate": 4.938112411313261e-06, "loss": 0.193, "step": 781 }, { "epoch": 0.35577797998180166, "grad_norm": 1.7808472248973335, "learning_rate": 4.937954283552858e-06, "loss": 0.1322, "step": 782 }, { "epoch": 0.3562329390354868, "grad_norm": 2.397821173092958, "learning_rate": 4.93779595657384e-06, "loss": 0.1819, "step": 783 }, { "epoch": 0.35668789808917195, "grad_norm": 2.0407668064122495, "learning_rate": 4.937637430389145e-06, "loss": 0.1722, "step": 784 }, { "epoch": 0.35714285714285715, "grad_norm": 1.9792096843409923, "learning_rate": 4.937478705011729e-06, "loss": 0.1349, "step": 785 }, { "epoch": 0.3575978161965423, "grad_norm": 1.5581979975977567, "learning_rate": 4.937319780454559e-06, "loss": 0.1891, "step": 786 }, { "epoch": 0.3580527752502275, "grad_norm": 1.3563862115066228, "learning_rate": 4.937160656730625e-06, "loss": 0.1622, "step": 787 }, { "epoch": 0.35850773430391264, "grad_norm": 1.7874560137459294, "learning_rate": 4.9370013338529274e-06, "loss": 0.1606, "step": 788 }, { "epoch": 0.35896269335759784, "grad_norm": 1.695354030268494, "learning_rate": 4.936841811834486e-06, "loss": 0.1725, "step": 789 }, { "epoch": 0.359417652411283, "grad_norm": 1.5018417297722055, "learning_rate": 4.936682090688337e-06, "loss": 0.1568, "step": 790 }, { "epoch": 0.35987261146496813, "grad_norm": 2.3008919876499276, "learning_rate": 4.936522170427531e-06, "loss": 0.1607, "step": 791 }, { "epoch": 0.36032757051865333, "grad_norm": 2.145424436631978, "learning_rate": 4.936362051065136e-06, "loss": 0.136, "step": 792 }, { "epoch": 0.3607825295723385, "grad_norm": 2.023227990902717, "learning_rate": 4.936201732614238e-06, "loss": 0.1568, "step": 793 }, { "epoch": 0.3612374886260237, "grad_norm": 1.8119576330565363, "learning_rate": 4.9360412150879355e-06, "loss": 0.1291, "step": 794 }, { "epoch": 0.3616924476797088, "grad_norm": 1.904733745689391, "learning_rate": 4.935880498499346e-06, "loss": 0.1262, "step": 795 }, { "epoch": 0.362147406733394, "grad_norm": 2.1050139123506235, "learning_rate": 4.935719582861604e-06, "loss": 0.2027, "step": 796 }, { "epoch": 0.36260236578707916, "grad_norm": 1.5866289163873395, "learning_rate": 4.935558468187855e-06, "loss": 0.1713, "step": 797 }, { "epoch": 0.3630573248407643, "grad_norm": 2.266843952674795, "learning_rate": 4.935397154491268e-06, "loss": 0.1881, "step": 798 }, { "epoch": 0.3635122838944495, "grad_norm": 1.9774458028018125, "learning_rate": 4.935235641785023e-06, "loss": 0.1837, "step": 799 }, { "epoch": 0.36396724294813465, "grad_norm": 2.1853087729094796, "learning_rate": 4.935073930082319e-06, "loss": 0.176, "step": 800 }, { "epoch": 0.36442220200181985, "grad_norm": 2.525766342273085, "learning_rate": 4.93491201939637e-06, "loss": 0.2015, "step": 801 }, { "epoch": 0.364877161055505, "grad_norm": 2.5055378214905843, "learning_rate": 4.934749909740408e-06, "loss": 0.1961, "step": 802 }, { "epoch": 0.3653321201091902, "grad_norm": 2.0645024314881035, "learning_rate": 4.934587601127677e-06, "loss": 0.1644, "step": 803 }, { "epoch": 0.36578707916287534, "grad_norm": 2.0158906472533373, "learning_rate": 4.934425093571442e-06, "loss": 0.1911, "step": 804 }, { "epoch": 0.3662420382165605, "grad_norm": 2.284162498710454, "learning_rate": 4.934262387084984e-06, "loss": 0.2008, "step": 805 }, { "epoch": 0.3666969972702457, "grad_norm": 2.0973583334570547, "learning_rate": 4.934099481681595e-06, "loss": 0.1557, "step": 806 }, { "epoch": 0.36715195632393083, "grad_norm": 2.2021201797945356, "learning_rate": 4.933936377374589e-06, "loss": 0.1524, "step": 807 }, { "epoch": 0.36760691537761603, "grad_norm": 1.806808877742582, "learning_rate": 4.933773074177293e-06, "loss": 0.1738, "step": 808 }, { "epoch": 0.3680618744313012, "grad_norm": 2.2345290767527386, "learning_rate": 4.933609572103053e-06, "loss": 0.1442, "step": 809 }, { "epoch": 0.3685168334849864, "grad_norm": 1.9706491037079354, "learning_rate": 4.933445871165229e-06, "loss": 0.2354, "step": 810 }, { "epoch": 0.3689717925386715, "grad_norm": 2.404773980417632, "learning_rate": 4.933281971377197e-06, "loss": 0.1719, "step": 811 }, { "epoch": 0.36942675159235666, "grad_norm": 1.720683846457796, "learning_rate": 4.933117872752352e-06, "loss": 0.1914, "step": 812 }, { "epoch": 0.36988171064604186, "grad_norm": 2.532410934524191, "learning_rate": 4.932953575304102e-06, "loss": 0.2144, "step": 813 }, { "epoch": 0.370336669699727, "grad_norm": 5.0403677379252425, "learning_rate": 4.932789079045873e-06, "loss": 0.2595, "step": 814 }, { "epoch": 0.3707916287534122, "grad_norm": 2.0578633523076437, "learning_rate": 4.932624383991106e-06, "loss": 0.1739, "step": 815 }, { "epoch": 0.37124658780709735, "grad_norm": 1.9986709520957122, "learning_rate": 4.9324594901532605e-06, "loss": 0.1838, "step": 816 }, { "epoch": 0.37170154686078255, "grad_norm": 1.7217394600458333, "learning_rate": 4.93229439754581e-06, "loss": 0.1579, "step": 817 }, { "epoch": 0.3721565059144677, "grad_norm": 2.1321573080305813, "learning_rate": 4.932129106182246e-06, "loss": 0.1926, "step": 818 }, { "epoch": 0.37261146496815284, "grad_norm": 2.793277438622436, "learning_rate": 4.931963616076075e-06, "loss": 0.2136, "step": 819 }, { "epoch": 0.37306642402183804, "grad_norm": 1.7394149868487567, "learning_rate": 4.93179792724082e-06, "loss": 0.128, "step": 820 }, { "epoch": 0.3735213830755232, "grad_norm": 1.82657006763275, "learning_rate": 4.9316320396900195e-06, "loss": 0.17, "step": 821 }, { "epoch": 0.3739763421292084, "grad_norm": 1.823894210494748, "learning_rate": 4.9314659534372305e-06, "loss": 0.1981, "step": 822 }, { "epoch": 0.37443130118289353, "grad_norm": 2.3539272175568775, "learning_rate": 4.931299668496024e-06, "loss": 0.1576, "step": 823 }, { "epoch": 0.37488626023657873, "grad_norm": 2.5070798015414666, "learning_rate": 4.931133184879988e-06, "loss": 0.1886, "step": 824 }, { "epoch": 0.37534121929026387, "grad_norm": 2.008848059538202, "learning_rate": 4.930966502602727e-06, "loss": 0.1605, "step": 825 }, { "epoch": 0.37579617834394907, "grad_norm": 2.407030934613122, "learning_rate": 4.930799621677862e-06, "loss": 0.1802, "step": 826 }, { "epoch": 0.3762511373976342, "grad_norm": 1.8420833153352183, "learning_rate": 4.93063254211903e-06, "loss": 0.1641, "step": 827 }, { "epoch": 0.37670609645131936, "grad_norm": 2.159279850858488, "learning_rate": 4.930465263939882e-06, "loss": 0.1669, "step": 828 }, { "epoch": 0.37716105550500456, "grad_norm": 2.5834296367648477, "learning_rate": 4.9302977871540894e-06, "loss": 0.2047, "step": 829 }, { "epoch": 0.3776160145586897, "grad_norm": 1.6914011736844907, "learning_rate": 4.930130111775336e-06, "loss": 0.1153, "step": 830 }, { "epoch": 0.3780709736123749, "grad_norm": 1.9730937065718759, "learning_rate": 4.9299622378173245e-06, "loss": 0.1321, "step": 831 }, { "epoch": 0.37852593266606005, "grad_norm": 4.287590804185868, "learning_rate": 4.929794165293773e-06, "loss": 0.2942, "step": 832 }, { "epoch": 0.37898089171974525, "grad_norm": 1.3332137290220585, "learning_rate": 4.9296258942184145e-06, "loss": 0.1089, "step": 833 }, { "epoch": 0.3794358507734304, "grad_norm": 1.5962116643063975, "learning_rate": 4.929457424605e-06, "loss": 0.1214, "step": 834 }, { "epoch": 0.37989080982711554, "grad_norm": 2.022957256898634, "learning_rate": 4.929288756467296e-06, "loss": 0.1853, "step": 835 }, { "epoch": 0.38034576888080074, "grad_norm": 2.4282570688213863, "learning_rate": 4.929119889819086e-06, "loss": 0.1873, "step": 836 }, { "epoch": 0.3808007279344859, "grad_norm": 2.9395172179458, "learning_rate": 4.928950824674169e-06, "loss": 0.2634, "step": 837 }, { "epoch": 0.3812556869881711, "grad_norm": 2.2127049960140335, "learning_rate": 4.928781561046359e-06, "loss": 0.1663, "step": 838 }, { "epoch": 0.3817106460418562, "grad_norm": 2.536562760970294, "learning_rate": 4.928612098949488e-06, "loss": 0.2011, "step": 839 }, { "epoch": 0.3821656050955414, "grad_norm": 2.1855699037821514, "learning_rate": 4.9284424383974026e-06, "loss": 0.1794, "step": 840 }, { "epoch": 0.38262056414922657, "grad_norm": 1.6332101956979397, "learning_rate": 4.928272579403969e-06, "loss": 0.1279, "step": 841 }, { "epoch": 0.3830755232029117, "grad_norm": 1.5663751127122882, "learning_rate": 4.928102521983067e-06, "loss": 0.1985, "step": 842 }, { "epoch": 0.3835304822565969, "grad_norm": 2.4747913159024195, "learning_rate": 4.9279322661485906e-06, "loss": 0.244, "step": 843 }, { "epoch": 0.38398544131028206, "grad_norm": 1.9419499604147055, "learning_rate": 4.927761811914455e-06, "loss": 0.1996, "step": 844 }, { "epoch": 0.38444040036396726, "grad_norm": 1.8790570447198083, "learning_rate": 4.927591159294587e-06, "loss": 0.1746, "step": 845 }, { "epoch": 0.3848953594176524, "grad_norm": 3.2586686346278992, "learning_rate": 4.927420308302933e-06, "loss": 0.2099, "step": 846 }, { "epoch": 0.3853503184713376, "grad_norm": 1.8912381154957223, "learning_rate": 4.927249258953454e-06, "loss": 0.2159, "step": 847 }, { "epoch": 0.38580527752502275, "grad_norm": 2.5636879906209242, "learning_rate": 4.927078011260126e-06, "loss": 0.2142, "step": 848 }, { "epoch": 0.3862602365787079, "grad_norm": 2.2557014215101794, "learning_rate": 4.926906565236943e-06, "loss": 0.2158, "step": 849 }, { "epoch": 0.3867151956323931, "grad_norm": 2.0433651062149076, "learning_rate": 4.926734920897916e-06, "loss": 0.1564, "step": 850 }, { "epoch": 0.38717015468607824, "grad_norm": 1.1448398063326757, "learning_rate": 4.926563078257071e-06, "loss": 0.1274, "step": 851 }, { "epoch": 0.38762511373976344, "grad_norm": 1.5601081736798879, "learning_rate": 4.926391037328448e-06, "loss": 0.1742, "step": 852 }, { "epoch": 0.3880800727934486, "grad_norm": 1.735106713842307, "learning_rate": 4.926218798126108e-06, "loss": 0.17, "step": 853 }, { "epoch": 0.3885350318471338, "grad_norm": 1.8524828246659681, "learning_rate": 4.926046360664124e-06, "loss": 0.1359, "step": 854 }, { "epoch": 0.3889899909008189, "grad_norm": 1.8327900649742344, "learning_rate": 4.925873724956588e-06, "loss": 0.1276, "step": 855 }, { "epoch": 0.38944494995450407, "grad_norm": 1.7997603613849789, "learning_rate": 4.9257008910176065e-06, "loss": 0.236, "step": 856 }, { "epoch": 0.38989990900818927, "grad_norm": 2.1973741478380893, "learning_rate": 4.925527858861302e-06, "loss": 0.1935, "step": 857 }, { "epoch": 0.3903548680618744, "grad_norm": 2.086365440068575, "learning_rate": 4.925354628501814e-06, "loss": 0.1652, "step": 858 }, { "epoch": 0.3908098271155596, "grad_norm": 1.8116013889379734, "learning_rate": 4.925181199953299e-06, "loss": 0.1612, "step": 859 }, { "epoch": 0.39126478616924476, "grad_norm": 1.9247913507109833, "learning_rate": 4.9250075732299285e-06, "loss": 0.1588, "step": 860 }, { "epoch": 0.39171974522292996, "grad_norm": 2.514293103428901, "learning_rate": 4.92483374834589e-06, "loss": 0.19, "step": 861 }, { "epoch": 0.3921747042766151, "grad_norm": 2.0316288184050024, "learning_rate": 4.9246597253153884e-06, "loss": 0.1831, "step": 862 }, { "epoch": 0.39262966333030025, "grad_norm": 1.5754846974100747, "learning_rate": 4.924485504152644e-06, "loss": 0.1466, "step": 863 }, { "epoch": 0.39308462238398545, "grad_norm": 2.1731555902481685, "learning_rate": 4.924311084871892e-06, "loss": 0.1937, "step": 864 }, { "epoch": 0.3935395814376706, "grad_norm": 1.5966819404904389, "learning_rate": 4.924136467487387e-06, "loss": 0.1251, "step": 865 }, { "epoch": 0.3939945404913558, "grad_norm": 1.8663994781934827, "learning_rate": 4.923961652013397e-06, "loss": 0.1523, "step": 866 }, { "epoch": 0.39444949954504094, "grad_norm": 2.1002789601399257, "learning_rate": 4.923786638464207e-06, "loss": 0.2129, "step": 867 }, { "epoch": 0.39490445859872614, "grad_norm": 2.081418128383539, "learning_rate": 4.9236114268541196e-06, "loss": 0.1437, "step": 868 }, { "epoch": 0.3953594176524113, "grad_norm": 2.447658119106072, "learning_rate": 4.923436017197451e-06, "loss": 0.201, "step": 869 }, { "epoch": 0.3958143767060964, "grad_norm": 1.7750379508150516, "learning_rate": 4.923260409508535e-06, "loss": 0.1282, "step": 870 }, { "epoch": 0.3962693357597816, "grad_norm": 1.6418670453366244, "learning_rate": 4.9230846038017214e-06, "loss": 0.2087, "step": 871 }, { "epoch": 0.39672429481346677, "grad_norm": 1.7770417360691049, "learning_rate": 4.922908600091378e-06, "loss": 0.1372, "step": 872 }, { "epoch": 0.39717925386715197, "grad_norm": 1.690148135895664, "learning_rate": 4.9227323983918835e-06, "loss": 0.1855, "step": 873 }, { "epoch": 0.3976342129208371, "grad_norm": 1.5404851420453596, "learning_rate": 4.922555998717639e-06, "loss": 0.1398, "step": 874 }, { "epoch": 0.3980891719745223, "grad_norm": 2.1706268320484328, "learning_rate": 4.922379401083058e-06, "loss": 0.1486, "step": 875 }, { "epoch": 0.39854413102820746, "grad_norm": 3.0077672507475786, "learning_rate": 4.922202605502573e-06, "loss": 0.2077, "step": 876 }, { "epoch": 0.3989990900818926, "grad_norm": 1.5486893349846256, "learning_rate": 4.922025611990629e-06, "loss": 0.1604, "step": 877 }, { "epoch": 0.3994540491355778, "grad_norm": 1.8667533652947603, "learning_rate": 4.92184842056169e-06, "loss": 0.1722, "step": 878 }, { "epoch": 0.39990900818926295, "grad_norm": 2.289002791626951, "learning_rate": 4.921671031230235e-06, "loss": 0.1647, "step": 879 }, { "epoch": 0.40036396724294815, "grad_norm": 1.8286186193347604, "learning_rate": 4.921493444010759e-06, "loss": 0.1773, "step": 880 }, { "epoch": 0.4008189262966333, "grad_norm": 1.8147441438330003, "learning_rate": 4.921315658917774e-06, "loss": 0.1711, "step": 881 }, { "epoch": 0.4012738853503185, "grad_norm": 2.00913911322474, "learning_rate": 4.921137675965809e-06, "loss": 0.1263, "step": 882 }, { "epoch": 0.40172884440400364, "grad_norm": 1.3862791101345426, "learning_rate": 4.920959495169406e-06, "loss": 0.1465, "step": 883 }, { "epoch": 0.4021838034576888, "grad_norm": 2.4187567327639234, "learning_rate": 4.920781116543126e-06, "loss": 0.2198, "step": 884 }, { "epoch": 0.402638762511374, "grad_norm": 1.6465776945830464, "learning_rate": 4.920602540101546e-06, "loss": 0.1309, "step": 885 }, { "epoch": 0.4030937215650591, "grad_norm": 2.6312019573375682, "learning_rate": 4.920423765859257e-06, "loss": 0.1948, "step": 886 }, { "epoch": 0.4035486806187443, "grad_norm": 1.9940911601496167, "learning_rate": 4.920244793830869e-06, "loss": 0.1657, "step": 887 }, { "epoch": 0.40400363967242947, "grad_norm": 1.9526243984241491, "learning_rate": 4.920065624031006e-06, "loss": 0.1616, "step": 888 }, { "epoch": 0.40445859872611467, "grad_norm": 1.5338098697837441, "learning_rate": 4.919886256474309e-06, "loss": 0.1512, "step": 889 }, { "epoch": 0.4049135577797998, "grad_norm": 2.0468687722376773, "learning_rate": 4.919706691175435e-06, "loss": 0.1701, "step": 890 }, { "epoch": 0.40536851683348496, "grad_norm": 2.200436787943407, "learning_rate": 4.919526928149058e-06, "loss": 0.2293, "step": 891 }, { "epoch": 0.40582347588717016, "grad_norm": 1.8050882174330405, "learning_rate": 4.919346967409867e-06, "loss": 0.1602, "step": 892 }, { "epoch": 0.4062784349408553, "grad_norm": 1.7135594043707498, "learning_rate": 4.919166808972567e-06, "loss": 0.2064, "step": 893 }, { "epoch": 0.4067333939945405, "grad_norm": 2.612056409341394, "learning_rate": 4.918986452851881e-06, "loss": 0.1668, "step": 894 }, { "epoch": 0.40718835304822565, "grad_norm": 2.016673285347467, "learning_rate": 4.918805899062545e-06, "loss": 0.1925, "step": 895 }, { "epoch": 0.40764331210191085, "grad_norm": 1.4000022926360023, "learning_rate": 4.9186251476193146e-06, "loss": 0.1592, "step": 896 }, { "epoch": 0.408098271155596, "grad_norm": 1.60492731991447, "learning_rate": 4.918444198536959e-06, "loss": 0.1731, "step": 897 }, { "epoch": 0.40855323020928114, "grad_norm": 1.673902690478855, "learning_rate": 4.918263051830267e-06, "loss": 0.1228, "step": 898 }, { "epoch": 0.40900818926296634, "grad_norm": 2.6755237129572484, "learning_rate": 4.918081707514037e-06, "loss": 0.1409, "step": 899 }, { "epoch": 0.4094631483166515, "grad_norm": 1.9078463274657658, "learning_rate": 4.917900165603091e-06, "loss": 0.1276, "step": 900 }, { "epoch": 0.4099181073703367, "grad_norm": 2.234681815409533, "learning_rate": 4.9177184261122624e-06, "loss": 0.1652, "step": 901 }, { "epoch": 0.4103730664240218, "grad_norm": 2.839831167960225, "learning_rate": 4.917536489056402e-06, "loss": 0.1798, "step": 902 }, { "epoch": 0.410828025477707, "grad_norm": 2.010867770048541, "learning_rate": 4.9173543544503775e-06, "loss": 0.1154, "step": 903 }, { "epoch": 0.41128298453139217, "grad_norm": 2.08218098114304, "learning_rate": 4.917172022309072e-06, "loss": 0.1455, "step": 904 }, { "epoch": 0.41173794358507737, "grad_norm": 1.7302162150410665, "learning_rate": 4.916989492647385e-06, "loss": 0.1193, "step": 905 }, { "epoch": 0.4121929026387625, "grad_norm": 1.5485580925696725, "learning_rate": 4.916806765480231e-06, "loss": 0.0922, "step": 906 }, { "epoch": 0.41264786169244766, "grad_norm": 1.7263185607767098, "learning_rate": 4.9166238408225416e-06, "loss": 0.2167, "step": 907 }, { "epoch": 0.41310282074613286, "grad_norm": 1.9178320379998328, "learning_rate": 4.916440718689267e-06, "loss": 0.1554, "step": 908 }, { "epoch": 0.413557779799818, "grad_norm": 1.8197306174687815, "learning_rate": 4.916257399095369e-06, "loss": 0.1474, "step": 909 }, { "epoch": 0.4140127388535032, "grad_norm": 1.7449499320119561, "learning_rate": 4.916073882055827e-06, "loss": 0.1327, "step": 910 }, { "epoch": 0.41446769790718835, "grad_norm": 2.4422880124371646, "learning_rate": 4.91589016758564e-06, "loss": 0.1937, "step": 911 }, { "epoch": 0.41492265696087355, "grad_norm": 1.6511138034689814, "learning_rate": 4.915706255699817e-06, "loss": 0.1363, "step": 912 }, { "epoch": 0.4153776160145587, "grad_norm": 2.143275165439444, "learning_rate": 4.915522146413389e-06, "loss": 0.2735, "step": 913 }, { "epoch": 0.41583257506824384, "grad_norm": 1.924782534095729, "learning_rate": 4.9153378397413985e-06, "loss": 0.1751, "step": 914 }, { "epoch": 0.41628753412192904, "grad_norm": 1.951438348175618, "learning_rate": 4.915153335698908e-06, "loss": 0.1619, "step": 915 }, { "epoch": 0.4167424931756142, "grad_norm": 2.2127088657857548, "learning_rate": 4.914968634300994e-06, "loss": 0.2147, "step": 916 }, { "epoch": 0.4171974522292994, "grad_norm": 1.6061838128612729, "learning_rate": 4.914783735562748e-06, "loss": 0.1499, "step": 917 }, { "epoch": 0.4176524112829845, "grad_norm": 1.4285312675375041, "learning_rate": 4.914598639499281e-06, "loss": 0.1583, "step": 918 }, { "epoch": 0.4181073703366697, "grad_norm": 1.6360040253886021, "learning_rate": 4.914413346125717e-06, "loss": 0.1066, "step": 919 }, { "epoch": 0.41856232939035487, "grad_norm": 2.343895109900456, "learning_rate": 4.914227855457199e-06, "loss": 0.1823, "step": 920 }, { "epoch": 0.41901728844404, "grad_norm": 2.318188728357057, "learning_rate": 4.914042167508881e-06, "loss": 0.1437, "step": 921 }, { "epoch": 0.4194722474977252, "grad_norm": 2.3202387804341336, "learning_rate": 4.9138562822959416e-06, "loss": 0.1589, "step": 922 }, { "epoch": 0.41992720655141036, "grad_norm": 2.608072279082345, "learning_rate": 4.913670199833566e-06, "loss": 0.1851, "step": 923 }, { "epoch": 0.42038216560509556, "grad_norm": 2.181253773511138, "learning_rate": 4.913483920136961e-06, "loss": 0.1756, "step": 924 }, { "epoch": 0.4208371246587807, "grad_norm": 2.211521150780038, "learning_rate": 4.91329744322135e-06, "loss": 0.1732, "step": 925 }, { "epoch": 0.4212920837124659, "grad_norm": 1.812598878243348, "learning_rate": 4.913110769101971e-06, "loss": 0.166, "step": 926 }, { "epoch": 0.42174704276615105, "grad_norm": 2.205776388483361, "learning_rate": 4.912923897794077e-06, "loss": 0.1771, "step": 927 }, { "epoch": 0.4222020018198362, "grad_norm": 1.423655928174165, "learning_rate": 4.912736829312938e-06, "loss": 0.1489, "step": 928 }, { "epoch": 0.4226569608735214, "grad_norm": 1.831805612119293, "learning_rate": 4.912549563673842e-06, "loss": 0.168, "step": 929 }, { "epoch": 0.42311191992720654, "grad_norm": 1.4699738850406474, "learning_rate": 4.912362100892091e-06, "loss": 0.1674, "step": 930 }, { "epoch": 0.42356687898089174, "grad_norm": 1.9047547244636083, "learning_rate": 4.912174440983002e-06, "loss": 0.1639, "step": 931 }, { "epoch": 0.4240218380345769, "grad_norm": 2.0520314286066843, "learning_rate": 4.911986583961912e-06, "loss": 0.2138, "step": 932 }, { "epoch": 0.4244767970882621, "grad_norm": 2.5542601480975278, "learning_rate": 4.91179852984417e-06, "loss": 0.2276, "step": 933 }, { "epoch": 0.4249317561419472, "grad_norm": 1.5302053494447614, "learning_rate": 4.911610278645144e-06, "loss": 0.1489, "step": 934 }, { "epoch": 0.42538671519563237, "grad_norm": 1.7414787617118297, "learning_rate": 4.911421830380217e-06, "loss": 0.1182, "step": 935 }, { "epoch": 0.42584167424931757, "grad_norm": 1.7429292851594573, "learning_rate": 4.911233185064788e-06, "loss": 0.2064, "step": 936 }, { "epoch": 0.4262966333030027, "grad_norm": 2.3105951171285968, "learning_rate": 4.911044342714272e-06, "loss": 0.1405, "step": 937 }, { "epoch": 0.4267515923566879, "grad_norm": 1.779382452074537, "learning_rate": 4.9108553033440995e-06, "loss": 0.1291, "step": 938 }, { "epoch": 0.42720655141037306, "grad_norm": 1.7957846625134024, "learning_rate": 4.91066606696972e-06, "loss": 0.1647, "step": 939 }, { "epoch": 0.42766151046405826, "grad_norm": 2.3261521372348057, "learning_rate": 4.910476633606597e-06, "loss": 0.1927, "step": 940 }, { "epoch": 0.4281164695177434, "grad_norm": 1.9153006743556071, "learning_rate": 4.9102870032702075e-06, "loss": 0.1584, "step": 941 }, { "epoch": 0.42857142857142855, "grad_norm": 1.8033286854373174, "learning_rate": 4.910097175976049e-06, "loss": 0.1825, "step": 942 }, { "epoch": 0.42902638762511375, "grad_norm": 2.8388348591880597, "learning_rate": 4.909907151739634e-06, "loss": 0.2113, "step": 943 }, { "epoch": 0.4294813466787989, "grad_norm": 2.6244899475003813, "learning_rate": 4.909716930576489e-06, "loss": 0.1704, "step": 944 }, { "epoch": 0.4299363057324841, "grad_norm": 2.112585064442849, "learning_rate": 4.909526512502158e-06, "loss": 0.1589, "step": 945 }, { "epoch": 0.43039126478616924, "grad_norm": 2.289068427554651, "learning_rate": 4.9093358975322025e-06, "loss": 0.1714, "step": 946 }, { "epoch": 0.43084622383985444, "grad_norm": 2.5327374827374065, "learning_rate": 4.909145085682198e-06, "loss": 0.2278, "step": 947 }, { "epoch": 0.4313011828935396, "grad_norm": 2.1519696726150315, "learning_rate": 4.908954076967737e-06, "loss": 0.1561, "step": 948 }, { "epoch": 0.4317561419472247, "grad_norm": 2.3965497202736485, "learning_rate": 4.908762871404427e-06, "loss": 0.2721, "step": 949 }, { "epoch": 0.4322111010009099, "grad_norm": 1.7303730946554432, "learning_rate": 4.908571469007893e-06, "loss": 0.1886, "step": 950 }, { "epoch": 0.43266606005459507, "grad_norm": 1.867974683826286, "learning_rate": 4.908379869793776e-06, "loss": 0.1621, "step": 951 }, { "epoch": 0.43312101910828027, "grad_norm": 2.0573077802321134, "learning_rate": 4.908188073777732e-06, "loss": 0.1897, "step": 952 }, { "epoch": 0.4335759781619654, "grad_norm": 1.4532292026282405, "learning_rate": 4.9079960809754334e-06, "loss": 0.1729, "step": 953 }, { "epoch": 0.4340309372156506, "grad_norm": 1.962539816890548, "learning_rate": 4.90780389140257e-06, "loss": 0.1301, "step": 954 }, { "epoch": 0.43448589626933576, "grad_norm": 2.4468234331381677, "learning_rate": 4.907611505074846e-06, "loss": 0.1709, "step": 955 }, { "epoch": 0.4349408553230209, "grad_norm": 2.666497869750462, "learning_rate": 4.907418922007983e-06, "loss": 0.1628, "step": 956 }, { "epoch": 0.4353958143767061, "grad_norm": 2.2137035827801226, "learning_rate": 4.907226142217717e-06, "loss": 0.1353, "step": 957 }, { "epoch": 0.43585077343039125, "grad_norm": 2.572062185697332, "learning_rate": 4.9070331657198015e-06, "loss": 0.1745, "step": 958 }, { "epoch": 0.43630573248407645, "grad_norm": 2.405655176153194, "learning_rate": 4.906839992530006e-06, "loss": 0.2171, "step": 959 }, { "epoch": 0.4367606915377616, "grad_norm": 1.836795075502022, "learning_rate": 4.906646622664115e-06, "loss": 0.168, "step": 960 }, { "epoch": 0.4372156505914468, "grad_norm": 2.166035033805183, "learning_rate": 4.906453056137931e-06, "loss": 0.1223, "step": 961 }, { "epoch": 0.43767060964513194, "grad_norm": 2.072717194766617, "learning_rate": 4.90625929296727e-06, "loss": 0.2248, "step": 962 }, { "epoch": 0.4381255686988171, "grad_norm": 1.8024426189806846, "learning_rate": 4.9060653331679665e-06, "loss": 0.1956, "step": 963 }, { "epoch": 0.4385805277525023, "grad_norm": 1.8368071839220441, "learning_rate": 4.90587117675587e-06, "loss": 0.1601, "step": 964 }, { "epoch": 0.4390354868061874, "grad_norm": 1.6602305730067044, "learning_rate": 4.905676823746846e-06, "loss": 0.1433, "step": 965 }, { "epoch": 0.4394904458598726, "grad_norm": 1.2991365263950634, "learning_rate": 4.9054822741567745e-06, "loss": 0.1361, "step": 966 }, { "epoch": 0.43994540491355777, "grad_norm": 2.1130749414647463, "learning_rate": 4.905287528001555e-06, "loss": 0.145, "step": 967 }, { "epoch": 0.44040036396724297, "grad_norm": 1.8646843859502422, "learning_rate": 4.905092585297102e-06, "loss": 0.1685, "step": 968 }, { "epoch": 0.4408553230209281, "grad_norm": 2.1749982799245693, "learning_rate": 4.904897446059344e-06, "loss": 0.1621, "step": 969 }, { "epoch": 0.44131028207461326, "grad_norm": 2.334038135097662, "learning_rate": 4.9047021103042255e-06, "loss": 0.1486, "step": 970 }, { "epoch": 0.44176524112829846, "grad_norm": 2.600358800525879, "learning_rate": 4.904506578047712e-06, "loss": 0.1603, "step": 971 }, { "epoch": 0.4422202001819836, "grad_norm": 2.0684781990731436, "learning_rate": 4.9043108493057785e-06, "loss": 0.1708, "step": 972 }, { "epoch": 0.4426751592356688, "grad_norm": 2.0255722402037852, "learning_rate": 4.904114924094421e-06, "loss": 0.1608, "step": 973 }, { "epoch": 0.44313011828935395, "grad_norm": 2.1489494601016434, "learning_rate": 4.903918802429648e-06, "loss": 0.1829, "step": 974 }, { "epoch": 0.44358507734303915, "grad_norm": 1.787442464619014, "learning_rate": 4.9037224843274875e-06, "loss": 0.2043, "step": 975 }, { "epoch": 0.4440400363967243, "grad_norm": 2.343300114421743, "learning_rate": 4.903525969803979e-06, "loss": 0.2699, "step": 976 }, { "epoch": 0.44449499545040944, "grad_norm": 1.865479334461903, "learning_rate": 4.903329258875184e-06, "loss": 0.1195, "step": 977 }, { "epoch": 0.44494995450409464, "grad_norm": 1.9494468159837486, "learning_rate": 4.903132351557175e-06, "loss": 0.1465, "step": 978 }, { "epoch": 0.4454049135577798, "grad_norm": 2.502406890844037, "learning_rate": 4.902935247866043e-06, "loss": 0.1378, "step": 979 }, { "epoch": 0.445859872611465, "grad_norm": 2.036041143606274, "learning_rate": 4.9027379478178935e-06, "loss": 0.1483, "step": 980 }, { "epoch": 0.4463148316651501, "grad_norm": 1.3077265314607576, "learning_rate": 4.90254045142885e-06, "loss": 0.0969, "step": 981 }, { "epoch": 0.4467697907188353, "grad_norm": 2.0861883133616828, "learning_rate": 4.90234275871505e-06, "loss": 0.1392, "step": 982 }, { "epoch": 0.44722474977252047, "grad_norm": 2.5093809608609274, "learning_rate": 4.9021448696926486e-06, "loss": 0.1743, "step": 983 }, { "epoch": 0.44767970882620567, "grad_norm": 1.575875578739379, "learning_rate": 4.901946784377816e-06, "loss": 0.176, "step": 984 }, { "epoch": 0.4481346678798908, "grad_norm": 1.5356501213932587, "learning_rate": 4.90174850278674e-06, "loss": 0.1484, "step": 985 }, { "epoch": 0.44858962693357596, "grad_norm": 1.823863525681817, "learning_rate": 4.901550024935623e-06, "loss": 0.1854, "step": 986 }, { "epoch": 0.44904458598726116, "grad_norm": 1.3758352509840184, "learning_rate": 4.901351350840683e-06, "loss": 0.1349, "step": 987 }, { "epoch": 0.4494995450409463, "grad_norm": 2.0693941858838762, "learning_rate": 4.901152480518155e-06, "loss": 0.1663, "step": 988 }, { "epoch": 0.4499545040946315, "grad_norm": 1.8873877263165615, "learning_rate": 4.900953413984289e-06, "loss": 0.1692, "step": 989 }, { "epoch": 0.45040946314831665, "grad_norm": 1.4776284855591897, "learning_rate": 4.900754151255353e-06, "loss": 0.1971, "step": 990 }, { "epoch": 0.45086442220200185, "grad_norm": 2.5191235263020912, "learning_rate": 4.9005546923476305e-06, "loss": 0.1998, "step": 991 }, { "epoch": 0.451319381255687, "grad_norm": 1.8842919796768522, "learning_rate": 4.9003550372774185e-06, "loss": 0.1399, "step": 992 }, { "epoch": 0.45177434030937214, "grad_norm": 2.063855552138974, "learning_rate": 4.900155186061033e-06, "loss": 0.1526, "step": 993 }, { "epoch": 0.45222929936305734, "grad_norm": 2.367561517511786, "learning_rate": 4.8999551387148045e-06, "loss": 0.1599, "step": 994 }, { "epoch": 0.4526842584167425, "grad_norm": 1.898969473092516, "learning_rate": 4.89975489525508e-06, "loss": 0.1902, "step": 995 }, { "epoch": 0.4531392174704277, "grad_norm": 1.8129578397632808, "learning_rate": 4.899554455698223e-06, "loss": 0.1693, "step": 996 }, { "epoch": 0.4535941765241128, "grad_norm": 1.8699568695488074, "learning_rate": 4.899353820060612e-06, "loss": 0.1581, "step": 997 }, { "epoch": 0.454049135577798, "grad_norm": 1.7239980533667612, "learning_rate": 4.899152988358643e-06, "loss": 0.2098, "step": 998 }, { "epoch": 0.45450409463148317, "grad_norm": 1.8097885043847937, "learning_rate": 4.898951960608725e-06, "loss": 0.1715, "step": 999 }, { "epoch": 0.4549590536851683, "grad_norm": 1.8523553420273773, "learning_rate": 4.8987507368272865e-06, "loss": 0.16, "step": 1000 }, { "epoch": 0.4554140127388535, "grad_norm": 2.0000127792736904, "learning_rate": 4.898549317030772e-06, "loss": 0.1632, "step": 1001 }, { "epoch": 0.45586897179253866, "grad_norm": 1.862660132529776, "learning_rate": 4.898347701235637e-06, "loss": 0.1465, "step": 1002 }, { "epoch": 0.45632393084622386, "grad_norm": 1.7361264176719555, "learning_rate": 4.89814588945836e-06, "loss": 0.1869, "step": 1003 }, { "epoch": 0.456778889899909, "grad_norm": 1.978059539176156, "learning_rate": 4.89794388171543e-06, "loss": 0.1659, "step": 1004 }, { "epoch": 0.4572338489535942, "grad_norm": 2.2207578653400906, "learning_rate": 4.897741678023356e-06, "loss": 0.1939, "step": 1005 }, { "epoch": 0.45768880800727935, "grad_norm": 2.103052599683253, "learning_rate": 4.897539278398659e-06, "loss": 0.1812, "step": 1006 }, { "epoch": 0.4581437670609645, "grad_norm": 2.546107708354434, "learning_rate": 4.8973366828578804e-06, "loss": 0.2054, "step": 1007 }, { "epoch": 0.4585987261146497, "grad_norm": 1.9562513052044435, "learning_rate": 4.897133891417574e-06, "loss": 0.1693, "step": 1008 }, { "epoch": 0.45905368516833484, "grad_norm": 2.5635809078172103, "learning_rate": 4.896930904094311e-06, "loss": 0.1689, "step": 1009 }, { "epoch": 0.45950864422202004, "grad_norm": 2.401849938137445, "learning_rate": 4.896727720904679e-06, "loss": 0.1731, "step": 1010 }, { "epoch": 0.4599636032757052, "grad_norm": 1.3521913269323886, "learning_rate": 4.896524341865282e-06, "loss": 0.0961, "step": 1011 }, { "epoch": 0.4604185623293904, "grad_norm": 1.773432887084503, "learning_rate": 4.896320766992737e-06, "loss": 0.1875, "step": 1012 }, { "epoch": 0.4608735213830755, "grad_norm": 1.7325101393470637, "learning_rate": 4.896116996303682e-06, "loss": 0.1534, "step": 1013 }, { "epoch": 0.46132848043676067, "grad_norm": 1.8711913127871913, "learning_rate": 4.895913029814766e-06, "loss": 0.1476, "step": 1014 }, { "epoch": 0.46178343949044587, "grad_norm": 1.98409281551755, "learning_rate": 4.895708867542658e-06, "loss": 0.2099, "step": 1015 }, { "epoch": 0.462238398544131, "grad_norm": 1.835471556122073, "learning_rate": 4.895504509504039e-06, "loss": 0.141, "step": 1016 }, { "epoch": 0.4626933575978162, "grad_norm": 1.7126193650485422, "learning_rate": 4.89529995571561e-06, "loss": 0.1569, "step": 1017 }, { "epoch": 0.46314831665150136, "grad_norm": 1.5756476134085153, "learning_rate": 4.895095206194086e-06, "loss": 0.1599, "step": 1018 }, { "epoch": 0.46360327570518656, "grad_norm": 1.6305833927339777, "learning_rate": 4.894890260956198e-06, "loss": 0.1266, "step": 1019 }, { "epoch": 0.4640582347588717, "grad_norm": 2.8915138386415107, "learning_rate": 4.8946851200186925e-06, "loss": 0.1983, "step": 1020 }, { "epoch": 0.46451319381255685, "grad_norm": 2.2750148686402873, "learning_rate": 4.894479783398334e-06, "loss": 0.2161, "step": 1021 }, { "epoch": 0.46496815286624205, "grad_norm": 1.901328095270706, "learning_rate": 4.8942742511119004e-06, "loss": 0.2033, "step": 1022 }, { "epoch": 0.4654231119199272, "grad_norm": 3.2947250275495747, "learning_rate": 4.894068523176187e-06, "loss": 0.258, "step": 1023 }, { "epoch": 0.4658780709736124, "grad_norm": 1.9323682134416058, "learning_rate": 4.8938625996080056e-06, "loss": 0.1788, "step": 1024 }, { "epoch": 0.46633303002729753, "grad_norm": 1.6185621650651296, "learning_rate": 4.893656480424184e-06, "loss": 0.1651, "step": 1025 }, { "epoch": 0.46678798908098273, "grad_norm": 2.2508459323489, "learning_rate": 4.893450165641564e-06, "loss": 0.1558, "step": 1026 }, { "epoch": 0.4672429481346679, "grad_norm": 1.213648480980067, "learning_rate": 4.893243655277005e-06, "loss": 0.1507, "step": 1027 }, { "epoch": 0.467697907188353, "grad_norm": 2.1023746671368513, "learning_rate": 4.893036949347383e-06, "loss": 0.1721, "step": 1028 }, { "epoch": 0.4681528662420382, "grad_norm": 2.278948598326534, "learning_rate": 4.892830047869588e-06, "loss": 0.1884, "step": 1029 }, { "epoch": 0.46860782529572337, "grad_norm": 2.9197430954616683, "learning_rate": 4.892622950860527e-06, "loss": 0.1741, "step": 1030 }, { "epoch": 0.46906278434940857, "grad_norm": 1.2879852782085728, "learning_rate": 4.892415658337123e-06, "loss": 0.1342, "step": 1031 }, { "epoch": 0.4695177434030937, "grad_norm": 2.0909020173909973, "learning_rate": 4.892208170316317e-06, "loss": 0.1907, "step": 1032 }, { "epoch": 0.4699727024567789, "grad_norm": 2.0408884938878957, "learning_rate": 4.892000486815062e-06, "loss": 0.1553, "step": 1033 }, { "epoch": 0.47042766151046406, "grad_norm": 1.8109063186030263, "learning_rate": 4.891792607850328e-06, "loss": 0.154, "step": 1034 }, { "epoch": 0.4708826205641492, "grad_norm": 2.2630304525012126, "learning_rate": 4.891584533439104e-06, "loss": 0.2079, "step": 1035 }, { "epoch": 0.4713375796178344, "grad_norm": 1.539332632871382, "learning_rate": 4.891376263598393e-06, "loss": 0.1432, "step": 1036 }, { "epoch": 0.47179253867151955, "grad_norm": 1.7957849792578133, "learning_rate": 4.891167798345213e-06, "loss": 0.1511, "step": 1037 }, { "epoch": 0.47224749772520475, "grad_norm": 2.741729093401805, "learning_rate": 4.890959137696598e-06, "loss": 0.2263, "step": 1038 }, { "epoch": 0.4727024567788899, "grad_norm": 1.7348700401664916, "learning_rate": 4.890750281669601e-06, "loss": 0.1298, "step": 1039 }, { "epoch": 0.4731574158325751, "grad_norm": 1.7001320886150055, "learning_rate": 4.890541230281287e-06, "loss": 0.1168, "step": 1040 }, { "epoch": 0.47361237488626023, "grad_norm": 1.8500860192841622, "learning_rate": 4.8903319835487385e-06, "loss": 0.1644, "step": 1041 }, { "epoch": 0.4740673339399454, "grad_norm": 1.8582195164199888, "learning_rate": 4.890122541489056e-06, "loss": 0.2426, "step": 1042 }, { "epoch": 0.4745222929936306, "grad_norm": 1.2923171102528221, "learning_rate": 4.889912904119353e-06, "loss": 0.165, "step": 1043 }, { "epoch": 0.4749772520473157, "grad_norm": 2.2842684826182778, "learning_rate": 4.88970307145676e-06, "loss": 0.1853, "step": 1044 }, { "epoch": 0.4754322111010009, "grad_norm": 1.8277244050495731, "learning_rate": 4.889493043518423e-06, "loss": 0.2139, "step": 1045 }, { "epoch": 0.47588717015468607, "grad_norm": 2.021142913969995, "learning_rate": 4.889282820321506e-06, "loss": 0.1312, "step": 1046 }, { "epoch": 0.47634212920837127, "grad_norm": 1.8896361944599618, "learning_rate": 4.889072401883187e-06, "loss": 0.224, "step": 1047 }, { "epoch": 0.4767970882620564, "grad_norm": 1.552692831396847, "learning_rate": 4.88886178822066e-06, "loss": 0.1772, "step": 1048 }, { "epoch": 0.47725204731574156, "grad_norm": 1.8340975280187983, "learning_rate": 4.888650979351136e-06, "loss": 0.1702, "step": 1049 }, { "epoch": 0.47770700636942676, "grad_norm": 2.1830448534590547, "learning_rate": 4.888439975291841e-06, "loss": 0.2436, "step": 1050 }, { "epoch": 0.4781619654231119, "grad_norm": 1.6348401870707816, "learning_rate": 4.888228776060017e-06, "loss": 0.1926, "step": 1051 }, { "epoch": 0.4786169244767971, "grad_norm": 1.7078513906709398, "learning_rate": 4.888017381672923e-06, "loss": 0.1601, "step": 1052 }, { "epoch": 0.47907188353048225, "grad_norm": 2.240745720528745, "learning_rate": 4.887805792147832e-06, "loss": 0.1766, "step": 1053 }, { "epoch": 0.47952684258416745, "grad_norm": 2.428487112277442, "learning_rate": 4.887594007502036e-06, "loss": 0.1789, "step": 1054 }, { "epoch": 0.4799818016378526, "grad_norm": 2.1865518873285645, "learning_rate": 4.887382027752838e-06, "loss": 0.199, "step": 1055 }, { "epoch": 0.48043676069153773, "grad_norm": 1.898629261883439, "learning_rate": 4.8871698529175636e-06, "loss": 0.1665, "step": 1056 }, { "epoch": 0.48089171974522293, "grad_norm": 1.7954561311174488, "learning_rate": 4.886957483013549e-06, "loss": 0.1812, "step": 1057 }, { "epoch": 0.4813466787989081, "grad_norm": 1.8221114015246185, "learning_rate": 4.886744918058149e-06, "loss": 0.2063, "step": 1058 }, { "epoch": 0.4818016378525933, "grad_norm": 2.7770081846232544, "learning_rate": 4.886532158068732e-06, "loss": 0.2088, "step": 1059 }, { "epoch": 0.4822565969062784, "grad_norm": 2.1115268373643477, "learning_rate": 4.886319203062683e-06, "loss": 0.1444, "step": 1060 }, { "epoch": 0.4827115559599636, "grad_norm": 2.071172717908372, "learning_rate": 4.886106053057408e-06, "loss": 0.1661, "step": 1061 }, { "epoch": 0.48316651501364877, "grad_norm": 2.2607152479196104, "learning_rate": 4.88589270807032e-06, "loss": 0.1859, "step": 1062 }, { "epoch": 0.48362147406733397, "grad_norm": 1.692360966817902, "learning_rate": 4.885679168118855e-06, "loss": 0.1864, "step": 1063 }, { "epoch": 0.4840764331210191, "grad_norm": 1.7710659763891554, "learning_rate": 4.8854654332204635e-06, "loss": 0.1646, "step": 1064 }, { "epoch": 0.48453139217470426, "grad_norm": 1.9598218562809384, "learning_rate": 4.885251503392607e-06, "loss": 0.1402, "step": 1065 }, { "epoch": 0.48498635122838946, "grad_norm": 2.1204508830789988, "learning_rate": 4.885037378652771e-06, "loss": 0.1891, "step": 1066 }, { "epoch": 0.4854413102820746, "grad_norm": 2.3589815655452653, "learning_rate": 4.884823059018451e-06, "loss": 0.1555, "step": 1067 }, { "epoch": 0.4858962693357598, "grad_norm": 2.5392202747520245, "learning_rate": 4.88460854450716e-06, "loss": 0.192, "step": 1068 }, { "epoch": 0.48635122838944495, "grad_norm": 2.3012454866986833, "learning_rate": 4.884393835136427e-06, "loss": 0.2073, "step": 1069 }, { "epoch": 0.48680618744313015, "grad_norm": 1.7363057272250617, "learning_rate": 4.884178930923799e-06, "loss": 0.1909, "step": 1070 }, { "epoch": 0.4872611464968153, "grad_norm": 2.5682234171797638, "learning_rate": 4.883963831886834e-06, "loss": 0.2505, "step": 1071 }, { "epoch": 0.48771610555050043, "grad_norm": 2.1085560059563435, "learning_rate": 4.8837485380431115e-06, "loss": 0.1713, "step": 1072 }, { "epoch": 0.48817106460418563, "grad_norm": 1.8882533184752026, "learning_rate": 4.883533049410223e-06, "loss": 0.1602, "step": 1073 }, { "epoch": 0.4886260236578708, "grad_norm": 2.9321383026683985, "learning_rate": 4.8833173660057785e-06, "loss": 0.2554, "step": 1074 }, { "epoch": 0.489080982711556, "grad_norm": 2.531195131930091, "learning_rate": 4.8831014878474004e-06, "loss": 0.1797, "step": 1075 }, { "epoch": 0.4895359417652411, "grad_norm": 1.9044052944051695, "learning_rate": 4.882885414952732e-06, "loss": 0.1738, "step": 1076 }, { "epoch": 0.4899909008189263, "grad_norm": 1.8646399638677997, "learning_rate": 4.882669147339428e-06, "loss": 0.123, "step": 1077 }, { "epoch": 0.49044585987261147, "grad_norm": 1.6244921355768605, "learning_rate": 4.882452685025161e-06, "loss": 0.1207, "step": 1078 }, { "epoch": 0.4909008189262966, "grad_norm": 2.6418064094824625, "learning_rate": 4.88223602802762e-06, "loss": 0.1651, "step": 1079 }, { "epoch": 0.4913557779799818, "grad_norm": 1.7280688771591737, "learning_rate": 4.882019176364509e-06, "loss": 0.1654, "step": 1080 }, { "epoch": 0.49181073703366696, "grad_norm": 1.688117545561323, "learning_rate": 4.881802130053548e-06, "loss": 0.1779, "step": 1081 }, { "epoch": 0.49226569608735216, "grad_norm": 1.9343631314892762, "learning_rate": 4.881584889112473e-06, "loss": 0.1378, "step": 1082 }, { "epoch": 0.4927206551410373, "grad_norm": 2.0445775883054194, "learning_rate": 4.881367453559036e-06, "loss": 0.1945, "step": 1083 }, { "epoch": 0.4931756141947225, "grad_norm": 2.0708720739438835, "learning_rate": 4.881149823411005e-06, "loss": 0.155, "step": 1084 }, { "epoch": 0.49363057324840764, "grad_norm": 1.8016295656127952, "learning_rate": 4.880931998686162e-06, "loss": 0.1374, "step": 1085 }, { "epoch": 0.4940855323020928, "grad_norm": 1.8010911071848295, "learning_rate": 4.880713979402311e-06, "loss": 0.2764, "step": 1086 }, { "epoch": 0.494540491355778, "grad_norm": 2.2201593715577945, "learning_rate": 4.880495765577263e-06, "loss": 0.1785, "step": 1087 }, { "epoch": 0.49499545040946313, "grad_norm": 2.5150440926935183, "learning_rate": 4.880277357228852e-06, "loss": 0.1415, "step": 1088 }, { "epoch": 0.49545040946314833, "grad_norm": 1.4882801876169178, "learning_rate": 4.880058754374923e-06, "loss": 0.1528, "step": 1089 }, { "epoch": 0.4959053685168335, "grad_norm": 1.9307317316728292, "learning_rate": 4.879839957033343e-06, "loss": 0.1661, "step": 1090 }, { "epoch": 0.4963603275705187, "grad_norm": 1.6645987589280862, "learning_rate": 4.879620965221987e-06, "loss": 0.1058, "step": 1091 }, { "epoch": 0.4968152866242038, "grad_norm": 1.1436431770468727, "learning_rate": 4.879401778958755e-06, "loss": 0.0867, "step": 1092 }, { "epoch": 0.49727024567788897, "grad_norm": 2.072303030104995, "learning_rate": 4.8791823982615525e-06, "loss": 0.1454, "step": 1093 }, { "epoch": 0.49772520473157417, "grad_norm": 1.4026343543836923, "learning_rate": 4.878962823148308e-06, "loss": 0.1176, "step": 1094 }, { "epoch": 0.4981801637852593, "grad_norm": 2.4971931111745795, "learning_rate": 4.878743053636968e-06, "loss": 0.2058, "step": 1095 }, { "epoch": 0.4986351228389445, "grad_norm": 1.9096703970153857, "learning_rate": 4.878523089745485e-06, "loss": 0.2389, "step": 1096 }, { "epoch": 0.49909008189262966, "grad_norm": 1.7150797344948416, "learning_rate": 4.878302931491837e-06, "loss": 0.1408, "step": 1097 }, { "epoch": 0.49954504094631486, "grad_norm": 1.8467538410779647, "learning_rate": 4.8780825788940145e-06, "loss": 0.1212, "step": 1098 }, { "epoch": 0.5, "grad_norm": 1.6653067013861202, "learning_rate": 4.877862031970023e-06, "loss": 0.165, "step": 1099 }, { "epoch": 0.5004549590536852, "grad_norm": 2.7780746831303866, "learning_rate": 4.8776412907378845e-06, "loss": 0.1959, "step": 1100 }, { "epoch": 0.5009099181073703, "grad_norm": 2.735253037622924, "learning_rate": 4.877420355215637e-06, "loss": 0.1853, "step": 1101 }, { "epoch": 0.5013648771610555, "grad_norm": 2.2801170489693474, "learning_rate": 4.877199225421334e-06, "loss": 0.223, "step": 1102 }, { "epoch": 0.5018198362147407, "grad_norm": 1.5121408545649673, "learning_rate": 4.8769779013730454e-06, "loss": 0.1766, "step": 1103 }, { "epoch": 0.5022747952684259, "grad_norm": 1.2736560565952975, "learning_rate": 4.876756383088858e-06, "loss": 0.1147, "step": 1104 }, { "epoch": 0.502729754322111, "grad_norm": 2.234019869097899, "learning_rate": 4.876534670586872e-06, "loss": 0.1353, "step": 1105 }, { "epoch": 0.5031847133757962, "grad_norm": 2.231499299533909, "learning_rate": 4.8763127638852045e-06, "loss": 0.1542, "step": 1106 }, { "epoch": 0.5036396724294814, "grad_norm": 1.8302676611966564, "learning_rate": 4.87609066300199e-06, "loss": 0.1494, "step": 1107 }, { "epoch": 0.5040946314831665, "grad_norm": 2.4154877250923157, "learning_rate": 4.875868367955376e-06, "loss": 0.1937, "step": 1108 }, { "epoch": 0.5045495905368517, "grad_norm": 2.752908247632549, "learning_rate": 4.87564587876353e-06, "loss": 0.2127, "step": 1109 }, { "epoch": 0.5050045495905369, "grad_norm": 1.718053996922888, "learning_rate": 4.87542319544463e-06, "loss": 0.1702, "step": 1110 }, { "epoch": 0.5054595086442221, "grad_norm": 1.5702103077744012, "learning_rate": 4.875200318016873e-06, "loss": 0.1566, "step": 1111 }, { "epoch": 0.5059144676979072, "grad_norm": 2.0381911393844825, "learning_rate": 4.8749772464984736e-06, "loss": 0.2017, "step": 1112 }, { "epoch": 0.5063694267515924, "grad_norm": 1.8176309130216741, "learning_rate": 4.874753980907658e-06, "loss": 0.1864, "step": 1113 }, { "epoch": 0.5068243858052776, "grad_norm": 2.1308929915187753, "learning_rate": 4.8745305212626714e-06, "loss": 0.1726, "step": 1114 }, { "epoch": 0.5072793448589626, "grad_norm": 1.8139775978694637, "learning_rate": 4.874306867581775e-06, "loss": 0.1761, "step": 1115 }, { "epoch": 0.5077343039126478, "grad_norm": 1.7183373875600083, "learning_rate": 4.874083019883242e-06, "loss": 0.1333, "step": 1116 }, { "epoch": 0.508189262966333, "grad_norm": 1.8665339958095688, "learning_rate": 4.873858978185367e-06, "loss": 0.1932, "step": 1117 }, { "epoch": 0.5086442220200182, "grad_norm": 2.352764145779797, "learning_rate": 4.8736347425064565e-06, "loss": 0.2031, "step": 1118 }, { "epoch": 0.5090991810737033, "grad_norm": 2.678329346866304, "learning_rate": 4.873410312864833e-06, "loss": 0.214, "step": 1119 }, { "epoch": 0.5095541401273885, "grad_norm": 2.350844354697721, "learning_rate": 4.8731856892788384e-06, "loss": 0.2144, "step": 1120 }, { "epoch": 0.5100090991810737, "grad_norm": 1.9729175722269603, "learning_rate": 4.872960871766826e-06, "loss": 0.2081, "step": 1121 }, { "epoch": 0.5104640582347588, "grad_norm": 1.4433063314456696, "learning_rate": 4.8727358603471675e-06, "loss": 0.1703, "step": 1122 }, { "epoch": 0.510919017288444, "grad_norm": 2.5283375453779704, "learning_rate": 4.872510655038249e-06, "loss": 0.1536, "step": 1123 }, { "epoch": 0.5113739763421292, "grad_norm": 1.3858205152408392, "learning_rate": 4.872285255858476e-06, "loss": 0.1458, "step": 1124 }, { "epoch": 0.5118289353958144, "grad_norm": 2.0487135879281024, "learning_rate": 4.872059662826263e-06, "loss": 0.1661, "step": 1125 }, { "epoch": 0.5122838944494995, "grad_norm": 1.9472322837633822, "learning_rate": 4.8718338759600465e-06, "loss": 0.1786, "step": 1126 }, { "epoch": 0.5127388535031847, "grad_norm": 1.6310032173739817, "learning_rate": 4.871607895278278e-06, "loss": 0.1626, "step": 1127 }, { "epoch": 0.5131938125568699, "grad_norm": 1.985456014356635, "learning_rate": 4.871381720799421e-06, "loss": 0.1702, "step": 1128 }, { "epoch": 0.513648771610555, "grad_norm": 2.402370971493488, "learning_rate": 4.8711553525419595e-06, "loss": 0.194, "step": 1129 }, { "epoch": 0.5141037306642402, "grad_norm": 1.5072231886887353, "learning_rate": 4.87092879052439e-06, "loss": 0.1573, "step": 1130 }, { "epoch": 0.5145586897179254, "grad_norm": 1.449165092170168, "learning_rate": 4.8707020347652275e-06, "loss": 0.1246, "step": 1131 }, { "epoch": 0.5150136487716106, "grad_norm": 1.8608730682993475, "learning_rate": 4.870475085283001e-06, "loss": 0.1831, "step": 1132 }, { "epoch": 0.5154686078252957, "grad_norm": 1.9183857631670505, "learning_rate": 4.870247942096254e-06, "loss": 0.1638, "step": 1133 }, { "epoch": 0.5159235668789809, "grad_norm": 1.6563135044982633, "learning_rate": 4.870020605223551e-06, "loss": 0.1421, "step": 1134 }, { "epoch": 0.5163785259326661, "grad_norm": 1.7995838783709266, "learning_rate": 4.869793074683466e-06, "loss": 0.147, "step": 1135 }, { "epoch": 0.5168334849863512, "grad_norm": 1.9548719996118153, "learning_rate": 4.8695653504945925e-06, "loss": 0.1575, "step": 1136 }, { "epoch": 0.5172884440400364, "grad_norm": 1.7522375644775081, "learning_rate": 4.8693374326755405e-06, "loss": 0.1495, "step": 1137 }, { "epoch": 0.5177434030937216, "grad_norm": 1.3062327753186558, "learning_rate": 4.869109321244932e-06, "loss": 0.116, "step": 1138 }, { "epoch": 0.5181983621474068, "grad_norm": 1.8868647769132803, "learning_rate": 4.86888101622141e-06, "loss": 0.1794, "step": 1139 }, { "epoch": 0.5186533212010919, "grad_norm": 1.8158313749710562, "learning_rate": 4.868652517623629e-06, "loss": 0.1391, "step": 1140 }, { "epoch": 0.5191082802547771, "grad_norm": 1.8111217984491637, "learning_rate": 4.86842382547026e-06, "loss": 0.1494, "step": 1141 }, { "epoch": 0.5195632393084623, "grad_norm": 2.8090775733585835, "learning_rate": 4.868194939779992e-06, "loss": 0.1896, "step": 1142 }, { "epoch": 0.5200181983621474, "grad_norm": 1.9497550190765165, "learning_rate": 4.867965860571529e-06, "loss": 0.1552, "step": 1143 }, { "epoch": 0.5204731574158326, "grad_norm": 1.990627059765444, "learning_rate": 4.867736587863589e-06, "loss": 0.2094, "step": 1144 }, { "epoch": 0.5209281164695178, "grad_norm": 2.247771495871837, "learning_rate": 4.867507121674907e-06, "loss": 0.2391, "step": 1145 }, { "epoch": 0.521383075523203, "grad_norm": 2.120187054464733, "learning_rate": 4.867277462024235e-06, "loss": 0.1775, "step": 1146 }, { "epoch": 0.521838034576888, "grad_norm": 1.7774801845384391, "learning_rate": 4.8670476089303395e-06, "loss": 0.2129, "step": 1147 }, { "epoch": 0.5222929936305732, "grad_norm": 1.5308807746672268, "learning_rate": 4.866817562412003e-06, "loss": 0.2109, "step": 1148 }, { "epoch": 0.5227479526842584, "grad_norm": 1.219763540490379, "learning_rate": 4.866587322488024e-06, "loss": 0.1529, "step": 1149 }, { "epoch": 0.5232029117379435, "grad_norm": 1.63359106412129, "learning_rate": 4.866356889177216e-06, "loss": 0.1663, "step": 1150 }, { "epoch": 0.5236578707916287, "grad_norm": 1.4722036099751108, "learning_rate": 4.866126262498409e-06, "loss": 0.1727, "step": 1151 }, { "epoch": 0.5241128298453139, "grad_norm": 2.4915301409486172, "learning_rate": 4.865895442470449e-06, "loss": 0.1966, "step": 1152 }, { "epoch": 0.5245677888989991, "grad_norm": 1.5523199921916622, "learning_rate": 4.865664429112199e-06, "loss": 0.1451, "step": 1153 }, { "epoch": 0.5250227479526842, "grad_norm": 2.0323520596343627, "learning_rate": 4.8654332224425345e-06, "loss": 0.1504, "step": 1154 }, { "epoch": 0.5254777070063694, "grad_norm": 2.4530093356672094, "learning_rate": 4.865201822480349e-06, "loss": 0.1872, "step": 1155 }, { "epoch": 0.5259326660600546, "grad_norm": 1.3735368464159743, "learning_rate": 4.864970229244552e-06, "loss": 0.111, "step": 1156 }, { "epoch": 0.5263876251137397, "grad_norm": 1.824736780190326, "learning_rate": 4.864738442754068e-06, "loss": 0.135, "step": 1157 }, { "epoch": 0.5268425841674249, "grad_norm": 1.9990020682765113, "learning_rate": 4.864506463027837e-06, "loss": 0.1745, "step": 1158 }, { "epoch": 0.5272975432211101, "grad_norm": 1.4799527599994446, "learning_rate": 4.864274290084816e-06, "loss": 0.167, "step": 1159 }, { "epoch": 0.5277525022747953, "grad_norm": 2.4687809077301295, "learning_rate": 4.864041923943978e-06, "loss": 0.1732, "step": 1160 }, { "epoch": 0.5282074613284804, "grad_norm": 2.641693873435684, "learning_rate": 4.863809364624309e-06, "loss": 0.2128, "step": 1161 }, { "epoch": 0.5286624203821656, "grad_norm": 1.5492373372050023, "learning_rate": 4.863576612144814e-06, "loss": 0.1592, "step": 1162 }, { "epoch": 0.5291173794358508, "grad_norm": 2.3572852462486313, "learning_rate": 4.863343666524512e-06, "loss": 0.2061, "step": 1163 }, { "epoch": 0.5295723384895359, "grad_norm": 1.8838845870200471, "learning_rate": 4.863110527782437e-06, "loss": 0.1798, "step": 1164 }, { "epoch": 0.5300272975432211, "grad_norm": 2.304263001470561, "learning_rate": 4.8628771959376435e-06, "loss": 0.1556, "step": 1165 }, { "epoch": 0.5304822565969063, "grad_norm": 1.7674923531547297, "learning_rate": 4.862643671009195e-06, "loss": 0.1333, "step": 1166 }, { "epoch": 0.5309372156505915, "grad_norm": 1.393097189340672, "learning_rate": 4.862409953016175e-06, "loss": 0.155, "step": 1167 }, { "epoch": 0.5313921747042766, "grad_norm": 1.74325807786759, "learning_rate": 4.862176041977683e-06, "loss": 0.1656, "step": 1168 }, { "epoch": 0.5318471337579618, "grad_norm": 1.572029172895186, "learning_rate": 4.861941937912832e-06, "loss": 0.131, "step": 1169 }, { "epoch": 0.532302092811647, "grad_norm": 2.008491262720168, "learning_rate": 4.861707640840752e-06, "loss": 0.1548, "step": 1170 }, { "epoch": 0.5327570518653321, "grad_norm": 1.482082349852649, "learning_rate": 4.861473150780589e-06, "loss": 0.1628, "step": 1171 }, { "epoch": 0.5332120109190173, "grad_norm": 1.6791945913602067, "learning_rate": 4.8612384677515054e-06, "loss": 0.1785, "step": 1172 }, { "epoch": 0.5336669699727025, "grad_norm": 2.20432127668894, "learning_rate": 4.861003591772677e-06, "loss": 0.1716, "step": 1173 }, { "epoch": 0.5341219290263877, "grad_norm": 1.9304068948412856, "learning_rate": 4.860768522863297e-06, "loss": 0.1503, "step": 1174 }, { "epoch": 0.5345768880800728, "grad_norm": 1.5238718585240933, "learning_rate": 4.860533261042574e-06, "loss": 0.1539, "step": 1175 }, { "epoch": 0.535031847133758, "grad_norm": 1.2432245247502896, "learning_rate": 4.8602978063297336e-06, "loss": 0.1721, "step": 1176 }, { "epoch": 0.5354868061874432, "grad_norm": 1.8986627233525826, "learning_rate": 4.8600621587440155e-06, "loss": 0.1717, "step": 1177 }, { "epoch": 0.5359417652411284, "grad_norm": 1.6746020896303164, "learning_rate": 4.859826318304676e-06, "loss": 0.198, "step": 1178 }, { "epoch": 0.5363967242948134, "grad_norm": 1.0811516795291998, "learning_rate": 4.859590285030986e-06, "loss": 0.1441, "step": 1179 }, { "epoch": 0.5368516833484986, "grad_norm": 1.3182569447840091, "learning_rate": 4.859354058942234e-06, "loss": 0.143, "step": 1180 }, { "epoch": 0.5373066424021838, "grad_norm": 1.5442971076277365, "learning_rate": 4.859117640057723e-06, "loss": 0.1708, "step": 1181 }, { "epoch": 0.5377616014558689, "grad_norm": 2.2346125174953744, "learning_rate": 4.858881028396773e-06, "loss": 0.2581, "step": 1182 }, { "epoch": 0.5382165605095541, "grad_norm": 2.16866059231189, "learning_rate": 4.8586442239787165e-06, "loss": 0.1566, "step": 1183 }, { "epoch": 0.5386715195632393, "grad_norm": 2.2940342617095357, "learning_rate": 4.858407226822906e-06, "loss": 0.2362, "step": 1184 }, { "epoch": 0.5391264786169245, "grad_norm": 1.722886466945642, "learning_rate": 4.858170036948707e-06, "loss": 0.1643, "step": 1185 }, { "epoch": 0.5395814376706096, "grad_norm": 1.8036922634291395, "learning_rate": 4.857932654375503e-06, "loss": 0.1399, "step": 1186 }, { "epoch": 0.5400363967242948, "grad_norm": 2.4595201733911995, "learning_rate": 4.857695079122691e-06, "loss": 0.2806, "step": 1187 }, { "epoch": 0.54049135577798, "grad_norm": 1.5611995597597812, "learning_rate": 4.857457311209683e-06, "loss": 0.1436, "step": 1188 }, { "epoch": 0.5409463148316651, "grad_norm": 2.155441619580459, "learning_rate": 4.857219350655911e-06, "loss": 0.1502, "step": 1189 }, { "epoch": 0.5414012738853503, "grad_norm": 1.7590257643884393, "learning_rate": 4.856981197480818e-06, "loss": 0.1832, "step": 1190 }, { "epoch": 0.5418562329390355, "grad_norm": 1.5219476359124613, "learning_rate": 4.856742851703866e-06, "loss": 0.1478, "step": 1191 }, { "epoch": 0.5423111919927207, "grad_norm": 1.9739406001713575, "learning_rate": 4.856504313344531e-06, "loss": 0.2435, "step": 1192 }, { "epoch": 0.5427661510464058, "grad_norm": 2.084318032784521, "learning_rate": 4.8562655824223055e-06, "loss": 0.1409, "step": 1193 }, { "epoch": 0.543221110100091, "grad_norm": 1.1509311969673588, "learning_rate": 4.856026658956697e-06, "loss": 0.1281, "step": 1194 }, { "epoch": 0.5436760691537762, "grad_norm": 1.14005541818581, "learning_rate": 4.8557875429672295e-06, "loss": 0.1438, "step": 1195 }, { "epoch": 0.5441310282074613, "grad_norm": 1.6453379692427774, "learning_rate": 4.855548234473444e-06, "loss": 0.1898, "step": 1196 }, { "epoch": 0.5445859872611465, "grad_norm": 3.715053618797708, "learning_rate": 4.8553087334948935e-06, "loss": 0.1884, "step": 1197 }, { "epoch": 0.5450409463148317, "grad_norm": 1.9604960579417277, "learning_rate": 4.855069040051149e-06, "loss": 0.1668, "step": 1198 }, { "epoch": 0.5454959053685169, "grad_norm": 2.008712099431151, "learning_rate": 4.854829154161799e-06, "loss": 0.2458, "step": 1199 }, { "epoch": 0.545950864422202, "grad_norm": 1.670617885602165, "learning_rate": 4.854589075846445e-06, "loss": 0.195, "step": 1200 }, { "epoch": 0.5464058234758872, "grad_norm": 1.3262735122543114, "learning_rate": 4.854348805124704e-06, "loss": 0.1564, "step": 1201 }, { "epoch": 0.5468607825295724, "grad_norm": 1.9039774091054742, "learning_rate": 4.85410834201621e-06, "loss": 0.1379, "step": 1202 }, { "epoch": 0.5473157415832575, "grad_norm": 2.3929812156260364, "learning_rate": 4.8538676865406155e-06, "loss": 0.2412, "step": 1203 }, { "epoch": 0.5477707006369427, "grad_norm": 1.551384727017807, "learning_rate": 4.853626838717582e-06, "loss": 0.117, "step": 1204 }, { "epoch": 0.5482256596906279, "grad_norm": 5.414582318339853, "learning_rate": 4.853385798566793e-06, "loss": 0.1437, "step": 1205 }, { "epoch": 0.5486806187443131, "grad_norm": 1.6881825100786558, "learning_rate": 4.8531445661079444e-06, "loss": 0.1232, "step": 1206 }, { "epoch": 0.5491355777979982, "grad_norm": 1.6096306897948298, "learning_rate": 4.852903141360749e-06, "loss": 0.161, "step": 1207 }, { "epoch": 0.5495905368516834, "grad_norm": 1.7692527628598336, "learning_rate": 4.852661524344933e-06, "loss": 0.1217, "step": 1208 }, { "epoch": 0.5500454959053686, "grad_norm": 2.162642212991987, "learning_rate": 4.852419715080244e-06, "loss": 0.1986, "step": 1209 }, { "epoch": 0.5505004549590536, "grad_norm": 1.4975052036096086, "learning_rate": 4.852177713586437e-06, "loss": 0.1435, "step": 1210 }, { "epoch": 0.5509554140127388, "grad_norm": 1.5907183445636404, "learning_rate": 4.85193551988329e-06, "loss": 0.1642, "step": 1211 }, { "epoch": 0.551410373066424, "grad_norm": 1.9999573598736464, "learning_rate": 4.851693133990594e-06, "loss": 0.1807, "step": 1212 }, { "epoch": 0.5518653321201092, "grad_norm": 2.294525710441773, "learning_rate": 4.851450555928155e-06, "loss": 0.1624, "step": 1213 }, { "epoch": 0.5523202911737943, "grad_norm": 2.233884971616304, "learning_rate": 4.851207785715797e-06, "loss": 0.2324, "step": 1214 }, { "epoch": 0.5527752502274795, "grad_norm": 2.0057194457772924, "learning_rate": 4.850964823373355e-06, "loss": 0.2105, "step": 1215 }, { "epoch": 0.5532302092811647, "grad_norm": 1.9893088992044121, "learning_rate": 4.850721668920685e-06, "loss": 0.1784, "step": 1216 }, { "epoch": 0.5536851683348498, "grad_norm": 1.811776169286512, "learning_rate": 4.850478322377657e-06, "loss": 0.1716, "step": 1217 }, { "epoch": 0.554140127388535, "grad_norm": 2.4345407872833134, "learning_rate": 4.8502347837641536e-06, "loss": 0.2649, "step": 1218 }, { "epoch": 0.5545950864422202, "grad_norm": 1.4197781095132433, "learning_rate": 4.8499910531000776e-06, "loss": 0.1473, "step": 1219 }, { "epoch": 0.5550500454959054, "grad_norm": 2.7980447769263637, "learning_rate": 4.849747130405346e-06, "loss": 0.2153, "step": 1220 }, { "epoch": 0.5555050045495905, "grad_norm": 1.6352047446815658, "learning_rate": 4.849503015699889e-06, "loss": 0.1485, "step": 1221 }, { "epoch": 0.5559599636032757, "grad_norm": 2.1831084601819066, "learning_rate": 4.849258709003657e-06, "loss": 0.1818, "step": 1222 }, { "epoch": 0.5564149226569609, "grad_norm": 1.541290763289794, "learning_rate": 4.849014210336612e-06, "loss": 0.1947, "step": 1223 }, { "epoch": 0.556869881710646, "grad_norm": 2.2775888091930723, "learning_rate": 4.848769519718734e-06, "loss": 0.2152, "step": 1224 }, { "epoch": 0.5573248407643312, "grad_norm": 2.473887631559974, "learning_rate": 4.848524637170018e-06, "loss": 0.1588, "step": 1225 }, { "epoch": 0.5577797998180164, "grad_norm": 1.7255823206927379, "learning_rate": 4.848279562710474e-06, "loss": 0.2174, "step": 1226 }, { "epoch": 0.5582347588717016, "grad_norm": 1.8250707498997563, "learning_rate": 4.848034296360129e-06, "loss": 0.1404, "step": 1227 }, { "epoch": 0.5586897179253867, "grad_norm": 1.3973858443687242, "learning_rate": 4.847788838139025e-06, "loss": 0.1598, "step": 1228 }, { "epoch": 0.5591446769790719, "grad_norm": 1.6880241463364833, "learning_rate": 4.847543188067219e-06, "loss": 0.1361, "step": 1229 }, { "epoch": 0.5595996360327571, "grad_norm": 1.6583472347876314, "learning_rate": 4.847297346164786e-06, "loss": 0.1681, "step": 1230 }, { "epoch": 0.5600545950864422, "grad_norm": 1.5526904315702266, "learning_rate": 4.8470513124518134e-06, "loss": 0.1704, "step": 1231 }, { "epoch": 0.5605095541401274, "grad_norm": 2.9080178304839333, "learning_rate": 4.8468050869484075e-06, "loss": 0.2189, "step": 1232 }, { "epoch": 0.5609645131938126, "grad_norm": 2.272625265359496, "learning_rate": 4.846558669674688e-06, "loss": 0.1796, "step": 1233 }, { "epoch": 0.5614194722474978, "grad_norm": 2.1487306294232997, "learning_rate": 4.8463120606507904e-06, "loss": 0.1853, "step": 1234 }, { "epoch": 0.5618744313011829, "grad_norm": 2.013831962718606, "learning_rate": 4.846065259896867e-06, "loss": 0.1844, "step": 1235 }, { "epoch": 0.5623293903548681, "grad_norm": 1.8287089471640992, "learning_rate": 4.845818267433086e-06, "loss": 0.1784, "step": 1236 }, { "epoch": 0.5627843494085533, "grad_norm": 1.800058629818333, "learning_rate": 4.845571083279629e-06, "loss": 0.1552, "step": 1237 }, { "epoch": 0.5632393084622384, "grad_norm": 1.2446217689129786, "learning_rate": 4.845323707456696e-06, "loss": 0.1685, "step": 1238 }, { "epoch": 0.5636942675159236, "grad_norm": 2.6424245053307787, "learning_rate": 4.845076139984502e-06, "loss": 0.2754, "step": 1239 }, { "epoch": 0.5641492265696088, "grad_norm": 1.9189782085118383, "learning_rate": 4.844828380883274e-06, "loss": 0.1686, "step": 1240 }, { "epoch": 0.564604185623294, "grad_norm": 1.2992527617302185, "learning_rate": 4.844580430173261e-06, "loss": 0.1576, "step": 1241 }, { "epoch": 0.565059144676979, "grad_norm": 1.771767593474412, "learning_rate": 4.8443322878747236e-06, "loss": 0.1201, "step": 1242 }, { "epoch": 0.5655141037306642, "grad_norm": 1.3113844210494432, "learning_rate": 4.844083954007938e-06, "loss": 0.1933, "step": 1243 }, { "epoch": 0.5659690627843494, "grad_norm": 1.780274550683715, "learning_rate": 4.843835428593198e-06, "loss": 0.2449, "step": 1244 }, { "epoch": 0.5664240218380345, "grad_norm": 2.0286348942605734, "learning_rate": 4.84358671165081e-06, "loss": 0.2206, "step": 1245 }, { "epoch": 0.5668789808917197, "grad_norm": 1.9183674174882497, "learning_rate": 4.843337803201102e-06, "loss": 0.1932, "step": 1246 }, { "epoch": 0.5673339399454049, "grad_norm": 1.8589987750417598, "learning_rate": 4.8430887032644094e-06, "loss": 0.2063, "step": 1247 }, { "epoch": 0.5677888989990901, "grad_norm": 1.8997293354336255, "learning_rate": 4.842839411861089e-06, "loss": 0.15, "step": 1248 }, { "epoch": 0.5682438580527752, "grad_norm": 1.5956283554174595, "learning_rate": 4.842589929011513e-06, "loss": 0.1249, "step": 1249 }, { "epoch": 0.5686988171064604, "grad_norm": 1.7264729567079007, "learning_rate": 4.8423402547360665e-06, "loss": 0.1731, "step": 1250 }, { "epoch": 0.5691537761601456, "grad_norm": 1.9220135807111425, "learning_rate": 4.842090389055153e-06, "loss": 0.1143, "step": 1251 }, { "epoch": 0.5696087352138307, "grad_norm": 1.7921638992770812, "learning_rate": 4.841840331989189e-06, "loss": 0.1976, "step": 1252 }, { "epoch": 0.5700636942675159, "grad_norm": 2.000993623816501, "learning_rate": 4.841590083558608e-06, "loss": 0.1768, "step": 1253 }, { "epoch": 0.5705186533212011, "grad_norm": 2.4830094815396304, "learning_rate": 4.841339643783861e-06, "loss": 0.2043, "step": 1254 }, { "epoch": 0.5709736123748863, "grad_norm": 1.5989796561168585, "learning_rate": 4.841089012685412e-06, "loss": 0.1778, "step": 1255 }, { "epoch": 0.5714285714285714, "grad_norm": 1.8137268898691017, "learning_rate": 4.840838190283741e-06, "loss": 0.1692, "step": 1256 }, { "epoch": 0.5718835304822566, "grad_norm": 1.7559519711217326, "learning_rate": 4.8405871765993435e-06, "loss": 0.0939, "step": 1257 }, { "epoch": 0.5723384895359418, "grad_norm": 1.7192722836354088, "learning_rate": 4.840335971652732e-06, "loss": 0.1255, "step": 1258 }, { "epoch": 0.5727934485896269, "grad_norm": 1.9835814338763256, "learning_rate": 4.840084575464434e-06, "loss": 0.1945, "step": 1259 }, { "epoch": 0.5732484076433121, "grad_norm": 1.8517843659588205, "learning_rate": 4.839832988054992e-06, "loss": 0.187, "step": 1260 }, { "epoch": 0.5737033666969973, "grad_norm": 1.8951856802928044, "learning_rate": 4.839581209444966e-06, "loss": 0.1196, "step": 1261 }, { "epoch": 0.5741583257506825, "grad_norm": 2.3401876182004386, "learning_rate": 4.839329239654927e-06, "loss": 0.2252, "step": 1262 }, { "epoch": 0.5746132848043676, "grad_norm": 2.1924333176646145, "learning_rate": 4.839077078705468e-06, "loss": 0.137, "step": 1263 }, { "epoch": 0.5750682438580528, "grad_norm": 1.6673068426763284, "learning_rate": 4.838824726617194e-06, "loss": 0.157, "step": 1264 }, { "epoch": 0.575523202911738, "grad_norm": 1.7250800520215972, "learning_rate": 4.838572183410725e-06, "loss": 0.1808, "step": 1265 }, { "epoch": 0.5759781619654231, "grad_norm": 1.6457142786345031, "learning_rate": 4.838319449106697e-06, "loss": 0.1635, "step": 1266 }, { "epoch": 0.5764331210191083, "grad_norm": 1.5575525689618337, "learning_rate": 4.838066523725764e-06, "loss": 0.1127, "step": 1267 }, { "epoch": 0.5768880800727935, "grad_norm": 2.5767156490698833, "learning_rate": 4.837813407288594e-06, "loss": 0.1798, "step": 1268 }, { "epoch": 0.5773430391264787, "grad_norm": 1.9108956938528818, "learning_rate": 4.837560099815869e-06, "loss": 0.202, "step": 1269 }, { "epoch": 0.5777979981801638, "grad_norm": 1.956778308687979, "learning_rate": 4.837306601328289e-06, "loss": 0.1806, "step": 1270 }, { "epoch": 0.578252957233849, "grad_norm": 1.775478489276246, "learning_rate": 4.837052911846569e-06, "loss": 0.1695, "step": 1271 }, { "epoch": 0.5787079162875342, "grad_norm": 1.787242091669647, "learning_rate": 4.836799031391439e-06, "loss": 0.1745, "step": 1272 }, { "epoch": 0.5791628753412192, "grad_norm": 1.0591727928255608, "learning_rate": 4.836544959983645e-06, "loss": 0.1343, "step": 1273 }, { "epoch": 0.5796178343949044, "grad_norm": 1.5740206900027498, "learning_rate": 4.8362906976439485e-06, "loss": 0.1635, "step": 1274 }, { "epoch": 0.5800727934485896, "grad_norm": 1.5937545527814416, "learning_rate": 4.836036244393127e-06, "loss": 0.1581, "step": 1275 }, { "epoch": 0.5805277525022748, "grad_norm": 1.813708807716678, "learning_rate": 4.835781600251973e-06, "loss": 0.2269, "step": 1276 }, { "epoch": 0.5809827115559599, "grad_norm": 2.0796570235313836, "learning_rate": 4.835526765241295e-06, "loss": 0.1924, "step": 1277 }, { "epoch": 0.5814376706096451, "grad_norm": 1.6083810261665097, "learning_rate": 4.835271739381917e-06, "loss": 0.1541, "step": 1278 }, { "epoch": 0.5818926296633303, "grad_norm": 1.365537997124497, "learning_rate": 4.835016522694678e-06, "loss": 0.136, "step": 1279 }, { "epoch": 0.5823475887170154, "grad_norm": 1.8893838729814614, "learning_rate": 4.834761115200434e-06, "loss": 0.2207, "step": 1280 }, { "epoch": 0.5828025477707006, "grad_norm": 1.4870021241117473, "learning_rate": 4.834505516920055e-06, "loss": 0.1879, "step": 1281 }, { "epoch": 0.5832575068243858, "grad_norm": 1.4165326048713465, "learning_rate": 4.834249727874428e-06, "loss": 0.1263, "step": 1282 }, { "epoch": 0.583712465878071, "grad_norm": 1.8197657860371343, "learning_rate": 4.833993748084455e-06, "loss": 0.1727, "step": 1283 }, { "epoch": 0.5841674249317561, "grad_norm": 1.715508493394312, "learning_rate": 4.833737577571052e-06, "loss": 0.1497, "step": 1284 }, { "epoch": 0.5846223839854413, "grad_norm": 2.0061239985491555, "learning_rate": 4.833481216355153e-06, "loss": 0.1646, "step": 1285 }, { "epoch": 0.5850773430391265, "grad_norm": 4.355130184989222, "learning_rate": 4.833224664457709e-06, "loss": 0.2076, "step": 1286 }, { "epoch": 0.5855323020928116, "grad_norm": 3.155573393148588, "learning_rate": 4.83296792189968e-06, "loss": 0.2413, "step": 1287 }, { "epoch": 0.5859872611464968, "grad_norm": 1.656224319251134, "learning_rate": 4.83271098870205e-06, "loss": 0.1237, "step": 1288 }, { "epoch": 0.586442220200182, "grad_norm": 1.5671815338330013, "learning_rate": 4.832453864885811e-06, "loss": 0.1461, "step": 1289 }, { "epoch": 0.5868971792538672, "grad_norm": 1.4490558461440097, "learning_rate": 4.832196550471976e-06, "loss": 0.1719, "step": 1290 }, { "epoch": 0.5873521383075523, "grad_norm": 1.4391467760040138, "learning_rate": 4.831939045481571e-06, "loss": 0.1598, "step": 1291 }, { "epoch": 0.5878070973612375, "grad_norm": 1.853086020668375, "learning_rate": 4.8316813499356375e-06, "loss": 0.1654, "step": 1292 }, { "epoch": 0.5882620564149227, "grad_norm": 1.6999807809193854, "learning_rate": 4.831423463855235e-06, "loss": 0.1516, "step": 1293 }, { "epoch": 0.5887170154686078, "grad_norm": 2.070573438132845, "learning_rate": 4.8311653872614345e-06, "loss": 0.1161, "step": 1294 }, { "epoch": 0.589171974522293, "grad_norm": 1.6686744603097172, "learning_rate": 4.830907120175327e-06, "loss": 0.1584, "step": 1295 }, { "epoch": 0.5896269335759782, "grad_norm": 2.089342697132724, "learning_rate": 4.830648662618015e-06, "loss": 0.2365, "step": 1296 }, { "epoch": 0.5900818926296634, "grad_norm": 1.5894012047277333, "learning_rate": 4.83039001461062e-06, "loss": 0.1097, "step": 1297 }, { "epoch": 0.5905368516833485, "grad_norm": 1.8782696857030252, "learning_rate": 4.830131176174276e-06, "loss": 0.151, "step": 1298 }, { "epoch": 0.5909918107370337, "grad_norm": 1.958971362169023, "learning_rate": 4.829872147330136e-06, "loss": 0.1841, "step": 1299 }, { "epoch": 0.5914467697907189, "grad_norm": 1.3360501731813752, "learning_rate": 4.829612928099366e-06, "loss": 0.1457, "step": 1300 }, { "epoch": 0.591901728844404, "grad_norm": 1.638219511935524, "learning_rate": 4.829353518503147e-06, "loss": 0.1583, "step": 1301 }, { "epoch": 0.5923566878980892, "grad_norm": 2.0096056545692025, "learning_rate": 4.829093918562678e-06, "loss": 0.1491, "step": 1302 }, { "epoch": 0.5928116469517744, "grad_norm": 1.7893380227892468, "learning_rate": 4.828834128299173e-06, "loss": 0.1592, "step": 1303 }, { "epoch": 0.5932666060054596, "grad_norm": 1.5760903095424181, "learning_rate": 4.828574147733859e-06, "loss": 0.1646, "step": 1304 }, { "epoch": 0.5937215650591446, "grad_norm": 1.6385972545017617, "learning_rate": 4.828313976887982e-06, "loss": 0.1228, "step": 1305 }, { "epoch": 0.5941765241128298, "grad_norm": 1.7350084151113443, "learning_rate": 4.8280536157828e-06, "loss": 0.1532, "step": 1306 }, { "epoch": 0.594631483166515, "grad_norm": 2.1711615974874223, "learning_rate": 4.827793064439592e-06, "loss": 0.1551, "step": 1307 }, { "epoch": 0.5950864422202001, "grad_norm": 2.5688116012952125, "learning_rate": 4.8275323228796455e-06, "loss": 0.18, "step": 1308 }, { "epoch": 0.5955414012738853, "grad_norm": 1.534845536955317, "learning_rate": 4.8272713911242695e-06, "loss": 0.121, "step": 1309 }, { "epoch": 0.5959963603275705, "grad_norm": 1.9028349069881882, "learning_rate": 4.827010269194785e-06, "loss": 0.1228, "step": 1310 }, { "epoch": 0.5964513193812557, "grad_norm": 2.1051164199599, "learning_rate": 4.8267489571125295e-06, "loss": 0.1465, "step": 1311 }, { "epoch": 0.5969062784349408, "grad_norm": 2.9999435749849073, "learning_rate": 4.826487454898857e-06, "loss": 0.2635, "step": 1312 }, { "epoch": 0.597361237488626, "grad_norm": 1.880715290875366, "learning_rate": 4.826225762575136e-06, "loss": 0.194, "step": 1313 }, { "epoch": 0.5978161965423112, "grad_norm": 1.6843651365954362, "learning_rate": 4.825963880162752e-06, "loss": 0.1792, "step": 1314 }, { "epoch": 0.5982711555959963, "grad_norm": 1.606704753365435, "learning_rate": 4.825701807683102e-06, "loss": 0.1399, "step": 1315 }, { "epoch": 0.5987261146496815, "grad_norm": 1.8783582719750365, "learning_rate": 4.825439545157603e-06, "loss": 0.1743, "step": 1316 }, { "epoch": 0.5991810737033667, "grad_norm": 2.39834669557369, "learning_rate": 4.825177092607687e-06, "loss": 0.2576, "step": 1317 }, { "epoch": 0.5996360327570519, "grad_norm": 1.5809346444064956, "learning_rate": 4.8249144500547995e-06, "loss": 0.1266, "step": 1318 }, { "epoch": 0.600090991810737, "grad_norm": 1.6731917139944308, "learning_rate": 4.824651617520402e-06, "loss": 0.1722, "step": 1319 }, { "epoch": 0.6005459508644222, "grad_norm": 1.9934684665371283, "learning_rate": 4.824388595025972e-06, "loss": 0.1863, "step": 1320 }, { "epoch": 0.6010009099181074, "grad_norm": 1.7396149145777957, "learning_rate": 4.824125382593003e-06, "loss": 0.1582, "step": 1321 }, { "epoch": 0.6014558689717925, "grad_norm": 1.7746494679795604, "learning_rate": 4.823861980243003e-06, "loss": 0.1485, "step": 1322 }, { "epoch": 0.6019108280254777, "grad_norm": 1.8309083669399964, "learning_rate": 4.823598387997497e-06, "loss": 0.1495, "step": 1323 }, { "epoch": 0.6023657870791629, "grad_norm": 1.9534496331991582, "learning_rate": 4.823334605878024e-06, "loss": 0.1462, "step": 1324 }, { "epoch": 0.6028207461328481, "grad_norm": 2.1011605763315138, "learning_rate": 4.82307063390614e-06, "loss": 0.1853, "step": 1325 }, { "epoch": 0.6032757051865332, "grad_norm": 2.5503968401256465, "learning_rate": 4.822806472103413e-06, "loss": 0.2297, "step": 1326 }, { "epoch": 0.6037306642402184, "grad_norm": 1.4853028085158964, "learning_rate": 4.822542120491431e-06, "loss": 0.1692, "step": 1327 }, { "epoch": 0.6041856232939036, "grad_norm": 1.5826380640650177, "learning_rate": 4.822277579091796e-06, "loss": 0.1845, "step": 1328 }, { "epoch": 0.6046405823475887, "grad_norm": 1.7941875470339128, "learning_rate": 4.822012847926125e-06, "loss": 0.1723, "step": 1329 }, { "epoch": 0.6050955414012739, "grad_norm": 1.6317178871077942, "learning_rate": 4.821747927016049e-06, "loss": 0.1309, "step": 1330 }, { "epoch": 0.6055505004549591, "grad_norm": 1.5814757694833934, "learning_rate": 4.821482816383219e-06, "loss": 0.1565, "step": 1331 }, { "epoch": 0.6060054595086443, "grad_norm": 1.5304957435111453, "learning_rate": 4.821217516049296e-06, "loss": 0.1373, "step": 1332 }, { "epoch": 0.6064604185623294, "grad_norm": 1.5147254102931988, "learning_rate": 4.82095202603596e-06, "loss": 0.1431, "step": 1333 }, { "epoch": 0.6069153776160146, "grad_norm": 1.5663436015338144, "learning_rate": 4.820686346364906e-06, "loss": 0.156, "step": 1334 }, { "epoch": 0.6073703366696998, "grad_norm": 2.053796214560493, "learning_rate": 4.820420477057843e-06, "loss": 0.1874, "step": 1335 }, { "epoch": 0.607825295723385, "grad_norm": 1.95739593906374, "learning_rate": 4.820154418136498e-06, "loss": 0.1526, "step": 1336 }, { "epoch": 0.60828025477707, "grad_norm": 1.8483495445052411, "learning_rate": 4.819888169622612e-06, "loss": 0.2036, "step": 1337 }, { "epoch": 0.6087352138307552, "grad_norm": 1.9503495611822523, "learning_rate": 4.819621731537942e-06, "loss": 0.2066, "step": 1338 }, { "epoch": 0.6091901728844404, "grad_norm": 1.5722530391175293, "learning_rate": 4.819355103904259e-06, "loss": 0.1419, "step": 1339 }, { "epoch": 0.6096451319381255, "grad_norm": 1.8367765104613556, "learning_rate": 4.81908828674335e-06, "loss": 0.1775, "step": 1340 }, { "epoch": 0.6101000909918107, "grad_norm": 1.6359632675531957, "learning_rate": 4.81882128007702e-06, "loss": 0.1361, "step": 1341 }, { "epoch": 0.6105550500454959, "grad_norm": 1.6249604007945537, "learning_rate": 4.818554083927086e-06, "loss": 0.1501, "step": 1342 }, { "epoch": 0.6110100090991811, "grad_norm": 1.507444149214357, "learning_rate": 4.818286698315383e-06, "loss": 0.1318, "step": 1343 }, { "epoch": 0.6114649681528662, "grad_norm": 1.714948580415853, "learning_rate": 4.818019123263761e-06, "loss": 0.1576, "step": 1344 }, { "epoch": 0.6119199272065514, "grad_norm": 1.6310779918465994, "learning_rate": 4.817751358794084e-06, "loss": 0.1505, "step": 1345 }, { "epoch": 0.6123748862602366, "grad_norm": 1.9516095925204497, "learning_rate": 4.8174834049282325e-06, "loss": 0.1513, "step": 1346 }, { "epoch": 0.6128298453139217, "grad_norm": 1.6535718997078614, "learning_rate": 4.817215261688104e-06, "loss": 0.1509, "step": 1347 }, { "epoch": 0.6132848043676069, "grad_norm": 1.7050249250163263, "learning_rate": 4.816946929095607e-06, "loss": 0.143, "step": 1348 }, { "epoch": 0.6137397634212921, "grad_norm": 1.9555072177299098, "learning_rate": 4.816678407172671e-06, "loss": 0.1702, "step": 1349 }, { "epoch": 0.6141947224749773, "grad_norm": 1.6603270300616475, "learning_rate": 4.816409695941238e-06, "loss": 0.1525, "step": 1350 }, { "epoch": 0.6146496815286624, "grad_norm": 2.052319098264881, "learning_rate": 4.816140795423265e-06, "loss": 0.1553, "step": 1351 }, { "epoch": 0.6151046405823476, "grad_norm": 2.316846556963137, "learning_rate": 4.8158717056407255e-06, "loss": 0.2204, "step": 1352 }, { "epoch": 0.6155595996360328, "grad_norm": 2.268897705180763, "learning_rate": 4.815602426615609e-06, "loss": 0.172, "step": 1353 }, { "epoch": 0.6160145586897179, "grad_norm": 2.277033042904883, "learning_rate": 4.815332958369919e-06, "loss": 0.1952, "step": 1354 }, { "epoch": 0.6164695177434031, "grad_norm": 2.203261308039804, "learning_rate": 4.815063300925677e-06, "loss": 0.1778, "step": 1355 }, { "epoch": 0.6169244767970883, "grad_norm": 1.5542993423497844, "learning_rate": 4.814793454304915e-06, "loss": 0.1831, "step": 1356 }, { "epoch": 0.6173794358507735, "grad_norm": 1.3687836885728237, "learning_rate": 4.814523418529686e-06, "loss": 0.1438, "step": 1357 }, { "epoch": 0.6178343949044586, "grad_norm": 1.803336916930759, "learning_rate": 4.814253193622056e-06, "loss": 0.1426, "step": 1358 }, { "epoch": 0.6182893539581438, "grad_norm": 1.521636702652137, "learning_rate": 4.813982779604106e-06, "loss": 0.1214, "step": 1359 }, { "epoch": 0.618744313011829, "grad_norm": 1.5404670484043497, "learning_rate": 4.813712176497933e-06, "loss": 0.1366, "step": 1360 }, { "epoch": 0.6191992720655141, "grad_norm": 1.689965450022471, "learning_rate": 4.813441384325649e-06, "loss": 0.1346, "step": 1361 }, { "epoch": 0.6196542311191993, "grad_norm": 1.7814370136900919, "learning_rate": 4.813170403109383e-06, "loss": 0.1444, "step": 1362 }, { "epoch": 0.6201091901728845, "grad_norm": 1.72215098605925, "learning_rate": 4.8128992328712774e-06, "loss": 0.1127, "step": 1363 }, { "epoch": 0.6205641492265697, "grad_norm": 1.504745997390183, "learning_rate": 4.812627873633492e-06, "loss": 0.149, "step": 1364 }, { "epoch": 0.6210191082802548, "grad_norm": 1.6905090686600799, "learning_rate": 4.8123563254182e-06, "loss": 0.1457, "step": 1365 }, { "epoch": 0.62147406733394, "grad_norm": 2.1753494024731683, "learning_rate": 4.8120845882475924e-06, "loss": 0.1995, "step": 1366 }, { "epoch": 0.6219290263876252, "grad_norm": 2.793385404562888, "learning_rate": 4.8118126621438734e-06, "loss": 0.2318, "step": 1367 }, { "epoch": 0.6223839854413102, "grad_norm": 1.7667899225260022, "learning_rate": 4.811540547129263e-06, "loss": 0.2251, "step": 1368 }, { "epoch": 0.6228389444949954, "grad_norm": 2.1679573967859787, "learning_rate": 4.811268243225999e-06, "loss": 0.1784, "step": 1369 }, { "epoch": 0.6232939035486806, "grad_norm": 2.4497161330069424, "learning_rate": 4.810995750456331e-06, "loss": 0.1795, "step": 1370 }, { "epoch": 0.6237488626023658, "grad_norm": 1.9632525184445888, "learning_rate": 4.810723068842526e-06, "loss": 0.1757, "step": 1371 }, { "epoch": 0.6242038216560509, "grad_norm": 1.78757490589868, "learning_rate": 4.810450198406867e-06, "loss": 0.1994, "step": 1372 }, { "epoch": 0.6246587807097361, "grad_norm": 2.5975000715086907, "learning_rate": 4.810177139171653e-06, "loss": 0.2177, "step": 1373 }, { "epoch": 0.6251137397634213, "grad_norm": 1.8861807982376269, "learning_rate": 4.809903891159195e-06, "loss": 0.1318, "step": 1374 }, { "epoch": 0.6255686988171064, "grad_norm": 2.474014583254649, "learning_rate": 4.809630454391822e-06, "loss": 0.165, "step": 1375 }, { "epoch": 0.6260236578707916, "grad_norm": 1.8420180765220768, "learning_rate": 4.80935682889188e-06, "loss": 0.1997, "step": 1376 }, { "epoch": 0.6264786169244768, "grad_norm": 1.8871529282732857, "learning_rate": 4.809083014681726e-06, "loss": 0.239, "step": 1377 }, { "epoch": 0.626933575978162, "grad_norm": 1.776688876661572, "learning_rate": 4.808809011783735e-06, "loss": 0.1876, "step": 1378 }, { "epoch": 0.6273885350318471, "grad_norm": 1.806661163792066, "learning_rate": 4.808534820220299e-06, "loss": 0.148, "step": 1379 }, { "epoch": 0.6278434940855323, "grad_norm": 2.0820743721382007, "learning_rate": 4.8082604400138226e-06, "loss": 0.2015, "step": 1380 }, { "epoch": 0.6282984531392175, "grad_norm": 1.5614420996583043, "learning_rate": 4.807985871186726e-06, "loss": 0.1277, "step": 1381 }, { "epoch": 0.6287534121929026, "grad_norm": 1.5694923603817514, "learning_rate": 4.8077111137614484e-06, "loss": 0.1345, "step": 1382 }, { "epoch": 0.6292083712465878, "grad_norm": 2.2610976098352116, "learning_rate": 4.8074361677604394e-06, "loss": 0.1732, "step": 1383 }, { "epoch": 0.629663330300273, "grad_norm": 2.0760282221755704, "learning_rate": 4.807161033206168e-06, "loss": 0.1936, "step": 1384 }, { "epoch": 0.6301182893539582, "grad_norm": 1.805894786082926, "learning_rate": 4.806885710121114e-06, "loss": 0.1536, "step": 1385 }, { "epoch": 0.6305732484076433, "grad_norm": 1.9667669224198192, "learning_rate": 4.806610198527779e-06, "loss": 0.1729, "step": 1386 }, { "epoch": 0.6310282074613285, "grad_norm": 1.9797354051419906, "learning_rate": 4.8063344984486755e-06, "loss": 0.2071, "step": 1387 }, { "epoch": 0.6314831665150137, "grad_norm": 2.5955252782084224, "learning_rate": 4.806058609906331e-06, "loss": 0.181, "step": 1388 }, { "epoch": 0.6319381255686988, "grad_norm": 2.707367730234045, "learning_rate": 4.805782532923292e-06, "loss": 0.2299, "step": 1389 }, { "epoch": 0.632393084622384, "grad_norm": 1.579292631208614, "learning_rate": 4.805506267522116e-06, "loss": 0.2235, "step": 1390 }, { "epoch": 0.6328480436760692, "grad_norm": 3.332380342069127, "learning_rate": 4.80522981372538e-06, "loss": 0.2485, "step": 1391 }, { "epoch": 0.6333030027297544, "grad_norm": 1.597247684736274, "learning_rate": 4.804953171555674e-06, "loss": 0.1511, "step": 1392 }, { "epoch": 0.6337579617834395, "grad_norm": 2.050037449702685, "learning_rate": 4.8046763410356046e-06, "loss": 0.1732, "step": 1393 }, { "epoch": 0.6342129208371247, "grad_norm": 1.6703199484658815, "learning_rate": 4.804399322187791e-06, "loss": 0.1832, "step": 1394 }, { "epoch": 0.6346678798908099, "grad_norm": 2.4171080690553155, "learning_rate": 4.8041221150348725e-06, "loss": 0.2519, "step": 1395 }, { "epoch": 0.635122838944495, "grad_norm": 1.7415236452607812, "learning_rate": 4.8038447195995e-06, "loss": 0.1942, "step": 1396 }, { "epoch": 0.6355777979981801, "grad_norm": 2.0585293521798, "learning_rate": 4.80356713590434e-06, "loss": 0.1806, "step": 1397 }, { "epoch": 0.6360327570518653, "grad_norm": 1.6543360161164664, "learning_rate": 4.803289363972078e-06, "loss": 0.1953, "step": 1398 }, { "epoch": 0.6364877161055505, "grad_norm": 1.952726003661859, "learning_rate": 4.8030114038254094e-06, "loss": 0.164, "step": 1399 }, { "epoch": 0.6369426751592356, "grad_norm": 1.6177022530921434, "learning_rate": 4.80273325548705e-06, "loss": 0.1798, "step": 1400 }, { "epoch": 0.6373976342129208, "grad_norm": 1.9292090840839082, "learning_rate": 4.802454918979728e-06, "loss": 0.1652, "step": 1401 }, { "epoch": 0.637852593266606, "grad_norm": 1.9210595574243916, "learning_rate": 4.802176394326187e-06, "loss": 0.2007, "step": 1402 }, { "epoch": 0.6383075523202911, "grad_norm": 1.464054312422107, "learning_rate": 4.801897681549188e-06, "loss": 0.129, "step": 1403 }, { "epoch": 0.6387625113739763, "grad_norm": 1.9150864430756966, "learning_rate": 4.801618780671506e-06, "loss": 0.1634, "step": 1404 }, { "epoch": 0.6392174704276615, "grad_norm": 1.4873483060535149, "learning_rate": 4.801339691715932e-06, "loss": 0.1463, "step": 1405 }, { "epoch": 0.6396724294813467, "grad_norm": 2.3690804594133623, "learning_rate": 4.8010604147052695e-06, "loss": 0.1606, "step": 1406 }, { "epoch": 0.6401273885350318, "grad_norm": 2.3100068394442497, "learning_rate": 4.800780949662343e-06, "loss": 0.1904, "step": 1407 }, { "epoch": 0.640582347588717, "grad_norm": 1.5363867596702172, "learning_rate": 4.800501296609986e-06, "loss": 0.1053, "step": 1408 }, { "epoch": 0.6410373066424022, "grad_norm": 1.606538550331431, "learning_rate": 4.800221455571053e-06, "loss": 0.1397, "step": 1409 }, { "epoch": 0.6414922656960873, "grad_norm": 1.611596105149799, "learning_rate": 4.7999414265684105e-06, "loss": 0.1303, "step": 1410 }, { "epoch": 0.6419472247497725, "grad_norm": 1.6262064168900117, "learning_rate": 4.79966120962494e-06, "loss": 0.1564, "step": 1411 }, { "epoch": 0.6424021838034577, "grad_norm": 2.015359106142208, "learning_rate": 4.799380804763542e-06, "loss": 0.1619, "step": 1412 }, { "epoch": 0.6428571428571429, "grad_norm": 2.0480276409863465, "learning_rate": 4.799100212007128e-06, "loss": 0.1711, "step": 1413 }, { "epoch": 0.643312101910828, "grad_norm": 1.9220142745677993, "learning_rate": 4.7988194313786275e-06, "loss": 0.1496, "step": 1414 }, { "epoch": 0.6437670609645132, "grad_norm": 1.5592119110073082, "learning_rate": 4.798538462900984e-06, "loss": 0.1563, "step": 1415 }, { "epoch": 0.6442220200181984, "grad_norm": 2.7928579618942764, "learning_rate": 4.798257306597157e-06, "loss": 0.2031, "step": 1416 }, { "epoch": 0.6446769790718835, "grad_norm": 1.579272373938799, "learning_rate": 4.797975962490122e-06, "loss": 0.1501, "step": 1417 }, { "epoch": 0.6451319381255687, "grad_norm": 1.5556034741269746, "learning_rate": 4.797694430602869e-06, "loss": 0.1125, "step": 1418 }, { "epoch": 0.6455868971792539, "grad_norm": 2.4067503053827273, "learning_rate": 4.797412710958405e-06, "loss": 0.2154, "step": 1419 }, { "epoch": 0.6460418562329391, "grad_norm": 2.143935212981359, "learning_rate": 4.797130803579747e-06, "loss": 0.1694, "step": 1420 }, { "epoch": 0.6464968152866242, "grad_norm": 2.6240019391696667, "learning_rate": 4.796848708489935e-06, "loss": 0.2811, "step": 1421 }, { "epoch": 0.6469517743403094, "grad_norm": 1.5174877651602559, "learning_rate": 4.796566425712018e-06, "loss": 0.1435, "step": 1422 }, { "epoch": 0.6474067333939946, "grad_norm": 1.6834754436981423, "learning_rate": 4.796283955269065e-06, "loss": 0.1816, "step": 1423 }, { "epoch": 0.6478616924476797, "grad_norm": 1.5804322468618368, "learning_rate": 4.796001297184156e-06, "loss": 0.1471, "step": 1424 }, { "epoch": 0.6483166515013649, "grad_norm": 1.8327883828431184, "learning_rate": 4.79571845148039e-06, "loss": 0.2011, "step": 1425 }, { "epoch": 0.6487716105550501, "grad_norm": 1.4039853389905468, "learning_rate": 4.795435418180879e-06, "loss": 0.1074, "step": 1426 }, { "epoch": 0.6492265696087353, "grad_norm": 1.664983557085843, "learning_rate": 4.795152197308753e-06, "loss": 0.148, "step": 1427 }, { "epoch": 0.6496815286624203, "grad_norm": 1.6844695222093484, "learning_rate": 4.794868788887154e-06, "loss": 0.1207, "step": 1428 }, { "epoch": 0.6501364877161055, "grad_norm": 1.3430612047901953, "learning_rate": 4.79458519293924e-06, "loss": 0.1437, "step": 1429 }, { "epoch": 0.6505914467697907, "grad_norm": 1.6637985127807216, "learning_rate": 4.794301409488187e-06, "loss": 0.1478, "step": 1430 }, { "epoch": 0.6510464058234758, "grad_norm": 1.385729637043462, "learning_rate": 4.7940174385571835e-06, "loss": 0.1627, "step": 1431 }, { "epoch": 0.651501364877161, "grad_norm": 2.0471057598981632, "learning_rate": 4.793733280169435e-06, "loss": 0.2172, "step": 1432 }, { "epoch": 0.6519563239308462, "grad_norm": 2.804939948704313, "learning_rate": 4.7934489343481614e-06, "loss": 0.2366, "step": 1433 }, { "epoch": 0.6524112829845314, "grad_norm": 2.1472377723290568, "learning_rate": 4.7931644011165975e-06, "loss": 0.1418, "step": 1434 }, { "epoch": 0.6528662420382165, "grad_norm": 1.9918480481257164, "learning_rate": 4.792879680497995e-06, "loss": 0.186, "step": 1435 }, { "epoch": 0.6533212010919017, "grad_norm": 2.5064644756915655, "learning_rate": 4.79259477251562e-06, "loss": 0.2048, "step": 1436 }, { "epoch": 0.6537761601455869, "grad_norm": 2.3512727211776263, "learning_rate": 4.792309677192753e-06, "loss": 0.2052, "step": 1437 }, { "epoch": 0.654231119199272, "grad_norm": 1.9202855097301381, "learning_rate": 4.79202439455269e-06, "loss": 0.1458, "step": 1438 }, { "epoch": 0.6546860782529572, "grad_norm": 1.4271813740118833, "learning_rate": 4.791738924618745e-06, "loss": 0.1211, "step": 1439 }, { "epoch": 0.6551410373066424, "grad_norm": 2.032712581115854, "learning_rate": 4.791453267414245e-06, "loss": 0.1836, "step": 1440 }, { "epoch": 0.6555959963603276, "grad_norm": 1.858326597247768, "learning_rate": 4.7911674229625316e-06, "loss": 0.1539, "step": 1441 }, { "epoch": 0.6560509554140127, "grad_norm": 1.9149985878919944, "learning_rate": 4.790881391286963e-06, "loss": 0.1492, "step": 1442 }, { "epoch": 0.6565059144676979, "grad_norm": 2.224611827457958, "learning_rate": 4.790595172410914e-06, "loss": 0.1771, "step": 1443 }, { "epoch": 0.6569608735213831, "grad_norm": 2.2710831934815423, "learning_rate": 4.79030876635777e-06, "loss": 0.1816, "step": 1444 }, { "epoch": 0.6574158325750682, "grad_norm": 1.686396567912197, "learning_rate": 4.790022173150938e-06, "loss": 0.1715, "step": 1445 }, { "epoch": 0.6578707916287534, "grad_norm": 1.6844379519791872, "learning_rate": 4.789735392813835e-06, "loss": 0.1612, "step": 1446 }, { "epoch": 0.6583257506824386, "grad_norm": 1.9308684762069341, "learning_rate": 4.789448425369896e-06, "loss": 0.1943, "step": 1447 }, { "epoch": 0.6587807097361238, "grad_norm": 1.7813876642605184, "learning_rate": 4.789161270842571e-06, "loss": 0.133, "step": 1448 }, { "epoch": 0.6592356687898089, "grad_norm": 1.7016656003147437, "learning_rate": 4.7888739292553235e-06, "loss": 0.1787, "step": 1449 }, { "epoch": 0.6596906278434941, "grad_norm": 1.788996418731665, "learning_rate": 4.788586400631636e-06, "loss": 0.2144, "step": 1450 }, { "epoch": 0.6601455868971793, "grad_norm": 1.1868611743252886, "learning_rate": 4.788298684995003e-06, "loss": 0.1411, "step": 1451 }, { "epoch": 0.6606005459508644, "grad_norm": 1.3784782394299329, "learning_rate": 4.7880107823689355e-06, "loss": 0.1394, "step": 1452 }, { "epoch": 0.6610555050045496, "grad_norm": 2.38570648853941, "learning_rate": 4.787722692776958e-06, "loss": 0.2177, "step": 1453 }, { "epoch": 0.6615104640582348, "grad_norm": 1.885827372966156, "learning_rate": 4.787434416242615e-06, "loss": 0.1932, "step": 1454 }, { "epoch": 0.66196542311192, "grad_norm": 2.0741165529803305, "learning_rate": 4.787145952789461e-06, "loss": 0.1916, "step": 1455 }, { "epoch": 0.6624203821656051, "grad_norm": 2.2824023726624216, "learning_rate": 4.786857302441069e-06, "loss": 0.154, "step": 1456 }, { "epoch": 0.6628753412192903, "grad_norm": 1.9364048955005693, "learning_rate": 4.786568465221025e-06, "loss": 0.1456, "step": 1457 }, { "epoch": 0.6633303002729755, "grad_norm": 2.085706626351343, "learning_rate": 4.7862794411529315e-06, "loss": 0.2085, "step": 1458 }, { "epoch": 0.6637852593266605, "grad_norm": 1.614288560024189, "learning_rate": 4.7859902302604075e-06, "loss": 0.174, "step": 1459 }, { "epoch": 0.6642402183803457, "grad_norm": 2.5891987139037305, "learning_rate": 4.785700832567085e-06, "loss": 0.2207, "step": 1460 }, { "epoch": 0.664695177434031, "grad_norm": 1.60390922794205, "learning_rate": 4.785411248096613e-06, "loss": 0.1694, "step": 1461 }, { "epoch": 0.6651501364877161, "grad_norm": 1.9008758556011767, "learning_rate": 4.785121476872654e-06, "loss": 0.1917, "step": 1462 }, { "epoch": 0.6656050955414012, "grad_norm": 1.8830534414569509, "learning_rate": 4.784831518918888e-06, "loss": 0.1738, "step": 1463 }, { "epoch": 0.6660600545950864, "grad_norm": 1.7207750442706227, "learning_rate": 4.784541374259008e-06, "loss": 0.15, "step": 1464 }, { "epoch": 0.6665150136487716, "grad_norm": 1.875368507153303, "learning_rate": 4.7842510429167244e-06, "loss": 0.1785, "step": 1465 }, { "epoch": 0.6669699727024567, "grad_norm": 1.423039570984651, "learning_rate": 4.783960524915761e-06, "loss": 0.1618, "step": 1466 }, { "epoch": 0.6674249317561419, "grad_norm": 3.369804205318982, "learning_rate": 4.783669820279858e-06, "loss": 0.2151, "step": 1467 }, { "epoch": 0.6678798908098271, "grad_norm": 1.7236530224714224, "learning_rate": 4.783378929032769e-06, "loss": 0.1449, "step": 1468 }, { "epoch": 0.6683348498635123, "grad_norm": 1.897670469007501, "learning_rate": 4.783087851198267e-06, "loss": 0.1565, "step": 1469 }, { "epoch": 0.6687898089171974, "grad_norm": 2.120484944530229, "learning_rate": 4.7827965868001356e-06, "loss": 0.146, "step": 1470 }, { "epoch": 0.6692447679708826, "grad_norm": 1.5164080428619426, "learning_rate": 4.782505135862176e-06, "loss": 0.1948, "step": 1471 }, { "epoch": 0.6696997270245678, "grad_norm": 1.7069357913374903, "learning_rate": 4.782213498408205e-06, "loss": 0.1592, "step": 1472 }, { "epoch": 0.6701546860782529, "grad_norm": 1.809748302750509, "learning_rate": 4.781921674462053e-06, "loss": 0.1314, "step": 1473 }, { "epoch": 0.6706096451319381, "grad_norm": 2.1951569204558927, "learning_rate": 4.781629664047566e-06, "loss": 0.1845, "step": 1474 }, { "epoch": 0.6710646041856233, "grad_norm": 1.3071594737849044, "learning_rate": 4.781337467188607e-06, "loss": 0.1436, "step": 1475 }, { "epoch": 0.6715195632393085, "grad_norm": 1.945295439800649, "learning_rate": 4.781045083909053e-06, "loss": 0.1855, "step": 1476 }, { "epoch": 0.6719745222929936, "grad_norm": 2.1383665971380053, "learning_rate": 4.780752514232796e-06, "loss": 0.1746, "step": 1477 }, { "epoch": 0.6724294813466788, "grad_norm": 1.9493775213300697, "learning_rate": 4.780459758183743e-06, "loss": 0.136, "step": 1478 }, { "epoch": 0.672884440400364, "grad_norm": 1.5588501717449852, "learning_rate": 4.780166815785817e-06, "loss": 0.1564, "step": 1479 }, { "epoch": 0.6733393994540491, "grad_norm": 1.9111191141451183, "learning_rate": 4.7798736870629554e-06, "loss": 0.1722, "step": 1480 }, { "epoch": 0.6737943585077343, "grad_norm": 1.7396374086258946, "learning_rate": 4.779580372039113e-06, "loss": 0.1569, "step": 1481 }, { "epoch": 0.6742493175614195, "grad_norm": 2.2814229407003563, "learning_rate": 4.779286870738256e-06, "loss": 0.1576, "step": 1482 }, { "epoch": 0.6747042766151047, "grad_norm": 2.543619017373989, "learning_rate": 4.778993183184371e-06, "loss": 0.1743, "step": 1483 }, { "epoch": 0.6751592356687898, "grad_norm": 2.003249738108025, "learning_rate": 4.778699309401453e-06, "loss": 0.2083, "step": 1484 }, { "epoch": 0.675614194722475, "grad_norm": 1.7140899951572492, "learning_rate": 4.7784052494135195e-06, "loss": 0.1649, "step": 1485 }, { "epoch": 0.6760691537761602, "grad_norm": 1.6177440846188005, "learning_rate": 4.778111003244596e-06, "loss": 0.1706, "step": 1486 }, { "epoch": 0.6765241128298453, "grad_norm": 1.3540158476274282, "learning_rate": 4.777816570918731e-06, "loss": 0.1474, "step": 1487 }, { "epoch": 0.6769790718835305, "grad_norm": 1.8863006900369008, "learning_rate": 4.777521952459982e-06, "loss": 0.1995, "step": 1488 }, { "epoch": 0.6774340309372157, "grad_norm": 2.2667108941921073, "learning_rate": 4.777227147892424e-06, "loss": 0.1855, "step": 1489 }, { "epoch": 0.6778889899909009, "grad_norm": 1.9407891934102777, "learning_rate": 4.776932157240147e-06, "loss": 0.1503, "step": 1490 }, { "epoch": 0.678343949044586, "grad_norm": 2.102459646475576, "learning_rate": 4.776636980527257e-06, "loss": 0.1388, "step": 1491 }, { "epoch": 0.6787989080982711, "grad_norm": 2.08408986696494, "learning_rate": 4.776341617777874e-06, "loss": 0.1933, "step": 1492 }, { "epoch": 0.6792538671519563, "grad_norm": 1.5090681867773854, "learning_rate": 4.776046069016135e-06, "loss": 0.1617, "step": 1493 }, { "epoch": 0.6797088262056415, "grad_norm": 2.463007954699752, "learning_rate": 4.775750334266188e-06, "loss": 0.2267, "step": 1494 }, { "epoch": 0.6801637852593266, "grad_norm": 1.0819737688059052, "learning_rate": 4.775454413552202e-06, "loss": 0.1047, "step": 1495 }, { "epoch": 0.6806187443130118, "grad_norm": 2.180583587749644, "learning_rate": 4.775158306898358e-06, "loss": 0.1147, "step": 1496 }, { "epoch": 0.681073703366697, "grad_norm": 1.4888818210097596, "learning_rate": 4.774862014328849e-06, "loss": 0.1531, "step": 1497 }, { "epoch": 0.6815286624203821, "grad_norm": 1.4821796970713637, "learning_rate": 4.774565535867892e-06, "loss": 0.163, "step": 1498 }, { "epoch": 0.6819836214740673, "grad_norm": 1.9349751384396998, "learning_rate": 4.77426887153971e-06, "loss": 0.1602, "step": 1499 }, { "epoch": 0.6824385805277525, "grad_norm": 2.068635944499767, "learning_rate": 4.773972021368546e-06, "loss": 0.1934, "step": 1500 }, { "epoch": 0.6828935395814377, "grad_norm": 1.9557854149934149, "learning_rate": 4.773674985378658e-06, "loss": 0.2143, "step": 1501 }, { "epoch": 0.6833484986351228, "grad_norm": 2.6563423898144936, "learning_rate": 4.773377763594319e-06, "loss": 0.1837, "step": 1502 }, { "epoch": 0.683803457688808, "grad_norm": 2.4819107124862856, "learning_rate": 4.773080356039814e-06, "loss": 0.1975, "step": 1503 }, { "epoch": 0.6842584167424932, "grad_norm": 1.7036233463379575, "learning_rate": 4.772782762739448e-06, "loss": 0.1848, "step": 1504 }, { "epoch": 0.6847133757961783, "grad_norm": 1.9141994818014876, "learning_rate": 4.772484983717539e-06, "loss": 0.2006, "step": 1505 }, { "epoch": 0.6851683348498635, "grad_norm": 2.4521735191952114, "learning_rate": 4.77218701899842e-06, "loss": 0.2101, "step": 1506 }, { "epoch": 0.6856232939035487, "grad_norm": 2.0961682322351174, "learning_rate": 4.771888868606438e-06, "loss": 0.2065, "step": 1507 }, { "epoch": 0.6860782529572339, "grad_norm": 1.6218330474990592, "learning_rate": 4.771590532565957e-06, "loss": 0.1255, "step": 1508 }, { "epoch": 0.686533212010919, "grad_norm": 1.9721609486698313, "learning_rate": 4.771292010901357e-06, "loss": 0.1303, "step": 1509 }, { "epoch": 0.6869881710646042, "grad_norm": 2.121063258188487, "learning_rate": 4.77099330363703e-06, "loss": 0.149, "step": 1510 }, { "epoch": 0.6874431301182894, "grad_norm": 1.4516172378682393, "learning_rate": 4.770694410797387e-06, "loss": 0.1318, "step": 1511 }, { "epoch": 0.6878980891719745, "grad_norm": 1.6701384225121902, "learning_rate": 4.770395332406851e-06, "loss": 0.1459, "step": 1512 }, { "epoch": 0.6883530482256597, "grad_norm": 1.6796065018549693, "learning_rate": 4.770096068489861e-06, "loss": 0.1599, "step": 1513 }, { "epoch": 0.6888080072793449, "grad_norm": 1.235533430237688, "learning_rate": 4.769796619070872e-06, "loss": 0.1519, "step": 1514 }, { "epoch": 0.6892629663330301, "grad_norm": 1.3347747968404207, "learning_rate": 4.769496984174353e-06, "loss": 0.1064, "step": 1515 }, { "epoch": 0.6897179253867152, "grad_norm": 1.5781140890537728, "learning_rate": 4.769197163824791e-06, "loss": 0.1435, "step": 1516 }, { "epoch": 0.6901728844404004, "grad_norm": 2.213137403753888, "learning_rate": 4.768897158046683e-06, "loss": 0.1866, "step": 1517 }, { "epoch": 0.6906278434940856, "grad_norm": 1.5778012312077723, "learning_rate": 4.768596966864546e-06, "loss": 0.1604, "step": 1518 }, { "epoch": 0.6910828025477707, "grad_norm": 1.652969574663111, "learning_rate": 4.76829659030291e-06, "loss": 0.1869, "step": 1519 }, { "epoch": 0.6915377616014559, "grad_norm": 1.5361209471256771, "learning_rate": 4.767996028386319e-06, "loss": 0.1457, "step": 1520 }, { "epoch": 0.6919927206551411, "grad_norm": 2.936222163725796, "learning_rate": 4.767695281139336e-06, "loss": 0.1881, "step": 1521 }, { "epoch": 0.6924476797088263, "grad_norm": 2.3134771803324905, "learning_rate": 4.767394348586535e-06, "loss": 0.1599, "step": 1522 }, { "epoch": 0.6929026387625113, "grad_norm": 2.4498437084815428, "learning_rate": 4.767093230752507e-06, "loss": 0.2138, "step": 1523 }, { "epoch": 0.6933575978161965, "grad_norm": 1.5332362659492962, "learning_rate": 4.766791927661859e-06, "loss": 0.151, "step": 1524 }, { "epoch": 0.6938125568698817, "grad_norm": 1.7915535564744174, "learning_rate": 4.766490439339211e-06, "loss": 0.1318, "step": 1525 }, { "epoch": 0.6942675159235668, "grad_norm": 1.6447847233863087, "learning_rate": 4.7661887658092e-06, "loss": 0.162, "step": 1526 }, { "epoch": 0.694722474977252, "grad_norm": 2.9781233092582866, "learning_rate": 4.765886907096477e-06, "loss": 0.2619, "step": 1527 }, { "epoch": 0.6951774340309372, "grad_norm": 1.7140676149721272, "learning_rate": 4.7655848632257084e-06, "loss": 0.1425, "step": 1528 }, { "epoch": 0.6956323930846224, "grad_norm": 2.4534906180849116, "learning_rate": 4.7652826342215764e-06, "loss": 0.236, "step": 1529 }, { "epoch": 0.6960873521383075, "grad_norm": 1.6478858265647598, "learning_rate": 4.764980220108777e-06, "loss": 0.1955, "step": 1530 }, { "epoch": 0.6965423111919927, "grad_norm": 2.306316562409567, "learning_rate": 4.764677620912022e-06, "loss": 0.2079, "step": 1531 }, { "epoch": 0.6969972702456779, "grad_norm": 1.644994735808915, "learning_rate": 4.764374836656041e-06, "loss": 0.1442, "step": 1532 }, { "epoch": 0.697452229299363, "grad_norm": 1.4036507182888944, "learning_rate": 4.764071867365571e-06, "loss": 0.1638, "step": 1533 }, { "epoch": 0.6979071883530482, "grad_norm": 1.5164218367626467, "learning_rate": 4.763768713065375e-06, "loss": 0.156, "step": 1534 }, { "epoch": 0.6983621474067334, "grad_norm": 1.7701773803690557, "learning_rate": 4.763465373780223e-06, "loss": 0.1145, "step": 1535 }, { "epoch": 0.6988171064604186, "grad_norm": 2.076859289782232, "learning_rate": 4.763161849534902e-06, "loss": 0.1561, "step": 1536 }, { "epoch": 0.6992720655141037, "grad_norm": 1.6167208008101008, "learning_rate": 4.762858140354214e-06, "loss": 0.1621, "step": 1537 }, { "epoch": 0.6997270245677889, "grad_norm": 1.4746209465407152, "learning_rate": 4.7625542462629785e-06, "loss": 0.1768, "step": 1538 }, { "epoch": 0.7001819836214741, "grad_norm": 1.4200002114989836, "learning_rate": 4.762250167286027e-06, "loss": 0.0995, "step": 1539 }, { "epoch": 0.7006369426751592, "grad_norm": 2.080064440715621, "learning_rate": 4.761945903448209e-06, "loss": 0.2274, "step": 1540 }, { "epoch": 0.7010919017288444, "grad_norm": 1.346792584477521, "learning_rate": 4.761641454774386e-06, "loss": 0.1219, "step": 1541 }, { "epoch": 0.7015468607825296, "grad_norm": 2.36691492405669, "learning_rate": 4.761336821289436e-06, "loss": 0.2965, "step": 1542 }, { "epoch": 0.7020018198362148, "grad_norm": 1.773901757295841, "learning_rate": 4.761032003018254e-06, "loss": 0.163, "step": 1543 }, { "epoch": 0.7024567788898999, "grad_norm": 1.6774939072873407, "learning_rate": 4.760726999985748e-06, "loss": 0.1315, "step": 1544 }, { "epoch": 0.7029117379435851, "grad_norm": 1.6552217973496692, "learning_rate": 4.7604218122168406e-06, "loss": 0.1298, "step": 1545 }, { "epoch": 0.7033666969972703, "grad_norm": 1.91830208867601, "learning_rate": 4.760116439736471e-06, "loss": 0.2525, "step": 1546 }, { "epoch": 0.7038216560509554, "grad_norm": 1.564874376143588, "learning_rate": 4.759810882569591e-06, "loss": 0.1863, "step": 1547 }, { "epoch": 0.7042766151046406, "grad_norm": 1.4864041422513101, "learning_rate": 4.759505140741172e-06, "loss": 0.1063, "step": 1548 }, { "epoch": 0.7047315741583258, "grad_norm": 2.549801333631036, "learning_rate": 4.759199214276196e-06, "loss": 0.2505, "step": 1549 }, { "epoch": 0.705186533212011, "grad_norm": 1.5401594920414479, "learning_rate": 4.758893103199665e-06, "loss": 0.1624, "step": 1550 }, { "epoch": 0.7056414922656961, "grad_norm": 1.6343764429957106, "learning_rate": 4.758586807536588e-06, "loss": 0.1545, "step": 1551 }, { "epoch": 0.7060964513193813, "grad_norm": 1.6039711645022867, "learning_rate": 4.758280327311998e-06, "loss": 0.1134, "step": 1552 }, { "epoch": 0.7065514103730665, "grad_norm": 2.2883990951010063, "learning_rate": 4.757973662550938e-06, "loss": 0.1899, "step": 1553 }, { "epoch": 0.7070063694267515, "grad_norm": 1.7249554511478242, "learning_rate": 4.757666813278466e-06, "loss": 0.1725, "step": 1554 }, { "epoch": 0.7074613284804367, "grad_norm": 2.041262841608907, "learning_rate": 4.757359779519659e-06, "loss": 0.2481, "step": 1555 }, { "epoch": 0.707916287534122, "grad_norm": 1.7815243564082959, "learning_rate": 4.757052561299604e-06, "loss": 0.2166, "step": 1556 }, { "epoch": 0.7083712465878071, "grad_norm": 1.5514238648411727, "learning_rate": 4.756745158643407e-06, "loss": 0.224, "step": 1557 }, { "epoch": 0.7088262056414922, "grad_norm": 1.8608039671832461, "learning_rate": 4.7564375715761865e-06, "loss": 0.2223, "step": 1558 }, { "epoch": 0.7092811646951774, "grad_norm": 1.6157629653628103, "learning_rate": 4.756129800123078e-06, "loss": 0.1293, "step": 1559 }, { "epoch": 0.7097361237488626, "grad_norm": 1.4596213449886457, "learning_rate": 4.755821844309232e-06, "loss": 0.1805, "step": 1560 }, { "epoch": 0.7101910828025477, "grad_norm": 1.7295068196827752, "learning_rate": 4.75551370415981e-06, "loss": 0.1599, "step": 1561 }, { "epoch": 0.7106460418562329, "grad_norm": 2.0606393433385612, "learning_rate": 4.755205379699996e-06, "loss": 0.1941, "step": 1562 }, { "epoch": 0.7111010009099181, "grad_norm": 2.0979325727754294, "learning_rate": 4.75489687095498e-06, "loss": 0.1913, "step": 1563 }, { "epoch": 0.7115559599636033, "grad_norm": 2.2303398669678076, "learning_rate": 4.754588177949977e-06, "loss": 0.1478, "step": 1564 }, { "epoch": 0.7120109190172884, "grad_norm": 2.093261606281437, "learning_rate": 4.7542793007102086e-06, "loss": 0.1815, "step": 1565 }, { "epoch": 0.7124658780709736, "grad_norm": 1.4472751266274675, "learning_rate": 4.7539702392609165e-06, "loss": 0.1697, "step": 1566 }, { "epoch": 0.7129208371246588, "grad_norm": 2.0281126718428077, "learning_rate": 4.753660993627356e-06, "loss": 0.0948, "step": 1567 }, { "epoch": 0.7133757961783439, "grad_norm": 1.5189147438423232, "learning_rate": 4.753351563834795e-06, "loss": 0.1727, "step": 1568 }, { "epoch": 0.7138307552320291, "grad_norm": 1.7409543127807352, "learning_rate": 4.753041949908521e-06, "loss": 0.1642, "step": 1569 }, { "epoch": 0.7142857142857143, "grad_norm": 2.194503112395564, "learning_rate": 4.752732151873834e-06, "loss": 0.2196, "step": 1570 }, { "epoch": 0.7147406733393995, "grad_norm": 1.697163266188786, "learning_rate": 4.752422169756048e-06, "loss": 0.1672, "step": 1571 }, { "epoch": 0.7151956323930846, "grad_norm": 1.8134253244717562, "learning_rate": 4.752112003580495e-06, "loss": 0.1603, "step": 1572 }, { "epoch": 0.7156505914467698, "grad_norm": 2.3783985389961915, "learning_rate": 4.751801653372518e-06, "loss": 0.1731, "step": 1573 }, { "epoch": 0.716105550500455, "grad_norm": 2.5039159852054795, "learning_rate": 4.751491119157481e-06, "loss": 0.1865, "step": 1574 }, { "epoch": 0.7165605095541401, "grad_norm": 1.619599621691377, "learning_rate": 4.751180400960756e-06, "loss": 0.1746, "step": 1575 }, { "epoch": 0.7170154686078253, "grad_norm": 1.65152231646464, "learning_rate": 4.7508694988077355e-06, "loss": 0.1515, "step": 1576 }, { "epoch": 0.7174704276615105, "grad_norm": 2.465040491157821, "learning_rate": 4.750558412723824e-06, "loss": 0.1966, "step": 1577 }, { "epoch": 0.7179253867151957, "grad_norm": 2.2789812780893364, "learning_rate": 4.750247142734442e-06, "loss": 0.1599, "step": 1578 }, { "epoch": 0.7183803457688808, "grad_norm": 1.7581577660091943, "learning_rate": 4.749935688865026e-06, "loss": 0.141, "step": 1579 }, { "epoch": 0.718835304822566, "grad_norm": 2.1794165158833914, "learning_rate": 4.749624051141026e-06, "loss": 0.1088, "step": 1580 }, { "epoch": 0.7192902638762512, "grad_norm": 1.443223743964179, "learning_rate": 4.7493122295879076e-06, "loss": 0.1189, "step": 1581 }, { "epoch": 0.7197452229299363, "grad_norm": 2.35745890496679, "learning_rate": 4.7490002242311525e-06, "loss": 0.2129, "step": 1582 }, { "epoch": 0.7202001819836215, "grad_norm": 1.5523835122804504, "learning_rate": 4.748688035096255e-06, "loss": 0.2081, "step": 1583 }, { "epoch": 0.7206551410373067, "grad_norm": 2.4968010568360692, "learning_rate": 4.748375662208726e-06, "loss": 0.1759, "step": 1584 }, { "epoch": 0.7211101000909919, "grad_norm": 1.9165363158958804, "learning_rate": 4.748063105594092e-06, "loss": 0.2267, "step": 1585 }, { "epoch": 0.721565059144677, "grad_norm": 1.7864622532435137, "learning_rate": 4.747750365277892e-06, "loss": 0.1648, "step": 1586 }, { "epoch": 0.7220200181983621, "grad_norm": 1.8532777769110087, "learning_rate": 4.747437441285684e-06, "loss": 0.1501, "step": 1587 }, { "epoch": 0.7224749772520473, "grad_norm": 1.7539173333380942, "learning_rate": 4.747124333643038e-06, "loss": 0.1883, "step": 1588 }, { "epoch": 0.7229299363057324, "grad_norm": 1.7153189766040051, "learning_rate": 4.746811042375538e-06, "loss": 0.1308, "step": 1589 }, { "epoch": 0.7233848953594176, "grad_norm": 1.5162583630812903, "learning_rate": 4.746497567508787e-06, "loss": 0.1571, "step": 1590 }, { "epoch": 0.7238398544131028, "grad_norm": 1.5546810521185177, "learning_rate": 4.7461839090684e-06, "loss": 0.1694, "step": 1591 }, { "epoch": 0.724294813466788, "grad_norm": 2.0021940033485404, "learning_rate": 4.745870067080007e-06, "loss": 0.171, "step": 1592 }, { "epoch": 0.7247497725204731, "grad_norm": 2.221217513727709, "learning_rate": 4.7455560415692545e-06, "loss": 0.231, "step": 1593 }, { "epoch": 0.7252047315741583, "grad_norm": 2.222153805045267, "learning_rate": 4.745241832561803e-06, "loss": 0.1446, "step": 1594 }, { "epoch": 0.7256596906278435, "grad_norm": 1.784667663061202, "learning_rate": 4.744927440083329e-06, "loss": 0.1646, "step": 1595 }, { "epoch": 0.7261146496815286, "grad_norm": 1.7626687045318659, "learning_rate": 4.744612864159522e-06, "loss": 0.1685, "step": 1596 }, { "epoch": 0.7265696087352138, "grad_norm": 1.9909235520315078, "learning_rate": 4.7442981048160895e-06, "loss": 0.1854, "step": 1597 }, { "epoch": 0.727024567788899, "grad_norm": 2.4131359111724464, "learning_rate": 4.74398316207875e-06, "loss": 0.1784, "step": 1598 }, { "epoch": 0.7274795268425842, "grad_norm": 2.3390737079991215, "learning_rate": 4.74366803597324e-06, "loss": 0.28, "step": 1599 }, { "epoch": 0.7279344858962693, "grad_norm": 1.5176778250654925, "learning_rate": 4.743352726525311e-06, "loss": 0.1119, "step": 1600 }, { "epoch": 0.7283894449499545, "grad_norm": 1.612075524542219, "learning_rate": 4.743037233760728e-06, "loss": 0.1548, "step": 1601 }, { "epoch": 0.7288444040036397, "grad_norm": 2.082336981370237, "learning_rate": 4.742721557705271e-06, "loss": 0.1907, "step": 1602 }, { "epoch": 0.7292993630573248, "grad_norm": 1.8874163681919673, "learning_rate": 4.7424056983847374e-06, "loss": 0.1872, "step": 1603 }, { "epoch": 0.72975432211101, "grad_norm": 1.9161874420851024, "learning_rate": 4.7420896558249366e-06, "loss": 0.1199, "step": 1604 }, { "epoch": 0.7302092811646952, "grad_norm": 1.9339794473206677, "learning_rate": 4.741773430051694e-06, "loss": 0.1467, "step": 1605 }, { "epoch": 0.7306642402183804, "grad_norm": 1.5901851811892251, "learning_rate": 4.74145702109085e-06, "loss": 0.1094, "step": 1606 }, { "epoch": 0.7311191992720655, "grad_norm": 2.678117310973907, "learning_rate": 4.741140428968261e-06, "loss": 0.2545, "step": 1607 }, { "epoch": 0.7315741583257507, "grad_norm": 1.4456239768846677, "learning_rate": 4.740823653709797e-06, "loss": 0.101, "step": 1608 }, { "epoch": 0.7320291173794359, "grad_norm": 1.5614448809750465, "learning_rate": 4.740506695341343e-06, "loss": 0.135, "step": 1609 }, { "epoch": 0.732484076433121, "grad_norm": 1.9409375225046157, "learning_rate": 4.740189553888801e-06, "loss": 0.2674, "step": 1610 }, { "epoch": 0.7329390354868062, "grad_norm": 1.757285590607046, "learning_rate": 4.739872229378085e-06, "loss": 0.1358, "step": 1611 }, { "epoch": 0.7333939945404914, "grad_norm": 1.7119351957596494, "learning_rate": 4.739554721835125e-06, "loss": 0.1405, "step": 1612 }, { "epoch": 0.7338489535941766, "grad_norm": 1.5407585285384973, "learning_rate": 4.739237031285867e-06, "loss": 0.1789, "step": 1613 }, { "epoch": 0.7343039126478617, "grad_norm": 1.8412394540639878, "learning_rate": 4.738919157756272e-06, "loss": 0.1741, "step": 1614 }, { "epoch": 0.7347588717015469, "grad_norm": 1.9093990086684758, "learning_rate": 4.738601101272313e-06, "loss": 0.1972, "step": 1615 }, { "epoch": 0.7352138307552321, "grad_norm": 1.6531050134000445, "learning_rate": 4.738282861859983e-06, "loss": 0.1828, "step": 1616 }, { "epoch": 0.7356687898089171, "grad_norm": 1.6958094821678005, "learning_rate": 4.737964439545284e-06, "loss": 0.1623, "step": 1617 }, { "epoch": 0.7361237488626023, "grad_norm": 1.9487516983862898, "learning_rate": 4.737645834354238e-06, "loss": 0.1761, "step": 1618 }, { "epoch": 0.7365787079162875, "grad_norm": 1.5339742875273046, "learning_rate": 4.737327046312879e-06, "loss": 0.1188, "step": 1619 }, { "epoch": 0.7370336669699727, "grad_norm": 1.8259875586922627, "learning_rate": 4.737008075447259e-06, "loss": 0.13, "step": 1620 }, { "epoch": 0.7374886260236578, "grad_norm": 2.112705655098723, "learning_rate": 4.73668892178344e-06, "loss": 0.162, "step": 1621 }, { "epoch": 0.737943585077343, "grad_norm": 2.1191881288248755, "learning_rate": 4.736369585347503e-06, "loss": 0.1882, "step": 1622 }, { "epoch": 0.7383985441310282, "grad_norm": 2.42511490554677, "learning_rate": 4.736050066165544e-06, "loss": 0.168, "step": 1623 }, { "epoch": 0.7388535031847133, "grad_norm": 2.5180747249974678, "learning_rate": 4.735730364263671e-06, "loss": 0.2462, "step": 1624 }, { "epoch": 0.7393084622383985, "grad_norm": 1.899152814897376, "learning_rate": 4.735410479668009e-06, "loss": 0.1649, "step": 1625 }, { "epoch": 0.7397634212920837, "grad_norm": 2.5891320586414506, "learning_rate": 4.735090412404697e-06, "loss": 0.2112, "step": 1626 }, { "epoch": 0.7402183803457689, "grad_norm": 1.6256945799338343, "learning_rate": 4.734770162499891e-06, "loss": 0.0995, "step": 1627 }, { "epoch": 0.740673339399454, "grad_norm": 2.115890838067561, "learning_rate": 4.734449729979759e-06, "loss": 0.1863, "step": 1628 }, { "epoch": 0.7411282984531392, "grad_norm": 1.8207130234699649, "learning_rate": 4.734129114870486e-06, "loss": 0.1621, "step": 1629 }, { "epoch": 0.7415832575068244, "grad_norm": 2.419448299752305, "learning_rate": 4.733808317198271e-06, "loss": 0.1682, "step": 1630 }, { "epoch": 0.7420382165605095, "grad_norm": 1.864719563201482, "learning_rate": 4.733487336989327e-06, "loss": 0.1534, "step": 1631 }, { "epoch": 0.7424931756141947, "grad_norm": 2.480364363656269, "learning_rate": 4.733166174269886e-06, "loss": 0.186, "step": 1632 }, { "epoch": 0.7429481346678799, "grad_norm": 2.0606766178805116, "learning_rate": 4.732844829066189e-06, "loss": 0.2189, "step": 1633 }, { "epoch": 0.7434030937215651, "grad_norm": 2.162055464706376, "learning_rate": 4.732523301404497e-06, "loss": 0.1969, "step": 1634 }, { "epoch": 0.7438580527752502, "grad_norm": 2.12376584678073, "learning_rate": 4.732201591311082e-06, "loss": 0.2101, "step": 1635 }, { "epoch": 0.7443130118289354, "grad_norm": 1.5079389097876976, "learning_rate": 4.731879698812233e-06, "loss": 0.1802, "step": 1636 }, { "epoch": 0.7447679708826206, "grad_norm": 1.744034863658637, "learning_rate": 4.731557623934255e-06, "loss": 0.1398, "step": 1637 }, { "epoch": 0.7452229299363057, "grad_norm": 2.7848754471064043, "learning_rate": 4.7312353667034645e-06, "loss": 0.2499, "step": 1638 }, { "epoch": 0.7456778889899909, "grad_norm": 2.58334353852049, "learning_rate": 4.730912927146197e-06, "loss": 0.2203, "step": 1639 }, { "epoch": 0.7461328480436761, "grad_norm": 2.0325933883862066, "learning_rate": 4.7305903052888e-06, "loss": 0.1563, "step": 1640 }, { "epoch": 0.7465878070973613, "grad_norm": 2.3443549071357057, "learning_rate": 4.730267501157636e-06, "loss": 0.1896, "step": 1641 }, { "epoch": 0.7470427661510464, "grad_norm": 2.003548520587404, "learning_rate": 4.729944514779084e-06, "loss": 0.1705, "step": 1642 }, { "epoch": 0.7474977252047316, "grad_norm": 1.3567793569480755, "learning_rate": 4.729621346179536e-06, "loss": 0.1429, "step": 1643 }, { "epoch": 0.7479526842584168, "grad_norm": 1.9172209433761784, "learning_rate": 4.7292979953854e-06, "loss": 0.1224, "step": 1644 }, { "epoch": 0.7484076433121019, "grad_norm": 1.7854487682262081, "learning_rate": 4.7289744624231004e-06, "loss": 0.1753, "step": 1645 }, { "epoch": 0.7488626023657871, "grad_norm": 2.0357381373480377, "learning_rate": 4.728650747319073e-06, "loss": 0.1844, "step": 1646 }, { "epoch": 0.7493175614194723, "grad_norm": 2.295347780668863, "learning_rate": 4.728326850099771e-06, "loss": 0.1949, "step": 1647 }, { "epoch": 0.7497725204731575, "grad_norm": 2.2592022682113564, "learning_rate": 4.728002770791663e-06, "loss": 0.1641, "step": 1648 }, { "epoch": 0.7502274795268425, "grad_norm": 1.8794487431290805, "learning_rate": 4.727678509421229e-06, "loss": 0.1672, "step": 1649 }, { "epoch": 0.7506824385805277, "grad_norm": 1.471409298797821, "learning_rate": 4.727354066014968e-06, "loss": 0.1251, "step": 1650 }, { "epoch": 0.7511373976342129, "grad_norm": 1.2272497564159228, "learning_rate": 4.727029440599391e-06, "loss": 0.1165, "step": 1651 }, { "epoch": 0.7515923566878981, "grad_norm": 1.7826119947445478, "learning_rate": 4.726704633201025e-06, "loss": 0.1367, "step": 1652 }, { "epoch": 0.7520473157415832, "grad_norm": 1.5654538387161951, "learning_rate": 4.726379643846412e-06, "loss": 0.1622, "step": 1653 }, { "epoch": 0.7525022747952684, "grad_norm": 2.0792625449816255, "learning_rate": 4.726054472562109e-06, "loss": 0.1741, "step": 1654 }, { "epoch": 0.7529572338489536, "grad_norm": 1.5223527837461277, "learning_rate": 4.725729119374687e-06, "loss": 0.1198, "step": 1655 }, { "epoch": 0.7534121929026387, "grad_norm": 1.5290108835892176, "learning_rate": 4.725403584310734e-06, "loss": 0.1026, "step": 1656 }, { "epoch": 0.7538671519563239, "grad_norm": 2.155319535005024, "learning_rate": 4.725077867396849e-06, "loss": 0.1652, "step": 1657 }, { "epoch": 0.7543221110100091, "grad_norm": 1.565904420652083, "learning_rate": 4.724751968659648e-06, "loss": 0.1628, "step": 1658 }, { "epoch": 0.7547770700636943, "grad_norm": 2.9773420234850345, "learning_rate": 4.724425888125764e-06, "loss": 0.2409, "step": 1659 }, { "epoch": 0.7552320291173794, "grad_norm": 2.3428961739867304, "learning_rate": 4.724099625821842e-06, "loss": 0.2216, "step": 1660 }, { "epoch": 0.7556869881710646, "grad_norm": 1.7855741504776685, "learning_rate": 4.723773181774543e-06, "loss": 0.1468, "step": 1661 }, { "epoch": 0.7561419472247498, "grad_norm": 1.96972618488323, "learning_rate": 4.723446556010542e-06, "loss": 0.1981, "step": 1662 }, { "epoch": 0.7565969062784349, "grad_norm": 1.6758348642722924, "learning_rate": 4.7231197485565275e-06, "loss": 0.169, "step": 1663 }, { "epoch": 0.7570518653321201, "grad_norm": 1.3954523503838552, "learning_rate": 4.722792759439209e-06, "loss": 0.1224, "step": 1664 }, { "epoch": 0.7575068243858053, "grad_norm": 2.060909913997174, "learning_rate": 4.722465588685302e-06, "loss": 0.2087, "step": 1665 }, { "epoch": 0.7579617834394905, "grad_norm": 1.5474467660765128, "learning_rate": 4.722138236321545e-06, "loss": 0.1013, "step": 1666 }, { "epoch": 0.7584167424931756, "grad_norm": 2.430153104930812, "learning_rate": 4.721810702374687e-06, "loss": 0.1439, "step": 1667 }, { "epoch": 0.7588717015468608, "grad_norm": 1.7773306327385723, "learning_rate": 4.721482986871491e-06, "loss": 0.1485, "step": 1668 }, { "epoch": 0.759326660600546, "grad_norm": 2.927464615752266, "learning_rate": 4.721155089838738e-06, "loss": 0.1962, "step": 1669 }, { "epoch": 0.7597816196542311, "grad_norm": 1.9730589581225906, "learning_rate": 4.720827011303222e-06, "loss": 0.1503, "step": 1670 }, { "epoch": 0.7602365787079163, "grad_norm": 1.953497394359563, "learning_rate": 4.720498751291751e-06, "loss": 0.182, "step": 1671 }, { "epoch": 0.7606915377616015, "grad_norm": 1.7839379977035983, "learning_rate": 4.72017030983115e-06, "loss": 0.2198, "step": 1672 }, { "epoch": 0.7611464968152867, "grad_norm": 1.7993088459777005, "learning_rate": 4.7198416869482575e-06, "loss": 0.1696, "step": 1673 }, { "epoch": 0.7616014558689718, "grad_norm": 1.892794250792964, "learning_rate": 4.719512882669926e-06, "loss": 0.1776, "step": 1674 }, { "epoch": 0.762056414922657, "grad_norm": 2.0202484520052035, "learning_rate": 4.719183897023027e-06, "loss": 0.1673, "step": 1675 }, { "epoch": 0.7625113739763422, "grad_norm": 2.2601047076044414, "learning_rate": 4.718854730034441e-06, "loss": 0.2183, "step": 1676 }, { "epoch": 0.7629663330300273, "grad_norm": 1.8760309869672118, "learning_rate": 4.718525381731066e-06, "loss": 0.1476, "step": 1677 }, { "epoch": 0.7634212920837125, "grad_norm": 1.5663417379599454, "learning_rate": 4.718195852139816e-06, "loss": 0.2014, "step": 1678 }, { "epoch": 0.7638762511373977, "grad_norm": 2.338496392531513, "learning_rate": 4.717866141287618e-06, "loss": 0.2422, "step": 1679 }, { "epoch": 0.7643312101910829, "grad_norm": 1.9053967868206603, "learning_rate": 4.717536249201416e-06, "loss": 0.1953, "step": 1680 }, { "epoch": 0.7647861692447679, "grad_norm": 1.831121224420973, "learning_rate": 4.7172061759081646e-06, "loss": 0.1626, "step": 1681 }, { "epoch": 0.7652411282984531, "grad_norm": 2.234380631915828, "learning_rate": 4.716875921434838e-06, "loss": 0.1754, "step": 1682 }, { "epoch": 0.7656960873521383, "grad_norm": 1.9990356821604962, "learning_rate": 4.716545485808421e-06, "loss": 0.1613, "step": 1683 }, { "epoch": 0.7661510464058234, "grad_norm": 1.956500719133962, "learning_rate": 4.716214869055918e-06, "loss": 0.1747, "step": 1684 }, { "epoch": 0.7666060054595086, "grad_norm": 1.7944596997359672, "learning_rate": 4.715884071204344e-06, "loss": 0.116, "step": 1685 }, { "epoch": 0.7670609645131938, "grad_norm": 1.93926106516618, "learning_rate": 4.715553092280731e-06, "loss": 0.2121, "step": 1686 }, { "epoch": 0.767515923566879, "grad_norm": 2.4656357214922626, "learning_rate": 4.7152219323121246e-06, "loss": 0.1772, "step": 1687 }, { "epoch": 0.7679708826205641, "grad_norm": 2.2402320393494253, "learning_rate": 4.714890591325586e-06, "loss": 0.2021, "step": 1688 }, { "epoch": 0.7684258416742493, "grad_norm": 1.7903156076682725, "learning_rate": 4.714559069348189e-06, "loss": 0.1825, "step": 1689 }, { "epoch": 0.7688808007279345, "grad_norm": 1.6420985192646667, "learning_rate": 4.714227366407027e-06, "loss": 0.1475, "step": 1690 }, { "epoch": 0.7693357597816196, "grad_norm": 2.2750484746487936, "learning_rate": 4.7138954825292035e-06, "loss": 0.1492, "step": 1691 }, { "epoch": 0.7697907188353048, "grad_norm": 1.992613507205851, "learning_rate": 4.71356341774184e-06, "loss": 0.2004, "step": 1692 }, { "epoch": 0.77024567788899, "grad_norm": 1.8507536466532999, "learning_rate": 4.713231172072069e-06, "loss": 0.1665, "step": 1693 }, { "epoch": 0.7707006369426752, "grad_norm": 2.074124213121433, "learning_rate": 4.712898745547043e-06, "loss": 0.1901, "step": 1694 }, { "epoch": 0.7711555959963603, "grad_norm": 2.2217772464991628, "learning_rate": 4.712566138193923e-06, "loss": 0.2007, "step": 1695 }, { "epoch": 0.7716105550500455, "grad_norm": 2.1110958043430936, "learning_rate": 4.712233350039892e-06, "loss": 0.1711, "step": 1696 }, { "epoch": 0.7720655141037307, "grad_norm": 1.7733407712061509, "learning_rate": 4.711900381112141e-06, "loss": 0.1401, "step": 1697 }, { "epoch": 0.7725204731574158, "grad_norm": 1.9082417250906683, "learning_rate": 4.71156723143788e-06, "loss": 0.1707, "step": 1698 }, { "epoch": 0.772975432211101, "grad_norm": 1.8677365381806925, "learning_rate": 4.711233901044332e-06, "loss": 0.1868, "step": 1699 }, { "epoch": 0.7734303912647862, "grad_norm": 2.0411961738002464, "learning_rate": 4.710900389958735e-06, "loss": 0.1744, "step": 1700 }, { "epoch": 0.7738853503184714, "grad_norm": 2.1935749697701, "learning_rate": 4.710566698208343e-06, "loss": 0.2385, "step": 1701 }, { "epoch": 0.7743403093721565, "grad_norm": 1.7404480081781704, "learning_rate": 4.710232825820424e-06, "loss": 0.1499, "step": 1702 }, { "epoch": 0.7747952684258417, "grad_norm": 1.477154965489664, "learning_rate": 4.709898772822258e-06, "loss": 0.1207, "step": 1703 }, { "epoch": 0.7752502274795269, "grad_norm": 1.7903520569742504, "learning_rate": 4.709564539241145e-06, "loss": 0.1257, "step": 1704 }, { "epoch": 0.775705186533212, "grad_norm": 1.509438293191361, "learning_rate": 4.709230125104396e-06, "loss": 0.1333, "step": 1705 }, { "epoch": 0.7761601455868972, "grad_norm": 1.352600254451033, "learning_rate": 4.708895530439339e-06, "loss": 0.1297, "step": 1706 }, { "epoch": 0.7766151046405824, "grad_norm": 2.80931496450313, "learning_rate": 4.708560755273313e-06, "loss": 0.1572, "step": 1707 }, { "epoch": 0.7770700636942676, "grad_norm": 2.614552054035137, "learning_rate": 4.7082257996336765e-06, "loss": 0.2392, "step": 1708 }, { "epoch": 0.7775250227479527, "grad_norm": 1.3897711262928594, "learning_rate": 4.707890663547801e-06, "loss": 0.1898, "step": 1709 }, { "epoch": 0.7779799818016379, "grad_norm": 1.3068004754745945, "learning_rate": 4.7075553470430695e-06, "loss": 0.1541, "step": 1710 }, { "epoch": 0.778434940855323, "grad_norm": 2.0471283874239337, "learning_rate": 4.707219850146885e-06, "loss": 0.189, "step": 1711 }, { "epoch": 0.7788898999090081, "grad_norm": 1.406237335222361, "learning_rate": 4.706884172886662e-06, "loss": 0.1534, "step": 1712 }, { "epoch": 0.7793448589626933, "grad_norm": 1.430209112364991, "learning_rate": 4.706548315289831e-06, "loss": 0.1505, "step": 1713 }, { "epoch": 0.7797998180163785, "grad_norm": 1.9880980157191188, "learning_rate": 4.706212277383836e-06, "loss": 0.1455, "step": 1714 }, { "epoch": 0.7802547770700637, "grad_norm": 1.9444624934450598, "learning_rate": 4.705876059196136e-06, "loss": 0.1919, "step": 1715 }, { "epoch": 0.7807097361237488, "grad_norm": 1.845006648683808, "learning_rate": 4.705539660754208e-06, "loss": 0.1379, "step": 1716 }, { "epoch": 0.781164695177434, "grad_norm": 1.7044046674717437, "learning_rate": 4.705203082085538e-06, "loss": 0.1323, "step": 1717 }, { "epoch": 0.7816196542311192, "grad_norm": 1.7912067195327883, "learning_rate": 4.70486632321763e-06, "loss": 0.2117, "step": 1718 }, { "epoch": 0.7820746132848043, "grad_norm": 1.9320743936658202, "learning_rate": 4.7045293841780034e-06, "loss": 0.1375, "step": 1719 }, { "epoch": 0.7825295723384895, "grad_norm": 1.7315009532080885, "learning_rate": 4.704192264994193e-06, "loss": 0.1162, "step": 1720 }, { "epoch": 0.7829845313921747, "grad_norm": 1.6176947094849203, "learning_rate": 4.703854965693743e-06, "loss": 0.1318, "step": 1721 }, { "epoch": 0.7834394904458599, "grad_norm": 2.40560948473341, "learning_rate": 4.703517486304218e-06, "loss": 0.1747, "step": 1722 }, { "epoch": 0.783894449499545, "grad_norm": 1.6675266396651778, "learning_rate": 4.703179826853195e-06, "loss": 0.1853, "step": 1723 }, { "epoch": 0.7843494085532302, "grad_norm": 1.8036543539560768, "learning_rate": 4.702841987368265e-06, "loss": 0.1358, "step": 1724 }, { "epoch": 0.7848043676069154, "grad_norm": 2.164797051503019, "learning_rate": 4.702503967877038e-06, "loss": 0.1531, "step": 1725 }, { "epoch": 0.7852593266606005, "grad_norm": 1.6083401375044635, "learning_rate": 4.702165768407132e-06, "loss": 0.1984, "step": 1726 }, { "epoch": 0.7857142857142857, "grad_norm": 1.9227015105668148, "learning_rate": 4.701827388986185e-06, "loss": 0.1962, "step": 1727 }, { "epoch": 0.7861692447679709, "grad_norm": 2.234973410496376, "learning_rate": 4.701488829641845e-06, "loss": 0.1313, "step": 1728 }, { "epoch": 0.7866242038216561, "grad_norm": 1.4707235359776172, "learning_rate": 4.701150090401782e-06, "loss": 0.1384, "step": 1729 }, { "epoch": 0.7870791628753412, "grad_norm": 1.4795549767962248, "learning_rate": 4.700811171293673e-06, "loss": 0.1192, "step": 1730 }, { "epoch": 0.7875341219290264, "grad_norm": 1.4765672888773027, "learning_rate": 4.700472072345214e-06, "loss": 0.1445, "step": 1731 }, { "epoch": 0.7879890809827116, "grad_norm": 1.7959851809677527, "learning_rate": 4.700132793584113e-06, "loss": 0.176, "step": 1732 }, { "epoch": 0.7884440400363967, "grad_norm": 2.0011742977871365, "learning_rate": 4.699793335038098e-06, "loss": 0.2073, "step": 1733 }, { "epoch": 0.7888989990900819, "grad_norm": 1.5877933891450462, "learning_rate": 4.699453696734905e-06, "loss": 0.1163, "step": 1734 }, { "epoch": 0.7893539581437671, "grad_norm": 1.994398441190682, "learning_rate": 4.699113878702288e-06, "loss": 0.1997, "step": 1735 }, { "epoch": 0.7898089171974523, "grad_norm": 1.6186509072614172, "learning_rate": 4.698773880968017e-06, "loss": 0.1359, "step": 1736 }, { "epoch": 0.7902638762511374, "grad_norm": 1.3756660961296079, "learning_rate": 4.698433703559874e-06, "loss": 0.1717, "step": 1737 }, { "epoch": 0.7907188353048226, "grad_norm": 1.4461545675657563, "learning_rate": 4.698093346505656e-06, "loss": 0.1381, "step": 1738 }, { "epoch": 0.7911737943585078, "grad_norm": 1.975346854852977, "learning_rate": 4.697752809833177e-06, "loss": 0.1651, "step": 1739 }, { "epoch": 0.7916287534121929, "grad_norm": 2.098203427770575, "learning_rate": 4.697412093570263e-06, "loss": 0.1966, "step": 1740 }, { "epoch": 0.792083712465878, "grad_norm": 1.7884148647415081, "learning_rate": 4.697071197744756e-06, "loss": 0.1603, "step": 1741 }, { "epoch": 0.7925386715195633, "grad_norm": 2.20000836754146, "learning_rate": 4.6967301223845115e-06, "loss": 0.168, "step": 1742 }, { "epoch": 0.7929936305732485, "grad_norm": 1.469643335454165, "learning_rate": 4.696388867517403e-06, "loss": 0.1574, "step": 1743 }, { "epoch": 0.7934485896269335, "grad_norm": 1.7067059652811334, "learning_rate": 4.696047433171316e-06, "loss": 0.098, "step": 1744 }, { "epoch": 0.7939035486806187, "grad_norm": 2.0780505106943896, "learning_rate": 4.695705819374149e-06, "loss": 0.178, "step": 1745 }, { "epoch": 0.7943585077343039, "grad_norm": 1.8450097546428101, "learning_rate": 4.695364026153818e-06, "loss": 0.1637, "step": 1746 }, { "epoch": 0.794813466787989, "grad_norm": 1.4237762817404553, "learning_rate": 4.695022053538253e-06, "loss": 0.1416, "step": 1747 }, { "epoch": 0.7952684258416742, "grad_norm": 2.485744457764155, "learning_rate": 4.694679901555398e-06, "loss": 0.2207, "step": 1748 }, { "epoch": 0.7957233848953594, "grad_norm": 2.5149587392089128, "learning_rate": 4.694337570233213e-06, "loss": 0.1485, "step": 1749 }, { "epoch": 0.7961783439490446, "grad_norm": 2.0342522616249736, "learning_rate": 4.693995059599672e-06, "loss": 0.2071, "step": 1750 }, { "epoch": 0.7966333030027297, "grad_norm": 1.7181022322257762, "learning_rate": 4.693652369682762e-06, "loss": 0.2112, "step": 1751 }, { "epoch": 0.7970882620564149, "grad_norm": 1.843190625559269, "learning_rate": 4.693309500510487e-06, "loss": 0.1632, "step": 1752 }, { "epoch": 0.7975432211101001, "grad_norm": 2.7841529899485917, "learning_rate": 4.692966452110864e-06, "loss": 0.1534, "step": 1753 }, { "epoch": 0.7979981801637852, "grad_norm": 1.5395427013532956, "learning_rate": 4.6926232245119265e-06, "loss": 0.2195, "step": 1754 }, { "epoch": 0.7984531392174704, "grad_norm": 2.5074996998585335, "learning_rate": 4.69227981774172e-06, "loss": 0.1856, "step": 1755 }, { "epoch": 0.7989080982711556, "grad_norm": 2.449264992514986, "learning_rate": 4.691936231828308e-06, "loss": 0.1779, "step": 1756 }, { "epoch": 0.7993630573248408, "grad_norm": 2.481345422810722, "learning_rate": 4.691592466799766e-06, "loss": 0.1889, "step": 1757 }, { "epoch": 0.7998180163785259, "grad_norm": 1.637751747233988, "learning_rate": 4.691248522684184e-06, "loss": 0.1349, "step": 1758 }, { "epoch": 0.8002729754322111, "grad_norm": 1.6804430027452057, "learning_rate": 4.690904399509668e-06, "loss": 0.1435, "step": 1759 }, { "epoch": 0.8007279344858963, "grad_norm": 2.742847873433655, "learning_rate": 4.69056009730434e-06, "loss": 0.2232, "step": 1760 }, { "epoch": 0.8011828935395814, "grad_norm": 2.40569741832729, "learning_rate": 4.690215616096332e-06, "loss": 0.1711, "step": 1761 }, { "epoch": 0.8016378525932666, "grad_norm": 2.4832090753479834, "learning_rate": 4.689870955913796e-06, "loss": 0.1587, "step": 1762 }, { "epoch": 0.8020928116469518, "grad_norm": 2.0194488171697063, "learning_rate": 4.689526116784894e-06, "loss": 0.167, "step": 1763 }, { "epoch": 0.802547770700637, "grad_norm": 3.338733219322262, "learning_rate": 4.689181098737805e-06, "loss": 0.2404, "step": 1764 }, { "epoch": 0.8030027297543221, "grad_norm": 2.150659967515375, "learning_rate": 4.6888359018007235e-06, "loss": 0.1288, "step": 1765 }, { "epoch": 0.8034576888080073, "grad_norm": 1.9131033030180753, "learning_rate": 4.6884905260018565e-06, "loss": 0.1638, "step": 1766 }, { "epoch": 0.8039126478616925, "grad_norm": 1.7799343855450172, "learning_rate": 4.688144971369427e-06, "loss": 0.2032, "step": 1767 }, { "epoch": 0.8043676069153776, "grad_norm": 1.9191485121544656, "learning_rate": 4.687799237931673e-06, "loss": 0.1597, "step": 1768 }, { "epoch": 0.8048225659690628, "grad_norm": 1.5130848101685814, "learning_rate": 4.687453325716844e-06, "loss": 0.1572, "step": 1769 }, { "epoch": 0.805277525022748, "grad_norm": 2.380748372992281, "learning_rate": 4.687107234753208e-06, "loss": 0.1617, "step": 1770 }, { "epoch": 0.8057324840764332, "grad_norm": 2.7874285940928067, "learning_rate": 4.686760965069046e-06, "loss": 0.1679, "step": 1771 }, { "epoch": 0.8061874431301183, "grad_norm": 1.9146816786227654, "learning_rate": 4.686414516692653e-06, "loss": 0.2267, "step": 1772 }, { "epoch": 0.8066424021838035, "grad_norm": 1.6656788150165645, "learning_rate": 4.68606788965234e-06, "loss": 0.1608, "step": 1773 }, { "epoch": 0.8070973612374887, "grad_norm": 2.859758352959496, "learning_rate": 4.68572108397643e-06, "loss": 0.2065, "step": 1774 }, { "epoch": 0.8075523202911737, "grad_norm": 1.7922493594029372, "learning_rate": 4.6853740996932645e-06, "loss": 0.1331, "step": 1775 }, { "epoch": 0.8080072793448589, "grad_norm": 1.9382561831132192, "learning_rate": 4.685026936831196e-06, "loss": 0.1693, "step": 1776 }, { "epoch": 0.8084622383985441, "grad_norm": 2.2029297725133237, "learning_rate": 4.684679595418595e-06, "loss": 0.1988, "step": 1777 }, { "epoch": 0.8089171974522293, "grad_norm": 1.6643742621321755, "learning_rate": 4.684332075483843e-06, "loss": 0.1776, "step": 1778 }, { "epoch": 0.8093721565059144, "grad_norm": 1.928150435175855, "learning_rate": 4.6839843770553374e-06, "loss": 0.2135, "step": 1779 }, { "epoch": 0.8098271155595996, "grad_norm": 1.5299034058186116, "learning_rate": 4.683636500161491e-06, "loss": 0.1287, "step": 1780 }, { "epoch": 0.8102820746132848, "grad_norm": 1.7105211102821978, "learning_rate": 4.683288444830732e-06, "loss": 0.1858, "step": 1781 }, { "epoch": 0.8107370336669699, "grad_norm": 2.065121875110959, "learning_rate": 4.6829402110915015e-06, "loss": 0.1573, "step": 1782 }, { "epoch": 0.8111919927206551, "grad_norm": 1.7915836692891514, "learning_rate": 4.682591798972253e-06, "loss": 0.163, "step": 1783 }, { "epoch": 0.8116469517743403, "grad_norm": 1.9011358499015634, "learning_rate": 4.682243208501461e-06, "loss": 0.1565, "step": 1784 }, { "epoch": 0.8121019108280255, "grad_norm": 1.8705464277674988, "learning_rate": 4.681894439707609e-06, "loss": 0.1532, "step": 1785 }, { "epoch": 0.8125568698817106, "grad_norm": 1.5282025887885624, "learning_rate": 4.681545492619195e-06, "loss": 0.1212, "step": 1786 }, { "epoch": 0.8130118289353958, "grad_norm": 2.4618870744714823, "learning_rate": 4.681196367264736e-06, "loss": 0.1737, "step": 1787 }, { "epoch": 0.813466787989081, "grad_norm": 1.5010216528583702, "learning_rate": 4.680847063672761e-06, "loss": 0.1349, "step": 1788 }, { "epoch": 0.8139217470427661, "grad_norm": 1.577176673126615, "learning_rate": 4.680497581871811e-06, "loss": 0.1736, "step": 1789 }, { "epoch": 0.8143767060964513, "grad_norm": 2.2216456467027603, "learning_rate": 4.680147921890447e-06, "loss": 0.1589, "step": 1790 }, { "epoch": 0.8148316651501365, "grad_norm": 2.2828861135151377, "learning_rate": 4.67979808375724e-06, "loss": 0.1864, "step": 1791 }, { "epoch": 0.8152866242038217, "grad_norm": 2.411410847612128, "learning_rate": 4.679448067500777e-06, "loss": 0.1704, "step": 1792 }, { "epoch": 0.8157415832575068, "grad_norm": 2.6745924756823274, "learning_rate": 4.67909787314966e-06, "loss": 0.1855, "step": 1793 }, { "epoch": 0.816196542311192, "grad_norm": 1.7549666443082432, "learning_rate": 4.678747500732505e-06, "loss": 0.2204, "step": 1794 }, { "epoch": 0.8166515013648772, "grad_norm": 2.4603767836599086, "learning_rate": 4.6783969502779455e-06, "loss": 0.1805, "step": 1795 }, { "epoch": 0.8171064604185623, "grad_norm": 1.5762472297440564, "learning_rate": 4.6780462218146236e-06, "loss": 0.1393, "step": 1796 }, { "epoch": 0.8175614194722475, "grad_norm": 1.6619849736476204, "learning_rate": 4.6776953153712005e-06, "loss": 0.2041, "step": 1797 }, { "epoch": 0.8180163785259327, "grad_norm": 1.7094043878723117, "learning_rate": 4.67734423097635e-06, "loss": 0.1603, "step": 1798 }, { "epoch": 0.8184713375796179, "grad_norm": 1.2928545358221282, "learning_rate": 4.676992968658762e-06, "loss": 0.1517, "step": 1799 }, { "epoch": 0.818926296633303, "grad_norm": 1.4763652797153222, "learning_rate": 4.67664152844714e-06, "loss": 0.0939, "step": 1800 }, { "epoch": 0.8193812556869882, "grad_norm": 2.260551569771672, "learning_rate": 4.676289910370202e-06, "loss": 0.1902, "step": 1801 }, { "epoch": 0.8198362147406734, "grad_norm": 2.047407982208326, "learning_rate": 4.675938114456682e-06, "loss": 0.1767, "step": 1802 }, { "epoch": 0.8202911737943585, "grad_norm": 1.5430069759954768, "learning_rate": 4.675586140735323e-06, "loss": 0.1955, "step": 1803 }, { "epoch": 0.8207461328480437, "grad_norm": 2.2295561077574404, "learning_rate": 4.675233989234891e-06, "loss": 0.211, "step": 1804 }, { "epoch": 0.8212010919017289, "grad_norm": 1.639085591327469, "learning_rate": 4.67488165998416e-06, "loss": 0.1163, "step": 1805 }, { "epoch": 0.821656050955414, "grad_norm": 1.8522836776109448, "learning_rate": 4.674529153011922e-06, "loss": 0.1879, "step": 1806 }, { "epoch": 0.8221110100090991, "grad_norm": 2.1812381655305653, "learning_rate": 4.674176468346982e-06, "loss": 0.1773, "step": 1807 }, { "epoch": 0.8225659690627843, "grad_norm": 1.9367383257783326, "learning_rate": 4.673823606018158e-06, "loss": 0.2019, "step": 1808 }, { "epoch": 0.8230209281164695, "grad_norm": 1.8576560873873327, "learning_rate": 4.673470566054288e-06, "loss": 0.1492, "step": 1809 }, { "epoch": 0.8234758871701547, "grad_norm": 1.9497069876088635, "learning_rate": 4.673117348484217e-06, "loss": 0.1745, "step": 1810 }, { "epoch": 0.8239308462238398, "grad_norm": 1.4193615554141685, "learning_rate": 4.672763953336811e-06, "loss": 0.1463, "step": 1811 }, { "epoch": 0.824385805277525, "grad_norm": 2.8057971610463928, "learning_rate": 4.672410380640946e-06, "loss": 0.2285, "step": 1812 }, { "epoch": 0.8248407643312102, "grad_norm": 1.8069198589432123, "learning_rate": 4.672056630425516e-06, "loss": 0.1228, "step": 1813 }, { "epoch": 0.8252957233848953, "grad_norm": 1.3408435512517318, "learning_rate": 4.671702702719426e-06, "loss": 0.1436, "step": 1814 }, { "epoch": 0.8257506824385805, "grad_norm": 2.0862527734688197, "learning_rate": 4.671348597551599e-06, "loss": 0.2169, "step": 1815 }, { "epoch": 0.8262056414922657, "grad_norm": 2.132580252859084, "learning_rate": 4.670994314950971e-06, "loss": 0.2017, "step": 1816 }, { "epoch": 0.8266606005459509, "grad_norm": 2.5991268132353853, "learning_rate": 4.6706398549464905e-06, "loss": 0.2089, "step": 1817 }, { "epoch": 0.827115559599636, "grad_norm": 2.3181044896129275, "learning_rate": 4.670285217567124e-06, "loss": 0.1531, "step": 1818 }, { "epoch": 0.8275705186533212, "grad_norm": 1.7235570690460182, "learning_rate": 4.6699304028418516e-06, "loss": 0.1933, "step": 1819 }, { "epoch": 0.8280254777070064, "grad_norm": 3.3016739021057884, "learning_rate": 4.669575410799665e-06, "loss": 0.2017, "step": 1820 }, { "epoch": 0.8284804367606915, "grad_norm": 1.3897879186817867, "learning_rate": 4.669220241469573e-06, "loss": 0.1393, "step": 1821 }, { "epoch": 0.8289353958143767, "grad_norm": 1.7530097372349214, "learning_rate": 4.668864894880599e-06, "loss": 0.2163, "step": 1822 }, { "epoch": 0.8293903548680619, "grad_norm": 2.7080878088048337, "learning_rate": 4.668509371061781e-06, "loss": 0.2166, "step": 1823 }, { "epoch": 0.8298453139217471, "grad_norm": 1.9706360861102925, "learning_rate": 4.668153670042171e-06, "loss": 0.2253, "step": 1824 }, { "epoch": 0.8303002729754322, "grad_norm": 1.830442854507149, "learning_rate": 4.667797791850833e-06, "loss": 0.1526, "step": 1825 }, { "epoch": 0.8307552320291174, "grad_norm": 1.7672909680061333, "learning_rate": 4.6674417365168495e-06, "loss": 0.156, "step": 1826 }, { "epoch": 0.8312101910828026, "grad_norm": 1.627604242773907, "learning_rate": 4.667085504069315e-06, "loss": 0.1965, "step": 1827 }, { "epoch": 0.8316651501364877, "grad_norm": 1.6049507259721845, "learning_rate": 4.66672909453734e-06, "loss": 0.1678, "step": 1828 }, { "epoch": 0.8321201091901729, "grad_norm": 2.31296929571475, "learning_rate": 4.6663725079500485e-06, "loss": 0.2126, "step": 1829 }, { "epoch": 0.8325750682438581, "grad_norm": 1.7717243650944572, "learning_rate": 4.666015744336578e-06, "loss": 0.1333, "step": 1830 }, { "epoch": 0.8330300272975433, "grad_norm": 1.9376666152516604, "learning_rate": 4.665658803726083e-06, "loss": 0.161, "step": 1831 }, { "epoch": 0.8334849863512284, "grad_norm": 1.9363986365280477, "learning_rate": 4.6653016861477315e-06, "loss": 0.1736, "step": 1832 }, { "epoch": 0.8339399454049136, "grad_norm": 1.0684481587552732, "learning_rate": 4.664944391630704e-06, "loss": 0.1187, "step": 1833 }, { "epoch": 0.8343949044585988, "grad_norm": 1.9806679260858633, "learning_rate": 4.664586920204197e-06, "loss": 0.1945, "step": 1834 }, { "epoch": 0.8348498635122839, "grad_norm": 2.002852794100086, "learning_rate": 4.664229271897422e-06, "loss": 0.1449, "step": 1835 }, { "epoch": 0.835304822565969, "grad_norm": 1.416210291100934, "learning_rate": 4.663871446739606e-06, "loss": 0.2015, "step": 1836 }, { "epoch": 0.8357597816196543, "grad_norm": 1.8546810159993223, "learning_rate": 4.663513444759986e-06, "loss": 0.1461, "step": 1837 }, { "epoch": 0.8362147406733395, "grad_norm": 2.054627126988846, "learning_rate": 4.663155265987818e-06, "loss": 0.1779, "step": 1838 }, { "epoch": 0.8366696997270245, "grad_norm": 1.8928121217305771, "learning_rate": 4.66279691045237e-06, "loss": 0.1843, "step": 1839 }, { "epoch": 0.8371246587807097, "grad_norm": 2.3586323101492552, "learning_rate": 4.662438378182927e-06, "loss": 0.2396, "step": 1840 }, { "epoch": 0.8375796178343949, "grad_norm": 1.8299500333063181, "learning_rate": 4.662079669208783e-06, "loss": 0.1645, "step": 1841 }, { "epoch": 0.83803457688808, "grad_norm": 2.3480837865967215, "learning_rate": 4.661720783559254e-06, "loss": 0.1788, "step": 1842 }, { "epoch": 0.8384895359417652, "grad_norm": 1.883623814508302, "learning_rate": 4.661361721263664e-06, "loss": 0.1624, "step": 1843 }, { "epoch": 0.8389444949954504, "grad_norm": 2.160772908461247, "learning_rate": 4.661002482351355e-06, "loss": 0.1908, "step": 1844 }, { "epoch": 0.8393994540491356, "grad_norm": 2.187162279477086, "learning_rate": 4.660643066851682e-06, "loss": 0.1808, "step": 1845 }, { "epoch": 0.8398544131028207, "grad_norm": 1.8531325552871911, "learning_rate": 4.6602834747940155e-06, "loss": 0.1914, "step": 1846 }, { "epoch": 0.8403093721565059, "grad_norm": 1.4831331336407363, "learning_rate": 4.6599237062077385e-06, "loss": 0.142, "step": 1847 }, { "epoch": 0.8407643312101911, "grad_norm": 1.931468706645427, "learning_rate": 4.65956376112225e-06, "loss": 0.2224, "step": 1848 }, { "epoch": 0.8412192902638762, "grad_norm": 1.3694171323558038, "learning_rate": 4.659203639566965e-06, "loss": 0.1375, "step": 1849 }, { "epoch": 0.8416742493175614, "grad_norm": 1.936812425945626, "learning_rate": 4.658843341571308e-06, "loss": 0.1342, "step": 1850 }, { "epoch": 0.8421292083712466, "grad_norm": 1.6211061965620477, "learning_rate": 4.6584828671647235e-06, "loss": 0.1241, "step": 1851 }, { "epoch": 0.8425841674249318, "grad_norm": 1.1366286902231244, "learning_rate": 4.658122216376666e-06, "loss": 0.1378, "step": 1852 }, { "epoch": 0.8430391264786169, "grad_norm": 1.6359146658906643, "learning_rate": 4.657761389236607e-06, "loss": 0.2118, "step": 1853 }, { "epoch": 0.8434940855323021, "grad_norm": 2.5329878550243734, "learning_rate": 4.657400385774032e-06, "loss": 0.2193, "step": 1854 }, { "epoch": 0.8439490445859873, "grad_norm": 2.5278755724681425, "learning_rate": 4.65703920601844e-06, "loss": 0.2768, "step": 1855 }, { "epoch": 0.8444040036396724, "grad_norm": 1.590463345818293, "learning_rate": 4.656677849999345e-06, "loss": 0.139, "step": 1856 }, { "epoch": 0.8448589626933576, "grad_norm": 2.5309928033982154, "learning_rate": 4.656316317746275e-06, "loss": 0.1896, "step": 1857 }, { "epoch": 0.8453139217470428, "grad_norm": 1.9131067732573241, "learning_rate": 4.655954609288775e-06, "loss": 0.1584, "step": 1858 }, { "epoch": 0.845768880800728, "grad_norm": 1.676858006295649, "learning_rate": 4.655592724656399e-06, "loss": 0.1413, "step": 1859 }, { "epoch": 0.8462238398544131, "grad_norm": 1.2591774278723207, "learning_rate": 4.655230663878721e-06, "loss": 0.106, "step": 1860 }, { "epoch": 0.8466787989080983, "grad_norm": 1.7932854876030564, "learning_rate": 4.654868426985326e-06, "loss": 0.1417, "step": 1861 }, { "epoch": 0.8471337579617835, "grad_norm": 1.6149020601443298, "learning_rate": 4.654506014005814e-06, "loss": 0.1632, "step": 1862 }, { "epoch": 0.8475887170154686, "grad_norm": 2.4429847082643734, "learning_rate": 4.6541434249698e-06, "loss": 0.1726, "step": 1863 }, { "epoch": 0.8480436760691538, "grad_norm": 1.958537494840022, "learning_rate": 4.6537806599069144e-06, "loss": 0.1918, "step": 1864 }, { "epoch": 0.848498635122839, "grad_norm": 2.0524656641640573, "learning_rate": 4.653417718846799e-06, "loss": 0.1824, "step": 1865 }, { "epoch": 0.8489535941765242, "grad_norm": 2.471476245561928, "learning_rate": 4.6530546018191126e-06, "loss": 0.1833, "step": 1866 }, { "epoch": 0.8494085532302093, "grad_norm": 1.792641798980951, "learning_rate": 4.652691308853526e-06, "loss": 0.1409, "step": 1867 }, { "epoch": 0.8498635122838945, "grad_norm": 1.663452952464092, "learning_rate": 4.652327839979729e-06, "loss": 0.1707, "step": 1868 }, { "epoch": 0.8503184713375797, "grad_norm": 2.201926398578509, "learning_rate": 4.651964195227419e-06, "loss": 0.1399, "step": 1869 }, { "epoch": 0.8507734303912647, "grad_norm": 1.8923698849228574, "learning_rate": 4.651600374626315e-06, "loss": 0.1381, "step": 1870 }, { "epoch": 0.8512283894449499, "grad_norm": 1.4952325363772294, "learning_rate": 4.651236378206144e-06, "loss": 0.1521, "step": 1871 }, { "epoch": 0.8516833484986351, "grad_norm": 1.6339894998223452, "learning_rate": 4.650872205996651e-06, "loss": 0.1813, "step": 1872 }, { "epoch": 0.8521383075523203, "grad_norm": 2.105965789292229, "learning_rate": 4.650507858027595e-06, "loss": 0.1482, "step": 1873 }, { "epoch": 0.8525932666060054, "grad_norm": 1.9949585656638686, "learning_rate": 4.6501433343287475e-06, "loss": 0.1851, "step": 1874 }, { "epoch": 0.8530482256596906, "grad_norm": 2.4070104220391326, "learning_rate": 4.6497786349298975e-06, "loss": 0.1662, "step": 1875 }, { "epoch": 0.8535031847133758, "grad_norm": 1.7461886999738794, "learning_rate": 4.649413759860846e-06, "loss": 0.1254, "step": 1876 }, { "epoch": 0.8539581437670609, "grad_norm": 2.2792475326190016, "learning_rate": 4.649048709151408e-06, "loss": 0.2312, "step": 1877 }, { "epoch": 0.8544131028207461, "grad_norm": 1.3426843322261688, "learning_rate": 4.648683482831415e-06, "loss": 0.1172, "step": 1878 }, { "epoch": 0.8548680618744313, "grad_norm": 2.382337203322208, "learning_rate": 4.648318080930711e-06, "loss": 0.2074, "step": 1879 }, { "epoch": 0.8553230209281165, "grad_norm": 1.712854915430822, "learning_rate": 4.647952503479154e-06, "loss": 0.1704, "step": 1880 }, { "epoch": 0.8557779799818016, "grad_norm": 1.8081149874596518, "learning_rate": 4.6475867505066195e-06, "loss": 0.1917, "step": 1881 }, { "epoch": 0.8562329390354868, "grad_norm": 1.9751613678879965, "learning_rate": 4.647220822042995e-06, "loss": 0.1735, "step": 1882 }, { "epoch": 0.856687898089172, "grad_norm": 2.1327662706521906, "learning_rate": 4.64685471811818e-06, "loss": 0.1449, "step": 1883 }, { "epoch": 0.8571428571428571, "grad_norm": 2.064198155606807, "learning_rate": 4.646488438762094e-06, "loss": 0.23, "step": 1884 }, { "epoch": 0.8575978161965423, "grad_norm": 1.506998926934666, "learning_rate": 4.646121984004666e-06, "loss": 0.165, "step": 1885 }, { "epoch": 0.8580527752502275, "grad_norm": 1.8322392109933523, "learning_rate": 4.64575535387584e-06, "loss": 0.2264, "step": 1886 }, { "epoch": 0.8585077343039127, "grad_norm": 2.0388479228852048, "learning_rate": 4.645388548405578e-06, "loss": 0.2175, "step": 1887 }, { "epoch": 0.8589626933575978, "grad_norm": 2.097249131206244, "learning_rate": 4.645021567623852e-06, "loss": 0.2196, "step": 1888 }, { "epoch": 0.859417652411283, "grad_norm": 1.5275188180484371, "learning_rate": 4.644654411560651e-06, "loss": 0.1417, "step": 1889 }, { "epoch": 0.8598726114649682, "grad_norm": 1.8944498906948435, "learning_rate": 4.644287080245975e-06, "loss": 0.1795, "step": 1890 }, { "epoch": 0.8603275705186533, "grad_norm": 1.983029598334522, "learning_rate": 4.643919573709843e-06, "loss": 0.1986, "step": 1891 }, { "epoch": 0.8607825295723385, "grad_norm": 1.6266032809421398, "learning_rate": 4.6435518919822854e-06, "loss": 0.207, "step": 1892 }, { "epoch": 0.8612374886260237, "grad_norm": 2.19323813493903, "learning_rate": 4.643184035093348e-06, "loss": 0.1393, "step": 1893 }, { "epoch": 0.8616924476797089, "grad_norm": 1.8257509692409855, "learning_rate": 4.642816003073089e-06, "loss": 0.1634, "step": 1894 }, { "epoch": 0.862147406733394, "grad_norm": 1.2900997861243053, "learning_rate": 4.6424477959515836e-06, "loss": 0.1654, "step": 1895 }, { "epoch": 0.8626023657870792, "grad_norm": 1.540771601167976, "learning_rate": 4.642079413758919e-06, "loss": 0.1518, "step": 1896 }, { "epoch": 0.8630573248407644, "grad_norm": 1.899942137953783, "learning_rate": 4.641710856525199e-06, "loss": 0.1821, "step": 1897 }, { "epoch": 0.8635122838944495, "grad_norm": 1.4129439458546442, "learning_rate": 4.641342124280539e-06, "loss": 0.1716, "step": 1898 }, { "epoch": 0.8639672429481347, "grad_norm": 2.3313958542346995, "learning_rate": 4.6409732170550705e-06, "loss": 0.1687, "step": 1899 }, { "epoch": 0.8644222020018199, "grad_norm": 1.4646430511341277, "learning_rate": 4.64060413487894e-06, "loss": 0.1321, "step": 1900 }, { "epoch": 0.864877161055505, "grad_norm": 1.6578645771032987, "learning_rate": 4.640234877782306e-06, "loss": 0.1339, "step": 1901 }, { "epoch": 0.8653321201091901, "grad_norm": 2.115428055628657, "learning_rate": 4.639865445795344e-06, "loss": 0.155, "step": 1902 }, { "epoch": 0.8657870791628753, "grad_norm": 1.4246658971760144, "learning_rate": 4.63949583894824e-06, "loss": 0.1211, "step": 1903 }, { "epoch": 0.8662420382165605, "grad_norm": 1.8915165798317974, "learning_rate": 4.639126057271199e-06, "loss": 0.1943, "step": 1904 }, { "epoch": 0.8666969972702456, "grad_norm": 1.4359286477489568, "learning_rate": 4.6387561007944355e-06, "loss": 0.1927, "step": 1905 }, { "epoch": 0.8671519563239308, "grad_norm": 1.7402908671263166, "learning_rate": 4.638385969548183e-06, "loss": 0.197, "step": 1906 }, { "epoch": 0.867606915377616, "grad_norm": 1.6362731205557584, "learning_rate": 4.638015663562686e-06, "loss": 0.1383, "step": 1907 }, { "epoch": 0.8680618744313012, "grad_norm": 2.4913116222464837, "learning_rate": 4.637645182868204e-06, "loss": 0.2, "step": 1908 }, { "epoch": 0.8685168334849863, "grad_norm": 1.254842356705368, "learning_rate": 4.637274527495011e-06, "loss": 0.121, "step": 1909 }, { "epoch": 0.8689717925386715, "grad_norm": 1.5120107885745528, "learning_rate": 4.6369036974733955e-06, "loss": 0.1464, "step": 1910 }, { "epoch": 0.8694267515923567, "grad_norm": 1.392142485713207, "learning_rate": 4.63653269283366e-06, "loss": 0.1325, "step": 1911 }, { "epoch": 0.8698817106460418, "grad_norm": 1.6362489180779098, "learning_rate": 4.636161513606122e-06, "loss": 0.1887, "step": 1912 }, { "epoch": 0.870336669699727, "grad_norm": 1.7061998927826107, "learning_rate": 4.6357901598211105e-06, "loss": 0.1559, "step": 1913 }, { "epoch": 0.8707916287534122, "grad_norm": 1.7490187306928824, "learning_rate": 4.635418631508974e-06, "loss": 0.1504, "step": 1914 }, { "epoch": 0.8712465878070974, "grad_norm": 1.7459918799385958, "learning_rate": 4.635046928700069e-06, "loss": 0.1737, "step": 1915 }, { "epoch": 0.8717015468607825, "grad_norm": 2.128565340614342, "learning_rate": 4.634675051424771e-06, "loss": 0.1843, "step": 1916 }, { "epoch": 0.8721565059144677, "grad_norm": 1.5616930523249197, "learning_rate": 4.634302999713468e-06, "loss": 0.1004, "step": 1917 }, { "epoch": 0.8726114649681529, "grad_norm": 1.886440296737102, "learning_rate": 4.633930773596563e-06, "loss": 0.2085, "step": 1918 }, { "epoch": 0.873066424021838, "grad_norm": 1.6874199025414718, "learning_rate": 4.633558373104472e-06, "loss": 0.1965, "step": 1919 }, { "epoch": 0.8735213830755232, "grad_norm": 1.4248884120885352, "learning_rate": 4.633185798267625e-06, "loss": 0.1814, "step": 1920 }, { "epoch": 0.8739763421292084, "grad_norm": 2.0576525781987107, "learning_rate": 4.632813049116467e-06, "loss": 0.2251, "step": 1921 }, { "epoch": 0.8744313011828936, "grad_norm": 2.422851032077204, "learning_rate": 4.63244012568146e-06, "loss": 0.1949, "step": 1922 }, { "epoch": 0.8748862602365787, "grad_norm": 2.1417664356799087, "learning_rate": 4.632067027993076e-06, "loss": 0.1548, "step": 1923 }, { "epoch": 0.8753412192902639, "grad_norm": 1.4407274073506169, "learning_rate": 4.631693756081802e-06, "loss": 0.1252, "step": 1924 }, { "epoch": 0.8757961783439491, "grad_norm": 1.6004631673541039, "learning_rate": 4.631320309978141e-06, "loss": 0.1876, "step": 1925 }, { "epoch": 0.8762511373976342, "grad_norm": 1.7251546761372085, "learning_rate": 4.630946689712609e-06, "loss": 0.1624, "step": 1926 }, { "epoch": 0.8767060964513194, "grad_norm": 1.7738030549432209, "learning_rate": 4.630572895315737e-06, "loss": 0.1748, "step": 1927 }, { "epoch": 0.8771610555050046, "grad_norm": 1.0086649768907636, "learning_rate": 4.63019892681807e-06, "loss": 0.1032, "step": 1928 }, { "epoch": 0.8776160145586898, "grad_norm": 1.2701304891541718, "learning_rate": 4.629824784250166e-06, "loss": 0.1192, "step": 1929 }, { "epoch": 0.8780709736123748, "grad_norm": 1.6784044296991356, "learning_rate": 4.629450467642599e-06, "loss": 0.1265, "step": 1930 }, { "epoch": 0.87852593266606, "grad_norm": 1.976065902819502, "learning_rate": 4.629075977025957e-06, "loss": 0.1681, "step": 1931 }, { "epoch": 0.8789808917197452, "grad_norm": 1.6213814808866245, "learning_rate": 4.62870131243084e-06, "loss": 0.1493, "step": 1932 }, { "epoch": 0.8794358507734303, "grad_norm": 1.9807101332336867, "learning_rate": 4.628326473887865e-06, "loss": 0.1095, "step": 1933 }, { "epoch": 0.8798908098271155, "grad_norm": 1.3613443516857038, "learning_rate": 4.627951461427663e-06, "loss": 0.0886, "step": 1934 }, { "epoch": 0.8803457688808007, "grad_norm": 2.294295361155117, "learning_rate": 4.627576275080876e-06, "loss": 0.1782, "step": 1935 }, { "epoch": 0.8808007279344859, "grad_norm": 1.465162455531879, "learning_rate": 4.627200914878165e-06, "loss": 0.1689, "step": 1936 }, { "epoch": 0.881255686988171, "grad_norm": 1.9852567754309711, "learning_rate": 4.6268253808502005e-06, "loss": 0.1953, "step": 1937 }, { "epoch": 0.8817106460418562, "grad_norm": 1.3259365892059651, "learning_rate": 4.626449673027671e-06, "loss": 0.1186, "step": 1938 }, { "epoch": 0.8821656050955414, "grad_norm": 2.311627846572585, "learning_rate": 4.626073791441278e-06, "loss": 0.175, "step": 1939 }, { "epoch": 0.8826205641492265, "grad_norm": 1.403685443623727, "learning_rate": 4.625697736121735e-06, "loss": 0.1632, "step": 1940 }, { "epoch": 0.8830755232029117, "grad_norm": 1.8370812337880758, "learning_rate": 4.6253215070997735e-06, "loss": 0.1805, "step": 1941 }, { "epoch": 0.8835304822565969, "grad_norm": 1.7617734494239499, "learning_rate": 4.624945104406135e-06, "loss": 0.1484, "step": 1942 }, { "epoch": 0.8839854413102821, "grad_norm": 1.2929099916167694, "learning_rate": 4.624568528071579e-06, "loss": 0.1109, "step": 1943 }, { "epoch": 0.8844404003639672, "grad_norm": 1.6991526267122765, "learning_rate": 4.624191778126879e-06, "loss": 0.1833, "step": 1944 }, { "epoch": 0.8848953594176524, "grad_norm": 1.947027254377722, "learning_rate": 4.623814854602818e-06, "loss": 0.2251, "step": 1945 }, { "epoch": 0.8853503184713376, "grad_norm": 1.7473125338322357, "learning_rate": 4.623437757530198e-06, "loss": 0.1144, "step": 1946 }, { "epoch": 0.8858052775250227, "grad_norm": 1.56986818124434, "learning_rate": 4.623060486939835e-06, "loss": 0.1507, "step": 1947 }, { "epoch": 0.8862602365787079, "grad_norm": 2.2731317429688995, "learning_rate": 4.622683042862556e-06, "loss": 0.1854, "step": 1948 }, { "epoch": 0.8867151956323931, "grad_norm": 1.5668080033034493, "learning_rate": 4.622305425329205e-06, "loss": 0.1093, "step": 1949 }, { "epoch": 0.8871701546860783, "grad_norm": 1.4666243413929643, "learning_rate": 4.621927634370638e-06, "loss": 0.1179, "step": 1950 }, { "epoch": 0.8876251137397634, "grad_norm": 2.142207445885291, "learning_rate": 4.621549670017727e-06, "loss": 0.2196, "step": 1951 }, { "epoch": 0.8880800727934486, "grad_norm": 1.9929367654553447, "learning_rate": 4.6211715323013595e-06, "loss": 0.1926, "step": 1952 }, { "epoch": 0.8885350318471338, "grad_norm": 1.8377495474805912, "learning_rate": 4.6207932212524325e-06, "loss": 0.1879, "step": 1953 }, { "epoch": 0.8889899909008189, "grad_norm": 1.8025632169370749, "learning_rate": 4.620414736901861e-06, "loss": 0.1627, "step": 1954 }, { "epoch": 0.8894449499545041, "grad_norm": 1.7867128092311804, "learning_rate": 4.620036079280573e-06, "loss": 0.2169, "step": 1955 }, { "epoch": 0.8898999090081893, "grad_norm": 2.4571527122530776, "learning_rate": 4.619657248419511e-06, "loss": 0.2337, "step": 1956 }, { "epoch": 0.8903548680618745, "grad_norm": 1.5424608043537418, "learning_rate": 4.61927824434963e-06, "loss": 0.134, "step": 1957 }, { "epoch": 0.8908098271155596, "grad_norm": 1.8248865805885555, "learning_rate": 4.6188990671019015e-06, "loss": 0.1473, "step": 1958 }, { "epoch": 0.8912647861692448, "grad_norm": 1.2825883167116863, "learning_rate": 4.618519716707311e-06, "loss": 0.1377, "step": 1959 }, { "epoch": 0.89171974522293, "grad_norm": 1.9837251078508047, "learning_rate": 4.618140193196856e-06, "loss": 0.1736, "step": 1960 }, { "epoch": 0.892174704276615, "grad_norm": 1.604956750795707, "learning_rate": 4.61776049660155e-06, "loss": 0.1711, "step": 1961 }, { "epoch": 0.8926296633303002, "grad_norm": 1.5703167687380166, "learning_rate": 4.61738062695242e-06, "loss": 0.1519, "step": 1962 }, { "epoch": 0.8930846223839854, "grad_norm": 2.2186984451911638, "learning_rate": 4.617000584280506e-06, "loss": 0.1443, "step": 1963 }, { "epoch": 0.8935395814376706, "grad_norm": 1.906102770647992, "learning_rate": 4.616620368616866e-06, "loss": 0.1878, "step": 1964 }, { "epoch": 0.8939945404913557, "grad_norm": 2.0871942985325167, "learning_rate": 4.616239979992568e-06, "loss": 0.2384, "step": 1965 }, { "epoch": 0.8944494995450409, "grad_norm": 1.6638677246444422, "learning_rate": 4.615859418438695e-06, "loss": 0.1792, "step": 1966 }, { "epoch": 0.8949044585987261, "grad_norm": 1.387205154257509, "learning_rate": 4.615478683986345e-06, "loss": 0.144, "step": 1967 }, { "epoch": 0.8953594176524113, "grad_norm": 1.8836562093395437, "learning_rate": 4.6150977766666315e-06, "loss": 0.2174, "step": 1968 }, { "epoch": 0.8958143767060964, "grad_norm": 1.9229400987313323, "learning_rate": 4.614716696510679e-06, "loss": 0.2241, "step": 1969 }, { "epoch": 0.8962693357597816, "grad_norm": 1.8744101552937114, "learning_rate": 4.614335443549628e-06, "loss": 0.1402, "step": 1970 }, { "epoch": 0.8967242948134668, "grad_norm": 1.7357579966910537, "learning_rate": 4.613954017814633e-06, "loss": 0.1286, "step": 1971 }, { "epoch": 0.8971792538671519, "grad_norm": 1.8840478367784224, "learning_rate": 4.613572419336862e-06, "loss": 0.1342, "step": 1972 }, { "epoch": 0.8976342129208371, "grad_norm": 1.5927521655138008, "learning_rate": 4.613190648147497e-06, "loss": 0.1513, "step": 1973 }, { "epoch": 0.8980891719745223, "grad_norm": 2.065610545817281, "learning_rate": 4.612808704277736e-06, "loss": 0.2084, "step": 1974 }, { "epoch": 0.8985441310282075, "grad_norm": 1.5284731538672136, "learning_rate": 4.612426587758789e-06, "loss": 0.188, "step": 1975 }, { "epoch": 0.8989990900818926, "grad_norm": 2.023375971468293, "learning_rate": 4.612044298621881e-06, "loss": 0.1344, "step": 1976 }, { "epoch": 0.8994540491355778, "grad_norm": 1.9534402095489405, "learning_rate": 4.611661836898252e-06, "loss": 0.1738, "step": 1977 }, { "epoch": 0.899909008189263, "grad_norm": 1.9156260955002997, "learning_rate": 4.611279202619151e-06, "loss": 0.1668, "step": 1978 }, { "epoch": 0.9003639672429481, "grad_norm": 1.9526723286463348, "learning_rate": 4.61089639581585e-06, "loss": 0.1669, "step": 1979 }, { "epoch": 0.9008189262966333, "grad_norm": 1.9056078059584818, "learning_rate": 4.610513416519628e-06, "loss": 0.1507, "step": 1980 }, { "epoch": 0.9012738853503185, "grad_norm": 1.5105931587228634, "learning_rate": 4.6101302647617806e-06, "loss": 0.1488, "step": 1981 }, { "epoch": 0.9017288444040037, "grad_norm": 2.0835062062044347, "learning_rate": 4.609746940573617e-06, "loss": 0.1324, "step": 1982 }, { "epoch": 0.9021838034576888, "grad_norm": 1.9577939305337912, "learning_rate": 4.609363443986461e-06, "loss": 0.1636, "step": 1983 }, { "epoch": 0.902638762511374, "grad_norm": 1.7800989438629395, "learning_rate": 4.60897977503165e-06, "loss": 0.1754, "step": 1984 }, { "epoch": 0.9030937215650592, "grad_norm": 2.1110656440447544, "learning_rate": 4.608595933740536e-06, "loss": 0.2122, "step": 1985 }, { "epoch": 0.9035486806187443, "grad_norm": 1.286237936407134, "learning_rate": 4.608211920144485e-06, "loss": 0.202, "step": 1986 }, { "epoch": 0.9040036396724295, "grad_norm": 2.2604741864786178, "learning_rate": 4.607827734274876e-06, "loss": 0.1669, "step": 1987 }, { "epoch": 0.9044585987261147, "grad_norm": 1.7607840905259224, "learning_rate": 4.607443376163104e-06, "loss": 0.1375, "step": 1988 }, { "epoch": 0.9049135577797999, "grad_norm": 1.7402029650347348, "learning_rate": 4.607058845840576e-06, "loss": 0.1431, "step": 1989 }, { "epoch": 0.905368516833485, "grad_norm": 1.666160268732321, "learning_rate": 4.606674143338714e-06, "loss": 0.1485, "step": 1990 }, { "epoch": 0.9058234758871702, "grad_norm": 2.0612124207721654, "learning_rate": 4.606289268688955e-06, "loss": 0.1419, "step": 1991 }, { "epoch": 0.9062784349408554, "grad_norm": 1.9143034406546822, "learning_rate": 4.605904221922749e-06, "loss": 0.1842, "step": 1992 }, { "epoch": 0.9067333939945404, "grad_norm": 2.410587966058405, "learning_rate": 4.6055190030715605e-06, "loss": 0.1858, "step": 1993 }, { "epoch": 0.9071883530482256, "grad_norm": 1.4389936850061738, "learning_rate": 4.605133612166868e-06, "loss": 0.1387, "step": 1994 }, { "epoch": 0.9076433121019108, "grad_norm": 1.546723165322591, "learning_rate": 4.604748049240162e-06, "loss": 0.1353, "step": 1995 }, { "epoch": 0.908098271155596, "grad_norm": 1.510897129777589, "learning_rate": 4.604362314322951e-06, "loss": 0.1322, "step": 1996 }, { "epoch": 0.9085532302092811, "grad_norm": 2.3885439589368147, "learning_rate": 4.603976407446756e-06, "loss": 0.1656, "step": 1997 }, { "epoch": 0.9090081892629663, "grad_norm": 1.193637078798613, "learning_rate": 4.603590328643108e-06, "loss": 0.1057, "step": 1998 }, { "epoch": 0.9094631483166515, "grad_norm": 1.910033395843472, "learning_rate": 4.60320407794356e-06, "loss": 0.1519, "step": 1999 }, { "epoch": 0.9099181073703366, "grad_norm": 1.6867999496406765, "learning_rate": 4.602817655379672e-06, "loss": 0.1776, "step": 2000 }, { "epoch": 0.9103730664240218, "grad_norm": 1.7117660414525686, "learning_rate": 4.602431060983022e-06, "loss": 0.1451, "step": 2001 }, { "epoch": 0.910828025477707, "grad_norm": 1.4990428536514322, "learning_rate": 4.6020442947852e-06, "loss": 0.1409, "step": 2002 }, { "epoch": 0.9112829845313922, "grad_norm": 1.446262498955875, "learning_rate": 4.6016573568178105e-06, "loss": 0.1135, "step": 2003 }, { "epoch": 0.9117379435850773, "grad_norm": 1.6571232403743137, "learning_rate": 4.601270247112473e-06, "loss": 0.2404, "step": 2004 }, { "epoch": 0.9121929026387625, "grad_norm": 2.0064329107593646, "learning_rate": 4.60088296570082e-06, "loss": 0.1905, "step": 2005 }, { "epoch": 0.9126478616924477, "grad_norm": 1.4125062029338067, "learning_rate": 4.600495512614499e-06, "loss": 0.1117, "step": 2006 }, { "epoch": 0.9131028207461328, "grad_norm": 1.8059848267053757, "learning_rate": 4.60010788788517e-06, "loss": 0.2289, "step": 2007 }, { "epoch": 0.913557779799818, "grad_norm": 1.8237596303340968, "learning_rate": 4.5997200915445095e-06, "loss": 0.1983, "step": 2008 }, { "epoch": 0.9140127388535032, "grad_norm": 1.6824481144619179, "learning_rate": 4.599332123624204e-06, "loss": 0.1361, "step": 2009 }, { "epoch": 0.9144676979071884, "grad_norm": 1.5469841434239995, "learning_rate": 4.598943984155959e-06, "loss": 0.1561, "step": 2010 }, { "epoch": 0.9149226569608735, "grad_norm": 1.1721008124510859, "learning_rate": 4.598555673171489e-06, "loss": 0.0997, "step": 2011 }, { "epoch": 0.9153776160145587, "grad_norm": 1.367389738430673, "learning_rate": 4.5981671907025275e-06, "loss": 0.124, "step": 2012 }, { "epoch": 0.9158325750682439, "grad_norm": 1.9852471647698953, "learning_rate": 4.597778536780818e-06, "loss": 0.1746, "step": 2013 }, { "epoch": 0.916287534121929, "grad_norm": 2.1379896488178405, "learning_rate": 4.597389711438121e-06, "loss": 0.2387, "step": 2014 }, { "epoch": 0.9167424931756142, "grad_norm": 1.4433682072802856, "learning_rate": 4.597000714706207e-06, "loss": 0.1261, "step": 2015 }, { "epoch": 0.9171974522292994, "grad_norm": 1.92195373557543, "learning_rate": 4.596611546616865e-06, "loss": 0.1982, "step": 2016 }, { "epoch": 0.9176524112829846, "grad_norm": 1.9323067168518875, "learning_rate": 4.596222207201896e-06, "loss": 0.1767, "step": 2017 }, { "epoch": 0.9181073703366697, "grad_norm": 1.7925696405315172, "learning_rate": 4.595832696493115e-06, "loss": 0.1692, "step": 2018 }, { "epoch": 0.9185623293903549, "grad_norm": 1.6896362560345692, "learning_rate": 4.59544301452235e-06, "loss": 0.1527, "step": 2019 }, { "epoch": 0.9190172884440401, "grad_norm": 2.6520358388003307, "learning_rate": 4.595053161321444e-06, "loss": 0.2183, "step": 2020 }, { "epoch": 0.9194722474977252, "grad_norm": 1.8502691763569332, "learning_rate": 4.594663136922256e-06, "loss": 0.2027, "step": 2021 }, { "epoch": 0.9199272065514104, "grad_norm": 1.66876391954138, "learning_rate": 4.594272941356655e-06, "loss": 0.1592, "step": 2022 }, { "epoch": 0.9203821656050956, "grad_norm": 2.000282499671209, "learning_rate": 4.593882574656528e-06, "loss": 0.1899, "step": 2023 }, { "epoch": 0.9208371246587808, "grad_norm": 2.1057167872680864, "learning_rate": 4.5934920368537724e-06, "loss": 0.1649, "step": 2024 }, { "epoch": 0.9212920837124658, "grad_norm": 2.3421388058050603, "learning_rate": 4.593101327980301e-06, "loss": 0.1953, "step": 2025 }, { "epoch": 0.921747042766151, "grad_norm": 1.4619166894313524, "learning_rate": 4.592710448068043e-06, "loss": 0.1645, "step": 2026 }, { "epoch": 0.9222020018198362, "grad_norm": 2.1135622970646457, "learning_rate": 4.592319397148936e-06, "loss": 0.1391, "step": 2027 }, { "epoch": 0.9226569608735213, "grad_norm": 1.2948388707877838, "learning_rate": 4.5919281752549386e-06, "loss": 0.1465, "step": 2028 }, { "epoch": 0.9231119199272065, "grad_norm": 2.587913347360957, "learning_rate": 4.5915367824180165e-06, "loss": 0.2171, "step": 2029 }, { "epoch": 0.9235668789808917, "grad_norm": 1.2685293245744347, "learning_rate": 4.591145218670154e-06, "loss": 0.1127, "step": 2030 }, { "epoch": 0.9240218380345769, "grad_norm": 1.99832008478398, "learning_rate": 4.590753484043348e-06, "loss": 0.1795, "step": 2031 }, { "epoch": 0.924476797088262, "grad_norm": 1.9341588389439468, "learning_rate": 4.590361578569609e-06, "loss": 0.1625, "step": 2032 }, { "epoch": 0.9249317561419472, "grad_norm": 1.906987896729889, "learning_rate": 4.589969502280962e-06, "loss": 0.1292, "step": 2033 }, { "epoch": 0.9253867151956324, "grad_norm": 1.3759296704205837, "learning_rate": 4.589577255209445e-06, "loss": 0.1618, "step": 2034 }, { "epoch": 0.9258416742493175, "grad_norm": 1.7824080215785223, "learning_rate": 4.589184837387112e-06, "loss": 0.1571, "step": 2035 }, { "epoch": 0.9262966333030027, "grad_norm": 1.969233090292503, "learning_rate": 4.588792248846028e-06, "loss": 0.1565, "step": 2036 }, { "epoch": 0.9267515923566879, "grad_norm": 2.0350441155725982, "learning_rate": 4.588399489618274e-06, "loss": 0.2092, "step": 2037 }, { "epoch": 0.9272065514103731, "grad_norm": 1.3739303279350978, "learning_rate": 4.588006559735945e-06, "loss": 0.1144, "step": 2038 }, { "epoch": 0.9276615104640582, "grad_norm": 1.8231719010868002, "learning_rate": 4.587613459231149e-06, "loss": 0.19, "step": 2039 }, { "epoch": 0.9281164695177434, "grad_norm": 1.7222249399366698, "learning_rate": 4.5872201881360105e-06, "loss": 0.1818, "step": 2040 }, { "epoch": 0.9285714285714286, "grad_norm": 1.9962016913755094, "learning_rate": 4.586826746482662e-06, "loss": 0.1858, "step": 2041 }, { "epoch": 0.9290263876251137, "grad_norm": 1.581565012958607, "learning_rate": 4.586433134303257e-06, "loss": 0.1388, "step": 2042 }, { "epoch": 0.9294813466787989, "grad_norm": 2.2212237230761342, "learning_rate": 4.586039351629959e-06, "loss": 0.1627, "step": 2043 }, { "epoch": 0.9299363057324841, "grad_norm": 2.4442840318574954, "learning_rate": 4.585645398494944e-06, "loss": 0.1421, "step": 2044 }, { "epoch": 0.9303912647861693, "grad_norm": 1.63124630524275, "learning_rate": 4.585251274930406e-06, "loss": 0.1553, "step": 2045 }, { "epoch": 0.9308462238398544, "grad_norm": 1.9068361286149722, "learning_rate": 4.584856980968552e-06, "loss": 0.195, "step": 2046 }, { "epoch": 0.9313011828935396, "grad_norm": 1.8750052649788462, "learning_rate": 4.584462516641599e-06, "loss": 0.1843, "step": 2047 }, { "epoch": 0.9317561419472248, "grad_norm": 1.8692305314343534, "learning_rate": 4.584067881981784e-06, "loss": 0.1607, "step": 2048 }, { "epoch": 0.9322111010009099, "grad_norm": 1.7454178600595318, "learning_rate": 4.583673077021352e-06, "loss": 0.1166, "step": 2049 }, { "epoch": 0.9326660600545951, "grad_norm": 1.7370379964519336, "learning_rate": 4.583278101792567e-06, "loss": 0.1658, "step": 2050 }, { "epoch": 0.9331210191082803, "grad_norm": 1.6957581344539345, "learning_rate": 4.582882956327704e-06, "loss": 0.1394, "step": 2051 }, { "epoch": 0.9335759781619655, "grad_norm": 1.8052091804015933, "learning_rate": 4.58248764065905e-06, "loss": 0.1571, "step": 2052 }, { "epoch": 0.9340309372156506, "grad_norm": 1.5675006184278855, "learning_rate": 4.582092154818912e-06, "loss": 0.145, "step": 2053 }, { "epoch": 0.9344858962693358, "grad_norm": 1.6024320375744705, "learning_rate": 4.581696498839605e-06, "loss": 0.2042, "step": 2054 }, { "epoch": 0.934940855323021, "grad_norm": 1.8058483639041405, "learning_rate": 4.581300672753462e-06, "loss": 0.1661, "step": 2055 }, { "epoch": 0.935395814376706, "grad_norm": 1.9556770558432066, "learning_rate": 4.580904676592826e-06, "loss": 0.1767, "step": 2056 }, { "epoch": 0.9358507734303912, "grad_norm": 1.5186464139909968, "learning_rate": 4.580508510390057e-06, "loss": 0.1131, "step": 2057 }, { "epoch": 0.9363057324840764, "grad_norm": 1.5844512517498417, "learning_rate": 4.580112174177529e-06, "loss": 0.1815, "step": 2058 }, { "epoch": 0.9367606915377616, "grad_norm": 1.382066796659836, "learning_rate": 4.5797156679876274e-06, "loss": 0.1073, "step": 2059 }, { "epoch": 0.9372156505914467, "grad_norm": 2.7590592902292332, "learning_rate": 4.5793189918527524e-06, "loss": 0.3083, "step": 2060 }, { "epoch": 0.9376706096451319, "grad_norm": 2.097729619621905, "learning_rate": 4.5789221458053205e-06, "loss": 0.1572, "step": 2061 }, { "epoch": 0.9381255686988171, "grad_norm": 2.269383743265302, "learning_rate": 4.578525129877759e-06, "loss": 0.2157, "step": 2062 }, { "epoch": 0.9385805277525022, "grad_norm": 1.704369436738576, "learning_rate": 4.5781279441025105e-06, "loss": 0.1746, "step": 2063 }, { "epoch": 0.9390354868061874, "grad_norm": 1.961199267422335, "learning_rate": 4.577730588512031e-06, "loss": 0.1794, "step": 2064 }, { "epoch": 0.9394904458598726, "grad_norm": 2.0070527773957663, "learning_rate": 4.577333063138791e-06, "loss": 0.1744, "step": 2065 }, { "epoch": 0.9399454049135578, "grad_norm": 1.4918844273699323, "learning_rate": 4.576935368015274e-06, "loss": 0.1614, "step": 2066 }, { "epoch": 0.9404003639672429, "grad_norm": 1.957075251939811, "learning_rate": 4.576537503173978e-06, "loss": 0.2007, "step": 2067 }, { "epoch": 0.9408553230209281, "grad_norm": 2.1344327287579916, "learning_rate": 4.576139468647415e-06, "loss": 0.1953, "step": 2068 }, { "epoch": 0.9413102820746133, "grad_norm": 2.052141999542276, "learning_rate": 4.575741264468111e-06, "loss": 0.1247, "step": 2069 }, { "epoch": 0.9417652411282984, "grad_norm": 1.9687685313144003, "learning_rate": 4.575342890668603e-06, "loss": 0.1941, "step": 2070 }, { "epoch": 0.9422202001819836, "grad_norm": 2.1906738543597695, "learning_rate": 4.574944347281448e-06, "loss": 0.2436, "step": 2071 }, { "epoch": 0.9426751592356688, "grad_norm": 2.0326378397322253, "learning_rate": 4.5745456343392114e-06, "loss": 0.1916, "step": 2072 }, { "epoch": 0.943130118289354, "grad_norm": 1.9398275581691273, "learning_rate": 4.574146751874473e-06, "loss": 0.2243, "step": 2073 }, { "epoch": 0.9435850773430391, "grad_norm": 1.583576444036144, "learning_rate": 4.57374769991983e-06, "loss": 0.1335, "step": 2074 }, { "epoch": 0.9440400363967243, "grad_norm": 1.49493272878593, "learning_rate": 4.573348478507888e-06, "loss": 0.132, "step": 2075 }, { "epoch": 0.9444949954504095, "grad_norm": 2.191087505295727, "learning_rate": 4.5729490876712725e-06, "loss": 0.2728, "step": 2076 }, { "epoch": 0.9449499545040946, "grad_norm": 1.5696743668055735, "learning_rate": 4.572549527442619e-06, "loss": 0.1167, "step": 2077 }, { "epoch": 0.9454049135577798, "grad_norm": 1.4703104600885406, "learning_rate": 4.572149797854578e-06, "loss": 0.1481, "step": 2078 }, { "epoch": 0.945859872611465, "grad_norm": 1.3375471658633535, "learning_rate": 4.571749898939813e-06, "loss": 0.1448, "step": 2079 }, { "epoch": 0.9463148316651502, "grad_norm": 1.1353706299658501, "learning_rate": 4.5713498307310024e-06, "loss": 0.1095, "step": 2080 }, { "epoch": 0.9467697907188353, "grad_norm": 1.170226192835475, "learning_rate": 4.570949593260837e-06, "loss": 0.1025, "step": 2081 }, { "epoch": 0.9472247497725205, "grad_norm": 1.611590656998796, "learning_rate": 4.570549186562024e-06, "loss": 0.1648, "step": 2082 }, { "epoch": 0.9476797088262057, "grad_norm": 1.9894469425244659, "learning_rate": 4.570148610667281e-06, "loss": 0.2171, "step": 2083 }, { "epoch": 0.9481346678798908, "grad_norm": 2.6290643290299403, "learning_rate": 4.569747865609343e-06, "loss": 0.2035, "step": 2084 }, { "epoch": 0.948589626933576, "grad_norm": 1.9997278123807103, "learning_rate": 4.569346951420957e-06, "loss": 0.219, "step": 2085 }, { "epoch": 0.9490445859872612, "grad_norm": 2.3647369288676465, "learning_rate": 4.568945868134882e-06, "loss": 0.1821, "step": 2086 }, { "epoch": 0.9494995450409464, "grad_norm": 1.4361032491832602, "learning_rate": 4.568544615783894e-06, "loss": 0.174, "step": 2087 }, { "epoch": 0.9499545040946314, "grad_norm": 2.4948435319990794, "learning_rate": 4.568143194400782e-06, "loss": 0.162, "step": 2088 }, { "epoch": 0.9504094631483166, "grad_norm": 2.3391791745125823, "learning_rate": 4.567741604018348e-06, "loss": 0.1731, "step": 2089 }, { "epoch": 0.9508644222020018, "grad_norm": 1.9417130047261684, "learning_rate": 4.567339844669407e-06, "loss": 0.2115, "step": 2090 }, { "epoch": 0.9513193812556869, "grad_norm": 1.341309783614821, "learning_rate": 4.566937916386791e-06, "loss": 0.1207, "step": 2091 }, { "epoch": 0.9517743403093721, "grad_norm": 1.8063160975644432, "learning_rate": 4.566535819203342e-06, "loss": 0.1484, "step": 2092 }, { "epoch": 0.9522292993630573, "grad_norm": 1.4064547804406506, "learning_rate": 4.566133553151918e-06, "loss": 0.1696, "step": 2093 }, { "epoch": 0.9526842584167425, "grad_norm": 1.5123792301862293, "learning_rate": 4.565731118265392e-06, "loss": 0.1513, "step": 2094 }, { "epoch": 0.9531392174704276, "grad_norm": 2.6660242675499974, "learning_rate": 4.5653285145766465e-06, "loss": 0.1967, "step": 2095 }, { "epoch": 0.9535941765241128, "grad_norm": 1.3182075171271719, "learning_rate": 4.564925742118583e-06, "loss": 0.1647, "step": 2096 }, { "epoch": 0.954049135577798, "grad_norm": 2.0246143369138583, "learning_rate": 4.564522800924111e-06, "loss": 0.1933, "step": 2097 }, { "epoch": 0.9545040946314831, "grad_norm": 1.5229871866624265, "learning_rate": 4.56411969102616e-06, "loss": 0.1262, "step": 2098 }, { "epoch": 0.9549590536851683, "grad_norm": 1.6259281484911337, "learning_rate": 4.5637164124576695e-06, "loss": 0.22, "step": 2099 }, { "epoch": 0.9554140127388535, "grad_norm": 2.2924228140977534, "learning_rate": 4.563312965251594e-06, "loss": 0.1788, "step": 2100 }, { "epoch": 0.9558689717925387, "grad_norm": 2.145017083065323, "learning_rate": 4.562909349440899e-06, "loss": 0.1997, "step": 2101 }, { "epoch": 0.9563239308462238, "grad_norm": 1.4998751606083633, "learning_rate": 4.5625055650585695e-06, "loss": 0.1268, "step": 2102 }, { "epoch": 0.956778889899909, "grad_norm": 2.212976295267469, "learning_rate": 4.562101612137599e-06, "loss": 0.1717, "step": 2103 }, { "epoch": 0.9572338489535942, "grad_norm": 1.679438029199367, "learning_rate": 4.561697490710998e-06, "loss": 0.1072, "step": 2104 }, { "epoch": 0.9576888080072793, "grad_norm": 2.079365510674891, "learning_rate": 4.561293200811787e-06, "loss": 0.1746, "step": 2105 }, { "epoch": 0.9581437670609645, "grad_norm": 1.686198495026396, "learning_rate": 4.560888742473005e-06, "loss": 0.1561, "step": 2106 }, { "epoch": 0.9585987261146497, "grad_norm": 1.6637740262678333, "learning_rate": 4.560484115727703e-06, "loss": 0.202, "step": 2107 }, { "epoch": 0.9590536851683349, "grad_norm": 1.3363367490497915, "learning_rate": 4.560079320608942e-06, "loss": 0.1505, "step": 2108 }, { "epoch": 0.95950864422202, "grad_norm": 1.3524224143962482, "learning_rate": 4.5596743571498035e-06, "loss": 0.1556, "step": 2109 }, { "epoch": 0.9599636032757052, "grad_norm": 2.051012825316942, "learning_rate": 4.5592692253833775e-06, "loss": 0.1557, "step": 2110 }, { "epoch": 0.9604185623293904, "grad_norm": 1.8725405774246842, "learning_rate": 4.5588639253427705e-06, "loss": 0.1361, "step": 2111 }, { "epoch": 0.9608735213830755, "grad_norm": 1.6129721682768872, "learning_rate": 4.558458457061101e-06, "loss": 0.1604, "step": 2112 }, { "epoch": 0.9613284804367607, "grad_norm": 2.4257644594708654, "learning_rate": 4.5580528205715024e-06, "loss": 0.1728, "step": 2113 }, { "epoch": 0.9617834394904459, "grad_norm": 2.2020262494310714, "learning_rate": 4.557647015907121e-06, "loss": 0.1982, "step": 2114 }, { "epoch": 0.9622383985441311, "grad_norm": 1.3942660783602792, "learning_rate": 4.557241043101118e-06, "loss": 0.1263, "step": 2115 }, { "epoch": 0.9626933575978162, "grad_norm": 1.6927990416728342, "learning_rate": 4.556834902186667e-06, "loss": 0.2537, "step": 2116 }, { "epoch": 0.9631483166515014, "grad_norm": 2.0785259665220646, "learning_rate": 4.556428593196956e-06, "loss": 0.1927, "step": 2117 }, { "epoch": 0.9636032757051866, "grad_norm": 1.7131650413165849, "learning_rate": 4.556022116165189e-06, "loss": 0.2146, "step": 2118 }, { "epoch": 0.9640582347588716, "grad_norm": 1.7560312461053569, "learning_rate": 4.555615471124578e-06, "loss": 0.1429, "step": 2119 }, { "epoch": 0.9645131938125568, "grad_norm": 1.4424071339171873, "learning_rate": 4.555208658108354e-06, "loss": 0.1017, "step": 2120 }, { "epoch": 0.964968152866242, "grad_norm": 2.366476482520588, "learning_rate": 4.55480167714976e-06, "loss": 0.1701, "step": 2121 }, { "epoch": 0.9654231119199272, "grad_norm": 1.3193271811867113, "learning_rate": 4.554394528282052e-06, "loss": 0.1608, "step": 2122 }, { "epoch": 0.9658780709736123, "grad_norm": 1.6112197038225973, "learning_rate": 4.553987211538501e-06, "loss": 0.1663, "step": 2123 }, { "epoch": 0.9663330300272975, "grad_norm": 2.2120821423419477, "learning_rate": 4.5535797269523906e-06, "loss": 0.1761, "step": 2124 }, { "epoch": 0.9667879890809827, "grad_norm": 1.9459325657347053, "learning_rate": 4.55317207455702e-06, "loss": 0.1648, "step": 2125 }, { "epoch": 0.9672429481346679, "grad_norm": 1.2258892841488513, "learning_rate": 4.552764254385697e-06, "loss": 0.113, "step": 2126 }, { "epoch": 0.967697907188353, "grad_norm": 1.7595258140929935, "learning_rate": 4.552356266471751e-06, "loss": 0.1773, "step": 2127 }, { "epoch": 0.9681528662420382, "grad_norm": 1.9664757298212556, "learning_rate": 4.55194811084852e-06, "loss": 0.165, "step": 2128 }, { "epoch": 0.9686078252957234, "grad_norm": 2.222530250938157, "learning_rate": 4.551539787549354e-06, "loss": 0.2096, "step": 2129 }, { "epoch": 0.9690627843494085, "grad_norm": 1.3774868751004326, "learning_rate": 4.551131296607623e-06, "loss": 0.1089, "step": 2130 }, { "epoch": 0.9695177434030937, "grad_norm": 1.8067013761642468, "learning_rate": 4.550722638056703e-06, "loss": 0.1323, "step": 2131 }, { "epoch": 0.9699727024567789, "grad_norm": 2.24991176799243, "learning_rate": 4.550313811929993e-06, "loss": 0.1334, "step": 2132 }, { "epoch": 0.9704276615104641, "grad_norm": 2.72004150671695, "learning_rate": 4.549904818260895e-06, "loss": 0.1775, "step": 2133 }, { "epoch": 0.9708826205641492, "grad_norm": 2.342721771224346, "learning_rate": 4.549495657082834e-06, "loss": 0.191, "step": 2134 }, { "epoch": 0.9713375796178344, "grad_norm": 2.2728812324499534, "learning_rate": 4.549086328429242e-06, "loss": 0.1425, "step": 2135 }, { "epoch": 0.9717925386715196, "grad_norm": 1.453499597882781, "learning_rate": 4.548676832333569e-06, "loss": 0.1316, "step": 2136 }, { "epoch": 0.9722474977252047, "grad_norm": 2.01603990428807, "learning_rate": 4.548267168829279e-06, "loss": 0.1307, "step": 2137 }, { "epoch": 0.9727024567788899, "grad_norm": 1.6605060275137966, "learning_rate": 4.547857337949844e-06, "loss": 0.1399, "step": 2138 }, { "epoch": 0.9731574158325751, "grad_norm": 1.5535531332266466, "learning_rate": 4.5474473397287556e-06, "loss": 0.1321, "step": 2139 }, { "epoch": 0.9736123748862603, "grad_norm": 1.5373238474360202, "learning_rate": 4.547037174199517e-06, "loss": 0.1343, "step": 2140 }, { "epoch": 0.9740673339399454, "grad_norm": 1.8078338860297858, "learning_rate": 4.546626841395645e-06, "loss": 0.1635, "step": 2141 }, { "epoch": 0.9745222929936306, "grad_norm": 2.3652157653146326, "learning_rate": 4.54621634135067e-06, "loss": 0.1574, "step": 2142 }, { "epoch": 0.9749772520473158, "grad_norm": 1.582720512511224, "learning_rate": 4.545805674098136e-06, "loss": 0.1834, "step": 2143 }, { "epoch": 0.9754322111010009, "grad_norm": 1.603799084987541, "learning_rate": 4.545394839671601e-06, "loss": 0.1464, "step": 2144 }, { "epoch": 0.9758871701546861, "grad_norm": 2.2937187508235612, "learning_rate": 4.544983838104637e-06, "loss": 0.1689, "step": 2145 }, { "epoch": 0.9763421292083713, "grad_norm": 1.5827694703198016, "learning_rate": 4.544572669430828e-06, "loss": 0.1974, "step": 2146 }, { "epoch": 0.9767970882620565, "grad_norm": 1.5229863728993667, "learning_rate": 4.544161333683775e-06, "loss": 0.1347, "step": 2147 }, { "epoch": 0.9772520473157416, "grad_norm": 1.7227170284858135, "learning_rate": 4.543749830897088e-06, "loss": 0.2186, "step": 2148 }, { "epoch": 0.9777070063694268, "grad_norm": 1.9401788313572834, "learning_rate": 4.543338161104395e-06, "loss": 0.1674, "step": 2149 }, { "epoch": 0.978161965423112, "grad_norm": 1.4440321556413929, "learning_rate": 4.542926324339335e-06, "loss": 0.1518, "step": 2150 }, { "epoch": 0.978616924476797, "grad_norm": 1.5863469206535143, "learning_rate": 4.542514320635561e-06, "loss": 0.1548, "step": 2151 }, { "epoch": 0.9790718835304822, "grad_norm": 1.7952124026440508, "learning_rate": 4.542102150026741e-06, "loss": 0.2011, "step": 2152 }, { "epoch": 0.9795268425841674, "grad_norm": 1.2781168765483073, "learning_rate": 4.541689812546556e-06, "loss": 0.1708, "step": 2153 }, { "epoch": 0.9799818016378526, "grad_norm": 2.275201017608769, "learning_rate": 4.541277308228698e-06, "loss": 0.2655, "step": 2154 }, { "epoch": 0.9804367606915377, "grad_norm": 1.6797512508176873, "learning_rate": 4.540864637106879e-06, "loss": 0.1526, "step": 2155 }, { "epoch": 0.9808917197452229, "grad_norm": 1.7795439392430585, "learning_rate": 4.540451799214817e-06, "loss": 0.1561, "step": 2156 }, { "epoch": 0.9813466787989081, "grad_norm": 2.2915523451786766, "learning_rate": 4.540038794586248e-06, "loss": 0.1603, "step": 2157 }, { "epoch": 0.9818016378525932, "grad_norm": 2.2274131509949537, "learning_rate": 4.539625623254923e-06, "loss": 0.1423, "step": 2158 }, { "epoch": 0.9822565969062784, "grad_norm": 1.3978925866840657, "learning_rate": 4.539212285254601e-06, "loss": 0.1708, "step": 2159 }, { "epoch": 0.9827115559599636, "grad_norm": 1.7857894009279391, "learning_rate": 4.5387987806190615e-06, "loss": 0.1893, "step": 2160 }, { "epoch": 0.9831665150136488, "grad_norm": 1.518791485457489, "learning_rate": 4.538385109382093e-06, "loss": 0.1709, "step": 2161 }, { "epoch": 0.9836214740673339, "grad_norm": 1.3743190231639797, "learning_rate": 4.537971271577498e-06, "loss": 0.1746, "step": 2162 }, { "epoch": 0.9840764331210191, "grad_norm": 1.1750088863525163, "learning_rate": 4.537557267239093e-06, "loss": 0.108, "step": 2163 }, { "epoch": 0.9845313921747043, "grad_norm": 1.2225308832618265, "learning_rate": 4.537143096400712e-06, "loss": 0.1061, "step": 2164 }, { "epoch": 0.9849863512283894, "grad_norm": 2.1247362714767415, "learning_rate": 4.536728759096195e-06, "loss": 0.179, "step": 2165 }, { "epoch": 0.9854413102820746, "grad_norm": 1.808580318181682, "learning_rate": 4.536314255359402e-06, "loss": 0.1335, "step": 2166 }, { "epoch": 0.9858962693357598, "grad_norm": 1.6790298431680175, "learning_rate": 4.535899585224204e-06, "loss": 0.1493, "step": 2167 }, { "epoch": 0.986351228389445, "grad_norm": 3.0332484593824245, "learning_rate": 4.535484748724486e-06, "loss": 0.2063, "step": 2168 }, { "epoch": 0.9868061874431301, "grad_norm": 1.6421323451507468, "learning_rate": 4.535069745894147e-06, "loss": 0.1673, "step": 2169 }, { "epoch": 0.9872611464968153, "grad_norm": 1.9282204111223042, "learning_rate": 4.534654576767098e-06, "loss": 0.1428, "step": 2170 }, { "epoch": 0.9877161055505005, "grad_norm": 1.4541197485662065, "learning_rate": 4.534239241377266e-06, "loss": 0.1901, "step": 2171 }, { "epoch": 0.9881710646041856, "grad_norm": 3.2268329342995554, "learning_rate": 4.5338237397585895e-06, "loss": 0.2441, "step": 2172 }, { "epoch": 0.9886260236578708, "grad_norm": 2.4649363175751646, "learning_rate": 4.533408071945021e-06, "loss": 0.1763, "step": 2173 }, { "epoch": 0.989080982711556, "grad_norm": 1.8464040284824113, "learning_rate": 4.532992237970528e-06, "loss": 0.1646, "step": 2174 }, { "epoch": 0.9895359417652412, "grad_norm": 2.115464473457186, "learning_rate": 4.532576237869091e-06, "loss": 0.1468, "step": 2175 }, { "epoch": 0.9899909008189263, "grad_norm": 1.6765582325152246, "learning_rate": 4.5321600716747025e-06, "loss": 0.1377, "step": 2176 }, { "epoch": 0.9904458598726115, "grad_norm": 1.8413627666297776, "learning_rate": 4.531743739421369e-06, "loss": 0.181, "step": 2177 }, { "epoch": 0.9909008189262967, "grad_norm": 1.7110916137165555, "learning_rate": 4.531327241143114e-06, "loss": 0.1418, "step": 2178 }, { "epoch": 0.9913557779799818, "grad_norm": 2.3165603295554726, "learning_rate": 4.530910576873969e-06, "loss": 0.1666, "step": 2179 }, { "epoch": 0.991810737033667, "grad_norm": 2.0264888702689254, "learning_rate": 4.530493746647984e-06, "loss": 0.1653, "step": 2180 }, { "epoch": 0.9922656960873522, "grad_norm": 3.7082736074441227, "learning_rate": 4.530076750499219e-06, "loss": 0.1955, "step": 2181 }, { "epoch": 0.9927206551410374, "grad_norm": 1.4980795502080217, "learning_rate": 4.52965958846175e-06, "loss": 0.1763, "step": 2182 }, { "epoch": 0.9931756141947224, "grad_norm": 1.328886576986546, "learning_rate": 4.529242260569665e-06, "loss": 0.135, "step": 2183 }, { "epoch": 0.9936305732484076, "grad_norm": 2.4602783485410478, "learning_rate": 4.528824766857067e-06, "loss": 0.225, "step": 2184 }, { "epoch": 0.9940855323020928, "grad_norm": 2.656745825690249, "learning_rate": 4.5284071073580715e-06, "loss": 0.1623, "step": 2185 }, { "epoch": 0.9945404913557779, "grad_norm": 2.191300990353365, "learning_rate": 4.527989282106807e-06, "loss": 0.145, "step": 2186 }, { "epoch": 0.9949954504094631, "grad_norm": 2.3096174225453043, "learning_rate": 4.527571291137416e-06, "loss": 0.2047, "step": 2187 }, { "epoch": 0.9954504094631483, "grad_norm": 2.2206355508554374, "learning_rate": 4.527153134484056e-06, "loss": 0.1978, "step": 2188 }, { "epoch": 0.9959053685168335, "grad_norm": 1.5575737643430931, "learning_rate": 4.5267348121808965e-06, "loss": 0.1083, "step": 2189 }, { "epoch": 0.9963603275705186, "grad_norm": 1.1842592978237663, "learning_rate": 4.526316324262121e-06, "loss": 0.1418, "step": 2190 }, { "epoch": 0.9968152866242038, "grad_norm": 2.066729296311549, "learning_rate": 4.525897670761926e-06, "loss": 0.1555, "step": 2191 }, { "epoch": 0.997270245677889, "grad_norm": 1.8945946795231638, "learning_rate": 4.525478851714522e-06, "loss": 0.1602, "step": 2192 }, { "epoch": 0.9977252047315741, "grad_norm": 2.288603637382534, "learning_rate": 4.525059867154133e-06, "loss": 0.1728, "step": 2193 }, { "epoch": 0.9981801637852593, "grad_norm": 1.548625455808381, "learning_rate": 4.5246407171149975e-06, "loss": 0.1535, "step": 2194 }, { "epoch": 0.9986351228389445, "grad_norm": 1.7795058207338135, "learning_rate": 4.5242214016313655e-06, "loss": 0.1937, "step": 2195 }, { "epoch": 0.9990900818926297, "grad_norm": 1.8173123394415125, "learning_rate": 4.523801920737501e-06, "loss": 0.1855, "step": 2196 }, { "epoch": 0.9995450409463148, "grad_norm": 1.5328423318772029, "learning_rate": 4.523382274467684e-06, "loss": 0.1734, "step": 2197 }, { "epoch": 1.0, "grad_norm": 1.6888871167302404, "learning_rate": 4.522962462856206e-06, "loss": 0.1061, "step": 2198 }, { "epoch": 1.000454959053685, "grad_norm": 1.0169999119479456, "learning_rate": 4.522542485937369e-06, "loss": 0.051, "step": 2199 }, { "epoch": 1.0009099181073704, "grad_norm": 1.6609923808472133, "learning_rate": 4.522122343745495e-06, "loss": 0.0982, "step": 2200 }, { "epoch": 1.0013648771610555, "grad_norm": 1.2283700830083324, "learning_rate": 4.521702036314915e-06, "loss": 0.068, "step": 2201 }, { "epoch": 1.0018198362147406, "grad_norm": 1.220074312624483, "learning_rate": 4.521281563679973e-06, "loss": 0.0629, "step": 2202 }, { "epoch": 1.0022747952684259, "grad_norm": 1.4941719880778739, "learning_rate": 4.5208609258750314e-06, "loss": 0.0755, "step": 2203 }, { "epoch": 1.002729754322111, "grad_norm": 1.1143728511252875, "learning_rate": 4.52044012293446e-06, "loss": 0.0587, "step": 2204 }, { "epoch": 1.0031847133757963, "grad_norm": 1.5319847923881116, "learning_rate": 4.520019154892646e-06, "loss": 0.0851, "step": 2205 }, { "epoch": 1.0036396724294814, "grad_norm": 1.2636498680398078, "learning_rate": 4.519598021783989e-06, "loss": 0.0993, "step": 2206 }, { "epoch": 1.0040946314831665, "grad_norm": 1.5487488091959216, "learning_rate": 4.519176723642903e-06, "loss": 0.113, "step": 2207 }, { "epoch": 1.0045495905368518, "grad_norm": 1.5557166129958784, "learning_rate": 4.518755260503813e-06, "loss": 0.0788, "step": 2208 }, { "epoch": 1.0050045495905369, "grad_norm": 1.2818157097100387, "learning_rate": 4.51833363240116e-06, "loss": 0.0743, "step": 2209 }, { "epoch": 1.005459508644222, "grad_norm": 1.200932009259888, "learning_rate": 4.517911839369398e-06, "loss": 0.0811, "step": 2210 }, { "epoch": 1.0059144676979073, "grad_norm": 1.4486327355662423, "learning_rate": 4.517489881442993e-06, "loss": 0.062, "step": 2211 }, { "epoch": 1.0063694267515924, "grad_norm": 1.3527098955371344, "learning_rate": 4.517067758656424e-06, "loss": 0.0627, "step": 2212 }, { "epoch": 1.0068243858052774, "grad_norm": 1.4047497974003487, "learning_rate": 4.516645471044188e-06, "loss": 0.0651, "step": 2213 }, { "epoch": 1.0072793448589628, "grad_norm": 1.4164244968906639, "learning_rate": 4.516223018640791e-06, "loss": 0.0714, "step": 2214 }, { "epoch": 1.0077343039126478, "grad_norm": 1.5809882117425458, "learning_rate": 4.515800401480754e-06, "loss": 0.0989, "step": 2215 }, { "epoch": 1.008189262966333, "grad_norm": 1.6844068994280326, "learning_rate": 4.515377619598612e-06, "loss": 0.1007, "step": 2216 }, { "epoch": 1.0086442220200182, "grad_norm": 1.5732620970585767, "learning_rate": 4.514954673028913e-06, "loss": 0.0765, "step": 2217 }, { "epoch": 1.0090991810737033, "grad_norm": 1.3651454362527589, "learning_rate": 4.5145315618062155e-06, "loss": 0.0817, "step": 2218 }, { "epoch": 1.0095541401273886, "grad_norm": 1.7849697070364972, "learning_rate": 4.514108285965098e-06, "loss": 0.0946, "step": 2219 }, { "epoch": 1.0100090991810737, "grad_norm": 1.4164875410963866, "learning_rate": 4.513684845540146e-06, "loss": 0.067, "step": 2220 }, { "epoch": 1.0104640582347588, "grad_norm": 1.7807110987231174, "learning_rate": 4.5132612405659625e-06, "loss": 0.1131, "step": 2221 }, { "epoch": 1.0109190172884441, "grad_norm": 1.6962102867596296, "learning_rate": 4.5128374710771625e-06, "loss": 0.1001, "step": 2222 }, { "epoch": 1.0113739763421292, "grad_norm": 1.9807611103838136, "learning_rate": 4.512413537108374e-06, "loss": 0.1216, "step": 2223 }, { "epoch": 1.0118289353958143, "grad_norm": 2.2071849786855195, "learning_rate": 4.511989438694239e-06, "loss": 0.0758, "step": 2224 }, { "epoch": 1.0122838944494996, "grad_norm": 1.41006582199038, "learning_rate": 4.511565175869415e-06, "loss": 0.0676, "step": 2225 }, { "epoch": 1.0127388535031847, "grad_norm": 1.5005194178509522, "learning_rate": 4.511140748668566e-06, "loss": 0.0845, "step": 2226 }, { "epoch": 1.0131938125568698, "grad_norm": 1.2291494575864939, "learning_rate": 4.510716157126379e-06, "loss": 0.0611, "step": 2227 }, { "epoch": 1.013648771610555, "grad_norm": 2.4795116846611975, "learning_rate": 4.510291401277548e-06, "loss": 0.0983, "step": 2228 }, { "epoch": 1.0141037306642402, "grad_norm": 2.657277286309681, "learning_rate": 4.509866481156781e-06, "loss": 0.1101, "step": 2229 }, { "epoch": 1.0145586897179253, "grad_norm": 1.8196308245882602, "learning_rate": 4.509441396798802e-06, "loss": 0.0998, "step": 2230 }, { "epoch": 1.0150136487716106, "grad_norm": 1.9314931582074881, "learning_rate": 4.5090161482383475e-06, "loss": 0.0936, "step": 2231 }, { "epoch": 1.0154686078252957, "grad_norm": 1.2746342487726179, "learning_rate": 4.508590735510166e-06, "loss": 0.0676, "step": 2232 }, { "epoch": 1.015923566878981, "grad_norm": 1.8859048739802027, "learning_rate": 4.508165158649019e-06, "loss": 0.0811, "step": 2233 }, { "epoch": 1.016378525932666, "grad_norm": 1.6756178231136896, "learning_rate": 4.507739417689685e-06, "loss": 0.0747, "step": 2234 }, { "epoch": 1.0168334849863512, "grad_norm": 1.3984270258928366, "learning_rate": 4.507313512666953e-06, "loss": 0.075, "step": 2235 }, { "epoch": 1.0172884440400365, "grad_norm": 1.5242107845200688, "learning_rate": 4.506887443615625e-06, "loss": 0.0823, "step": 2236 }, { "epoch": 1.0177434030937216, "grad_norm": 1.5995342787535922, "learning_rate": 4.506461210570518e-06, "loss": 0.0971, "step": 2237 }, { "epoch": 1.0181983621474067, "grad_norm": 1.1425078029916038, "learning_rate": 4.506034813566462e-06, "loss": 0.1233, "step": 2238 }, { "epoch": 1.018653321201092, "grad_norm": 1.4187790734010148, "learning_rate": 4.505608252638301e-06, "loss": 0.0934, "step": 2239 }, { "epoch": 1.019108280254777, "grad_norm": 1.9848336082848856, "learning_rate": 4.50518152782089e-06, "loss": 0.1203, "step": 2240 }, { "epoch": 1.0195632393084622, "grad_norm": 1.2043374157232327, "learning_rate": 4.504754639149101e-06, "loss": 0.0709, "step": 2241 }, { "epoch": 1.0200181983621475, "grad_norm": 1.36618996999929, "learning_rate": 4.504327586657814e-06, "loss": 0.0647, "step": 2242 }, { "epoch": 1.0204731574158326, "grad_norm": 1.563535065138085, "learning_rate": 4.50390037038193e-06, "loss": 0.0833, "step": 2243 }, { "epoch": 1.0209281164695176, "grad_norm": 1.5296584792807861, "learning_rate": 4.503472990356357e-06, "loss": 0.0946, "step": 2244 }, { "epoch": 1.021383075523203, "grad_norm": 1.512634883619265, "learning_rate": 4.503045446616018e-06, "loss": 0.0715, "step": 2245 }, { "epoch": 1.021838034576888, "grad_norm": 1.3010427168043244, "learning_rate": 4.502617739195852e-06, "loss": 0.0873, "step": 2246 }, { "epoch": 1.0222929936305734, "grad_norm": 1.387157397416425, "learning_rate": 4.502189868130807e-06, "loss": 0.0763, "step": 2247 }, { "epoch": 1.0227479526842584, "grad_norm": 1.828795187833686, "learning_rate": 4.501761833455849e-06, "loss": 0.1319, "step": 2248 }, { "epoch": 1.0232029117379435, "grad_norm": 1.3445669290205065, "learning_rate": 4.501333635205952e-06, "loss": 0.068, "step": 2249 }, { "epoch": 1.0236578707916288, "grad_norm": 1.5610944674651466, "learning_rate": 4.5009052734161095e-06, "loss": 0.0739, "step": 2250 }, { "epoch": 1.024112829845314, "grad_norm": 1.2525841076083186, "learning_rate": 4.500476748121324e-06, "loss": 0.1094, "step": 2251 }, { "epoch": 1.024567788898999, "grad_norm": 1.5118810013113924, "learning_rate": 4.500048059356613e-06, "loss": 0.1041, "step": 2252 }, { "epoch": 1.0250227479526843, "grad_norm": 1.318153460904525, "learning_rate": 4.499619207157007e-06, "loss": 0.0851, "step": 2253 }, { "epoch": 1.0254777070063694, "grad_norm": 1.3005012388734132, "learning_rate": 4.499190191557549e-06, "loss": 0.1007, "step": 2254 }, { "epoch": 1.0259326660600545, "grad_norm": 1.7684251321269342, "learning_rate": 4.498761012593296e-06, "loss": 0.1144, "step": 2255 }, { "epoch": 1.0263876251137398, "grad_norm": 1.2065670700113398, "learning_rate": 4.498331670299321e-06, "loss": 0.1344, "step": 2256 }, { "epoch": 1.026842584167425, "grad_norm": 1.6857989870574055, "learning_rate": 4.497902164710704e-06, "loss": 0.0642, "step": 2257 }, { "epoch": 1.02729754322111, "grad_norm": 1.6473004600696095, "learning_rate": 4.497472495862547e-06, "loss": 0.0981, "step": 2258 }, { "epoch": 1.0277525022747953, "grad_norm": 1.3689985527437365, "learning_rate": 4.497042663789957e-06, "loss": 0.0813, "step": 2259 }, { "epoch": 1.0282074613284804, "grad_norm": 1.6484955662328646, "learning_rate": 4.496612668528059e-06, "loss": 0.1318, "step": 2260 }, { "epoch": 1.0286624203821657, "grad_norm": 1.2301308018690613, "learning_rate": 4.496182510111991e-06, "loss": 0.1323, "step": 2261 }, { "epoch": 1.0291173794358508, "grad_norm": 1.3974663767006335, "learning_rate": 4.495752188576902e-06, "loss": 0.1113, "step": 2262 }, { "epoch": 1.0295723384895359, "grad_norm": 1.9572449646613161, "learning_rate": 4.4953217039579574e-06, "loss": 0.1108, "step": 2263 }, { "epoch": 1.0300272975432212, "grad_norm": 1.5604560381918156, "learning_rate": 4.494891056290335e-06, "loss": 0.126, "step": 2264 }, { "epoch": 1.0304822565969063, "grad_norm": 1.7509136256359128, "learning_rate": 4.494460245609223e-06, "loss": 0.0767, "step": 2265 }, { "epoch": 1.0309372156505914, "grad_norm": 1.5345571279100725, "learning_rate": 4.494029271949827e-06, "loss": 0.1008, "step": 2266 }, { "epoch": 1.0313921747042767, "grad_norm": 1.0263814664645543, "learning_rate": 4.493598135347363e-06, "loss": 0.0931, "step": 2267 }, { "epoch": 1.0318471337579618, "grad_norm": 2.0480255592331584, "learning_rate": 4.493166835837064e-06, "loss": 0.0681, "step": 2268 }, { "epoch": 1.0323020928116469, "grad_norm": 1.8761109395251792, "learning_rate": 4.492735373454171e-06, "loss": 0.1086, "step": 2269 }, { "epoch": 1.0327570518653322, "grad_norm": 1.897488467663145, "learning_rate": 4.492303748233943e-06, "loss": 0.1267, "step": 2270 }, { "epoch": 1.0332120109190173, "grad_norm": 1.7630394900644286, "learning_rate": 4.49187196021165e-06, "loss": 0.148, "step": 2271 }, { "epoch": 1.0336669699727024, "grad_norm": 1.557460432820476, "learning_rate": 4.491440009422575e-06, "loss": 0.0822, "step": 2272 }, { "epoch": 1.0341219290263877, "grad_norm": 2.2035963282826474, "learning_rate": 4.491007895902016e-06, "loss": 0.1237, "step": 2273 }, { "epoch": 1.0345768880800728, "grad_norm": 1.7055574933768018, "learning_rate": 4.490575619685283e-06, "loss": 0.101, "step": 2274 }, { "epoch": 1.035031847133758, "grad_norm": 2.3176332211637103, "learning_rate": 4.4901431808077e-06, "loss": 0.0965, "step": 2275 }, { "epoch": 1.0354868061874432, "grad_norm": 1.9372753009751453, "learning_rate": 4.489710579304603e-06, "loss": 0.1356, "step": 2276 }, { "epoch": 1.0359417652411282, "grad_norm": 1.3110102653721396, "learning_rate": 4.489277815211343e-06, "loss": 0.0544, "step": 2277 }, { "epoch": 1.0363967242948136, "grad_norm": 1.4905691930121885, "learning_rate": 4.488844888563284e-06, "loss": 0.1552, "step": 2278 }, { "epoch": 1.0368516833484986, "grad_norm": 1.2129187548833384, "learning_rate": 4.488411799395802e-06, "loss": 0.0635, "step": 2279 }, { "epoch": 1.0373066424021837, "grad_norm": 1.7307605999371245, "learning_rate": 4.487978547744287e-06, "loss": 0.0718, "step": 2280 }, { "epoch": 1.037761601455869, "grad_norm": 4.002919733780402, "learning_rate": 4.487545133644143e-06, "loss": 0.0918, "step": 2281 }, { "epoch": 1.0382165605095541, "grad_norm": 1.434451235166591, "learning_rate": 4.487111557130787e-06, "loss": 0.1087, "step": 2282 }, { "epoch": 1.0386715195632392, "grad_norm": 1.6326264823457393, "learning_rate": 4.486677818239647e-06, "loss": 0.0943, "step": 2283 }, { "epoch": 1.0391264786169245, "grad_norm": 1.6173934297359729, "learning_rate": 4.486243917006169e-06, "loss": 0.0825, "step": 2284 }, { "epoch": 1.0395814376706096, "grad_norm": 1.330454351983684, "learning_rate": 4.485809853465807e-06, "loss": 0.0505, "step": 2285 }, { "epoch": 1.0400363967242947, "grad_norm": 1.3258755084207146, "learning_rate": 4.4853756276540315e-06, "loss": 0.0877, "step": 2286 }, { "epoch": 1.04049135577798, "grad_norm": 1.4601501745351109, "learning_rate": 4.484941239606326e-06, "loss": 0.0861, "step": 2287 }, { "epoch": 1.040946314831665, "grad_norm": 1.978079069134469, "learning_rate": 4.484506689358186e-06, "loss": 0.1226, "step": 2288 }, { "epoch": 1.0414012738853504, "grad_norm": 1.3962311543656398, "learning_rate": 4.484071976945121e-06, "loss": 0.0687, "step": 2289 }, { "epoch": 1.0418562329390355, "grad_norm": 1.2605481862079213, "learning_rate": 4.483637102402655e-06, "loss": 0.1035, "step": 2290 }, { "epoch": 1.0423111919927206, "grad_norm": 1.3191554559607057, "learning_rate": 4.4832020657663224e-06, "loss": 0.0789, "step": 2291 }, { "epoch": 1.042766151046406, "grad_norm": 1.7983136808453735, "learning_rate": 4.482766867071673e-06, "loss": 0.068, "step": 2292 }, { "epoch": 1.043221110100091, "grad_norm": 1.3901753138130788, "learning_rate": 4.482331506354269e-06, "loss": 0.1017, "step": 2293 }, { "epoch": 1.043676069153776, "grad_norm": 1.581469571449512, "learning_rate": 4.4818959836496876e-06, "loss": 0.0639, "step": 2294 }, { "epoch": 1.0441310282074614, "grad_norm": 1.269815942746802, "learning_rate": 4.481460298993515e-06, "loss": 0.0625, "step": 2295 }, { "epoch": 1.0445859872611465, "grad_norm": 1.3773026873827707, "learning_rate": 4.481024452421357e-06, "loss": 0.0815, "step": 2296 }, { "epoch": 1.0450409463148316, "grad_norm": 1.4926712499107542, "learning_rate": 4.480588443968825e-06, "loss": 0.0651, "step": 2297 }, { "epoch": 1.0454959053685169, "grad_norm": 1.3393174273757424, "learning_rate": 4.4801522736715505e-06, "loss": 0.0853, "step": 2298 }, { "epoch": 1.045950864422202, "grad_norm": 1.5129017760803518, "learning_rate": 4.479715941565174e-06, "loss": 0.054, "step": 2299 }, { "epoch": 1.046405823475887, "grad_norm": 2.0616493840890255, "learning_rate": 4.4792794476853514e-06, "loss": 0.0808, "step": 2300 }, { "epoch": 1.0468607825295724, "grad_norm": 1.5861310389241974, "learning_rate": 4.47884279206775e-06, "loss": 0.0927, "step": 2301 }, { "epoch": 1.0473157415832575, "grad_norm": 0.928390801162424, "learning_rate": 4.478405974748054e-06, "loss": 0.0722, "step": 2302 }, { "epoch": 1.0477707006369428, "grad_norm": 1.5458094332092187, "learning_rate": 4.477968995761954e-06, "loss": 0.0867, "step": 2303 }, { "epoch": 1.0482256596906279, "grad_norm": 1.5404011995876956, "learning_rate": 4.477531855145161e-06, "loss": 0.0902, "step": 2304 }, { "epoch": 1.048680618744313, "grad_norm": 1.3434412855749513, "learning_rate": 4.477094552933395e-06, "loss": 0.0655, "step": 2305 }, { "epoch": 1.0491355777979983, "grad_norm": 1.083100442302988, "learning_rate": 4.476657089162391e-06, "loss": 0.066, "step": 2306 }, { "epoch": 1.0495905368516834, "grad_norm": 1.3871586676322527, "learning_rate": 4.476219463867897e-06, "loss": 0.1087, "step": 2307 }, { "epoch": 1.0500454959053684, "grad_norm": 1.7852029642214748, "learning_rate": 4.475781677085671e-06, "loss": 0.0916, "step": 2308 }, { "epoch": 1.0505004549590538, "grad_norm": 1.4206975802030928, "learning_rate": 4.4753437288514904e-06, "loss": 0.0664, "step": 2309 }, { "epoch": 1.0509554140127388, "grad_norm": 1.464232148884979, "learning_rate": 4.47490561920114e-06, "loss": 0.098, "step": 2310 }, { "epoch": 1.051410373066424, "grad_norm": 1.7389093637922037, "learning_rate": 4.474467348170421e-06, "loss": 0.0926, "step": 2311 }, { "epoch": 1.0518653321201092, "grad_norm": 1.6567765919211275, "learning_rate": 4.474028915795148e-06, "loss": 0.1079, "step": 2312 }, { "epoch": 1.0523202911737943, "grad_norm": 0.8043045141598315, "learning_rate": 4.473590322111145e-06, "loss": 0.0639, "step": 2313 }, { "epoch": 1.0527752502274794, "grad_norm": 1.535130658359192, "learning_rate": 4.473151567154255e-06, "loss": 0.0806, "step": 2314 }, { "epoch": 1.0532302092811647, "grad_norm": 1.2136793848488039, "learning_rate": 4.472712650960328e-06, "loss": 0.0732, "step": 2315 }, { "epoch": 1.0536851683348498, "grad_norm": 1.4191160149688276, "learning_rate": 4.472273573565234e-06, "loss": 0.1603, "step": 2316 }, { "epoch": 1.0541401273885351, "grad_norm": 1.812354142724077, "learning_rate": 4.471834335004849e-06, "loss": 0.1629, "step": 2317 }, { "epoch": 1.0545950864422202, "grad_norm": 1.1853207063745665, "learning_rate": 4.471394935315067e-06, "loss": 0.0429, "step": 2318 }, { "epoch": 1.0550500454959053, "grad_norm": 1.7435537882257561, "learning_rate": 4.470955374531794e-06, "loss": 0.1269, "step": 2319 }, { "epoch": 1.0555050045495906, "grad_norm": 1.7557827405058806, "learning_rate": 4.470515652690947e-06, "loss": 0.065, "step": 2320 }, { "epoch": 1.0559599636032757, "grad_norm": 1.413841453700311, "learning_rate": 4.470075769828461e-06, "loss": 0.0972, "step": 2321 }, { "epoch": 1.0564149226569608, "grad_norm": 2.027164177434821, "learning_rate": 4.46963572598028e-06, "loss": 0.1036, "step": 2322 }, { "epoch": 1.056869881710646, "grad_norm": 1.3937151595286825, "learning_rate": 4.469195521182362e-06, "loss": 0.0962, "step": 2323 }, { "epoch": 1.0573248407643312, "grad_norm": 1.6401213468826432, "learning_rate": 4.468755155470679e-06, "loss": 0.0932, "step": 2324 }, { "epoch": 1.0577797998180163, "grad_norm": 2.338885175215576, "learning_rate": 4.468314628881214e-06, "loss": 0.0962, "step": 2325 }, { "epoch": 1.0582347588717016, "grad_norm": 1.4115973810191336, "learning_rate": 4.467873941449969e-06, "loss": 0.1021, "step": 2326 }, { "epoch": 1.0586897179253867, "grad_norm": 1.982422405584423, "learning_rate": 4.46743309321295e-06, "loss": 0.1079, "step": 2327 }, { "epoch": 1.0591446769790718, "grad_norm": 1.7740653248101632, "learning_rate": 4.466992084206185e-06, "loss": 0.1169, "step": 2328 }, { "epoch": 1.059599636032757, "grad_norm": 1.116268548969285, "learning_rate": 4.466550914465709e-06, "loss": 0.0657, "step": 2329 }, { "epoch": 1.0600545950864422, "grad_norm": 1.8360092943419488, "learning_rate": 4.466109584027573e-06, "loss": 0.127, "step": 2330 }, { "epoch": 1.0605095541401275, "grad_norm": 1.3810676537742754, "learning_rate": 4.465668092927841e-06, "loss": 0.0856, "step": 2331 }, { "epoch": 1.0609645131938126, "grad_norm": 2.185972325771388, "learning_rate": 4.465226441202589e-06, "loss": 0.0851, "step": 2332 }, { "epoch": 1.0614194722474977, "grad_norm": 1.3875472079527142, "learning_rate": 4.464784628887908e-06, "loss": 0.0792, "step": 2333 }, { "epoch": 1.061874431301183, "grad_norm": 1.2775951274791801, "learning_rate": 4.4643426560199e-06, "loss": 0.104, "step": 2334 }, { "epoch": 1.062329390354868, "grad_norm": 1.5319736940172268, "learning_rate": 4.46390052263468e-06, "loss": 0.1104, "step": 2335 }, { "epoch": 1.0627843494085532, "grad_norm": 1.812780273198809, "learning_rate": 4.463458228768378e-06, "loss": 0.0949, "step": 2336 }, { "epoch": 1.0632393084622385, "grad_norm": 1.5756060982683149, "learning_rate": 4.463015774457137e-06, "loss": 0.082, "step": 2337 }, { "epoch": 1.0636942675159236, "grad_norm": 2.6744844011663917, "learning_rate": 4.462573159737113e-06, "loss": 0.1212, "step": 2338 }, { "epoch": 1.0641492265696086, "grad_norm": 1.2563398274616853, "learning_rate": 4.462130384644472e-06, "loss": 0.0768, "step": 2339 }, { "epoch": 1.064604185623294, "grad_norm": 1.8057420294279858, "learning_rate": 4.461687449215397e-06, "loss": 0.1099, "step": 2340 }, { "epoch": 1.065059144676979, "grad_norm": 1.6208315079433049, "learning_rate": 4.4612443534860826e-06, "loss": 0.1144, "step": 2341 }, { "epoch": 1.0655141037306644, "grad_norm": 1.9711864344243992, "learning_rate": 4.460801097492737e-06, "loss": 0.0856, "step": 2342 }, { "epoch": 1.0659690627843494, "grad_norm": 1.3323713152755212, "learning_rate": 4.460357681271579e-06, "loss": 0.0715, "step": 2343 }, { "epoch": 1.0664240218380345, "grad_norm": 1.6353594143577714, "learning_rate": 4.4599141048588454e-06, "loss": 0.111, "step": 2344 }, { "epoch": 1.0668789808917198, "grad_norm": 1.921680218643112, "learning_rate": 4.4594703682907825e-06, "loss": 0.1084, "step": 2345 }, { "epoch": 1.067333939945405, "grad_norm": 1.6583549389810224, "learning_rate": 4.459026471603649e-06, "loss": 0.1051, "step": 2346 }, { "epoch": 1.06778889899909, "grad_norm": 1.7686266077660249, "learning_rate": 4.45858241483372e-06, "loss": 0.1108, "step": 2347 }, { "epoch": 1.0682438580527753, "grad_norm": 1.2657212497494363, "learning_rate": 4.458138198017281e-06, "loss": 0.0775, "step": 2348 }, { "epoch": 1.0686988171064604, "grad_norm": 1.294854322669401, "learning_rate": 4.457693821190631e-06, "loss": 0.0991, "step": 2349 }, { "epoch": 1.0691537761601455, "grad_norm": 1.6787540486710895, "learning_rate": 4.4572492843900815e-06, "loss": 0.1061, "step": 2350 }, { "epoch": 1.0696087352138308, "grad_norm": 1.2916611688046353, "learning_rate": 4.456804587651961e-06, "loss": 0.0997, "step": 2351 }, { "epoch": 1.070063694267516, "grad_norm": 1.1797535857178234, "learning_rate": 4.456359731012606e-06, "loss": 0.1019, "step": 2352 }, { "epoch": 1.070518653321201, "grad_norm": 1.4074451049825587, "learning_rate": 4.455914714508369e-06, "loss": 0.0639, "step": 2353 }, { "epoch": 1.0709736123748863, "grad_norm": 0.7791870489522308, "learning_rate": 4.455469538175614e-06, "loss": 0.0293, "step": 2354 }, { "epoch": 1.0714285714285714, "grad_norm": 1.3432260603887558, "learning_rate": 4.455024202050719e-06, "loss": 0.086, "step": 2355 }, { "epoch": 1.0718835304822565, "grad_norm": 1.4625155799519551, "learning_rate": 4.454578706170075e-06, "loss": 0.0726, "step": 2356 }, { "epoch": 1.0723384895359418, "grad_norm": 1.9522119831099414, "learning_rate": 4.454133050570087e-06, "loss": 0.0687, "step": 2357 }, { "epoch": 1.0727934485896269, "grad_norm": 1.561587548295498, "learning_rate": 4.453687235287169e-06, "loss": 0.133, "step": 2358 }, { "epoch": 1.0732484076433122, "grad_norm": 1.2057828723386872, "learning_rate": 4.453241260357754e-06, "loss": 0.0913, "step": 2359 }, { "epoch": 1.0737033666969973, "grad_norm": 1.666054721084408, "learning_rate": 4.452795125818283e-06, "loss": 0.0971, "step": 2360 }, { "epoch": 1.0741583257506824, "grad_norm": 1.758685408172953, "learning_rate": 4.4523488317052146e-06, "loss": 0.1075, "step": 2361 }, { "epoch": 1.0746132848043677, "grad_norm": 1.105397570856634, "learning_rate": 4.451902378055015e-06, "loss": 0.0573, "step": 2362 }, { "epoch": 1.0750682438580528, "grad_norm": 1.192901271256021, "learning_rate": 4.451455764904169e-06, "loss": 0.0809, "step": 2363 }, { "epoch": 1.0755232029117379, "grad_norm": 1.819087657943071, "learning_rate": 4.45100899228917e-06, "loss": 0.0997, "step": 2364 }, { "epoch": 1.0759781619654232, "grad_norm": 1.3969388862666674, "learning_rate": 4.4505620602465275e-06, "loss": 0.0601, "step": 2365 }, { "epoch": 1.0764331210191083, "grad_norm": 2.1004515911969937, "learning_rate": 4.450114968812761e-06, "loss": 0.1059, "step": 2366 }, { "epoch": 1.0768880800727934, "grad_norm": 1.3898874863369548, "learning_rate": 4.449667718024406e-06, "loss": 0.1217, "step": 2367 }, { "epoch": 1.0773430391264787, "grad_norm": 1.624148028385408, "learning_rate": 4.449220307918011e-06, "loss": 0.1426, "step": 2368 }, { "epoch": 1.0777979981801638, "grad_norm": 1.3957158550214264, "learning_rate": 4.448772738530134e-06, "loss": 0.065, "step": 2369 }, { "epoch": 1.078252957233849, "grad_norm": 1.2170939851594698, "learning_rate": 4.44832500989735e-06, "loss": 0.0431, "step": 2370 }, { "epoch": 1.0787079162875342, "grad_norm": 1.4145038782998978, "learning_rate": 4.447877122056243e-06, "loss": 0.0672, "step": 2371 }, { "epoch": 1.0791628753412192, "grad_norm": 1.6983412550072923, "learning_rate": 4.447429075043416e-06, "loss": 0.0645, "step": 2372 }, { "epoch": 1.0796178343949046, "grad_norm": 1.9437215682706028, "learning_rate": 4.4469808688954786e-06, "loss": 0.0798, "step": 2373 }, { "epoch": 1.0800727934485896, "grad_norm": 1.3885506691120681, "learning_rate": 4.446532503649058e-06, "loss": 0.1103, "step": 2374 }, { "epoch": 1.0805277525022747, "grad_norm": 1.3760694731918508, "learning_rate": 4.44608397934079e-06, "loss": 0.0658, "step": 2375 }, { "epoch": 1.08098271155596, "grad_norm": 1.4014742842676748, "learning_rate": 4.445635296007329e-06, "loss": 0.0777, "step": 2376 }, { "epoch": 1.0814376706096451, "grad_norm": 1.5083231204611136, "learning_rate": 4.445186453685339e-06, "loss": 0.0765, "step": 2377 }, { "epoch": 1.0818926296633302, "grad_norm": 2.31100453638565, "learning_rate": 4.444737452411494e-06, "loss": 0.1285, "step": 2378 }, { "epoch": 1.0823475887170155, "grad_norm": 2.400477978408628, "learning_rate": 4.444288292222488e-06, "loss": 0.1032, "step": 2379 }, { "epoch": 1.0828025477707006, "grad_norm": 1.2288090886103258, "learning_rate": 4.443838973155023e-06, "loss": 0.0732, "step": 2380 }, { "epoch": 1.0832575068243857, "grad_norm": 1.7401608518222071, "learning_rate": 4.443389495245816e-06, "loss": 0.1038, "step": 2381 }, { "epoch": 1.083712465878071, "grad_norm": 1.0676718989217244, "learning_rate": 4.442939858531594e-06, "loss": 0.0977, "step": 2382 }, { "epoch": 1.084167424931756, "grad_norm": 2.16417029576833, "learning_rate": 4.442490063049103e-06, "loss": 0.1247, "step": 2383 }, { "epoch": 1.0846223839854412, "grad_norm": 1.7397604358649068, "learning_rate": 4.442040108835095e-06, "loss": 0.0734, "step": 2384 }, { "epoch": 1.0850773430391265, "grad_norm": 1.3344372550818824, "learning_rate": 4.44158999592634e-06, "loss": 0.0738, "step": 2385 }, { "epoch": 1.0855323020928116, "grad_norm": 1.464102086807412, "learning_rate": 4.441139724359617e-06, "loss": 0.069, "step": 2386 }, { "epoch": 1.085987261146497, "grad_norm": 1.2702083100987853, "learning_rate": 4.440689294171724e-06, "loss": 0.0731, "step": 2387 }, { "epoch": 1.086442220200182, "grad_norm": 1.7208341236115763, "learning_rate": 4.440238705399465e-06, "loss": 0.0894, "step": 2388 }, { "epoch": 1.086897179253867, "grad_norm": 1.717461266806642, "learning_rate": 4.439787958079662e-06, "loss": 0.0913, "step": 2389 }, { "epoch": 1.0873521383075524, "grad_norm": 1.5936201417077822, "learning_rate": 4.439337052249146e-06, "loss": 0.0853, "step": 2390 }, { "epoch": 1.0878070973612375, "grad_norm": 1.5280204524637513, "learning_rate": 4.4388859879447645e-06, "loss": 0.0725, "step": 2391 }, { "epoch": 1.0882620564149226, "grad_norm": 1.7709159752994665, "learning_rate": 4.438434765203376e-06, "loss": 0.1374, "step": 2392 }, { "epoch": 1.0887170154686079, "grad_norm": 1.7267099736271705, "learning_rate": 4.4379833840618524e-06, "loss": 0.1174, "step": 2393 }, { "epoch": 1.089171974522293, "grad_norm": 1.4910726524631923, "learning_rate": 4.4375318445570785e-06, "loss": 0.0655, "step": 2394 }, { "epoch": 1.089626933575978, "grad_norm": 1.8163886098625441, "learning_rate": 4.437080146725951e-06, "loss": 0.0546, "step": 2395 }, { "epoch": 1.0900818926296634, "grad_norm": 1.2219692369480206, "learning_rate": 4.436628290605384e-06, "loss": 0.0672, "step": 2396 }, { "epoch": 1.0905368516833485, "grad_norm": 1.6116626987809923, "learning_rate": 4.436176276232297e-06, "loss": 0.1028, "step": 2397 }, { "epoch": 1.0909918107370338, "grad_norm": 2.3052452656431255, "learning_rate": 4.4357241036436294e-06, "loss": 0.0939, "step": 2398 }, { "epoch": 1.0914467697907189, "grad_norm": 0.9223535743607304, "learning_rate": 4.435271772876329e-06, "loss": 0.0689, "step": 2399 }, { "epoch": 1.091901728844404, "grad_norm": 1.531866494757431, "learning_rate": 4.434819283967359e-06, "loss": 0.1145, "step": 2400 }, { "epoch": 1.0923566878980893, "grad_norm": 2.012408668977357, "learning_rate": 4.434366636953695e-06, "loss": 0.0655, "step": 2401 }, { "epoch": 1.0928116469517744, "grad_norm": 1.4296585397558859, "learning_rate": 4.433913831872324e-06, "loss": 0.0663, "step": 2402 }, { "epoch": 1.0932666060054594, "grad_norm": 1.5463695757532308, "learning_rate": 4.43346086876025e-06, "loss": 0.1785, "step": 2403 }, { "epoch": 1.0937215650591448, "grad_norm": 2.2667173046164253, "learning_rate": 4.433007747654484e-06, "loss": 0.0963, "step": 2404 }, { "epoch": 1.0941765241128298, "grad_norm": 1.7874869125348338, "learning_rate": 4.432554468592054e-06, "loss": 0.1245, "step": 2405 }, { "epoch": 1.094631483166515, "grad_norm": 2.0669862144476387, "learning_rate": 4.432101031610001e-06, "loss": 0.1237, "step": 2406 }, { "epoch": 1.0950864422202002, "grad_norm": 1.6979511768981763, "learning_rate": 4.431647436745376e-06, "loss": 0.0888, "step": 2407 }, { "epoch": 1.0955414012738853, "grad_norm": 1.9257787054792377, "learning_rate": 4.431193684035246e-06, "loss": 0.0816, "step": 2408 }, { "epoch": 1.0959963603275704, "grad_norm": 1.541493056259052, "learning_rate": 4.43073977351669e-06, "loss": 0.0766, "step": 2409 }, { "epoch": 1.0964513193812557, "grad_norm": 2.051380197110344, "learning_rate": 4.430285705226799e-06, "loss": 0.0692, "step": 2410 }, { "epoch": 1.0969062784349408, "grad_norm": 1.574334878171295, "learning_rate": 4.429831479202676e-06, "loss": 0.0867, "step": 2411 }, { "epoch": 1.097361237488626, "grad_norm": 1.297944277206769, "learning_rate": 4.429377095481441e-06, "loss": 0.0729, "step": 2412 }, { "epoch": 1.0978161965423112, "grad_norm": 1.4644868521714023, "learning_rate": 4.428922554100221e-06, "loss": 0.1372, "step": 2413 }, { "epoch": 1.0982711555959963, "grad_norm": 1.1220705548281613, "learning_rate": 4.428467855096163e-06, "loss": 0.0775, "step": 2414 }, { "epoch": 1.0987261146496816, "grad_norm": 2.3884661536435043, "learning_rate": 4.428012998506419e-06, "loss": 0.0783, "step": 2415 }, { "epoch": 1.0991810737033667, "grad_norm": 1.3934936655417303, "learning_rate": 4.42755798436816e-06, "loss": 0.0993, "step": 2416 }, { "epoch": 1.0996360327570518, "grad_norm": 1.7787119321180418, "learning_rate": 4.427102812718568e-06, "loss": 0.0923, "step": 2417 }, { "epoch": 1.100090991810737, "grad_norm": 2.0287950182704018, "learning_rate": 4.426647483594836e-06, "loss": 0.1214, "step": 2418 }, { "epoch": 1.1005459508644222, "grad_norm": 1.2227878126042278, "learning_rate": 4.4261919970341724e-06, "loss": 0.109, "step": 2419 }, { "epoch": 1.1010009099181073, "grad_norm": 1.4547250907863465, "learning_rate": 4.425736353073798e-06, "loss": 0.0639, "step": 2420 }, { "epoch": 1.1014558689717926, "grad_norm": 1.361745944169816, "learning_rate": 4.425280551750945e-06, "loss": 0.0779, "step": 2421 }, { "epoch": 1.1019108280254777, "grad_norm": 1.4312448198815029, "learning_rate": 4.42482459310286e-06, "loss": 0.097, "step": 2422 }, { "epoch": 1.1023657870791628, "grad_norm": 1.5917118093221942, "learning_rate": 4.424368477166801e-06, "loss": 0.0981, "step": 2423 }, { "epoch": 1.102820746132848, "grad_norm": 1.4650250955165152, "learning_rate": 4.423912203980041e-06, "loss": 0.114, "step": 2424 }, { "epoch": 1.1032757051865332, "grad_norm": 1.6849750447492673, "learning_rate": 4.423455773579865e-06, "loss": 0.072, "step": 2425 }, { "epoch": 1.1037306642402185, "grad_norm": 1.678029572619772, "learning_rate": 4.422999186003568e-06, "loss": 0.0943, "step": 2426 }, { "epoch": 1.1041856232939036, "grad_norm": 1.1098076423379506, "learning_rate": 4.422542441288462e-06, "loss": 0.0731, "step": 2427 }, { "epoch": 1.1046405823475887, "grad_norm": 1.4743567185549873, "learning_rate": 4.42208553947187e-06, "loss": 0.109, "step": 2428 }, { "epoch": 1.105095541401274, "grad_norm": 1.3759474671598095, "learning_rate": 4.4216284805911275e-06, "loss": 0.0924, "step": 2429 }, { "epoch": 1.105550500454959, "grad_norm": 2.0527322032275794, "learning_rate": 4.421171264683584e-06, "loss": 0.106, "step": 2430 }, { "epoch": 1.1060054595086442, "grad_norm": 1.664729158421169, "learning_rate": 4.4207138917866e-06, "loss": 0.1339, "step": 2431 }, { "epoch": 1.1064604185623295, "grad_norm": 1.8178200019923791, "learning_rate": 4.420256361937551e-06, "loss": 0.093, "step": 2432 }, { "epoch": 1.1069153776160146, "grad_norm": 1.1183446921626512, "learning_rate": 4.419798675173824e-06, "loss": 0.0646, "step": 2433 }, { "epoch": 1.1073703366696996, "grad_norm": 1.3726858689513264, "learning_rate": 4.419340831532819e-06, "loss": 0.0813, "step": 2434 }, { "epoch": 1.107825295723385, "grad_norm": 1.3403945446236318, "learning_rate": 4.418882831051949e-06, "loss": 0.0754, "step": 2435 }, { "epoch": 1.10828025477707, "grad_norm": 1.6141383424379385, "learning_rate": 4.418424673768639e-06, "loss": 0.0661, "step": 2436 }, { "epoch": 1.1087352138307551, "grad_norm": 1.0940032242798146, "learning_rate": 4.417966359720329e-06, "loss": 0.0318, "step": 2437 }, { "epoch": 1.1091901728844404, "grad_norm": 1.3623311010378927, "learning_rate": 4.417507888944469e-06, "loss": 0.0637, "step": 2438 }, { "epoch": 1.1096451319381255, "grad_norm": 2.141865035990428, "learning_rate": 4.417049261478525e-06, "loss": 0.1037, "step": 2439 }, { "epoch": 1.1101000909918108, "grad_norm": 1.420497893607898, "learning_rate": 4.416590477359971e-06, "loss": 0.0564, "step": 2440 }, { "epoch": 1.110555050045496, "grad_norm": 1.2732829960352239, "learning_rate": 4.416131536626299e-06, "loss": 0.1076, "step": 2441 }, { "epoch": 1.111010009099181, "grad_norm": 1.4336397689648444, "learning_rate": 4.415672439315011e-06, "loss": 0.1066, "step": 2442 }, { "epoch": 1.1114649681528663, "grad_norm": 1.0286658142783538, "learning_rate": 4.415213185463623e-06, "loss": 0.0992, "step": 2443 }, { "epoch": 1.1119199272065514, "grad_norm": 1.5137672717842037, "learning_rate": 4.414753775109661e-06, "loss": 0.0474, "step": 2444 }, { "epoch": 1.1123748862602365, "grad_norm": 1.7400780554313313, "learning_rate": 4.414294208290669e-06, "loss": 0.1138, "step": 2445 }, { "epoch": 1.1128298453139218, "grad_norm": 1.644624340954533, "learning_rate": 4.413834485044199e-06, "loss": 0.08, "step": 2446 }, { "epoch": 1.113284804367607, "grad_norm": 1.4630415788998294, "learning_rate": 4.413374605407817e-06, "loss": 0.0523, "step": 2447 }, { "epoch": 1.113739763421292, "grad_norm": 1.8356228780285462, "learning_rate": 4.412914569419103e-06, "loss": 0.0811, "step": 2448 }, { "epoch": 1.1141947224749773, "grad_norm": 1.324899907458732, "learning_rate": 4.412454377115649e-06, "loss": 0.0888, "step": 2449 }, { "epoch": 1.1146496815286624, "grad_norm": 1.4895058777507912, "learning_rate": 4.411994028535061e-06, "loss": 0.1094, "step": 2450 }, { "epoch": 1.1151046405823477, "grad_norm": 1.6376764275236961, "learning_rate": 4.411533523714954e-06, "loss": 0.0661, "step": 2451 }, { "epoch": 1.1155595996360328, "grad_norm": 1.3175933666660855, "learning_rate": 4.41107286269296e-06, "loss": 0.0832, "step": 2452 }, { "epoch": 1.1160145586897179, "grad_norm": 1.4664317140231247, "learning_rate": 4.410612045506722e-06, "loss": 0.1019, "step": 2453 }, { "epoch": 1.1164695177434032, "grad_norm": 1.697124490095177, "learning_rate": 4.410151072193897e-06, "loss": 0.1164, "step": 2454 }, { "epoch": 1.1169244767970883, "grad_norm": 1.520297101782584, "learning_rate": 4.409689942792152e-06, "loss": 0.0824, "step": 2455 }, { "epoch": 1.1173794358507734, "grad_norm": 1.693914191969565, "learning_rate": 4.409228657339168e-06, "loss": 0.13, "step": 2456 }, { "epoch": 1.1178343949044587, "grad_norm": 2.024825308244833, "learning_rate": 4.4087672158726415e-06, "loss": 0.0874, "step": 2457 }, { "epoch": 1.1182893539581438, "grad_norm": 1.6218817682748383, "learning_rate": 4.408305618430277e-06, "loss": 0.0877, "step": 2458 }, { "epoch": 1.1187443130118289, "grad_norm": 2.1554598427149054, "learning_rate": 4.407843865049797e-06, "loss": 0.0932, "step": 2459 }, { "epoch": 1.1191992720655142, "grad_norm": 1.711228616600094, "learning_rate": 4.40738195576893e-06, "loss": 0.064, "step": 2460 }, { "epoch": 1.1196542311191993, "grad_norm": 1.8471856875898178, "learning_rate": 4.406919890625424e-06, "loss": 0.0987, "step": 2461 }, { "epoch": 1.1201091901728844, "grad_norm": 1.1003500159856345, "learning_rate": 4.406457669657036e-06, "loss": 0.0759, "step": 2462 }, { "epoch": 1.1205641492265697, "grad_norm": 2.109594577114758, "learning_rate": 4.405995292901537e-06, "loss": 0.0942, "step": 2463 }, { "epoch": 1.1210191082802548, "grad_norm": 1.8182386073569805, "learning_rate": 4.40553276039671e-06, "loss": 0.1389, "step": 2464 }, { "epoch": 1.1214740673339398, "grad_norm": 1.4379586293025806, "learning_rate": 4.4050700721803505e-06, "loss": 0.099, "step": 2465 }, { "epoch": 1.1219290263876252, "grad_norm": 1.4425166537042247, "learning_rate": 4.404607228290269e-06, "loss": 0.0861, "step": 2466 }, { "epoch": 1.1223839854413102, "grad_norm": 1.4093172987847846, "learning_rate": 4.404144228764285e-06, "loss": 0.0621, "step": 2467 }, { "epoch": 1.1228389444949956, "grad_norm": 1.8641838091648237, "learning_rate": 4.403681073640235e-06, "loss": 0.1364, "step": 2468 }, { "epoch": 1.1232939035486806, "grad_norm": 1.4149844792642807, "learning_rate": 4.403217762955963e-06, "loss": 0.0738, "step": 2469 }, { "epoch": 1.1237488626023657, "grad_norm": 1.167003064546788, "learning_rate": 4.402754296749331e-06, "loss": 0.1399, "step": 2470 }, { "epoch": 1.124203821656051, "grad_norm": 1.3706100775947843, "learning_rate": 4.402290675058211e-06, "loss": 0.0743, "step": 2471 }, { "epoch": 1.1246587807097361, "grad_norm": 1.3145920684357588, "learning_rate": 4.401826897920487e-06, "loss": 0.1099, "step": 2472 }, { "epoch": 1.1251137397634212, "grad_norm": 1.5982593223467985, "learning_rate": 4.4013629653740575e-06, "loss": 0.0645, "step": 2473 }, { "epoch": 1.1255686988171065, "grad_norm": 1.652131477085118, "learning_rate": 4.400898877456833e-06, "loss": 0.1091, "step": 2474 }, { "epoch": 1.1260236578707916, "grad_norm": 1.1449819643243202, "learning_rate": 4.400434634206737e-06, "loss": 0.068, "step": 2475 }, { "epoch": 1.1264786169244767, "grad_norm": 1.144310552102497, "learning_rate": 4.399970235661705e-06, "loss": 0.0685, "step": 2476 }, { "epoch": 1.126933575978162, "grad_norm": 1.2448262081573807, "learning_rate": 4.399505681859685e-06, "loss": 0.0932, "step": 2477 }, { "epoch": 1.127388535031847, "grad_norm": 1.1408663298803172, "learning_rate": 4.399040972838639e-06, "loss": 0.0423, "step": 2478 }, { "epoch": 1.1278434940855324, "grad_norm": 1.699409897859247, "learning_rate": 4.398576108636541e-06, "loss": 0.0787, "step": 2479 }, { "epoch": 1.1282984531392175, "grad_norm": 1.7864933002408017, "learning_rate": 4.398111089291378e-06, "loss": 0.0892, "step": 2480 }, { "epoch": 1.1287534121929026, "grad_norm": 2.14798840196358, "learning_rate": 4.3976459148411464e-06, "loss": 0.1009, "step": 2481 }, { "epoch": 1.129208371246588, "grad_norm": 1.5385879391737598, "learning_rate": 4.3971805853238616e-06, "loss": 0.081, "step": 2482 }, { "epoch": 1.129663330300273, "grad_norm": 2.531930467512664, "learning_rate": 4.396715100777547e-06, "loss": 0.0686, "step": 2483 }, { "epoch": 1.130118289353958, "grad_norm": 1.8968573987064818, "learning_rate": 4.39624946124024e-06, "loss": 0.1027, "step": 2484 }, { "epoch": 1.1305732484076434, "grad_norm": 1.5129833288445977, "learning_rate": 4.39578366674999e-06, "loss": 0.072, "step": 2485 }, { "epoch": 1.1310282074613285, "grad_norm": 1.4623536249588729, "learning_rate": 4.395317717344861e-06, "loss": 0.0924, "step": 2486 }, { "epoch": 1.1314831665150136, "grad_norm": 1.9901397225611637, "learning_rate": 4.394851613062927e-06, "loss": 0.0852, "step": 2487 }, { "epoch": 1.1319381255686989, "grad_norm": 1.3624251358159498, "learning_rate": 4.394385353942275e-06, "loss": 0.0543, "step": 2488 }, { "epoch": 1.132393084622384, "grad_norm": 2.097016286942742, "learning_rate": 4.393918940021008e-06, "loss": 0.1261, "step": 2489 }, { "epoch": 1.132848043676069, "grad_norm": 1.7568839339292304, "learning_rate": 4.393452371337238e-06, "loss": 0.0754, "step": 2490 }, { "epoch": 1.1333030027297544, "grad_norm": 1.4870006844681243, "learning_rate": 4.39298564792909e-06, "loss": 0.0765, "step": 2491 }, { "epoch": 1.1337579617834395, "grad_norm": 2.3747689669640204, "learning_rate": 4.392518769834705e-06, "loss": 0.1088, "step": 2492 }, { "epoch": 1.1342129208371245, "grad_norm": 1.8391194648070115, "learning_rate": 4.392051737092231e-06, "loss": 0.1038, "step": 2493 }, { "epoch": 1.1346678798908099, "grad_norm": 1.3181948862231594, "learning_rate": 4.391584549739834e-06, "loss": 0.0953, "step": 2494 }, { "epoch": 1.135122838944495, "grad_norm": 1.768253423337537, "learning_rate": 4.391117207815691e-06, "loss": 0.0861, "step": 2495 }, { "epoch": 1.1355777979981803, "grad_norm": 1.7733681614801209, "learning_rate": 4.3906497113579895e-06, "loss": 0.0869, "step": 2496 }, { "epoch": 1.1360327570518653, "grad_norm": 1.7107321819304122, "learning_rate": 4.390182060404931e-06, "loss": 0.0522, "step": 2497 }, { "epoch": 1.1364877161055504, "grad_norm": 1.434552421646011, "learning_rate": 4.389714254994732e-06, "loss": 0.0846, "step": 2498 }, { "epoch": 1.1369426751592357, "grad_norm": 1.5226850377251067, "learning_rate": 4.389246295165617e-06, "loss": 0.083, "step": 2499 }, { "epoch": 1.1373976342129208, "grad_norm": 1.1587798025261624, "learning_rate": 4.388778180955826e-06, "loss": 0.0715, "step": 2500 }, { "epoch": 1.137852593266606, "grad_norm": 2.2145425207872735, "learning_rate": 4.388309912403612e-06, "loss": 0.126, "step": 2501 }, { "epoch": 1.1383075523202912, "grad_norm": 1.860918476304708, "learning_rate": 4.38784148954724e-06, "loss": 0.0825, "step": 2502 }, { "epoch": 1.1387625113739763, "grad_norm": 1.5494754816427427, "learning_rate": 4.387372912424987e-06, "loss": 0.0664, "step": 2503 }, { "epoch": 1.1392174704276614, "grad_norm": 1.4756280948745337, "learning_rate": 4.386904181075142e-06, "loss": 0.1292, "step": 2504 }, { "epoch": 1.1396724294813467, "grad_norm": 1.4970335285969478, "learning_rate": 4.386435295536008e-06, "loss": 0.0617, "step": 2505 }, { "epoch": 1.1401273885350318, "grad_norm": 1.3926364015804897, "learning_rate": 4.385966255845902e-06, "loss": 0.0978, "step": 2506 }, { "epoch": 1.1405823475887171, "grad_norm": 1.392316755067547, "learning_rate": 4.38549706204315e-06, "loss": 0.1051, "step": 2507 }, { "epoch": 1.1410373066424022, "grad_norm": 1.337875750299131, "learning_rate": 4.385027714166094e-06, "loss": 0.0818, "step": 2508 }, { "epoch": 1.1414922656960873, "grad_norm": 1.7636561267412383, "learning_rate": 4.384558212253084e-06, "loss": 0.058, "step": 2509 }, { "epoch": 1.1419472247497726, "grad_norm": 1.4667430941313127, "learning_rate": 4.384088556342488e-06, "loss": 0.0757, "step": 2510 }, { "epoch": 1.1424021838034577, "grad_norm": 1.4237110238919748, "learning_rate": 4.383618746472686e-06, "loss": 0.0769, "step": 2511 }, { "epoch": 1.1428571428571428, "grad_norm": 1.5730790632789893, "learning_rate": 4.383148782682064e-06, "loss": 0.0653, "step": 2512 }, { "epoch": 1.143312101910828, "grad_norm": 1.4241196656590642, "learning_rate": 4.382678665009028e-06, "loss": 0.1399, "step": 2513 }, { "epoch": 1.1437670609645132, "grad_norm": 1.343619807338348, "learning_rate": 4.382208393491994e-06, "loss": 0.1179, "step": 2514 }, { "epoch": 1.1442220200181983, "grad_norm": 1.5009441966445611, "learning_rate": 4.381737968169389e-06, "loss": 0.0771, "step": 2515 }, { "epoch": 1.1446769790718836, "grad_norm": 1.986426705123048, "learning_rate": 4.381267389079657e-06, "loss": 0.0701, "step": 2516 }, { "epoch": 1.1451319381255687, "grad_norm": 1.55910702321473, "learning_rate": 4.380796656261248e-06, "loss": 0.0972, "step": 2517 }, { "epoch": 1.1455868971792538, "grad_norm": 1.3317020576259018, "learning_rate": 4.38032576975263e-06, "loss": 0.0611, "step": 2518 }, { "epoch": 1.146041856232939, "grad_norm": 1.2157043472122377, "learning_rate": 4.3798547295922825e-06, "loss": 0.0699, "step": 2519 }, { "epoch": 1.1464968152866242, "grad_norm": 2.724328439334893, "learning_rate": 4.3793835358186955e-06, "loss": 0.0797, "step": 2520 }, { "epoch": 1.1469517743403093, "grad_norm": 1.7128126611421937, "learning_rate": 4.378912188470374e-06, "loss": 0.1045, "step": 2521 }, { "epoch": 1.1474067333939946, "grad_norm": 1.4469267749443473, "learning_rate": 4.378440687585832e-06, "loss": 0.0924, "step": 2522 }, { "epoch": 1.1478616924476797, "grad_norm": 1.8130770437623378, "learning_rate": 4.3779690332036005e-06, "loss": 0.1218, "step": 2523 }, { "epoch": 1.148316651501365, "grad_norm": 1.7468548582501024, "learning_rate": 4.3774972253622205e-06, "loss": 0.1111, "step": 2524 }, { "epoch": 1.14877161055505, "grad_norm": 1.4797480492586725, "learning_rate": 4.377025264100246e-06, "loss": 0.0854, "step": 2525 }, { "epoch": 1.1492265696087351, "grad_norm": 1.7116967965378072, "learning_rate": 4.376553149456244e-06, "loss": 0.0594, "step": 2526 }, { "epoch": 1.1496815286624205, "grad_norm": 1.643705257307874, "learning_rate": 4.376080881468793e-06, "loss": 0.0696, "step": 2527 }, { "epoch": 1.1501364877161055, "grad_norm": 1.1326114868014416, "learning_rate": 4.375608460176483e-06, "loss": 0.0705, "step": 2528 }, { "epoch": 1.1505914467697906, "grad_norm": 1.7031789207462111, "learning_rate": 4.375135885617922e-06, "loss": 0.0812, "step": 2529 }, { "epoch": 1.151046405823476, "grad_norm": 1.41010135204267, "learning_rate": 4.3746631578317236e-06, "loss": 0.086, "step": 2530 }, { "epoch": 1.151501364877161, "grad_norm": 1.6943016984534656, "learning_rate": 4.374190276856517e-06, "loss": 0.0754, "step": 2531 }, { "epoch": 1.1519563239308463, "grad_norm": 2.0617449393261165, "learning_rate": 4.373717242730946e-06, "loss": 0.09, "step": 2532 }, { "epoch": 1.1524112829845314, "grad_norm": 1.7367594980944636, "learning_rate": 4.373244055493663e-06, "loss": 0.0623, "step": 2533 }, { "epoch": 1.1528662420382165, "grad_norm": 1.9342760133428794, "learning_rate": 4.372770715183336e-06, "loss": 0.1147, "step": 2534 }, { "epoch": 1.1533212010919018, "grad_norm": 2.0637174188437255, "learning_rate": 4.372297221838642e-06, "loss": 0.1456, "step": 2535 }, { "epoch": 1.153776160145587, "grad_norm": 1.640815829478928, "learning_rate": 4.3718235754982755e-06, "loss": 0.1097, "step": 2536 }, { "epoch": 1.154231119199272, "grad_norm": 1.4969972221702579, "learning_rate": 4.371349776200939e-06, "loss": 0.1089, "step": 2537 }, { "epoch": 1.1546860782529573, "grad_norm": 1.7453973329666645, "learning_rate": 4.37087582398535e-06, "loss": 0.081, "step": 2538 }, { "epoch": 1.1551410373066424, "grad_norm": 1.3301344902434764, "learning_rate": 4.370401718890237e-06, "loss": 0.0839, "step": 2539 }, { "epoch": 1.1555959963603275, "grad_norm": 1.3726509501801365, "learning_rate": 4.369927460954342e-06, "loss": 0.0757, "step": 2540 }, { "epoch": 1.1560509554140128, "grad_norm": 1.7575525897527056, "learning_rate": 4.36945305021642e-06, "loss": 0.0984, "step": 2541 }, { "epoch": 1.156505914467698, "grad_norm": 1.0573468860101436, "learning_rate": 4.368978486715237e-06, "loss": 0.0858, "step": 2542 }, { "epoch": 1.156960873521383, "grad_norm": 1.2811400584279555, "learning_rate": 4.368503770489573e-06, "loss": 0.0956, "step": 2543 }, { "epoch": 1.1574158325750683, "grad_norm": 1.3937719698326214, "learning_rate": 4.368028901578218e-06, "loss": 0.0721, "step": 2544 }, { "epoch": 1.1578707916287534, "grad_norm": 1.3592341439150106, "learning_rate": 4.367553880019977e-06, "loss": 0.072, "step": 2545 }, { "epoch": 1.1583257506824385, "grad_norm": 1.6455271567667071, "learning_rate": 4.367078705853667e-06, "loss": 0.0688, "step": 2546 }, { "epoch": 1.1587807097361238, "grad_norm": 1.6810345974728753, "learning_rate": 4.366603379118117e-06, "loss": 0.1038, "step": 2547 }, { "epoch": 1.1592356687898089, "grad_norm": 1.4578278036788574, "learning_rate": 4.366127899852169e-06, "loss": 0.0865, "step": 2548 }, { "epoch": 1.159690627843494, "grad_norm": 1.3103780377545284, "learning_rate": 4.365652268094675e-06, "loss": 0.0674, "step": 2549 }, { "epoch": 1.1601455868971793, "grad_norm": 1.7957120553998775, "learning_rate": 4.365176483884504e-06, "loss": 0.1312, "step": 2550 }, { "epoch": 1.1606005459508644, "grad_norm": 1.6492238946584739, "learning_rate": 4.364700547260533e-06, "loss": 0.0907, "step": 2551 }, { "epoch": 1.1610555050045497, "grad_norm": 1.3864243311454894, "learning_rate": 4.3642244582616545e-06, "loss": 0.0977, "step": 2552 }, { "epoch": 1.1615104640582348, "grad_norm": 1.5321223648985156, "learning_rate": 4.363748216926772e-06, "loss": 0.0975, "step": 2553 }, { "epoch": 1.1619654231119199, "grad_norm": 1.428088888774431, "learning_rate": 4.363271823294802e-06, "loss": 0.1138, "step": 2554 }, { "epoch": 1.1624203821656052, "grad_norm": 1.9030961957887997, "learning_rate": 4.362795277404673e-06, "loss": 0.1121, "step": 2555 }, { "epoch": 1.1628753412192903, "grad_norm": 1.1462755051031488, "learning_rate": 4.362318579295326e-06, "loss": 0.0467, "step": 2556 }, { "epoch": 1.1633303002729753, "grad_norm": 1.4980767963568005, "learning_rate": 4.361841729005715e-06, "loss": 0.1018, "step": 2557 }, { "epoch": 1.1637852593266607, "grad_norm": 2.2145503141446614, "learning_rate": 4.361364726574806e-06, "loss": 0.0853, "step": 2558 }, { "epoch": 1.1642402183803457, "grad_norm": 1.1989117424823872, "learning_rate": 4.360887572041578e-06, "loss": 0.0868, "step": 2559 }, { "epoch": 1.164695177434031, "grad_norm": 1.9066512245156881, "learning_rate": 4.36041026544502e-06, "loss": 0.1471, "step": 2560 }, { "epoch": 1.1651501364877161, "grad_norm": 1.428837377276699, "learning_rate": 4.359932806824138e-06, "loss": 0.0718, "step": 2561 }, { "epoch": 1.1656050955414012, "grad_norm": 1.417125208635274, "learning_rate": 4.359455196217946e-06, "loss": 0.0614, "step": 2562 }, { "epoch": 1.1660600545950865, "grad_norm": 1.6663939403921464, "learning_rate": 4.358977433665471e-06, "loss": 0.0586, "step": 2563 }, { "epoch": 1.1665150136487716, "grad_norm": 1.3921354785427886, "learning_rate": 4.3584995192057565e-06, "loss": 0.0691, "step": 2564 }, { "epoch": 1.1669699727024567, "grad_norm": 1.1683109281081594, "learning_rate": 4.358021452877854e-06, "loss": 0.0952, "step": 2565 }, { "epoch": 1.167424931756142, "grad_norm": 1.5985810446894706, "learning_rate": 4.357543234720829e-06, "loss": 0.0771, "step": 2566 }, { "epoch": 1.1678798908098271, "grad_norm": 1.726758001874974, "learning_rate": 4.357064864773761e-06, "loss": 0.0852, "step": 2567 }, { "epoch": 1.1683348498635122, "grad_norm": 1.376146728666042, "learning_rate": 4.3565863430757375e-06, "loss": 0.0816, "step": 2568 }, { "epoch": 1.1687898089171975, "grad_norm": 1.266164839412077, "learning_rate": 4.356107669665862e-06, "loss": 0.095, "step": 2569 }, { "epoch": 1.1692447679708826, "grad_norm": 1.7363433482517434, "learning_rate": 4.355628844583249e-06, "loss": 0.1348, "step": 2570 }, { "epoch": 1.1696997270245677, "grad_norm": 1.5900315387927095, "learning_rate": 4.355149867867029e-06, "loss": 0.0785, "step": 2571 }, { "epoch": 1.170154686078253, "grad_norm": 1.7031570854225535, "learning_rate": 4.354670739556338e-06, "loss": 0.0903, "step": 2572 }, { "epoch": 1.170609645131938, "grad_norm": 1.553459320102983, "learning_rate": 4.35419145969033e-06, "loss": 0.0808, "step": 2573 }, { "epoch": 1.1710646041856232, "grad_norm": 1.624748274996521, "learning_rate": 4.35371202830817e-06, "loss": 0.0946, "step": 2574 }, { "epoch": 1.1715195632393085, "grad_norm": 1.998220943026382, "learning_rate": 4.353232445449034e-06, "loss": 0.1007, "step": 2575 }, { "epoch": 1.1719745222929936, "grad_norm": 1.3879277679859046, "learning_rate": 4.352752711152112e-06, "loss": 0.0752, "step": 2576 }, { "epoch": 1.1724294813466787, "grad_norm": 2.043253151446217, "learning_rate": 4.352272825456605e-06, "loss": 0.1392, "step": 2577 }, { "epoch": 1.172884440400364, "grad_norm": 1.4430794602564747, "learning_rate": 4.3517927884017275e-06, "loss": 0.1071, "step": 2578 }, { "epoch": 1.173339399454049, "grad_norm": 1.3026567584819855, "learning_rate": 4.351312600026706e-06, "loss": 0.0907, "step": 2579 }, { "epoch": 1.1737943585077344, "grad_norm": 1.4101005705511307, "learning_rate": 4.350832260370779e-06, "loss": 0.1012, "step": 2580 }, { "epoch": 1.1742493175614195, "grad_norm": 1.3419121345653944, "learning_rate": 4.350351769473198e-06, "loss": 0.0696, "step": 2581 }, { "epoch": 1.1747042766151046, "grad_norm": 1.350413613603601, "learning_rate": 4.349871127373226e-06, "loss": 0.0917, "step": 2582 }, { "epoch": 1.1751592356687899, "grad_norm": 1.5328058199569599, "learning_rate": 4.349390334110141e-06, "loss": 0.1113, "step": 2583 }, { "epoch": 1.175614194722475, "grad_norm": 1.1093873947356732, "learning_rate": 4.348909389723228e-06, "loss": 0.0659, "step": 2584 }, { "epoch": 1.17606915377616, "grad_norm": 1.6756868000210596, "learning_rate": 4.348428294251791e-06, "loss": 0.0998, "step": 2585 }, { "epoch": 1.1765241128298454, "grad_norm": 1.4020895191217355, "learning_rate": 4.34794704773514e-06, "loss": 0.0756, "step": 2586 }, { "epoch": 1.1769790718835305, "grad_norm": 1.619901575556969, "learning_rate": 4.347465650212602e-06, "loss": 0.1049, "step": 2587 }, { "epoch": 1.1774340309372158, "grad_norm": 1.2820911146358447, "learning_rate": 4.346984101723513e-06, "loss": 0.099, "step": 2588 }, { "epoch": 1.1778889899909009, "grad_norm": 1.5114352969050147, "learning_rate": 4.3465024023072255e-06, "loss": 0.1257, "step": 2589 }, { "epoch": 1.178343949044586, "grad_norm": 1.3539463988206946, "learning_rate": 4.3460205520031006e-06, "loss": 0.0593, "step": 2590 }, { "epoch": 1.1787989080982713, "grad_norm": 1.951842216649359, "learning_rate": 4.345538550850512e-06, "loss": 0.1236, "step": 2591 }, { "epoch": 1.1792538671519563, "grad_norm": 1.8285849146657949, "learning_rate": 4.345056398888847e-06, "loss": 0.0928, "step": 2592 }, { "epoch": 1.1797088262056414, "grad_norm": 1.5041066242121004, "learning_rate": 4.3445740961575066e-06, "loss": 0.0687, "step": 2593 }, { "epoch": 1.1801637852593267, "grad_norm": 1.6575747108346124, "learning_rate": 4.3440916426959e-06, "loss": 0.0904, "step": 2594 }, { "epoch": 1.1806187443130118, "grad_norm": 1.3214979838016756, "learning_rate": 4.343609038543452e-06, "loss": 0.0899, "step": 2595 }, { "epoch": 1.181073703366697, "grad_norm": 1.4859231565076656, "learning_rate": 4.3431262837396e-06, "loss": 0.0978, "step": 2596 }, { "epoch": 1.1815286624203822, "grad_norm": 1.6150637319977543, "learning_rate": 4.342643378323791e-06, "loss": 0.0842, "step": 2597 }, { "epoch": 1.1819836214740673, "grad_norm": 1.413038987453138, "learning_rate": 4.342160322335487e-06, "loss": 0.0654, "step": 2598 }, { "epoch": 1.1824385805277524, "grad_norm": 2.182860548460036, "learning_rate": 4.34167711581416e-06, "loss": 0.0841, "step": 2599 }, { "epoch": 1.1828935395814377, "grad_norm": 1.275297167024451, "learning_rate": 4.3411937587992955e-06, "loss": 0.0722, "step": 2600 }, { "epoch": 1.1833484986351228, "grad_norm": 1.1799530738898074, "learning_rate": 4.340710251330393e-06, "loss": 0.0662, "step": 2601 }, { "epoch": 1.183803457688808, "grad_norm": 1.872220715095368, "learning_rate": 4.34022659344696e-06, "loss": 0.1292, "step": 2602 }, { "epoch": 1.1842584167424932, "grad_norm": 1.6772862778704278, "learning_rate": 4.339742785188521e-06, "loss": 0.0966, "step": 2603 }, { "epoch": 1.1847133757961783, "grad_norm": 1.6082753483614305, "learning_rate": 4.339258826594611e-06, "loss": 0.0582, "step": 2604 }, { "epoch": 1.1851683348498634, "grad_norm": 1.6117792608004555, "learning_rate": 4.338774717704774e-06, "loss": 0.0643, "step": 2605 }, { "epoch": 1.1856232939035487, "grad_norm": 1.7422517232972539, "learning_rate": 4.338290458558572e-06, "loss": 0.1766, "step": 2606 }, { "epoch": 1.1860782529572338, "grad_norm": 2.1476781837506818, "learning_rate": 4.3378060491955744e-06, "loss": 0.1463, "step": 2607 }, { "epoch": 1.186533212010919, "grad_norm": 1.8922581543540133, "learning_rate": 4.337321489655366e-06, "loss": 0.1528, "step": 2608 }, { "epoch": 1.1869881710646042, "grad_norm": 1.7516502810489014, "learning_rate": 4.336836779977543e-06, "loss": 0.1038, "step": 2609 }, { "epoch": 1.1874431301182893, "grad_norm": 1.4511814170214454, "learning_rate": 4.336351920201714e-06, "loss": 0.1005, "step": 2610 }, { "epoch": 1.1878980891719746, "grad_norm": 1.5620930461894496, "learning_rate": 4.335866910367498e-06, "loss": 0.0492, "step": 2611 }, { "epoch": 1.1883530482256597, "grad_norm": 2.7082970498760117, "learning_rate": 4.3353817505145294e-06, "loss": 0.0909, "step": 2612 }, { "epoch": 1.1888080072793448, "grad_norm": 1.5743219982804768, "learning_rate": 4.334896440682452e-06, "loss": 0.077, "step": 2613 }, { "epoch": 1.18926296633303, "grad_norm": 1.3966339148129352, "learning_rate": 4.334410980910924e-06, "loss": 0.1218, "step": 2614 }, { "epoch": 1.1897179253867152, "grad_norm": 1.4856452151376027, "learning_rate": 4.333925371239615e-06, "loss": 0.1035, "step": 2615 }, { "epoch": 1.1901728844404005, "grad_norm": 1.6127438575709883, "learning_rate": 4.3334396117082065e-06, "loss": 0.1052, "step": 2616 }, { "epoch": 1.1906278434940856, "grad_norm": 1.7288330036362787, "learning_rate": 4.332953702356393e-06, "loss": 0.1607, "step": 2617 }, { "epoch": 1.1910828025477707, "grad_norm": 1.2779780017213267, "learning_rate": 4.33246764322388e-06, "loss": 0.0664, "step": 2618 }, { "epoch": 1.191537761601456, "grad_norm": 1.843632743904082, "learning_rate": 4.331981434350387e-06, "loss": 0.1535, "step": 2619 }, { "epoch": 1.191992720655141, "grad_norm": 1.3210812550635276, "learning_rate": 4.331495075775644e-06, "loss": 0.1404, "step": 2620 }, { "epoch": 1.1924476797088261, "grad_norm": 1.3878492439329282, "learning_rate": 4.331008567539395e-06, "loss": 0.0747, "step": 2621 }, { "epoch": 1.1929026387625115, "grad_norm": 1.3357463507965919, "learning_rate": 4.330521909681394e-06, "loss": 0.0766, "step": 2622 }, { "epoch": 1.1933575978161965, "grad_norm": 1.6211605147229922, "learning_rate": 4.330035102241409e-06, "loss": 0.1197, "step": 2623 }, { "epoch": 1.1938125568698816, "grad_norm": 1.496864935979414, "learning_rate": 4.32954814525922e-06, "loss": 0.0701, "step": 2624 }, { "epoch": 1.194267515923567, "grad_norm": 1.3041113510202, "learning_rate": 4.329061038774619e-06, "loss": 0.071, "step": 2625 }, { "epoch": 1.194722474977252, "grad_norm": 1.3390637893903103, "learning_rate": 4.32857378282741e-06, "loss": 0.0951, "step": 2626 }, { "epoch": 1.1951774340309371, "grad_norm": 1.3209742325562313, "learning_rate": 4.328086377457409e-06, "loss": 0.0844, "step": 2627 }, { "epoch": 1.1956323930846224, "grad_norm": 1.8118172786335158, "learning_rate": 4.327598822704444e-06, "loss": 0.1175, "step": 2628 }, { "epoch": 1.1960873521383075, "grad_norm": 1.6299368669430234, "learning_rate": 4.327111118608357e-06, "loss": 0.1467, "step": 2629 }, { "epoch": 1.1965423111919926, "grad_norm": 1.5688063002459107, "learning_rate": 4.326623265209001e-06, "loss": 0.0803, "step": 2630 }, { "epoch": 1.196997270245678, "grad_norm": 1.6465294755773725, "learning_rate": 4.326135262546241e-06, "loss": 0.0705, "step": 2631 }, { "epoch": 1.197452229299363, "grad_norm": 1.6238105525738482, "learning_rate": 4.325647110659954e-06, "loss": 0.1254, "step": 2632 }, { "epoch": 1.197907188353048, "grad_norm": 1.7891444626148267, "learning_rate": 4.325158809590028e-06, "loss": 0.0718, "step": 2633 }, { "epoch": 1.1983621474067334, "grad_norm": 1.047556103709193, "learning_rate": 4.324670359376368e-06, "loss": 0.0548, "step": 2634 }, { "epoch": 1.1988171064604185, "grad_norm": 1.4266407858751808, "learning_rate": 4.3241817600588865e-06, "loss": 0.0799, "step": 2635 }, { "epoch": 1.1992720655141038, "grad_norm": 1.0758052671422083, "learning_rate": 4.3236930116775086e-06, "loss": 0.0469, "step": 2636 }, { "epoch": 1.199727024567789, "grad_norm": 1.8000162783707994, "learning_rate": 4.323204114272174e-06, "loss": 0.1349, "step": 2637 }, { "epoch": 1.200181983621474, "grad_norm": 2.2216878566032836, "learning_rate": 4.3227150678828335e-06, "loss": 0.1198, "step": 2638 }, { "epoch": 1.2006369426751593, "grad_norm": 1.674728333776232, "learning_rate": 4.322225872549448e-06, "loss": 0.1025, "step": 2639 }, { "epoch": 1.2010919017288444, "grad_norm": 1.689368542839076, "learning_rate": 4.321736528311994e-06, "loss": 0.1048, "step": 2640 }, { "epoch": 1.2015468607825295, "grad_norm": 1.4354075881450123, "learning_rate": 4.321247035210456e-06, "loss": 0.0692, "step": 2641 }, { "epoch": 1.2020018198362148, "grad_norm": 1.6563738642729477, "learning_rate": 4.320757393284837e-06, "loss": 0.0767, "step": 2642 }, { "epoch": 1.2024567788898999, "grad_norm": 1.379611923602435, "learning_rate": 4.3202676025751455e-06, "loss": 0.0591, "step": 2643 }, { "epoch": 1.2029117379435852, "grad_norm": 1.6479290456698004, "learning_rate": 4.319777663121406e-06, "loss": 0.0961, "step": 2644 }, { "epoch": 1.2033666969972703, "grad_norm": 1.9415821059711678, "learning_rate": 4.319287574963653e-06, "loss": 0.1624, "step": 2645 }, { "epoch": 1.2038216560509554, "grad_norm": 1.5187755572188995, "learning_rate": 4.318797338141936e-06, "loss": 0.0799, "step": 2646 }, { "epoch": 1.2042766151046407, "grad_norm": 1.2261158559841066, "learning_rate": 4.318306952696314e-06, "loss": 0.0789, "step": 2647 }, { "epoch": 1.2047315741583258, "grad_norm": 1.5350997195388667, "learning_rate": 4.317816418666859e-06, "loss": 0.0648, "step": 2648 }, { "epoch": 1.2051865332120109, "grad_norm": 2.0282859482323135, "learning_rate": 4.317325736093656e-06, "loss": 0.1003, "step": 2649 }, { "epoch": 1.2056414922656962, "grad_norm": 1.099438335437198, "learning_rate": 4.316834905016801e-06, "loss": 0.0749, "step": 2650 }, { "epoch": 1.2060964513193813, "grad_norm": 1.6955258737212886, "learning_rate": 4.3163439254764015e-06, "loss": 0.0799, "step": 2651 }, { "epoch": 1.2065514103730663, "grad_norm": 1.4782312844645842, "learning_rate": 4.31585279751258e-06, "loss": 0.0812, "step": 2652 }, { "epoch": 1.2070063694267517, "grad_norm": 0.962225205333111, "learning_rate": 4.315361521165467e-06, "loss": 0.0421, "step": 2653 }, { "epoch": 1.2074613284804367, "grad_norm": 1.475944438171979, "learning_rate": 4.314870096475209e-06, "loss": 0.0797, "step": 2654 }, { "epoch": 1.2079162875341218, "grad_norm": 1.9568750202890988, "learning_rate": 4.3143785234819624e-06, "loss": 0.1064, "step": 2655 }, { "epoch": 1.2083712465878071, "grad_norm": 1.2968330567546162, "learning_rate": 4.3138868022258974e-06, "loss": 0.0541, "step": 2656 }, { "epoch": 1.2088262056414922, "grad_norm": 1.3512605939635933, "learning_rate": 4.313394932747194e-06, "loss": 0.084, "step": 2657 }, { "epoch": 1.2092811646951773, "grad_norm": 1.2788458917599885, "learning_rate": 4.312902915086045e-06, "loss": 0.078, "step": 2658 }, { "epoch": 1.2097361237488626, "grad_norm": 1.2087340265742859, "learning_rate": 4.312410749282658e-06, "loss": 0.083, "step": 2659 }, { "epoch": 1.2101910828025477, "grad_norm": 1.51675138627556, "learning_rate": 4.311918435377248e-06, "loss": 0.098, "step": 2660 }, { "epoch": 1.210646041856233, "grad_norm": 1.767606141999641, "learning_rate": 4.311425973410047e-06, "loss": 0.1403, "step": 2661 }, { "epoch": 1.2111010009099181, "grad_norm": 1.8607859425213837, "learning_rate": 4.310933363421296e-06, "loss": 0.1002, "step": 2662 }, { "epoch": 1.2115559599636032, "grad_norm": 2.188295719120762, "learning_rate": 4.310440605451248e-06, "loss": 0.1062, "step": 2663 }, { "epoch": 1.2120109190172885, "grad_norm": 1.6007893169355347, "learning_rate": 4.30994769954017e-06, "loss": 0.0855, "step": 2664 }, { "epoch": 1.2124658780709736, "grad_norm": 1.7264264512353125, "learning_rate": 4.30945464572834e-06, "loss": 0.1561, "step": 2665 }, { "epoch": 1.2129208371246587, "grad_norm": 1.4708066988612976, "learning_rate": 4.3089614440560465e-06, "loss": 0.0607, "step": 2666 }, { "epoch": 1.213375796178344, "grad_norm": 1.5600890024513265, "learning_rate": 4.3084680945635946e-06, "loss": 0.1364, "step": 2667 }, { "epoch": 1.213830755232029, "grad_norm": 1.876498244558624, "learning_rate": 4.307974597291296e-06, "loss": 0.1076, "step": 2668 }, { "epoch": 1.2142857142857142, "grad_norm": 1.37065103914952, "learning_rate": 4.307480952279478e-06, "loss": 0.0523, "step": 2669 }, { "epoch": 1.2147406733393995, "grad_norm": 1.4444820040999051, "learning_rate": 4.3069871595684795e-06, "loss": 0.0739, "step": 2670 }, { "epoch": 1.2151956323930846, "grad_norm": 1.5069719193608038, "learning_rate": 4.30649321919865e-06, "loss": 0.0911, "step": 2671 }, { "epoch": 1.21565059144677, "grad_norm": 1.2934622383879057, "learning_rate": 4.305999131210353e-06, "loss": 0.0837, "step": 2672 }, { "epoch": 1.216105550500455, "grad_norm": 1.5853581830621495, "learning_rate": 4.305504895643963e-06, "loss": 0.0833, "step": 2673 }, { "epoch": 1.21656050955414, "grad_norm": 1.3709517382273528, "learning_rate": 4.305010512539867e-06, "loss": 0.1159, "step": 2674 }, { "epoch": 1.2170154686078254, "grad_norm": 1.4168456459509742, "learning_rate": 4.304515981938462e-06, "loss": 0.0606, "step": 2675 }, { "epoch": 1.2174704276615105, "grad_norm": 1.5616363029677887, "learning_rate": 4.304021303880161e-06, "loss": 0.0996, "step": 2676 }, { "epoch": 1.2179253867151956, "grad_norm": 1.708179628273713, "learning_rate": 4.303526478405386e-06, "loss": 0.1065, "step": 2677 }, { "epoch": 1.2183803457688809, "grad_norm": 2.116672264038859, "learning_rate": 4.3030315055545715e-06, "loss": 0.128, "step": 2678 }, { "epoch": 1.218835304822566, "grad_norm": 1.6986733358840764, "learning_rate": 4.302536385368165e-06, "loss": 0.082, "step": 2679 }, { "epoch": 1.219290263876251, "grad_norm": 1.6851973141425958, "learning_rate": 4.3020411178866246e-06, "loss": 0.0666, "step": 2680 }, { "epoch": 1.2197452229299364, "grad_norm": 1.3268862435295075, "learning_rate": 4.3015457031504226e-06, "loss": 0.0615, "step": 2681 }, { "epoch": 1.2202001819836215, "grad_norm": 2.894618285414545, "learning_rate": 4.301050141200041e-06, "loss": 0.1161, "step": 2682 }, { "epoch": 1.2206551410373065, "grad_norm": 1.8518976016980668, "learning_rate": 4.300554432075975e-06, "loss": 0.0677, "step": 2683 }, { "epoch": 1.2211101000909919, "grad_norm": 1.9252846318661894, "learning_rate": 4.300058575818733e-06, "loss": 0.1195, "step": 2684 }, { "epoch": 1.221565059144677, "grad_norm": 1.7916218908549502, "learning_rate": 4.299562572468833e-06, "loss": 0.1264, "step": 2685 }, { "epoch": 1.222020018198362, "grad_norm": 1.3194566331820348, "learning_rate": 4.299066422066807e-06, "loss": 0.044, "step": 2686 }, { "epoch": 1.2224749772520473, "grad_norm": 1.702059632495899, "learning_rate": 4.2985701246531965e-06, "loss": 0.1094, "step": 2687 }, { "epoch": 1.2229299363057324, "grad_norm": 1.3985606136942172, "learning_rate": 4.2980736802685575e-06, "loss": 0.0476, "step": 2688 }, { "epoch": 1.2233848953594177, "grad_norm": 1.8905242980121515, "learning_rate": 4.297577088953458e-06, "loss": 0.0676, "step": 2689 }, { "epoch": 1.2238398544131028, "grad_norm": 0.8842330436141602, "learning_rate": 4.2970803507484756e-06, "loss": 0.0528, "step": 2690 }, { "epoch": 1.224294813466788, "grad_norm": 1.5087671057266334, "learning_rate": 4.296583465694204e-06, "loss": 0.0781, "step": 2691 }, { "epoch": 1.2247497725204732, "grad_norm": 2.1139760440967112, "learning_rate": 4.296086433831244e-06, "loss": 0.0995, "step": 2692 }, { "epoch": 1.2252047315741583, "grad_norm": 1.3607345905968589, "learning_rate": 4.295589255200212e-06, "loss": 0.0842, "step": 2693 }, { "epoch": 1.2256596906278434, "grad_norm": 1.7864471189286306, "learning_rate": 4.295091929841734e-06, "loss": 0.0839, "step": 2694 }, { "epoch": 1.2261146496815287, "grad_norm": 1.4725627389737213, "learning_rate": 4.2945944577964516e-06, "loss": 0.1817, "step": 2695 }, { "epoch": 1.2265696087352138, "grad_norm": 1.1876699089763878, "learning_rate": 4.294096839105013e-06, "loss": 0.0614, "step": 2696 }, { "epoch": 1.2270245677888991, "grad_norm": 1.4225833533824312, "learning_rate": 4.293599073808083e-06, "loss": 0.0796, "step": 2697 }, { "epoch": 1.2274795268425842, "grad_norm": 1.3288722678195426, "learning_rate": 4.293101161946337e-06, "loss": 0.0555, "step": 2698 }, { "epoch": 1.2279344858962693, "grad_norm": 1.2424148095147949, "learning_rate": 4.292603103560462e-06, "loss": 0.0488, "step": 2699 }, { "epoch": 1.2283894449499546, "grad_norm": 1.2746073892843495, "learning_rate": 4.292104898691157e-06, "loss": 0.0965, "step": 2700 }, { "epoch": 1.2288444040036397, "grad_norm": 1.9553417584027957, "learning_rate": 4.291606547379131e-06, "loss": 0.0863, "step": 2701 }, { "epoch": 1.2292993630573248, "grad_norm": 1.6292687158685326, "learning_rate": 4.291108049665109e-06, "loss": 0.1039, "step": 2702 }, { "epoch": 1.22975432211101, "grad_norm": 1.6141920925692421, "learning_rate": 4.290609405589827e-06, "loss": 0.0702, "step": 2703 }, { "epoch": 1.2302092811646952, "grad_norm": 1.568358524006938, "learning_rate": 4.29011061519403e-06, "loss": 0.1305, "step": 2704 }, { "epoch": 1.2306642402183803, "grad_norm": 1.5832578242534308, "learning_rate": 4.289611678518478e-06, "loss": 0.0943, "step": 2705 }, { "epoch": 1.2311191992720656, "grad_norm": 1.7204606734278, "learning_rate": 4.289112595603941e-06, "loss": 0.1271, "step": 2706 }, { "epoch": 1.2315741583257507, "grad_norm": 1.878311333320497, "learning_rate": 4.288613366491202e-06, "loss": 0.0753, "step": 2707 }, { "epoch": 1.2320291173794358, "grad_norm": 1.6190494499887427, "learning_rate": 4.288113991221057e-06, "loss": 0.0815, "step": 2708 }, { "epoch": 1.232484076433121, "grad_norm": 1.4265449920467896, "learning_rate": 4.2876144698343115e-06, "loss": 0.0905, "step": 2709 }, { "epoch": 1.2329390354868062, "grad_norm": 1.5792299252383166, "learning_rate": 4.287114802371783e-06, "loss": 0.0933, "step": 2710 }, { "epoch": 1.2333939945404913, "grad_norm": 1.5541962345380622, "learning_rate": 4.286614988874304e-06, "loss": 0.1018, "step": 2711 }, { "epoch": 1.2338489535941766, "grad_norm": 1.4933850317503654, "learning_rate": 4.286115029382717e-06, "loss": 0.1448, "step": 2712 }, { "epoch": 1.2343039126478617, "grad_norm": 1.778907316114548, "learning_rate": 4.285614923937876e-06, "loss": 0.1101, "step": 2713 }, { "epoch": 1.2347588717015467, "grad_norm": 1.3970757565526302, "learning_rate": 4.285114672580647e-06, "loss": 0.0862, "step": 2714 }, { "epoch": 1.235213830755232, "grad_norm": 1.9653421473113715, "learning_rate": 4.284614275351907e-06, "loss": 0.1155, "step": 2715 }, { "epoch": 1.2356687898089171, "grad_norm": 1.4818183158109117, "learning_rate": 4.2841137322925495e-06, "loss": 0.1109, "step": 2716 }, { "epoch": 1.2361237488626025, "grad_norm": 1.395827472007909, "learning_rate": 4.283613043443474e-06, "loss": 0.0615, "step": 2717 }, { "epoch": 1.2365787079162875, "grad_norm": 1.2600494580099084, "learning_rate": 4.2831122088455955e-06, "loss": 0.0588, "step": 2718 }, { "epoch": 1.2370336669699726, "grad_norm": 1.731274261725021, "learning_rate": 4.2826112285398395e-06, "loss": 0.1502, "step": 2719 }, { "epoch": 1.237488626023658, "grad_norm": 1.0227517272317024, "learning_rate": 4.282110102567145e-06, "loss": 0.0517, "step": 2720 }, { "epoch": 1.237943585077343, "grad_norm": 1.3776885997310226, "learning_rate": 4.28160883096846e-06, "loss": 0.0663, "step": 2721 }, { "epoch": 1.2383985441310281, "grad_norm": 1.2572442124919356, "learning_rate": 4.281107413784747e-06, "loss": 0.067, "step": 2722 }, { "epoch": 1.2388535031847134, "grad_norm": 1.2741809908905852, "learning_rate": 4.28060585105698e-06, "loss": 0.1001, "step": 2723 }, { "epoch": 1.2393084622383985, "grad_norm": 1.6333661735440708, "learning_rate": 4.280104142826143e-06, "loss": 0.0787, "step": 2724 }, { "epoch": 1.2397634212920838, "grad_norm": 2.1072595872871984, "learning_rate": 4.2796022891332355e-06, "loss": 0.1632, "step": 2725 }, { "epoch": 1.240218380345769, "grad_norm": 2.029930265466161, "learning_rate": 4.279100290019265e-06, "loss": 0.0732, "step": 2726 }, { "epoch": 1.240673339399454, "grad_norm": 1.3800193403031813, "learning_rate": 4.278598145525253e-06, "loss": 0.1215, "step": 2727 }, { "epoch": 1.2411282984531393, "grad_norm": 2.1334796621942074, "learning_rate": 4.278095855692233e-06, "loss": 0.1028, "step": 2728 }, { "epoch": 1.2415832575068244, "grad_norm": 1.9037023983095858, "learning_rate": 4.277593420561249e-06, "loss": 0.0583, "step": 2729 }, { "epoch": 1.2420382165605095, "grad_norm": 1.5266711911694233, "learning_rate": 4.277090840173359e-06, "loss": 0.0727, "step": 2730 }, { "epoch": 1.2424931756141948, "grad_norm": 1.779852269680275, "learning_rate": 4.276588114569631e-06, "loss": 0.1165, "step": 2731 }, { "epoch": 1.24294813466788, "grad_norm": 1.1686354520981554, "learning_rate": 4.2760852437911436e-06, "loss": 0.0696, "step": 2732 }, { "epoch": 1.243403093721565, "grad_norm": 1.6281358508365982, "learning_rate": 4.2755822278789926e-06, "loss": 0.0748, "step": 2733 }, { "epoch": 1.2438580527752503, "grad_norm": 1.9348550299278917, "learning_rate": 4.2750790668742795e-06, "loss": 0.0771, "step": 2734 }, { "epoch": 1.2443130118289354, "grad_norm": 1.6843775010519313, "learning_rate": 4.274575760818122e-06, "loss": 0.1291, "step": 2735 }, { "epoch": 1.2447679708826205, "grad_norm": 1.7400214741336621, "learning_rate": 4.274072309751646e-06, "loss": 0.0736, "step": 2736 }, { "epoch": 1.2452229299363058, "grad_norm": 1.3279822498973282, "learning_rate": 4.273568713715993e-06, "loss": 0.105, "step": 2737 }, { "epoch": 1.2456778889899909, "grad_norm": 1.4181047264694318, "learning_rate": 4.2730649727523145e-06, "loss": 0.1044, "step": 2738 }, { "epoch": 1.246132848043676, "grad_norm": 1.5420933585436614, "learning_rate": 4.272561086901773e-06, "loss": 0.0742, "step": 2739 }, { "epoch": 1.2465878070973613, "grad_norm": 2.0627213117577616, "learning_rate": 4.272057056205544e-06, "loss": 0.1002, "step": 2740 }, { "epoch": 1.2470427661510464, "grad_norm": 1.6373337151018261, "learning_rate": 4.271552880704815e-06, "loss": 0.0786, "step": 2741 }, { "epoch": 1.2474977252047315, "grad_norm": 1.4066801307959027, "learning_rate": 4.271048560440786e-06, "loss": 0.0951, "step": 2742 }, { "epoch": 1.2479526842584168, "grad_norm": 1.4840597932593944, "learning_rate": 4.2705440954546665e-06, "loss": 0.1449, "step": 2743 }, { "epoch": 1.2484076433121019, "grad_norm": 1.4874386819240102, "learning_rate": 4.270039485787678e-06, "loss": 0.0979, "step": 2744 }, { "epoch": 1.2488626023657872, "grad_norm": 1.4996547701951468, "learning_rate": 4.269534731481057e-06, "loss": 0.1153, "step": 2745 }, { "epoch": 1.2493175614194723, "grad_norm": 1.748368630407863, "learning_rate": 4.269029832576048e-06, "loss": 0.0701, "step": 2746 }, { "epoch": 1.2497725204731573, "grad_norm": 1.2272157062443403, "learning_rate": 4.2685247891139114e-06, "loss": 0.0742, "step": 2747 }, { "epoch": 1.2502274795268427, "grad_norm": 1.2535267297683748, "learning_rate": 4.268019601135914e-06, "loss": 0.0663, "step": 2748 }, { "epoch": 1.2506824385805277, "grad_norm": 2.2232595843640954, "learning_rate": 4.26751426868334e-06, "loss": 0.0552, "step": 2749 }, { "epoch": 1.251137397634213, "grad_norm": 1.6413257670602424, "learning_rate": 4.2670087917974826e-06, "loss": 0.0953, "step": 2750 }, { "epoch": 1.2515923566878981, "grad_norm": 2.525956129850652, "learning_rate": 4.266503170519645e-06, "loss": 0.1019, "step": 2751 }, { "epoch": 1.2520473157415832, "grad_norm": 1.7532088817176623, "learning_rate": 4.265997404891147e-06, "loss": 0.0962, "step": 2752 }, { "epoch": 1.2525022747952685, "grad_norm": 1.7385955199194223, "learning_rate": 4.265491494953316e-06, "loss": 0.0829, "step": 2753 }, { "epoch": 1.2529572338489536, "grad_norm": 1.5355610337039685, "learning_rate": 4.2649854407474925e-06, "loss": 0.1359, "step": 2754 }, { "epoch": 1.2534121929026387, "grad_norm": 1.28022022581084, "learning_rate": 4.26447924231503e-06, "loss": 0.0558, "step": 2755 }, { "epoch": 1.253867151956324, "grad_norm": 1.3880085094165089, "learning_rate": 4.263972899697292e-06, "loss": 0.0976, "step": 2756 }, { "epoch": 1.2543221110100091, "grad_norm": 1.274974064159807, "learning_rate": 4.263466412935654e-06, "loss": 0.1164, "step": 2757 }, { "epoch": 1.2547770700636942, "grad_norm": 1.3582086906964457, "learning_rate": 4.262959782071505e-06, "loss": 0.0524, "step": 2758 }, { "epoch": 1.2552320291173795, "grad_norm": 1.8565157639016567, "learning_rate": 4.262453007146244e-06, "loss": 0.1207, "step": 2759 }, { "epoch": 1.2556869881710646, "grad_norm": 1.1179278766341727, "learning_rate": 4.261946088201282e-06, "loss": 0.0628, "step": 2760 }, { "epoch": 1.2561419472247497, "grad_norm": 1.3815222535677334, "learning_rate": 4.261439025278044e-06, "loss": 0.0783, "step": 2761 }, { "epoch": 1.256596906278435, "grad_norm": 1.6096595755674274, "learning_rate": 4.260931818417962e-06, "loss": 0.0655, "step": 2762 }, { "epoch": 1.25705186533212, "grad_norm": 1.4310899801227122, "learning_rate": 4.260424467662484e-06, "loss": 0.0794, "step": 2763 }, { "epoch": 1.2575068243858052, "grad_norm": 1.3830505652727263, "learning_rate": 4.259916973053069e-06, "loss": 0.126, "step": 2764 }, { "epoch": 1.2579617834394905, "grad_norm": 1.2593848254260958, "learning_rate": 4.2594093346311865e-06, "loss": 0.0952, "step": 2765 }, { "epoch": 1.2584167424931756, "grad_norm": 1.7618010142299456, "learning_rate": 4.258901552438319e-06, "loss": 0.1159, "step": 2766 }, { "epoch": 1.2588717015468607, "grad_norm": 1.4438782108606985, "learning_rate": 4.25839362651596e-06, "loss": 0.0862, "step": 2767 }, { "epoch": 1.259326660600546, "grad_norm": 1.960220687441142, "learning_rate": 4.257885556905613e-06, "loss": 0.0847, "step": 2768 }, { "epoch": 1.259781619654231, "grad_norm": 1.588478187298156, "learning_rate": 4.257377343648799e-06, "loss": 0.0798, "step": 2769 }, { "epoch": 1.2602365787079162, "grad_norm": 1.3801501508630765, "learning_rate": 4.256868986787044e-06, "loss": 0.0942, "step": 2770 }, { "epoch": 1.2606915377616015, "grad_norm": 1.429324437514992, "learning_rate": 4.256360486361889e-06, "loss": 0.0588, "step": 2771 }, { "epoch": 1.2611464968152866, "grad_norm": 1.6843373956104633, "learning_rate": 4.255851842414887e-06, "loss": 0.0655, "step": 2772 }, { "epoch": 1.2616014558689717, "grad_norm": 1.8180982857396182, "learning_rate": 4.255343054987601e-06, "loss": 0.1242, "step": 2773 }, { "epoch": 1.262056414922657, "grad_norm": 1.417537186445061, "learning_rate": 4.2548341241216085e-06, "loss": 0.0584, "step": 2774 }, { "epoch": 1.262511373976342, "grad_norm": 1.8094891195148863, "learning_rate": 4.254325049858496e-06, "loss": 0.104, "step": 2775 }, { "epoch": 1.2629663330300274, "grad_norm": 1.1102967241510793, "learning_rate": 4.2538158322398625e-06, "loss": 0.0714, "step": 2776 }, { "epoch": 1.2634212920837125, "grad_norm": 1.2558439821987628, "learning_rate": 4.2533064713073195e-06, "loss": 0.0784, "step": 2777 }, { "epoch": 1.2638762511373978, "grad_norm": 1.4000452382534576, "learning_rate": 4.252796967102489e-06, "loss": 0.0778, "step": 2778 }, { "epoch": 1.2643312101910829, "grad_norm": 1.2352745186077692, "learning_rate": 4.2522873196670065e-06, "loss": 0.0685, "step": 2779 }, { "epoch": 1.264786169244768, "grad_norm": 1.6222569071883635, "learning_rate": 4.2517775290425175e-06, "loss": 0.0674, "step": 2780 }, { "epoch": 1.2652411282984533, "grad_norm": 1.3943428277844097, "learning_rate": 4.251267595270681e-06, "loss": 0.0912, "step": 2781 }, { "epoch": 1.2656960873521383, "grad_norm": 1.6933682983834941, "learning_rate": 4.250757518393163e-06, "loss": 0.0721, "step": 2782 }, { "epoch": 1.2661510464058234, "grad_norm": 1.6598332060453826, "learning_rate": 4.250247298451649e-06, "loss": 0.0885, "step": 2783 }, { "epoch": 1.2666060054595087, "grad_norm": 1.6095015169265148, "learning_rate": 4.249736935487828e-06, "loss": 0.0809, "step": 2784 }, { "epoch": 1.2670609645131938, "grad_norm": 1.4343806186043688, "learning_rate": 4.249226429543408e-06, "loss": 0.1183, "step": 2785 }, { "epoch": 1.267515923566879, "grad_norm": 1.613384775900348, "learning_rate": 4.248715780660102e-06, "loss": 0.1099, "step": 2786 }, { "epoch": 1.2679708826205642, "grad_norm": 1.6980255955156842, "learning_rate": 4.2482049888796405e-06, "loss": 0.0787, "step": 2787 }, { "epoch": 1.2684258416742493, "grad_norm": 1.4760226822218372, "learning_rate": 4.247694054243762e-06, "loss": 0.0593, "step": 2788 }, { "epoch": 1.2688808007279344, "grad_norm": 1.5547756941150372, "learning_rate": 4.247182976794218e-06, "loss": 0.0671, "step": 2789 }, { "epoch": 1.2693357597816197, "grad_norm": 1.4208904195518834, "learning_rate": 4.246671756572771e-06, "loss": 0.058, "step": 2790 }, { "epoch": 1.2697907188353048, "grad_norm": 1.7246075849754179, "learning_rate": 4.246160393621197e-06, "loss": 0.1387, "step": 2791 }, { "epoch": 1.27024567788899, "grad_norm": 1.873913847934859, "learning_rate": 4.2456488879812805e-06, "loss": 0.0912, "step": 2792 }, { "epoch": 1.2707006369426752, "grad_norm": 1.9325200599382786, "learning_rate": 4.24513723969482e-06, "loss": 0.0841, "step": 2793 }, { "epoch": 1.2711555959963603, "grad_norm": 1.1624277540953984, "learning_rate": 4.244625448803625e-06, "loss": 0.0735, "step": 2794 }, { "epoch": 1.2716105550500454, "grad_norm": 1.5058744263506023, "learning_rate": 4.244113515349517e-06, "loss": 0.1056, "step": 2795 }, { "epoch": 1.2720655141037307, "grad_norm": 1.5921615694072897, "learning_rate": 4.243601439374329e-06, "loss": 0.0762, "step": 2796 }, { "epoch": 1.2725204731574158, "grad_norm": 1.6589468997739922, "learning_rate": 4.243089220919906e-06, "loss": 0.0971, "step": 2797 }, { "epoch": 1.2729754322111009, "grad_norm": 3.3483713846706564, "learning_rate": 4.242576860028103e-06, "loss": 0.1053, "step": 2798 }, { "epoch": 1.2734303912647862, "grad_norm": 2.1083459445303165, "learning_rate": 4.242064356740789e-06, "loss": 0.1147, "step": 2799 }, { "epoch": 1.2738853503184713, "grad_norm": 1.4332938875496246, "learning_rate": 4.2415517110998415e-06, "loss": 0.0839, "step": 2800 }, { "epoch": 1.2743403093721566, "grad_norm": 1.4220930588404028, "learning_rate": 4.241038923147155e-06, "loss": 0.0445, "step": 2801 }, { "epoch": 1.2747952684258417, "grad_norm": 3.0636582687604736, "learning_rate": 4.240525992924629e-06, "loss": 0.0688, "step": 2802 }, { "epoch": 1.2752502274795268, "grad_norm": 1.0756336520764858, "learning_rate": 4.240012920474179e-06, "loss": 0.0787, "step": 2803 }, { "epoch": 1.275705186533212, "grad_norm": 1.99174928377757, "learning_rate": 4.239499705837731e-06, "loss": 0.1046, "step": 2804 }, { "epoch": 1.2761601455868972, "grad_norm": 1.9469545943664321, "learning_rate": 4.238986349057223e-06, "loss": 0.0518, "step": 2805 }, { "epoch": 1.2766151046405825, "grad_norm": 2.130014912080278, "learning_rate": 4.238472850174603e-06, "loss": 0.1125, "step": 2806 }, { "epoch": 1.2770700636942676, "grad_norm": 1.6387703487658163, "learning_rate": 4.2379592092318326e-06, "loss": 0.0972, "step": 2807 }, { "epoch": 1.2775250227479527, "grad_norm": 1.6303170470737463, "learning_rate": 4.237445426270884e-06, "loss": 0.1408, "step": 2808 }, { "epoch": 1.277979981801638, "grad_norm": 1.6094055892481036, "learning_rate": 4.236931501333742e-06, "loss": 0.1381, "step": 2809 }, { "epoch": 1.278434940855323, "grad_norm": 1.6440127889718046, "learning_rate": 4.236417434462401e-06, "loss": 0.1009, "step": 2810 }, { "epoch": 1.2788898999090081, "grad_norm": 1.75176378452886, "learning_rate": 4.23590322569887e-06, "loss": 0.0853, "step": 2811 }, { "epoch": 1.2793448589626935, "grad_norm": 1.2507069468350538, "learning_rate": 4.2353888750851655e-06, "loss": 0.0892, "step": 2812 }, { "epoch": 1.2797998180163785, "grad_norm": 1.6745452439607542, "learning_rate": 4.2348743826633195e-06, "loss": 0.0726, "step": 2813 }, { "epoch": 1.2802547770700636, "grad_norm": 1.8726656670216983, "learning_rate": 4.234359748475374e-06, "loss": 0.1141, "step": 2814 }, { "epoch": 1.280709736123749, "grad_norm": 1.530273284199511, "learning_rate": 4.233844972563382e-06, "loss": 0.0694, "step": 2815 }, { "epoch": 1.281164695177434, "grad_norm": 1.77425338030721, "learning_rate": 4.233330054969409e-06, "loss": 0.1525, "step": 2816 }, { "epoch": 1.2816196542311191, "grad_norm": 1.469967349042513, "learning_rate": 4.23281499573553e-06, "loss": 0.0584, "step": 2817 }, { "epoch": 1.2820746132848044, "grad_norm": 1.8098393896928735, "learning_rate": 4.232299794903837e-06, "loss": 0.0919, "step": 2818 }, { "epoch": 1.2825295723384895, "grad_norm": 1.354752472156186, "learning_rate": 4.2317844525164265e-06, "loss": 0.0793, "step": 2819 }, { "epoch": 1.2829845313921746, "grad_norm": 1.724790766870009, "learning_rate": 4.2312689686154115e-06, "loss": 0.1109, "step": 2820 }, { "epoch": 1.28343949044586, "grad_norm": 1.5788752253359641, "learning_rate": 4.230753343242915e-06, "loss": 0.1003, "step": 2821 }, { "epoch": 1.283894449499545, "grad_norm": 1.440725017993767, "learning_rate": 4.230237576441071e-06, "loss": 0.0685, "step": 2822 }, { "epoch": 1.28434940855323, "grad_norm": 1.326187617761956, "learning_rate": 4.229721668252026e-06, "loss": 0.1124, "step": 2823 }, { "epoch": 1.2848043676069154, "grad_norm": 1.8953336693583542, "learning_rate": 4.2292056187179374e-06, "loss": 0.098, "step": 2824 }, { "epoch": 1.2852593266606005, "grad_norm": 1.3965507181702275, "learning_rate": 4.228689427880975e-06, "loss": 0.0588, "step": 2825 }, { "epoch": 1.2857142857142856, "grad_norm": 1.0966833391509712, "learning_rate": 4.228173095783319e-06, "loss": 0.0517, "step": 2826 }, { "epoch": 1.286169244767971, "grad_norm": 1.4548563423749026, "learning_rate": 4.227656622467162e-06, "loss": 0.0924, "step": 2827 }, { "epoch": 1.286624203821656, "grad_norm": 1.7350104226738665, "learning_rate": 4.2271400079747085e-06, "loss": 0.0847, "step": 2828 }, { "epoch": 1.2870791628753413, "grad_norm": 1.7382379248311097, "learning_rate": 4.2266232523481724e-06, "loss": 0.1049, "step": 2829 }, { "epoch": 1.2875341219290264, "grad_norm": 1.8345087967745906, "learning_rate": 4.226106355629781e-06, "loss": 0.0738, "step": 2830 }, { "epoch": 1.2879890809827115, "grad_norm": 1.5721341882625568, "learning_rate": 4.225589317861775e-06, "loss": 0.093, "step": 2831 }, { "epoch": 1.2884440400363968, "grad_norm": 1.604266810816268, "learning_rate": 4.225072139086401e-06, "loss": 0.0731, "step": 2832 }, { "epoch": 1.2888989990900819, "grad_norm": 1.6554141521158003, "learning_rate": 4.224554819345923e-06, "loss": 0.0782, "step": 2833 }, { "epoch": 1.2893539581437672, "grad_norm": 2.2170948306583855, "learning_rate": 4.224037358682614e-06, "loss": 0.068, "step": 2834 }, { "epoch": 1.2898089171974523, "grad_norm": 1.2523950290979835, "learning_rate": 4.223519757138756e-06, "loss": 0.0456, "step": 2835 }, { "epoch": 1.2902638762511374, "grad_norm": 1.7652192471670531, "learning_rate": 4.223002014756647e-06, "loss": 0.0805, "step": 2836 }, { "epoch": 1.2907188353048227, "grad_norm": 1.9605116711133057, "learning_rate": 4.222484131578595e-06, "loss": 0.1057, "step": 2837 }, { "epoch": 1.2911737943585078, "grad_norm": 1.2401047061325061, "learning_rate": 4.221966107646918e-06, "loss": 0.0689, "step": 2838 }, { "epoch": 1.2916287534121929, "grad_norm": 1.3716572645251588, "learning_rate": 4.221447943003947e-06, "loss": 0.0695, "step": 2839 }, { "epoch": 1.2920837124658782, "grad_norm": 1.4931517125024227, "learning_rate": 4.2209296376920254e-06, "loss": 0.1176, "step": 2840 }, { "epoch": 1.2925386715195633, "grad_norm": 1.4353437437747416, "learning_rate": 4.220411191753504e-06, "loss": 0.0933, "step": 2841 }, { "epoch": 1.2929936305732483, "grad_norm": 1.4361259937419293, "learning_rate": 4.21989260523075e-06, "loss": 0.0684, "step": 2842 }, { "epoch": 1.2934485896269337, "grad_norm": 1.600625770747058, "learning_rate": 4.219373878166139e-06, "loss": 0.1484, "step": 2843 }, { "epoch": 1.2939035486806187, "grad_norm": 1.7089575903368055, "learning_rate": 4.21885501060206e-06, "loss": 0.0918, "step": 2844 }, { "epoch": 1.2943585077343038, "grad_norm": 1.5563977839862093, "learning_rate": 4.21833600258091e-06, "loss": 0.0962, "step": 2845 }, { "epoch": 1.2948134667879891, "grad_norm": 1.7442948550642472, "learning_rate": 4.217816854145103e-06, "loss": 0.0892, "step": 2846 }, { "epoch": 1.2952684258416742, "grad_norm": 1.8680620512657267, "learning_rate": 4.2172975653370605e-06, "loss": 0.0708, "step": 2847 }, { "epoch": 1.2957233848953593, "grad_norm": 1.2256614406010522, "learning_rate": 4.216778136199216e-06, "loss": 0.1065, "step": 2848 }, { "epoch": 1.2961783439490446, "grad_norm": 2.7110253898080527, "learning_rate": 4.216258566774015e-06, "loss": 0.1639, "step": 2849 }, { "epoch": 1.2966333030027297, "grad_norm": 1.1811844359300152, "learning_rate": 4.215738857103915e-06, "loss": 0.0946, "step": 2850 }, { "epoch": 1.2970882620564148, "grad_norm": 1.2310932171907334, "learning_rate": 4.215219007231382e-06, "loss": 0.0744, "step": 2851 }, { "epoch": 1.2975432211101001, "grad_norm": 2.6650742149274156, "learning_rate": 4.214699017198899e-06, "loss": 0.1451, "step": 2852 }, { "epoch": 1.2979981801637852, "grad_norm": 1.240409164119912, "learning_rate": 4.214178887048956e-06, "loss": 0.1131, "step": 2853 }, { "epoch": 1.2984531392174703, "grad_norm": 1.2314736761416967, "learning_rate": 4.213658616824055e-06, "loss": 0.0564, "step": 2854 }, { "epoch": 1.2989080982711556, "grad_norm": 1.5161499734776573, "learning_rate": 4.213138206566711e-06, "loss": 0.1141, "step": 2855 }, { "epoch": 1.2993630573248407, "grad_norm": 1.569147968323215, "learning_rate": 4.21261765631945e-06, "loss": 0.091, "step": 2856 }, { "epoch": 1.299818016378526, "grad_norm": 1.4506371027224196, "learning_rate": 4.212096966124807e-06, "loss": 0.0797, "step": 2857 }, { "epoch": 1.300272975432211, "grad_norm": 1.6264422705046855, "learning_rate": 4.2115761360253325e-06, "loss": 0.1406, "step": 2858 }, { "epoch": 1.3007279344858962, "grad_norm": 1.6586007256829827, "learning_rate": 4.211055166063585e-06, "loss": 0.1187, "step": 2859 }, { "epoch": 1.3011828935395815, "grad_norm": 1.7963681679149552, "learning_rate": 4.210534056282136e-06, "loss": 0.1017, "step": 2860 }, { "epoch": 1.3016378525932666, "grad_norm": 1.845754042262854, "learning_rate": 4.21001280672357e-06, "loss": 0.1229, "step": 2861 }, { "epoch": 1.302092811646952, "grad_norm": 1.492780520504542, "learning_rate": 4.209491417430479e-06, "loss": 0.0871, "step": 2862 }, { "epoch": 1.302547770700637, "grad_norm": 2.0207516200977396, "learning_rate": 4.208969888445469e-06, "loss": 0.1013, "step": 2863 }, { "epoch": 1.303002729754322, "grad_norm": 2.338567005787092, "learning_rate": 4.208448219811158e-06, "loss": 0.142, "step": 2864 }, { "epoch": 1.3034576888080074, "grad_norm": 1.5659320619114794, "learning_rate": 4.207926411570172e-06, "loss": 0.0623, "step": 2865 }, { "epoch": 1.3039126478616925, "grad_norm": 0.9908532544196624, "learning_rate": 4.207404463765155e-06, "loss": 0.0762, "step": 2866 }, { "epoch": 1.3043676069153776, "grad_norm": 1.7668096337088337, "learning_rate": 4.2068823764387545e-06, "loss": 0.1188, "step": 2867 }, { "epoch": 1.3048225659690629, "grad_norm": 1.7019199508300746, "learning_rate": 4.206360149633635e-06, "loss": 0.0941, "step": 2868 }, { "epoch": 1.305277525022748, "grad_norm": 1.2569710057152046, "learning_rate": 4.205837783392469e-06, "loss": 0.0843, "step": 2869 }, { "epoch": 1.305732484076433, "grad_norm": 1.210302412612278, "learning_rate": 4.205315277757943e-06, "loss": 0.0727, "step": 2870 }, { "epoch": 1.3061874431301184, "grad_norm": 1.6653693545300776, "learning_rate": 4.204792632772754e-06, "loss": 0.1127, "step": 2871 }, { "epoch": 1.3066424021838035, "grad_norm": 1.316918338784307, "learning_rate": 4.204269848479611e-06, "loss": 0.0723, "step": 2872 }, { "epoch": 1.3070973612374885, "grad_norm": 1.6274430736336665, "learning_rate": 4.203746924921231e-06, "loss": 0.0599, "step": 2873 }, { "epoch": 1.3075523202911739, "grad_norm": 1.233346443210308, "learning_rate": 4.203223862140347e-06, "loss": 0.0755, "step": 2874 }, { "epoch": 1.308007279344859, "grad_norm": 1.2594424746635327, "learning_rate": 4.2027006601797e-06, "loss": 0.0797, "step": 2875 }, { "epoch": 1.308462238398544, "grad_norm": 1.2769615968017793, "learning_rate": 4.202177319082045e-06, "loss": 0.0857, "step": 2876 }, { "epoch": 1.3089171974522293, "grad_norm": 1.5250108536595257, "learning_rate": 4.201653838890146e-06, "loss": 0.1086, "step": 2877 }, { "epoch": 1.3093721565059144, "grad_norm": 1.5016318291106727, "learning_rate": 4.20113021964678e-06, "loss": 0.0578, "step": 2878 }, { "epoch": 1.3098271155595995, "grad_norm": 1.5409344405172085, "learning_rate": 4.200606461394735e-06, "loss": 0.0786, "step": 2879 }, { "epoch": 1.3102820746132848, "grad_norm": 1.694087127544417, "learning_rate": 4.200082564176809e-06, "loss": 0.1039, "step": 2880 }, { "epoch": 1.31073703366697, "grad_norm": 1.5922275950284588, "learning_rate": 4.199558528035814e-06, "loss": 0.1027, "step": 2881 }, { "epoch": 1.311191992720655, "grad_norm": 1.7943880475738858, "learning_rate": 4.199034353014572e-06, "loss": 0.0772, "step": 2882 }, { "epoch": 1.3116469517743403, "grad_norm": 1.644120742764432, "learning_rate": 4.198510039155914e-06, "loss": 0.0877, "step": 2883 }, { "epoch": 1.3121019108280254, "grad_norm": 1.723262842569343, "learning_rate": 4.197985586502686e-06, "loss": 0.1014, "step": 2884 }, { "epoch": 1.3125568698817107, "grad_norm": 1.5032263860937614, "learning_rate": 4.197460995097745e-06, "loss": 0.1003, "step": 2885 }, { "epoch": 1.3130118289353958, "grad_norm": 1.6522824207101456, "learning_rate": 4.1969362649839565e-06, "loss": 0.1428, "step": 2886 }, { "epoch": 1.3134667879890811, "grad_norm": 1.178575364703643, "learning_rate": 4.1964113962042e-06, "loss": 0.0679, "step": 2887 }, { "epoch": 1.3139217470427662, "grad_norm": 2.658263437863624, "learning_rate": 4.195886388801364e-06, "loss": 0.0585, "step": 2888 }, { "epoch": 1.3143767060964513, "grad_norm": 1.8387228241451763, "learning_rate": 4.195361242818354e-06, "loss": 0.0825, "step": 2889 }, { "epoch": 1.3148316651501366, "grad_norm": 1.327400635788573, "learning_rate": 4.194835958298076e-06, "loss": 0.1129, "step": 2890 }, { "epoch": 1.3152866242038217, "grad_norm": 2.7272488029062703, "learning_rate": 4.194310535283459e-06, "loss": 0.0929, "step": 2891 }, { "epoch": 1.3157415832575068, "grad_norm": 1.6650223595478135, "learning_rate": 4.193784973817436e-06, "loss": 0.0715, "step": 2892 }, { "epoch": 1.316196542311192, "grad_norm": 1.4955115345342818, "learning_rate": 4.193259273942954e-06, "loss": 0.0873, "step": 2893 }, { "epoch": 1.3166515013648772, "grad_norm": 1.5895810672683979, "learning_rate": 4.192733435702971e-06, "loss": 0.096, "step": 2894 }, { "epoch": 1.3171064604185623, "grad_norm": 1.5421113500962813, "learning_rate": 4.192207459140456e-06, "loss": 0.0812, "step": 2895 }, { "epoch": 1.3175614194722476, "grad_norm": 1.2490799014150549, "learning_rate": 4.1916813442983895e-06, "loss": 0.0671, "step": 2896 }, { "epoch": 1.3180163785259327, "grad_norm": 1.9483961420186395, "learning_rate": 4.191155091219763e-06, "loss": 0.1134, "step": 2897 }, { "epoch": 1.3184713375796178, "grad_norm": 1.2462644226642845, "learning_rate": 4.1906286999475785e-06, "loss": 0.1016, "step": 2898 }, { "epoch": 1.318926296633303, "grad_norm": 1.7147347392469159, "learning_rate": 4.190102170524853e-06, "loss": 0.1389, "step": 2899 }, { "epoch": 1.3193812556869882, "grad_norm": 1.5005447933561165, "learning_rate": 4.18957550299461e-06, "loss": 0.0897, "step": 2900 }, { "epoch": 1.3198362147406733, "grad_norm": 1.6378799651694023, "learning_rate": 4.189048697399887e-06, "loss": 0.1097, "step": 2901 }, { "epoch": 1.3202911737943586, "grad_norm": 1.1911009872380312, "learning_rate": 4.188521753783732e-06, "loss": 0.0657, "step": 2902 }, { "epoch": 1.3207461328480437, "grad_norm": 1.3463244428749435, "learning_rate": 4.187994672189205e-06, "loss": 0.1159, "step": 2903 }, { "epoch": 1.3212010919017287, "grad_norm": 1.62599894608312, "learning_rate": 4.187467452659376e-06, "loss": 0.0999, "step": 2904 }, { "epoch": 1.321656050955414, "grad_norm": 2.091716155797599, "learning_rate": 4.186940095237327e-06, "loss": 0.1167, "step": 2905 }, { "epoch": 1.3221110100090991, "grad_norm": 1.2983105428066397, "learning_rate": 4.186412599966152e-06, "loss": 0.0933, "step": 2906 }, { "epoch": 1.3225659690627842, "grad_norm": 1.2185905428815746, "learning_rate": 4.185884966888954e-06, "loss": 0.0602, "step": 2907 }, { "epoch": 1.3230209281164695, "grad_norm": 1.78805406702835, "learning_rate": 4.185357196048852e-06, "loss": 0.0842, "step": 2908 }, { "epoch": 1.3234758871701546, "grad_norm": 1.141299216119869, "learning_rate": 4.1848292874889694e-06, "loss": 0.0647, "step": 2909 }, { "epoch": 1.3239308462238397, "grad_norm": 1.27505639213974, "learning_rate": 4.184301241252447e-06, "loss": 0.0511, "step": 2910 }, { "epoch": 1.324385805277525, "grad_norm": 1.333484745438133, "learning_rate": 4.183773057382432e-06, "loss": 0.0863, "step": 2911 }, { "epoch": 1.3248407643312101, "grad_norm": 1.4432184430901294, "learning_rate": 4.183244735922087e-06, "loss": 0.0973, "step": 2912 }, { "epoch": 1.3252957233848954, "grad_norm": 1.5503394160992994, "learning_rate": 4.182716276914585e-06, "loss": 0.0792, "step": 2913 }, { "epoch": 1.3257506824385805, "grad_norm": 1.9196967455876535, "learning_rate": 4.182187680403107e-06, "loss": 0.0841, "step": 2914 }, { "epoch": 1.3262056414922658, "grad_norm": 1.5604564609642202, "learning_rate": 4.181658946430848e-06, "loss": 0.0782, "step": 2915 }, { "epoch": 1.326660600545951, "grad_norm": 1.57645871670399, "learning_rate": 4.181130075041015e-06, "loss": 0.0872, "step": 2916 }, { "epoch": 1.327115559599636, "grad_norm": 1.6415277481146764, "learning_rate": 4.180601066276824e-06, "loss": 0.0926, "step": 2917 }, { "epoch": 1.3275705186533213, "grad_norm": 1.7103853212476656, "learning_rate": 4.180071920181503e-06, "loss": 0.0624, "step": 2918 }, { "epoch": 1.3280254777070064, "grad_norm": 1.222885821481818, "learning_rate": 4.179542636798292e-06, "loss": 0.0631, "step": 2919 }, { "epoch": 1.3284804367606915, "grad_norm": 1.3839549191846936, "learning_rate": 4.1790132161704415e-06, "loss": 0.0733, "step": 2920 }, { "epoch": 1.3289353958143768, "grad_norm": 1.5514576863045915, "learning_rate": 4.178483658341213e-06, "loss": 0.1019, "step": 2921 }, { "epoch": 1.329390354868062, "grad_norm": 1.2864959365783242, "learning_rate": 4.17795396335388e-06, "loss": 0.0956, "step": 2922 }, { "epoch": 1.329845313921747, "grad_norm": 2.2248623552711724, "learning_rate": 4.177424131251728e-06, "loss": 0.1388, "step": 2923 }, { "epoch": 1.3303002729754323, "grad_norm": 1.565405163457581, "learning_rate": 4.17689416207805e-06, "loss": 0.0604, "step": 2924 }, { "epoch": 1.3307552320291174, "grad_norm": 4.286483598885122, "learning_rate": 4.176364055876154e-06, "loss": 0.0771, "step": 2925 }, { "epoch": 1.3312101910828025, "grad_norm": 1.5191564118430387, "learning_rate": 4.175833812689357e-06, "loss": 0.0758, "step": 2926 }, { "epoch": 1.3316651501364878, "grad_norm": 1.38550054693012, "learning_rate": 4.17530343256099e-06, "loss": 0.0888, "step": 2927 }, { "epoch": 1.3321201091901729, "grad_norm": 3.076131213534863, "learning_rate": 4.174772915534392e-06, "loss": 0.1726, "step": 2928 }, { "epoch": 1.332575068243858, "grad_norm": 1.6383109062767622, "learning_rate": 4.174242261652914e-06, "loss": 0.0872, "step": 2929 }, { "epoch": 1.3330300272975433, "grad_norm": 1.4903664429660133, "learning_rate": 4.173711470959919e-06, "loss": 0.0661, "step": 2930 }, { "epoch": 1.3334849863512284, "grad_norm": 1.652581494493652, "learning_rate": 4.173180543498782e-06, "loss": 0.1419, "step": 2931 }, { "epoch": 1.3339399454049135, "grad_norm": 1.3659537766846146, "learning_rate": 4.1726494793128864e-06, "loss": 0.0587, "step": 2932 }, { "epoch": 1.3343949044585988, "grad_norm": 1.499302032611582, "learning_rate": 4.172118278445629e-06, "loss": 0.069, "step": 2933 }, { "epoch": 1.3348498635122839, "grad_norm": 1.8548449809841507, "learning_rate": 4.171586940940417e-06, "loss": 0.1357, "step": 2934 }, { "epoch": 1.335304822565969, "grad_norm": 1.239570674767168, "learning_rate": 4.171055466840669e-06, "loss": 0.0759, "step": 2935 }, { "epoch": 1.3357597816196543, "grad_norm": 1.5929253550073612, "learning_rate": 4.1705238561898144e-06, "loss": 0.119, "step": 2936 }, { "epoch": 1.3362147406733393, "grad_norm": 1.0712424547119181, "learning_rate": 4.169992109031295e-06, "loss": 0.0524, "step": 2937 }, { "epoch": 1.3366696997270244, "grad_norm": 1.199084737766176, "learning_rate": 4.169460225408562e-06, "loss": 0.0695, "step": 2938 }, { "epoch": 1.3371246587807097, "grad_norm": 2.208105818939312, "learning_rate": 4.1689282053650786e-06, "loss": 0.1247, "step": 2939 }, { "epoch": 1.3375796178343948, "grad_norm": 1.889159988091748, "learning_rate": 4.168396048944318e-06, "loss": 0.0958, "step": 2940 }, { "epoch": 1.3380345768880801, "grad_norm": 1.6445957839513785, "learning_rate": 4.167863756189767e-06, "loss": 0.1147, "step": 2941 }, { "epoch": 1.3384895359417652, "grad_norm": 1.652952237834016, "learning_rate": 4.167331327144924e-06, "loss": 0.0873, "step": 2942 }, { "epoch": 1.3389444949954505, "grad_norm": 1.3734128684333413, "learning_rate": 4.166798761853291e-06, "loss": 0.0945, "step": 2943 }, { "epoch": 1.3393994540491356, "grad_norm": 1.1597265279563473, "learning_rate": 4.1662660603583936e-06, "loss": 0.0981, "step": 2944 }, { "epoch": 1.3398544131028207, "grad_norm": 1.7918593583307505, "learning_rate": 4.165733222703757e-06, "loss": 0.1272, "step": 2945 }, { "epoch": 1.340309372156506, "grad_norm": 1.5470680128525152, "learning_rate": 4.165200248932923e-06, "loss": 0.1179, "step": 2946 }, { "epoch": 1.3407643312101911, "grad_norm": 1.7015008875861288, "learning_rate": 4.164667139089446e-06, "loss": 0.086, "step": 2947 }, { "epoch": 1.3412192902638762, "grad_norm": 1.443283555679022, "learning_rate": 4.164133893216888e-06, "loss": 0.0583, "step": 2948 }, { "epoch": 1.3416742493175615, "grad_norm": 1.2173060682915533, "learning_rate": 4.163600511358823e-06, "loss": 0.0796, "step": 2949 }, { "epoch": 1.3421292083712466, "grad_norm": 1.5064682442891342, "learning_rate": 4.163066993558837e-06, "loss": 0.0798, "step": 2950 }, { "epoch": 1.3425841674249317, "grad_norm": 1.3609392422372266, "learning_rate": 4.1625333398605265e-06, "loss": 0.0763, "step": 2951 }, { "epoch": 1.343039126478617, "grad_norm": 1.639546655296031, "learning_rate": 4.1619995503075e-06, "loss": 0.0727, "step": 2952 }, { "epoch": 1.343494085532302, "grad_norm": 1.5265906846813477, "learning_rate": 4.161465624943375e-06, "loss": 0.09, "step": 2953 }, { "epoch": 1.3439490445859872, "grad_norm": 1.6908965474594075, "learning_rate": 4.1609315638117825e-06, "loss": 0.0763, "step": 2954 }, { "epoch": 1.3444040036396725, "grad_norm": 1.422916755381148, "learning_rate": 4.160397366956364e-06, "loss": 0.086, "step": 2955 }, { "epoch": 1.3448589626933576, "grad_norm": 2.0893573188133465, "learning_rate": 4.1598630344207705e-06, "loss": 0.0961, "step": 2956 }, { "epoch": 1.3453139217470427, "grad_norm": 1.9812780887142738, "learning_rate": 4.159328566248665e-06, "loss": 0.1157, "step": 2957 }, { "epoch": 1.345768880800728, "grad_norm": 2.0553257874914896, "learning_rate": 4.1587939624837225e-06, "loss": 0.1215, "step": 2958 }, { "epoch": 1.346223839854413, "grad_norm": 1.3963310694387345, "learning_rate": 4.15825922316963e-06, "loss": 0.1041, "step": 2959 }, { "epoch": 1.3466787989080982, "grad_norm": 1.8010725795700535, "learning_rate": 4.15772434835008e-06, "loss": 0.1022, "step": 2960 }, { "epoch": 1.3471337579617835, "grad_norm": 1.615269704707038, "learning_rate": 4.157189338068785e-06, "loss": 0.0944, "step": 2961 }, { "epoch": 1.3475887170154686, "grad_norm": 1.420289137371335, "learning_rate": 4.156654192369459e-06, "loss": 0.0945, "step": 2962 }, { "epoch": 1.3480436760691537, "grad_norm": 1.490495642849881, "learning_rate": 4.156118911295835e-06, "loss": 0.069, "step": 2963 }, { "epoch": 1.348498635122839, "grad_norm": 1.4828045336945652, "learning_rate": 4.155583494891651e-06, "loss": 0.1296, "step": 2964 }, { "epoch": 1.348953594176524, "grad_norm": 1.3865688024850134, "learning_rate": 4.155047943200663e-06, "loss": 0.0617, "step": 2965 }, { "epoch": 1.3494085532302094, "grad_norm": 1.480064364771036, "learning_rate": 4.154512256266629e-06, "loss": 0.0617, "step": 2966 }, { "epoch": 1.3498635122838945, "grad_norm": 1.907011112879505, "learning_rate": 4.153976434133327e-06, "loss": 0.0917, "step": 2967 }, { "epoch": 1.3503184713375795, "grad_norm": 1.3518812767189177, "learning_rate": 4.153440476844539e-06, "loss": 0.066, "step": 2968 }, { "epoch": 1.3507734303912649, "grad_norm": 2.6909360094493815, "learning_rate": 4.1529043844440616e-06, "loss": 0.1007, "step": 2969 }, { "epoch": 1.35122838944495, "grad_norm": 1.5791152984041121, "learning_rate": 4.1523681569757035e-06, "loss": 0.113, "step": 2970 }, { "epoch": 1.3516833484986353, "grad_norm": 1.4438631117766842, "learning_rate": 4.151831794483281e-06, "loss": 0.0913, "step": 2971 }, { "epoch": 1.3521383075523203, "grad_norm": 1.4480270817434, "learning_rate": 4.151295297010623e-06, "loss": 0.0905, "step": 2972 }, { "epoch": 1.3525932666060054, "grad_norm": 1.297338211036581, "learning_rate": 4.150758664601572e-06, "loss": 0.15, "step": 2973 }, { "epoch": 1.3530482256596907, "grad_norm": 1.3297370270180993, "learning_rate": 4.1502218972999765e-06, "loss": 0.1487, "step": 2974 }, { "epoch": 1.3535031847133758, "grad_norm": 1.5405682908296885, "learning_rate": 4.1496849951497005e-06, "loss": 0.09, "step": 2975 }, { "epoch": 1.353958143767061, "grad_norm": 1.7457410960894821, "learning_rate": 4.149147958194617e-06, "loss": 0.0729, "step": 2976 }, { "epoch": 1.3544131028207462, "grad_norm": 1.9786800490561125, "learning_rate": 4.1486107864786095e-06, "loss": 0.0856, "step": 2977 }, { "epoch": 1.3548680618744313, "grad_norm": 2.0043932977694197, "learning_rate": 4.148073480045573e-06, "loss": 0.0884, "step": 2978 }, { "epoch": 1.3553230209281164, "grad_norm": 1.8243273399433015, "learning_rate": 4.147536038939416e-06, "loss": 0.0814, "step": 2979 }, { "epoch": 1.3557779799818017, "grad_norm": 1.351056928177642, "learning_rate": 4.146998463204053e-06, "loss": 0.0752, "step": 2980 }, { "epoch": 1.3562329390354868, "grad_norm": 1.500032066712302, "learning_rate": 4.146460752883413e-06, "loss": 0.086, "step": 2981 }, { "epoch": 1.356687898089172, "grad_norm": 1.845867576804926, "learning_rate": 4.145922908021436e-06, "loss": 0.093, "step": 2982 }, { "epoch": 1.3571428571428572, "grad_norm": 1.1553611116745768, "learning_rate": 4.145384928662072e-06, "loss": 0.0808, "step": 2983 }, { "epoch": 1.3575978161965423, "grad_norm": 1.6204753642536465, "learning_rate": 4.144846814849282e-06, "loss": 0.1137, "step": 2984 }, { "epoch": 1.3580527752502274, "grad_norm": 1.4785742805114528, "learning_rate": 4.1443085666270375e-06, "loss": 0.0745, "step": 2985 }, { "epoch": 1.3585077343039127, "grad_norm": 1.568786059867133, "learning_rate": 4.143770184039324e-06, "loss": 0.0781, "step": 2986 }, { "epoch": 1.3589626933575978, "grad_norm": 1.3129535859500985, "learning_rate": 4.143231667130134e-06, "loss": 0.07, "step": 2987 }, { "epoch": 1.3594176524112829, "grad_norm": 1.4407738507014018, "learning_rate": 4.142693015943472e-06, "loss": 0.0924, "step": 2988 }, { "epoch": 1.3598726114649682, "grad_norm": 1.26178085087696, "learning_rate": 4.142154230523356e-06, "loss": 0.0982, "step": 2989 }, { "epoch": 1.3603275705186533, "grad_norm": 1.5645694158161196, "learning_rate": 4.141615310913812e-06, "loss": 0.1054, "step": 2990 }, { "epoch": 1.3607825295723384, "grad_norm": 1.457964863814924, "learning_rate": 4.141076257158878e-06, "loss": 0.0665, "step": 2991 }, { "epoch": 1.3612374886260237, "grad_norm": 1.440042510623743, "learning_rate": 4.1405370693026035e-06, "loss": 0.0986, "step": 2992 }, { "epoch": 1.3616924476797088, "grad_norm": 1.5963947064329014, "learning_rate": 4.139997747389049e-06, "loss": 0.0717, "step": 2993 }, { "epoch": 1.362147406733394, "grad_norm": 1.6062059101807404, "learning_rate": 4.139458291462283e-06, "loss": 0.0764, "step": 2994 }, { "epoch": 1.3626023657870792, "grad_norm": 1.5022431362975714, "learning_rate": 4.13891870156639e-06, "loss": 0.0958, "step": 2995 }, { "epoch": 1.3630573248407643, "grad_norm": 1.13820529503498, "learning_rate": 4.138378977745462e-06, "loss": 0.0647, "step": 2996 }, { "epoch": 1.3635122838944496, "grad_norm": 1.3046508062288489, "learning_rate": 4.137839120043603e-06, "loss": 0.0885, "step": 2997 }, { "epoch": 1.3639672429481347, "grad_norm": 2.022146417216205, "learning_rate": 4.137299128504928e-06, "loss": 0.07, "step": 2998 }, { "epoch": 1.36442220200182, "grad_norm": 1.3864500723998632, "learning_rate": 4.136759003173561e-06, "loss": 0.0538, "step": 2999 }, { "epoch": 1.364877161055505, "grad_norm": 1.2662973844633498, "learning_rate": 4.136218744093641e-06, "loss": 0.0702, "step": 3000 }, { "epoch": 1.3653321201091901, "grad_norm": 1.1664168994599806, "learning_rate": 4.1356783513093135e-06, "loss": 0.0994, "step": 3001 }, { "epoch": 1.3657870791628755, "grad_norm": 1.7061935637133232, "learning_rate": 4.135137824864738e-06, "loss": 0.11, "step": 3002 }, { "epoch": 1.3662420382165605, "grad_norm": 1.4389371466484144, "learning_rate": 4.134597164804084e-06, "loss": 0.0864, "step": 3003 }, { "epoch": 1.3666969972702456, "grad_norm": 1.3524284977876537, "learning_rate": 4.134056371171531e-06, "loss": 0.0887, "step": 3004 }, { "epoch": 1.367151956323931, "grad_norm": 1.6213408755328722, "learning_rate": 4.1335154440112715e-06, "loss": 0.0803, "step": 3005 }, { "epoch": 1.367606915377616, "grad_norm": 1.556029739463302, "learning_rate": 4.132974383367505e-06, "loss": 0.0765, "step": 3006 }, { "epoch": 1.3680618744313011, "grad_norm": 1.5198063717161714, "learning_rate": 4.1324331892844485e-06, "loss": 0.1071, "step": 3007 }, { "epoch": 1.3685168334849864, "grad_norm": 3.008907692912203, "learning_rate": 4.131891861806322e-06, "loss": 0.1128, "step": 3008 }, { "epoch": 1.3689717925386715, "grad_norm": 1.2375971188753598, "learning_rate": 4.131350400977363e-06, "loss": 0.0745, "step": 3009 }, { "epoch": 1.3694267515923566, "grad_norm": 1.6735552690192763, "learning_rate": 4.130808806841816e-06, "loss": 0.096, "step": 3010 }, { "epoch": 1.369881710646042, "grad_norm": 1.8640305107472706, "learning_rate": 4.130267079443939e-06, "loss": 0.1814, "step": 3011 }, { "epoch": 1.370336669699727, "grad_norm": 1.5200099422285263, "learning_rate": 4.129725218827997e-06, "loss": 0.0875, "step": 3012 }, { "epoch": 1.370791628753412, "grad_norm": 1.2635290423283334, "learning_rate": 4.1291832250382705e-06, "loss": 0.0939, "step": 3013 }, { "epoch": 1.3712465878070974, "grad_norm": 1.9722973424818753, "learning_rate": 4.128641098119048e-06, "loss": 0.1276, "step": 3014 }, { "epoch": 1.3717015468607825, "grad_norm": 1.2768609640411666, "learning_rate": 4.128098838114631e-06, "loss": 0.1057, "step": 3015 }, { "epoch": 1.3721565059144676, "grad_norm": 1.3098873599547771, "learning_rate": 4.127556445069328e-06, "loss": 0.0934, "step": 3016 }, { "epoch": 1.372611464968153, "grad_norm": 1.1243155323793386, "learning_rate": 4.127013919027462e-06, "loss": 0.0535, "step": 3017 }, { "epoch": 1.373066424021838, "grad_norm": 1.505169511821335, "learning_rate": 4.126471260033368e-06, "loss": 0.0689, "step": 3018 }, { "epoch": 1.373521383075523, "grad_norm": 1.337137821272305, "learning_rate": 4.125928468131387e-06, "loss": 0.0996, "step": 3019 }, { "epoch": 1.3739763421292084, "grad_norm": 1.6563555078245968, "learning_rate": 4.125385543365873e-06, "loss": 0.1533, "step": 3020 }, { "epoch": 1.3744313011828935, "grad_norm": 1.745600010223039, "learning_rate": 4.124842485781194e-06, "loss": 0.0652, "step": 3021 }, { "epoch": 1.3748862602365788, "grad_norm": 1.2474090573674002, "learning_rate": 4.1242992954217234e-06, "loss": 0.0873, "step": 3022 }, { "epoch": 1.3753412192902639, "grad_norm": 1.550268975520972, "learning_rate": 4.123755972331851e-06, "loss": 0.0716, "step": 3023 }, { "epoch": 1.3757961783439492, "grad_norm": 1.8343455277968792, "learning_rate": 4.123212516555972e-06, "loss": 0.0846, "step": 3024 }, { "epoch": 1.3762511373976343, "grad_norm": 2.1206041941880316, "learning_rate": 4.122668928138498e-06, "loss": 0.0701, "step": 3025 }, { "epoch": 1.3767060964513194, "grad_norm": 1.3957364960811491, "learning_rate": 4.122125207123846e-06, "loss": 0.1161, "step": 3026 }, { "epoch": 1.3771610555050047, "grad_norm": 1.6925375005740344, "learning_rate": 4.121581353556447e-06, "loss": 0.1144, "step": 3027 }, { "epoch": 1.3776160145586898, "grad_norm": 1.0642717374506152, "learning_rate": 4.121037367480744e-06, "loss": 0.0755, "step": 3028 }, { "epoch": 1.3780709736123748, "grad_norm": 1.4521461544935645, "learning_rate": 4.120493248941188e-06, "loss": 0.0864, "step": 3029 }, { "epoch": 1.3785259326660602, "grad_norm": 1.3256935722224579, "learning_rate": 4.119948997982241e-06, "loss": 0.0673, "step": 3030 }, { "epoch": 1.3789808917197452, "grad_norm": 1.4479186182402999, "learning_rate": 4.119404614648378e-06, "loss": 0.1084, "step": 3031 }, { "epoch": 1.3794358507734303, "grad_norm": 1.529794116898629, "learning_rate": 4.118860098984083e-06, "loss": 0.1069, "step": 3032 }, { "epoch": 1.3798908098271156, "grad_norm": 1.9453222758639295, "learning_rate": 4.118315451033851e-06, "loss": 0.1063, "step": 3033 }, { "epoch": 1.3803457688808007, "grad_norm": 1.8710214890239933, "learning_rate": 4.117770670842189e-06, "loss": 0.0986, "step": 3034 }, { "epoch": 1.3808007279344858, "grad_norm": 1.865982837057026, "learning_rate": 4.117225758453614e-06, "loss": 0.0665, "step": 3035 }, { "epoch": 1.3812556869881711, "grad_norm": 1.4172056959551298, "learning_rate": 4.116680713912652e-06, "loss": 0.0887, "step": 3036 }, { "epoch": 1.3817106460418562, "grad_norm": 1.2792617194810614, "learning_rate": 4.116135537263844e-06, "loss": 0.0793, "step": 3037 }, { "epoch": 1.3821656050955413, "grad_norm": 1.361861664663989, "learning_rate": 4.115590228551738e-06, "loss": 0.0546, "step": 3038 }, { "epoch": 1.3826205641492266, "grad_norm": 1.5111868162833144, "learning_rate": 4.115044787820895e-06, "loss": 0.0873, "step": 3039 }, { "epoch": 1.3830755232029117, "grad_norm": 1.294097183302166, "learning_rate": 4.114499215115885e-06, "loss": 0.0799, "step": 3040 }, { "epoch": 1.3835304822565968, "grad_norm": 1.6516053010678304, "learning_rate": 4.113953510481289e-06, "loss": 0.0936, "step": 3041 }, { "epoch": 1.3839854413102821, "grad_norm": 1.5418709349921675, "learning_rate": 4.113407673961702e-06, "loss": 0.0806, "step": 3042 }, { "epoch": 1.3844404003639672, "grad_norm": 1.4708403511141128, "learning_rate": 4.112861705601726e-06, "loss": 0.1166, "step": 3043 }, { "epoch": 1.3848953594176523, "grad_norm": 1.3189213143501122, "learning_rate": 4.112315605445975e-06, "loss": 0.07, "step": 3044 }, { "epoch": 1.3853503184713376, "grad_norm": 1.3607727338617557, "learning_rate": 4.111769373539073e-06, "loss": 0.1183, "step": 3045 }, { "epoch": 1.3858052775250227, "grad_norm": 1.8543740703010843, "learning_rate": 4.1112230099256576e-06, "loss": 0.1255, "step": 3046 }, { "epoch": 1.3862602365787078, "grad_norm": 1.6560439620970233, "learning_rate": 4.1106765146503735e-06, "loss": 0.1015, "step": 3047 }, { "epoch": 1.386715195632393, "grad_norm": 1.3456038126392524, "learning_rate": 4.110129887757878e-06, "loss": 0.0635, "step": 3048 }, { "epoch": 1.3871701546860782, "grad_norm": 1.5816088658143497, "learning_rate": 4.10958312929284e-06, "loss": 0.1239, "step": 3049 }, { "epoch": 1.3876251137397635, "grad_norm": 1.6794947140233902, "learning_rate": 4.1090362392999376e-06, "loss": 0.1089, "step": 3050 }, { "epoch": 1.3880800727934486, "grad_norm": 1.0729336590302643, "learning_rate": 4.108489217823859e-06, "loss": 0.0725, "step": 3051 }, { "epoch": 1.388535031847134, "grad_norm": 1.6756052232049428, "learning_rate": 4.107942064909306e-06, "loss": 0.0824, "step": 3052 }, { "epoch": 1.388989990900819, "grad_norm": 1.323475532657184, "learning_rate": 4.107394780600989e-06, "loss": 0.0586, "step": 3053 }, { "epoch": 1.389444949954504, "grad_norm": 1.9151869788887224, "learning_rate": 4.10684736494363e-06, "loss": 0.1133, "step": 3054 }, { "epoch": 1.3898999090081894, "grad_norm": 1.8061098813276502, "learning_rate": 4.10629981798196e-06, "loss": 0.0859, "step": 3055 }, { "epoch": 1.3903548680618745, "grad_norm": 1.7916810990267698, "learning_rate": 4.105752139760723e-06, "loss": 0.1029, "step": 3056 }, { "epoch": 1.3908098271155596, "grad_norm": 1.5329756786798103, "learning_rate": 4.105204330324673e-06, "loss": 0.0847, "step": 3057 }, { "epoch": 1.3912647861692449, "grad_norm": 1.3370514274112406, "learning_rate": 4.1046563897185736e-06, "loss": 0.0534, "step": 3058 }, { "epoch": 1.39171974522293, "grad_norm": 1.5092981508072156, "learning_rate": 4.104108317987201e-06, "loss": 0.0638, "step": 3059 }, { "epoch": 1.392174704276615, "grad_norm": 1.4184824051433425, "learning_rate": 4.103560115175341e-06, "loss": 0.121, "step": 3060 }, { "epoch": 1.3926296633303004, "grad_norm": 1.881972793096944, "learning_rate": 4.103011781327789e-06, "loss": 0.0879, "step": 3061 }, { "epoch": 1.3930846223839854, "grad_norm": 1.5365133515322453, "learning_rate": 4.102463316489354e-06, "loss": 0.0778, "step": 3062 }, { "epoch": 1.3935395814376705, "grad_norm": 2.0627555185565054, "learning_rate": 4.101914720704854e-06, "loss": 0.1263, "step": 3063 }, { "epoch": 1.3939945404913558, "grad_norm": 2.0049528831486643, "learning_rate": 4.101365994019116e-06, "loss": 0.1094, "step": 3064 }, { "epoch": 1.394449499545041, "grad_norm": 1.4090211096188048, "learning_rate": 4.100817136476981e-06, "loss": 0.159, "step": 3065 }, { "epoch": 1.394904458598726, "grad_norm": 1.705441955845772, "learning_rate": 4.1002681481233e-06, "loss": 0.0674, "step": 3066 }, { "epoch": 1.3953594176524113, "grad_norm": 1.5045760187094444, "learning_rate": 4.099719029002932e-06, "loss": 0.0727, "step": 3067 }, { "epoch": 1.3958143767060964, "grad_norm": 1.938718125172694, "learning_rate": 4.0991697791607485e-06, "loss": 0.1316, "step": 3068 }, { "epoch": 1.3962693357597815, "grad_norm": 2.009442315008649, "learning_rate": 4.098620398641633e-06, "loss": 0.0736, "step": 3069 }, { "epoch": 1.3967242948134668, "grad_norm": 1.4739945600438837, "learning_rate": 4.098070887490478e-06, "loss": 0.0946, "step": 3070 }, { "epoch": 1.397179253867152, "grad_norm": 1.3786062789475735, "learning_rate": 4.0975212457521865e-06, "loss": 0.038, "step": 3071 }, { "epoch": 1.397634212920837, "grad_norm": 1.6481154366038084, "learning_rate": 4.096971473471674e-06, "loss": 0.053, "step": 3072 }, { "epoch": 1.3980891719745223, "grad_norm": 1.5670127766477158, "learning_rate": 4.0964215706938635e-06, "loss": 0.1131, "step": 3073 }, { "epoch": 1.3985441310282074, "grad_norm": 1.871126644541074, "learning_rate": 4.0958715374636925e-06, "loss": 0.103, "step": 3074 }, { "epoch": 1.3989990900818925, "grad_norm": 1.2876342742017364, "learning_rate": 4.095321373826105e-06, "loss": 0.0483, "step": 3075 }, { "epoch": 1.3994540491355778, "grad_norm": 1.5282688364808739, "learning_rate": 4.094771079826061e-06, "loss": 0.0867, "step": 3076 }, { "epoch": 1.399909008189263, "grad_norm": 1.4859730684719616, "learning_rate": 4.094220655508525e-06, "loss": 0.0896, "step": 3077 }, { "epoch": 1.4003639672429482, "grad_norm": 2.000368133118769, "learning_rate": 4.0936701009184775e-06, "loss": 0.1337, "step": 3078 }, { "epoch": 1.4008189262966333, "grad_norm": 1.5985742852319313, "learning_rate": 4.0931194161009044e-06, "loss": 0.0943, "step": 3079 }, { "epoch": 1.4012738853503186, "grad_norm": 1.607575876046075, "learning_rate": 4.092568601100809e-06, "loss": 0.0884, "step": 3080 }, { "epoch": 1.4017288444040037, "grad_norm": 2.0593232455609485, "learning_rate": 4.092017655963199e-06, "loss": 0.0914, "step": 3081 }, { "epoch": 1.4021838034576888, "grad_norm": 1.8129033138852697, "learning_rate": 4.091466580733095e-06, "loss": 0.1029, "step": 3082 }, { "epoch": 1.402638762511374, "grad_norm": 1.4289262742824609, "learning_rate": 4.09091537545553e-06, "loss": 0.0868, "step": 3083 }, { "epoch": 1.4030937215650592, "grad_norm": 1.5132624763535472, "learning_rate": 4.090364040175545e-06, "loss": 0.069, "step": 3084 }, { "epoch": 1.4035486806187443, "grad_norm": 1.4092585916267564, "learning_rate": 4.089812574938192e-06, "loss": 0.1106, "step": 3085 }, { "epoch": 1.4040036396724296, "grad_norm": 1.4605436798270695, "learning_rate": 4.089260979788534e-06, "loss": 0.0933, "step": 3086 }, { "epoch": 1.4044585987261147, "grad_norm": 1.6229483425024653, "learning_rate": 4.088709254771648e-06, "loss": 0.0649, "step": 3087 }, { "epoch": 1.4049135577797998, "grad_norm": 1.4163321347380502, "learning_rate": 4.088157399932615e-06, "loss": 0.0708, "step": 3088 }, { "epoch": 1.405368516833485, "grad_norm": 1.2068467324968226, "learning_rate": 4.0876054153165314e-06, "loss": 0.0897, "step": 3089 }, { "epoch": 1.4058234758871702, "grad_norm": 2.0109067301545056, "learning_rate": 4.087053300968502e-06, "loss": 0.0956, "step": 3090 }, { "epoch": 1.4062784349408552, "grad_norm": 1.4968196153904874, "learning_rate": 4.086501056933646e-06, "loss": 0.0803, "step": 3091 }, { "epoch": 1.4067333939945406, "grad_norm": 1.4158603555039302, "learning_rate": 4.085948683257087e-06, "loss": 0.0833, "step": 3092 }, { "epoch": 1.4071883530482256, "grad_norm": 1.7754359285875605, "learning_rate": 4.085396179983963e-06, "loss": 0.1322, "step": 3093 }, { "epoch": 1.4076433121019107, "grad_norm": 1.751576660014664, "learning_rate": 4.084843547159424e-06, "loss": 0.1194, "step": 3094 }, { "epoch": 1.408098271155596, "grad_norm": 1.6044657598380159, "learning_rate": 4.0842907848286265e-06, "loss": 0.1068, "step": 3095 }, { "epoch": 1.4085532302092811, "grad_norm": 1.9137047485574095, "learning_rate": 4.083737893036741e-06, "loss": 0.0739, "step": 3096 }, { "epoch": 1.4090081892629662, "grad_norm": 1.237339009763652, "learning_rate": 4.083184871828947e-06, "loss": 0.1083, "step": 3097 }, { "epoch": 1.4094631483166515, "grad_norm": 1.388053925993269, "learning_rate": 4.0826317212504345e-06, "loss": 0.0547, "step": 3098 }, { "epoch": 1.4099181073703366, "grad_norm": 1.7664599542872024, "learning_rate": 4.0820784413464054e-06, "loss": 0.0817, "step": 3099 }, { "epoch": 1.4103730664240217, "grad_norm": 1.628966649714229, "learning_rate": 4.08152503216207e-06, "loss": 0.0973, "step": 3100 }, { "epoch": 1.410828025477707, "grad_norm": 1.436293717980754, "learning_rate": 4.080971493742652e-06, "loss": 0.0708, "step": 3101 }, { "epoch": 1.4112829845313921, "grad_norm": 2.0058758429805343, "learning_rate": 4.080417826133382e-06, "loss": 0.1232, "step": 3102 }, { "epoch": 1.4117379435850774, "grad_norm": 1.1522268625472838, "learning_rate": 4.079864029379506e-06, "loss": 0.1099, "step": 3103 }, { "epoch": 1.4121929026387625, "grad_norm": 1.6045577352071256, "learning_rate": 4.079310103526275e-06, "loss": 0.0589, "step": 3104 }, { "epoch": 1.4126478616924476, "grad_norm": 1.5887279031596504, "learning_rate": 4.0787560486189545e-06, "loss": 0.0748, "step": 3105 }, { "epoch": 1.413102820746133, "grad_norm": 1.5933721914255996, "learning_rate": 4.07820186470282e-06, "loss": 0.0974, "step": 3106 }, { "epoch": 1.413557779799818, "grad_norm": 1.558542137688861, "learning_rate": 4.077647551823155e-06, "loss": 0.0643, "step": 3107 }, { "epoch": 1.4140127388535033, "grad_norm": 1.4257475666094181, "learning_rate": 4.077093110025258e-06, "loss": 0.0752, "step": 3108 }, { "epoch": 1.4144676979071884, "grad_norm": 1.7509966093110312, "learning_rate": 4.076538539354433e-06, "loss": 0.0759, "step": 3109 }, { "epoch": 1.4149226569608735, "grad_norm": 1.6569357193334953, "learning_rate": 4.075983839855999e-06, "loss": 0.086, "step": 3110 }, { "epoch": 1.4153776160145588, "grad_norm": 1.6556214652539405, "learning_rate": 4.075429011575281e-06, "loss": 0.148, "step": 3111 }, { "epoch": 1.415832575068244, "grad_norm": 1.386912477817398, "learning_rate": 4.07487405455762e-06, "loss": 0.0681, "step": 3112 }, { "epoch": 1.416287534121929, "grad_norm": 1.3537829403552966, "learning_rate": 4.074318968848364e-06, "loss": 0.0647, "step": 3113 }, { "epoch": 1.4167424931756143, "grad_norm": 2.4261968022238896, "learning_rate": 4.073763754492871e-06, "loss": 0.1206, "step": 3114 }, { "epoch": 1.4171974522292994, "grad_norm": 1.337628734579206, "learning_rate": 4.07320841153651e-06, "loss": 0.0921, "step": 3115 }, { "epoch": 1.4176524112829845, "grad_norm": 1.7332427724831225, "learning_rate": 4.072652940024664e-06, "loss": 0.082, "step": 3116 }, { "epoch": 1.4181073703366698, "grad_norm": 1.6295901485452224, "learning_rate": 4.07209734000272e-06, "loss": 0.0905, "step": 3117 }, { "epoch": 1.4185623293903549, "grad_norm": 1.6997544451783175, "learning_rate": 4.071541611516082e-06, "loss": 0.0859, "step": 3118 }, { "epoch": 1.41901728844404, "grad_norm": 1.8300170633413646, "learning_rate": 4.0709857546101605e-06, "loss": 0.1308, "step": 3119 }, { "epoch": 1.4194722474977253, "grad_norm": 1.3981884700810239, "learning_rate": 4.0704297693303775e-06, "loss": 0.1278, "step": 3120 }, { "epoch": 1.4199272065514104, "grad_norm": 1.64654137499224, "learning_rate": 4.0698736557221655e-06, "loss": 0.182, "step": 3121 }, { "epoch": 1.4203821656050954, "grad_norm": 1.974429504756398, "learning_rate": 4.069317413830968e-06, "loss": 0.1191, "step": 3122 }, { "epoch": 1.4208371246587808, "grad_norm": 1.6716798129461754, "learning_rate": 4.068761043702237e-06, "loss": 0.0721, "step": 3123 }, { "epoch": 1.4212920837124658, "grad_norm": 1.7117987338988347, "learning_rate": 4.06820454538144e-06, "loss": 0.1008, "step": 3124 }, { "epoch": 1.421747042766151, "grad_norm": 1.6152542759665993, "learning_rate": 4.067647918914049e-06, "loss": 0.0761, "step": 3125 }, { "epoch": 1.4222020018198362, "grad_norm": 1.334134782050915, "learning_rate": 4.067091164345549e-06, "loss": 0.0685, "step": 3126 }, { "epoch": 1.4226569608735213, "grad_norm": 1.4606109664867182, "learning_rate": 4.066534281721437e-06, "loss": 0.0606, "step": 3127 }, { "epoch": 1.4231119199272064, "grad_norm": 1.5806747402668917, "learning_rate": 4.065977271087216e-06, "loss": 0.1329, "step": 3128 }, { "epoch": 1.4235668789808917, "grad_norm": 1.5691214419378425, "learning_rate": 4.065420132488406e-06, "loss": 0.0737, "step": 3129 }, { "epoch": 1.4240218380345768, "grad_norm": 1.8375133989154135, "learning_rate": 4.064862865970531e-06, "loss": 0.1249, "step": 3130 }, { "epoch": 1.4244767970882621, "grad_norm": 1.3802242599504997, "learning_rate": 4.064305471579131e-06, "loss": 0.0999, "step": 3131 }, { "epoch": 1.4249317561419472, "grad_norm": 1.541194698082713, "learning_rate": 4.063747949359751e-06, "loss": 0.0918, "step": 3132 }, { "epoch": 1.4253867151956323, "grad_norm": 1.6911608564123464, "learning_rate": 4.063190299357951e-06, "loss": 0.1007, "step": 3133 }, { "epoch": 1.4258416742493176, "grad_norm": 1.3418125091033966, "learning_rate": 4.062632521619298e-06, "loss": 0.09, "step": 3134 }, { "epoch": 1.4262966333030027, "grad_norm": 1.3817465718776452, "learning_rate": 4.0620746161893736e-06, "loss": 0.0745, "step": 3135 }, { "epoch": 1.426751592356688, "grad_norm": 2.025133612388266, "learning_rate": 4.061516583113765e-06, "loss": 0.1207, "step": 3136 }, { "epoch": 1.4272065514103731, "grad_norm": 1.171825574188528, "learning_rate": 4.060958422438073e-06, "loss": 0.11, "step": 3137 }, { "epoch": 1.4276615104640582, "grad_norm": 1.3442150387549336, "learning_rate": 4.060400134207908e-06, "loss": 0.0754, "step": 3138 }, { "epoch": 1.4281164695177435, "grad_norm": 1.5912738318144795, "learning_rate": 4.05984171846889e-06, "loss": 0.0954, "step": 3139 }, { "epoch": 1.4285714285714286, "grad_norm": 1.9917615223928906, "learning_rate": 4.059283175266652e-06, "loss": 0.1068, "step": 3140 }, { "epoch": 1.4290263876251137, "grad_norm": 2.089627319018694, "learning_rate": 4.058724504646834e-06, "loss": 0.0814, "step": 3141 }, { "epoch": 1.429481346678799, "grad_norm": 1.9116853282500395, "learning_rate": 4.058165706655089e-06, "loss": 0.0944, "step": 3142 }, { "epoch": 1.429936305732484, "grad_norm": 1.300828592702008, "learning_rate": 4.057606781337079e-06, "loss": 0.0621, "step": 3143 }, { "epoch": 1.4303912647861692, "grad_norm": 1.4206548674219437, "learning_rate": 4.057047728738477e-06, "loss": 0.0774, "step": 3144 }, { "epoch": 1.4308462238398545, "grad_norm": 1.9853334065977504, "learning_rate": 4.056488548904966e-06, "loss": 0.107, "step": 3145 }, { "epoch": 1.4313011828935396, "grad_norm": 1.2653239584479665, "learning_rate": 4.055929241882239e-06, "loss": 0.0873, "step": 3146 }, { "epoch": 1.4317561419472247, "grad_norm": 1.8362641844108591, "learning_rate": 4.0553698077160025e-06, "loss": 0.101, "step": 3147 }, { "epoch": 1.43221110100091, "grad_norm": 1.538922000402847, "learning_rate": 4.054810246451969e-06, "loss": 0.1462, "step": 3148 }, { "epoch": 1.432666060054595, "grad_norm": 1.394370905413465, "learning_rate": 4.054250558135862e-06, "loss": 0.0954, "step": 3149 }, { "epoch": 1.4331210191082802, "grad_norm": 1.7442058091518615, "learning_rate": 4.05369074281342e-06, "loss": 0.0706, "step": 3150 }, { "epoch": 1.4335759781619655, "grad_norm": 1.5707090338313843, "learning_rate": 4.053130800530387e-06, "loss": 0.0972, "step": 3151 }, { "epoch": 1.4340309372156506, "grad_norm": 2.103565057155518, "learning_rate": 4.052570731332518e-06, "loss": 0.1038, "step": 3152 }, { "epoch": 1.4344858962693356, "grad_norm": 2.735486066200119, "learning_rate": 4.0520105352655805e-06, "loss": 0.0684, "step": 3153 }, { "epoch": 1.434940855323021, "grad_norm": 1.9919135580810592, "learning_rate": 4.051450212375351e-06, "loss": 0.112, "step": 3154 }, { "epoch": 1.435395814376706, "grad_norm": 1.5645652275838957, "learning_rate": 4.050889762707616e-06, "loss": 0.0928, "step": 3155 }, { "epoch": 1.4358507734303911, "grad_norm": 1.419745702328624, "learning_rate": 4.050329186308173e-06, "loss": 0.0588, "step": 3156 }, { "epoch": 1.4363057324840764, "grad_norm": 1.8706736467530494, "learning_rate": 4.0497684832228305e-06, "loss": 0.0847, "step": 3157 }, { "epoch": 1.4367606915377615, "grad_norm": 1.3991493478098997, "learning_rate": 4.049207653497406e-06, "loss": 0.1132, "step": 3158 }, { "epoch": 1.4372156505914468, "grad_norm": 1.96309237415054, "learning_rate": 4.0486466971777295e-06, "loss": 0.1246, "step": 3159 }, { "epoch": 1.437670609645132, "grad_norm": 1.4189800401113921, "learning_rate": 4.048085614309638e-06, "loss": 0.102, "step": 3160 }, { "epoch": 1.438125568698817, "grad_norm": 2.2148729038713624, "learning_rate": 4.047524404938981e-06, "loss": 0.0628, "step": 3161 }, { "epoch": 1.4385805277525023, "grad_norm": 1.5171369293948123, "learning_rate": 4.046963069111617e-06, "loss": 0.0795, "step": 3162 }, { "epoch": 1.4390354868061874, "grad_norm": 1.539848482078195, "learning_rate": 4.046401606873419e-06, "loss": 0.114, "step": 3163 }, { "epoch": 1.4394904458598727, "grad_norm": 1.690492127124779, "learning_rate": 4.045840018270264e-06, "loss": 0.1139, "step": 3164 }, { "epoch": 1.4399454049135578, "grad_norm": 1.5947466110576605, "learning_rate": 4.045278303348044e-06, "loss": 0.1461, "step": 3165 }, { "epoch": 1.440400363967243, "grad_norm": 1.8127271461816068, "learning_rate": 4.044716462152659e-06, "loss": 0.0739, "step": 3166 }, { "epoch": 1.4408553230209282, "grad_norm": 1.3271929423122868, "learning_rate": 4.04415449473002e-06, "loss": 0.0656, "step": 3167 }, { "epoch": 1.4413102820746133, "grad_norm": 1.660108301629467, "learning_rate": 4.043592401126051e-06, "loss": 0.0903, "step": 3168 }, { "epoch": 1.4417652411282984, "grad_norm": 1.8848818971646046, "learning_rate": 4.043030181386681e-06, "loss": 0.1474, "step": 3169 }, { "epoch": 1.4422202001819837, "grad_norm": 1.5333568431709597, "learning_rate": 4.042467835557853e-06, "loss": 0.1043, "step": 3170 }, { "epoch": 1.4426751592356688, "grad_norm": 1.4859729034954354, "learning_rate": 4.0419053636855185e-06, "loss": 0.1088, "step": 3171 }, { "epoch": 1.443130118289354, "grad_norm": 1.2014801453878274, "learning_rate": 4.041342765815641e-06, "loss": 0.1019, "step": 3172 }, { "epoch": 1.4435850773430392, "grad_norm": 1.302761255939903, "learning_rate": 4.040780041994193e-06, "loss": 0.1004, "step": 3173 }, { "epoch": 1.4440400363967243, "grad_norm": 1.5519683046297188, "learning_rate": 4.040217192267159e-06, "loss": 0.116, "step": 3174 }, { "epoch": 1.4444949954504094, "grad_norm": 1.392075237134328, "learning_rate": 4.03965421668053e-06, "loss": 0.096, "step": 3175 }, { "epoch": 1.4449499545040947, "grad_norm": 1.7241092171311203, "learning_rate": 4.039091115280314e-06, "loss": 0.1131, "step": 3176 }, { "epoch": 1.4454049135577798, "grad_norm": 1.9159261352612456, "learning_rate": 4.038527888112521e-06, "loss": 0.1249, "step": 3177 }, { "epoch": 1.4458598726114649, "grad_norm": 1.3929813616257414, "learning_rate": 4.037964535223177e-06, "loss": 0.0844, "step": 3178 }, { "epoch": 1.4463148316651502, "grad_norm": 1.314578237454095, "learning_rate": 4.037401056658317e-06, "loss": 0.1089, "step": 3179 }, { "epoch": 1.4467697907188353, "grad_norm": 1.4374145711061501, "learning_rate": 4.036837452463985e-06, "loss": 0.0638, "step": 3180 }, { "epoch": 1.4472247497725204, "grad_norm": 1.5535123182873063, "learning_rate": 4.0362737226862356e-06, "loss": 0.1013, "step": 3181 }, { "epoch": 1.4476797088262057, "grad_norm": 1.4489834318203805, "learning_rate": 4.035709867371137e-06, "loss": 0.0562, "step": 3182 }, { "epoch": 1.4481346678798908, "grad_norm": 2.6312713569197106, "learning_rate": 4.035145886564763e-06, "loss": 0.095, "step": 3183 }, { "epoch": 1.4485896269335758, "grad_norm": 1.6466551304590673, "learning_rate": 4.0345817803132e-06, "loss": 0.078, "step": 3184 }, { "epoch": 1.4490445859872612, "grad_norm": 1.902175703161397, "learning_rate": 4.034017548662544e-06, "loss": 0.0839, "step": 3185 }, { "epoch": 1.4494995450409462, "grad_norm": 1.2961165460178068, "learning_rate": 4.033453191658901e-06, "loss": 0.1053, "step": 3186 }, { "epoch": 1.4499545040946316, "grad_norm": 1.708605331898319, "learning_rate": 4.032888709348388e-06, "loss": 0.1129, "step": 3187 }, { "epoch": 1.4504094631483166, "grad_norm": 2.203731263531537, "learning_rate": 4.032324101777132e-06, "loss": 0.1082, "step": 3188 }, { "epoch": 1.450864422202002, "grad_norm": 1.4635020148733482, "learning_rate": 4.03175936899127e-06, "loss": 0.0749, "step": 3189 }, { "epoch": 1.451319381255687, "grad_norm": 1.6588439091760636, "learning_rate": 4.031194511036951e-06, "loss": 0.0632, "step": 3190 }, { "epoch": 1.4517743403093721, "grad_norm": 1.276064754924485, "learning_rate": 4.0306295279603304e-06, "loss": 0.0753, "step": 3191 }, { "epoch": 1.4522292993630574, "grad_norm": 1.0801965501399753, "learning_rate": 4.030064419807578e-06, "loss": 0.0586, "step": 3192 }, { "epoch": 1.4526842584167425, "grad_norm": 1.4293396801109137, "learning_rate": 4.02949918662487e-06, "loss": 0.075, "step": 3193 }, { "epoch": 1.4531392174704276, "grad_norm": 2.099507341025243, "learning_rate": 4.028933828458396e-06, "loss": 0.1007, "step": 3194 }, { "epoch": 1.453594176524113, "grad_norm": 1.091000693206422, "learning_rate": 4.028368345354355e-06, "loss": 0.1004, "step": 3195 }, { "epoch": 1.454049135577798, "grad_norm": 1.6526334157466946, "learning_rate": 4.027802737358954e-06, "loss": 0.0839, "step": 3196 }, { "epoch": 1.4545040946314831, "grad_norm": 1.205937406414075, "learning_rate": 4.027237004518413e-06, "loss": 0.0422, "step": 3197 }, { "epoch": 1.4549590536851684, "grad_norm": 0.9718730876411333, "learning_rate": 4.02667114687896e-06, "loss": 0.0462, "step": 3198 }, { "epoch": 1.4554140127388535, "grad_norm": 1.491604600841857, "learning_rate": 4.026105164486836e-06, "loss": 0.0772, "step": 3199 }, { "epoch": 1.4558689717925386, "grad_norm": 1.9750221524098803, "learning_rate": 4.0255390573882904e-06, "loss": 0.0896, "step": 3200 }, { "epoch": 1.456323930846224, "grad_norm": 1.775358087020972, "learning_rate": 4.024972825629581e-06, "loss": 0.0957, "step": 3201 }, { "epoch": 1.456778889899909, "grad_norm": 1.9485194722635422, "learning_rate": 4.024406469256979e-06, "loss": 0.0959, "step": 3202 }, { "epoch": 1.457233848953594, "grad_norm": 1.4899991601552178, "learning_rate": 4.023839988316766e-06, "loss": 0.0804, "step": 3203 }, { "epoch": 1.4576888080072794, "grad_norm": 1.1836795583950777, "learning_rate": 4.02327338285523e-06, "loss": 0.0717, "step": 3204 }, { "epoch": 1.4581437670609645, "grad_norm": 1.3591841816683365, "learning_rate": 4.022706652918672e-06, "loss": 0.0725, "step": 3205 }, { "epoch": 1.4585987261146496, "grad_norm": 1.7081834748594866, "learning_rate": 4.022139798553404e-06, "loss": 0.1175, "step": 3206 }, { "epoch": 1.459053685168335, "grad_norm": 1.5400986572523532, "learning_rate": 4.021572819805744e-06, "loss": 0.0757, "step": 3207 }, { "epoch": 1.45950864422202, "grad_norm": 1.8774574181635106, "learning_rate": 4.021005716722025e-06, "loss": 0.0894, "step": 3208 }, { "epoch": 1.459963603275705, "grad_norm": 1.3654059330528672, "learning_rate": 4.020438489348587e-06, "loss": 0.0573, "step": 3209 }, { "epoch": 1.4604185623293904, "grad_norm": 1.5565927185612813, "learning_rate": 4.019871137731783e-06, "loss": 0.0652, "step": 3210 }, { "epoch": 1.4608735213830755, "grad_norm": 1.6803377603477627, "learning_rate": 4.019303661917973e-06, "loss": 0.0816, "step": 3211 }, { "epoch": 1.4613284804367606, "grad_norm": 1.5150441348420585, "learning_rate": 4.018736061953529e-06, "loss": 0.0692, "step": 3212 }, { "epoch": 1.4617834394904459, "grad_norm": 1.9244126742235879, "learning_rate": 4.018168337884832e-06, "loss": 0.1018, "step": 3213 }, { "epoch": 1.462238398544131, "grad_norm": 1.2657288397337036, "learning_rate": 4.017600489758275e-06, "loss": 0.0834, "step": 3214 }, { "epoch": 1.4626933575978163, "grad_norm": 1.874738284595696, "learning_rate": 4.017032517620259e-06, "loss": 0.0696, "step": 3215 }, { "epoch": 1.4631483166515014, "grad_norm": 1.800018602254352, "learning_rate": 4.016464421517197e-06, "loss": 0.0768, "step": 3216 }, { "epoch": 1.4636032757051867, "grad_norm": 1.9498817254017062, "learning_rate": 4.015896201495511e-06, "loss": 0.1065, "step": 3217 }, { "epoch": 1.4640582347588718, "grad_norm": 1.4179985906911379, "learning_rate": 4.015327857601632e-06, "loss": 0.0529, "step": 3218 }, { "epoch": 1.4645131938125568, "grad_norm": 4.360690958829557, "learning_rate": 4.014759389882004e-06, "loss": 0.2107, "step": 3219 }, { "epoch": 1.4649681528662422, "grad_norm": 1.3341853448488075, "learning_rate": 4.0141907983830794e-06, "loss": 0.1188, "step": 3220 }, { "epoch": 1.4654231119199272, "grad_norm": 1.4563227253729085, "learning_rate": 4.0136220831513205e-06, "loss": 0.0705, "step": 3221 }, { "epoch": 1.4658780709736123, "grad_norm": 1.3887811152187093, "learning_rate": 4.013053244233202e-06, "loss": 0.0706, "step": 3222 }, { "epoch": 1.4663330300272976, "grad_norm": 1.21226903108683, "learning_rate": 4.012484281675203e-06, "loss": 0.0368, "step": 3223 }, { "epoch": 1.4667879890809827, "grad_norm": 1.966132671906471, "learning_rate": 4.01191519552382e-06, "loss": 0.0864, "step": 3224 }, { "epoch": 1.4672429481346678, "grad_norm": 2.179132225544321, "learning_rate": 4.011345985825555e-06, "loss": 0.1693, "step": 3225 }, { "epoch": 1.4676979071883531, "grad_norm": 3.3476162663732927, "learning_rate": 4.010776652626921e-06, "loss": 0.0905, "step": 3226 }, { "epoch": 1.4681528662420382, "grad_norm": 1.785347864874849, "learning_rate": 4.010207195974441e-06, "loss": 0.1013, "step": 3227 }, { "epoch": 1.4686078252957233, "grad_norm": 2.075973514500278, "learning_rate": 4.00963761591465e-06, "loss": 0.1084, "step": 3228 }, { "epoch": 1.4690627843494086, "grad_norm": 1.6180343815860272, "learning_rate": 4.00906791249409e-06, "loss": 0.1205, "step": 3229 }, { "epoch": 1.4695177434030937, "grad_norm": 1.515943526633659, "learning_rate": 4.008498085759315e-06, "loss": 0.0911, "step": 3230 }, { "epoch": 1.4699727024567788, "grad_norm": 1.262244587345572, "learning_rate": 4.007928135756889e-06, "loss": 0.0748, "step": 3231 }, { "epoch": 1.4704276615104641, "grad_norm": 1.5344703709585463, "learning_rate": 4.007358062533386e-06, "loss": 0.1004, "step": 3232 }, { "epoch": 1.4708826205641492, "grad_norm": 1.5341816187650714, "learning_rate": 4.006787866135387e-06, "loss": 0.1169, "step": 3233 }, { "epoch": 1.4713375796178343, "grad_norm": 1.2425212919572375, "learning_rate": 4.006217546609491e-06, "loss": 0.0898, "step": 3234 }, { "epoch": 1.4717925386715196, "grad_norm": 1.544414549225434, "learning_rate": 4.005647104002298e-06, "loss": 0.0711, "step": 3235 }, { "epoch": 1.4722474977252047, "grad_norm": 1.3335415606143304, "learning_rate": 4.005076538360424e-06, "loss": 0.1061, "step": 3236 }, { "epoch": 1.4727024567788898, "grad_norm": 1.4343888932492248, "learning_rate": 4.00450584973049e-06, "loss": 0.0776, "step": 3237 }, { "epoch": 1.473157415832575, "grad_norm": 1.217121406367356, "learning_rate": 4.003935038159134e-06, "loss": 0.057, "step": 3238 }, { "epoch": 1.4736123748862602, "grad_norm": 1.7387144880684953, "learning_rate": 4.003364103692998e-06, "loss": 0.0905, "step": 3239 }, { "epoch": 1.4740673339399453, "grad_norm": 1.3664010434983416, "learning_rate": 4.002793046378736e-06, "loss": 0.0765, "step": 3240 }, { "epoch": 1.4745222929936306, "grad_norm": 1.5263498209289328, "learning_rate": 4.002221866263013e-06, "loss": 0.1211, "step": 3241 }, { "epoch": 1.4749772520473157, "grad_norm": 1.3929977109518552, "learning_rate": 4.001650563392504e-06, "loss": 0.1048, "step": 3242 }, { "epoch": 1.475432211101001, "grad_norm": 1.8238205359217272, "learning_rate": 4.001079137813892e-06, "loss": 0.133, "step": 3243 }, { "epoch": 1.475887170154686, "grad_norm": 1.6094595438564285, "learning_rate": 4.00050758957387e-06, "loss": 0.1176, "step": 3244 }, { "epoch": 1.4763421292083714, "grad_norm": 1.5873590393381076, "learning_rate": 3.999935918719146e-06, "loss": 0.1378, "step": 3245 }, { "epoch": 1.4767970882620565, "grad_norm": 1.6386671305090232, "learning_rate": 3.999364125296432e-06, "loss": 0.0997, "step": 3246 }, { "epoch": 1.4772520473157416, "grad_norm": 1.6452548227777368, "learning_rate": 3.998792209352453e-06, "loss": 0.06, "step": 3247 }, { "epoch": 1.4777070063694269, "grad_norm": 1.5071735597341693, "learning_rate": 3.998220170933942e-06, "loss": 0.0965, "step": 3248 }, { "epoch": 1.478161965423112, "grad_norm": 1.3815536936491717, "learning_rate": 3.997648010087645e-06, "loss": 0.0598, "step": 3249 }, { "epoch": 1.478616924476797, "grad_norm": 1.4786753982246783, "learning_rate": 3.997075726860316e-06, "loss": 0.0791, "step": 3250 }, { "epoch": 1.4790718835304824, "grad_norm": 1.3858071987182161, "learning_rate": 3.996503321298719e-06, "loss": 0.0932, "step": 3251 }, { "epoch": 1.4795268425841674, "grad_norm": 1.1675051257480176, "learning_rate": 3.995930793449629e-06, "loss": 0.1272, "step": 3252 }, { "epoch": 1.4799818016378525, "grad_norm": 1.3490664464688793, "learning_rate": 3.995358143359831e-06, "loss": 0.0973, "step": 3253 }, { "epoch": 1.4804367606915378, "grad_norm": 1.4253312495370847, "learning_rate": 3.994785371076118e-06, "loss": 0.081, "step": 3254 }, { "epoch": 1.480891719745223, "grad_norm": 1.3971442617676086, "learning_rate": 3.994212476645294e-06, "loss": 0.0865, "step": 3255 }, { "epoch": 1.481346678798908, "grad_norm": 1.59804080949228, "learning_rate": 3.993639460114175e-06, "loss": 0.1345, "step": 3256 }, { "epoch": 1.4818016378525933, "grad_norm": 1.4716555385802492, "learning_rate": 3.9930663215295845e-06, "loss": 0.1173, "step": 3257 }, { "epoch": 1.4822565969062784, "grad_norm": 2.051698628510056, "learning_rate": 3.992493060938357e-06, "loss": 0.1538, "step": 3258 }, { "epoch": 1.4827115559599635, "grad_norm": 1.9477942963778765, "learning_rate": 3.991919678387336e-06, "loss": 0.1152, "step": 3259 }, { "epoch": 1.4831665150136488, "grad_norm": 1.5246176657931476, "learning_rate": 3.991346173923378e-06, "loss": 0.0769, "step": 3260 }, { "epoch": 1.483621474067334, "grad_norm": 1.1010558210029304, "learning_rate": 3.990772547593342e-06, "loss": 0.0364, "step": 3261 }, { "epoch": 1.484076433121019, "grad_norm": 1.886940929857769, "learning_rate": 3.990198799444109e-06, "loss": 0.1441, "step": 3262 }, { "epoch": 1.4845313921747043, "grad_norm": 1.7683715628963836, "learning_rate": 3.989624929522558e-06, "loss": 0.1106, "step": 3263 }, { "epoch": 1.4849863512283894, "grad_norm": 1.4844448636046614, "learning_rate": 3.989050937875586e-06, "loss": 0.0732, "step": 3264 }, { "epoch": 1.4854413102820745, "grad_norm": 1.345268568484509, "learning_rate": 3.988476824550095e-06, "loss": 0.0938, "step": 3265 }, { "epoch": 1.4858962693357598, "grad_norm": 2.2460525845061006, "learning_rate": 3.9879025895930005e-06, "loss": 0.104, "step": 3266 }, { "epoch": 1.486351228389445, "grad_norm": 1.5488378412465755, "learning_rate": 3.987328233051225e-06, "loss": 0.091, "step": 3267 }, { "epoch": 1.4868061874431302, "grad_norm": 1.9765818332905905, "learning_rate": 3.986753754971703e-06, "loss": 0.0706, "step": 3268 }, { "epoch": 1.4872611464968153, "grad_norm": 1.407359635172796, "learning_rate": 3.986179155401379e-06, "loss": 0.0895, "step": 3269 }, { "epoch": 1.4877161055505004, "grad_norm": 1.2924518497228823, "learning_rate": 3.985604434387206e-06, "loss": 0.1248, "step": 3270 }, { "epoch": 1.4881710646041857, "grad_norm": 1.5218889950988561, "learning_rate": 3.985029591976147e-06, "loss": 0.0792, "step": 3271 }, { "epoch": 1.4886260236578708, "grad_norm": 2.3110921197610805, "learning_rate": 3.984454628215176e-06, "loss": 0.0989, "step": 3272 }, { "epoch": 1.489080982711556, "grad_norm": 1.9491858087176606, "learning_rate": 3.983879543151277e-06, "loss": 0.0928, "step": 3273 }, { "epoch": 1.4895359417652412, "grad_norm": 1.6692979048621384, "learning_rate": 3.9833043368314426e-06, "loss": 0.0889, "step": 3274 }, { "epoch": 1.4899909008189263, "grad_norm": 1.56162066388252, "learning_rate": 3.982729009302676e-06, "loss": 0.1381, "step": 3275 }, { "epoch": 1.4904458598726116, "grad_norm": 1.2397951537565872, "learning_rate": 3.982153560611991e-06, "loss": 0.1027, "step": 3276 }, { "epoch": 1.4909008189262967, "grad_norm": 1.9522892702016734, "learning_rate": 3.98157799080641e-06, "loss": 0.0887, "step": 3277 }, { "epoch": 1.4913557779799818, "grad_norm": 1.559559395869986, "learning_rate": 3.9810022999329675e-06, "loss": 0.1776, "step": 3278 }, { "epoch": 1.491810737033667, "grad_norm": 1.3405650302564898, "learning_rate": 3.980426488038703e-06, "loss": 0.0734, "step": 3279 }, { "epoch": 1.4922656960873522, "grad_norm": 1.7778119841931375, "learning_rate": 3.979850555170673e-06, "loss": 0.1076, "step": 3280 }, { "epoch": 1.4927206551410372, "grad_norm": 1.2773005309611862, "learning_rate": 3.979274501375939e-06, "loss": 0.0929, "step": 3281 }, { "epoch": 1.4931756141947226, "grad_norm": 1.759459718258714, "learning_rate": 3.978698326701573e-06, "loss": 0.1451, "step": 3282 }, { "epoch": 1.4936305732484076, "grad_norm": 1.837225128979886, "learning_rate": 3.978122031194657e-06, "loss": 0.064, "step": 3283 }, { "epoch": 1.4940855323020927, "grad_norm": 1.6288168152659654, "learning_rate": 3.977545614902284e-06, "loss": 0.0581, "step": 3284 }, { "epoch": 1.494540491355778, "grad_norm": 1.6347064947993442, "learning_rate": 3.976969077871555e-06, "loss": 0.1265, "step": 3285 }, { "epoch": 1.4949954504094631, "grad_norm": 1.7257432934585628, "learning_rate": 3.976392420149583e-06, "loss": 0.1044, "step": 3286 }, { "epoch": 1.4954504094631482, "grad_norm": 1.3411834725123, "learning_rate": 3.975815641783491e-06, "loss": 0.0448, "step": 3287 }, { "epoch": 1.4959053685168335, "grad_norm": 1.2133624205106701, "learning_rate": 3.975238742820409e-06, "loss": 0.0852, "step": 3288 }, { "epoch": 1.4963603275705186, "grad_norm": 1.6131434442831794, "learning_rate": 3.9746617233074785e-06, "loss": 0.0875, "step": 3289 }, { "epoch": 1.4968152866242037, "grad_norm": 1.4603037455397265, "learning_rate": 3.974084583291851e-06, "loss": 0.1127, "step": 3290 }, { "epoch": 1.497270245677889, "grad_norm": 1.5220114621495135, "learning_rate": 3.97350732282069e-06, "loss": 0.0764, "step": 3291 }, { "epoch": 1.4977252047315741, "grad_norm": 1.3285687828889672, "learning_rate": 3.9729299419411635e-06, "loss": 0.0698, "step": 3292 }, { "epoch": 1.4981801637852592, "grad_norm": 1.5352343736928473, "learning_rate": 3.972352440700455e-06, "loss": 0.0755, "step": 3293 }, { "epoch": 1.4986351228389445, "grad_norm": 1.953765065951385, "learning_rate": 3.971774819145753e-06, "loss": 0.1119, "step": 3294 }, { "epoch": 1.4990900818926296, "grad_norm": 2.2162517556115477, "learning_rate": 3.97119707732426e-06, "loss": 0.1076, "step": 3295 }, { "epoch": 1.499545040946315, "grad_norm": 1.3488060160482784, "learning_rate": 3.970619215283185e-06, "loss": 0.0798, "step": 3296 }, { "epoch": 1.5, "grad_norm": 2.2291310339686676, "learning_rate": 3.97004123306975e-06, "loss": 0.1155, "step": 3297 }, { "epoch": 1.5004549590536853, "grad_norm": 2.0199034395407893, "learning_rate": 3.969463130731183e-06, "loss": 0.1047, "step": 3298 }, { "epoch": 1.5009099181073702, "grad_norm": 1.412851027797723, "learning_rate": 3.968884908314725e-06, "loss": 0.0996, "step": 3299 }, { "epoch": 1.5013648771610555, "grad_norm": 1.4716286548852187, "learning_rate": 3.968306565867627e-06, "loss": 0.0785, "step": 3300 }, { "epoch": 1.5018198362147408, "grad_norm": 1.8587266849172353, "learning_rate": 3.967728103437146e-06, "loss": 0.0721, "step": 3301 }, { "epoch": 1.5022747952684259, "grad_norm": 1.4680358159782123, "learning_rate": 3.967149521070554e-06, "loss": 0.1017, "step": 3302 }, { "epoch": 1.502729754322111, "grad_norm": 1.557907912714375, "learning_rate": 3.966570818815126e-06, "loss": 0.0774, "step": 3303 }, { "epoch": 1.5031847133757963, "grad_norm": 1.681251325177101, "learning_rate": 3.965991996718156e-06, "loss": 0.0654, "step": 3304 }, { "epoch": 1.5036396724294814, "grad_norm": 1.8951605931520792, "learning_rate": 3.965413054826941e-06, "loss": 0.0831, "step": 3305 }, { "epoch": 1.5040946314831665, "grad_norm": 1.669920771136764, "learning_rate": 3.964833993188787e-06, "loss": 0.0738, "step": 3306 }, { "epoch": 1.5045495905368518, "grad_norm": 1.7766127925336817, "learning_rate": 3.964254811851015e-06, "loss": 0.118, "step": 3307 }, { "epoch": 1.5050045495905369, "grad_norm": 1.4254952779308219, "learning_rate": 3.963675510860952e-06, "loss": 0.1715, "step": 3308 }, { "epoch": 1.505459508644222, "grad_norm": 1.6552266448857769, "learning_rate": 3.963096090265936e-06, "loss": 0.0892, "step": 3309 }, { "epoch": 1.5059144676979073, "grad_norm": 1.7629611200093136, "learning_rate": 3.962516550113316e-06, "loss": 0.1031, "step": 3310 }, { "epoch": 1.5063694267515924, "grad_norm": 1.5413136654540505, "learning_rate": 3.961936890450447e-06, "loss": 0.1095, "step": 3311 }, { "epoch": 1.5068243858052774, "grad_norm": 1.3404528549410122, "learning_rate": 3.961357111324697e-06, "loss": 0.1283, "step": 3312 }, { "epoch": 1.5072793448589628, "grad_norm": 1.5288298655346926, "learning_rate": 3.960777212783445e-06, "loss": 0.0612, "step": 3313 }, { "epoch": 1.5077343039126478, "grad_norm": 1.3541760948555617, "learning_rate": 3.960197194874075e-06, "loss": 0.0989, "step": 3314 }, { "epoch": 1.508189262966333, "grad_norm": 1.467441850315342, "learning_rate": 3.9596170576439844e-06, "loss": 0.0484, "step": 3315 }, { "epoch": 1.5086442220200182, "grad_norm": 1.0635005345830981, "learning_rate": 3.959036801140579e-06, "loss": 0.0644, "step": 3316 }, { "epoch": 1.5090991810737033, "grad_norm": 1.6984602789186973, "learning_rate": 3.958456425411275e-06, "loss": 0.099, "step": 3317 }, { "epoch": 1.5095541401273884, "grad_norm": 1.1353262002245295, "learning_rate": 3.9578759305035e-06, "loss": 0.0524, "step": 3318 }, { "epoch": 1.5100090991810737, "grad_norm": 1.6212070722592495, "learning_rate": 3.957295316464686e-06, "loss": 0.0702, "step": 3319 }, { "epoch": 1.5104640582347588, "grad_norm": 1.398683619014484, "learning_rate": 3.956714583342281e-06, "loss": 0.0929, "step": 3320 }, { "epoch": 1.510919017288444, "grad_norm": 1.8141454190519326, "learning_rate": 3.9561337311837365e-06, "loss": 0.0834, "step": 3321 }, { "epoch": 1.5113739763421292, "grad_norm": 1.4916852198406216, "learning_rate": 3.955552760036522e-06, "loss": 0.1171, "step": 3322 }, { "epoch": 1.5118289353958145, "grad_norm": 1.5197751813945395, "learning_rate": 3.9549716699481076e-06, "loss": 0.1458, "step": 3323 }, { "epoch": 1.5122838944494994, "grad_norm": 1.2998921324659878, "learning_rate": 3.954390460965979e-06, "loss": 0.0895, "step": 3324 }, { "epoch": 1.5127388535031847, "grad_norm": 1.8357920021025818, "learning_rate": 3.95380913313763e-06, "loss": 0.1251, "step": 3325 }, { "epoch": 1.51319381255687, "grad_norm": 1.5979320292376507, "learning_rate": 3.953227686510565e-06, "loss": 0.1195, "step": 3326 }, { "epoch": 1.5136487716105549, "grad_norm": 1.4000257166401557, "learning_rate": 3.9526461211322955e-06, "loss": 0.0617, "step": 3327 }, { "epoch": 1.5141037306642402, "grad_norm": 1.5796696504305936, "learning_rate": 3.9520644370503446e-06, "loss": 0.1333, "step": 3328 }, { "epoch": 1.5145586897179255, "grad_norm": 1.5610836258863345, "learning_rate": 3.951482634312246e-06, "loss": 0.0645, "step": 3329 }, { "epoch": 1.5150136487716106, "grad_norm": 1.7141919609331608, "learning_rate": 3.950900712965541e-06, "loss": 0.0812, "step": 3330 }, { "epoch": 1.5154686078252957, "grad_norm": 1.569455445967712, "learning_rate": 3.950318673057782e-06, "loss": 0.1142, "step": 3331 }, { "epoch": 1.515923566878981, "grad_norm": 1.4217515854178902, "learning_rate": 3.949736514636531e-06, "loss": 0.0716, "step": 3332 }, { "epoch": 1.516378525932666, "grad_norm": 1.690637072475791, "learning_rate": 3.949154237749358e-06, "loss": 0.0857, "step": 3333 }, { "epoch": 1.5168334849863512, "grad_norm": 1.5897206868241083, "learning_rate": 3.948571842443846e-06, "loss": 0.0768, "step": 3334 }, { "epoch": 1.5172884440400365, "grad_norm": 2.0070594498337253, "learning_rate": 3.947989328767585e-06, "loss": 0.0788, "step": 3335 }, { "epoch": 1.5177434030937216, "grad_norm": 1.7890514070769532, "learning_rate": 3.9474066967681744e-06, "loss": 0.1786, "step": 3336 }, { "epoch": 1.5181983621474067, "grad_norm": 1.9339795026913917, "learning_rate": 3.946823946493224e-06, "loss": 0.104, "step": 3337 }, { "epoch": 1.518653321201092, "grad_norm": 1.3670500409040025, "learning_rate": 3.946241077990356e-06, "loss": 0.0815, "step": 3338 }, { "epoch": 1.519108280254777, "grad_norm": 1.568227437911479, "learning_rate": 3.945658091307198e-06, "loss": 0.1082, "step": 3339 }, { "epoch": 1.5195632393084622, "grad_norm": 1.7331161632558227, "learning_rate": 3.9450749864913895e-06, "loss": 0.1867, "step": 3340 }, { "epoch": 1.5200181983621475, "grad_norm": 1.572093896598596, "learning_rate": 3.9444917635905784e-06, "loss": 0.0914, "step": 3341 }, { "epoch": 1.5204731574158326, "grad_norm": 1.6345203894010123, "learning_rate": 3.943908422652424e-06, "loss": 0.1048, "step": 3342 }, { "epoch": 1.5209281164695176, "grad_norm": 1.4370316569753983, "learning_rate": 3.943324963724594e-06, "loss": 0.0808, "step": 3343 }, { "epoch": 1.521383075523203, "grad_norm": 2.0007881836324, "learning_rate": 3.942741386854766e-06, "loss": 0.1317, "step": 3344 }, { "epoch": 1.521838034576888, "grad_norm": 2.021214775517931, "learning_rate": 3.942157692090627e-06, "loss": 0.085, "step": 3345 }, { "epoch": 1.5222929936305731, "grad_norm": 1.3333852410958642, "learning_rate": 3.941573879479874e-06, "loss": 0.0504, "step": 3346 }, { "epoch": 1.5227479526842584, "grad_norm": 1.7300411140963852, "learning_rate": 3.940989949070214e-06, "loss": 0.1181, "step": 3347 }, { "epoch": 1.5232029117379435, "grad_norm": 1.2403389180904716, "learning_rate": 3.940405900909362e-06, "loss": 0.0785, "step": 3348 }, { "epoch": 1.5236578707916286, "grad_norm": 1.6241432825917363, "learning_rate": 3.939821735045046e-06, "loss": 0.0941, "step": 3349 }, { "epoch": 1.524112829845314, "grad_norm": 1.2954395499887315, "learning_rate": 3.9392374515249986e-06, "loss": 0.1034, "step": 3350 }, { "epoch": 1.5245677888989992, "grad_norm": 1.2175652251296913, "learning_rate": 3.938653050396967e-06, "loss": 0.0946, "step": 3351 }, { "epoch": 1.525022747952684, "grad_norm": 1.5800092750013774, "learning_rate": 3.938068531708706e-06, "loss": 0.1018, "step": 3352 }, { "epoch": 1.5254777070063694, "grad_norm": 1.8189017669672445, "learning_rate": 3.937483895507977e-06, "loss": 0.0861, "step": 3353 }, { "epoch": 1.5259326660600547, "grad_norm": 2.961306623385503, "learning_rate": 3.936899141842556e-06, "loss": 0.1027, "step": 3354 }, { "epoch": 1.5263876251137396, "grad_norm": 1.4420197724592472, "learning_rate": 3.936314270760227e-06, "loss": 0.1025, "step": 3355 }, { "epoch": 1.526842584167425, "grad_norm": 1.2355496558137826, "learning_rate": 3.935729282308781e-06, "loss": 0.0717, "step": 3356 }, { "epoch": 1.5272975432211102, "grad_norm": 1.5118896760601945, "learning_rate": 3.935144176536023e-06, "loss": 0.1073, "step": 3357 }, { "epoch": 1.5277525022747953, "grad_norm": 1.4887939134318973, "learning_rate": 3.934558953489763e-06, "loss": 0.0964, "step": 3358 }, { "epoch": 1.5282074613284804, "grad_norm": 1.8251541789074262, "learning_rate": 3.9339736132178245e-06, "loss": 0.0739, "step": 3359 }, { "epoch": 1.5286624203821657, "grad_norm": 1.4488744548354437, "learning_rate": 3.933388155768038e-06, "loss": 0.0827, "step": 3360 }, { "epoch": 1.5291173794358508, "grad_norm": 1.4692454556719072, "learning_rate": 3.932802581188243e-06, "loss": 0.0478, "step": 3361 }, { "epoch": 1.5295723384895359, "grad_norm": 1.6732466005884168, "learning_rate": 3.932216889526293e-06, "loss": 0.0812, "step": 3362 }, { "epoch": 1.5300272975432212, "grad_norm": 1.6890474994349833, "learning_rate": 3.931631080830046e-06, "loss": 0.0849, "step": 3363 }, { "epoch": 1.5304822565969063, "grad_norm": 2.346151751040408, "learning_rate": 3.931045155147373e-06, "loss": 0.1153, "step": 3364 }, { "epoch": 1.5309372156505914, "grad_norm": 1.4800726823296253, "learning_rate": 3.930459112526153e-06, "loss": 0.1139, "step": 3365 }, { "epoch": 1.5313921747042767, "grad_norm": 1.6230763372564965, "learning_rate": 3.929872953014272e-06, "loss": 0.0988, "step": 3366 }, { "epoch": 1.5318471337579618, "grad_norm": 1.6060779871092317, "learning_rate": 3.929286676659632e-06, "loss": 0.0921, "step": 3367 }, { "epoch": 1.5323020928116469, "grad_norm": 1.3793645721595265, "learning_rate": 3.92870028351014e-06, "loss": 0.047, "step": 3368 }, { "epoch": 1.5327570518653322, "grad_norm": 0.943707799239621, "learning_rate": 3.9281137736137105e-06, "loss": 0.096, "step": 3369 }, { "epoch": 1.5332120109190173, "grad_norm": 1.3718593148148275, "learning_rate": 3.927527147018275e-06, "loss": 0.0997, "step": 3370 }, { "epoch": 1.5336669699727024, "grad_norm": 1.1989379868923307, "learning_rate": 3.926940403771767e-06, "loss": 0.0861, "step": 3371 }, { "epoch": 1.5341219290263877, "grad_norm": 1.4591750992401513, "learning_rate": 3.926353543922133e-06, "loss": 0.0987, "step": 3372 }, { "epoch": 1.5345768880800728, "grad_norm": 1.6548656818632366, "learning_rate": 3.925766567517329e-06, "loss": 0.0623, "step": 3373 }, { "epoch": 1.5350318471337578, "grad_norm": 1.2640027754565126, "learning_rate": 3.925179474605319e-06, "loss": 0.0943, "step": 3374 }, { "epoch": 1.5354868061874432, "grad_norm": 1.2278135407756106, "learning_rate": 3.92459226523408e-06, "loss": 0.0788, "step": 3375 }, { "epoch": 1.5359417652411285, "grad_norm": 1.557537507164988, "learning_rate": 3.924004939451593e-06, "loss": 0.079, "step": 3376 }, { "epoch": 1.5363967242948133, "grad_norm": 1.3090439162150351, "learning_rate": 3.923417497305853e-06, "loss": 0.12, "step": 3377 }, { "epoch": 1.5368516833484986, "grad_norm": 1.5350536284615355, "learning_rate": 3.9228299388448645e-06, "loss": 0.0553, "step": 3378 }, { "epoch": 1.537306642402184, "grad_norm": 1.533368660860133, "learning_rate": 3.922242264116639e-06, "loss": 0.0901, "step": 3379 }, { "epoch": 1.5377616014558688, "grad_norm": 1.6850878998505332, "learning_rate": 3.921654473169198e-06, "loss": 0.1227, "step": 3380 }, { "epoch": 1.5382165605095541, "grad_norm": 2.0261242908503636, "learning_rate": 3.921066566050573e-06, "loss": 0.103, "step": 3381 }, { "epoch": 1.5386715195632394, "grad_norm": 1.556461991327618, "learning_rate": 3.920478542808806e-06, "loss": 0.1363, "step": 3382 }, { "epoch": 1.5391264786169245, "grad_norm": 1.6027756915069185, "learning_rate": 3.919890403491947e-06, "loss": 0.0645, "step": 3383 }, { "epoch": 1.5395814376706096, "grad_norm": 1.5890407860383862, "learning_rate": 3.919302148148057e-06, "loss": 0.0989, "step": 3384 }, { "epoch": 1.540036396724295, "grad_norm": 1.4653645987896922, "learning_rate": 3.918713776825204e-06, "loss": 0.0693, "step": 3385 }, { "epoch": 1.54049135577798, "grad_norm": 1.4636776054425236, "learning_rate": 3.918125289571469e-06, "loss": 0.0718, "step": 3386 }, { "epoch": 1.540946314831665, "grad_norm": 1.5605390465037172, "learning_rate": 3.917536686434939e-06, "loss": 0.0684, "step": 3387 }, { "epoch": 1.5414012738853504, "grad_norm": 1.8754482128968593, "learning_rate": 3.916947967463713e-06, "loss": 0.1084, "step": 3388 }, { "epoch": 1.5418562329390355, "grad_norm": 1.4247155744871083, "learning_rate": 3.916359132705898e-06, "loss": 0.0602, "step": 3389 }, { "epoch": 1.5423111919927206, "grad_norm": 1.5975763313985631, "learning_rate": 3.91577018220961e-06, "loss": 0.1269, "step": 3390 }, { "epoch": 1.542766151046406, "grad_norm": 1.7832398521277804, "learning_rate": 3.9151811160229765e-06, "loss": 0.0901, "step": 3391 }, { "epoch": 1.543221110100091, "grad_norm": 1.321381959774309, "learning_rate": 3.914591934194134e-06, "loss": 0.076, "step": 3392 }, { "epoch": 1.543676069153776, "grad_norm": 1.5799847149196016, "learning_rate": 3.914002636771226e-06, "loss": 0.0791, "step": 3393 }, { "epoch": 1.5441310282074614, "grad_norm": 1.447330547602097, "learning_rate": 3.913413223802408e-06, "loss": 0.0769, "step": 3394 }, { "epoch": 1.5445859872611465, "grad_norm": 1.1233905720332678, "learning_rate": 3.912823695335845e-06, "loss": 0.0649, "step": 3395 }, { "epoch": 1.5450409463148316, "grad_norm": 1.9555845529348213, "learning_rate": 3.91223405141971e-06, "loss": 0.152, "step": 3396 }, { "epoch": 1.5454959053685169, "grad_norm": 1.9330573445427235, "learning_rate": 3.911644292102185e-06, "loss": 0.08, "step": 3397 }, { "epoch": 1.545950864422202, "grad_norm": 1.7497524430824924, "learning_rate": 3.911054417431465e-06, "loss": 0.0937, "step": 3398 }, { "epoch": 1.546405823475887, "grad_norm": 1.2986190511733442, "learning_rate": 3.9104644274557485e-06, "loss": 0.0634, "step": 3399 }, { "epoch": 1.5468607825295724, "grad_norm": 1.750640369765871, "learning_rate": 3.909874322223249e-06, "loss": 0.1496, "step": 3400 }, { "epoch": 1.5473157415832575, "grad_norm": 1.3258256876502408, "learning_rate": 3.909284101782187e-06, "loss": 0.0974, "step": 3401 }, { "epoch": 1.5477707006369426, "grad_norm": 1.1499611293594922, "learning_rate": 3.908693766180792e-06, "loss": 0.1077, "step": 3402 }, { "epoch": 1.5482256596906279, "grad_norm": 1.3729862313863552, "learning_rate": 3.908103315467306e-06, "loss": 0.0631, "step": 3403 }, { "epoch": 1.5486806187443132, "grad_norm": 1.6135716564982725, "learning_rate": 3.907512749689973e-06, "loss": 0.1298, "step": 3404 }, { "epoch": 1.549135577797998, "grad_norm": 1.4042663512191482, "learning_rate": 3.906922068897057e-06, "loss": 0.1337, "step": 3405 }, { "epoch": 1.5495905368516834, "grad_norm": 1.2647295236789304, "learning_rate": 3.906331273136822e-06, "loss": 0.081, "step": 3406 }, { "epoch": 1.5500454959053687, "grad_norm": 1.336174501862538, "learning_rate": 3.905740362457546e-06, "loss": 0.0745, "step": 3407 }, { "epoch": 1.5505004549590535, "grad_norm": 1.2913965807095413, "learning_rate": 3.905149336907516e-06, "loss": 0.0823, "step": 3408 }, { "epoch": 1.5509554140127388, "grad_norm": 1.8841879084330162, "learning_rate": 3.904558196535029e-06, "loss": 0.1459, "step": 3409 }, { "epoch": 1.5514103730664242, "grad_norm": 2.187087664089265, "learning_rate": 3.903966941388387e-06, "loss": 0.107, "step": 3410 }, { "epoch": 1.5518653321201092, "grad_norm": 2.210658140486813, "learning_rate": 3.9033755715159085e-06, "loss": 0.0714, "step": 3411 }, { "epoch": 1.5523202911737943, "grad_norm": 1.5515395346924976, "learning_rate": 3.902784086965915e-06, "loss": 0.0987, "step": 3412 }, { "epoch": 1.5527752502274796, "grad_norm": 1.5780590775203256, "learning_rate": 3.902192487786741e-06, "loss": 0.0936, "step": 3413 }, { "epoch": 1.5532302092811647, "grad_norm": 1.6368593735031614, "learning_rate": 3.9016007740267295e-06, "loss": 0.0707, "step": 3414 }, { "epoch": 1.5536851683348498, "grad_norm": 1.7378987945753075, "learning_rate": 3.901008945734232e-06, "loss": 0.1149, "step": 3415 }, { "epoch": 1.5541401273885351, "grad_norm": 2.1510345975591023, "learning_rate": 3.90041700295761e-06, "loss": 0.1328, "step": 3416 }, { "epoch": 1.5545950864422202, "grad_norm": 2.0370435337174984, "learning_rate": 3.899824945745236e-06, "loss": 0.1054, "step": 3417 }, { "epoch": 1.5550500454959053, "grad_norm": 1.2633068639961949, "learning_rate": 3.899232774145488e-06, "loss": 0.0548, "step": 3418 }, { "epoch": 1.5555050045495906, "grad_norm": 1.4942493125921321, "learning_rate": 3.898640488206756e-06, "loss": 0.0981, "step": 3419 }, { "epoch": 1.5559599636032757, "grad_norm": 1.5560696766813005, "learning_rate": 3.898048087977441e-06, "loss": 0.1112, "step": 3420 }, { "epoch": 1.5564149226569608, "grad_norm": 1.7970579684511379, "learning_rate": 3.89745557350595e-06, "loss": 0.089, "step": 3421 }, { "epoch": 1.556869881710646, "grad_norm": 1.6707892538090077, "learning_rate": 3.896862944840698e-06, "loss": 0.1019, "step": 3422 }, { "epoch": 1.5573248407643312, "grad_norm": 1.7044813853549112, "learning_rate": 3.896270202030116e-06, "loss": 0.1131, "step": 3423 }, { "epoch": 1.5577797998180163, "grad_norm": 1.31369346227832, "learning_rate": 3.895677345122638e-06, "loss": 0.0609, "step": 3424 }, { "epoch": 1.5582347588717016, "grad_norm": 1.5358502248119053, "learning_rate": 3.895084374166711e-06, "loss": 0.0999, "step": 3425 }, { "epoch": 1.5586897179253867, "grad_norm": 1.5278955490750785, "learning_rate": 3.894491289210788e-06, "loss": 0.0852, "step": 3426 }, { "epoch": 1.5591446769790718, "grad_norm": 2.0397572423807566, "learning_rate": 3.893898090303335e-06, "loss": 0.1029, "step": 3427 }, { "epoch": 1.559599636032757, "grad_norm": 1.7970625523759147, "learning_rate": 3.893304777492825e-06, "loss": 0.1206, "step": 3428 }, { "epoch": 1.5600545950864422, "grad_norm": 1.7308191704995441, "learning_rate": 3.89271135082774e-06, "loss": 0.0867, "step": 3429 }, { "epoch": 1.5605095541401273, "grad_norm": 1.632629676475227, "learning_rate": 3.892117810356574e-06, "loss": 0.0784, "step": 3430 }, { "epoch": 1.5609645131938126, "grad_norm": 1.4607484283676528, "learning_rate": 3.8915241561278265e-06, "loss": 0.0832, "step": 3431 }, { "epoch": 1.5614194722474979, "grad_norm": 1.4329441812014883, "learning_rate": 3.890930388190009e-06, "loss": 0.0715, "step": 3432 }, { "epoch": 1.5618744313011828, "grad_norm": 1.4969363218336595, "learning_rate": 3.890336506591642e-06, "loss": 0.1227, "step": 3433 }, { "epoch": 1.562329390354868, "grad_norm": 1.6002750259633243, "learning_rate": 3.889742511381254e-06, "loss": 0.0952, "step": 3434 }, { "epoch": 1.5627843494085534, "grad_norm": 2.2356628788433195, "learning_rate": 3.889148402607384e-06, "loss": 0.1224, "step": 3435 }, { "epoch": 1.5632393084622382, "grad_norm": 1.4989681923393605, "learning_rate": 3.88855418031858e-06, "loss": 0.0756, "step": 3436 }, { "epoch": 1.5636942675159236, "grad_norm": 1.948273601108927, "learning_rate": 3.887959844563399e-06, "loss": 0.081, "step": 3437 }, { "epoch": 1.5641492265696089, "grad_norm": 1.7407023652732652, "learning_rate": 3.887365395390407e-06, "loss": 0.1206, "step": 3438 }, { "epoch": 1.564604185623294, "grad_norm": 2.2811727140910287, "learning_rate": 3.886770832848181e-06, "loss": 0.1164, "step": 3439 }, { "epoch": 1.565059144676979, "grad_norm": 1.6563589961809235, "learning_rate": 3.886176156985305e-06, "loss": 0.1087, "step": 3440 }, { "epoch": 1.5655141037306644, "grad_norm": 1.49319450056387, "learning_rate": 3.885581367850373e-06, "loss": 0.0783, "step": 3441 }, { "epoch": 1.5659690627843494, "grad_norm": 1.5998867152895082, "learning_rate": 3.8849864654919885e-06, "loss": 0.0977, "step": 3442 }, { "epoch": 1.5664240218380345, "grad_norm": 1.677398929764452, "learning_rate": 3.884391449958765e-06, "loss": 0.0869, "step": 3443 }, { "epoch": 1.5668789808917198, "grad_norm": 1.754032797982339, "learning_rate": 3.883796321299325e-06, "loss": 0.1264, "step": 3444 }, { "epoch": 1.567333939945405, "grad_norm": 1.3980801051532858, "learning_rate": 3.8832010795622975e-06, "loss": 0.074, "step": 3445 }, { "epoch": 1.56778889899909, "grad_norm": 1.5184041647584323, "learning_rate": 3.882605724796324e-06, "loss": 0.0805, "step": 3446 }, { "epoch": 1.5682438580527753, "grad_norm": 1.4334767475210397, "learning_rate": 3.882010257050056e-06, "loss": 0.1379, "step": 3447 }, { "epoch": 1.5686988171064604, "grad_norm": 1.5565451887443413, "learning_rate": 3.88141467637215e-06, "loss": 0.0827, "step": 3448 }, { "epoch": 1.5691537761601455, "grad_norm": 1.3969171176004442, "learning_rate": 3.880818982811275e-06, "loss": 0.1072, "step": 3449 }, { "epoch": 1.5696087352138308, "grad_norm": 1.5562909950536974, "learning_rate": 3.880223176416108e-06, "loss": 0.0788, "step": 3450 }, { "epoch": 1.570063694267516, "grad_norm": 1.5988993653667938, "learning_rate": 3.879627257235337e-06, "loss": 0.1178, "step": 3451 }, { "epoch": 1.570518653321201, "grad_norm": 1.3736029016541802, "learning_rate": 3.8790312253176565e-06, "loss": 0.1337, "step": 3452 }, { "epoch": 1.5709736123748863, "grad_norm": 1.2613503643808541, "learning_rate": 3.878435080711772e-06, "loss": 0.0695, "step": 3453 }, { "epoch": 1.5714285714285714, "grad_norm": 1.4584985207068788, "learning_rate": 3.877838823466398e-06, "loss": 0.0872, "step": 3454 }, { "epoch": 1.5718835304822565, "grad_norm": 1.1617748754624213, "learning_rate": 3.8772424536302565e-06, "loss": 0.0385, "step": 3455 }, { "epoch": 1.5723384895359418, "grad_norm": 1.5527281868916427, "learning_rate": 3.876645971252082e-06, "loss": 0.0919, "step": 3456 }, { "epoch": 1.5727934485896269, "grad_norm": 1.2687653541406154, "learning_rate": 3.876049376380615e-06, "loss": 0.098, "step": 3457 }, { "epoch": 1.573248407643312, "grad_norm": 2.0731310400058183, "learning_rate": 3.875452669064609e-06, "loss": 0.1657, "step": 3458 }, { "epoch": 1.5737033666969973, "grad_norm": 1.5254327947978468, "learning_rate": 3.874855849352821e-06, "loss": 0.0859, "step": 3459 }, { "epoch": 1.5741583257506826, "grad_norm": 1.5218287766669762, "learning_rate": 3.874258917294021e-06, "loss": 0.0985, "step": 3460 }, { "epoch": 1.5746132848043675, "grad_norm": 1.5563807081111813, "learning_rate": 3.873661872936989e-06, "loss": 0.0934, "step": 3461 }, { "epoch": 1.5750682438580528, "grad_norm": 1.733411800397255, "learning_rate": 3.873064716330513e-06, "loss": 0.0866, "step": 3462 }, { "epoch": 1.575523202911738, "grad_norm": 1.5095280819260823, "learning_rate": 3.872467447523388e-06, "loss": 0.1183, "step": 3463 }, { "epoch": 1.575978161965423, "grad_norm": 2.4885694027870566, "learning_rate": 3.871870066564422e-06, "loss": 0.119, "step": 3464 }, { "epoch": 1.5764331210191083, "grad_norm": 1.6381002915955072, "learning_rate": 3.8712725735024295e-06, "loss": 0.0997, "step": 3465 }, { "epoch": 1.5768880800727936, "grad_norm": 1.5393242876777635, "learning_rate": 3.870674968386234e-06, "loss": 0.1066, "step": 3466 }, { "epoch": 1.5773430391264787, "grad_norm": 1.3626676894889325, "learning_rate": 3.87007725126467e-06, "loss": 0.0631, "step": 3467 }, { "epoch": 1.5777979981801638, "grad_norm": 1.7713775566907601, "learning_rate": 3.869479422186582e-06, "loss": 0.0556, "step": 3468 }, { "epoch": 1.578252957233849, "grad_norm": 1.5679383078906888, "learning_rate": 3.868881481200818e-06, "loss": 0.0662, "step": 3469 }, { "epoch": 1.5787079162875342, "grad_norm": 1.378322340336987, "learning_rate": 3.868283428356243e-06, "loss": 0.0923, "step": 3470 }, { "epoch": 1.5791628753412192, "grad_norm": 1.8602826408061313, "learning_rate": 3.8676852637017234e-06, "loss": 0.0651, "step": 3471 }, { "epoch": 1.5796178343949046, "grad_norm": 1.2842910081332937, "learning_rate": 3.867086987286141e-06, "loss": 0.1155, "step": 3472 }, { "epoch": 1.5800727934485896, "grad_norm": 1.3849153670505507, "learning_rate": 3.866488599158386e-06, "loss": 0.0952, "step": 3473 }, { "epoch": 1.5805277525022747, "grad_norm": 2.0685207098086513, "learning_rate": 3.865890099367351e-06, "loss": 0.0829, "step": 3474 }, { "epoch": 1.58098271155596, "grad_norm": 1.7124978025307944, "learning_rate": 3.865291487961946e-06, "loss": 0.0931, "step": 3475 }, { "epoch": 1.5814376706096451, "grad_norm": 1.6402558324796304, "learning_rate": 3.864692764991087e-06, "loss": 0.149, "step": 3476 }, { "epoch": 1.5818926296633302, "grad_norm": 1.1937144314369066, "learning_rate": 3.864093930503697e-06, "loss": 0.1434, "step": 3477 }, { "epoch": 1.5823475887170155, "grad_norm": 1.306600358502951, "learning_rate": 3.863494984548712e-06, "loss": 0.0999, "step": 3478 }, { "epoch": 1.5828025477707006, "grad_norm": 1.5824625528399092, "learning_rate": 3.862895927175074e-06, "loss": 0.1263, "step": 3479 }, { "epoch": 1.5832575068243857, "grad_norm": 1.7516919661536634, "learning_rate": 3.862296758431736e-06, "loss": 0.111, "step": 3480 }, { "epoch": 1.583712465878071, "grad_norm": 1.2305801723832064, "learning_rate": 3.861697478367658e-06, "loss": 0.0861, "step": 3481 }, { "epoch": 1.584167424931756, "grad_norm": 1.4370015931112743, "learning_rate": 3.8610980870318126e-06, "loss": 0.0664, "step": 3482 }, { "epoch": 1.5846223839854412, "grad_norm": 2.1749670468513758, "learning_rate": 3.860498584473178e-06, "loss": 0.0893, "step": 3483 }, { "epoch": 1.5850773430391265, "grad_norm": 1.335938819056283, "learning_rate": 3.859898970740743e-06, "loss": 0.0552, "step": 3484 }, { "epoch": 1.5855323020928116, "grad_norm": 1.5007416809250531, "learning_rate": 3.859299245883505e-06, "loss": 0.1123, "step": 3485 }, { "epoch": 1.5859872611464967, "grad_norm": 1.759426649516968, "learning_rate": 3.858699409950472e-06, "loss": 0.1221, "step": 3486 }, { "epoch": 1.586442220200182, "grad_norm": 1.8380583466673612, "learning_rate": 3.858099462990658e-06, "loss": 0.1213, "step": 3487 }, { "epoch": 1.5868971792538673, "grad_norm": 1.2695106757386403, "learning_rate": 3.857499405053089e-06, "loss": 0.1007, "step": 3488 }, { "epoch": 1.5873521383075522, "grad_norm": 1.3065748361992395, "learning_rate": 3.856899236186799e-06, "loss": 0.0783, "step": 3489 }, { "epoch": 1.5878070973612375, "grad_norm": 1.3496321662986939, "learning_rate": 3.856298956440832e-06, "loss": 0.0813, "step": 3490 }, { "epoch": 1.5882620564149228, "grad_norm": 1.5241463495968492, "learning_rate": 3.8556985658642395e-06, "loss": 0.0902, "step": 3491 }, { "epoch": 1.5887170154686077, "grad_norm": 1.31969300596435, "learning_rate": 3.855098064506081e-06, "loss": 0.051, "step": 3492 }, { "epoch": 1.589171974522293, "grad_norm": 1.244063836965919, "learning_rate": 3.85449745241543e-06, "loss": 0.0635, "step": 3493 }, { "epoch": 1.5896269335759783, "grad_norm": 1.6324467035434929, "learning_rate": 3.853896729641363e-06, "loss": 0.1146, "step": 3494 }, { "epoch": 1.5900818926296634, "grad_norm": 1.5650980777601415, "learning_rate": 3.853295896232969e-06, "loss": 0.1477, "step": 3495 }, { "epoch": 1.5905368516833485, "grad_norm": 1.678369650409693, "learning_rate": 3.852694952239347e-06, "loss": 0.1271, "step": 3496 }, { "epoch": 1.5909918107370338, "grad_norm": 1.6437967577015182, "learning_rate": 3.852093897709601e-06, "loss": 0.0569, "step": 3497 }, { "epoch": 1.5914467697907189, "grad_norm": 1.429437313630493, "learning_rate": 3.851492732692849e-06, "loss": 0.096, "step": 3498 }, { "epoch": 1.591901728844404, "grad_norm": 1.3192213297358595, "learning_rate": 3.8508914572382124e-06, "loss": 0.1122, "step": 3499 }, { "epoch": 1.5923566878980893, "grad_norm": 1.3798878892496924, "learning_rate": 3.850290071394828e-06, "loss": 0.0694, "step": 3500 }, { "epoch": 1.5928116469517744, "grad_norm": 1.683960914777888, "learning_rate": 3.8496885752118365e-06, "loss": 0.1397, "step": 3501 }, { "epoch": 1.5932666060054594, "grad_norm": 1.6045790266914162, "learning_rate": 3.849086968738389e-06, "loss": 0.1209, "step": 3502 }, { "epoch": 1.5937215650591448, "grad_norm": 1.7710089101521256, "learning_rate": 3.848485252023647e-06, "loss": 0.1092, "step": 3503 }, { "epoch": 1.5941765241128298, "grad_norm": 1.7153679747749317, "learning_rate": 3.847883425116781e-06, "loss": 0.0961, "step": 3504 }, { "epoch": 1.594631483166515, "grad_norm": 1.2913333028527068, "learning_rate": 3.8472814880669675e-06, "loss": 0.1116, "step": 3505 }, { "epoch": 1.5950864422202002, "grad_norm": 1.1551926049954213, "learning_rate": 3.8466794409233946e-06, "loss": 0.0489, "step": 3506 }, { "epoch": 1.5955414012738853, "grad_norm": 1.7259331368947677, "learning_rate": 3.846077283735261e-06, "loss": 0.0746, "step": 3507 }, { "epoch": 1.5959963603275704, "grad_norm": 1.3680081291052668, "learning_rate": 3.84547501655177e-06, "loss": 0.0675, "step": 3508 }, { "epoch": 1.5964513193812557, "grad_norm": 1.6159416623809166, "learning_rate": 3.844872639422136e-06, "loss": 0.0724, "step": 3509 }, { "epoch": 1.5969062784349408, "grad_norm": 1.3892366435345012, "learning_rate": 3.844270152395583e-06, "loss": 0.0648, "step": 3510 }, { "epoch": 1.597361237488626, "grad_norm": 1.6960975437321597, "learning_rate": 3.843667555521346e-06, "loss": 0.1079, "step": 3511 }, { "epoch": 1.5978161965423112, "grad_norm": 1.3397623671251953, "learning_rate": 3.843064848848662e-06, "loss": 0.0942, "step": 3512 }, { "epoch": 1.5982711555959963, "grad_norm": 2.025964211262628, "learning_rate": 3.842462032426784e-06, "loss": 0.1176, "step": 3513 }, { "epoch": 1.5987261146496814, "grad_norm": 1.3617208449580485, "learning_rate": 3.841859106304973e-06, "loss": 0.076, "step": 3514 }, { "epoch": 1.5991810737033667, "grad_norm": 1.4146551349391596, "learning_rate": 3.841256070532494e-06, "loss": 0.135, "step": 3515 }, { "epoch": 1.599636032757052, "grad_norm": 1.792039138290509, "learning_rate": 3.840652925158626e-06, "loss": 0.0954, "step": 3516 }, { "epoch": 1.6000909918107369, "grad_norm": 2.2445593461709357, "learning_rate": 3.840049670232656e-06, "loss": 0.1028, "step": 3517 }, { "epoch": 1.6005459508644222, "grad_norm": 1.3671828185164485, "learning_rate": 3.839446305803878e-06, "loss": 0.109, "step": 3518 }, { "epoch": 1.6010009099181075, "grad_norm": 1.4589813025532787, "learning_rate": 3.838842831921598e-06, "loss": 0.0565, "step": 3519 }, { "epoch": 1.6014558689717924, "grad_norm": 1.179003882602948, "learning_rate": 3.8382392486351265e-06, "loss": 0.1078, "step": 3520 }, { "epoch": 1.6019108280254777, "grad_norm": 1.3716227541359156, "learning_rate": 3.837635555993787e-06, "loss": 0.0916, "step": 3521 }, { "epoch": 1.602365787079163, "grad_norm": 1.4943470230778477, "learning_rate": 3.837031754046912e-06, "loss": 0.0837, "step": 3522 }, { "epoch": 1.602820746132848, "grad_norm": 1.3481941021782222, "learning_rate": 3.836427842843838e-06, "loss": 0.0984, "step": 3523 }, { "epoch": 1.6032757051865332, "grad_norm": 1.4341570931316712, "learning_rate": 3.835823822433918e-06, "loss": 0.107, "step": 3524 }, { "epoch": 1.6037306642402185, "grad_norm": 1.8015575511803747, "learning_rate": 3.835219692866506e-06, "loss": 0.076, "step": 3525 }, { "epoch": 1.6041856232939036, "grad_norm": 1.2341209324499218, "learning_rate": 3.834615454190972e-06, "loss": 0.0843, "step": 3526 }, { "epoch": 1.6046405823475887, "grad_norm": 1.5197405953661047, "learning_rate": 3.834011106456689e-06, "loss": 0.1205, "step": 3527 }, { "epoch": 1.605095541401274, "grad_norm": 1.9424552037594642, "learning_rate": 3.833406649713044e-06, "loss": 0.1151, "step": 3528 }, { "epoch": 1.605550500454959, "grad_norm": 1.5021974869937356, "learning_rate": 3.832802084009428e-06, "loss": 0.0973, "step": 3529 }, { "epoch": 1.6060054595086442, "grad_norm": 1.8266618663693819, "learning_rate": 3.832197409395245e-06, "loss": 0.0709, "step": 3530 }, { "epoch": 1.6064604185623295, "grad_norm": 1.6330948873636992, "learning_rate": 3.831592625919906e-06, "loss": 0.1113, "step": 3531 }, { "epoch": 1.6069153776160146, "grad_norm": 1.5099409182888164, "learning_rate": 3.830987733632831e-06, "loss": 0.0661, "step": 3532 }, { "epoch": 1.6073703366696996, "grad_norm": 1.1825004757785686, "learning_rate": 3.830382732583449e-06, "loss": 0.0665, "step": 3533 }, { "epoch": 1.607825295723385, "grad_norm": 1.471614492634342, "learning_rate": 3.829777622821198e-06, "loss": 0.0697, "step": 3534 }, { "epoch": 1.60828025477707, "grad_norm": 1.2408317472034625, "learning_rate": 3.8291724043955245e-06, "loss": 0.0687, "step": 3535 }, { "epoch": 1.6087352138307551, "grad_norm": 1.2781595060509734, "learning_rate": 3.828567077355885e-06, "loss": 0.0779, "step": 3536 }, { "epoch": 1.6091901728844404, "grad_norm": 1.204681792383495, "learning_rate": 3.827961641751744e-06, "loss": 0.0776, "step": 3537 }, { "epoch": 1.6096451319381255, "grad_norm": 1.3329068919170464, "learning_rate": 3.827356097632574e-06, "loss": 0.0978, "step": 3538 }, { "epoch": 1.6101000909918106, "grad_norm": 1.3404150745688894, "learning_rate": 3.826750445047859e-06, "loss": 0.0592, "step": 3539 }, { "epoch": 1.610555050045496, "grad_norm": 2.003077942872616, "learning_rate": 3.826144684047089e-06, "loss": 0.0835, "step": 3540 }, { "epoch": 1.6110100090991812, "grad_norm": 2.0366944254540496, "learning_rate": 3.825538814679763e-06, "loss": 0.1048, "step": 3541 }, { "epoch": 1.611464968152866, "grad_norm": 1.281809732925394, "learning_rate": 3.824932836995392e-06, "loss": 0.0874, "step": 3542 }, { "epoch": 1.6119199272065514, "grad_norm": 1.5363707502332304, "learning_rate": 3.8243267510434936e-06, "loss": 0.1034, "step": 3543 }, { "epoch": 1.6123748862602367, "grad_norm": 1.530716131378312, "learning_rate": 3.823720556873592e-06, "loss": 0.0768, "step": 3544 }, { "epoch": 1.6128298453139216, "grad_norm": 1.680861742559888, "learning_rate": 3.823114254535226e-06, "loss": 0.1131, "step": 3545 }, { "epoch": 1.613284804367607, "grad_norm": 1.24364403969975, "learning_rate": 3.8225078440779375e-06, "loss": 0.0813, "step": 3546 }, { "epoch": 1.6137397634212922, "grad_norm": 1.486698049343012, "learning_rate": 3.821901325551281e-06, "loss": 0.0791, "step": 3547 }, { "epoch": 1.6141947224749773, "grad_norm": 1.3137802252683117, "learning_rate": 3.821294699004816e-06, "loss": 0.0728, "step": 3548 }, { "epoch": 1.6146496815286624, "grad_norm": 1.19710727921573, "learning_rate": 3.820687964488117e-06, "loss": 0.0583, "step": 3549 }, { "epoch": 1.6151046405823477, "grad_norm": 1.6889290961263983, "learning_rate": 3.82008112205076e-06, "loss": 0.1124, "step": 3550 }, { "epoch": 1.6155595996360328, "grad_norm": 1.9215886981142773, "learning_rate": 3.819474171742336e-06, "loss": 0.0887, "step": 3551 }, { "epoch": 1.6160145586897179, "grad_norm": 1.318529851635924, "learning_rate": 3.8188671136124425e-06, "loss": 0.059, "step": 3552 }, { "epoch": 1.6164695177434032, "grad_norm": 0.918143884813545, "learning_rate": 3.818259947710683e-06, "loss": 0.0478, "step": 3553 }, { "epoch": 1.6169244767970883, "grad_norm": 1.3662516804229434, "learning_rate": 3.817652674086675e-06, "loss": 0.073, "step": 3554 }, { "epoch": 1.6173794358507734, "grad_norm": 1.4942942253913223, "learning_rate": 3.81704529279004e-06, "loss": 0.1281, "step": 3555 }, { "epoch": 1.6178343949044587, "grad_norm": 1.4094773367778046, "learning_rate": 3.816437803870412e-06, "loss": 0.0863, "step": 3556 }, { "epoch": 1.6182893539581438, "grad_norm": 1.6728232499683344, "learning_rate": 3.815830207377431e-06, "loss": 0.1052, "step": 3557 }, { "epoch": 1.6187443130118289, "grad_norm": 2.0382212771808716, "learning_rate": 3.815222503360748e-06, "loss": 0.1259, "step": 3558 }, { "epoch": 1.6191992720655142, "grad_norm": 1.5231685082168058, "learning_rate": 3.814614691870021e-06, "loss": 0.0961, "step": 3559 }, { "epoch": 1.6196542311191993, "grad_norm": 1.3739337832501564, "learning_rate": 3.814006772954919e-06, "loss": 0.1122, "step": 3560 }, { "epoch": 1.6201091901728844, "grad_norm": 1.474137218074661, "learning_rate": 3.8133987466651175e-06, "loss": 0.0699, "step": 3561 }, { "epoch": 1.6205641492265697, "grad_norm": 1.6614772023979796, "learning_rate": 3.8127906130503014e-06, "loss": 0.0847, "step": 3562 }, { "epoch": 1.6210191082802548, "grad_norm": 1.6403907090113856, "learning_rate": 3.8121823721601647e-06, "loss": 0.0642, "step": 3563 }, { "epoch": 1.6214740673339398, "grad_norm": 1.4606819068040637, "learning_rate": 3.8115740240444106e-06, "loss": 0.0686, "step": 3564 }, { "epoch": 1.6219290263876252, "grad_norm": 1.5335099118337074, "learning_rate": 3.81096556875275e-06, "loss": 0.0945, "step": 3565 }, { "epoch": 1.6223839854413102, "grad_norm": 1.4211439533946046, "learning_rate": 3.8103570063349034e-06, "loss": 0.0848, "step": 3566 }, { "epoch": 1.6228389444949953, "grad_norm": 2.05671220826532, "learning_rate": 3.8097483368406003e-06, "loss": 0.1393, "step": 3567 }, { "epoch": 1.6232939035486806, "grad_norm": 1.258941393822947, "learning_rate": 3.809139560319577e-06, "loss": 0.0921, "step": 3568 }, { "epoch": 1.623748862602366, "grad_norm": 1.3433169063353143, "learning_rate": 3.8085306768215812e-06, "loss": 0.0487, "step": 3569 }, { "epoch": 1.6242038216560508, "grad_norm": 1.8383611324170595, "learning_rate": 3.8079216863963675e-06, "loss": 0.0958, "step": 3570 }, { "epoch": 1.6246587807097361, "grad_norm": 1.1667592271810971, "learning_rate": 3.807312589093701e-06, "loss": 0.1073, "step": 3571 }, { "epoch": 1.6251137397634214, "grad_norm": 1.3037848687491307, "learning_rate": 3.806703384963353e-06, "loss": 0.0887, "step": 3572 }, { "epoch": 1.6255686988171063, "grad_norm": 2.1805764107753682, "learning_rate": 3.8060940740551056e-06, "loss": 0.1175, "step": 3573 }, { "epoch": 1.6260236578707916, "grad_norm": 1.5290852074046848, "learning_rate": 3.8054846564187486e-06, "loss": 0.0791, "step": 3574 }, { "epoch": 1.626478616924477, "grad_norm": 1.3893969194477716, "learning_rate": 3.8048751321040806e-06, "loss": 0.101, "step": 3575 }, { "epoch": 1.626933575978162, "grad_norm": 1.9004304102737104, "learning_rate": 3.80426550116091e-06, "loss": 0.0928, "step": 3576 }, { "epoch": 1.627388535031847, "grad_norm": 1.6915230381144057, "learning_rate": 3.8036557636390527e-06, "loss": 0.0766, "step": 3577 }, { "epoch": 1.6278434940855324, "grad_norm": 1.0514486886395418, "learning_rate": 3.803045919588333e-06, "loss": 0.0664, "step": 3578 }, { "epoch": 1.6282984531392175, "grad_norm": 1.4234863878823187, "learning_rate": 3.8024359690585856e-06, "loss": 0.0721, "step": 3579 }, { "epoch": 1.6287534121929026, "grad_norm": 1.2609891905732606, "learning_rate": 3.8018259120996527e-06, "loss": 0.0872, "step": 3580 }, { "epoch": 1.629208371246588, "grad_norm": 1.5850653039807274, "learning_rate": 3.8012157487613853e-06, "loss": 0.0805, "step": 3581 }, { "epoch": 1.629663330300273, "grad_norm": 1.832485082888465, "learning_rate": 3.800605479093643e-06, "loss": 0.1024, "step": 3582 }, { "epoch": 1.630118289353958, "grad_norm": 1.702654974463059, "learning_rate": 3.7999951031462946e-06, "loss": 0.0906, "step": 3583 }, { "epoch": 1.6305732484076434, "grad_norm": 1.474499334683554, "learning_rate": 3.7993846209692176e-06, "loss": 0.0962, "step": 3584 }, { "epoch": 1.6310282074613285, "grad_norm": 1.5086663161158875, "learning_rate": 3.798774032612297e-06, "loss": 0.0654, "step": 3585 }, { "epoch": 1.6314831665150136, "grad_norm": 1.0661038423136504, "learning_rate": 3.7981633381254266e-06, "loss": 0.0447, "step": 3586 }, { "epoch": 1.6319381255686989, "grad_norm": 1.4518424967983448, "learning_rate": 3.7975525375585115e-06, "loss": 0.0734, "step": 3587 }, { "epoch": 1.632393084622384, "grad_norm": 1.5501158169773566, "learning_rate": 3.7969416309614633e-06, "loss": 0.0898, "step": 3588 }, { "epoch": 1.632848043676069, "grad_norm": 1.965701815823085, "learning_rate": 3.796330618384201e-06, "loss": 0.1057, "step": 3589 }, { "epoch": 1.6333030027297544, "grad_norm": 1.3607658650755952, "learning_rate": 3.795719499876655e-06, "loss": 0.0854, "step": 3590 }, { "epoch": 1.6337579617834395, "grad_norm": 1.8733053222408997, "learning_rate": 3.7951082754887638e-06, "loss": 0.0631, "step": 3591 }, { "epoch": 1.6342129208371245, "grad_norm": 1.5841759160765763, "learning_rate": 3.7944969452704717e-06, "loss": 0.0909, "step": 3592 }, { "epoch": 1.6346678798908099, "grad_norm": 1.6571108055603234, "learning_rate": 3.7938855092717354e-06, "loss": 0.0949, "step": 3593 }, { "epoch": 1.635122838944495, "grad_norm": 1.2185150904807907, "learning_rate": 3.793273967542519e-06, "loss": 0.0617, "step": 3594 }, { "epoch": 1.63557779799818, "grad_norm": 1.9263453323265187, "learning_rate": 3.792662320132794e-06, "loss": 0.08, "step": 3595 }, { "epoch": 1.6360327570518653, "grad_norm": 1.2556800593788535, "learning_rate": 3.792050567092542e-06, "loss": 0.0778, "step": 3596 }, { "epoch": 1.6364877161055507, "grad_norm": 1.736889144215566, "learning_rate": 3.791438708471752e-06, "loss": 0.0772, "step": 3597 }, { "epoch": 1.6369426751592355, "grad_norm": 1.8149141305860381, "learning_rate": 3.7908267443204226e-06, "loss": 0.1436, "step": 3598 }, { "epoch": 1.6373976342129208, "grad_norm": 1.3848324381986992, "learning_rate": 3.7902146746885614e-06, "loss": 0.079, "step": 3599 }, { "epoch": 1.6378525932666061, "grad_norm": 1.8298322470086545, "learning_rate": 3.789602499626184e-06, "loss": 0.0781, "step": 3600 }, { "epoch": 1.638307552320291, "grad_norm": 1.2549328571889946, "learning_rate": 3.788990219183314e-06, "loss": 0.0726, "step": 3601 }, { "epoch": 1.6387625113739763, "grad_norm": 1.616673851884488, "learning_rate": 3.7883778334099842e-06, "loss": 0.0599, "step": 3602 }, { "epoch": 1.6392174704276616, "grad_norm": 1.249668977036354, "learning_rate": 3.7877653423562365e-06, "loss": 0.0679, "step": 3603 }, { "epoch": 1.6396724294813467, "grad_norm": 1.5394079113104815, "learning_rate": 3.787152746072119e-06, "loss": 0.0933, "step": 3604 }, { "epoch": 1.6401273885350318, "grad_norm": 2.0112093150040424, "learning_rate": 3.7865400446076933e-06, "loss": 0.0696, "step": 3605 }, { "epoch": 1.6405823475887171, "grad_norm": 1.4123374061993457, "learning_rate": 3.7859272380130248e-06, "loss": 0.0815, "step": 3606 }, { "epoch": 1.6410373066424022, "grad_norm": 1.0888289865717673, "learning_rate": 3.785314326338189e-06, "loss": 0.0783, "step": 3607 }, { "epoch": 1.6414922656960873, "grad_norm": 1.2122963409667613, "learning_rate": 3.784701309633272e-06, "loss": 0.095, "step": 3608 }, { "epoch": 1.6419472247497726, "grad_norm": 1.657921753313432, "learning_rate": 3.7840881879483647e-06, "loss": 0.0894, "step": 3609 }, { "epoch": 1.6424021838034577, "grad_norm": 1.6605883016878868, "learning_rate": 3.7834749613335704e-06, "loss": 0.0768, "step": 3610 }, { "epoch": 1.6428571428571428, "grad_norm": 1.3334818318847992, "learning_rate": 3.782861629838997e-06, "loss": 0.1419, "step": 3611 }, { "epoch": 1.643312101910828, "grad_norm": 1.5204400934596147, "learning_rate": 3.782248193514766e-06, "loss": 0.1012, "step": 3612 }, { "epoch": 1.6437670609645132, "grad_norm": 1.6496922807036696, "learning_rate": 3.7816346524110027e-06, "loss": 0.1082, "step": 3613 }, { "epoch": 1.6442220200181983, "grad_norm": 1.2662124668547772, "learning_rate": 3.781021006577843e-06, "loss": 0.0637, "step": 3614 }, { "epoch": 1.6446769790718836, "grad_norm": 1.7141874121772738, "learning_rate": 3.780407256065432e-06, "loss": 0.104, "step": 3615 }, { "epoch": 1.6451319381255687, "grad_norm": 1.5545726205652852, "learning_rate": 3.7797934009239224e-06, "loss": 0.0875, "step": 3616 }, { "epoch": 1.6455868971792538, "grad_norm": 1.2911635931331455, "learning_rate": 3.7791794412034756e-06, "loss": 0.0977, "step": 3617 }, { "epoch": 1.646041856232939, "grad_norm": 1.731192815563008, "learning_rate": 3.7785653769542613e-06, "loss": 0.0913, "step": 3618 }, { "epoch": 1.6464968152866242, "grad_norm": 1.2947485511224062, "learning_rate": 3.7779512082264586e-06, "loss": 0.0557, "step": 3619 }, { "epoch": 1.6469517743403093, "grad_norm": 1.383550116936231, "learning_rate": 3.777336935070255e-06, "loss": 0.1182, "step": 3620 }, { "epoch": 1.6474067333939946, "grad_norm": 1.8285538245973652, "learning_rate": 3.7767225575358434e-06, "loss": 0.1094, "step": 3621 }, { "epoch": 1.6478616924476797, "grad_norm": 1.7190702067378485, "learning_rate": 3.7761080756734318e-06, "loss": 0.0875, "step": 3622 }, { "epoch": 1.6483166515013647, "grad_norm": 1.4687009770206203, "learning_rate": 3.7754934895332306e-06, "loss": 0.1103, "step": 3623 }, { "epoch": 1.64877161055505, "grad_norm": 1.5519449686299407, "learning_rate": 3.7748787991654623e-06, "loss": 0.1308, "step": 3624 }, { "epoch": 1.6492265696087354, "grad_norm": 2.823910289422741, "learning_rate": 3.774264004620355e-06, "loss": 0.1048, "step": 3625 }, { "epoch": 1.6496815286624202, "grad_norm": 1.5445681796729505, "learning_rate": 3.7736491059481474e-06, "loss": 0.0567, "step": 3626 }, { "epoch": 1.6501364877161055, "grad_norm": 1.525642400249133, "learning_rate": 3.7730341031990873e-06, "loss": 0.1157, "step": 3627 }, { "epoch": 1.6505914467697909, "grad_norm": 2.0183237094876043, "learning_rate": 3.772418996423428e-06, "loss": 0.0826, "step": 3628 }, { "epoch": 1.6510464058234757, "grad_norm": 1.9302965609612186, "learning_rate": 3.7718037856714364e-06, "loss": 0.1091, "step": 3629 }, { "epoch": 1.651501364877161, "grad_norm": 1.1858043482204346, "learning_rate": 3.7711884709933823e-06, "loss": 0.069, "step": 3630 }, { "epoch": 1.6519563239308463, "grad_norm": 1.6924449291018344, "learning_rate": 3.7705730524395466e-06, "loss": 0.0985, "step": 3631 }, { "epoch": 1.6524112829845314, "grad_norm": 1.913108328172786, "learning_rate": 3.7699575300602188e-06, "loss": 0.0975, "step": 3632 }, { "epoch": 1.6528662420382165, "grad_norm": 1.750550939319706, "learning_rate": 3.7693419039056965e-06, "loss": 0.1034, "step": 3633 }, { "epoch": 1.6533212010919018, "grad_norm": 2.2636189463256655, "learning_rate": 3.768726174026287e-06, "loss": 0.0884, "step": 3634 }, { "epoch": 1.653776160145587, "grad_norm": 1.502911328693322, "learning_rate": 3.768110340472304e-06, "loss": 0.0685, "step": 3635 }, { "epoch": 1.654231119199272, "grad_norm": 1.601983215249293, "learning_rate": 3.7674944032940696e-06, "loss": 0.1085, "step": 3636 }, { "epoch": 1.6546860782529573, "grad_norm": 1.660467775091667, "learning_rate": 3.766878362541918e-06, "loss": 0.0897, "step": 3637 }, { "epoch": 1.6551410373066424, "grad_norm": 1.5658426079984442, "learning_rate": 3.7662622182661867e-06, "loss": 0.0627, "step": 3638 }, { "epoch": 1.6555959963603275, "grad_norm": 1.7243236232811525, "learning_rate": 3.7656459705172255e-06, "loss": 0.1599, "step": 3639 }, { "epoch": 1.6560509554140128, "grad_norm": 1.4633169452497403, "learning_rate": 3.7650296193453916e-06, "loss": 0.1015, "step": 3640 }, { "epoch": 1.656505914467698, "grad_norm": 1.5226584092590818, "learning_rate": 3.7644131648010494e-06, "loss": 0.1035, "step": 3641 }, { "epoch": 1.656960873521383, "grad_norm": 1.4079296739454745, "learning_rate": 3.7637966069345743e-06, "loss": 0.0664, "step": 3642 }, { "epoch": 1.6574158325750683, "grad_norm": 2.7222833819753984, "learning_rate": 3.7631799457963467e-06, "loss": 0.1111, "step": 3643 }, { "epoch": 1.6578707916287534, "grad_norm": 1.5982823104845094, "learning_rate": 3.7625631814367593e-06, "loss": 0.0656, "step": 3644 }, { "epoch": 1.6583257506824385, "grad_norm": 1.4481458140884786, "learning_rate": 3.7619463139062097e-06, "loss": 0.0926, "step": 3645 }, { "epoch": 1.6587807097361238, "grad_norm": 1.2001609957101869, "learning_rate": 3.761329343255107e-06, "loss": 0.0938, "step": 3646 }, { "epoch": 1.6592356687898089, "grad_norm": 1.6568540394635407, "learning_rate": 3.760712269533866e-06, "loss": 0.1051, "step": 3647 }, { "epoch": 1.659690627843494, "grad_norm": 1.9577315046631556, "learning_rate": 3.7600950927929116e-06, "loss": 0.112, "step": 3648 }, { "epoch": 1.6601455868971793, "grad_norm": 1.2119074026938559, "learning_rate": 3.759477813082677e-06, "loss": 0.081, "step": 3649 }, { "epoch": 1.6606005459508644, "grad_norm": 1.4227614277462064, "learning_rate": 3.7588604304536026e-06, "loss": 0.0954, "step": 3650 }, { "epoch": 1.6610555050045495, "grad_norm": 1.5208959259214818, "learning_rate": 3.75824294495614e-06, "loss": 0.0756, "step": 3651 }, { "epoch": 1.6615104640582348, "grad_norm": 1.9620903809540224, "learning_rate": 3.757625356640745e-06, "loss": 0.1111, "step": 3652 }, { "epoch": 1.66196542311192, "grad_norm": 2.488645943281837, "learning_rate": 3.757007665557886e-06, "loss": 0.1185, "step": 3653 }, { "epoch": 1.662420382165605, "grad_norm": 1.4390904853236535, "learning_rate": 3.7563898717580364e-06, "loss": 0.0898, "step": 3654 }, { "epoch": 1.6628753412192903, "grad_norm": 1.0195742663144811, "learning_rate": 3.755771975291681e-06, "loss": 0.0563, "step": 3655 }, { "epoch": 1.6633303002729756, "grad_norm": 1.7724917481639257, "learning_rate": 3.7551539762093103e-06, "loss": 0.0754, "step": 3656 }, { "epoch": 1.6637852593266604, "grad_norm": 1.4001782068682185, "learning_rate": 3.7545358745614246e-06, "loss": 0.0733, "step": 3657 }, { "epoch": 1.6642402183803457, "grad_norm": 1.3981768605917073, "learning_rate": 3.7539176703985338e-06, "loss": 0.0883, "step": 3658 }, { "epoch": 1.664695177434031, "grad_norm": 1.996233099344841, "learning_rate": 3.7532993637711524e-06, "loss": 0.1124, "step": 3659 }, { "epoch": 1.6651501364877161, "grad_norm": 1.341770798886997, "learning_rate": 3.7526809547298072e-06, "loss": 0.0462, "step": 3660 }, { "epoch": 1.6656050955414012, "grad_norm": 1.7299696748529716, "learning_rate": 3.752062443325032e-06, "loss": 0.116, "step": 3661 }, { "epoch": 1.6660600545950865, "grad_norm": 1.376930204936947, "learning_rate": 3.7514438296073678e-06, "loss": 0.0783, "step": 3662 }, { "epoch": 1.6665150136487716, "grad_norm": 1.065567658542791, "learning_rate": 3.7508251136273656e-06, "loss": 0.0679, "step": 3663 }, { "epoch": 1.6669699727024567, "grad_norm": 1.8184328545058193, "learning_rate": 3.7502062954355835e-06, "loss": 0.0918, "step": 3664 }, { "epoch": 1.667424931756142, "grad_norm": 1.6690415750134169, "learning_rate": 3.749587375082589e-06, "loss": 0.0821, "step": 3665 }, { "epoch": 1.6678798908098271, "grad_norm": 1.2789568357787622, "learning_rate": 3.7489683526189575e-06, "loss": 0.1001, "step": 3666 }, { "epoch": 1.6683348498635122, "grad_norm": 1.5649947712697578, "learning_rate": 3.7483492280952718e-06, "loss": 0.1129, "step": 3667 }, { "epoch": 1.6687898089171975, "grad_norm": 1.294140271102535, "learning_rate": 3.747730001562125e-06, "loss": 0.0674, "step": 3668 }, { "epoch": 1.6692447679708826, "grad_norm": 1.6650935759632473, "learning_rate": 3.747110673070117e-06, "loss": 0.1275, "step": 3669 }, { "epoch": 1.6696997270245677, "grad_norm": 1.5000268037908384, "learning_rate": 3.7464912426698568e-06, "loss": 0.0778, "step": 3670 }, { "epoch": 1.670154686078253, "grad_norm": 1.6257718815565154, "learning_rate": 3.7458717104119618e-06, "loss": 0.0751, "step": 3671 }, { "epoch": 1.670609645131938, "grad_norm": 1.7774772668460406, "learning_rate": 3.7452520763470567e-06, "loss": 0.0984, "step": 3672 }, { "epoch": 1.6710646041856232, "grad_norm": 1.4532703561265023, "learning_rate": 3.7446323405257755e-06, "loss": 0.1332, "step": 3673 }, { "epoch": 1.6715195632393085, "grad_norm": 1.3124845817330937, "learning_rate": 3.7440125029987593e-06, "loss": 0.072, "step": 3674 }, { "epoch": 1.6719745222929936, "grad_norm": 2.2038020944741032, "learning_rate": 3.7433925638166603e-06, "loss": 0.0973, "step": 3675 }, { "epoch": 1.6724294813466787, "grad_norm": 1.4317596429814727, "learning_rate": 3.742772523030136e-06, "loss": 0.1099, "step": 3676 }, { "epoch": 1.672884440400364, "grad_norm": 1.6164361152243525, "learning_rate": 3.742152380689853e-06, "loss": 0.1022, "step": 3677 }, { "epoch": 1.673339399454049, "grad_norm": 1.3395380315056304, "learning_rate": 3.7415321368464872e-06, "loss": 0.0541, "step": 3678 }, { "epoch": 1.6737943585077342, "grad_norm": 2.5052300287798066, "learning_rate": 3.740911791550722e-06, "loss": 0.0683, "step": 3679 }, { "epoch": 1.6742493175614195, "grad_norm": 1.4949822903789034, "learning_rate": 3.7402913448532493e-06, "loss": 0.1075, "step": 3680 }, { "epoch": 1.6747042766151048, "grad_norm": 1.4220208768996168, "learning_rate": 3.7396707968047676e-06, "loss": 0.0929, "step": 3681 }, { "epoch": 1.6751592356687897, "grad_norm": 1.6680349441551647, "learning_rate": 3.7390501474559883e-06, "loss": 0.111, "step": 3682 }, { "epoch": 1.675614194722475, "grad_norm": 1.5715722776577439, "learning_rate": 3.738429396857626e-06, "loss": 0.1475, "step": 3683 }, { "epoch": 1.6760691537761603, "grad_norm": 1.7376138155650749, "learning_rate": 3.7378085450604053e-06, "loss": 0.1191, "step": 3684 }, { "epoch": 1.6765241128298451, "grad_norm": 1.4148290534830361, "learning_rate": 3.7371875921150612e-06, "loss": 0.072, "step": 3685 }, { "epoch": 1.6769790718835305, "grad_norm": 1.4623075467074917, "learning_rate": 3.7365665380723335e-06, "loss": 0.0509, "step": 3686 }, { "epoch": 1.6774340309372158, "grad_norm": 1.3573812813454127, "learning_rate": 3.7359453829829734e-06, "loss": 0.0985, "step": 3687 }, { "epoch": 1.6778889899909009, "grad_norm": 1.5568897790670728, "learning_rate": 3.7353241268977373e-06, "loss": 0.1236, "step": 3688 }, { "epoch": 1.678343949044586, "grad_norm": 2.2940958630201886, "learning_rate": 3.734702769867393e-06, "loss": 0.0764, "step": 3689 }, { "epoch": 1.6787989080982713, "grad_norm": 1.3442482512840874, "learning_rate": 3.734081311942714e-06, "loss": 0.0978, "step": 3690 }, { "epoch": 1.6792538671519563, "grad_norm": 1.8138452784104404, "learning_rate": 3.733459753174482e-06, "loss": 0.0859, "step": 3691 }, { "epoch": 1.6797088262056414, "grad_norm": 1.4321187601359233, "learning_rate": 3.7328380936134904e-06, "loss": 0.0775, "step": 3692 }, { "epoch": 1.6801637852593267, "grad_norm": 1.41069236183624, "learning_rate": 3.732216333310537e-06, "loss": 0.0917, "step": 3693 }, { "epoch": 1.6806187443130118, "grad_norm": 1.9951250186444143, "learning_rate": 3.7315944723164297e-06, "loss": 0.082, "step": 3694 }, { "epoch": 1.681073703366697, "grad_norm": 1.6730991033350597, "learning_rate": 3.730972510681984e-06, "loss": 0.1247, "step": 3695 }, { "epoch": 1.6815286624203822, "grad_norm": 1.2653740300959873, "learning_rate": 3.7303504484580235e-06, "loss": 0.0612, "step": 3696 }, { "epoch": 1.6819836214740673, "grad_norm": 2.182693049825918, "learning_rate": 3.729728285695381e-06, "loss": 0.1327, "step": 3697 }, { "epoch": 1.6824385805277524, "grad_norm": 1.4305055234618265, "learning_rate": 3.7291060224448948e-06, "loss": 0.0846, "step": 3698 }, { "epoch": 1.6828935395814377, "grad_norm": 1.6928992379475964, "learning_rate": 3.728483658757417e-06, "loss": 0.0804, "step": 3699 }, { "epoch": 1.6833484986351228, "grad_norm": 1.5255874576024848, "learning_rate": 3.7278611946838016e-06, "loss": 0.0808, "step": 3700 }, { "epoch": 1.683803457688808, "grad_norm": 1.4000434276219165, "learning_rate": 3.727238630274914e-06, "loss": 0.0714, "step": 3701 }, { "epoch": 1.6842584167424932, "grad_norm": 1.8847969431398393, "learning_rate": 3.726615965581628e-06, "loss": 0.182, "step": 3702 }, { "epoch": 1.6847133757961783, "grad_norm": 2.000266747141511, "learning_rate": 3.725993200654825e-06, "loss": 0.1013, "step": 3703 }, { "epoch": 1.6851683348498634, "grad_norm": 1.9245292269555778, "learning_rate": 3.725370335545394e-06, "loss": 0.1387, "step": 3704 }, { "epoch": 1.6856232939035487, "grad_norm": 1.3437592768517972, "learning_rate": 3.7247473703042324e-06, "loss": 0.0668, "step": 3705 }, { "epoch": 1.686078252957234, "grad_norm": 1.5503063141347615, "learning_rate": 3.7241243049822475e-06, "loss": 0.0772, "step": 3706 }, { "epoch": 1.6865332120109189, "grad_norm": 1.2067326976504338, "learning_rate": 3.723501139630352e-06, "loss": 0.0666, "step": 3707 }, { "epoch": 1.6869881710646042, "grad_norm": 1.0982993568239983, "learning_rate": 3.722877874299469e-06, "loss": 0.0934, "step": 3708 }, { "epoch": 1.6874431301182895, "grad_norm": 1.8433072788333809, "learning_rate": 3.722254509040527e-06, "loss": 0.1073, "step": 3709 }, { "epoch": 1.6878980891719744, "grad_norm": 1.3571893079530788, "learning_rate": 3.721631043904468e-06, "loss": 0.095, "step": 3710 }, { "epoch": 1.6883530482256597, "grad_norm": 1.3636982650138714, "learning_rate": 3.7210074789422363e-06, "loss": 0.0665, "step": 3711 }, { "epoch": 1.688808007279345, "grad_norm": 1.6499064802980667, "learning_rate": 3.7203838142047875e-06, "loss": 0.0803, "step": 3712 }, { "epoch": 1.68926296633303, "grad_norm": 1.0606481449512501, "learning_rate": 3.719760049743084e-06, "loss": 0.0879, "step": 3713 }, { "epoch": 1.6897179253867152, "grad_norm": 1.6797002448880465, "learning_rate": 3.719136185608099e-06, "loss": 0.1122, "step": 3714 }, { "epoch": 1.6901728844404005, "grad_norm": 1.1510485276854876, "learning_rate": 3.7185122218508097e-06, "loss": 0.0685, "step": 3715 }, { "epoch": 1.6906278434940856, "grad_norm": 1.1288160329588468, "learning_rate": 3.717888158522204e-06, "loss": 0.0765, "step": 3716 }, { "epoch": 1.6910828025477707, "grad_norm": 1.4055958420961796, "learning_rate": 3.717263995673278e-06, "loss": 0.0758, "step": 3717 }, { "epoch": 1.691537761601456, "grad_norm": 1.9013516398168697, "learning_rate": 3.7166397333550357e-06, "loss": 0.0829, "step": 3718 }, { "epoch": 1.691992720655141, "grad_norm": 1.9667828483944916, "learning_rate": 3.7160153716184887e-06, "loss": 0.1028, "step": 3719 }, { "epoch": 1.6924476797088261, "grad_norm": 1.4696624576158512, "learning_rate": 3.7153909105146567e-06, "loss": 0.0722, "step": 3720 }, { "epoch": 1.6929026387625115, "grad_norm": 1.4023512858656115, "learning_rate": 3.7147663500945692e-06, "loss": 0.0647, "step": 3721 }, { "epoch": 1.6933575978161965, "grad_norm": 1.4221517384615157, "learning_rate": 3.7141416904092605e-06, "loss": 0.1361, "step": 3722 }, { "epoch": 1.6938125568698816, "grad_norm": 1.1933332333684021, "learning_rate": 3.713516931509775e-06, "loss": 0.0944, "step": 3723 }, { "epoch": 1.694267515923567, "grad_norm": 1.1269111842030053, "learning_rate": 3.7128920734471677e-06, "loss": 0.1164, "step": 3724 }, { "epoch": 1.694722474977252, "grad_norm": 1.7348503651718323, "learning_rate": 3.7122671162724966e-06, "loss": 0.0994, "step": 3725 }, { "epoch": 1.6951774340309371, "grad_norm": 1.8612604692549797, "learning_rate": 3.711642060036832e-06, "loss": 0.08, "step": 3726 }, { "epoch": 1.6956323930846224, "grad_norm": 1.387229340070187, "learning_rate": 3.711016904791249e-06, "loss": 0.0752, "step": 3727 }, { "epoch": 1.6960873521383075, "grad_norm": 1.7008250731692929, "learning_rate": 3.7103916505868342e-06, "loss": 0.1056, "step": 3728 }, { "epoch": 1.6965423111919926, "grad_norm": 1.3239134318463717, "learning_rate": 3.7097662974746795e-06, "loss": 0.0699, "step": 3729 }, { "epoch": 1.696997270245678, "grad_norm": 1.8256533430388462, "learning_rate": 3.7091408455058862e-06, "loss": 0.0775, "step": 3730 }, { "epoch": 1.697452229299363, "grad_norm": 1.79613204049805, "learning_rate": 3.708515294731564e-06, "loss": 0.0721, "step": 3731 }, { "epoch": 1.697907188353048, "grad_norm": 1.5736816771483433, "learning_rate": 3.707889645202829e-06, "loss": 0.0744, "step": 3732 }, { "epoch": 1.6983621474067334, "grad_norm": 1.6038106669830827, "learning_rate": 3.707263896970807e-06, "loss": 0.0808, "step": 3733 }, { "epoch": 1.6988171064604187, "grad_norm": 1.4277880201447593, "learning_rate": 3.706638050086631e-06, "loss": 0.0672, "step": 3734 }, { "epoch": 1.6992720655141036, "grad_norm": 1.5037542209114563, "learning_rate": 3.7060121046014434e-06, "loss": 0.1076, "step": 3735 }, { "epoch": 1.699727024567789, "grad_norm": 1.1797401767926583, "learning_rate": 3.7053860605663927e-06, "loss": 0.068, "step": 3736 }, { "epoch": 1.7001819836214742, "grad_norm": 2.1136431421218194, "learning_rate": 3.704759918032636e-06, "loss": 0.1385, "step": 3737 }, { "epoch": 1.700636942675159, "grad_norm": 1.4837090259637953, "learning_rate": 3.7041336770513403e-06, "loss": 0.1509, "step": 3738 }, { "epoch": 1.7010919017288444, "grad_norm": 1.2846387673700081, "learning_rate": 3.703507337673678e-06, "loss": 0.0656, "step": 3739 }, { "epoch": 1.7015468607825297, "grad_norm": 1.5402079538010371, "learning_rate": 3.702880899950831e-06, "loss": 0.1323, "step": 3740 }, { "epoch": 1.7020018198362148, "grad_norm": 1.650159643844847, "learning_rate": 3.702254363933989e-06, "loss": 0.0861, "step": 3741 }, { "epoch": 1.7024567788898999, "grad_norm": 2.1251909064685277, "learning_rate": 3.7016277296743496e-06, "loss": 0.0932, "step": 3742 }, { "epoch": 1.7029117379435852, "grad_norm": 1.5083623031587101, "learning_rate": 3.7010009972231186e-06, "loss": 0.1212, "step": 3743 }, { "epoch": 1.7033666969972703, "grad_norm": 1.3413381906247146, "learning_rate": 3.7003741666315095e-06, "loss": 0.1097, "step": 3744 }, { "epoch": 1.7038216560509554, "grad_norm": 1.4864505809541824, "learning_rate": 3.6997472379507454e-06, "loss": 0.0817, "step": 3745 }, { "epoch": 1.7042766151046407, "grad_norm": 1.599885444661115, "learning_rate": 3.6991202112320544e-06, "loss": 0.1032, "step": 3746 }, { "epoch": 1.7047315741583258, "grad_norm": 1.3311994376240086, "learning_rate": 3.6984930865266744e-06, "loss": 0.068, "step": 3747 }, { "epoch": 1.7051865332120109, "grad_norm": 1.5952473278921924, "learning_rate": 3.6978658638858526e-06, "loss": 0.0815, "step": 3748 }, { "epoch": 1.7056414922656962, "grad_norm": 1.9075687329859012, "learning_rate": 3.6972385433608416e-06, "loss": 0.1066, "step": 3749 }, { "epoch": 1.7060964513193813, "grad_norm": 1.2607213584220722, "learning_rate": 3.6966111250029035e-06, "loss": 0.0813, "step": 3750 }, { "epoch": 1.7065514103730663, "grad_norm": 1.4465646863905623, "learning_rate": 3.695983608863308e-06, "loss": 0.0701, "step": 3751 }, { "epoch": 1.7070063694267517, "grad_norm": 2.1212314170858817, "learning_rate": 3.6953559949933334e-06, "loss": 0.1001, "step": 3752 }, { "epoch": 1.7074613284804367, "grad_norm": 1.4298604969848259, "learning_rate": 3.6947282834442643e-06, "loss": 0.0673, "step": 3753 }, { "epoch": 1.7079162875341218, "grad_norm": 1.3169537242562657, "learning_rate": 3.6941004742673958e-06, "loss": 0.112, "step": 3754 }, { "epoch": 1.7083712465878071, "grad_norm": 1.8939339201341778, "learning_rate": 3.693472567514029e-06, "loss": 0.0833, "step": 3755 }, { "epoch": 1.7088262056414922, "grad_norm": 1.422111689632484, "learning_rate": 3.692844563235474e-06, "loss": 0.0696, "step": 3756 }, { "epoch": 1.7092811646951773, "grad_norm": 1.5645531669068269, "learning_rate": 3.692216461483047e-06, "loss": 0.0844, "step": 3757 }, { "epoch": 1.7097361237488626, "grad_norm": 1.170512719856435, "learning_rate": 3.6915882623080756e-06, "loss": 0.0777, "step": 3758 }, { "epoch": 1.7101910828025477, "grad_norm": 1.8470552824389257, "learning_rate": 3.690959965761893e-06, "loss": 0.0855, "step": 3759 }, { "epoch": 1.7106460418562328, "grad_norm": 1.4159457920300058, "learning_rate": 3.6903315718958397e-06, "loss": 0.0676, "step": 3760 }, { "epoch": 1.7111010009099181, "grad_norm": 1.6376336561387228, "learning_rate": 3.6897030807612655e-06, "loss": 0.0748, "step": 3761 }, { "epoch": 1.7115559599636034, "grad_norm": 1.4552631893432022, "learning_rate": 3.689074492409529e-06, "loss": 0.1206, "step": 3762 }, { "epoch": 1.7120109190172883, "grad_norm": 1.7069134427008634, "learning_rate": 3.6884458068919935e-06, "loss": 0.1162, "step": 3763 }, { "epoch": 1.7124658780709736, "grad_norm": 1.5208538977480064, "learning_rate": 3.687817024260035e-06, "loss": 0.0832, "step": 3764 }, { "epoch": 1.712920837124659, "grad_norm": 1.8228575449449067, "learning_rate": 3.687188144565033e-06, "loss": 0.1058, "step": 3765 }, { "epoch": 1.7133757961783438, "grad_norm": 1.6061196451342563, "learning_rate": 3.6865591678583775e-06, "loss": 0.0713, "step": 3766 }, { "epoch": 1.713830755232029, "grad_norm": 1.3560691827285056, "learning_rate": 3.685930094191465e-06, "loss": 0.0826, "step": 3767 }, { "epoch": 1.7142857142857144, "grad_norm": 1.6031272277128465, "learning_rate": 3.6853009236157e-06, "loss": 0.1301, "step": 3768 }, { "epoch": 1.7147406733393995, "grad_norm": 1.7368981005649833, "learning_rate": 3.684671656182497e-06, "loss": 0.1, "step": 3769 }, { "epoch": 1.7151956323930846, "grad_norm": 1.5003059917041945, "learning_rate": 3.6840422919432762e-06, "loss": 0.0683, "step": 3770 }, { "epoch": 1.71565059144677, "grad_norm": 1.2651906740498884, "learning_rate": 3.683412830949466e-06, "loss": 0.1634, "step": 3771 }, { "epoch": 1.716105550500455, "grad_norm": 1.7586184298073935, "learning_rate": 3.6827832732525042e-06, "loss": 0.0877, "step": 3772 }, { "epoch": 1.71656050955414, "grad_norm": 1.1281801531853541, "learning_rate": 3.6821536189038343e-06, "loss": 0.0601, "step": 3773 }, { "epoch": 1.7170154686078254, "grad_norm": 1.5992019858074247, "learning_rate": 3.681523867954909e-06, "loss": 0.0762, "step": 3774 }, { "epoch": 1.7174704276615105, "grad_norm": 1.6597502796480712, "learning_rate": 3.6808940204571895e-06, "loss": 0.1101, "step": 3775 }, { "epoch": 1.7179253867151956, "grad_norm": 1.35151276518898, "learning_rate": 3.6802640764621427e-06, "loss": 0.0743, "step": 3776 }, { "epoch": 1.7183803457688809, "grad_norm": 1.1807979282969794, "learning_rate": 3.6796340360212467e-06, "loss": 0.0672, "step": 3777 }, { "epoch": 1.718835304822566, "grad_norm": 1.3910321776980616, "learning_rate": 3.679003899185983e-06, "loss": 0.0755, "step": 3778 }, { "epoch": 1.719290263876251, "grad_norm": 1.4989573646780738, "learning_rate": 3.6783736660078463e-06, "loss": 0.1025, "step": 3779 }, { "epoch": 1.7197452229299364, "grad_norm": 1.1392210383956833, "learning_rate": 3.6777433365383348e-06, "loss": 0.086, "step": 3780 }, { "epoch": 1.7202001819836215, "grad_norm": 1.4260199034522016, "learning_rate": 3.6771129108289568e-06, "loss": 0.0872, "step": 3781 }, { "epoch": 1.7206551410373065, "grad_norm": 1.6436571242095346, "learning_rate": 3.6764823889312263e-06, "loss": 0.0793, "step": 3782 }, { "epoch": 1.7211101000909919, "grad_norm": 1.1657763049926704, "learning_rate": 3.675851770896669e-06, "loss": 0.1145, "step": 3783 }, { "epoch": 1.721565059144677, "grad_norm": 1.541041910455343, "learning_rate": 3.675221056776815e-06, "loss": 0.0895, "step": 3784 }, { "epoch": 1.722020018198362, "grad_norm": 1.7463116852930027, "learning_rate": 3.6745902466232027e-06, "loss": 0.082, "step": 3785 }, { "epoch": 1.7224749772520473, "grad_norm": 1.2539235031548606, "learning_rate": 3.6739593404873804e-06, "loss": 0.0528, "step": 3786 }, { "epoch": 1.7229299363057324, "grad_norm": 1.3433544911227029, "learning_rate": 3.6733283384209022e-06, "loss": 0.1119, "step": 3787 }, { "epoch": 1.7233848953594175, "grad_norm": 1.8702987407382203, "learning_rate": 3.6726972404753313e-06, "loss": 0.0957, "step": 3788 }, { "epoch": 1.7238398544131028, "grad_norm": 1.3778226536395195, "learning_rate": 3.672066046702237e-06, "loss": 0.0834, "step": 3789 }, { "epoch": 1.7242948134667881, "grad_norm": 1.7931613011963745, "learning_rate": 3.6714347571531993e-06, "loss": 0.0998, "step": 3790 }, { "epoch": 1.724749772520473, "grad_norm": 1.201047512458584, "learning_rate": 3.670803371879803e-06, "loss": 0.0786, "step": 3791 }, { "epoch": 1.7252047315741583, "grad_norm": 1.489415758966608, "learning_rate": 3.6701718909336424e-06, "loss": 0.0523, "step": 3792 }, { "epoch": 1.7256596906278436, "grad_norm": 1.4276091361573713, "learning_rate": 3.669540314366319e-06, "loss": 0.0666, "step": 3793 }, { "epoch": 1.7261146496815285, "grad_norm": 1.5354695779982008, "learning_rate": 3.6689086422294434e-06, "loss": 0.1034, "step": 3794 }, { "epoch": 1.7265696087352138, "grad_norm": 1.229967897360051, "learning_rate": 3.6682768745746317e-06, "loss": 0.071, "step": 3795 }, { "epoch": 1.7270245677888991, "grad_norm": 1.4563971926009978, "learning_rate": 3.66764501145351e-06, "loss": 0.0808, "step": 3796 }, { "epoch": 1.7274795268425842, "grad_norm": 1.743782731560498, "learning_rate": 3.6670130529177108e-06, "loss": 0.1104, "step": 3797 }, { "epoch": 1.7279344858962693, "grad_norm": 1.8577089194191831, "learning_rate": 3.6663809990188752e-06, "loss": 0.0614, "step": 3798 }, { "epoch": 1.7283894449499546, "grad_norm": 2.04040729095502, "learning_rate": 3.6657488498086517e-06, "loss": 0.0959, "step": 3799 }, { "epoch": 1.7288444040036397, "grad_norm": 1.6678453798048642, "learning_rate": 3.6651166053386966e-06, "loss": 0.0849, "step": 3800 }, { "epoch": 1.7292993630573248, "grad_norm": 1.6628089965528048, "learning_rate": 3.664484265660675e-06, "loss": 0.0942, "step": 3801 }, { "epoch": 1.72975432211101, "grad_norm": 1.7687120859676517, "learning_rate": 3.6638518308262567e-06, "loss": 0.0835, "step": 3802 }, { "epoch": 1.7302092811646952, "grad_norm": 1.5339091662300357, "learning_rate": 3.663219300887123e-06, "loss": 0.0822, "step": 3803 }, { "epoch": 1.7306642402183803, "grad_norm": 1.1861486992574195, "learning_rate": 3.6625866758949614e-06, "loss": 0.1094, "step": 3804 }, { "epoch": 1.7311191992720656, "grad_norm": 1.8288303563046757, "learning_rate": 3.6619539559014673e-06, "loss": 0.1001, "step": 3805 }, { "epoch": 1.7315741583257507, "grad_norm": 1.3150775526264655, "learning_rate": 3.661321140958342e-06, "loss": 0.0575, "step": 3806 }, { "epoch": 1.7320291173794358, "grad_norm": 1.366321907024557, "learning_rate": 3.660688231117298e-06, "loss": 0.1304, "step": 3807 }, { "epoch": 1.732484076433121, "grad_norm": 1.9116877563591552, "learning_rate": 3.660055226430054e-06, "loss": 0.0962, "step": 3808 }, { "epoch": 1.7329390354868062, "grad_norm": 1.9279283873836337, "learning_rate": 3.6594221269483356e-06, "loss": 0.0813, "step": 3809 }, { "epoch": 1.7333939945404913, "grad_norm": 1.436731798492786, "learning_rate": 3.658788932723876e-06, "loss": 0.098, "step": 3810 }, { "epoch": 1.7338489535941766, "grad_norm": 1.9358039523259358, "learning_rate": 3.6581556438084185e-06, "loss": 0.0663, "step": 3811 }, { "epoch": 1.7343039126478617, "grad_norm": 1.4418856652827021, "learning_rate": 3.6575222602537118e-06, "loss": 0.0697, "step": 3812 }, { "epoch": 1.7347588717015467, "grad_norm": 1.2283746063700987, "learning_rate": 3.6568887821115134e-06, "loss": 0.1015, "step": 3813 }, { "epoch": 1.735213830755232, "grad_norm": 1.3721511510877509, "learning_rate": 3.6562552094335878e-06, "loss": 0.0699, "step": 3814 }, { "epoch": 1.7356687898089171, "grad_norm": 1.2522694615863816, "learning_rate": 3.655621542271709e-06, "loss": 0.0765, "step": 3815 }, { "epoch": 1.7361237488626022, "grad_norm": 1.9502637984521098, "learning_rate": 3.654987780677656e-06, "loss": 0.1458, "step": 3816 }, { "epoch": 1.7365787079162875, "grad_norm": 1.9527107998339472, "learning_rate": 3.654353924703217e-06, "loss": 0.1071, "step": 3817 }, { "epoch": 1.7370336669699729, "grad_norm": 1.6144154383957934, "learning_rate": 3.6537199744001893e-06, "loss": 0.0577, "step": 3818 }, { "epoch": 1.7374886260236577, "grad_norm": 1.635793068784688, "learning_rate": 3.6530859298203746e-06, "loss": 0.0801, "step": 3819 }, { "epoch": 1.737943585077343, "grad_norm": 1.3556409545550439, "learning_rate": 3.6524517910155853e-06, "loss": 0.0551, "step": 3820 }, { "epoch": 1.7383985441310283, "grad_norm": 1.8914290918388026, "learning_rate": 3.65181755803764e-06, "loss": 0.1084, "step": 3821 }, { "epoch": 1.7388535031847132, "grad_norm": 1.272347843250335, "learning_rate": 3.6511832309383654e-06, "loss": 0.0802, "step": 3822 }, { "epoch": 1.7393084622383985, "grad_norm": 1.4860750368468538, "learning_rate": 3.6505488097695963e-06, "loss": 0.0848, "step": 3823 }, { "epoch": 1.7397634212920838, "grad_norm": 1.512013977206517, "learning_rate": 3.6499142945831732e-06, "loss": 0.1224, "step": 3824 }, { "epoch": 1.740218380345769, "grad_norm": 1.8002760761212289, "learning_rate": 3.649279685430948e-06, "loss": 0.0951, "step": 3825 }, { "epoch": 1.740673339399454, "grad_norm": 2.0425023917938923, "learning_rate": 3.648644982364777e-06, "loss": 0.0933, "step": 3826 }, { "epoch": 1.7411282984531393, "grad_norm": 1.5872740997753052, "learning_rate": 3.648010185436525e-06, "loss": 0.1105, "step": 3827 }, { "epoch": 1.7415832575068244, "grad_norm": 1.1605977735109105, "learning_rate": 3.6473752946980644e-06, "loss": 0.1119, "step": 3828 }, { "epoch": 1.7420382165605095, "grad_norm": 1.5062493982550569, "learning_rate": 3.6467403102012767e-06, "loss": 0.1019, "step": 3829 }, { "epoch": 1.7424931756141948, "grad_norm": 1.918167762817108, "learning_rate": 3.64610523199805e-06, "loss": 0.1195, "step": 3830 }, { "epoch": 1.74294813466788, "grad_norm": 1.3465476426459404, "learning_rate": 3.6454700601402783e-06, "loss": 0.0664, "step": 3831 }, { "epoch": 1.743403093721565, "grad_norm": 2.0064456201618395, "learning_rate": 3.6448347946798672e-06, "loss": 0.0876, "step": 3832 }, { "epoch": 1.7438580527752503, "grad_norm": 2.2003801995974475, "learning_rate": 3.6441994356687265e-06, "loss": 0.1139, "step": 3833 }, { "epoch": 1.7443130118289354, "grad_norm": 1.4762961480620516, "learning_rate": 3.643563983158775e-06, "loss": 0.0629, "step": 3834 }, { "epoch": 1.7447679708826205, "grad_norm": 1.6725152150047542, "learning_rate": 3.642928437201939e-06, "loss": 0.0703, "step": 3835 }, { "epoch": 1.7452229299363058, "grad_norm": 1.334876514653773, "learning_rate": 3.642292797850153e-06, "loss": 0.1062, "step": 3836 }, { "epoch": 1.7456778889899909, "grad_norm": 1.7378134234814446, "learning_rate": 3.641657065155358e-06, "loss": 0.0738, "step": 3837 }, { "epoch": 1.746132848043676, "grad_norm": 1.4448485374671916, "learning_rate": 3.6410212391695023e-06, "loss": 0.0683, "step": 3838 }, { "epoch": 1.7465878070973613, "grad_norm": 1.312191333205634, "learning_rate": 3.6403853199445448e-06, "loss": 0.063, "step": 3839 }, { "epoch": 1.7470427661510464, "grad_norm": 1.5645009928552622, "learning_rate": 3.6397493075324486e-06, "loss": 0.0549, "step": 3840 }, { "epoch": 1.7474977252047315, "grad_norm": 1.5668407577082635, "learning_rate": 3.6391132019851857e-06, "loss": 0.0682, "step": 3841 }, { "epoch": 1.7479526842584168, "grad_norm": 1.5774019134496031, "learning_rate": 3.6384770033547366e-06, "loss": 0.0736, "step": 3842 }, { "epoch": 1.7484076433121019, "grad_norm": 1.5046571729103253, "learning_rate": 3.637840711693088e-06, "loss": 0.0877, "step": 3843 }, { "epoch": 1.748862602365787, "grad_norm": 1.5442307565561748, "learning_rate": 3.637204327052235e-06, "loss": 0.1343, "step": 3844 }, { "epoch": 1.7493175614194723, "grad_norm": 1.5036673021689093, "learning_rate": 3.6365678494841795e-06, "loss": 0.1261, "step": 3845 }, { "epoch": 1.7497725204731576, "grad_norm": 1.4932790824808861, "learning_rate": 3.6359312790409323e-06, "loss": 0.0992, "step": 3846 }, { "epoch": 1.7502274795268424, "grad_norm": 1.9107453215438228, "learning_rate": 3.635294615774511e-06, "loss": 0.1357, "step": 3847 }, { "epoch": 1.7506824385805277, "grad_norm": 1.5774351391173225, "learning_rate": 3.6346578597369397e-06, "loss": 0.1263, "step": 3848 }, { "epoch": 1.751137397634213, "grad_norm": 1.5614637580431585, "learning_rate": 3.634021010980254e-06, "loss": 0.0996, "step": 3849 }, { "epoch": 1.7515923566878981, "grad_norm": 1.598187914261784, "learning_rate": 3.633384069556491e-06, "loss": 0.1438, "step": 3850 }, { "epoch": 1.7520473157415832, "grad_norm": 1.5399464760716188, "learning_rate": 3.6327470355177006e-06, "loss": 0.0723, "step": 3851 }, { "epoch": 1.7525022747952685, "grad_norm": 1.1196804517998342, "learning_rate": 3.6321099089159377e-06, "loss": 0.0887, "step": 3852 }, { "epoch": 1.7529572338489536, "grad_norm": 1.3647416556856382, "learning_rate": 3.631472689803266e-06, "loss": 0.0967, "step": 3853 }, { "epoch": 1.7534121929026387, "grad_norm": 1.5039771262948265, "learning_rate": 3.6308353782317557e-06, "loss": 0.0976, "step": 3854 }, { "epoch": 1.753867151956324, "grad_norm": 1.8824024911224424, "learning_rate": 3.6301979742534844e-06, "loss": 0.1241, "step": 3855 }, { "epoch": 1.7543221110100091, "grad_norm": 1.7796095717371427, "learning_rate": 3.6295604779205394e-06, "loss": 0.1209, "step": 3856 }, { "epoch": 1.7547770700636942, "grad_norm": 1.3863571488422435, "learning_rate": 3.6289228892850126e-06, "loss": 0.0903, "step": 3857 }, { "epoch": 1.7552320291173795, "grad_norm": 1.4594228439323398, "learning_rate": 3.628285208399006e-06, "loss": 0.1556, "step": 3858 }, { "epoch": 1.7556869881710646, "grad_norm": 1.8120626288336543, "learning_rate": 3.6276474353146274e-06, "loss": 0.1212, "step": 3859 }, { "epoch": 1.7561419472247497, "grad_norm": 1.4378693679914298, "learning_rate": 3.6270095700839926e-06, "loss": 0.1273, "step": 3860 }, { "epoch": 1.756596906278435, "grad_norm": 1.7188037082210168, "learning_rate": 3.6263716127592253e-06, "loss": 0.0594, "step": 3861 }, { "epoch": 1.75705186533212, "grad_norm": 1.4071444456084878, "learning_rate": 3.6257335633924564e-06, "loss": 0.0672, "step": 3862 }, { "epoch": 1.7575068243858052, "grad_norm": 1.5794005882869442, "learning_rate": 3.6250954220358248e-06, "loss": 0.1336, "step": 3863 }, { "epoch": 1.7579617834394905, "grad_norm": 1.6057820450667988, "learning_rate": 3.624457188741476e-06, "loss": 0.1115, "step": 3864 }, { "epoch": 1.7584167424931756, "grad_norm": 1.2944612620947467, "learning_rate": 3.6238188635615636e-06, "loss": 0.084, "step": 3865 }, { "epoch": 1.7588717015468607, "grad_norm": 1.4088618815259382, "learning_rate": 3.6231804465482483e-06, "loss": 0.06, "step": 3866 }, { "epoch": 1.759326660600546, "grad_norm": 1.8612925288087485, "learning_rate": 3.6225419377536997e-06, "loss": 0.0923, "step": 3867 }, { "epoch": 1.759781619654231, "grad_norm": 1.531245089574177, "learning_rate": 3.6219033372300937e-06, "loss": 0.1042, "step": 3868 }, { "epoch": 1.7602365787079162, "grad_norm": 1.5852402268208892, "learning_rate": 3.621264645029613e-06, "loss": 0.1098, "step": 3869 }, { "epoch": 1.7606915377616015, "grad_norm": 1.7629746713267567, "learning_rate": 3.6206258612044486e-06, "loss": 0.076, "step": 3870 }, { "epoch": 1.7611464968152868, "grad_norm": 1.6050447339299123, "learning_rate": 3.6199869858068003e-06, "loss": 0.1067, "step": 3871 }, { "epoch": 1.7616014558689717, "grad_norm": 1.7437826102736806, "learning_rate": 3.619348018888873e-06, "loss": 0.0691, "step": 3872 }, { "epoch": 1.762056414922657, "grad_norm": 1.8486897309048702, "learning_rate": 3.618708960502881e-06, "loss": 0.0818, "step": 3873 }, { "epoch": 1.7625113739763423, "grad_norm": 1.6272214818612516, "learning_rate": 3.6180698107010435e-06, "loss": 0.1631, "step": 3874 }, { "epoch": 1.7629663330300271, "grad_norm": 1.5156085602027, "learning_rate": 3.617430569535592e-06, "loss": 0.1022, "step": 3875 }, { "epoch": 1.7634212920837125, "grad_norm": 1.3931136576855574, "learning_rate": 3.61679123705876e-06, "loss": 0.1205, "step": 3876 }, { "epoch": 1.7638762511373978, "grad_norm": 1.6864894884613248, "learning_rate": 3.616151813322791e-06, "loss": 0.1305, "step": 3877 }, { "epoch": 1.7643312101910829, "grad_norm": 1.6403796925238476, "learning_rate": 3.615512298379937e-06, "loss": 0.1235, "step": 3878 }, { "epoch": 1.764786169244768, "grad_norm": 1.524093268252484, "learning_rate": 3.6148726922824545e-06, "loss": 0.1027, "step": 3879 }, { "epoch": 1.7652411282984533, "grad_norm": 1.6831607845816892, "learning_rate": 3.614232995082611e-06, "loss": 0.1126, "step": 3880 }, { "epoch": 1.7656960873521383, "grad_norm": 1.4909684355720194, "learning_rate": 3.6135932068326797e-06, "loss": 0.0948, "step": 3881 }, { "epoch": 1.7661510464058234, "grad_norm": 1.4002526288991257, "learning_rate": 3.6129533275849395e-06, "loss": 0.1243, "step": 3882 }, { "epoch": 1.7666060054595087, "grad_norm": 1.5457084778815036, "learning_rate": 3.6123133573916792e-06, "loss": 0.0576, "step": 3883 }, { "epoch": 1.7670609645131938, "grad_norm": 1.5145094287630976, "learning_rate": 3.6116732963051946e-06, "loss": 0.1016, "step": 3884 }, { "epoch": 1.767515923566879, "grad_norm": 2.0955186918239956, "learning_rate": 3.611033144377789e-06, "loss": 0.1362, "step": 3885 }, { "epoch": 1.7679708826205642, "grad_norm": 1.728827050175275, "learning_rate": 3.610392901661772e-06, "loss": 0.0606, "step": 3886 }, { "epoch": 1.7684258416742493, "grad_norm": 1.927228156866062, "learning_rate": 3.609752568209462e-06, "loss": 0.0663, "step": 3887 }, { "epoch": 1.7688808007279344, "grad_norm": 1.411400637351551, "learning_rate": 3.6091121440731835e-06, "loss": 0.1082, "step": 3888 }, { "epoch": 1.7693357597816197, "grad_norm": 1.404150818623767, "learning_rate": 3.608471629305269e-06, "loss": 0.085, "step": 3889 }, { "epoch": 1.7697907188353048, "grad_norm": 1.319579914385889, "learning_rate": 3.607831023958059e-06, "loss": 0.0723, "step": 3890 }, { "epoch": 1.77024567788899, "grad_norm": 2.2803121978915, "learning_rate": 3.6071903280839003e-06, "loss": 0.1427, "step": 3891 }, { "epoch": 1.7707006369426752, "grad_norm": 1.401391831533701, "learning_rate": 3.606549541735148e-06, "loss": 0.102, "step": 3892 }, { "epoch": 1.7711555959963603, "grad_norm": 1.1635150426144107, "learning_rate": 3.605908664964165e-06, "loss": 0.062, "step": 3893 }, { "epoch": 1.7716105550500454, "grad_norm": 2.0320471235156163, "learning_rate": 3.605267697823319e-06, "loss": 0.1358, "step": 3894 }, { "epoch": 1.7720655141037307, "grad_norm": 1.3469754579348363, "learning_rate": 3.6046266403649897e-06, "loss": 0.0719, "step": 3895 }, { "epoch": 1.7725204731574158, "grad_norm": 2.110380658670838, "learning_rate": 3.6039854926415585e-06, "loss": 0.0927, "step": 3896 }, { "epoch": 1.7729754322111009, "grad_norm": 1.5787727353656158, "learning_rate": 3.603344254705419e-06, "loss": 0.1083, "step": 3897 }, { "epoch": 1.7734303912647862, "grad_norm": 1.941817913682967, "learning_rate": 3.6027029266089693e-06, "loss": 0.0935, "step": 3898 }, { "epoch": 1.7738853503184715, "grad_norm": 1.1559752033466744, "learning_rate": 3.602061508404616e-06, "loss": 0.0617, "step": 3899 }, { "epoch": 1.7743403093721564, "grad_norm": 1.3846072664515927, "learning_rate": 3.601420000144774e-06, "loss": 0.1544, "step": 3900 }, { "epoch": 1.7747952684258417, "grad_norm": 1.7474218353078286, "learning_rate": 3.6007784018818627e-06, "loss": 0.1251, "step": 3901 }, { "epoch": 1.775250227479527, "grad_norm": 1.721957372206255, "learning_rate": 3.6001367136683117e-06, "loss": 0.0644, "step": 3902 }, { "epoch": 1.7757051865332119, "grad_norm": 1.9670595492924352, "learning_rate": 3.5994949355565565e-06, "loss": 0.0883, "step": 3903 }, { "epoch": 1.7761601455868972, "grad_norm": 1.2647952523959152, "learning_rate": 3.598853067599041e-06, "loss": 0.065, "step": 3904 }, { "epoch": 1.7766151046405825, "grad_norm": 1.4031869043151017, "learning_rate": 3.5982111098482146e-06, "loss": 0.0733, "step": 3905 }, { "epoch": 1.7770700636942676, "grad_norm": 1.36956896202758, "learning_rate": 3.5975690623565364e-06, "loss": 0.0771, "step": 3906 }, { "epoch": 1.7775250227479527, "grad_norm": 1.3373699020544958, "learning_rate": 3.5969269251764704e-06, "loss": 0.067, "step": 3907 }, { "epoch": 1.777979981801638, "grad_norm": 1.6308387280174579, "learning_rate": 3.596284698360489e-06, "loss": 0.1428, "step": 3908 }, { "epoch": 1.778434940855323, "grad_norm": 1.345125673773998, "learning_rate": 3.5956423819610747e-06, "loss": 0.0789, "step": 3909 }, { "epoch": 1.7788898999090081, "grad_norm": 1.7296338835674745, "learning_rate": 3.594999976030712e-06, "loss": 0.0868, "step": 3910 }, { "epoch": 1.7793448589626935, "grad_norm": 1.4468026384860735, "learning_rate": 3.594357480621896e-06, "loss": 0.118, "step": 3911 }, { "epoch": 1.7797998180163785, "grad_norm": 1.8915181864843718, "learning_rate": 3.5937148957871294e-06, "loss": 0.0937, "step": 3912 }, { "epoch": 1.7802547770700636, "grad_norm": 1.9897789874859553, "learning_rate": 3.59307222157892e-06, "loss": 0.0797, "step": 3913 }, { "epoch": 1.780709736123749, "grad_norm": 1.8108575561560587, "learning_rate": 3.5924294580497852e-06, "loss": 0.0968, "step": 3914 }, { "epoch": 1.781164695177434, "grad_norm": 1.8247165386681972, "learning_rate": 3.5917866052522478e-06, "loss": 0.1088, "step": 3915 }, { "epoch": 1.7816196542311191, "grad_norm": 1.6352507896119537, "learning_rate": 3.5911436632388403e-06, "loss": 0.101, "step": 3916 }, { "epoch": 1.7820746132848044, "grad_norm": 1.1288470952556902, "learning_rate": 3.5905006320621006e-06, "loss": 0.0652, "step": 3917 }, { "epoch": 1.7825295723384895, "grad_norm": 1.51655586642245, "learning_rate": 3.5898575117745725e-06, "loss": 0.0831, "step": 3918 }, { "epoch": 1.7829845313921746, "grad_norm": 1.4025902376992039, "learning_rate": 3.589214302428811e-06, "loss": 0.0735, "step": 3919 }, { "epoch": 1.78343949044586, "grad_norm": 1.22925217378455, "learning_rate": 3.5885710040773757e-06, "loss": 0.0766, "step": 3920 }, { "epoch": 1.783894449499545, "grad_norm": 1.4651508589653857, "learning_rate": 3.5879276167728343e-06, "loss": 0.1204, "step": 3921 }, { "epoch": 1.78434940855323, "grad_norm": 1.785809538875832, "learning_rate": 3.5872841405677607e-06, "loss": 0.1, "step": 3922 }, { "epoch": 1.7848043676069154, "grad_norm": 1.5290844298201025, "learning_rate": 3.5866405755147364e-06, "loss": 0.0932, "step": 3923 }, { "epoch": 1.7852593266606005, "grad_norm": 1.5383400509548701, "learning_rate": 3.5859969216663526e-06, "loss": 0.0958, "step": 3924 }, { "epoch": 1.7857142857142856, "grad_norm": 1.6721888894559662, "learning_rate": 3.585353179075204e-06, "loss": 0.0868, "step": 3925 }, { "epoch": 1.786169244767971, "grad_norm": 1.5889178490289908, "learning_rate": 3.5847093477938955e-06, "loss": 0.1226, "step": 3926 }, { "epoch": 1.7866242038216562, "grad_norm": 1.3808928931481774, "learning_rate": 3.5840654278750377e-06, "loss": 0.0808, "step": 3927 }, { "epoch": 1.787079162875341, "grad_norm": 1.7626724354357823, "learning_rate": 3.5834214193712483e-06, "loss": 0.1044, "step": 3928 }, { "epoch": 1.7875341219290264, "grad_norm": 1.6053986869195658, "learning_rate": 3.5827773223351535e-06, "loss": 0.0683, "step": 3929 }, { "epoch": 1.7879890809827117, "grad_norm": 1.533257274603386, "learning_rate": 3.5821331368193857e-06, "loss": 0.0963, "step": 3930 }, { "epoch": 1.7884440400363966, "grad_norm": 1.6692928386824948, "learning_rate": 3.5814888628765846e-06, "loss": 0.0947, "step": 3931 }, { "epoch": 1.7888989990900819, "grad_norm": 1.7186881336492157, "learning_rate": 3.5808445005593972e-06, "loss": 0.1154, "step": 3932 }, { "epoch": 1.7893539581437672, "grad_norm": 1.9190416507315198, "learning_rate": 3.5802000499204793e-06, "loss": 0.1339, "step": 3933 }, { "epoch": 1.7898089171974523, "grad_norm": 1.419386812970281, "learning_rate": 3.5795555110124913e-06, "loss": 0.07, "step": 3934 }, { "epoch": 1.7902638762511374, "grad_norm": 1.3862706900284467, "learning_rate": 3.5789108838881017e-06, "loss": 0.0944, "step": 3935 }, { "epoch": 1.7907188353048227, "grad_norm": 0.9828035012548878, "learning_rate": 3.5782661685999863e-06, "loss": 0.0995, "step": 3936 }, { "epoch": 1.7911737943585078, "grad_norm": 1.535002469296854, "learning_rate": 3.57762136520083e-06, "loss": 0.0968, "step": 3937 }, { "epoch": 1.7916287534121929, "grad_norm": 1.489391281526088, "learning_rate": 3.5769764737433226e-06, "loss": 0.1153, "step": 3938 }, { "epoch": 1.7920837124658782, "grad_norm": 1.5376947733670359, "learning_rate": 3.576331494280161e-06, "loss": 0.1117, "step": 3939 }, { "epoch": 1.7925386715195633, "grad_norm": 1.073552191247603, "learning_rate": 3.5756864268640494e-06, "loss": 0.0684, "step": 3940 }, { "epoch": 1.7929936305732483, "grad_norm": 1.5831135589887766, "learning_rate": 3.5750412715477016e-06, "loss": 0.0753, "step": 3941 }, { "epoch": 1.7934485896269337, "grad_norm": 1.5064036180464089, "learning_rate": 3.574396028383836e-06, "loss": 0.0803, "step": 3942 }, { "epoch": 1.7939035486806187, "grad_norm": 1.2605613420727315, "learning_rate": 3.5737506974251785e-06, "loss": 0.0711, "step": 3943 }, { "epoch": 1.7943585077343038, "grad_norm": 1.253206207295338, "learning_rate": 3.573105278724463e-06, "loss": 0.1006, "step": 3944 }, { "epoch": 1.7948134667879891, "grad_norm": 1.2295765953742825, "learning_rate": 3.5724597723344313e-06, "loss": 0.0721, "step": 3945 }, { "epoch": 1.7952684258416742, "grad_norm": 1.4831661899175725, "learning_rate": 3.5718141783078285e-06, "loss": 0.0756, "step": 3946 }, { "epoch": 1.7957233848953593, "grad_norm": 1.314611807178835, "learning_rate": 3.5711684966974125e-06, "loss": 0.0894, "step": 3947 }, { "epoch": 1.7961783439490446, "grad_norm": 1.3355024952440224, "learning_rate": 3.570522727555944e-06, "loss": 0.1196, "step": 3948 }, { "epoch": 1.7966333030027297, "grad_norm": 1.3971852716884732, "learning_rate": 3.5698768709361926e-06, "loss": 0.1016, "step": 3949 }, { "epoch": 1.7970882620564148, "grad_norm": 1.5370831894362105, "learning_rate": 3.569230926890935e-06, "loss": 0.0967, "step": 3950 }, { "epoch": 1.7975432211101001, "grad_norm": 1.604710850394061, "learning_rate": 3.568584895472954e-06, "loss": 0.1071, "step": 3951 }, { "epoch": 1.7979981801637852, "grad_norm": 1.295840206954609, "learning_rate": 3.5679387767350414e-06, "loss": 0.0752, "step": 3952 }, { "epoch": 1.7984531392174703, "grad_norm": 1.8253669541477449, "learning_rate": 3.5672925707299955e-06, "loss": 0.0984, "step": 3953 }, { "epoch": 1.7989080982711556, "grad_norm": 1.1979393277912531, "learning_rate": 3.5666462775106193e-06, "loss": 0.0892, "step": 3954 }, { "epoch": 1.799363057324841, "grad_norm": 1.4426740494124082, "learning_rate": 3.565999897129727e-06, "loss": 0.0719, "step": 3955 }, { "epoch": 1.7998180163785258, "grad_norm": 1.715064453682624, "learning_rate": 3.5653534296401372e-06, "loss": 0.1119, "step": 3956 }, { "epoch": 1.800272975432211, "grad_norm": 1.1082572564975344, "learning_rate": 3.5647068750946754e-06, "loss": 0.0637, "step": 3957 }, { "epoch": 1.8007279344858964, "grad_norm": 1.5457728382697675, "learning_rate": 3.564060233546177e-06, "loss": 0.0789, "step": 3958 }, { "epoch": 1.8011828935395813, "grad_norm": 1.4320301385381975, "learning_rate": 3.563413505047481e-06, "loss": 0.0888, "step": 3959 }, { "epoch": 1.8016378525932666, "grad_norm": 1.4906369753636222, "learning_rate": 3.562766689651436e-06, "loss": 0.1217, "step": 3960 }, { "epoch": 1.802092811646952, "grad_norm": 1.4071642445182821, "learning_rate": 3.5621197874108957e-06, "loss": 0.1052, "step": 3961 }, { "epoch": 1.802547770700637, "grad_norm": 1.5118286207978275, "learning_rate": 3.5614727983787244e-06, "loss": 0.1448, "step": 3962 }, { "epoch": 1.803002729754322, "grad_norm": 1.3524332676824797, "learning_rate": 3.5608257226077887e-06, "loss": 0.1109, "step": 3963 }, { "epoch": 1.8034576888080074, "grad_norm": 1.8500922974690786, "learning_rate": 3.5601785601509654e-06, "loss": 0.1331, "step": 3964 }, { "epoch": 1.8039126478616925, "grad_norm": 1.622369192194537, "learning_rate": 3.5595313110611386e-06, "loss": 0.0946, "step": 3965 }, { "epoch": 1.8043676069153776, "grad_norm": 1.4360326025860157, "learning_rate": 3.558883975391197e-06, "loss": 0.0824, "step": 3966 }, { "epoch": 1.8048225659690629, "grad_norm": 1.3903616396700813, "learning_rate": 3.5582365531940387e-06, "loss": 0.0624, "step": 3967 }, { "epoch": 1.805277525022748, "grad_norm": 1.437238853867719, "learning_rate": 3.5575890445225686e-06, "loss": 0.0873, "step": 3968 }, { "epoch": 1.805732484076433, "grad_norm": 1.4699230790332718, "learning_rate": 3.5569414494296982e-06, "loss": 0.1065, "step": 3969 }, { "epoch": 1.8061874431301184, "grad_norm": 1.3845690696654567, "learning_rate": 3.5562937679683455e-06, "loss": 0.0751, "step": 3970 }, { "epoch": 1.8066424021838035, "grad_norm": 1.2980622904434707, "learning_rate": 3.5556460001914357e-06, "loss": 0.1009, "step": 3971 }, { "epoch": 1.8070973612374885, "grad_norm": 2.024438902968821, "learning_rate": 3.5549981461519028e-06, "loss": 0.0698, "step": 3972 }, { "epoch": 1.8075523202911739, "grad_norm": 1.2190276270360438, "learning_rate": 3.554350205902685e-06, "loss": 0.0892, "step": 3973 }, { "epoch": 1.808007279344859, "grad_norm": 1.9342522472015298, "learning_rate": 3.55370217949673e-06, "loss": 0.1091, "step": 3974 }, { "epoch": 1.808462238398544, "grad_norm": 1.6396563799267518, "learning_rate": 3.5530540669869915e-06, "loss": 0.0918, "step": 3975 }, { "epoch": 1.8089171974522293, "grad_norm": 1.5903146454763155, "learning_rate": 3.5524058684264304e-06, "loss": 0.0704, "step": 3976 }, { "epoch": 1.8093721565059144, "grad_norm": 1.9654297350640906, "learning_rate": 3.551757583868015e-06, "loss": 0.0972, "step": 3977 }, { "epoch": 1.8098271155595995, "grad_norm": 1.545462429692124, "learning_rate": 3.551109213364717e-06, "loss": 0.0866, "step": 3978 }, { "epoch": 1.8102820746132848, "grad_norm": 1.806694424780425, "learning_rate": 3.5504607569695237e-06, "loss": 0.0938, "step": 3979 }, { "epoch": 1.81073703366697, "grad_norm": 1.598105676087796, "learning_rate": 3.5498122147354198e-06, "loss": 0.0775, "step": 3980 }, { "epoch": 1.811191992720655, "grad_norm": 1.3100494958106024, "learning_rate": 3.549163586715403e-06, "loss": 0.0761, "step": 3981 }, { "epoch": 1.8116469517743403, "grad_norm": 1.289036733328777, "learning_rate": 3.5485148729624756e-06, "loss": 0.1184, "step": 3982 }, { "epoch": 1.8121019108280256, "grad_norm": 1.9497791037890386, "learning_rate": 3.5478660735296476e-06, "loss": 0.1002, "step": 3983 }, { "epoch": 1.8125568698817105, "grad_norm": 1.2394822063068622, "learning_rate": 3.547217188469937e-06, "loss": 0.0685, "step": 3984 }, { "epoch": 1.8130118289353958, "grad_norm": 1.8299151050535238, "learning_rate": 3.5465682178363657e-06, "loss": 0.1138, "step": 3985 }, { "epoch": 1.8134667879890811, "grad_norm": 1.8131800941433622, "learning_rate": 3.5459191616819676e-06, "loss": 0.0691, "step": 3986 }, { "epoch": 1.813921747042766, "grad_norm": 1.485052489293852, "learning_rate": 3.545270020059778e-06, "loss": 0.0785, "step": 3987 }, { "epoch": 1.8143767060964513, "grad_norm": 1.8155831580202784, "learning_rate": 3.544620793022842e-06, "loss": 0.072, "step": 3988 }, { "epoch": 1.8148316651501366, "grad_norm": 1.5300562456752116, "learning_rate": 3.543971480624214e-06, "loss": 0.0865, "step": 3989 }, { "epoch": 1.8152866242038217, "grad_norm": 1.4324759719288869, "learning_rate": 3.5433220829169495e-06, "loss": 0.113, "step": 3990 }, { "epoch": 1.8157415832575068, "grad_norm": 1.9525682285008492, "learning_rate": 3.542672599954117e-06, "loss": 0.1119, "step": 3991 }, { "epoch": 1.816196542311192, "grad_norm": 1.6250068359161982, "learning_rate": 3.5420230317887884e-06, "loss": 0.1053, "step": 3992 }, { "epoch": 1.8166515013648772, "grad_norm": 1.4463332416531227, "learning_rate": 3.5413733784740423e-06, "loss": 0.0963, "step": 3993 }, { "epoch": 1.8171064604185623, "grad_norm": 1.2756589151610211, "learning_rate": 3.5407236400629676e-06, "loss": 0.0865, "step": 3994 }, { "epoch": 1.8175614194722476, "grad_norm": 1.5847670311457605, "learning_rate": 3.5400738166086555e-06, "loss": 0.0707, "step": 3995 }, { "epoch": 1.8180163785259327, "grad_norm": 1.5441419321695022, "learning_rate": 3.5394239081642095e-06, "loss": 0.0633, "step": 3996 }, { "epoch": 1.8184713375796178, "grad_norm": 1.4022851946937278, "learning_rate": 3.5387739147827355e-06, "loss": 0.1021, "step": 3997 }, { "epoch": 1.818926296633303, "grad_norm": 1.5993910170200571, "learning_rate": 3.538123836517348e-06, "loss": 0.1054, "step": 3998 }, { "epoch": 1.8193812556869882, "grad_norm": 1.4101800305045573, "learning_rate": 3.5374736734211686e-06, "loss": 0.095, "step": 3999 }, { "epoch": 1.8198362147406733, "grad_norm": 1.0883517935098481, "learning_rate": 3.5368234255473255e-06, "loss": 0.0615, "step": 4000 }, { "epoch": 1.8202911737943586, "grad_norm": 1.4093112004472221, "learning_rate": 3.536173092948955e-06, "loss": 0.1023, "step": 4001 }, { "epoch": 1.8207461328480437, "grad_norm": 1.2933863312892944, "learning_rate": 3.535522675679198e-06, "loss": 0.087, "step": 4002 }, { "epoch": 1.8212010919017287, "grad_norm": 1.7967808901445457, "learning_rate": 3.534872173791205e-06, "loss": 0.0934, "step": 4003 }, { "epoch": 1.821656050955414, "grad_norm": 1.8741353819993365, "learning_rate": 3.534221587338131e-06, "loss": 0.1039, "step": 4004 }, { "epoch": 1.8221110100090991, "grad_norm": 1.494851252515709, "learning_rate": 3.5335709163731395e-06, "loss": 0.0972, "step": 4005 }, { "epoch": 1.8225659690627842, "grad_norm": 1.4742269496209741, "learning_rate": 3.5329201609494002e-06, "loss": 0.1184, "step": 4006 }, { "epoch": 1.8230209281164695, "grad_norm": 1.6346732660236687, "learning_rate": 3.53226932112009e-06, "loss": 0.0634, "step": 4007 }, { "epoch": 1.8234758871701549, "grad_norm": 1.557869271208195, "learning_rate": 3.5316183969383927e-06, "loss": 0.0858, "step": 4008 }, { "epoch": 1.8239308462238397, "grad_norm": 1.2761714255917784, "learning_rate": 3.5309673884574986e-06, "loss": 0.0941, "step": 4009 }, { "epoch": 1.824385805277525, "grad_norm": 1.4811450419133092, "learning_rate": 3.530316295730605e-06, "loss": 0.104, "step": 4010 }, { "epoch": 1.8248407643312103, "grad_norm": 1.466205797428728, "learning_rate": 3.5296651188109177e-06, "loss": 0.1047, "step": 4011 }, { "epoch": 1.8252957233848952, "grad_norm": 1.8021334458803067, "learning_rate": 3.529013857751646e-06, "loss": 0.1127, "step": 4012 }, { "epoch": 1.8257506824385805, "grad_norm": 1.2370650934050405, "learning_rate": 3.5283625126060084e-06, "loss": 0.0976, "step": 4013 }, { "epoch": 1.8262056414922658, "grad_norm": 1.2987985228173513, "learning_rate": 3.5277110834272305e-06, "loss": 0.0595, "step": 4014 }, { "epoch": 1.826660600545951, "grad_norm": 1.361687488677831, "learning_rate": 3.527059570268545e-06, "loss": 0.1308, "step": 4015 }, { "epoch": 1.827115559599636, "grad_norm": 2.1030020001274456, "learning_rate": 3.5264079731831885e-06, "loss": 0.0881, "step": 4016 }, { "epoch": 1.8275705186533213, "grad_norm": 1.5859094177026745, "learning_rate": 3.5257562922244074e-06, "loss": 0.1111, "step": 4017 }, { "epoch": 1.8280254777070064, "grad_norm": 1.4384875086528517, "learning_rate": 3.5251045274454554e-06, "loss": 0.0658, "step": 4018 }, { "epoch": 1.8284804367606915, "grad_norm": 1.3301681764066562, "learning_rate": 3.5244526788995905e-06, "loss": 0.0727, "step": 4019 }, { "epoch": 1.8289353958143768, "grad_norm": 1.783764735643231, "learning_rate": 3.5238007466400787e-06, "loss": 0.062, "step": 4020 }, { "epoch": 1.829390354868062, "grad_norm": 1.4026664224969767, "learning_rate": 3.5231487307201933e-06, "loss": 0.0802, "step": 4021 }, { "epoch": 1.829845313921747, "grad_norm": 1.465140595135788, "learning_rate": 3.5224966311932145e-06, "loss": 0.1539, "step": 4022 }, { "epoch": 1.8303002729754323, "grad_norm": 2.2013824316677293, "learning_rate": 3.521844448112428e-06, "loss": 0.081, "step": 4023 }, { "epoch": 1.8307552320291174, "grad_norm": 1.3566593993963982, "learning_rate": 3.5211921815311284e-06, "loss": 0.1015, "step": 4024 }, { "epoch": 1.8312101910828025, "grad_norm": 1.4353235605936552, "learning_rate": 3.5205398315026156e-06, "loss": 0.0806, "step": 4025 }, { "epoch": 1.8316651501364878, "grad_norm": 2.0166739081918053, "learning_rate": 3.5198873980801956e-06, "loss": 0.078, "step": 4026 }, { "epoch": 1.8321201091901729, "grad_norm": 1.76286593886622, "learning_rate": 3.519234881317184e-06, "loss": 0.0871, "step": 4027 }, { "epoch": 1.832575068243858, "grad_norm": 1.1837840426271566, "learning_rate": 3.5185822812669e-06, "loss": 0.0714, "step": 4028 }, { "epoch": 1.8330300272975433, "grad_norm": 1.3296159186425494, "learning_rate": 3.517929597982672e-06, "loss": 0.148, "step": 4029 }, { "epoch": 1.8334849863512284, "grad_norm": 1.0635132101429015, "learning_rate": 3.5172768315178345e-06, "loss": 0.0765, "step": 4030 }, { "epoch": 1.8339399454049135, "grad_norm": 1.3330922847185651, "learning_rate": 3.5166239819257286e-06, "loss": 0.1206, "step": 4031 }, { "epoch": 1.8343949044585988, "grad_norm": 1.3713865351131023, "learning_rate": 3.5159710492597014e-06, "loss": 0.1105, "step": 4032 }, { "epoch": 1.8348498635122839, "grad_norm": 1.6167245954707383, "learning_rate": 3.5153180335731085e-06, "loss": 0.079, "step": 4033 }, { "epoch": 1.835304822565969, "grad_norm": 1.8002635354441756, "learning_rate": 3.5146649349193108e-06, "loss": 0.0763, "step": 4034 }, { "epoch": 1.8357597816196543, "grad_norm": 1.6432364710140848, "learning_rate": 3.514011753351677e-06, "loss": 0.0876, "step": 4035 }, { "epoch": 1.8362147406733396, "grad_norm": 1.7240424504508396, "learning_rate": 3.5133584889235817e-06, "loss": 0.0714, "step": 4036 }, { "epoch": 1.8366696997270244, "grad_norm": 0.9496409902536068, "learning_rate": 3.5127051416884067e-06, "loss": 0.105, "step": 4037 }, { "epoch": 1.8371246587807097, "grad_norm": 1.70897223694499, "learning_rate": 3.5120517116995413e-06, "loss": 0.1075, "step": 4038 }, { "epoch": 1.837579617834395, "grad_norm": 1.7645377214496607, "learning_rate": 3.5113981990103807e-06, "loss": 0.0919, "step": 4039 }, { "epoch": 1.83803457688808, "grad_norm": 1.559402589556239, "learning_rate": 3.5107446036743257e-06, "loss": 0.083, "step": 4040 }, { "epoch": 1.8384895359417652, "grad_norm": 1.400470091896617, "learning_rate": 3.510090925744787e-06, "loss": 0.1615, "step": 4041 }, { "epoch": 1.8389444949954505, "grad_norm": 1.3642534795589096, "learning_rate": 3.5094371652751797e-06, "loss": 0.132, "step": 4042 }, { "epoch": 1.8393994540491356, "grad_norm": 1.5943195188520494, "learning_rate": 3.508783322318925e-06, "loss": 0.0711, "step": 4043 }, { "epoch": 1.8398544131028207, "grad_norm": 1.5410077381378657, "learning_rate": 3.508129396929453e-06, "loss": 0.0802, "step": 4044 }, { "epoch": 1.840309372156506, "grad_norm": 1.2237484557563534, "learning_rate": 3.5074753891601996e-06, "loss": 0.0853, "step": 4045 }, { "epoch": 1.8407643312101911, "grad_norm": 1.3809808483872426, "learning_rate": 3.5068212990646065e-06, "loss": 0.0719, "step": 4046 }, { "epoch": 1.8412192902638762, "grad_norm": 1.6580232400757038, "learning_rate": 3.506167126696125e-06, "loss": 0.0825, "step": 4047 }, { "epoch": 1.8416742493175615, "grad_norm": 1.6111119004005907, "learning_rate": 3.5055128721082083e-06, "loss": 0.0699, "step": 4048 }, { "epoch": 1.8421292083712466, "grad_norm": 1.2583629469163944, "learning_rate": 3.5048585353543212e-06, "loss": 0.05, "step": 4049 }, { "epoch": 1.8425841674249317, "grad_norm": 1.520691246153791, "learning_rate": 3.5042041164879324e-06, "loss": 0.1246, "step": 4050 }, { "epoch": 1.843039126478617, "grad_norm": 1.962331172718861, "learning_rate": 3.503549615562518e-06, "loss": 0.1323, "step": 4051 }, { "epoch": 1.843494085532302, "grad_norm": 1.5861152941288819, "learning_rate": 3.5028950326315615e-06, "loss": 0.0488, "step": 4052 }, { "epoch": 1.8439490445859872, "grad_norm": 1.40422373818074, "learning_rate": 3.502240367748551e-06, "loss": 0.0879, "step": 4053 }, { "epoch": 1.8444040036396725, "grad_norm": 1.3766167862609862, "learning_rate": 3.501585620966985e-06, "loss": 0.1228, "step": 4054 }, { "epoch": 1.8448589626933576, "grad_norm": 1.3311696208204924, "learning_rate": 3.5009307923403634e-06, "loss": 0.1097, "step": 4055 }, { "epoch": 1.8453139217470427, "grad_norm": 1.6533578404754772, "learning_rate": 3.5002758819221993e-06, "loss": 0.0815, "step": 4056 }, { "epoch": 1.845768880800728, "grad_norm": 1.6320533353857831, "learning_rate": 3.499620889766007e-06, "loss": 0.1019, "step": 4057 }, { "epoch": 1.846223839854413, "grad_norm": 1.648962535080028, "learning_rate": 3.4989658159253094e-06, "loss": 0.0865, "step": 4058 }, { "epoch": 1.8466787989080982, "grad_norm": 1.3312561269568546, "learning_rate": 3.4983106604536367e-06, "loss": 0.0729, "step": 4059 }, { "epoch": 1.8471337579617835, "grad_norm": 1.3509856527487163, "learning_rate": 3.4976554234045253e-06, "loss": 0.1184, "step": 4060 }, { "epoch": 1.8475887170154686, "grad_norm": 1.5235035435559412, "learning_rate": 3.4970001048315184e-06, "loss": 0.0508, "step": 4061 }, { "epoch": 1.8480436760691537, "grad_norm": 1.173808780891847, "learning_rate": 3.496344704788165e-06, "loss": 0.0655, "step": 4062 }, { "epoch": 1.848498635122839, "grad_norm": 1.595476347277881, "learning_rate": 3.4956892233280214e-06, "loss": 0.1071, "step": 4063 }, { "epoch": 1.8489535941765243, "grad_norm": 1.5499460618833258, "learning_rate": 3.4950336605046513e-06, "loss": 0.0864, "step": 4064 }, { "epoch": 1.8494085532302091, "grad_norm": 1.7045602362387247, "learning_rate": 3.4943780163716232e-06, "loss": 0.1251, "step": 4065 }, { "epoch": 1.8498635122838945, "grad_norm": 2.0393983541875644, "learning_rate": 3.4937222909825157e-06, "loss": 0.1247, "step": 4066 }, { "epoch": 1.8503184713375798, "grad_norm": 1.4799835730020663, "learning_rate": 3.493066484390909e-06, "loss": 0.0689, "step": 4067 }, { "epoch": 1.8507734303912646, "grad_norm": 1.3859003801260346, "learning_rate": 3.4924105966503952e-06, "loss": 0.0901, "step": 4068 }, { "epoch": 1.85122838944495, "grad_norm": 1.902908525856045, "learning_rate": 3.491754627814568e-06, "loss": 0.1364, "step": 4069 }, { "epoch": 1.8516833484986353, "grad_norm": 1.8154188386652903, "learning_rate": 3.491098577937031e-06, "loss": 0.0759, "step": 4070 }, { "epoch": 1.8521383075523203, "grad_norm": 1.7607883845330532, "learning_rate": 3.4904424470713947e-06, "loss": 0.0718, "step": 4071 }, { "epoch": 1.8525932666060054, "grad_norm": 1.232456200431414, "learning_rate": 3.4897862352712743e-06, "loss": 0.0968, "step": 4072 }, { "epoch": 1.8530482256596907, "grad_norm": 1.262168638125117, "learning_rate": 3.4891299425902923e-06, "loss": 0.0795, "step": 4073 }, { "epoch": 1.8535031847133758, "grad_norm": 1.2761919178170098, "learning_rate": 3.4884735690820786e-06, "loss": 0.0891, "step": 4074 }, { "epoch": 1.853958143767061, "grad_norm": 2.2576745602212616, "learning_rate": 3.4878171148002694e-06, "loss": 0.1388, "step": 4075 }, { "epoch": 1.8544131028207462, "grad_norm": 1.2237522911961478, "learning_rate": 3.4871605797985052e-06, "loss": 0.0794, "step": 4076 }, { "epoch": 1.8548680618744313, "grad_norm": 1.8099647025853436, "learning_rate": 3.486503964130437e-06, "loss": 0.0847, "step": 4077 }, { "epoch": 1.8553230209281164, "grad_norm": 1.6323327898650988, "learning_rate": 3.4858472678497204e-06, "loss": 0.0567, "step": 4078 }, { "epoch": 1.8557779799818017, "grad_norm": 1.7401108896095778, "learning_rate": 3.4851904910100166e-06, "loss": 0.1463, "step": 4079 }, { "epoch": 1.8562329390354868, "grad_norm": 2.0769505024451966, "learning_rate": 3.4845336336649943e-06, "loss": 0.1198, "step": 4080 }, { "epoch": 1.856687898089172, "grad_norm": 2.265845834591741, "learning_rate": 3.483876695868331e-06, "loss": 0.0655, "step": 4081 }, { "epoch": 1.8571428571428572, "grad_norm": 1.682189107769135, "learning_rate": 3.483219677673706e-06, "loss": 0.1074, "step": 4082 }, { "epoch": 1.8575978161965423, "grad_norm": 1.7295718722221005, "learning_rate": 3.4825625791348093e-06, "loss": 0.0945, "step": 4083 }, { "epoch": 1.8580527752502274, "grad_norm": 1.8102113782062268, "learning_rate": 3.481905400305336e-06, "loss": 0.0732, "step": 4084 }, { "epoch": 1.8585077343039127, "grad_norm": 1.1777282691174737, "learning_rate": 3.481248141238988e-06, "loss": 0.0807, "step": 4085 }, { "epoch": 1.8589626933575978, "grad_norm": 1.923431314724143, "learning_rate": 3.480590801989473e-06, "loss": 0.1135, "step": 4086 }, { "epoch": 1.8594176524112829, "grad_norm": 2.5746172253260773, "learning_rate": 3.479933382610506e-06, "loss": 0.0974, "step": 4087 }, { "epoch": 1.8598726114649682, "grad_norm": 1.7240296145700584, "learning_rate": 3.479275883155808e-06, "loss": 0.1115, "step": 4088 }, { "epoch": 1.8603275705186533, "grad_norm": 1.3382015084272265, "learning_rate": 3.478618303679108e-06, "loss": 0.1034, "step": 4089 }, { "epoch": 1.8607825295723384, "grad_norm": 1.4357343361961057, "learning_rate": 3.4779606442341385e-06, "loss": 0.0676, "step": 4090 }, { "epoch": 1.8612374886260237, "grad_norm": 1.6229943285154442, "learning_rate": 3.477302904874642e-06, "loss": 0.0685, "step": 4091 }, { "epoch": 1.861692447679709, "grad_norm": 1.26527720451928, "learning_rate": 3.476645085654366e-06, "loss": 0.0616, "step": 4092 }, { "epoch": 1.8621474067333939, "grad_norm": 1.0737439306727963, "learning_rate": 3.4759871866270633e-06, "loss": 0.0563, "step": 4093 }, { "epoch": 1.8626023657870792, "grad_norm": 1.5730297691025419, "learning_rate": 3.475329207846496e-06, "loss": 0.0851, "step": 4094 }, { "epoch": 1.8630573248407645, "grad_norm": 1.486403108787133, "learning_rate": 3.4746711493664305e-06, "loss": 0.1075, "step": 4095 }, { "epoch": 1.8635122838944493, "grad_norm": 1.6106703680190648, "learning_rate": 3.4740130112406395e-06, "loss": 0.1033, "step": 4096 }, { "epoch": 1.8639672429481347, "grad_norm": 0.9991238739762511, "learning_rate": 3.473354793522904e-06, "loss": 0.0409, "step": 4097 }, { "epoch": 1.86442220200182, "grad_norm": 1.3237049132633334, "learning_rate": 3.472696496267011e-06, "loss": 0.0848, "step": 4098 }, { "epoch": 1.864877161055505, "grad_norm": 1.5626147938589972, "learning_rate": 3.4720381195267523e-06, "loss": 0.0661, "step": 4099 }, { "epoch": 1.8653321201091901, "grad_norm": 1.620639560734756, "learning_rate": 3.4713796633559283e-06, "loss": 0.1121, "step": 4100 }, { "epoch": 1.8657870791628755, "grad_norm": 1.7494971582944916, "learning_rate": 3.4707211278083453e-06, "loss": 0.0852, "step": 4101 }, { "epoch": 1.8662420382165605, "grad_norm": 1.5245162427101169, "learning_rate": 3.470062512937815e-06, "loss": 0.102, "step": 4102 }, { "epoch": 1.8666969972702456, "grad_norm": 1.2295306337132756, "learning_rate": 3.4694038187981573e-06, "loss": 0.0755, "step": 4103 }, { "epoch": 1.867151956323931, "grad_norm": 1.8099872942646476, "learning_rate": 3.468745045443197e-06, "loss": 0.0994, "step": 4104 }, { "epoch": 1.867606915377616, "grad_norm": 1.3329597281600571, "learning_rate": 3.468086192926767e-06, "loss": 0.0443, "step": 4105 }, { "epoch": 1.8680618744313011, "grad_norm": 1.3775378288923616, "learning_rate": 3.4674272613027043e-06, "loss": 0.0658, "step": 4106 }, { "epoch": 1.8685168334849864, "grad_norm": 1.3139765596285502, "learning_rate": 3.4667682506248547e-06, "loss": 0.0915, "step": 4107 }, { "epoch": 1.8689717925386715, "grad_norm": 1.0947935956297494, "learning_rate": 3.46610916094707e-06, "loss": 0.0741, "step": 4108 }, { "epoch": 1.8694267515923566, "grad_norm": 1.642479107568066, "learning_rate": 3.465449992323208e-06, "loss": 0.0848, "step": 4109 }, { "epoch": 1.869881710646042, "grad_norm": 1.5490672755624628, "learning_rate": 3.4647907448071315e-06, "loss": 0.0964, "step": 4110 }, { "epoch": 1.870336669699727, "grad_norm": 1.4204634027218384, "learning_rate": 3.464131418452713e-06, "loss": 0.0835, "step": 4111 }, { "epoch": 1.870791628753412, "grad_norm": 1.7886424017454572, "learning_rate": 3.463472013313829e-06, "loss": 0.0776, "step": 4112 }, { "epoch": 1.8712465878070974, "grad_norm": 1.5351637721963112, "learning_rate": 3.4628125294443625e-06, "loss": 0.0807, "step": 4113 }, { "epoch": 1.8717015468607825, "grad_norm": 1.929015151861915, "learning_rate": 3.4621529668982047e-06, "loss": 0.0837, "step": 4114 }, { "epoch": 1.8721565059144676, "grad_norm": 1.168687772264402, "learning_rate": 3.4614933257292514e-06, "loss": 0.0879, "step": 4115 }, { "epoch": 1.872611464968153, "grad_norm": 2.0245078574365882, "learning_rate": 3.4608336059914057e-06, "loss": 0.1098, "step": 4116 }, { "epoch": 1.873066424021838, "grad_norm": 1.6911368091501362, "learning_rate": 3.4601738077385766e-06, "loss": 0.0724, "step": 4117 }, { "epoch": 1.873521383075523, "grad_norm": 1.4860767448002008, "learning_rate": 3.4595139310246795e-06, "loss": 0.1005, "step": 4118 }, { "epoch": 1.8739763421292084, "grad_norm": 1.4577177516571427, "learning_rate": 3.4588539759036377e-06, "loss": 0.0813, "step": 4119 }, { "epoch": 1.8744313011828937, "grad_norm": 1.9012543010358114, "learning_rate": 3.4581939424293792e-06, "loss": 0.0861, "step": 4120 }, { "epoch": 1.8748862602365786, "grad_norm": 2.2109125106057803, "learning_rate": 3.457533830655838e-06, "loss": 0.131, "step": 4121 }, { "epoch": 1.8753412192902639, "grad_norm": 1.9039444326789203, "learning_rate": 3.456873640636958e-06, "loss": 0.121, "step": 4122 }, { "epoch": 1.8757961783439492, "grad_norm": 1.7246099220063285, "learning_rate": 3.456213372426684e-06, "loss": 0.0893, "step": 4123 }, { "epoch": 1.876251137397634, "grad_norm": 1.4171558180903838, "learning_rate": 3.4555530260789715e-06, "loss": 0.0716, "step": 4124 }, { "epoch": 1.8767060964513194, "grad_norm": 1.4298474808344117, "learning_rate": 3.4548926016477815e-06, "loss": 0.1171, "step": 4125 }, { "epoch": 1.8771610555050047, "grad_norm": 1.5779558294671008, "learning_rate": 3.4542320991870803e-06, "loss": 0.1096, "step": 4126 }, { "epoch": 1.8776160145586898, "grad_norm": 1.6538121796546623, "learning_rate": 3.4535715187508406e-06, "loss": 0.0771, "step": 4127 }, { "epoch": 1.8780709736123748, "grad_norm": 1.5836249384701016, "learning_rate": 3.4529108603930428e-06, "loss": 0.054, "step": 4128 }, { "epoch": 1.8785259326660602, "grad_norm": 1.4868161588123674, "learning_rate": 3.452250124167674e-06, "loss": 0.1174, "step": 4129 }, { "epoch": 1.8789808917197452, "grad_norm": 1.3276341541416605, "learning_rate": 3.451589310128724e-06, "loss": 0.0982, "step": 4130 }, { "epoch": 1.8794358507734303, "grad_norm": 1.6179268201804782, "learning_rate": 3.450928418330193e-06, "loss": 0.074, "step": 4131 }, { "epoch": 1.8798908098271156, "grad_norm": 2.4291985281222557, "learning_rate": 3.450267448826087e-06, "loss": 0.1213, "step": 4132 }, { "epoch": 1.8803457688808007, "grad_norm": 1.4638492167669688, "learning_rate": 3.4496064016704158e-06, "loss": 0.1055, "step": 4133 }, { "epoch": 1.8808007279344858, "grad_norm": 1.8954355792147906, "learning_rate": 3.4489452769171982e-06, "loss": 0.0754, "step": 4134 }, { "epoch": 1.8812556869881711, "grad_norm": 1.5482792861871577, "learning_rate": 3.4482840746204573e-06, "loss": 0.0949, "step": 4135 }, { "epoch": 1.8817106460418562, "grad_norm": 1.605992787254981, "learning_rate": 3.4476227948342247e-06, "loss": 0.0665, "step": 4136 }, { "epoch": 1.8821656050955413, "grad_norm": 1.6666267518026783, "learning_rate": 3.446961437612536e-06, "loss": 0.1276, "step": 4137 }, { "epoch": 1.8826205641492266, "grad_norm": 1.3377887182050099, "learning_rate": 3.4463000030094356e-06, "loss": 0.0749, "step": 4138 }, { "epoch": 1.8830755232029117, "grad_norm": 1.443178169641014, "learning_rate": 3.445638491078973e-06, "loss": 0.095, "step": 4139 }, { "epoch": 1.8835304822565968, "grad_norm": 1.2096197017799846, "learning_rate": 3.4449769018752027e-06, "loss": 0.1157, "step": 4140 }, { "epoch": 1.8839854413102821, "grad_norm": 1.7479045175597485, "learning_rate": 3.4443152354521882e-06, "loss": 0.1371, "step": 4141 }, { "epoch": 1.8844404003639672, "grad_norm": 1.362663666071147, "learning_rate": 3.4436534918639957e-06, "loss": 0.0748, "step": 4142 }, { "epoch": 1.8848953594176523, "grad_norm": 2.047652186105477, "learning_rate": 3.442991671164703e-06, "loss": 0.089, "step": 4143 }, { "epoch": 1.8853503184713376, "grad_norm": 1.6841039140811258, "learning_rate": 3.4423297734083884e-06, "loss": 0.1015, "step": 4144 }, { "epoch": 1.8858052775250227, "grad_norm": 1.628024361018149, "learning_rate": 3.4416677986491397e-06, "loss": 0.0886, "step": 4145 }, { "epoch": 1.8862602365787078, "grad_norm": 1.3985654728455448, "learning_rate": 3.4410057469410524e-06, "loss": 0.094, "step": 4146 }, { "epoch": 1.886715195632393, "grad_norm": 1.63393698506371, "learning_rate": 3.4403436183382244e-06, "loss": 0.1191, "step": 4147 }, { "epoch": 1.8871701546860784, "grad_norm": 2.169278893001332, "learning_rate": 3.4396814128947626e-06, "loss": 0.1149, "step": 4148 }, { "epoch": 1.8876251137397633, "grad_norm": 1.3500745972311974, "learning_rate": 3.4390191306647787e-06, "loss": 0.0625, "step": 4149 }, { "epoch": 1.8880800727934486, "grad_norm": 1.880335090818805, "learning_rate": 3.4383567717023923e-06, "loss": 0.0963, "step": 4150 }, { "epoch": 1.888535031847134, "grad_norm": 1.3484485672364326, "learning_rate": 3.437694336061729e-06, "loss": 0.0748, "step": 4151 }, { "epoch": 1.8889899909008188, "grad_norm": 1.5546960807175936, "learning_rate": 3.437031823796918e-06, "loss": 0.0717, "step": 4152 }, { "epoch": 1.889444949954504, "grad_norm": 1.5366244360199475, "learning_rate": 3.436369234962099e-06, "loss": 0.1323, "step": 4153 }, { "epoch": 1.8898999090081894, "grad_norm": 1.4261034883476653, "learning_rate": 3.4357065696114134e-06, "loss": 0.0727, "step": 4154 }, { "epoch": 1.8903548680618745, "grad_norm": 1.7698096337479736, "learning_rate": 3.435043827799014e-06, "loss": 0.1611, "step": 4155 }, { "epoch": 1.8908098271155596, "grad_norm": 1.1968310976690506, "learning_rate": 3.4343810095790547e-06, "loss": 0.0901, "step": 4156 }, { "epoch": 1.8912647861692449, "grad_norm": 1.4543608662752987, "learning_rate": 3.4337181150056984e-06, "loss": 0.0627, "step": 4157 }, { "epoch": 1.89171974522293, "grad_norm": 1.7567759579741318, "learning_rate": 3.433055144133116e-06, "loss": 0.069, "step": 4158 }, { "epoch": 1.892174704276615, "grad_norm": 1.5341657106666644, "learning_rate": 3.432392097015479e-06, "loss": 0.0638, "step": 4159 }, { "epoch": 1.8926296633303004, "grad_norm": 1.3857344307572173, "learning_rate": 3.431728973706972e-06, "loss": 0.1167, "step": 4160 }, { "epoch": 1.8930846223839854, "grad_norm": 1.5564261487733888, "learning_rate": 3.4310657742617804e-06, "loss": 0.1286, "step": 4161 }, { "epoch": 1.8935395814376705, "grad_norm": 1.5999555210442749, "learning_rate": 3.4304024987340982e-06, "loss": 0.0886, "step": 4162 }, { "epoch": 1.8939945404913558, "grad_norm": 1.1852083005472314, "learning_rate": 3.429739147178126e-06, "loss": 0.0831, "step": 4163 }, { "epoch": 1.894449499545041, "grad_norm": 1.7288664351923562, "learning_rate": 3.4290757196480683e-06, "loss": 0.0996, "step": 4164 }, { "epoch": 1.894904458598726, "grad_norm": 1.6366168855755323, "learning_rate": 3.42841221619814e-06, "loss": 0.1368, "step": 4165 }, { "epoch": 1.8953594176524113, "grad_norm": 2.065953250826463, "learning_rate": 3.4277486368825563e-06, "loss": 0.1126, "step": 4166 }, { "epoch": 1.8958143767060964, "grad_norm": 1.2504353488837774, "learning_rate": 3.427084981755545e-06, "loss": 0.086, "step": 4167 }, { "epoch": 1.8962693357597815, "grad_norm": 1.4527484114059985, "learning_rate": 3.4264212508713357e-06, "loss": 0.0932, "step": 4168 }, { "epoch": 1.8967242948134668, "grad_norm": 1.2819443469844625, "learning_rate": 3.4257574442841644e-06, "loss": 0.0705, "step": 4169 }, { "epoch": 1.897179253867152, "grad_norm": 1.2764501707822304, "learning_rate": 3.425093562048276e-06, "loss": 0.1165, "step": 4170 }, { "epoch": 1.897634212920837, "grad_norm": 1.5397084270484258, "learning_rate": 3.424429604217919e-06, "loss": 0.0905, "step": 4171 }, { "epoch": 1.8980891719745223, "grad_norm": 1.4767038347714316, "learning_rate": 3.4237655708473506e-06, "loss": 0.1001, "step": 4172 }, { "epoch": 1.8985441310282076, "grad_norm": 1.5794484461269536, "learning_rate": 3.4231014619908303e-06, "loss": 0.0944, "step": 4173 }, { "epoch": 1.8989990900818925, "grad_norm": 1.8037677959968994, "learning_rate": 3.422437277702628e-06, "loss": 0.0728, "step": 4174 }, { "epoch": 1.8994540491355778, "grad_norm": 1.2730899677881777, "learning_rate": 3.4217730180370168e-06, "loss": 0.103, "step": 4175 }, { "epoch": 1.8999090081892631, "grad_norm": 1.5813522156532585, "learning_rate": 3.4211086830482766e-06, "loss": 0.0629, "step": 4176 }, { "epoch": 1.900363967242948, "grad_norm": 1.5439340568071835, "learning_rate": 3.420444272790695e-06, "loss": 0.0909, "step": 4177 }, { "epoch": 1.9008189262966333, "grad_norm": 1.384715166639908, "learning_rate": 3.419779787318564e-06, "loss": 0.1135, "step": 4178 }, { "epoch": 1.9012738853503186, "grad_norm": 1.5354073791340568, "learning_rate": 3.4191152266861826e-06, "loss": 0.0699, "step": 4179 }, { "epoch": 1.9017288444040037, "grad_norm": 1.817165465251837, "learning_rate": 3.4184505909478554e-06, "loss": 0.0871, "step": 4180 }, { "epoch": 1.9021838034576888, "grad_norm": 1.5768312029259108, "learning_rate": 3.417785880157894e-06, "loss": 0.1492, "step": 4181 }, { "epoch": 1.902638762511374, "grad_norm": 1.5007855151918974, "learning_rate": 3.417121094370615e-06, "loss": 0.0891, "step": 4182 }, { "epoch": 1.9030937215650592, "grad_norm": 1.8333071942312296, "learning_rate": 3.416456233640342e-06, "loss": 0.1055, "step": 4183 }, { "epoch": 1.9035486806187443, "grad_norm": 1.5813199055923117, "learning_rate": 3.4157912980214036e-06, "loss": 0.1061, "step": 4184 }, { "epoch": 1.9040036396724296, "grad_norm": 1.489036147310256, "learning_rate": 3.4151262875681362e-06, "loss": 0.1495, "step": 4185 }, { "epoch": 1.9044585987261147, "grad_norm": 1.3317921141371796, "learning_rate": 3.4144612023348823e-06, "loss": 0.0907, "step": 4186 }, { "epoch": 1.9049135577797998, "grad_norm": 1.0212810858136019, "learning_rate": 3.4137960423759874e-06, "loss": 0.0866, "step": 4187 }, { "epoch": 1.905368516833485, "grad_norm": 1.4701853998272907, "learning_rate": 3.413130807745807e-06, "loss": 0.1165, "step": 4188 }, { "epoch": 1.9058234758871702, "grad_norm": 1.5228823160770288, "learning_rate": 3.4124654984987003e-06, "loss": 0.0651, "step": 4189 }, { "epoch": 1.9062784349408552, "grad_norm": 1.8390923462902071, "learning_rate": 3.4118001146890345e-06, "loss": 0.0978, "step": 4190 }, { "epoch": 1.9067333939945406, "grad_norm": 1.3237125602202398, "learning_rate": 3.41113465637118e-06, "loss": 0.0458, "step": 4191 }, { "epoch": 1.9071883530482256, "grad_norm": 1.318178192583189, "learning_rate": 3.4104691235995173e-06, "loss": 0.1226, "step": 4192 }, { "epoch": 1.9076433121019107, "grad_norm": 1.90871276811074, "learning_rate": 3.4098035164284284e-06, "loss": 0.0755, "step": 4193 }, { "epoch": 1.908098271155596, "grad_norm": 1.888532421146149, "learning_rate": 3.409137834912305e-06, "loss": 0.0888, "step": 4194 }, { "epoch": 1.9085532302092811, "grad_norm": 2.1412900407584203, "learning_rate": 3.408472079105544e-06, "loss": 0.1243, "step": 4195 }, { "epoch": 1.9090081892629662, "grad_norm": 1.6214672817432414, "learning_rate": 3.4078062490625465e-06, "loss": 0.0946, "step": 4196 }, { "epoch": 1.9094631483166515, "grad_norm": 1.7207237095924448, "learning_rate": 3.407140344837722e-06, "loss": 0.1412, "step": 4197 }, { "epoch": 1.9099181073703366, "grad_norm": 1.1232379992379184, "learning_rate": 3.4064743664854853e-06, "loss": 0.0491, "step": 4198 }, { "epoch": 1.9103730664240217, "grad_norm": 1.2893068056337496, "learning_rate": 3.405808314060257e-06, "loss": 0.0965, "step": 4199 }, { "epoch": 1.910828025477707, "grad_norm": 1.3167444568623385, "learning_rate": 3.4051421876164643e-06, "loss": 0.0624, "step": 4200 }, { "epoch": 1.9112829845313923, "grad_norm": 1.7563607754275163, "learning_rate": 3.4044759872085387e-06, "loss": 0.1114, "step": 4201 }, { "epoch": 1.9117379435850772, "grad_norm": 1.4383815772821011, "learning_rate": 3.4038097128909207e-06, "loss": 0.0949, "step": 4202 }, { "epoch": 1.9121929026387625, "grad_norm": 1.4727257182675373, "learning_rate": 3.4031433647180547e-06, "loss": 0.0823, "step": 4203 }, { "epoch": 1.9126478616924478, "grad_norm": 1.5230028514464025, "learning_rate": 3.4024769427443916e-06, "loss": 0.0461, "step": 4204 }, { "epoch": 1.9131028207461327, "grad_norm": 1.2703547948455516, "learning_rate": 3.4018104470243866e-06, "loss": 0.1002, "step": 4205 }, { "epoch": 1.913557779799818, "grad_norm": 1.5697864182582966, "learning_rate": 3.401143877612506e-06, "loss": 0.1227, "step": 4206 }, { "epoch": 1.9140127388535033, "grad_norm": 1.3529567116728383, "learning_rate": 3.400477234563217e-06, "loss": 0.0543, "step": 4207 }, { "epoch": 1.9144676979071884, "grad_norm": 1.6549119747593437, "learning_rate": 3.3998105179309946e-06, "loss": 0.0851, "step": 4208 }, { "epoch": 1.9149226569608735, "grad_norm": 1.389562371993063, "learning_rate": 3.399143727770321e-06, "loss": 0.0859, "step": 4209 }, { "epoch": 1.9153776160145588, "grad_norm": 1.8103340113113127, "learning_rate": 3.3984768641356812e-06, "loss": 0.1051, "step": 4210 }, { "epoch": 1.915832575068244, "grad_norm": 1.1896273712625753, "learning_rate": 3.3978099270815714e-06, "loss": 0.1049, "step": 4211 }, { "epoch": 1.916287534121929, "grad_norm": 4.693552632091914, "learning_rate": 3.3971429166624864e-06, "loss": 0.1012, "step": 4212 }, { "epoch": 1.9167424931756143, "grad_norm": 1.2914693959831682, "learning_rate": 3.3964758329329356e-06, "loss": 0.0712, "step": 4213 }, { "epoch": 1.9171974522292994, "grad_norm": 1.8857762401787097, "learning_rate": 3.3958086759474275e-06, "loss": 0.0966, "step": 4214 }, { "epoch": 1.9176524112829845, "grad_norm": 2.138780678277845, "learning_rate": 3.395141445760479e-06, "loss": 0.0919, "step": 4215 }, { "epoch": 1.9181073703366698, "grad_norm": 1.6667470435625533, "learning_rate": 3.394474142426616e-06, "loss": 0.0923, "step": 4216 }, { "epoch": 1.9185623293903549, "grad_norm": 1.3335038943222106, "learning_rate": 3.3938067660003636e-06, "loss": 0.0829, "step": 4217 }, { "epoch": 1.91901728844404, "grad_norm": 2.0477420635665275, "learning_rate": 3.3931393165362604e-06, "loss": 0.1081, "step": 4218 }, { "epoch": 1.9194722474977253, "grad_norm": 1.5541163916821557, "learning_rate": 3.3924717940888437e-06, "loss": 0.0777, "step": 4219 }, { "epoch": 1.9199272065514104, "grad_norm": 1.6724662847992082, "learning_rate": 3.391804198712664e-06, "loss": 0.1158, "step": 4220 }, { "epoch": 1.9203821656050954, "grad_norm": 1.265849381594138, "learning_rate": 3.391136530462272e-06, "loss": 0.0626, "step": 4221 }, { "epoch": 1.9208371246587808, "grad_norm": 1.9909927365037632, "learning_rate": 3.390468789392226e-06, "loss": 0.1135, "step": 4222 }, { "epoch": 1.9212920837124658, "grad_norm": 1.5135348547858094, "learning_rate": 3.389800975557093e-06, "loss": 0.0665, "step": 4223 }, { "epoch": 1.921747042766151, "grad_norm": 1.8996574951937835, "learning_rate": 3.389133089011442e-06, "loss": 0.1137, "step": 4224 }, { "epoch": 1.9222020018198362, "grad_norm": 1.5715678093274923, "learning_rate": 3.3884651298098514e-06, "loss": 0.1505, "step": 4225 }, { "epoch": 1.9226569608735213, "grad_norm": 1.907326915988784, "learning_rate": 3.3877970980069015e-06, "loss": 0.0998, "step": 4226 }, { "epoch": 1.9231119199272064, "grad_norm": 2.024206842961864, "learning_rate": 3.387128993657182e-06, "loss": 0.0737, "step": 4227 }, { "epoch": 1.9235668789808917, "grad_norm": 1.7141606500673001, "learning_rate": 3.3864608168152885e-06, "loss": 0.0712, "step": 4228 }, { "epoch": 1.924021838034577, "grad_norm": 1.8856052238226075, "learning_rate": 3.3857925675358187e-06, "loss": 0.0798, "step": 4229 }, { "epoch": 1.924476797088262, "grad_norm": 1.8040177848162038, "learning_rate": 3.3851242458733818e-06, "loss": 0.108, "step": 4230 }, { "epoch": 1.9249317561419472, "grad_norm": 1.550087770756717, "learning_rate": 3.3844558518825876e-06, "loss": 0.1161, "step": 4231 }, { "epoch": 1.9253867151956325, "grad_norm": 1.8166059513460924, "learning_rate": 3.383787385618057e-06, "loss": 0.1215, "step": 4232 }, { "epoch": 1.9258416742493174, "grad_norm": 1.579013002911444, "learning_rate": 3.383118847134411e-06, "loss": 0.0738, "step": 4233 }, { "epoch": 1.9262966333030027, "grad_norm": 2.7248106640654175, "learning_rate": 3.382450236486281e-06, "loss": 0.0723, "step": 4234 }, { "epoch": 1.926751592356688, "grad_norm": 1.6039637438869014, "learning_rate": 3.3817815537283033e-06, "loss": 0.0634, "step": 4235 }, { "epoch": 1.9272065514103731, "grad_norm": 1.5823801671042332, "learning_rate": 3.381112798915118e-06, "loss": 0.0844, "step": 4236 }, { "epoch": 1.9276615104640582, "grad_norm": 1.6692132848519299, "learning_rate": 3.3804439721013756e-06, "loss": 0.0692, "step": 4237 }, { "epoch": 1.9281164695177435, "grad_norm": 1.4772277957005857, "learning_rate": 3.379775073341727e-06, "loss": 0.0927, "step": 4238 }, { "epoch": 1.9285714285714286, "grad_norm": 1.4647867454479184, "learning_rate": 3.3791061026908323e-06, "loss": 0.0894, "step": 4239 }, { "epoch": 1.9290263876251137, "grad_norm": 1.2080364424941343, "learning_rate": 3.3784370602033572e-06, "loss": 0.0979, "step": 4240 }, { "epoch": 1.929481346678799, "grad_norm": 1.6169547894179497, "learning_rate": 3.3777679459339717e-06, "loss": 0.0767, "step": 4241 }, { "epoch": 1.929936305732484, "grad_norm": 2.1210462210696446, "learning_rate": 3.377098759937355e-06, "loss": 0.1055, "step": 4242 }, { "epoch": 1.9303912647861692, "grad_norm": 1.866424366987311, "learning_rate": 3.376429502268188e-06, "loss": 0.0771, "step": 4243 }, { "epoch": 1.9308462238398545, "grad_norm": 1.6670651248692852, "learning_rate": 3.3757601729811596e-06, "loss": 0.1126, "step": 4244 }, { "epoch": 1.9313011828935396, "grad_norm": 1.84109334470497, "learning_rate": 3.3750907721309658e-06, "loss": 0.0745, "step": 4245 }, { "epoch": 1.9317561419472247, "grad_norm": 1.5957545276113658, "learning_rate": 3.374421299772305e-06, "loss": 0.0569, "step": 4246 }, { "epoch": 1.93221110100091, "grad_norm": 1.4143791490125228, "learning_rate": 3.373751755959884e-06, "loss": 0.0737, "step": 4247 }, { "epoch": 1.932666060054595, "grad_norm": 1.4208885366629587, "learning_rate": 3.373082140748416e-06, "loss": 0.0982, "step": 4248 }, { "epoch": 1.9331210191082802, "grad_norm": 1.1679284731413264, "learning_rate": 3.3724124541926184e-06, "loss": 0.1251, "step": 4249 }, { "epoch": 1.9335759781619655, "grad_norm": 1.953737875993607, "learning_rate": 3.3717426963472146e-06, "loss": 0.0816, "step": 4250 }, { "epoch": 1.9340309372156506, "grad_norm": 1.5621159136345937, "learning_rate": 3.371072867266934e-06, "loss": 0.0544, "step": 4251 }, { "epoch": 1.9344858962693356, "grad_norm": 1.3766557889003566, "learning_rate": 3.3704029670065135e-06, "loss": 0.0778, "step": 4252 }, { "epoch": 1.934940855323021, "grad_norm": 1.516566766950502, "learning_rate": 3.3697329956206927e-06, "loss": 0.1051, "step": 4253 }, { "epoch": 1.935395814376706, "grad_norm": 1.634215459300019, "learning_rate": 3.3690629531642188e-06, "loss": 0.095, "step": 4254 }, { "epoch": 1.9358507734303911, "grad_norm": 1.374619076854696, "learning_rate": 3.3683928396918453e-06, "loss": 0.0725, "step": 4255 }, { "epoch": 1.9363057324840764, "grad_norm": 2.4426521491422832, "learning_rate": 3.3677226552583307e-06, "loss": 0.0958, "step": 4256 }, { "epoch": 1.9367606915377618, "grad_norm": 0.9710317629325776, "learning_rate": 3.367052399918439e-06, "loss": 0.0336, "step": 4257 }, { "epoch": 1.9372156505914466, "grad_norm": 1.279247709122935, "learning_rate": 3.3663820737269408e-06, "loss": 0.0635, "step": 4258 }, { "epoch": 1.937670609645132, "grad_norm": 1.3133529299598832, "learning_rate": 3.365711676738612e-06, "loss": 0.0763, "step": 4259 }, { "epoch": 1.9381255686988172, "grad_norm": 1.8440514621923751, "learning_rate": 3.365041209008235e-06, "loss": 0.1098, "step": 4260 }, { "epoch": 1.9385805277525021, "grad_norm": 1.8941004460516286, "learning_rate": 3.3643706705905967e-06, "loss": 0.0599, "step": 4261 }, { "epoch": 1.9390354868061874, "grad_norm": 1.418317849025675, "learning_rate": 3.3637000615404907e-06, "loss": 0.0844, "step": 4262 }, { "epoch": 1.9394904458598727, "grad_norm": 1.4225397954436425, "learning_rate": 3.3630293819127157e-06, "loss": 0.0627, "step": 4263 }, { "epoch": 1.9399454049135578, "grad_norm": 1.5499047466460552, "learning_rate": 3.362358631762077e-06, "loss": 0.0712, "step": 4264 }, { "epoch": 1.940400363967243, "grad_norm": 1.3759364825761153, "learning_rate": 3.361687811143386e-06, "loss": 0.1249, "step": 4265 }, { "epoch": 1.9408553230209282, "grad_norm": 1.3962043920868283, "learning_rate": 3.3610169201114586e-06, "loss": 0.0695, "step": 4266 }, { "epoch": 1.9413102820746133, "grad_norm": 1.7410420286349613, "learning_rate": 3.360345958721116e-06, "loss": 0.0831, "step": 4267 }, { "epoch": 1.9417652411282984, "grad_norm": 1.7384559065313046, "learning_rate": 3.3596749270271868e-06, "loss": 0.068, "step": 4268 }, { "epoch": 1.9422202001819837, "grad_norm": 1.710671399707371, "learning_rate": 3.3590038250845052e-06, "loss": 0.0896, "step": 4269 }, { "epoch": 1.9426751592356688, "grad_norm": 1.8288895728597674, "learning_rate": 3.3583326529479103e-06, "loss": 0.1297, "step": 4270 }, { "epoch": 1.943130118289354, "grad_norm": 1.4604696471335676, "learning_rate": 3.3576614106722473e-06, "loss": 0.1116, "step": 4271 }, { "epoch": 1.9435850773430392, "grad_norm": 1.3615113695366645, "learning_rate": 3.356990098312366e-06, "loss": 0.0584, "step": 4272 }, { "epoch": 1.9440400363967243, "grad_norm": 1.5450640733905168, "learning_rate": 3.3563187159231255e-06, "loss": 0.1159, "step": 4273 }, { "epoch": 1.9444949954504094, "grad_norm": 1.487305191758892, "learning_rate": 3.355647263559386e-06, "loss": 0.0974, "step": 4274 }, { "epoch": 1.9449499545040947, "grad_norm": 1.215513598595534, "learning_rate": 3.354975741276016e-06, "loss": 0.0694, "step": 4275 }, { "epoch": 1.9454049135577798, "grad_norm": 2.0308584140590678, "learning_rate": 3.354304149127889e-06, "loss": 0.0877, "step": 4276 }, { "epoch": 1.9458598726114649, "grad_norm": 1.6320496452502486, "learning_rate": 3.353632487169886e-06, "loss": 0.1083, "step": 4277 }, { "epoch": 1.9463148316651502, "grad_norm": 1.7844036947871904, "learning_rate": 3.3529607554568904e-06, "loss": 0.1048, "step": 4278 }, { "epoch": 1.9467697907188353, "grad_norm": 1.314333956059009, "learning_rate": 3.3522889540437946e-06, "loss": 0.1335, "step": 4279 }, { "epoch": 1.9472247497725204, "grad_norm": 1.4891497980201782, "learning_rate": 3.3516170829854938e-06, "loss": 0.0884, "step": 4280 }, { "epoch": 1.9476797088262057, "grad_norm": 1.3310188865285766, "learning_rate": 3.350945142336891e-06, "loss": 0.068, "step": 4281 }, { "epoch": 1.9481346678798908, "grad_norm": 1.4908171158213093, "learning_rate": 3.3502731321528936e-06, "loss": 0.078, "step": 4282 }, { "epoch": 1.9485896269335758, "grad_norm": 1.5638462641757567, "learning_rate": 3.349601052488416e-06, "loss": 0.0882, "step": 4283 }, { "epoch": 1.9490445859872612, "grad_norm": 1.640468405785834, "learning_rate": 3.3489289033983767e-06, "loss": 0.1046, "step": 4284 }, { "epoch": 1.9494995450409465, "grad_norm": 1.290192657203814, "learning_rate": 3.3482566849377017e-06, "loss": 0.085, "step": 4285 }, { "epoch": 1.9499545040946313, "grad_norm": 1.602539906989237, "learning_rate": 3.347584397161321e-06, "loss": 0.0882, "step": 4286 }, { "epoch": 1.9504094631483166, "grad_norm": 1.5497579354992663, "learning_rate": 3.3469120401241705e-06, "loss": 0.0786, "step": 4287 }, { "epoch": 1.950864422202002, "grad_norm": 1.5771212520177949, "learning_rate": 3.3462396138811936e-06, "loss": 0.0952, "step": 4288 }, { "epoch": 1.9513193812556868, "grad_norm": 1.666637919686411, "learning_rate": 3.3455671184873363e-06, "loss": 0.0661, "step": 4289 }, { "epoch": 1.9517743403093721, "grad_norm": 1.5262836237273811, "learning_rate": 3.3448945539975532e-06, "loss": 0.099, "step": 4290 }, { "epoch": 1.9522292993630574, "grad_norm": 1.7583709204949445, "learning_rate": 3.3442219204668024e-06, "loss": 0.1259, "step": 4291 }, { "epoch": 1.9526842584167425, "grad_norm": 1.6777457753616931, "learning_rate": 3.3435492179500482e-06, "loss": 0.0878, "step": 4292 }, { "epoch": 1.9531392174704276, "grad_norm": 1.6248294433039778, "learning_rate": 3.3428764465022623e-06, "loss": 0.1079, "step": 4293 }, { "epoch": 1.953594176524113, "grad_norm": 2.541658686766655, "learning_rate": 3.342203606178419e-06, "loss": 0.0943, "step": 4294 }, { "epoch": 1.954049135577798, "grad_norm": 1.4490052237222681, "learning_rate": 3.341530697033501e-06, "loss": 0.0887, "step": 4295 }, { "epoch": 1.9545040946314831, "grad_norm": 1.7766434222954395, "learning_rate": 3.3408577191224938e-06, "loss": 0.1493, "step": 4296 }, { "epoch": 1.9549590536851684, "grad_norm": 1.2097476351595582, "learning_rate": 3.3401846725003916e-06, "loss": 0.0519, "step": 4297 }, { "epoch": 1.9554140127388535, "grad_norm": 6.579751344665857, "learning_rate": 3.3395115572221927e-06, "loss": 0.0902, "step": 4298 }, { "epoch": 1.9558689717925386, "grad_norm": 1.3419018624107848, "learning_rate": 3.3388383733428987e-06, "loss": 0.0967, "step": 4299 }, { "epoch": 1.956323930846224, "grad_norm": 1.8540729163630763, "learning_rate": 3.3381651209175224e-06, "loss": 0.0786, "step": 4300 }, { "epoch": 1.956778889899909, "grad_norm": 1.5184093168532122, "learning_rate": 3.3374918000010776e-06, "loss": 0.0613, "step": 4301 }, { "epoch": 1.957233848953594, "grad_norm": 1.509012769367556, "learning_rate": 3.336818410648585e-06, "loss": 0.0765, "step": 4302 }, { "epoch": 1.9576888080072794, "grad_norm": 1.1996590036978556, "learning_rate": 3.3361449529150706e-06, "loss": 0.0471, "step": 4303 }, { "epoch": 1.9581437670609645, "grad_norm": 1.9234687392066945, "learning_rate": 3.3354714268555668e-06, "loss": 0.0736, "step": 4304 }, { "epoch": 1.9585987261146496, "grad_norm": 1.4212767310216636, "learning_rate": 3.3347978325251113e-06, "loss": 0.062, "step": 4305 }, { "epoch": 1.959053685168335, "grad_norm": 1.6294389656492376, "learning_rate": 3.3341241699787456e-06, "loss": 0.0827, "step": 4306 }, { "epoch": 1.95950864422202, "grad_norm": 1.1912655384566813, "learning_rate": 3.3334504392715205e-06, "loss": 0.0413, "step": 4307 }, { "epoch": 1.959963603275705, "grad_norm": 1.6727213818125781, "learning_rate": 3.3327766404584892e-06, "loss": 0.0922, "step": 4308 }, { "epoch": 1.9604185623293904, "grad_norm": 1.4687452198433801, "learning_rate": 3.332102773594712e-06, "loss": 0.1147, "step": 4309 }, { "epoch": 1.9608735213830755, "grad_norm": 1.624721498522894, "learning_rate": 3.331428838735254e-06, "loss": 0.0596, "step": 4310 }, { "epoch": 1.9613284804367606, "grad_norm": 1.3322408844858742, "learning_rate": 3.330754835935185e-06, "loss": 0.0831, "step": 4311 }, { "epoch": 1.9617834394904459, "grad_norm": 1.6830337729365803, "learning_rate": 3.330080765249584e-06, "loss": 0.0723, "step": 4312 }, { "epoch": 1.9622383985441312, "grad_norm": 1.1636750651051007, "learning_rate": 3.32940662673353e-06, "loss": 0.0893, "step": 4313 }, { "epoch": 1.962693357597816, "grad_norm": 1.5361362208126288, "learning_rate": 3.3287324204421125e-06, "loss": 0.089, "step": 4314 }, { "epoch": 1.9631483166515014, "grad_norm": 1.4373830769904916, "learning_rate": 3.3280581464304244e-06, "loss": 0.1086, "step": 4315 }, { "epoch": 1.9636032757051867, "grad_norm": 1.54716553901365, "learning_rate": 3.3273838047535635e-06, "loss": 0.1318, "step": 4316 }, { "epoch": 1.9640582347588715, "grad_norm": 1.7056216979677221, "learning_rate": 3.326709395466635e-06, "loss": 0.0975, "step": 4317 }, { "epoch": 1.9645131938125568, "grad_norm": 1.3596633730560679, "learning_rate": 3.3260349186247476e-06, "loss": 0.0543, "step": 4318 }, { "epoch": 1.9649681528662422, "grad_norm": 1.7458757341326239, "learning_rate": 3.3253603742830174e-06, "loss": 0.082, "step": 4319 }, { "epoch": 1.9654231119199272, "grad_norm": 1.4621254706148623, "learning_rate": 3.3246857624965645e-06, "loss": 0.0784, "step": 4320 }, { "epoch": 1.9658780709736123, "grad_norm": 1.8599205359625564, "learning_rate": 3.324011083320515e-06, "loss": 0.1014, "step": 4321 }, { "epoch": 1.9663330300272976, "grad_norm": 1.4482555095132081, "learning_rate": 3.3233363368100025e-06, "loss": 0.1575, "step": 4322 }, { "epoch": 1.9667879890809827, "grad_norm": 1.8169525589744027, "learning_rate": 3.3226615230201613e-06, "loss": 0.1306, "step": 4323 }, { "epoch": 1.9672429481346678, "grad_norm": 1.9089367658344036, "learning_rate": 3.3219866420061356e-06, "loss": 0.0801, "step": 4324 }, { "epoch": 1.9676979071883531, "grad_norm": 1.3307509423093313, "learning_rate": 3.321311693823074e-06, "loss": 0.0723, "step": 4325 }, { "epoch": 1.9681528662420382, "grad_norm": 1.3691234910914967, "learning_rate": 3.32063667852613e-06, "loss": 0.1086, "step": 4326 }, { "epoch": 1.9686078252957233, "grad_norm": 1.3776822617105173, "learning_rate": 3.3199615961704616e-06, "loss": 0.0734, "step": 4327 }, { "epoch": 1.9690627843494086, "grad_norm": 1.5043467046126924, "learning_rate": 3.319286446811235e-06, "loss": 0.0932, "step": 4328 }, { "epoch": 1.9695177434030937, "grad_norm": 1.395444059601315, "learning_rate": 3.3186112305036205e-06, "loss": 0.0801, "step": 4329 }, { "epoch": 1.9699727024567788, "grad_norm": 1.4914964070235004, "learning_rate": 3.3179359473027923e-06, "loss": 0.0968, "step": 4330 }, { "epoch": 1.9704276615104641, "grad_norm": 1.21823364579694, "learning_rate": 3.3172605972639326e-06, "loss": 0.0554, "step": 4331 }, { "epoch": 1.9708826205641492, "grad_norm": 1.3114941625844965, "learning_rate": 3.3165851804422276e-06, "loss": 0.07, "step": 4332 }, { "epoch": 1.9713375796178343, "grad_norm": 1.0667278409195866, "learning_rate": 3.3159096968928688e-06, "loss": 0.0866, "step": 4333 }, { "epoch": 1.9717925386715196, "grad_norm": 1.426995811639697, "learning_rate": 3.3152341466710547e-06, "loss": 0.0796, "step": 4334 }, { "epoch": 1.9722474977252047, "grad_norm": 1.905884216982143, "learning_rate": 3.3145585298319873e-06, "loss": 0.1183, "step": 4335 }, { "epoch": 1.9727024567788898, "grad_norm": 1.341495506191566, "learning_rate": 3.313882846430876e-06, "loss": 0.0721, "step": 4336 }, { "epoch": 1.973157415832575, "grad_norm": 1.8908487306224737, "learning_rate": 3.3132070965229334e-06, "loss": 0.1175, "step": 4337 }, { "epoch": 1.9736123748862604, "grad_norm": 1.6160466273395677, "learning_rate": 3.312531280163379e-06, "loss": 0.1551, "step": 4338 }, { "epoch": 1.9740673339399453, "grad_norm": 1.9925673613704595, "learning_rate": 3.3118553974074392e-06, "loss": 0.1139, "step": 4339 }, { "epoch": 1.9745222929936306, "grad_norm": 1.177674690741949, "learning_rate": 3.311179448310341e-06, "loss": 0.0462, "step": 4340 }, { "epoch": 1.974977252047316, "grad_norm": 1.244435618808717, "learning_rate": 3.3105034329273224e-06, "loss": 0.0539, "step": 4341 }, { "epoch": 1.9754322111010008, "grad_norm": 1.5195982974093385, "learning_rate": 3.309827351313623e-06, "loss": 0.0863, "step": 4342 }, { "epoch": 1.975887170154686, "grad_norm": 1.8072148872008935, "learning_rate": 3.30915120352449e-06, "loss": 0.0954, "step": 4343 }, { "epoch": 1.9763421292083714, "grad_norm": 1.4507215534611957, "learning_rate": 3.3084749896151746e-06, "loss": 0.1355, "step": 4344 }, { "epoch": 1.9767970882620565, "grad_norm": 1.409412495999324, "learning_rate": 3.3077987096409335e-06, "loss": 0.0864, "step": 4345 }, { "epoch": 1.9772520473157416, "grad_norm": 1.1152969202569238, "learning_rate": 3.3071223636570316e-06, "loss": 0.0991, "step": 4346 }, { "epoch": 1.9777070063694269, "grad_norm": 1.6415847022514303, "learning_rate": 3.306445951718733e-06, "loss": 0.0891, "step": 4347 }, { "epoch": 1.978161965423112, "grad_norm": 1.8006115664521167, "learning_rate": 3.305769473881314e-06, "loss": 0.1105, "step": 4348 }, { "epoch": 1.978616924476797, "grad_norm": 1.2619212575999605, "learning_rate": 3.305092930200053e-06, "loss": 0.1093, "step": 4349 }, { "epoch": 1.9790718835304824, "grad_norm": 1.2596083255439026, "learning_rate": 3.3044163207302326e-06, "loss": 0.1072, "step": 4350 }, { "epoch": 1.9795268425841674, "grad_norm": 1.1939749546981848, "learning_rate": 3.303739645527144e-06, "loss": 0.0643, "step": 4351 }, { "epoch": 1.9799818016378525, "grad_norm": 1.0434899265652202, "learning_rate": 3.3030629046460798e-06, "loss": 0.0716, "step": 4352 }, { "epoch": 1.9804367606915378, "grad_norm": 1.5163344438282986, "learning_rate": 3.3023860981423427e-06, "loss": 0.0706, "step": 4353 }, { "epoch": 1.980891719745223, "grad_norm": 1.529869199667101, "learning_rate": 3.3017092260712375e-06, "loss": 0.0749, "step": 4354 }, { "epoch": 1.981346678798908, "grad_norm": 1.246217857178888, "learning_rate": 3.301032288488074e-06, "loss": 0.0881, "step": 4355 }, { "epoch": 1.9818016378525933, "grad_norm": 1.6395731534785023, "learning_rate": 3.3003552854481703e-06, "loss": 0.0668, "step": 4356 }, { "epoch": 1.9822565969062784, "grad_norm": 1.9650118546669948, "learning_rate": 3.2996782170068457e-06, "loss": 0.0915, "step": 4357 }, { "epoch": 1.9827115559599635, "grad_norm": 1.1936928906484088, "learning_rate": 3.2990010832194297e-06, "loss": 0.0586, "step": 4358 }, { "epoch": 1.9831665150136488, "grad_norm": 1.4461937459171126, "learning_rate": 3.2983238841412526e-06, "loss": 0.0983, "step": 4359 }, { "epoch": 1.983621474067334, "grad_norm": 1.654390494531532, "learning_rate": 3.2976466198276535e-06, "loss": 0.0871, "step": 4360 }, { "epoch": 1.984076433121019, "grad_norm": 1.7480235553022427, "learning_rate": 3.2969692903339746e-06, "loss": 0.0876, "step": 4361 }, { "epoch": 1.9845313921747043, "grad_norm": 1.5412897469083795, "learning_rate": 3.296291895715564e-06, "loss": 0.1564, "step": 4362 }, { "epoch": 1.9849863512283894, "grad_norm": 1.3722210655986884, "learning_rate": 3.2956144360277764e-06, "loss": 0.0781, "step": 4363 }, { "epoch": 1.9854413102820745, "grad_norm": 2.014969667930122, "learning_rate": 3.2949369113259698e-06, "loss": 0.1287, "step": 4364 }, { "epoch": 1.9858962693357598, "grad_norm": 1.8024906124238225, "learning_rate": 3.294259321665509e-06, "loss": 0.0645, "step": 4365 }, { "epoch": 1.9863512283894451, "grad_norm": 1.417361938099449, "learning_rate": 3.2935816671017627e-06, "loss": 0.0575, "step": 4366 }, { "epoch": 1.98680618744313, "grad_norm": 1.5029708150661372, "learning_rate": 3.292903947690106e-06, "loss": 0.0746, "step": 4367 }, { "epoch": 1.9872611464968153, "grad_norm": 1.75958375923616, "learning_rate": 3.2922261634859205e-06, "loss": 0.0928, "step": 4368 }, { "epoch": 1.9877161055505006, "grad_norm": 1.4586170231833593, "learning_rate": 3.291548314544589e-06, "loss": 0.1328, "step": 4369 }, { "epoch": 1.9881710646041855, "grad_norm": 1.3821854835932892, "learning_rate": 3.2908704009215053e-06, "loss": 0.0686, "step": 4370 }, { "epoch": 1.9886260236578708, "grad_norm": 1.6102025185126778, "learning_rate": 3.290192422672064e-06, "loss": 0.0686, "step": 4371 }, { "epoch": 1.989080982711556, "grad_norm": 1.2782082469789982, "learning_rate": 3.289514379851666e-06, "loss": 0.0608, "step": 4372 }, { "epoch": 1.9895359417652412, "grad_norm": 1.427217039226639, "learning_rate": 3.288836272515718e-06, "loss": 0.0637, "step": 4373 }, { "epoch": 1.9899909008189263, "grad_norm": 1.3156392869181261, "learning_rate": 3.288158100719632e-06, "loss": 0.0879, "step": 4374 }, { "epoch": 1.9904458598726116, "grad_norm": 1.5964668209603774, "learning_rate": 3.2874798645188264e-06, "loss": 0.102, "step": 4375 }, { "epoch": 1.9909008189262967, "grad_norm": 1.3329522957558393, "learning_rate": 3.2868015639687214e-06, "loss": 0.1346, "step": 4376 }, { "epoch": 1.9913557779799818, "grad_norm": 1.7093147892962317, "learning_rate": 3.286123199124746e-06, "loss": 0.0998, "step": 4377 }, { "epoch": 1.991810737033667, "grad_norm": 1.8485242101918202, "learning_rate": 3.285444770042333e-06, "loss": 0.088, "step": 4378 }, { "epoch": 1.9922656960873522, "grad_norm": 1.4305565790331634, "learning_rate": 3.2847662767769206e-06, "loss": 0.0948, "step": 4379 }, { "epoch": 1.9927206551410372, "grad_norm": 1.5877192204096924, "learning_rate": 3.284087719383952e-06, "loss": 0.1063, "step": 4380 }, { "epoch": 1.9931756141947226, "grad_norm": 1.1121374554418526, "learning_rate": 3.2834090979188754e-06, "loss": 0.0702, "step": 4381 }, { "epoch": 1.9936305732484076, "grad_norm": 1.4646649233977103, "learning_rate": 3.2827304124371462e-06, "loss": 0.0965, "step": 4382 }, { "epoch": 1.9940855323020927, "grad_norm": 1.9002003709480966, "learning_rate": 3.282051662994221e-06, "loss": 0.1045, "step": 4383 }, { "epoch": 1.994540491355778, "grad_norm": 1.7995939994404078, "learning_rate": 3.2813728496455667e-06, "loss": 0.078, "step": 4384 }, { "epoch": 1.9949954504094631, "grad_norm": 1.4201568604099797, "learning_rate": 3.280693972446652e-06, "loss": 0.094, "step": 4385 }, { "epoch": 1.9954504094631482, "grad_norm": 1.3449503387509005, "learning_rate": 3.2800150314529504e-06, "loss": 0.0916, "step": 4386 }, { "epoch": 1.9959053685168335, "grad_norm": 1.5372044773470928, "learning_rate": 3.279336026719944e-06, "loss": 0.1013, "step": 4387 }, { "epoch": 1.9963603275705186, "grad_norm": 1.4887882552804028, "learning_rate": 3.278656958303116e-06, "loss": 0.0796, "step": 4388 }, { "epoch": 1.9968152866242037, "grad_norm": 1.2638049912243654, "learning_rate": 3.277977826257959e-06, "loss": 0.0493, "step": 4389 }, { "epoch": 1.997270245677889, "grad_norm": 1.5430560101966988, "learning_rate": 3.277298630639966e-06, "loss": 0.0952, "step": 4390 }, { "epoch": 1.9977252047315741, "grad_norm": 1.6799290301549101, "learning_rate": 3.276619371504639e-06, "loss": 0.1047, "step": 4391 }, { "epoch": 1.9981801637852592, "grad_norm": 1.3838746884500472, "learning_rate": 3.2759400489074855e-06, "loss": 0.0779, "step": 4392 }, { "epoch": 1.9986351228389445, "grad_norm": 1.580494099591986, "learning_rate": 3.2752606629040146e-06, "loss": 0.0769, "step": 4393 }, { "epoch": 1.9990900818926298, "grad_norm": 1.5833215820596414, "learning_rate": 3.2745812135497433e-06, "loss": 0.1082, "step": 4394 }, { "epoch": 1.9995450409463147, "grad_norm": 1.2290752382887884, "learning_rate": 3.2739017009001927e-06, "loss": 0.1127, "step": 4395 }, { "epoch": 2.0, "grad_norm": 1.3469692778467985, "learning_rate": 3.2732221250108915e-06, "loss": 0.0515, "step": 4396 }, { "epoch": 2.0004549590536853, "grad_norm": 0.9933013515240868, "learning_rate": 3.272542485937369e-06, "loss": 0.0437, "step": 4397 }, { "epoch": 2.00090991810737, "grad_norm": 1.0218307020436446, "learning_rate": 3.2718627837351634e-06, "loss": 0.039, "step": 4398 }, { "epoch": 2.0013648771610555, "grad_norm": 1.1312383474306626, "learning_rate": 3.271183018459817e-06, "loss": 0.0356, "step": 4399 }, { "epoch": 2.001819836214741, "grad_norm": 1.3718501794374394, "learning_rate": 3.270503190166877e-06, "loss": 0.0501, "step": 4400 }, { "epoch": 2.0022747952684257, "grad_norm": 0.8557228967513829, "learning_rate": 3.2698232989118954e-06, "loss": 0.0476, "step": 4401 }, { "epoch": 2.002729754322111, "grad_norm": 0.8629849239957184, "learning_rate": 3.2691433447504312e-06, "loss": 0.0578, "step": 4402 }, { "epoch": 2.0031847133757963, "grad_norm": 0.8233155317564521, "learning_rate": 3.268463327738046e-06, "loss": 0.0623, "step": 4403 }, { "epoch": 2.003639672429481, "grad_norm": 1.070479508281503, "learning_rate": 3.2677832479303075e-06, "loss": 0.0427, "step": 4404 }, { "epoch": 2.0040946314831665, "grad_norm": 1.192202836037913, "learning_rate": 3.2671031053827896e-06, "loss": 0.0621, "step": 4405 }, { "epoch": 2.0045495905368518, "grad_norm": 0.859879478924876, "learning_rate": 3.26642290015107e-06, "loss": 0.0461, "step": 4406 }, { "epoch": 2.0050045495905366, "grad_norm": 1.0040211463886564, "learning_rate": 3.265742632290732e-06, "loss": 0.0457, "step": 4407 }, { "epoch": 2.005459508644222, "grad_norm": 1.472714332597669, "learning_rate": 3.2650623018573644e-06, "loss": 0.0401, "step": 4408 }, { "epoch": 2.0059144676979073, "grad_norm": 0.9328892521669188, "learning_rate": 3.2643819089065608e-06, "loss": 0.0449, "step": 4409 }, { "epoch": 2.0063694267515926, "grad_norm": 1.1877474791768876, "learning_rate": 3.263701453493919e-06, "loss": 0.0293, "step": 4410 }, { "epoch": 2.0068243858052774, "grad_norm": 1.6384172737967344, "learning_rate": 3.263020935675043e-06, "loss": 0.0538, "step": 4411 }, { "epoch": 2.0072793448589628, "grad_norm": 1.2459735792891722, "learning_rate": 3.2623403555055423e-06, "loss": 0.0694, "step": 4412 }, { "epoch": 2.007734303912648, "grad_norm": 1.461534321859994, "learning_rate": 3.261659713041031e-06, "loss": 0.0311, "step": 4413 }, { "epoch": 2.008189262966333, "grad_norm": 1.4978140386165864, "learning_rate": 3.2609790083371266e-06, "loss": 0.0589, "step": 4414 }, { "epoch": 2.0086442220200182, "grad_norm": 1.121816981736886, "learning_rate": 3.260298241449455e-06, "loss": 0.0593, "step": 4415 }, { "epoch": 2.0090991810737036, "grad_norm": 1.4492450637228496, "learning_rate": 3.259617412433644e-06, "loss": 0.0489, "step": 4416 }, { "epoch": 2.0095541401273884, "grad_norm": 1.4171609442360789, "learning_rate": 3.258936521345329e-06, "loss": 0.0487, "step": 4417 }, { "epoch": 2.0100090991810737, "grad_norm": 1.5012961060691163, "learning_rate": 3.2582555682401486e-06, "loss": 0.0284, "step": 4418 }, { "epoch": 2.010464058234759, "grad_norm": 0.8407155114751346, "learning_rate": 3.2575745531737475e-06, "loss": 0.0303, "step": 4419 }, { "epoch": 2.010919017288444, "grad_norm": 1.6707455407537646, "learning_rate": 3.2568934762017744e-06, "loss": 0.0468, "step": 4420 }, { "epoch": 2.011373976342129, "grad_norm": 1.220397779371225, "learning_rate": 3.256212337379886e-06, "loss": 0.033, "step": 4421 }, { "epoch": 2.0118289353958145, "grad_norm": 1.4953092363079539, "learning_rate": 3.255531136763739e-06, "loss": 0.069, "step": 4422 }, { "epoch": 2.0122838944494994, "grad_norm": 1.952548598092749, "learning_rate": 3.2548498744089996e-06, "loss": 0.0555, "step": 4423 }, { "epoch": 2.0127388535031847, "grad_norm": 1.6834586838249306, "learning_rate": 3.2541685503713377e-06, "loss": 0.0339, "step": 4424 }, { "epoch": 2.01319381255687, "grad_norm": 2.067430760786797, "learning_rate": 3.2534871647064275e-06, "loss": 0.0231, "step": 4425 }, { "epoch": 2.013648771610555, "grad_norm": 1.2878202698788677, "learning_rate": 3.252805717469949e-06, "loss": 0.0457, "step": 4426 }, { "epoch": 2.01410373066424, "grad_norm": 1.5579264894688172, "learning_rate": 3.252124208717587e-06, "loss": 0.0494, "step": 4427 }, { "epoch": 2.0145586897179255, "grad_norm": 1.0058946623960623, "learning_rate": 3.2514426385050313e-06, "loss": 0.0293, "step": 4428 }, { "epoch": 2.0150136487716104, "grad_norm": 1.5109378916745282, "learning_rate": 3.2507610068879756e-06, "loss": 0.062, "step": 4429 }, { "epoch": 2.0154686078252957, "grad_norm": 2.0069777697051414, "learning_rate": 3.2500793139221227e-06, "loss": 0.0574, "step": 4430 }, { "epoch": 2.015923566878981, "grad_norm": 1.037040549278024, "learning_rate": 3.249397559663174e-06, "loss": 0.0542, "step": 4431 }, { "epoch": 2.016378525932666, "grad_norm": 1.1833202563098557, "learning_rate": 3.2487157441668416e-06, "loss": 0.0268, "step": 4432 }, { "epoch": 2.016833484986351, "grad_norm": 1.266147636286517, "learning_rate": 3.2480338674888403e-06, "loss": 0.0535, "step": 4433 }, { "epoch": 2.0172884440400365, "grad_norm": 1.2010312997300316, "learning_rate": 3.247351929684889e-06, "loss": 0.0289, "step": 4434 }, { "epoch": 2.0177434030937214, "grad_norm": 1.4557132299841036, "learning_rate": 3.246669930810713e-06, "loss": 0.0398, "step": 4435 }, { "epoch": 2.0181983621474067, "grad_norm": 0.9640288299356471, "learning_rate": 3.2459878709220417e-06, "loss": 0.0211, "step": 4436 }, { "epoch": 2.018653321201092, "grad_norm": 1.1860857123946789, "learning_rate": 3.245305750074611e-06, "loss": 0.0591, "step": 4437 }, { "epoch": 2.0191082802547773, "grad_norm": 1.0221696630906623, "learning_rate": 3.2446235683241605e-06, "loss": 0.0532, "step": 4438 }, { "epoch": 2.019563239308462, "grad_norm": 1.2951527274413372, "learning_rate": 3.2439413257264335e-06, "loss": 0.0399, "step": 4439 }, { "epoch": 2.0200181983621475, "grad_norm": 1.5599553255561984, "learning_rate": 3.243259022337182e-06, "loss": 0.0651, "step": 4440 }, { "epoch": 2.0204731574158328, "grad_norm": 1.4246513514099302, "learning_rate": 3.2425766582121592e-06, "loss": 0.0817, "step": 4441 }, { "epoch": 2.0209281164695176, "grad_norm": 1.278956278658517, "learning_rate": 3.2418942334071255e-06, "loss": 0.0462, "step": 4442 }, { "epoch": 2.021383075523203, "grad_norm": 1.3978829081200876, "learning_rate": 3.241211747977846e-06, "loss": 0.035, "step": 4443 }, { "epoch": 2.0218380345768883, "grad_norm": 1.5599519646917956, "learning_rate": 3.2405292019800888e-06, "loss": 0.0751, "step": 4444 }, { "epoch": 2.022292993630573, "grad_norm": 1.4511453930905513, "learning_rate": 3.2398465954696302e-06, "loss": 0.0234, "step": 4445 }, { "epoch": 2.0227479526842584, "grad_norm": 1.0892345702152544, "learning_rate": 3.239163928502248e-06, "loss": 0.0521, "step": 4446 }, { "epoch": 2.0232029117379438, "grad_norm": 1.647929596364143, "learning_rate": 3.2384812011337286e-06, "loss": 0.0877, "step": 4447 }, { "epoch": 2.0236578707916286, "grad_norm": 1.4137122638400559, "learning_rate": 3.237798413419859e-06, "loss": 0.0771, "step": 4448 }, { "epoch": 2.024112829845314, "grad_norm": 1.5135972697317839, "learning_rate": 3.2371155654164365e-06, "loss": 0.065, "step": 4449 }, { "epoch": 2.0245677888989992, "grad_norm": 1.3044023290583162, "learning_rate": 3.2364326571792583e-06, "loss": 0.0733, "step": 4450 }, { "epoch": 2.025022747952684, "grad_norm": 1.0971390474649951, "learning_rate": 3.2357496887641288e-06, "loss": 0.0265, "step": 4451 }, { "epoch": 2.0254777070063694, "grad_norm": 1.114682832862875, "learning_rate": 3.235066660226858e-06, "loss": 0.0265, "step": 4452 }, { "epoch": 2.0259326660600547, "grad_norm": 1.3320297026314643, "learning_rate": 3.2343835716232576e-06, "loss": 0.0445, "step": 4453 }, { "epoch": 2.0263876251137396, "grad_norm": 1.4876061743193967, "learning_rate": 3.2337004230091497e-06, "loss": 0.0304, "step": 4454 }, { "epoch": 2.026842584167425, "grad_norm": 1.2236107354008499, "learning_rate": 3.2330172144403565e-06, "loss": 0.0943, "step": 4455 }, { "epoch": 2.02729754322111, "grad_norm": 1.4854641427050799, "learning_rate": 3.2323339459727065e-06, "loss": 0.0514, "step": 4456 }, { "epoch": 2.027752502274795, "grad_norm": 1.1049860394482223, "learning_rate": 3.231650617662033e-06, "loss": 0.0293, "step": 4457 }, { "epoch": 2.0282074613284804, "grad_norm": 1.5917143419419377, "learning_rate": 3.2309672295641757e-06, "loss": 0.0444, "step": 4458 }, { "epoch": 2.0286624203821657, "grad_norm": 1.0043166281703289, "learning_rate": 3.230283781734978e-06, "loss": 0.0248, "step": 4459 }, { "epoch": 2.0291173794358506, "grad_norm": 1.2956894191604518, "learning_rate": 3.229600274230287e-06, "loss": 0.037, "step": 4460 }, { "epoch": 2.029572338489536, "grad_norm": 1.4336675095419544, "learning_rate": 3.2289167071059565e-06, "loss": 0.0396, "step": 4461 }, { "epoch": 2.030027297543221, "grad_norm": 1.3058140995924816, "learning_rate": 3.2282330804178447e-06, "loss": 0.0377, "step": 4462 }, { "epoch": 2.030482256596906, "grad_norm": 1.1158346876759844, "learning_rate": 3.227549394221814e-06, "loss": 0.0292, "step": 4463 }, { "epoch": 2.0309372156505914, "grad_norm": 0.6593747242982305, "learning_rate": 3.226865648573732e-06, "loss": 0.0132, "step": 4464 }, { "epoch": 2.0313921747042767, "grad_norm": 1.2244245485277307, "learning_rate": 3.226181843529472e-06, "loss": 0.0232, "step": 4465 }, { "epoch": 2.031847133757962, "grad_norm": 1.6259819844851524, "learning_rate": 3.2254979791449115e-06, "loss": 0.0454, "step": 4466 }, { "epoch": 2.032302092811647, "grad_norm": 0.9392849041498846, "learning_rate": 3.224814055475932e-06, "loss": 0.0359, "step": 4467 }, { "epoch": 2.032757051865332, "grad_norm": 1.9239218572019896, "learning_rate": 3.224130072578421e-06, "loss": 0.0568, "step": 4468 }, { "epoch": 2.0332120109190175, "grad_norm": 1.5262318076744885, "learning_rate": 3.2234460305082717e-06, "loss": 0.0521, "step": 4469 }, { "epoch": 2.0336669699727024, "grad_norm": 0.7309105592001935, "learning_rate": 3.2227619293213784e-06, "loss": 0.0168, "step": 4470 }, { "epoch": 2.0341219290263877, "grad_norm": 1.3443806628261397, "learning_rate": 3.222077769073645e-06, "loss": 0.0394, "step": 4471 }, { "epoch": 2.034576888080073, "grad_norm": 1.7312471352416292, "learning_rate": 3.221393549820977e-06, "loss": 0.0616, "step": 4472 }, { "epoch": 2.035031847133758, "grad_norm": 1.5422226696763524, "learning_rate": 3.2207092716192863e-06, "loss": 0.0262, "step": 4473 }, { "epoch": 2.035486806187443, "grad_norm": 1.290593519008796, "learning_rate": 3.2200249345244876e-06, "loss": 0.0281, "step": 4474 }, { "epoch": 2.0359417652411285, "grad_norm": 2.3859247024741506, "learning_rate": 3.2193405385925035e-06, "loss": 0.0912, "step": 4475 }, { "epoch": 2.0363967242948133, "grad_norm": 0.765759453181892, "learning_rate": 3.21865608387926e-06, "loss": 0.0098, "step": 4476 }, { "epoch": 2.0368516833484986, "grad_norm": 1.4319479759213656, "learning_rate": 3.2179715704406853e-06, "loss": 0.0309, "step": 4477 }, { "epoch": 2.037306642402184, "grad_norm": 1.231605191239591, "learning_rate": 3.2172869983327164e-06, "loss": 0.049, "step": 4478 }, { "epoch": 2.037761601455869, "grad_norm": 1.6662573304943553, "learning_rate": 3.216602367611294e-06, "loss": 0.0458, "step": 4479 }, { "epoch": 2.038216560509554, "grad_norm": 1.6068444357433127, "learning_rate": 3.215917678332362e-06, "loss": 0.0364, "step": 4480 }, { "epoch": 2.0386715195632394, "grad_norm": 1.4991035821593393, "learning_rate": 3.21523293055187e-06, "loss": 0.0439, "step": 4481 }, { "epoch": 2.0391264786169243, "grad_norm": 1.3638534101633082, "learning_rate": 3.2145481243257726e-06, "loss": 0.0334, "step": 4482 }, { "epoch": 2.0395814376706096, "grad_norm": 0.7504433308031289, "learning_rate": 3.2138632597100305e-06, "loss": 0.0196, "step": 4483 }, { "epoch": 2.040036396724295, "grad_norm": 2.021241787573951, "learning_rate": 3.2131783367606057e-06, "loss": 0.0499, "step": 4484 }, { "epoch": 2.04049135577798, "grad_norm": 1.6439279459821368, "learning_rate": 3.212493355533468e-06, "loss": 0.0564, "step": 4485 }, { "epoch": 2.040946314831665, "grad_norm": 1.294144721413093, "learning_rate": 3.2118083160845915e-06, "loss": 0.0398, "step": 4486 }, { "epoch": 2.0414012738853504, "grad_norm": 1.2902321929462734, "learning_rate": 3.211123218469953e-06, "loss": 0.0399, "step": 4487 }, { "epoch": 2.0418562329390353, "grad_norm": 1.316793211054239, "learning_rate": 3.210438062745537e-06, "loss": 0.0605, "step": 4488 }, { "epoch": 2.0423111919927206, "grad_norm": 1.492504015556577, "learning_rate": 3.20975284896733e-06, "loss": 0.0289, "step": 4489 }, { "epoch": 2.042766151046406, "grad_norm": 1.4163019186209156, "learning_rate": 3.2090675771913273e-06, "loss": 0.049, "step": 4490 }, { "epoch": 2.0432211101000908, "grad_norm": 1.2899157199035474, "learning_rate": 3.2083822474735233e-06, "loss": 0.0466, "step": 4491 }, { "epoch": 2.043676069153776, "grad_norm": 1.1836459397218353, "learning_rate": 3.2076968598699197e-06, "loss": 0.0645, "step": 4492 }, { "epoch": 2.0441310282074614, "grad_norm": 1.3657458585620577, "learning_rate": 3.2070114144365265e-06, "loss": 0.0301, "step": 4493 }, { "epoch": 2.0445859872611467, "grad_norm": 1.3352881168704531, "learning_rate": 3.2063259112293526e-06, "loss": 0.0368, "step": 4494 }, { "epoch": 2.0450409463148316, "grad_norm": 2.0546037202149368, "learning_rate": 3.2056403503044155e-06, "loss": 0.03, "step": 4495 }, { "epoch": 2.045495905368517, "grad_norm": 1.0986422052958182, "learning_rate": 3.2049547317177355e-06, "loss": 0.0367, "step": 4496 }, { "epoch": 2.045950864422202, "grad_norm": 1.660249182227502, "learning_rate": 3.2042690555253375e-06, "loss": 0.0804, "step": 4497 }, { "epoch": 2.046405823475887, "grad_norm": 1.2408890826333419, "learning_rate": 3.2035833217832536e-06, "loss": 0.0306, "step": 4498 }, { "epoch": 2.0468607825295724, "grad_norm": 1.6878774713837328, "learning_rate": 3.2028975305475174e-06, "loss": 0.0167, "step": 4499 }, { "epoch": 2.0473157415832577, "grad_norm": 1.050076815582365, "learning_rate": 3.20221168187417e-06, "loss": 0.048, "step": 4500 }, { "epoch": 2.0477707006369426, "grad_norm": 1.3177825726661399, "learning_rate": 3.2015257758192543e-06, "loss": 0.0385, "step": 4501 }, { "epoch": 2.048225659690628, "grad_norm": 1.2248532215271772, "learning_rate": 3.2008398124388206e-06, "loss": 0.0342, "step": 4502 }, { "epoch": 2.048680618744313, "grad_norm": 1.4791539249327337, "learning_rate": 3.2001537917889223e-06, "loss": 0.0943, "step": 4503 }, { "epoch": 2.049135577797998, "grad_norm": 1.3225380276058498, "learning_rate": 3.1994677139256182e-06, "loss": 0.0542, "step": 4504 }, { "epoch": 2.0495905368516834, "grad_norm": 1.4591674861340034, "learning_rate": 3.198781578904972e-06, "loss": 0.0443, "step": 4505 }, { "epoch": 2.0500454959053687, "grad_norm": 1.2142252751515434, "learning_rate": 3.198095386783049e-06, "loss": 0.0226, "step": 4506 }, { "epoch": 2.0505004549590535, "grad_norm": 0.9713436698088449, "learning_rate": 3.197409137615925e-06, "loss": 0.0359, "step": 4507 }, { "epoch": 2.050955414012739, "grad_norm": 2.117122368861089, "learning_rate": 3.196722831459676e-06, "loss": 0.0412, "step": 4508 }, { "epoch": 2.051410373066424, "grad_norm": 1.517972192755166, "learning_rate": 3.1960364683703822e-06, "loss": 0.0906, "step": 4509 }, { "epoch": 2.051865332120109, "grad_norm": 1.4999745718623494, "learning_rate": 3.195350048404133e-06, "loss": 0.0551, "step": 4510 }, { "epoch": 2.0523202911737943, "grad_norm": 1.0788638705584614, "learning_rate": 3.1946635716170167e-06, "loss": 0.026, "step": 4511 }, { "epoch": 2.0527752502274796, "grad_norm": 1.9395562931697328, "learning_rate": 3.1939770380651315e-06, "loss": 0.0904, "step": 4512 }, { "epoch": 2.0532302092811645, "grad_norm": 1.2265453090870007, "learning_rate": 3.1932904478045756e-06, "loss": 0.0452, "step": 4513 }, { "epoch": 2.05368516833485, "grad_norm": 2.4984624429689988, "learning_rate": 3.192603800891456e-06, "loss": 0.0538, "step": 4514 }, { "epoch": 2.054140127388535, "grad_norm": 1.2938593765105686, "learning_rate": 3.1919170973818814e-06, "loss": 0.0323, "step": 4515 }, { "epoch": 2.05459508644222, "grad_norm": 1.4716103736661434, "learning_rate": 3.191230337331966e-06, "loss": 0.0561, "step": 4516 }, { "epoch": 2.0550500454959053, "grad_norm": 1.1329592663228893, "learning_rate": 3.1905435207978293e-06, "loss": 0.0205, "step": 4517 }, { "epoch": 2.0555050045495906, "grad_norm": 1.5025115650303367, "learning_rate": 3.1898566478355943e-06, "loss": 0.0322, "step": 4518 }, { "epoch": 2.055959963603276, "grad_norm": 1.3077566587511098, "learning_rate": 3.1891697185013892e-06, "loss": 0.0641, "step": 4519 }, { "epoch": 2.056414922656961, "grad_norm": 1.6078444400520158, "learning_rate": 3.188482732851348e-06, "loss": 0.058, "step": 4520 }, { "epoch": 2.056869881710646, "grad_norm": 0.9470323139734805, "learning_rate": 3.1877956909416063e-06, "loss": 0.0429, "step": 4521 }, { "epoch": 2.0573248407643314, "grad_norm": 1.2074556609089016, "learning_rate": 3.187108592828307e-06, "loss": 0.0312, "step": 4522 }, { "epoch": 2.0577797998180163, "grad_norm": 1.592001108817723, "learning_rate": 3.1864214385675957e-06, "loss": 0.0662, "step": 4523 }, { "epoch": 2.0582347588717016, "grad_norm": 1.1801358698009314, "learning_rate": 3.185734228215625e-06, "loss": 0.0522, "step": 4524 }, { "epoch": 2.058689717925387, "grad_norm": 1.2120056685288352, "learning_rate": 3.1850469618285494e-06, "loss": 0.0566, "step": 4525 }, { "epoch": 2.0591446769790718, "grad_norm": 1.2959713336091738, "learning_rate": 3.18435963946253e-06, "loss": 0.0692, "step": 4526 }, { "epoch": 2.059599636032757, "grad_norm": 1.2712394913363487, "learning_rate": 3.1836722611737326e-06, "loss": 0.031, "step": 4527 }, { "epoch": 2.0600545950864424, "grad_norm": 1.4191744594435043, "learning_rate": 3.182984827018324e-06, "loss": 0.0676, "step": 4528 }, { "epoch": 2.0605095541401273, "grad_norm": 0.9296323341326973, "learning_rate": 3.18229733705248e-06, "loss": 0.0366, "step": 4529 }, { "epoch": 2.0609645131938126, "grad_norm": 0.9580372284286517, "learning_rate": 3.181609791332379e-06, "loss": 0.0264, "step": 4530 }, { "epoch": 2.061419472247498, "grad_norm": 1.1588100825966121, "learning_rate": 3.180922189914204e-06, "loss": 0.0316, "step": 4531 }, { "epoch": 2.0618744313011828, "grad_norm": 0.9214880924799027, "learning_rate": 3.180234532854143e-06, "loss": 0.0113, "step": 4532 }, { "epoch": 2.062329390354868, "grad_norm": 1.285561847169919, "learning_rate": 3.1795468202083864e-06, "loss": 0.0336, "step": 4533 }, { "epoch": 2.0627843494085534, "grad_norm": 1.1846795653872542, "learning_rate": 3.1788590520331337e-06, "loss": 0.0391, "step": 4534 }, { "epoch": 2.0632393084622382, "grad_norm": 1.6314553343369242, "learning_rate": 3.1781712283845844e-06, "loss": 0.0479, "step": 4535 }, { "epoch": 2.0636942675159236, "grad_norm": 1.6025702550415535, "learning_rate": 3.177483349318946e-06, "loss": 0.0626, "step": 4536 }, { "epoch": 2.064149226569609, "grad_norm": 1.257287882913492, "learning_rate": 3.1767954148924266e-06, "loss": 0.0225, "step": 4537 }, { "epoch": 2.0646041856232937, "grad_norm": 1.9271717002323268, "learning_rate": 3.176107425161243e-06, "loss": 0.0521, "step": 4538 }, { "epoch": 2.065059144676979, "grad_norm": 1.8425389888004107, "learning_rate": 3.1754193801816137e-06, "loss": 0.0626, "step": 4539 }, { "epoch": 2.0655141037306644, "grad_norm": 0.5692942959115204, "learning_rate": 3.174731280009762e-06, "loss": 0.0082, "step": 4540 }, { "epoch": 2.065969062784349, "grad_norm": 1.5047719636613874, "learning_rate": 3.174043124701918e-06, "loss": 0.0909, "step": 4541 }, { "epoch": 2.0664240218380345, "grad_norm": 1.409601948676839, "learning_rate": 3.1733549143143137e-06, "loss": 0.0296, "step": 4542 }, { "epoch": 2.06687898089172, "grad_norm": 0.825069374243067, "learning_rate": 3.1726666489031873e-06, "loss": 0.0368, "step": 4543 }, { "epoch": 2.0673339399454047, "grad_norm": 1.2164000485609767, "learning_rate": 3.171978328524779e-06, "loss": 0.035, "step": 4544 }, { "epoch": 2.06778889899909, "grad_norm": 1.329493672511183, "learning_rate": 3.1712899532353366e-06, "loss": 0.0549, "step": 4545 }, { "epoch": 2.0682438580527753, "grad_norm": 1.233554080137719, "learning_rate": 3.1706015230911114e-06, "loss": 0.0539, "step": 4546 }, { "epoch": 2.06869881710646, "grad_norm": 1.6268986727066572, "learning_rate": 3.1699130381483574e-06, "loss": 0.0849, "step": 4547 }, { "epoch": 2.0691537761601455, "grad_norm": 1.561946298979001, "learning_rate": 3.1692244984633353e-06, "loss": 0.0449, "step": 4548 }, { "epoch": 2.069608735213831, "grad_norm": 1.4240498709901004, "learning_rate": 3.1685359040923097e-06, "loss": 0.122, "step": 4549 }, { "epoch": 2.070063694267516, "grad_norm": 1.4046787081976124, "learning_rate": 3.167847255091549e-06, "loss": 0.0356, "step": 4550 }, { "epoch": 2.070518653321201, "grad_norm": 1.1039262273432968, "learning_rate": 3.1671585515173262e-06, "loss": 0.0379, "step": 4551 }, { "epoch": 2.0709736123748863, "grad_norm": 1.1798518412641645, "learning_rate": 3.166469793425919e-06, "loss": 0.0556, "step": 4552 }, { "epoch": 2.0714285714285716, "grad_norm": 1.59249674879058, "learning_rate": 3.165780980873612e-06, "loss": 0.0818, "step": 4553 }, { "epoch": 2.0718835304822565, "grad_norm": 1.286409059922175, "learning_rate": 3.165092113916688e-06, "loss": 0.0684, "step": 4554 }, { "epoch": 2.072338489535942, "grad_norm": 2.4620196487887895, "learning_rate": 3.1644031926114403e-06, "loss": 0.0443, "step": 4555 }, { "epoch": 2.072793448589627, "grad_norm": 1.0812330479752066, "learning_rate": 3.1637142170141655e-06, "loss": 0.0378, "step": 4556 }, { "epoch": 2.073248407643312, "grad_norm": 1.212448447840173, "learning_rate": 3.163025187181161e-06, "loss": 0.0407, "step": 4557 }, { "epoch": 2.0737033666969973, "grad_norm": 1.346000347686834, "learning_rate": 3.1623361031687323e-06, "loss": 0.0242, "step": 4558 }, { "epoch": 2.0741583257506826, "grad_norm": 2.027328982284325, "learning_rate": 3.1616469650331884e-06, "loss": 0.0423, "step": 4559 }, { "epoch": 2.0746132848043675, "grad_norm": 1.2808171328914186, "learning_rate": 3.1609577728308428e-06, "loss": 0.0555, "step": 4560 }, { "epoch": 2.0750682438580528, "grad_norm": 1.2306125332798237, "learning_rate": 3.160268526618012e-06, "loss": 0.025, "step": 4561 }, { "epoch": 2.075523202911738, "grad_norm": 1.25784079170724, "learning_rate": 3.15957922645102e-06, "loss": 0.0489, "step": 4562 }, { "epoch": 2.075978161965423, "grad_norm": 1.2412476990977388, "learning_rate": 3.158889872386192e-06, "loss": 0.0517, "step": 4563 }, { "epoch": 2.0764331210191083, "grad_norm": 1.1221488409445879, "learning_rate": 3.158200464479859e-06, "loss": 0.0401, "step": 4564 }, { "epoch": 2.0768880800727936, "grad_norm": 1.3379944436548459, "learning_rate": 3.1575110027883566e-06, "loss": 0.0247, "step": 4565 }, { "epoch": 2.0773430391264784, "grad_norm": 1.2049320893943714, "learning_rate": 3.156821487368025e-06, "loss": 0.0344, "step": 4566 }, { "epoch": 2.0777979981801638, "grad_norm": 1.4004037211295741, "learning_rate": 3.1561319182752066e-06, "loss": 0.0452, "step": 4567 }, { "epoch": 2.078252957233849, "grad_norm": 1.5714086185299443, "learning_rate": 3.1554422955662505e-06, "loss": 0.0708, "step": 4568 }, { "epoch": 2.078707916287534, "grad_norm": 1.6707136484031058, "learning_rate": 3.154752619297511e-06, "loss": 0.0865, "step": 4569 }, { "epoch": 2.0791628753412192, "grad_norm": 3.0188300342708163, "learning_rate": 3.1540628895253438e-06, "loss": 0.0747, "step": 4570 }, { "epoch": 2.0796178343949046, "grad_norm": 1.0541028933175631, "learning_rate": 3.153373106306111e-06, "loss": 0.0221, "step": 4571 }, { "epoch": 2.0800727934485894, "grad_norm": 1.0739075345690892, "learning_rate": 3.152683269696179e-06, "loss": 0.0368, "step": 4572 }, { "epoch": 2.0805277525022747, "grad_norm": 1.3537917580522794, "learning_rate": 3.1519933797519174e-06, "loss": 0.0291, "step": 4573 }, { "epoch": 2.08098271155596, "grad_norm": 1.8006990649710841, "learning_rate": 3.1513034365297013e-06, "loss": 0.0352, "step": 4574 }, { "epoch": 2.0814376706096454, "grad_norm": 1.4462252443597412, "learning_rate": 3.150613440085909e-06, "loss": 0.0503, "step": 4575 }, { "epoch": 2.08189262966333, "grad_norm": 1.2536474339759514, "learning_rate": 3.149923390476925e-06, "loss": 0.0236, "step": 4576 }, { "epoch": 2.0823475887170155, "grad_norm": 1.136757789342446, "learning_rate": 3.1492332877591368e-06, "loss": 0.0669, "step": 4577 }, { "epoch": 2.082802547770701, "grad_norm": 1.4962070722710912, "learning_rate": 3.148543131988936e-06, "loss": 0.0373, "step": 4578 }, { "epoch": 2.0832575068243857, "grad_norm": 1.7100033251853965, "learning_rate": 3.1478529232227197e-06, "loss": 0.0459, "step": 4579 }, { "epoch": 2.083712465878071, "grad_norm": 1.2362779237348236, "learning_rate": 3.1471626615168876e-06, "loss": 0.0251, "step": 4580 }, { "epoch": 2.0841674249317563, "grad_norm": 3.117895060622359, "learning_rate": 3.146472346927845e-06, "loss": 0.0729, "step": 4581 }, { "epoch": 2.084622383985441, "grad_norm": 1.0441619875925956, "learning_rate": 3.1457819795120026e-06, "loss": 0.051, "step": 4582 }, { "epoch": 2.0850773430391265, "grad_norm": 1.9671512318816176, "learning_rate": 3.145091559325773e-06, "loss": 0.0466, "step": 4583 }, { "epoch": 2.085532302092812, "grad_norm": 1.173392956185096, "learning_rate": 3.1444010864255737e-06, "loss": 0.0354, "step": 4584 }, { "epoch": 2.0859872611464967, "grad_norm": 1.5307413181016196, "learning_rate": 3.1437105608678287e-06, "loss": 0.0519, "step": 4585 }, { "epoch": 2.086442220200182, "grad_norm": 0.8644022849018043, "learning_rate": 3.1430199827089624e-06, "loss": 0.0242, "step": 4586 }, { "epoch": 2.0868971792538673, "grad_norm": 1.566407324708009, "learning_rate": 3.1423293520054076e-06, "loss": 0.0317, "step": 4587 }, { "epoch": 2.087352138307552, "grad_norm": 1.2668346414101586, "learning_rate": 3.141638668813599e-06, "loss": 0.0361, "step": 4588 }, { "epoch": 2.0878070973612375, "grad_norm": 1.008774459335125, "learning_rate": 3.1409479331899755e-06, "loss": 0.0374, "step": 4589 }, { "epoch": 2.088262056414923, "grad_norm": 1.1307469024413677, "learning_rate": 3.1402571451909823e-06, "loss": 0.0281, "step": 4590 }, { "epoch": 2.0887170154686077, "grad_norm": 1.2629006115622503, "learning_rate": 3.1395663048730662e-06, "loss": 0.0307, "step": 4591 }, { "epoch": 2.089171974522293, "grad_norm": 1.3724802845190578, "learning_rate": 3.1388754122926803e-06, "loss": 0.0367, "step": 4592 }, { "epoch": 2.0896269335759783, "grad_norm": 1.9560889713086214, "learning_rate": 3.1381844675062796e-06, "loss": 0.0649, "step": 4593 }, { "epoch": 2.090081892629663, "grad_norm": 1.3390563915343219, "learning_rate": 3.137493470570327e-06, "loss": 0.0451, "step": 4594 }, { "epoch": 2.0905368516833485, "grad_norm": 2.935691456743712, "learning_rate": 3.1368024215412866e-06, "loss": 0.1209, "step": 4595 }, { "epoch": 2.0909918107370338, "grad_norm": 1.037909848708407, "learning_rate": 3.1361113204756284e-06, "loss": 0.0304, "step": 4596 }, { "epoch": 2.0914467697907186, "grad_norm": 1.0884397750616173, "learning_rate": 3.1354201674298257e-06, "loss": 0.03, "step": 4597 }, { "epoch": 2.091901728844404, "grad_norm": 0.7701774572608224, "learning_rate": 3.1347289624603565e-06, "loss": 0.0256, "step": 4598 }, { "epoch": 2.0923566878980893, "grad_norm": 1.231699278382788, "learning_rate": 3.1340377056237032e-06, "loss": 0.0316, "step": 4599 }, { "epoch": 2.092811646951774, "grad_norm": 1.2320909711960082, "learning_rate": 3.133346396976351e-06, "loss": 0.0273, "step": 4600 }, { "epoch": 2.0932666060054594, "grad_norm": 1.5073186793831828, "learning_rate": 3.132655036574792e-06, "loss": 0.0967, "step": 4601 }, { "epoch": 2.0937215650591448, "grad_norm": 1.420960761634141, "learning_rate": 3.131963624475521e-06, "loss": 0.0311, "step": 4602 }, { "epoch": 2.0941765241128296, "grad_norm": 0.9892065297936363, "learning_rate": 3.131272160735035e-06, "loss": 0.0354, "step": 4603 }, { "epoch": 2.094631483166515, "grad_norm": 1.269735799808192, "learning_rate": 3.1305806454098404e-06, "loss": 0.0421, "step": 4604 }, { "epoch": 2.0950864422202002, "grad_norm": 1.553009370674734, "learning_rate": 3.1298890785564425e-06, "loss": 0.0448, "step": 4605 }, { "epoch": 2.0955414012738856, "grad_norm": 1.2190864720380914, "learning_rate": 3.1291974602313536e-06, "loss": 0.0342, "step": 4606 }, { "epoch": 2.0959963603275704, "grad_norm": 2.163470016836716, "learning_rate": 3.1285057904910896e-06, "loss": 0.076, "step": 4607 }, { "epoch": 2.0964513193812557, "grad_norm": 1.7288661181250369, "learning_rate": 3.1278140693921704e-06, "loss": 0.0428, "step": 4608 }, { "epoch": 2.096906278434941, "grad_norm": 2.3205155490615597, "learning_rate": 3.127122296991122e-06, "loss": 0.0627, "step": 4609 }, { "epoch": 2.097361237488626, "grad_norm": 1.332112357197098, "learning_rate": 3.1264304733444694e-06, "loss": 0.0256, "step": 4610 }, { "epoch": 2.097816196542311, "grad_norm": 1.286165033900726, "learning_rate": 3.125738598508749e-06, "loss": 0.0378, "step": 4611 }, { "epoch": 2.0982711555959965, "grad_norm": 1.3256794862107886, "learning_rate": 3.125046672540496e-06, "loss": 0.0984, "step": 4612 }, { "epoch": 2.0987261146496814, "grad_norm": 1.604504776837272, "learning_rate": 3.124354695496252e-06, "loss": 0.0349, "step": 4613 }, { "epoch": 2.0991810737033667, "grad_norm": 1.2418774909171844, "learning_rate": 3.1236626674325603e-06, "loss": 0.0312, "step": 4614 }, { "epoch": 2.099636032757052, "grad_norm": 1.6992052805586937, "learning_rate": 3.122970588405973e-06, "loss": 0.0594, "step": 4615 }, { "epoch": 2.100090991810737, "grad_norm": 2.0353749409260886, "learning_rate": 3.1222784584730426e-06, "loss": 0.054, "step": 4616 }, { "epoch": 2.100545950864422, "grad_norm": 1.0756984131151548, "learning_rate": 3.1215862776903255e-06, "loss": 0.0487, "step": 4617 }, { "epoch": 2.1010009099181075, "grad_norm": 1.2853971085095626, "learning_rate": 3.1208940461143866e-06, "loss": 0.0418, "step": 4618 }, { "epoch": 2.1014558689717924, "grad_norm": 1.25710679417819, "learning_rate": 3.1202017638017895e-06, "loss": 0.0375, "step": 4619 }, { "epoch": 2.1019108280254777, "grad_norm": 1.7993039590710984, "learning_rate": 3.119509430809105e-06, "loss": 0.0658, "step": 4620 }, { "epoch": 2.102365787079163, "grad_norm": 1.47590622830292, "learning_rate": 3.118817047192907e-06, "loss": 0.0743, "step": 4621 }, { "epoch": 2.102820746132848, "grad_norm": 1.371011755052548, "learning_rate": 3.118124613009775e-06, "loss": 0.0635, "step": 4622 }, { "epoch": 2.103275705186533, "grad_norm": 1.6711931265092552, "learning_rate": 3.117432128316291e-06, "loss": 0.0628, "step": 4623 }, { "epoch": 2.1037306642402185, "grad_norm": 1.476876505004312, "learning_rate": 3.116739593169042e-06, "loss": 0.0941, "step": 4624 }, { "epoch": 2.1041856232939034, "grad_norm": 1.3534527257213653, "learning_rate": 3.116047007624618e-06, "loss": 0.0309, "step": 4625 }, { "epoch": 2.1046405823475887, "grad_norm": 1.255054019779178, "learning_rate": 3.1153543717396157e-06, "loss": 0.054, "step": 4626 }, { "epoch": 2.105095541401274, "grad_norm": 1.3541315616715852, "learning_rate": 3.114661685570632e-06, "loss": 0.0249, "step": 4627 }, { "epoch": 2.105550500454959, "grad_norm": 1.7373928754018704, "learning_rate": 3.1139689491742708e-06, "loss": 0.0415, "step": 4628 }, { "epoch": 2.106005459508644, "grad_norm": 1.5602951770044344, "learning_rate": 3.1132761626071406e-06, "loss": 0.0851, "step": 4629 }, { "epoch": 2.1064604185623295, "grad_norm": 1.0130398140667494, "learning_rate": 3.112583325925852e-06, "loss": 0.0182, "step": 4630 }, { "epoch": 2.1069153776160148, "grad_norm": 1.911560354352603, "learning_rate": 3.1118904391870197e-06, "loss": 0.055, "step": 4631 }, { "epoch": 2.1073703366696996, "grad_norm": 2.0028295308880546, "learning_rate": 3.1111975024472647e-06, "loss": 0.0401, "step": 4632 }, { "epoch": 2.107825295723385, "grad_norm": 1.3778236040546668, "learning_rate": 3.11050451576321e-06, "loss": 0.0621, "step": 4633 }, { "epoch": 2.1082802547770703, "grad_norm": 1.7591301186648847, "learning_rate": 3.1098114791914825e-06, "loss": 0.0677, "step": 4634 }, { "epoch": 2.108735213830755, "grad_norm": 1.0238744679456264, "learning_rate": 3.1091183927887154e-06, "loss": 0.0188, "step": 4635 }, { "epoch": 2.1091901728844404, "grad_norm": 1.3675178935765342, "learning_rate": 3.1084252566115437e-06, "loss": 0.0505, "step": 4636 }, { "epoch": 2.1096451319381258, "grad_norm": 0.9079066005420199, "learning_rate": 3.107732070716608e-06, "loss": 0.018, "step": 4637 }, { "epoch": 2.1101000909918106, "grad_norm": 1.127662679313093, "learning_rate": 3.1070388351605513e-06, "loss": 0.0215, "step": 4638 }, { "epoch": 2.110555050045496, "grad_norm": 1.2687111132476532, "learning_rate": 3.106345550000023e-06, "loss": 0.0444, "step": 4639 }, { "epoch": 2.1110100090991812, "grad_norm": 1.9604961223179536, "learning_rate": 3.1056522152916747e-06, "loss": 0.0395, "step": 4640 }, { "epoch": 2.111464968152866, "grad_norm": 1.4565008770876413, "learning_rate": 3.104958831092162e-06, "loss": 0.0455, "step": 4641 }, { "epoch": 2.1119199272065514, "grad_norm": 1.1358459310264946, "learning_rate": 3.1042653974581455e-06, "loss": 0.0383, "step": 4642 }, { "epoch": 2.1123748862602367, "grad_norm": 2.686395210902093, "learning_rate": 3.10357191444629e-06, "loss": 0.0407, "step": 4643 }, { "epoch": 2.1128298453139216, "grad_norm": 1.4319370873558324, "learning_rate": 3.102878382113263e-06, "loss": 0.0413, "step": 4644 }, { "epoch": 2.113284804367607, "grad_norm": 1.4281565053067113, "learning_rate": 3.1021848005157372e-06, "loss": 0.0371, "step": 4645 }, { "epoch": 2.113739763421292, "grad_norm": 1.131728560228816, "learning_rate": 3.101491169710389e-06, "loss": 0.0248, "step": 4646 }, { "epoch": 2.114194722474977, "grad_norm": 1.2324567440491374, "learning_rate": 3.100797489753899e-06, "loss": 0.0553, "step": 4647 }, { "epoch": 2.1146496815286624, "grad_norm": 1.6235434686145498, "learning_rate": 3.1001037607029512e-06, "loss": 0.0298, "step": 4648 }, { "epoch": 2.1151046405823477, "grad_norm": 2.2091358741575817, "learning_rate": 3.099409982614234e-06, "loss": 0.0656, "step": 4649 }, { "epoch": 2.1155595996360326, "grad_norm": 1.017699861015832, "learning_rate": 3.09871615554444e-06, "loss": 0.0309, "step": 4650 }, { "epoch": 2.116014558689718, "grad_norm": 1.3790190622594471, "learning_rate": 3.0980222795502655e-06, "loss": 0.0634, "step": 4651 }, { "epoch": 2.116469517743403, "grad_norm": 1.2206161074311142, "learning_rate": 3.097328354688411e-06, "loss": 0.0267, "step": 4652 }, { "epoch": 2.116924476797088, "grad_norm": 1.2891973173396556, "learning_rate": 3.096634381015581e-06, "loss": 0.0356, "step": 4653 }, { "epoch": 2.1173794358507734, "grad_norm": 1.4500696886464923, "learning_rate": 3.0959403585884833e-06, "loss": 0.0921, "step": 4654 }, { "epoch": 2.1178343949044587, "grad_norm": 1.2588752201588786, "learning_rate": 3.0952462874638318e-06, "loss": 0.037, "step": 4655 }, { "epoch": 2.1182893539581436, "grad_norm": 1.332922323077646, "learning_rate": 3.0945521676983403e-06, "loss": 0.0458, "step": 4656 }, { "epoch": 2.118744313011829, "grad_norm": 1.9282696221101967, "learning_rate": 3.0938579993487314e-06, "loss": 0.0327, "step": 4657 }, { "epoch": 2.119199272065514, "grad_norm": 1.1728469153237555, "learning_rate": 3.0931637824717287e-06, "loss": 0.0621, "step": 4658 }, { "epoch": 2.1196542311191995, "grad_norm": 1.2371410576680646, "learning_rate": 3.0924695171240606e-06, "loss": 0.0276, "step": 4659 }, { "epoch": 2.1201091901728844, "grad_norm": 2.076792312225166, "learning_rate": 3.0917752033624587e-06, "loss": 0.0373, "step": 4660 }, { "epoch": 2.1205641492265697, "grad_norm": 0.9850439073375465, "learning_rate": 3.09108084124366e-06, "loss": 0.0278, "step": 4661 }, { "epoch": 2.121019108280255, "grad_norm": 1.5564915610164152, "learning_rate": 3.0903864308244042e-06, "loss": 0.0559, "step": 4662 }, { "epoch": 2.12147406733394, "grad_norm": 2.264899353658849, "learning_rate": 3.0896919721614342e-06, "loss": 0.0572, "step": 4663 }, { "epoch": 2.121929026387625, "grad_norm": 1.278428159136046, "learning_rate": 3.0889974653115006e-06, "loss": 0.0928, "step": 4664 }, { "epoch": 2.1223839854413105, "grad_norm": 1.0506333103829117, "learning_rate": 3.0883029103313537e-06, "loss": 0.0429, "step": 4665 }, { "epoch": 2.1228389444949953, "grad_norm": 2.0119111821046465, "learning_rate": 3.0876083072777498e-06, "loss": 0.0335, "step": 4666 }, { "epoch": 2.1232939035486806, "grad_norm": 1.0878508100371884, "learning_rate": 3.0869136562074486e-06, "loss": 0.0375, "step": 4667 }, { "epoch": 2.123748862602366, "grad_norm": 1.3881060769166027, "learning_rate": 3.086218957177214e-06, "loss": 0.0627, "step": 4668 }, { "epoch": 2.124203821656051, "grad_norm": 1.3774151146470002, "learning_rate": 3.0855242102438137e-06, "loss": 0.0342, "step": 4669 }, { "epoch": 2.124658780709736, "grad_norm": 1.2707209853232415, "learning_rate": 3.0848294154640184e-06, "loss": 0.0303, "step": 4670 }, { "epoch": 2.1251137397634214, "grad_norm": 1.2917153393737493, "learning_rate": 3.0841345728946056e-06, "loss": 0.058, "step": 4671 }, { "epoch": 2.1255686988171063, "grad_norm": 1.3299239774757794, "learning_rate": 3.0834396825923523e-06, "loss": 0.0336, "step": 4672 }, { "epoch": 2.1260236578707916, "grad_norm": 1.0712751814908938, "learning_rate": 3.082744744614043e-06, "loss": 0.0556, "step": 4673 }, { "epoch": 2.126478616924477, "grad_norm": 0.9188399436087901, "learning_rate": 3.0820497590164655e-06, "loss": 0.0212, "step": 4674 }, { "epoch": 2.126933575978162, "grad_norm": 1.7332844635971665, "learning_rate": 3.08135472585641e-06, "loss": 0.0423, "step": 4675 }, { "epoch": 2.127388535031847, "grad_norm": 2.196565432506177, "learning_rate": 3.080659645190671e-06, "loss": 0.0306, "step": 4676 }, { "epoch": 2.1278434940855324, "grad_norm": 1.5251885743391247, "learning_rate": 3.079964517076049e-06, "loss": 0.0424, "step": 4677 }, { "epoch": 2.1282984531392173, "grad_norm": 1.243791142849915, "learning_rate": 3.0792693415693446e-06, "loss": 0.0263, "step": 4678 }, { "epoch": 2.1287534121929026, "grad_norm": 1.4163197455783212, "learning_rate": 3.078574118727367e-06, "loss": 0.0495, "step": 4679 }, { "epoch": 2.129208371246588, "grad_norm": 1.4383400153782626, "learning_rate": 3.077878848606924e-06, "loss": 0.0593, "step": 4680 }, { "epoch": 2.1296633303002728, "grad_norm": 1.6784165964272575, "learning_rate": 3.0771835312648317e-06, "loss": 0.0377, "step": 4681 }, { "epoch": 2.130118289353958, "grad_norm": 1.2439634848534935, "learning_rate": 3.076488166757908e-06, "loss": 0.0395, "step": 4682 }, { "epoch": 2.1305732484076434, "grad_norm": 1.6162131349579434, "learning_rate": 3.0757927551429744e-06, "loss": 0.0507, "step": 4683 }, { "epoch": 2.1310282074613287, "grad_norm": 0.939571473982225, "learning_rate": 3.075097296476857e-06, "loss": 0.0311, "step": 4684 }, { "epoch": 2.1314831665150136, "grad_norm": 0.9226202993582882, "learning_rate": 3.0744017908163853e-06, "loss": 0.0419, "step": 4685 }, { "epoch": 2.131938125568699, "grad_norm": 1.139379118638161, "learning_rate": 3.0737062382183946e-06, "loss": 0.0381, "step": 4686 }, { "epoch": 2.132393084622384, "grad_norm": 1.4107275017459073, "learning_rate": 3.073010638739719e-06, "loss": 0.0745, "step": 4687 }, { "epoch": 2.132848043676069, "grad_norm": 1.648070014327298, "learning_rate": 3.0723149924372032e-06, "loss": 0.0624, "step": 4688 }, { "epoch": 2.1333030027297544, "grad_norm": 1.3704063768845247, "learning_rate": 3.071619299367691e-06, "loss": 0.0373, "step": 4689 }, { "epoch": 2.1337579617834397, "grad_norm": 1.2908832958567749, "learning_rate": 3.07092355958803e-06, "loss": 0.0323, "step": 4690 }, { "epoch": 2.1342129208371245, "grad_norm": 1.3365735338075082, "learning_rate": 3.0702277731550745e-06, "loss": 0.0402, "step": 4691 }, { "epoch": 2.13466787989081, "grad_norm": 1.081611636222346, "learning_rate": 3.06953194012568e-06, "loss": 0.0555, "step": 4692 }, { "epoch": 2.135122838944495, "grad_norm": 1.4481081649337628, "learning_rate": 3.068836060556708e-06, "loss": 0.0601, "step": 4693 }, { "epoch": 2.13557779799818, "grad_norm": 1.5490509231162921, "learning_rate": 3.0681401345050214e-06, "loss": 0.0383, "step": 4694 }, { "epoch": 2.1360327570518653, "grad_norm": 1.5190919169385606, "learning_rate": 3.067444162027489e-06, "loss": 0.0592, "step": 4695 }, { "epoch": 2.1364877161055507, "grad_norm": 1.155312146153874, "learning_rate": 3.0667481431809826e-06, "loss": 0.0799, "step": 4696 }, { "epoch": 2.1369426751592355, "grad_norm": 1.1261778422703903, "learning_rate": 3.0660520780223767e-06, "loss": 0.0294, "step": 4697 }, { "epoch": 2.137397634212921, "grad_norm": 1.7233763237324151, "learning_rate": 3.0653559666085513e-06, "loss": 0.0489, "step": 4698 }, { "epoch": 2.137852593266606, "grad_norm": 1.493522208417439, "learning_rate": 3.06465980899639e-06, "loss": 0.0317, "step": 4699 }, { "epoch": 2.138307552320291, "grad_norm": 0.9835866733868106, "learning_rate": 3.063963605242779e-06, "loss": 0.0213, "step": 4700 }, { "epoch": 2.1387625113739763, "grad_norm": 1.1908456177913675, "learning_rate": 3.0632673554046084e-06, "loss": 0.0608, "step": 4701 }, { "epoch": 2.1392174704276616, "grad_norm": 1.3937849847626709, "learning_rate": 3.062571059538774e-06, "loss": 0.0264, "step": 4702 }, { "epoch": 2.1396724294813465, "grad_norm": 3.5833616834535693, "learning_rate": 3.0618747177021725e-06, "loss": 0.0539, "step": 4703 }, { "epoch": 2.140127388535032, "grad_norm": 1.6504715633562435, "learning_rate": 3.061178329951707e-06, "loss": 0.0348, "step": 4704 }, { "epoch": 2.140582347588717, "grad_norm": 0.8468223296702783, "learning_rate": 3.0604818963442818e-06, "loss": 0.0234, "step": 4705 }, { "epoch": 2.141037306642402, "grad_norm": 1.4392790389757855, "learning_rate": 3.059785416936808e-06, "loss": 0.0332, "step": 4706 }, { "epoch": 2.1414922656960873, "grad_norm": 1.4880222374102772, "learning_rate": 3.059088891786197e-06, "loss": 0.04, "step": 4707 }, { "epoch": 2.1419472247497726, "grad_norm": 1.2811874527019549, "learning_rate": 3.058392320949367e-06, "loss": 0.0361, "step": 4708 }, { "epoch": 2.1424021838034575, "grad_norm": 1.6719214437603138, "learning_rate": 3.057695704483239e-06, "loss": 0.0361, "step": 4709 }, { "epoch": 2.142857142857143, "grad_norm": 1.2659096850265974, "learning_rate": 3.056999042444736e-06, "loss": 0.0289, "step": 4710 }, { "epoch": 2.143312101910828, "grad_norm": 1.5371108364816337, "learning_rate": 3.056302334890786e-06, "loss": 0.048, "step": 4711 }, { "epoch": 2.143767060964513, "grad_norm": 1.056915380565755, "learning_rate": 3.055605581878322e-06, "loss": 0.0381, "step": 4712 }, { "epoch": 2.1442220200181983, "grad_norm": 1.5742743199759066, "learning_rate": 3.05490878346428e-06, "loss": 0.0263, "step": 4713 }, { "epoch": 2.1446769790718836, "grad_norm": 1.6732493196798153, "learning_rate": 3.0542119397055964e-06, "loss": 0.0334, "step": 4714 }, { "epoch": 2.145131938125569, "grad_norm": 1.3165001459337362, "learning_rate": 3.0535150506592163e-06, "loss": 0.0246, "step": 4715 }, { "epoch": 2.1455868971792538, "grad_norm": 1.2430133356703763, "learning_rate": 3.0528181163820863e-06, "loss": 0.0254, "step": 4716 }, { "epoch": 2.146041856232939, "grad_norm": 1.369500006813292, "learning_rate": 3.0521211369311564e-06, "loss": 0.0289, "step": 4717 }, { "epoch": 2.1464968152866244, "grad_norm": 1.3343401167833668, "learning_rate": 3.0514241123633804e-06, "loss": 0.0697, "step": 4718 }, { "epoch": 2.1469517743403093, "grad_norm": 1.6187005759578441, "learning_rate": 3.0507270427357162e-06, "loss": 0.0392, "step": 4719 }, { "epoch": 2.1474067333939946, "grad_norm": 1.8347883967954703, "learning_rate": 3.0500299281051254e-06, "loss": 0.0416, "step": 4720 }, { "epoch": 2.14786169244768, "grad_norm": 1.3621215066792112, "learning_rate": 3.0493327685285723e-06, "loss": 0.0367, "step": 4721 }, { "epoch": 2.1483166515013647, "grad_norm": 1.7761624209733837, "learning_rate": 3.048635564063026e-06, "loss": 0.0886, "step": 4722 }, { "epoch": 2.14877161055505, "grad_norm": 1.8487248312984426, "learning_rate": 3.047938314765459e-06, "loss": 0.0682, "step": 4723 }, { "epoch": 2.1492265696087354, "grad_norm": 0.8320710680068105, "learning_rate": 3.0472410206928483e-06, "loss": 0.0155, "step": 4724 }, { "epoch": 2.1496815286624202, "grad_norm": 1.6936215715437963, "learning_rate": 3.0465436819021726e-06, "loss": 0.0524, "step": 4725 }, { "epoch": 2.1501364877161055, "grad_norm": 1.5623344369698637, "learning_rate": 3.0458462984504137e-06, "loss": 0.0425, "step": 4726 }, { "epoch": 2.150591446769791, "grad_norm": 1.5221722853077688, "learning_rate": 3.045148870394562e-06, "loss": 0.039, "step": 4727 }, { "epoch": 2.1510464058234757, "grad_norm": 1.4593718264565834, "learning_rate": 3.0444513977916057e-06, "loss": 0.0343, "step": 4728 }, { "epoch": 2.151501364877161, "grad_norm": 1.7027978962252068, "learning_rate": 3.04375388069854e-06, "loss": 0.038, "step": 4729 }, { "epoch": 2.1519563239308463, "grad_norm": 1.279104973351225, "learning_rate": 3.0430563191723633e-06, "loss": 0.0239, "step": 4730 }, { "epoch": 2.152411282984531, "grad_norm": 1.3594456647285629, "learning_rate": 3.042358713270076e-06, "loss": 0.0324, "step": 4731 }, { "epoch": 2.1528662420382165, "grad_norm": 1.1077956365454606, "learning_rate": 3.041661063048685e-06, "loss": 0.0238, "step": 4732 }, { "epoch": 2.153321201091902, "grad_norm": 1.2194367605286915, "learning_rate": 3.040963368565196e-06, "loss": 0.0414, "step": 4733 }, { "epoch": 2.1537761601455867, "grad_norm": 1.440504803445072, "learning_rate": 3.0402656298766254e-06, "loss": 0.0362, "step": 4734 }, { "epoch": 2.154231119199272, "grad_norm": 1.9207405958469503, "learning_rate": 3.0395678470399863e-06, "loss": 0.0841, "step": 4735 }, { "epoch": 2.1546860782529573, "grad_norm": 1.2740907312795982, "learning_rate": 3.0388700201123e-06, "loss": 0.0622, "step": 4736 }, { "epoch": 2.1551410373066426, "grad_norm": 1.272277459920227, "learning_rate": 3.038172149150589e-06, "loss": 0.0249, "step": 4737 }, { "epoch": 2.1555959963603275, "grad_norm": 1.9975602010597957, "learning_rate": 3.0374742342118803e-06, "loss": 0.0617, "step": 4738 }, { "epoch": 2.156050955414013, "grad_norm": 1.7453572099374532, "learning_rate": 3.036776275353204e-06, "loss": 0.0486, "step": 4739 }, { "epoch": 2.156505914467698, "grad_norm": 1.3006080730075464, "learning_rate": 3.0360782726315948e-06, "loss": 0.0441, "step": 4740 }, { "epoch": 2.156960873521383, "grad_norm": 2.2179737344045596, "learning_rate": 3.0353802261040904e-06, "loss": 0.0644, "step": 4741 }, { "epoch": 2.1574158325750683, "grad_norm": 1.0263452089479423, "learning_rate": 3.0346821358277316e-06, "loss": 0.0179, "step": 4742 }, { "epoch": 2.1578707916287536, "grad_norm": 1.4130281199358707, "learning_rate": 3.0339840018595622e-06, "loss": 0.0476, "step": 4743 }, { "epoch": 2.1583257506824385, "grad_norm": 1.418081414857794, "learning_rate": 3.0332858242566333e-06, "loss": 0.0512, "step": 4744 }, { "epoch": 2.158780709736124, "grad_norm": 0.8912796577032582, "learning_rate": 3.032587603075994e-06, "loss": 0.0315, "step": 4745 }, { "epoch": 2.159235668789809, "grad_norm": 1.6189922694240682, "learning_rate": 3.0318893383747018e-06, "loss": 0.0594, "step": 4746 }, { "epoch": 2.159690627843494, "grad_norm": 1.496960890807122, "learning_rate": 3.031191030209814e-06, "loss": 0.0459, "step": 4747 }, { "epoch": 2.1601455868971793, "grad_norm": 1.0326444832578583, "learning_rate": 3.0304926786383943e-06, "loss": 0.0371, "step": 4748 }, { "epoch": 2.1606005459508646, "grad_norm": 1.6642252281147472, "learning_rate": 3.0297942837175092e-06, "loss": 0.0469, "step": 4749 }, { "epoch": 2.1610555050045495, "grad_norm": 1.6474158709300952, "learning_rate": 3.0290958455042264e-06, "loss": 0.0685, "step": 4750 }, { "epoch": 2.1615104640582348, "grad_norm": 1.4955554468239705, "learning_rate": 3.028397364055622e-06, "loss": 0.1085, "step": 4751 }, { "epoch": 2.16196542311192, "grad_norm": 1.1987535047853668, "learning_rate": 3.0276988394287697e-06, "loss": 0.0188, "step": 4752 }, { "epoch": 2.162420382165605, "grad_norm": 0.919142624862043, "learning_rate": 3.027000271680753e-06, "loss": 0.0415, "step": 4753 }, { "epoch": 2.1628753412192903, "grad_norm": 1.424868161065683, "learning_rate": 3.026301660868653e-06, "loss": 0.0392, "step": 4754 }, { "epoch": 2.1633303002729756, "grad_norm": 1.5878926458673583, "learning_rate": 3.025603007049558e-06, "loss": 0.0401, "step": 4755 }, { "epoch": 2.1637852593266604, "grad_norm": 1.343828250798147, "learning_rate": 3.024904310280559e-06, "loss": 0.0452, "step": 4756 }, { "epoch": 2.1642402183803457, "grad_norm": 1.2580959117470718, "learning_rate": 3.0242055706187502e-06, "loss": 0.0343, "step": 4757 }, { "epoch": 2.164695177434031, "grad_norm": 1.0823656657982161, "learning_rate": 3.0235067881212295e-06, "loss": 0.0247, "step": 4758 }, { "epoch": 2.165150136487716, "grad_norm": 1.4002191776235262, "learning_rate": 3.022807962845098e-06, "loss": 0.0422, "step": 4759 }, { "epoch": 2.1656050955414012, "grad_norm": 1.07853897501745, "learning_rate": 3.022109094847461e-06, "loss": 0.0406, "step": 4760 }, { "epoch": 2.1660600545950865, "grad_norm": 1.3407343980142532, "learning_rate": 3.021410184185427e-06, "loss": 0.0299, "step": 4761 }, { "epoch": 2.1665150136487714, "grad_norm": 1.0683195345947276, "learning_rate": 3.020711230916107e-06, "loss": 0.0298, "step": 4762 }, { "epoch": 2.1669699727024567, "grad_norm": 1.3754819191368792, "learning_rate": 3.0200122350966167e-06, "loss": 0.0691, "step": 4763 }, { "epoch": 2.167424931756142, "grad_norm": 1.2113085852024543, "learning_rate": 3.019313196784075e-06, "loss": 0.0432, "step": 4764 }, { "epoch": 2.167879890809827, "grad_norm": 1.4171292478841633, "learning_rate": 3.0186141160356035e-06, "loss": 0.0476, "step": 4765 }, { "epoch": 2.168334849863512, "grad_norm": 1.174176759006349, "learning_rate": 3.0179149929083294e-06, "loss": 0.0375, "step": 4766 }, { "epoch": 2.1687898089171975, "grad_norm": 1.2602074476155056, "learning_rate": 3.0172158274593803e-06, "loss": 0.0172, "step": 4767 }, { "epoch": 2.1692447679708824, "grad_norm": 1.096553426998607, "learning_rate": 3.0165166197458897e-06, "loss": 0.0228, "step": 4768 }, { "epoch": 2.1696997270245677, "grad_norm": 1.5790776885409465, "learning_rate": 3.0158173698249934e-06, "loss": 0.0343, "step": 4769 }, { "epoch": 2.170154686078253, "grad_norm": 1.629823854293363, "learning_rate": 3.0151180777538313e-06, "loss": 0.0369, "step": 4770 }, { "epoch": 2.1706096451319383, "grad_norm": 1.3218744662584911, "learning_rate": 3.014418743589546e-06, "loss": 0.03, "step": 4771 }, { "epoch": 2.171064604185623, "grad_norm": 1.247079416052674, "learning_rate": 3.0137193673892837e-06, "loss": 0.0432, "step": 4772 }, { "epoch": 2.1715195632393085, "grad_norm": 1.4965130842716934, "learning_rate": 3.013019949210196e-06, "loss": 0.0346, "step": 4773 }, { "epoch": 2.171974522292994, "grad_norm": 1.4810367962944182, "learning_rate": 3.0123204891094333e-06, "loss": 0.0355, "step": 4774 }, { "epoch": 2.1724294813466787, "grad_norm": 1.9713359045318246, "learning_rate": 3.011620987144154e-06, "loss": 0.0719, "step": 4775 }, { "epoch": 2.172884440400364, "grad_norm": 1.2075153059167507, "learning_rate": 3.010921443371518e-06, "loss": 0.0558, "step": 4776 }, { "epoch": 2.1733393994540493, "grad_norm": 3.574243282458798, "learning_rate": 3.01022185784869e-06, "loss": 0.0464, "step": 4777 }, { "epoch": 2.173794358507734, "grad_norm": 1.1421539524619082, "learning_rate": 3.009522230632836e-06, "loss": 0.0459, "step": 4778 }, { "epoch": 2.1742493175614195, "grad_norm": 1.4476463430397444, "learning_rate": 3.008822561781125e-06, "loss": 0.0491, "step": 4779 }, { "epoch": 2.174704276615105, "grad_norm": 1.7235764784630614, "learning_rate": 3.008122851350733e-06, "loss": 0.0423, "step": 4780 }, { "epoch": 2.1751592356687897, "grad_norm": 2.128888874587494, "learning_rate": 3.0074230993988363e-06, "loss": 0.0796, "step": 4781 }, { "epoch": 2.175614194722475, "grad_norm": 1.3082984717971913, "learning_rate": 3.0067233059826146e-06, "loss": 0.0344, "step": 4782 }, { "epoch": 2.1760691537761603, "grad_norm": 1.5122013413496431, "learning_rate": 3.0060234711592534e-06, "loss": 0.0401, "step": 4783 }, { "epoch": 2.176524112829845, "grad_norm": 1.5018972109617503, "learning_rate": 3.0053235949859392e-06, "loss": 0.0606, "step": 4784 }, { "epoch": 2.1769790718835305, "grad_norm": 1.3501984873405586, "learning_rate": 3.0046236775198625e-06, "loss": 0.0738, "step": 4785 }, { "epoch": 2.1774340309372158, "grad_norm": 1.5523391950820271, "learning_rate": 3.003923718818218e-06, "loss": 0.0379, "step": 4786 }, { "epoch": 2.1778889899909006, "grad_norm": 1.5967869297250574, "learning_rate": 3.003223718938203e-06, "loss": 0.0416, "step": 4787 }, { "epoch": 2.178343949044586, "grad_norm": 1.5898862519054306, "learning_rate": 3.002523677937018e-06, "loss": 0.0399, "step": 4788 }, { "epoch": 2.1787989080982713, "grad_norm": 1.349591523055821, "learning_rate": 3.001823595871867e-06, "loss": 0.0342, "step": 4789 }, { "epoch": 2.179253867151956, "grad_norm": 1.3804829957600524, "learning_rate": 3.001123472799959e-06, "loss": 0.0754, "step": 4790 }, { "epoch": 2.1797088262056414, "grad_norm": 1.2898335801737437, "learning_rate": 3.0004233087785033e-06, "loss": 0.0221, "step": 4791 }, { "epoch": 2.1801637852593267, "grad_norm": 1.5807929597595896, "learning_rate": 2.9997231038647147e-06, "loss": 0.0507, "step": 4792 }, { "epoch": 2.180618744313012, "grad_norm": 1.7976034527105236, "learning_rate": 2.9990228581158103e-06, "loss": 0.0451, "step": 4793 }, { "epoch": 2.181073703366697, "grad_norm": 1.424482978187728, "learning_rate": 2.9983225715890123e-06, "loss": 0.0414, "step": 4794 }, { "epoch": 2.1815286624203822, "grad_norm": 0.8613308121736274, "learning_rate": 2.997622244341544e-06, "loss": 0.0237, "step": 4795 }, { "epoch": 2.1819836214740675, "grad_norm": 1.3419243443907907, "learning_rate": 2.996921876430633e-06, "loss": 0.0329, "step": 4796 }, { "epoch": 2.1824385805277524, "grad_norm": 1.1335092313004314, "learning_rate": 2.9962214679135105e-06, "loss": 0.0301, "step": 4797 }, { "epoch": 2.1828935395814377, "grad_norm": 1.6463625465586271, "learning_rate": 2.99552101884741e-06, "loss": 0.0353, "step": 4798 }, { "epoch": 2.183348498635123, "grad_norm": 1.5911675138460448, "learning_rate": 2.99482052928957e-06, "loss": 0.0738, "step": 4799 }, { "epoch": 2.183803457688808, "grad_norm": 1.2472013520281935, "learning_rate": 2.9941199992972316e-06, "loss": 0.0334, "step": 4800 }, { "epoch": 2.184258416742493, "grad_norm": 0.942999087262108, "learning_rate": 2.9934194289276368e-06, "loss": 0.0223, "step": 4801 }, { "epoch": 2.1847133757961785, "grad_norm": 1.2232057224053545, "learning_rate": 2.992718818238036e-06, "loss": 0.0397, "step": 4802 }, { "epoch": 2.1851683348498634, "grad_norm": 1.794815495627681, "learning_rate": 2.992018167285677e-06, "loss": 0.0392, "step": 4803 }, { "epoch": 2.1856232939035487, "grad_norm": 1.3228598337744404, "learning_rate": 2.9913174761278163e-06, "loss": 0.0769, "step": 4804 }, { "epoch": 2.186078252957234, "grad_norm": 1.3560829996811705, "learning_rate": 2.99061674482171e-06, "loss": 0.0325, "step": 4805 }, { "epoch": 2.186533212010919, "grad_norm": 1.6190823992218004, "learning_rate": 2.9899159734246187e-06, "loss": 0.0477, "step": 4806 }, { "epoch": 2.186988171064604, "grad_norm": 3.2059067741248, "learning_rate": 2.989215161993807e-06, "loss": 0.0592, "step": 4807 }, { "epoch": 2.1874431301182895, "grad_norm": 1.1596807779604015, "learning_rate": 2.988514310586541e-06, "loss": 0.02, "step": 4808 }, { "epoch": 2.1878980891719744, "grad_norm": 1.2252960331295697, "learning_rate": 2.9878134192600926e-06, "loss": 0.0331, "step": 4809 }, { "epoch": 2.1883530482256597, "grad_norm": 1.3914154544482207, "learning_rate": 2.9871124880717333e-06, "loss": 0.0455, "step": 4810 }, { "epoch": 2.188808007279345, "grad_norm": 1.226278960518522, "learning_rate": 2.9864115170787424e-06, "loss": 0.0458, "step": 4811 }, { "epoch": 2.18926296633303, "grad_norm": 1.8607892683922145, "learning_rate": 2.985710506338398e-06, "loss": 0.0434, "step": 4812 }, { "epoch": 2.189717925386715, "grad_norm": 0.9550763887249889, "learning_rate": 2.9850094559079845e-06, "loss": 0.0266, "step": 4813 }, { "epoch": 2.1901728844404005, "grad_norm": 1.2065687310825262, "learning_rate": 2.9843083658447893e-06, "loss": 0.0309, "step": 4814 }, { "epoch": 2.1906278434940853, "grad_norm": 1.1754706658114427, "learning_rate": 2.983607236206101e-06, "loss": 0.0582, "step": 4815 }, { "epoch": 2.1910828025477707, "grad_norm": 1.727819397901102, "learning_rate": 2.982906067049214e-06, "loss": 0.0527, "step": 4816 }, { "epoch": 2.191537761601456, "grad_norm": 1.0315348740113737, "learning_rate": 2.9822048584314227e-06, "loss": 0.0351, "step": 4817 }, { "epoch": 2.191992720655141, "grad_norm": 1.3342916414226518, "learning_rate": 2.9815036104100287e-06, "loss": 0.04, "step": 4818 }, { "epoch": 2.192447679708826, "grad_norm": 1.1811471394276059, "learning_rate": 2.9808023230423342e-06, "loss": 0.0441, "step": 4819 }, { "epoch": 2.1929026387625115, "grad_norm": 1.3058312448839395, "learning_rate": 2.9801009963856446e-06, "loss": 0.025, "step": 4820 }, { "epoch": 2.1933575978161963, "grad_norm": 0.8954670624845833, "learning_rate": 2.9793996304972705e-06, "loss": 0.0344, "step": 4821 }, { "epoch": 2.1938125568698816, "grad_norm": 2.6910862075991124, "learning_rate": 2.978698225434523e-06, "loss": 0.0343, "step": 4822 }, { "epoch": 2.194267515923567, "grad_norm": 1.0423447956132392, "learning_rate": 2.977996781254719e-06, "loss": 0.0647, "step": 4823 }, { "epoch": 2.194722474977252, "grad_norm": 1.2745877656076845, "learning_rate": 2.977295298015176e-06, "loss": 0.0369, "step": 4824 }, { "epoch": 2.195177434030937, "grad_norm": 1.2847219374501857, "learning_rate": 2.9765937757732166e-06, "loss": 0.0331, "step": 4825 }, { "epoch": 2.1956323930846224, "grad_norm": 1.7366874247577795, "learning_rate": 2.975892214586167e-06, "loss": 0.0725, "step": 4826 }, { "epoch": 2.1960873521383077, "grad_norm": 1.7100026467647615, "learning_rate": 2.975190614511353e-06, "loss": 0.0654, "step": 4827 }, { "epoch": 2.1965423111919926, "grad_norm": 1.5687637140327095, "learning_rate": 2.9744889756061097e-06, "loss": 0.0786, "step": 4828 }, { "epoch": 2.196997270245678, "grad_norm": 1.4421451617700036, "learning_rate": 2.9737872979277694e-06, "loss": 0.0541, "step": 4829 }, { "epoch": 2.1974522292993632, "grad_norm": 1.1236746499609789, "learning_rate": 2.9730855815336706e-06, "loss": 0.0326, "step": 4830 }, { "epoch": 2.197907188353048, "grad_norm": 1.1990245581267205, "learning_rate": 2.9723838264811546e-06, "loss": 0.0576, "step": 4831 }, { "epoch": 2.1983621474067334, "grad_norm": 1.3063637973955409, "learning_rate": 2.9716820328275655e-06, "loss": 0.0365, "step": 4832 }, { "epoch": 2.1988171064604187, "grad_norm": 1.845417465921119, "learning_rate": 2.970980200630251e-06, "loss": 0.071, "step": 4833 }, { "epoch": 2.1992720655141036, "grad_norm": 1.8096652933823452, "learning_rate": 2.9702783299465617e-06, "loss": 0.0745, "step": 4834 }, { "epoch": 2.199727024567789, "grad_norm": 1.849557703132923, "learning_rate": 2.96957642083385e-06, "loss": 0.0335, "step": 4835 }, { "epoch": 2.200181983621474, "grad_norm": 1.1511727849560855, "learning_rate": 2.968874473349475e-06, "loss": 0.0191, "step": 4836 }, { "epoch": 2.200636942675159, "grad_norm": 1.8168877293331658, "learning_rate": 2.9681724875507947e-06, "loss": 0.0334, "step": 4837 }, { "epoch": 2.2010919017288444, "grad_norm": 1.1705449675320077, "learning_rate": 2.967470463495173e-06, "loss": 0.032, "step": 4838 }, { "epoch": 2.2015468607825297, "grad_norm": 1.4943798736630216, "learning_rate": 2.966768401239976e-06, "loss": 0.0508, "step": 4839 }, { "epoch": 2.2020018198362146, "grad_norm": 1.1140046767344616, "learning_rate": 2.9660663008425738e-06, "loss": 0.0465, "step": 4840 }, { "epoch": 2.2024567788899, "grad_norm": 1.663706997459617, "learning_rate": 2.965364162360338e-06, "loss": 0.0585, "step": 4841 }, { "epoch": 2.202911737943585, "grad_norm": 1.7046274738963747, "learning_rate": 2.9646619858506437e-06, "loss": 0.0562, "step": 4842 }, { "epoch": 2.20336669699727, "grad_norm": 1.4118345100466037, "learning_rate": 2.963959771370871e-06, "loss": 0.0412, "step": 4843 }, { "epoch": 2.2038216560509554, "grad_norm": 1.8677437148688187, "learning_rate": 2.963257518978401e-06, "loss": 0.0718, "step": 4844 }, { "epoch": 2.2042766151046407, "grad_norm": 1.3474121864624116, "learning_rate": 2.962555228730618e-06, "loss": 0.0641, "step": 4845 }, { "epoch": 2.2047315741583255, "grad_norm": 1.1644491534341936, "learning_rate": 2.961852900684911e-06, "loss": 0.0325, "step": 4846 }, { "epoch": 2.205186533212011, "grad_norm": 1.3944549367184338, "learning_rate": 2.9611505348986715e-06, "loss": 0.0301, "step": 4847 }, { "epoch": 2.205641492265696, "grad_norm": 1.1831072628609203, "learning_rate": 2.9604481314292914e-06, "loss": 0.0236, "step": 4848 }, { "epoch": 2.2060964513193815, "grad_norm": 1.4889884456765492, "learning_rate": 2.9597456903341703e-06, "loss": 0.0366, "step": 4849 }, { "epoch": 2.2065514103730663, "grad_norm": 1.6016835209314058, "learning_rate": 2.9590432116707075e-06, "loss": 0.0482, "step": 4850 }, { "epoch": 2.2070063694267517, "grad_norm": 2.154609937036321, "learning_rate": 2.9583406954963063e-06, "loss": 0.04, "step": 4851 }, { "epoch": 2.207461328480437, "grad_norm": 1.6610096115842745, "learning_rate": 2.957638141868373e-06, "loss": 0.0516, "step": 4852 }, { "epoch": 2.207916287534122, "grad_norm": 1.3619020767003573, "learning_rate": 2.9569355508443182e-06, "loss": 0.0436, "step": 4853 }, { "epoch": 2.208371246587807, "grad_norm": 1.9562032265560927, "learning_rate": 2.956232922481553e-06, "loss": 0.048, "step": 4854 }, { "epoch": 2.2088262056414925, "grad_norm": 1.5388973585371033, "learning_rate": 2.955530256837493e-06, "loss": 0.0619, "step": 4855 }, { "epoch": 2.2092811646951773, "grad_norm": 1.0760579650269442, "learning_rate": 2.9548275539695588e-06, "loss": 0.0323, "step": 4856 }, { "epoch": 2.2097361237488626, "grad_norm": 1.7942792412582231, "learning_rate": 2.954124813935171e-06, "loss": 0.0766, "step": 4857 }, { "epoch": 2.210191082802548, "grad_norm": 1.413944292057448, "learning_rate": 2.9534220367917533e-06, "loss": 0.0283, "step": 4858 }, { "epoch": 2.210646041856233, "grad_norm": 1.5327009340868198, "learning_rate": 2.952719222596735e-06, "loss": 0.0748, "step": 4859 }, { "epoch": 2.211101000909918, "grad_norm": 1.2413956474424566, "learning_rate": 2.952016371407546e-06, "loss": 0.0433, "step": 4860 }, { "epoch": 2.2115559599636034, "grad_norm": 1.3449999402524466, "learning_rate": 2.9513134832816206e-06, "loss": 0.0453, "step": 4861 }, { "epoch": 2.2120109190172883, "grad_norm": 1.1466773916300619, "learning_rate": 2.9506105582763955e-06, "loss": 0.0522, "step": 4862 }, { "epoch": 2.2124658780709736, "grad_norm": 1.5930471884026738, "learning_rate": 2.9499075964493103e-06, "loss": 0.0686, "step": 4863 }, { "epoch": 2.212920837124659, "grad_norm": 1.446293161592482, "learning_rate": 2.949204597857808e-06, "loss": 0.0856, "step": 4864 }, { "epoch": 2.213375796178344, "grad_norm": 1.4231052832868267, "learning_rate": 2.948501562559335e-06, "loss": 0.0375, "step": 4865 }, { "epoch": 2.213830755232029, "grad_norm": 1.043253993552045, "learning_rate": 2.9477984906113395e-06, "loss": 0.0372, "step": 4866 }, { "epoch": 2.2142857142857144, "grad_norm": 1.4102302853778943, "learning_rate": 2.9470953820712737e-06, "loss": 0.049, "step": 4867 }, { "epoch": 2.2147406733393993, "grad_norm": 1.168605638817134, "learning_rate": 2.946392236996592e-06, "loss": 0.0339, "step": 4868 }, { "epoch": 2.2151956323930846, "grad_norm": 0.9448063598544195, "learning_rate": 2.9456890554447527e-06, "loss": 0.0664, "step": 4869 }, { "epoch": 2.21565059144677, "grad_norm": 1.3155378254099435, "learning_rate": 2.944985837473217e-06, "loss": 0.034, "step": 4870 }, { "epoch": 2.2161055505004548, "grad_norm": 1.0658320910831403, "learning_rate": 2.9442825831394474e-06, "loss": 0.0317, "step": 4871 }, { "epoch": 2.21656050955414, "grad_norm": 1.163958867014049, "learning_rate": 2.9435792925009123e-06, "loss": 0.0731, "step": 4872 }, { "epoch": 2.2170154686078254, "grad_norm": 1.2916288289435573, "learning_rate": 2.9428759656150795e-06, "loss": 0.0375, "step": 4873 }, { "epoch": 2.2174704276615103, "grad_norm": 1.147735374534481, "learning_rate": 2.9421726025394235e-06, "loss": 0.0682, "step": 4874 }, { "epoch": 2.2179253867151956, "grad_norm": 1.1136888564721377, "learning_rate": 2.9414692033314198e-06, "loss": 0.0317, "step": 4875 }, { "epoch": 2.218380345768881, "grad_norm": 0.8113319291478326, "learning_rate": 2.9407657680485454e-06, "loss": 0.055, "step": 4876 }, { "epoch": 2.2188353048225657, "grad_norm": 1.1451135680369162, "learning_rate": 2.9400622967482838e-06, "loss": 0.0456, "step": 4877 }, { "epoch": 2.219290263876251, "grad_norm": 1.2671700602688307, "learning_rate": 2.939358789488118e-06, "loss": 0.0367, "step": 4878 }, { "epoch": 2.2197452229299364, "grad_norm": 0.7789086900584208, "learning_rate": 2.9386552463255364e-06, "loss": 0.0341, "step": 4879 }, { "epoch": 2.2202001819836217, "grad_norm": 1.251411530606978, "learning_rate": 2.937951667318028e-06, "loss": 0.052, "step": 4880 }, { "epoch": 2.2206551410373065, "grad_norm": 1.1231464963302638, "learning_rate": 2.9372480525230884e-06, "loss": 0.0485, "step": 4881 }, { "epoch": 2.221110100090992, "grad_norm": 0.8834008604403095, "learning_rate": 2.936544401998212e-06, "loss": 0.0351, "step": 4882 }, { "epoch": 2.221565059144677, "grad_norm": 1.4884199311745954, "learning_rate": 2.9358407158008984e-06, "loss": 0.0319, "step": 4883 }, { "epoch": 2.222020018198362, "grad_norm": 1.4668019239209842, "learning_rate": 2.9351369939886504e-06, "loss": 0.0381, "step": 4884 }, { "epoch": 2.2224749772520473, "grad_norm": 1.505351767811715, "learning_rate": 2.934433236618972e-06, "loss": 0.0538, "step": 4885 }, { "epoch": 2.2229299363057327, "grad_norm": 1.9950487032313242, "learning_rate": 2.9337294437493715e-06, "loss": 0.0552, "step": 4886 }, { "epoch": 2.2233848953594175, "grad_norm": 0.8678421183491278, "learning_rate": 2.9330256154373595e-06, "loss": 0.0128, "step": 4887 }, { "epoch": 2.223839854413103, "grad_norm": 1.8603184452541563, "learning_rate": 2.9323217517404488e-06, "loss": 0.056, "step": 4888 }, { "epoch": 2.224294813466788, "grad_norm": 1.31208362189531, "learning_rate": 2.9316178527161583e-06, "loss": 0.0822, "step": 4889 }, { "epoch": 2.224749772520473, "grad_norm": 1.6368688596565661, "learning_rate": 2.930913918422005e-06, "loss": 0.0463, "step": 4890 }, { "epoch": 2.2252047315741583, "grad_norm": 1.5262792112347499, "learning_rate": 2.9302099489155126e-06, "loss": 0.0798, "step": 4891 }, { "epoch": 2.2256596906278436, "grad_norm": 1.6341702024810936, "learning_rate": 2.929505944254206e-06, "loss": 0.0585, "step": 4892 }, { "epoch": 2.2261146496815285, "grad_norm": 1.2279105540612898, "learning_rate": 2.928801904495614e-06, "loss": 0.0358, "step": 4893 }, { "epoch": 2.226569608735214, "grad_norm": 1.1859415882092814, "learning_rate": 2.9280978296972657e-06, "loss": 0.0452, "step": 4894 }, { "epoch": 2.227024567788899, "grad_norm": 1.8766565789218665, "learning_rate": 2.9273937199166962e-06, "loss": 0.0607, "step": 4895 }, { "epoch": 2.227479526842584, "grad_norm": 1.454420850804186, "learning_rate": 2.9266895752114426e-06, "loss": 0.0675, "step": 4896 }, { "epoch": 2.2279344858962693, "grad_norm": 1.9225730674558246, "learning_rate": 2.925985395639043e-06, "loss": 0.0482, "step": 4897 }, { "epoch": 2.2283894449499546, "grad_norm": 1.234267225480716, "learning_rate": 2.9252811812570415e-06, "loss": 0.0445, "step": 4898 }, { "epoch": 2.2288444040036395, "grad_norm": 0.8454416741441403, "learning_rate": 2.9245769321229817e-06, "loss": 0.0309, "step": 4899 }, { "epoch": 2.229299363057325, "grad_norm": 2.0829682106297276, "learning_rate": 2.9238726482944134e-06, "loss": 0.0346, "step": 4900 }, { "epoch": 2.22975432211101, "grad_norm": 1.4289499163893815, "learning_rate": 2.9231683298288853e-06, "loss": 0.0443, "step": 4901 }, { "epoch": 2.2302092811646954, "grad_norm": 1.2120000684052246, "learning_rate": 2.922463976783953e-06, "loss": 0.026, "step": 4902 }, { "epoch": 2.2306642402183803, "grad_norm": 1.1403352846154797, "learning_rate": 2.9217595892171724e-06, "loss": 0.0298, "step": 4903 }, { "epoch": 2.2311191992720656, "grad_norm": 1.5188561498196684, "learning_rate": 2.9210551671861016e-06, "loss": 0.0603, "step": 4904 }, { "epoch": 2.231574158325751, "grad_norm": 1.2862214149638092, "learning_rate": 2.9203507107483055e-06, "loss": 0.0262, "step": 4905 }, { "epoch": 2.2320291173794358, "grad_norm": 1.576426355773263, "learning_rate": 2.9196462199613473e-06, "loss": 0.0387, "step": 4906 }, { "epoch": 2.232484076433121, "grad_norm": 1.078068667555504, "learning_rate": 2.9189416948827946e-06, "loss": 0.0325, "step": 4907 }, { "epoch": 2.2329390354868064, "grad_norm": 1.1731985013806197, "learning_rate": 2.9182371355702188e-06, "loss": 0.036, "step": 4908 }, { "epoch": 2.2333939945404913, "grad_norm": 2.3414875136339846, "learning_rate": 2.917532542081193e-06, "loss": 0.047, "step": 4909 }, { "epoch": 2.2338489535941766, "grad_norm": 1.6784982408401712, "learning_rate": 2.9168279144732936e-06, "loss": 0.0446, "step": 4910 }, { "epoch": 2.234303912647862, "grad_norm": 1.2567375624989257, "learning_rate": 2.916123252804099e-06, "loss": 0.0558, "step": 4911 }, { "epoch": 2.2347588717015467, "grad_norm": 1.1480049784198874, "learning_rate": 2.915418557131192e-06, "loss": 0.065, "step": 4912 }, { "epoch": 2.235213830755232, "grad_norm": 1.9188670985562968, "learning_rate": 2.914713827512156e-06, "loss": 0.0771, "step": 4913 }, { "epoch": 2.2356687898089174, "grad_norm": 1.2517438994337773, "learning_rate": 2.914009064004578e-06, "loss": 0.0687, "step": 4914 }, { "epoch": 2.2361237488626022, "grad_norm": 1.6205787914239442, "learning_rate": 2.9133042666660505e-06, "loss": 0.0446, "step": 4915 }, { "epoch": 2.2365787079162875, "grad_norm": 1.1897351925893138, "learning_rate": 2.912599435554164e-06, "loss": 0.0305, "step": 4916 }, { "epoch": 2.237033666969973, "grad_norm": 0.7176375144818231, "learning_rate": 2.9118945707265154e-06, "loss": 0.0471, "step": 4917 }, { "epoch": 2.2374886260236577, "grad_norm": 1.4800895277515258, "learning_rate": 2.911189672240702e-06, "loss": 0.0912, "step": 4918 }, { "epoch": 2.237943585077343, "grad_norm": 1.2772966738303304, "learning_rate": 2.910484740154326e-06, "loss": 0.037, "step": 4919 }, { "epoch": 2.2383985441310283, "grad_norm": 1.0265105019147462, "learning_rate": 2.909779774524991e-06, "loss": 0.0187, "step": 4920 }, { "epoch": 2.238853503184713, "grad_norm": 1.3955114070136738, "learning_rate": 2.9090747754103022e-06, "loss": 0.0419, "step": 4921 }, { "epoch": 2.2393084622383985, "grad_norm": 1.129964077241819, "learning_rate": 2.9083697428678713e-06, "loss": 0.0362, "step": 4922 }, { "epoch": 2.239763421292084, "grad_norm": 1.2936384937372054, "learning_rate": 2.907664676955309e-06, "loss": 0.0418, "step": 4923 }, { "epoch": 2.2402183803457687, "grad_norm": 1.4885963977451986, "learning_rate": 2.90695957773023e-06, "loss": 0.0302, "step": 4924 }, { "epoch": 2.240673339399454, "grad_norm": 1.1734005265019614, "learning_rate": 2.9062544452502515e-06, "loss": 0.0369, "step": 4925 }, { "epoch": 2.2411282984531393, "grad_norm": 1.2889898225251004, "learning_rate": 2.9055492795729954e-06, "loss": 0.0558, "step": 4926 }, { "epoch": 2.241583257506824, "grad_norm": 1.4647780100269598, "learning_rate": 2.9048440807560836e-06, "loss": 0.0748, "step": 4927 }, { "epoch": 2.2420382165605095, "grad_norm": 1.2033199753789243, "learning_rate": 2.9041388488571416e-06, "loss": 0.0378, "step": 4928 }, { "epoch": 2.242493175614195, "grad_norm": 2.0222356501495664, "learning_rate": 2.9034335839337975e-06, "loss": 0.0717, "step": 4929 }, { "epoch": 2.2429481346678797, "grad_norm": 1.1716655585477946, "learning_rate": 2.9027282860436833e-06, "loss": 0.0346, "step": 4930 }, { "epoch": 2.243403093721565, "grad_norm": 1.7427411660636234, "learning_rate": 2.902022955244432e-06, "loss": 0.0542, "step": 4931 }, { "epoch": 2.2438580527752503, "grad_norm": 1.4431290872974885, "learning_rate": 2.90131759159368e-06, "loss": 0.0259, "step": 4932 }, { "epoch": 2.244313011828935, "grad_norm": 1.7384130589338576, "learning_rate": 2.9006121951490673e-06, "loss": 0.0637, "step": 4933 }, { "epoch": 2.2447679708826205, "grad_norm": 1.214745128555294, "learning_rate": 2.899906765968235e-06, "loss": 0.0606, "step": 4934 }, { "epoch": 2.245222929936306, "grad_norm": 1.3213274322664825, "learning_rate": 2.8992013041088274e-06, "loss": 0.0357, "step": 4935 }, { "epoch": 2.245677888989991, "grad_norm": 0.9031469131152714, "learning_rate": 2.8984958096284927e-06, "loss": 0.0285, "step": 4936 }, { "epoch": 2.246132848043676, "grad_norm": 1.161428589696332, "learning_rate": 2.8977902825848798e-06, "loss": 0.0431, "step": 4937 }, { "epoch": 2.2465878070973613, "grad_norm": 1.2961951955906217, "learning_rate": 2.8970847230356414e-06, "loss": 0.0766, "step": 4938 }, { "epoch": 2.2470427661510466, "grad_norm": 1.5849880269234837, "learning_rate": 2.896379131038432e-06, "loss": 0.0981, "step": 4939 }, { "epoch": 2.2474977252047315, "grad_norm": 1.356275929557653, "learning_rate": 2.8956735066509113e-06, "loss": 0.0338, "step": 4940 }, { "epoch": 2.2479526842584168, "grad_norm": 1.2856006494755698, "learning_rate": 2.8949678499307376e-06, "loss": 0.1089, "step": 4941 }, { "epoch": 2.248407643312102, "grad_norm": 1.2042897035013165, "learning_rate": 2.894262160935575e-06, "loss": 0.039, "step": 4942 }, { "epoch": 2.248862602365787, "grad_norm": 2.1200683598378554, "learning_rate": 2.8935564397230885e-06, "loss": 0.0629, "step": 4943 }, { "epoch": 2.2493175614194723, "grad_norm": 0.9825819590323274, "learning_rate": 2.892850686350948e-06, "loss": 0.0399, "step": 4944 }, { "epoch": 2.2497725204731576, "grad_norm": 1.6550430592613854, "learning_rate": 2.892144900876823e-06, "loss": 0.0518, "step": 4945 }, { "epoch": 2.2502274795268424, "grad_norm": 1.4385403740003337, "learning_rate": 2.8914390833583877e-06, "loss": 0.0828, "step": 4946 }, { "epoch": 2.2506824385805277, "grad_norm": 1.3112649257110762, "learning_rate": 2.8907332338533182e-06, "loss": 0.0369, "step": 4947 }, { "epoch": 2.251137397634213, "grad_norm": 2.310372566281053, "learning_rate": 2.8900273524192936e-06, "loss": 0.0795, "step": 4948 }, { "epoch": 2.251592356687898, "grad_norm": 1.6500817140605286, "learning_rate": 2.889321439113995e-06, "loss": 0.0706, "step": 4949 }, { "epoch": 2.2520473157415832, "grad_norm": 1.2890952568099243, "learning_rate": 2.888615493995106e-06, "loss": 0.0529, "step": 4950 }, { "epoch": 2.2525022747952685, "grad_norm": 1.0839031554318346, "learning_rate": 2.8879095171203147e-06, "loss": 0.0284, "step": 4951 }, { "epoch": 2.2529572338489534, "grad_norm": 1.2303230435990236, "learning_rate": 2.887203508547309e-06, "loss": 0.032, "step": 4952 }, { "epoch": 2.2534121929026387, "grad_norm": 1.5244008506321645, "learning_rate": 2.886497468333781e-06, "loss": 0.046, "step": 4953 }, { "epoch": 2.253867151956324, "grad_norm": 0.9775657805597391, "learning_rate": 2.8857913965374264e-06, "loss": 0.0267, "step": 4954 }, { "epoch": 2.2543221110100093, "grad_norm": 1.4894899908996087, "learning_rate": 2.88508529321594e-06, "loss": 0.034, "step": 4955 }, { "epoch": 2.254777070063694, "grad_norm": 1.5977157507211035, "learning_rate": 2.8843791584270226e-06, "loss": 0.037, "step": 4956 }, { "epoch": 2.2552320291173795, "grad_norm": 1.1706331137531776, "learning_rate": 2.8836729922283756e-06, "loss": 0.0259, "step": 4957 }, { "epoch": 2.255686988171065, "grad_norm": 1.134580852464133, "learning_rate": 2.8829667946777058e-06, "loss": 0.0437, "step": 4958 }, { "epoch": 2.2561419472247497, "grad_norm": 2.35934662766806, "learning_rate": 2.8822605658327184e-06, "loss": 0.0636, "step": 4959 }, { "epoch": 2.256596906278435, "grad_norm": 1.137272029851148, "learning_rate": 2.8815543057511232e-06, "loss": 0.0381, "step": 4960 }, { "epoch": 2.2570518653321203, "grad_norm": 1.5886811467070987, "learning_rate": 2.8808480144906344e-06, "loss": 0.079, "step": 4961 }, { "epoch": 2.257506824385805, "grad_norm": 1.0887023848657293, "learning_rate": 2.8801416921089642e-06, "loss": 0.0374, "step": 4962 }, { "epoch": 2.2579617834394905, "grad_norm": 1.0654005809893197, "learning_rate": 2.8794353386638324e-06, "loss": 0.0123, "step": 4963 }, { "epoch": 2.258416742493176, "grad_norm": 1.1816844514632245, "learning_rate": 2.8787289542129588e-06, "loss": 0.0435, "step": 4964 }, { "epoch": 2.2588717015468607, "grad_norm": 1.3102466095972438, "learning_rate": 2.8780225388140648e-06, "loss": 0.0156, "step": 4965 }, { "epoch": 2.259326660600546, "grad_norm": 1.1795717561769457, "learning_rate": 2.8773160925248766e-06, "loss": 0.0337, "step": 4966 }, { "epoch": 2.2597816196542313, "grad_norm": 1.083863406608199, "learning_rate": 2.87660961540312e-06, "loss": 0.0327, "step": 4967 }, { "epoch": 2.260236578707916, "grad_norm": 1.1900939968823376, "learning_rate": 2.8759031075065276e-06, "loss": 0.0238, "step": 4968 }, { "epoch": 2.2606915377616015, "grad_norm": 1.1896965640150095, "learning_rate": 2.87519656889283e-06, "loss": 0.0255, "step": 4969 }, { "epoch": 2.261146496815287, "grad_norm": 1.2940057222699595, "learning_rate": 2.874489999619764e-06, "loss": 0.0315, "step": 4970 }, { "epoch": 2.2616014558689717, "grad_norm": 1.6113696819027903, "learning_rate": 2.8737833997450658e-06, "loss": 0.1023, "step": 4971 }, { "epoch": 2.262056414922657, "grad_norm": 1.6858991884281305, "learning_rate": 2.8730767693264765e-06, "loss": 0.04, "step": 4972 }, { "epoch": 2.2625113739763423, "grad_norm": 1.4254819124592668, "learning_rate": 2.8723701084217388e-06, "loss": 0.0842, "step": 4973 }, { "epoch": 2.262966333030027, "grad_norm": 1.4538932892589904, "learning_rate": 2.871663417088596e-06, "loss": 0.0616, "step": 4974 }, { "epoch": 2.2634212920837125, "grad_norm": 2.088679539340773, "learning_rate": 2.8709566953847984e-06, "loss": 0.1011, "step": 4975 }, { "epoch": 2.2638762511373978, "grad_norm": 1.3612500391113815, "learning_rate": 2.870249943368095e-06, "loss": 0.0731, "step": 4976 }, { "epoch": 2.2643312101910826, "grad_norm": 1.2762013593381327, "learning_rate": 2.869543161096237e-06, "loss": 0.0436, "step": 4977 }, { "epoch": 2.264786169244768, "grad_norm": 1.6452805838613949, "learning_rate": 2.868836348626982e-06, "loss": 0.0357, "step": 4978 }, { "epoch": 2.2652411282984533, "grad_norm": 1.2421102158384851, "learning_rate": 2.8681295060180856e-06, "loss": 0.0473, "step": 4979 }, { "epoch": 2.265696087352138, "grad_norm": 1.1310942720566495, "learning_rate": 2.8674226333273093e-06, "loss": 0.0182, "step": 4980 }, { "epoch": 2.2661510464058234, "grad_norm": 1.4421140244272375, "learning_rate": 2.866715730612414e-06, "loss": 0.0246, "step": 4981 }, { "epoch": 2.2666060054595087, "grad_norm": 2.1145918372143693, "learning_rate": 2.8660087979311647e-06, "loss": 0.0439, "step": 4982 }, { "epoch": 2.2670609645131936, "grad_norm": 1.2946735143348782, "learning_rate": 2.8653018353413305e-06, "loss": 0.0952, "step": 4983 }, { "epoch": 2.267515923566879, "grad_norm": 1.1985252252975946, "learning_rate": 2.8645948429006787e-06, "loss": 0.0496, "step": 4984 }, { "epoch": 2.2679708826205642, "grad_norm": 1.043170686672923, "learning_rate": 2.863887820666984e-06, "loss": 0.0424, "step": 4985 }, { "epoch": 2.268425841674249, "grad_norm": 1.5472788851286614, "learning_rate": 2.863180768698019e-06, "loss": 0.0591, "step": 4986 }, { "epoch": 2.2688808007279344, "grad_norm": 1.4384557612605513, "learning_rate": 2.8624736870515624e-06, "loss": 0.056, "step": 4987 }, { "epoch": 2.2693357597816197, "grad_norm": 1.4273142177434686, "learning_rate": 2.8617665757853925e-06, "loss": 0.0402, "step": 4988 }, { "epoch": 2.2697907188353046, "grad_norm": 2.4727622021614346, "learning_rate": 2.8610594349572917e-06, "loss": 0.0443, "step": 4989 }, { "epoch": 2.27024567788899, "grad_norm": 1.7151788234302556, "learning_rate": 2.8603522646250453e-06, "loss": 0.0422, "step": 4990 }, { "epoch": 2.270700636942675, "grad_norm": 1.7797404568054, "learning_rate": 2.859645064846438e-06, "loss": 0.0342, "step": 4991 }, { "epoch": 2.2711555959963605, "grad_norm": 1.5768019146398888, "learning_rate": 2.8589378356792607e-06, "loss": 0.0385, "step": 4992 }, { "epoch": 2.2716105550500454, "grad_norm": 1.9495283814506403, "learning_rate": 2.8582305771813047e-06, "loss": 0.068, "step": 4993 }, { "epoch": 2.2720655141037307, "grad_norm": 2.6729818914830865, "learning_rate": 2.857523289410363e-06, "loss": 0.0708, "step": 4994 }, { "epoch": 2.272520473157416, "grad_norm": 1.3866786061258438, "learning_rate": 2.8568159724242333e-06, "loss": 0.0873, "step": 4995 }, { "epoch": 2.272975432211101, "grad_norm": 1.2678634233522372, "learning_rate": 2.856108626280713e-06, "loss": 0.0483, "step": 4996 }, { "epoch": 2.273430391264786, "grad_norm": 1.2101816558376715, "learning_rate": 2.855401251037605e-06, "loss": 0.0186, "step": 4997 }, { "epoch": 2.2738853503184715, "grad_norm": 1.4462330773757621, "learning_rate": 2.8546938467527106e-06, "loss": 0.0751, "step": 4998 }, { "epoch": 2.2743403093721564, "grad_norm": 1.4408916473048548, "learning_rate": 2.8539864134838374e-06, "loss": 0.0723, "step": 4999 }, { "epoch": 2.2747952684258417, "grad_norm": 1.3715766541348446, "learning_rate": 2.8532789512887936e-06, "loss": 0.0547, "step": 5000 }, { "epoch": 2.275250227479527, "grad_norm": 1.487689291571814, "learning_rate": 2.8525714602253885e-06, "loss": 0.0595, "step": 5001 }, { "epoch": 2.275705186533212, "grad_norm": 1.308041152963123, "learning_rate": 2.851863940351436e-06, "loss": 0.0306, "step": 5002 }, { "epoch": 2.276160145586897, "grad_norm": 1.2552913864142992, "learning_rate": 2.851156391724751e-06, "loss": 0.0492, "step": 5003 }, { "epoch": 2.2766151046405825, "grad_norm": 1.1551718346501334, "learning_rate": 2.850448814403152e-06, "loss": 0.0276, "step": 5004 }, { "epoch": 2.2770700636942673, "grad_norm": 1.7056803342532538, "learning_rate": 2.8497412084444585e-06, "loss": 0.0422, "step": 5005 }, { "epoch": 2.2775250227479527, "grad_norm": 1.2118929646199736, "learning_rate": 2.849033573906493e-06, "loss": 0.0218, "step": 5006 }, { "epoch": 2.277979981801638, "grad_norm": 1.147506014680052, "learning_rate": 2.8483259108470796e-06, "loss": 0.0653, "step": 5007 }, { "epoch": 2.278434940855323, "grad_norm": 1.0994511141177354, "learning_rate": 2.8476182193240458e-06, "loss": 0.032, "step": 5008 }, { "epoch": 2.278889899909008, "grad_norm": 1.268148936459825, "learning_rate": 2.846910499395221e-06, "loss": 0.0513, "step": 5009 }, { "epoch": 2.2793448589626935, "grad_norm": 1.5352327518983082, "learning_rate": 2.846202751118437e-06, "loss": 0.0727, "step": 5010 }, { "epoch": 2.2797998180163788, "grad_norm": 1.8654276600307438, "learning_rate": 2.845494974551528e-06, "loss": 0.0588, "step": 5011 }, { "epoch": 2.2802547770700636, "grad_norm": 1.428276110728018, "learning_rate": 2.8447871697523294e-06, "loss": 0.0646, "step": 5012 }, { "epoch": 2.280709736123749, "grad_norm": 1.6735288762093976, "learning_rate": 2.84407933677868e-06, "loss": 0.0635, "step": 5013 }, { "epoch": 2.2811646951774343, "grad_norm": 1.3116809307076178, "learning_rate": 2.843371475688422e-06, "loss": 0.0463, "step": 5014 }, { "epoch": 2.281619654231119, "grad_norm": 1.3335555694530055, "learning_rate": 2.842663586539397e-06, "loss": 0.0421, "step": 5015 }, { "epoch": 2.2820746132848044, "grad_norm": 1.216106492340112, "learning_rate": 2.841955669389451e-06, "loss": 0.0303, "step": 5016 }, { "epoch": 2.2825295723384897, "grad_norm": 0.9014150871751705, "learning_rate": 2.8412477242964326e-06, "loss": 0.0527, "step": 5017 }, { "epoch": 2.2829845313921746, "grad_norm": 1.727107966282504, "learning_rate": 2.840539751318191e-06, "loss": 0.0579, "step": 5018 }, { "epoch": 2.28343949044586, "grad_norm": 1.4047672763283032, "learning_rate": 2.8398317505125783e-06, "loss": 0.0503, "step": 5019 }, { "epoch": 2.2838944494995452, "grad_norm": 1.2271315436022472, "learning_rate": 2.8391237219374495e-06, "loss": 0.0456, "step": 5020 }, { "epoch": 2.28434940855323, "grad_norm": 1.2402337961062408, "learning_rate": 2.838415665650663e-06, "loss": 0.0542, "step": 5021 }, { "epoch": 2.2848043676069154, "grad_norm": 1.490810070095296, "learning_rate": 2.837707581710075e-06, "loss": 0.0349, "step": 5022 }, { "epoch": 2.2852593266606007, "grad_norm": 1.346758170846685, "learning_rate": 2.836999470173549e-06, "loss": 0.0488, "step": 5023 }, { "epoch": 2.2857142857142856, "grad_norm": 1.1149368243372237, "learning_rate": 2.8362913310989485e-06, "loss": 0.0253, "step": 5024 }, { "epoch": 2.286169244767971, "grad_norm": 1.1385456912076495, "learning_rate": 2.835583164544139e-06, "loss": 0.0349, "step": 5025 }, { "epoch": 2.286624203821656, "grad_norm": 1.0576810990383791, "learning_rate": 2.834874970566989e-06, "loss": 0.0289, "step": 5026 }, { "epoch": 2.287079162875341, "grad_norm": 1.0987752771872525, "learning_rate": 2.834166749225368e-06, "loss": 0.0446, "step": 5027 }, { "epoch": 2.2875341219290264, "grad_norm": 1.1737374642100065, "learning_rate": 2.83345850057715e-06, "loss": 0.0626, "step": 5028 }, { "epoch": 2.2879890809827117, "grad_norm": 1.029172100043105, "learning_rate": 2.832750224680209e-06, "loss": 0.0697, "step": 5029 }, { "epoch": 2.2884440400363966, "grad_norm": 1.2590289807639075, "learning_rate": 2.8320419215924217e-06, "loss": 0.0432, "step": 5030 }, { "epoch": 2.288898999090082, "grad_norm": 0.9896857204012415, "learning_rate": 2.831333591371669e-06, "loss": 0.0447, "step": 5031 }, { "epoch": 2.289353958143767, "grad_norm": 1.6221183732606554, "learning_rate": 2.830625234075831e-06, "loss": 0.0376, "step": 5032 }, { "epoch": 2.289808917197452, "grad_norm": 1.1663081083922142, "learning_rate": 2.829916849762792e-06, "loss": 0.0672, "step": 5033 }, { "epoch": 2.2902638762511374, "grad_norm": 1.4231028247806472, "learning_rate": 2.8292084384904383e-06, "loss": 0.0536, "step": 5034 }, { "epoch": 2.2907188353048227, "grad_norm": 1.7111534667891934, "learning_rate": 2.8285000003166574e-06, "loss": 0.0582, "step": 5035 }, { "epoch": 2.2911737943585075, "grad_norm": 1.4045682622691635, "learning_rate": 2.8277915352993403e-06, "loss": 0.0328, "step": 5036 }, { "epoch": 2.291628753412193, "grad_norm": 0.8542716649033394, "learning_rate": 2.8270830434963783e-06, "loss": 0.0294, "step": 5037 }, { "epoch": 2.292083712465878, "grad_norm": 1.2792267902306818, "learning_rate": 2.826374524965668e-06, "loss": 0.0653, "step": 5038 }, { "epoch": 2.292538671519563, "grad_norm": 1.3183376388483907, "learning_rate": 2.825665979765105e-06, "loss": 0.0605, "step": 5039 }, { "epoch": 2.2929936305732483, "grad_norm": 1.133292781329831, "learning_rate": 2.8249574079525887e-06, "loss": 0.053, "step": 5040 }, { "epoch": 2.2934485896269337, "grad_norm": 1.3149336130453024, "learning_rate": 2.824248809586021e-06, "loss": 0.0504, "step": 5041 }, { "epoch": 2.2939035486806185, "grad_norm": 1.1362206413808167, "learning_rate": 2.8235401847233045e-06, "loss": 0.0493, "step": 5042 }, { "epoch": 2.294358507734304, "grad_norm": 1.0853721705912787, "learning_rate": 2.822831533422346e-06, "loss": 0.0176, "step": 5043 }, { "epoch": 2.294813466787989, "grad_norm": 2.36916812435566, "learning_rate": 2.8221228557410505e-06, "loss": 0.0582, "step": 5044 }, { "epoch": 2.295268425841674, "grad_norm": 1.2112443378538411, "learning_rate": 2.8214141517373324e-06, "loss": 0.0737, "step": 5045 }, { "epoch": 2.2957233848953593, "grad_norm": 1.719709722038787, "learning_rate": 2.8207054214691e-06, "loss": 0.0352, "step": 5046 }, { "epoch": 2.2961783439490446, "grad_norm": 1.3639816096974777, "learning_rate": 2.8199966649942683e-06, "loss": 0.0437, "step": 5047 }, { "epoch": 2.29663330300273, "grad_norm": 1.231794446965683, "learning_rate": 2.8192878823707554e-06, "loss": 0.0361, "step": 5048 }, { "epoch": 2.297088262056415, "grad_norm": 1.2684930164635728, "learning_rate": 2.818579073656478e-06, "loss": 0.0264, "step": 5049 }, { "epoch": 2.2975432211101, "grad_norm": 1.6964417798576983, "learning_rate": 2.817870238909358e-06, "loss": 0.0603, "step": 5050 }, { "epoch": 2.2979981801637854, "grad_norm": 1.1776691086494184, "learning_rate": 2.817161378187317e-06, "loss": 0.0205, "step": 5051 }, { "epoch": 2.2984531392174703, "grad_norm": 1.457160712919205, "learning_rate": 2.8164524915482804e-06, "loss": 0.0315, "step": 5052 }, { "epoch": 2.2989080982711556, "grad_norm": 1.1610727627295971, "learning_rate": 2.815743579050176e-06, "loss": 0.036, "step": 5053 }, { "epoch": 2.299363057324841, "grad_norm": 1.494479445750586, "learning_rate": 2.815034640750931e-06, "loss": 0.0254, "step": 5054 }, { "epoch": 2.299818016378526, "grad_norm": 1.608648898688762, "learning_rate": 2.8143256767084785e-06, "loss": 0.0412, "step": 5055 }, { "epoch": 2.300272975432211, "grad_norm": 1.9906110826560912, "learning_rate": 2.8136166869807513e-06, "loss": 0.0316, "step": 5056 }, { "epoch": 2.3007279344858964, "grad_norm": 2.2330087360632507, "learning_rate": 2.812907671625685e-06, "loss": 0.0486, "step": 5057 }, { "epoch": 2.3011828935395813, "grad_norm": 1.4088846853514039, "learning_rate": 2.812198630701216e-06, "loss": 0.0407, "step": 5058 }, { "epoch": 2.3016378525932666, "grad_norm": 1.8490382112611876, "learning_rate": 2.811489564265285e-06, "loss": 0.0461, "step": 5059 }, { "epoch": 2.302092811646952, "grad_norm": 1.0195260237687804, "learning_rate": 2.810780472375834e-06, "loss": 0.0575, "step": 5060 }, { "epoch": 2.3025477707006368, "grad_norm": 1.108702575790503, "learning_rate": 2.8100713550908053e-06, "loss": 0.0137, "step": 5061 }, { "epoch": 2.303002729754322, "grad_norm": 0.9423130020283254, "learning_rate": 2.8093622124681474e-06, "loss": 0.0204, "step": 5062 }, { "epoch": 2.3034576888080074, "grad_norm": 1.1998796598419361, "learning_rate": 2.808653044565805e-06, "loss": 0.0255, "step": 5063 }, { "epoch": 2.3039126478616927, "grad_norm": 1.7116561415864575, "learning_rate": 2.807943851441731e-06, "loss": 0.0447, "step": 5064 }, { "epoch": 2.3043676069153776, "grad_norm": 1.3936375659890752, "learning_rate": 2.807234633153875e-06, "loss": 0.0461, "step": 5065 }, { "epoch": 2.304822565969063, "grad_norm": 1.4833399321845686, "learning_rate": 2.8065253897601924e-06, "loss": 0.0446, "step": 5066 }, { "epoch": 2.305277525022748, "grad_norm": 1.2594120714654462, "learning_rate": 2.8058161213186396e-06, "loss": 0.0365, "step": 5067 }, { "epoch": 2.305732484076433, "grad_norm": 1.2260727550783885, "learning_rate": 2.8051068278871746e-06, "loss": 0.018, "step": 5068 }, { "epoch": 2.3061874431301184, "grad_norm": 1.5519539696696174, "learning_rate": 2.804397509523757e-06, "loss": 0.0381, "step": 5069 }, { "epoch": 2.3066424021838037, "grad_norm": 1.243395086808284, "learning_rate": 2.8036881662863497e-06, "loss": 0.0327, "step": 5070 }, { "epoch": 2.3070973612374885, "grad_norm": 1.0347373912739795, "learning_rate": 2.802978798232917e-06, "loss": 0.0525, "step": 5071 }, { "epoch": 2.307552320291174, "grad_norm": 1.5625671413983802, "learning_rate": 2.802269405421425e-06, "loss": 0.0449, "step": 5072 }, { "epoch": 2.308007279344859, "grad_norm": 1.3105098590147573, "learning_rate": 2.801559987909842e-06, "loss": 0.034, "step": 5073 }, { "epoch": 2.308462238398544, "grad_norm": 1.1891098254648602, "learning_rate": 2.8008505457561393e-06, "loss": 0.062, "step": 5074 }, { "epoch": 2.3089171974522293, "grad_norm": 1.2021483913505118, "learning_rate": 2.8001410790182876e-06, "loss": 0.0386, "step": 5075 }, { "epoch": 2.3093721565059147, "grad_norm": 1.5728988514267814, "learning_rate": 2.799431587754263e-06, "loss": 0.0356, "step": 5076 }, { "epoch": 2.3098271155595995, "grad_norm": 1.278741531680671, "learning_rate": 2.7987220720220415e-06, "loss": 0.056, "step": 5077 }, { "epoch": 2.310282074613285, "grad_norm": 1.7316044321477053, "learning_rate": 2.798012531879601e-06, "loss": 0.0882, "step": 5078 }, { "epoch": 2.31073703366697, "grad_norm": 0.7167404742238086, "learning_rate": 2.7973029673849224e-06, "loss": 0.0368, "step": 5079 }, { "epoch": 2.311191992720655, "grad_norm": 1.4480217361785621, "learning_rate": 2.796593378595987e-06, "loss": 0.0369, "step": 5080 }, { "epoch": 2.3116469517743403, "grad_norm": 1.4234855152267074, "learning_rate": 2.7958837655707817e-06, "loss": 0.0988, "step": 5081 }, { "epoch": 2.3121019108280256, "grad_norm": 1.2876862045609432, "learning_rate": 2.79517412836729e-06, "loss": 0.0658, "step": 5082 }, { "epoch": 2.3125568698817105, "grad_norm": 1.5347601919276819, "learning_rate": 2.7944644670435015e-06, "loss": 0.0415, "step": 5083 }, { "epoch": 2.313011828935396, "grad_norm": 1.9823145519136762, "learning_rate": 2.7937547816574073e-06, "loss": 0.0591, "step": 5084 }, { "epoch": 2.313466787989081, "grad_norm": 1.4517892575557587, "learning_rate": 2.793045072266999e-06, "loss": 0.0619, "step": 5085 }, { "epoch": 2.313921747042766, "grad_norm": 1.5306739062903967, "learning_rate": 2.79233533893027e-06, "loss": 0.0922, "step": 5086 }, { "epoch": 2.3143767060964513, "grad_norm": 1.147837234231125, "learning_rate": 2.791625581705218e-06, "loss": 0.0586, "step": 5087 }, { "epoch": 2.3148316651501366, "grad_norm": 1.4648620474943548, "learning_rate": 2.79091580064984e-06, "loss": 0.0481, "step": 5088 }, { "epoch": 2.3152866242038215, "grad_norm": 1.3147862733123061, "learning_rate": 2.7902059958221363e-06, "loss": 0.0686, "step": 5089 }, { "epoch": 2.315741583257507, "grad_norm": 1.2731684900687392, "learning_rate": 2.7894961672801095e-06, "loss": 0.0517, "step": 5090 }, { "epoch": 2.316196542311192, "grad_norm": 1.7330240712515197, "learning_rate": 2.7887863150817636e-06, "loss": 0.0561, "step": 5091 }, { "epoch": 2.316651501364877, "grad_norm": 1.433105608034936, "learning_rate": 2.788076439285104e-06, "loss": 0.056, "step": 5092 }, { "epoch": 2.3171064604185623, "grad_norm": 0.9566317344973143, "learning_rate": 2.7873665399481382e-06, "loss": 0.0324, "step": 5093 }, { "epoch": 2.3175614194722476, "grad_norm": 1.7019517653856582, "learning_rate": 2.7866566171288773e-06, "loss": 0.0407, "step": 5094 }, { "epoch": 2.3180163785259325, "grad_norm": 1.0783759320329898, "learning_rate": 2.7859466708853315e-06, "loss": 0.0398, "step": 5095 }, { "epoch": 2.3184713375796178, "grad_norm": 1.6618017809843626, "learning_rate": 2.785236701275515e-06, "loss": 0.0579, "step": 5096 }, { "epoch": 2.318926296633303, "grad_norm": 1.2407696168884286, "learning_rate": 2.7845267083574433e-06, "loss": 0.0592, "step": 5097 }, { "epoch": 2.319381255686988, "grad_norm": 2.271635277311075, "learning_rate": 2.783816692189135e-06, "loss": 0.045, "step": 5098 }, { "epoch": 2.3198362147406733, "grad_norm": 1.258918571218276, "learning_rate": 2.7831066528286075e-06, "loss": 0.078, "step": 5099 }, { "epoch": 2.3202911737943586, "grad_norm": 1.4380560238699254, "learning_rate": 2.782396590333883e-06, "loss": 0.0257, "step": 5100 }, { "epoch": 2.3207461328480434, "grad_norm": 1.3344502491230243, "learning_rate": 2.781686504762985e-06, "loss": 0.0457, "step": 5101 }, { "epoch": 2.3212010919017287, "grad_norm": 1.181095953352301, "learning_rate": 2.7809763961739366e-06, "loss": 0.0249, "step": 5102 }, { "epoch": 2.321656050955414, "grad_norm": 1.46773495878289, "learning_rate": 2.7802662646247667e-06, "loss": 0.0374, "step": 5103 }, { "epoch": 2.3221110100090994, "grad_norm": 1.2580869951184557, "learning_rate": 2.7795561101735035e-06, "loss": 0.0624, "step": 5104 }, { "epoch": 2.3225659690627842, "grad_norm": 1.6167385687816664, "learning_rate": 2.7788459328781777e-06, "loss": 0.0329, "step": 5105 }, { "epoch": 2.3230209281164695, "grad_norm": 1.4385657352976569, "learning_rate": 2.778135732796821e-06, "loss": 0.0531, "step": 5106 }, { "epoch": 2.323475887170155, "grad_norm": 1.0451546588389402, "learning_rate": 2.7774255099874676e-06, "loss": 0.0197, "step": 5107 }, { "epoch": 2.3239308462238397, "grad_norm": 1.9506975795944392, "learning_rate": 2.7767152645081557e-06, "loss": 0.0329, "step": 5108 }, { "epoch": 2.324385805277525, "grad_norm": 1.3982184885495055, "learning_rate": 2.776004996416921e-06, "loss": 0.0607, "step": 5109 }, { "epoch": 2.3248407643312103, "grad_norm": 1.6121595167431892, "learning_rate": 2.775294705771805e-06, "loss": 0.0451, "step": 5110 }, { "epoch": 2.325295723384895, "grad_norm": 1.1682220707863116, "learning_rate": 2.774584392630849e-06, "loss": 0.0605, "step": 5111 }, { "epoch": 2.3257506824385805, "grad_norm": 1.429318725951821, "learning_rate": 2.773874057052096e-06, "loss": 0.0612, "step": 5112 }, { "epoch": 2.326205641492266, "grad_norm": 1.0267411628757754, "learning_rate": 2.773163699093592e-06, "loss": 0.0335, "step": 5113 }, { "epoch": 2.3266606005459507, "grad_norm": 1.8305321390705005, "learning_rate": 2.772453318813384e-06, "loss": 0.0642, "step": 5114 }, { "epoch": 2.327115559599636, "grad_norm": 1.6752131735460964, "learning_rate": 2.7717429162695215e-06, "loss": 0.0643, "step": 5115 }, { "epoch": 2.3275705186533213, "grad_norm": 1.310004633234762, "learning_rate": 2.771032491520055e-06, "loss": 0.0377, "step": 5116 }, { "epoch": 2.328025477707006, "grad_norm": 1.5320231854213826, "learning_rate": 2.7703220446230367e-06, "loss": 0.0648, "step": 5117 }, { "epoch": 2.3284804367606915, "grad_norm": 1.4019569733451958, "learning_rate": 2.7696115756365227e-06, "loss": 0.0619, "step": 5118 }, { "epoch": 2.328935395814377, "grad_norm": 1.5717261696149054, "learning_rate": 2.768901084618567e-06, "loss": 0.0448, "step": 5119 }, { "epoch": 2.329390354868062, "grad_norm": 1.5245198153027137, "learning_rate": 2.7681905716272307e-06, "loss": 0.0735, "step": 5120 }, { "epoch": 2.329845313921747, "grad_norm": 1.1612923248285274, "learning_rate": 2.7674800367205707e-06, "loss": 0.0418, "step": 5121 }, { "epoch": 2.3303002729754323, "grad_norm": 1.277953993806982, "learning_rate": 2.7667694799566503e-06, "loss": 0.0305, "step": 5122 }, { "epoch": 2.3307552320291176, "grad_norm": 1.0931901029722626, "learning_rate": 2.7660589013935327e-06, "loss": 0.0488, "step": 5123 }, { "epoch": 2.3312101910828025, "grad_norm": 1.2200448868151161, "learning_rate": 2.765348301089283e-06, "loss": 0.0365, "step": 5124 }, { "epoch": 2.331665150136488, "grad_norm": 1.6392495895899475, "learning_rate": 2.764637679101969e-06, "loss": 0.0758, "step": 5125 }, { "epoch": 2.332120109190173, "grad_norm": 1.7409368663475637, "learning_rate": 2.7639270354896586e-06, "loss": 0.03, "step": 5126 }, { "epoch": 2.332575068243858, "grad_norm": 0.9852427635326125, "learning_rate": 2.763216370310423e-06, "loss": 0.039, "step": 5127 }, { "epoch": 2.3330300272975433, "grad_norm": 1.2031976286883186, "learning_rate": 2.762505683622334e-06, "loss": 0.017, "step": 5128 }, { "epoch": 2.3334849863512286, "grad_norm": 1.3799738709112226, "learning_rate": 2.761794975483466e-06, "loss": 0.0941, "step": 5129 }, { "epoch": 2.3339399454049135, "grad_norm": 1.2922739775423482, "learning_rate": 2.7610842459518957e-06, "loss": 0.0442, "step": 5130 }, { "epoch": 2.3343949044585988, "grad_norm": 1.0779050706540043, "learning_rate": 2.760373495085698e-06, "loss": 0.0305, "step": 5131 }, { "epoch": 2.334849863512284, "grad_norm": 1.4864566338909988, "learning_rate": 2.7596627229429557e-06, "loss": 0.0365, "step": 5132 }, { "epoch": 2.335304822565969, "grad_norm": 1.2085133762476339, "learning_rate": 2.758951929581748e-06, "loss": 0.0566, "step": 5133 }, { "epoch": 2.3357597816196543, "grad_norm": 1.2100904288822898, "learning_rate": 2.758241115060158e-06, "loss": 0.0559, "step": 5134 }, { "epoch": 2.3362147406733396, "grad_norm": 1.5525994016114144, "learning_rate": 2.7575302794362704e-06, "loss": 0.0829, "step": 5135 }, { "epoch": 2.3366696997270244, "grad_norm": 1.3808718491785876, "learning_rate": 2.7568194227681703e-06, "loss": 0.0538, "step": 5136 }, { "epoch": 2.3371246587807097, "grad_norm": 1.2098840233623962, "learning_rate": 2.756108545113948e-06, "loss": 0.0604, "step": 5137 }, { "epoch": 2.337579617834395, "grad_norm": 1.2102614934674498, "learning_rate": 2.7553976465316915e-06, "loss": 0.0597, "step": 5138 }, { "epoch": 2.33803457688808, "grad_norm": 1.961877990423004, "learning_rate": 2.754686727079493e-06, "loss": 0.0358, "step": 5139 }, { "epoch": 2.3384895359417652, "grad_norm": 1.1429417180834747, "learning_rate": 2.7539757868154452e-06, "loss": 0.0394, "step": 5140 }, { "epoch": 2.3389444949954505, "grad_norm": 1.283861289690001, "learning_rate": 2.753264825797643e-06, "loss": 0.0774, "step": 5141 }, { "epoch": 2.3393994540491354, "grad_norm": 1.4290298028507462, "learning_rate": 2.7525538440841828e-06, "loss": 0.0714, "step": 5142 }, { "epoch": 2.3398544131028207, "grad_norm": 1.8906949176845327, "learning_rate": 2.751842841733163e-06, "loss": 0.05, "step": 5143 }, { "epoch": 2.340309372156506, "grad_norm": 1.6342511661036279, "learning_rate": 2.751131818802684e-06, "loss": 0.0517, "step": 5144 }, { "epoch": 2.340764331210191, "grad_norm": 2.056160474488437, "learning_rate": 2.750420775350846e-06, "loss": 0.0815, "step": 5145 }, { "epoch": 2.341219290263876, "grad_norm": 1.4711481196558722, "learning_rate": 2.749709711435753e-06, "loss": 0.0467, "step": 5146 }, { "epoch": 2.3416742493175615, "grad_norm": 0.943358257167078, "learning_rate": 2.7489986271155112e-06, "loss": 0.0269, "step": 5147 }, { "epoch": 2.3421292083712464, "grad_norm": 1.3757151218647332, "learning_rate": 2.748287522448225e-06, "loss": 0.0452, "step": 5148 }, { "epoch": 2.3425841674249317, "grad_norm": 1.383491332390399, "learning_rate": 2.7475763974920046e-06, "loss": 0.0347, "step": 5149 }, { "epoch": 2.343039126478617, "grad_norm": 1.4538269466509102, "learning_rate": 2.746865252304958e-06, "loss": 0.0381, "step": 5150 }, { "epoch": 2.343494085532302, "grad_norm": 1.3684907031513858, "learning_rate": 2.746154086945199e-06, "loss": 0.0546, "step": 5151 }, { "epoch": 2.343949044585987, "grad_norm": 1.2816678946439908, "learning_rate": 2.745442901470839e-06, "loss": 0.0443, "step": 5152 }, { "epoch": 2.3444040036396725, "grad_norm": 1.073350991408307, "learning_rate": 2.744731695939993e-06, "loss": 0.0235, "step": 5153 }, { "epoch": 2.3448589626933574, "grad_norm": 1.5901841466074937, "learning_rate": 2.744020470410779e-06, "loss": 0.0742, "step": 5154 }, { "epoch": 2.3453139217470427, "grad_norm": 1.3139715532075629, "learning_rate": 2.743309224941314e-06, "loss": 0.0736, "step": 5155 }, { "epoch": 2.345768880800728, "grad_norm": 1.5437824909076996, "learning_rate": 2.742597959589717e-06, "loss": 0.0274, "step": 5156 }, { "epoch": 2.3462238398544133, "grad_norm": 1.0487880079980194, "learning_rate": 2.741886674414112e-06, "loss": 0.0444, "step": 5157 }, { "epoch": 2.346678798908098, "grad_norm": 1.1429545685123004, "learning_rate": 2.741175369472619e-06, "loss": 0.0321, "step": 5158 }, { "epoch": 2.3471337579617835, "grad_norm": 1.8500968332623149, "learning_rate": 2.7404640448233637e-06, "loss": 0.0348, "step": 5159 }, { "epoch": 2.347588717015469, "grad_norm": 1.2468737906689336, "learning_rate": 2.7397527005244734e-06, "loss": 0.0483, "step": 5160 }, { "epoch": 2.3480436760691537, "grad_norm": 1.7537959250701098, "learning_rate": 2.7390413366340753e-06, "loss": 0.0686, "step": 5161 }, { "epoch": 2.348498635122839, "grad_norm": 2.124244946729701, "learning_rate": 2.738329953210298e-06, "loss": 0.0708, "step": 5162 }, { "epoch": 2.3489535941765243, "grad_norm": 1.8021383741438721, "learning_rate": 2.7376185503112728e-06, "loss": 0.0271, "step": 5163 }, { "epoch": 2.349408553230209, "grad_norm": 1.9475433607444537, "learning_rate": 2.7369071279951342e-06, "loss": 0.067, "step": 5164 }, { "epoch": 2.3498635122838945, "grad_norm": 1.7711023829616264, "learning_rate": 2.736195686320014e-06, "loss": 0.0539, "step": 5165 }, { "epoch": 2.3503184713375798, "grad_norm": 1.2551448629129007, "learning_rate": 2.735484225344049e-06, "loss": 0.0544, "step": 5166 }, { "epoch": 2.3507734303912646, "grad_norm": 1.40867008792886, "learning_rate": 2.7347727451253763e-06, "loss": 0.0417, "step": 5167 }, { "epoch": 2.35122838944495, "grad_norm": 1.6991939076244647, "learning_rate": 2.7340612457221355e-06, "loss": 0.042, "step": 5168 }, { "epoch": 2.3516833484986353, "grad_norm": 1.2961076430694385, "learning_rate": 2.733349727192467e-06, "loss": 0.0533, "step": 5169 }, { "epoch": 2.35213830755232, "grad_norm": 0.892660434803345, "learning_rate": 2.732638189594512e-06, "loss": 0.0669, "step": 5170 }, { "epoch": 2.3525932666060054, "grad_norm": 1.7789088173114873, "learning_rate": 2.7319266329864153e-06, "loss": 0.0491, "step": 5171 }, { "epoch": 2.3530482256596907, "grad_norm": 1.2102878264768222, "learning_rate": 2.7312150574263207e-06, "loss": 0.0396, "step": 5172 }, { "epoch": 2.3535031847133756, "grad_norm": 1.2091487883399648, "learning_rate": 2.7305034629723765e-06, "loss": 0.0328, "step": 5173 }, { "epoch": 2.353958143767061, "grad_norm": 1.1607540465144142, "learning_rate": 2.7297918496827302e-06, "loss": 0.0471, "step": 5174 }, { "epoch": 2.3544131028207462, "grad_norm": 1.2909126494463314, "learning_rate": 2.729080217615531e-06, "loss": 0.0342, "step": 5175 }, { "epoch": 2.3548680618744315, "grad_norm": 1.6525678828617132, "learning_rate": 2.7283685668289324e-06, "loss": 0.0361, "step": 5176 }, { "epoch": 2.3553230209281164, "grad_norm": 1.2829969039878055, "learning_rate": 2.7276568973810835e-06, "loss": 0.032, "step": 5177 }, { "epoch": 2.3557779799818017, "grad_norm": 1.2710379480328875, "learning_rate": 2.726945209330143e-06, "loss": 0.0252, "step": 5178 }, { "epoch": 2.356232939035487, "grad_norm": 1.07901028766875, "learning_rate": 2.726233502734264e-06, "loss": 0.0216, "step": 5179 }, { "epoch": 2.356687898089172, "grad_norm": 1.6256945480771654, "learning_rate": 2.725521777651605e-06, "loss": 0.0504, "step": 5180 }, { "epoch": 2.357142857142857, "grad_norm": 1.3650269589807502, "learning_rate": 2.724810034140325e-06, "loss": 0.0392, "step": 5181 }, { "epoch": 2.3575978161965425, "grad_norm": 1.0851473725656062, "learning_rate": 2.724098272258584e-06, "loss": 0.0432, "step": 5182 }, { "epoch": 2.3580527752502274, "grad_norm": 1.134555742436068, "learning_rate": 2.723386492064545e-06, "loss": 0.0494, "step": 5183 }, { "epoch": 2.3585077343039127, "grad_norm": 1.333640634755996, "learning_rate": 2.722674693616369e-06, "loss": 0.0667, "step": 5184 }, { "epoch": 2.358962693357598, "grad_norm": 1.7380882575739913, "learning_rate": 2.721962876972224e-06, "loss": 0.034, "step": 5185 }, { "epoch": 2.359417652411283, "grad_norm": 0.8680028147254383, "learning_rate": 2.7212510421902743e-06, "loss": 0.0347, "step": 5186 }, { "epoch": 2.359872611464968, "grad_norm": 1.1458476364793158, "learning_rate": 2.7205391893286892e-06, "loss": 0.023, "step": 5187 }, { "epoch": 2.3603275705186535, "grad_norm": 1.3905766695258155, "learning_rate": 2.7198273184456376e-06, "loss": 0.0548, "step": 5188 }, { "epoch": 2.3607825295723384, "grad_norm": 1.1364249610600718, "learning_rate": 2.7191154295992893e-06, "loss": 0.0281, "step": 5189 }, { "epoch": 2.3612374886260237, "grad_norm": 1.0186163965780095, "learning_rate": 2.718403522847819e-06, "loss": 0.0273, "step": 5190 }, { "epoch": 2.361692447679709, "grad_norm": 1.1212150193143522, "learning_rate": 2.7176915982493975e-06, "loss": 0.0319, "step": 5191 }, { "epoch": 2.362147406733394, "grad_norm": 1.5649504203082003, "learning_rate": 2.716979655862203e-06, "loss": 0.0347, "step": 5192 }, { "epoch": 2.362602365787079, "grad_norm": 1.7606202073150885, "learning_rate": 2.7162676957444106e-06, "loss": 0.0628, "step": 5193 }, { "epoch": 2.3630573248407645, "grad_norm": 1.0815844093237013, "learning_rate": 2.715555717954198e-06, "loss": 0.022, "step": 5194 }, { "epoch": 2.3635122838944493, "grad_norm": 0.999003281680082, "learning_rate": 2.7148437225497466e-06, "loss": 0.0256, "step": 5195 }, { "epoch": 2.3639672429481347, "grad_norm": 1.3302362969219457, "learning_rate": 2.7141317095892356e-06, "loss": 0.0326, "step": 5196 }, { "epoch": 2.36442220200182, "grad_norm": 1.4824843438033863, "learning_rate": 2.7134196791308493e-06, "loss": 0.0466, "step": 5197 }, { "epoch": 2.364877161055505, "grad_norm": 1.938624676588355, "learning_rate": 2.7127076312327695e-06, "loss": 0.0517, "step": 5198 }, { "epoch": 2.36533212010919, "grad_norm": 1.029348106941278, "learning_rate": 2.711995565953183e-06, "loss": 0.0455, "step": 5199 }, { "epoch": 2.3657870791628755, "grad_norm": 1.3397637079452633, "learning_rate": 2.7112834833502766e-06, "loss": 0.0446, "step": 5200 }, { "epoch": 2.3662420382165603, "grad_norm": 1.8243565070604082, "learning_rate": 2.7105713834822374e-06, "loss": 0.0566, "step": 5201 }, { "epoch": 2.3666969972702456, "grad_norm": 1.1439795567471782, "learning_rate": 2.7098592664072563e-06, "loss": 0.0328, "step": 5202 }, { "epoch": 2.367151956323931, "grad_norm": 1.4030220704366307, "learning_rate": 2.709147132183523e-06, "loss": 0.0313, "step": 5203 }, { "epoch": 2.367606915377616, "grad_norm": 1.985014332727087, "learning_rate": 2.7084349808692316e-06, "loss": 0.0602, "step": 5204 }, { "epoch": 2.368061874431301, "grad_norm": 1.3580500106271856, "learning_rate": 2.707722812522574e-06, "loss": 0.0583, "step": 5205 }, { "epoch": 2.3685168334849864, "grad_norm": 1.491384922066355, "learning_rate": 2.7070106272017465e-06, "loss": 0.0467, "step": 5206 }, { "epoch": 2.3689717925386713, "grad_norm": 1.393521061287623, "learning_rate": 2.706298424964946e-06, "loss": 0.0284, "step": 5207 }, { "epoch": 2.3694267515923566, "grad_norm": 0.8334176308730441, "learning_rate": 2.7055862058703685e-06, "loss": 0.0402, "step": 5208 }, { "epoch": 2.369881710646042, "grad_norm": 1.5072277200955289, "learning_rate": 2.704873969976216e-06, "loss": 0.0587, "step": 5209 }, { "epoch": 2.370336669699727, "grad_norm": 1.2264422231465415, "learning_rate": 2.7041617173406875e-06, "loss": 0.0321, "step": 5210 }, { "epoch": 2.370791628753412, "grad_norm": 2.3458795224265026, "learning_rate": 2.703449448021985e-06, "loss": 0.0659, "step": 5211 }, { "epoch": 2.3712465878070974, "grad_norm": 0.9788065200242112, "learning_rate": 2.7027371620783127e-06, "loss": 0.0215, "step": 5212 }, { "epoch": 2.3717015468607827, "grad_norm": 1.1476161623664831, "learning_rate": 2.7020248595678744e-06, "loss": 0.0223, "step": 5213 }, { "epoch": 2.3721565059144676, "grad_norm": 1.5205797759791584, "learning_rate": 2.7013125405488782e-06, "loss": 0.035, "step": 5214 }, { "epoch": 2.372611464968153, "grad_norm": 1.0681844103890026, "learning_rate": 2.7006002050795294e-06, "loss": 0.0918, "step": 5215 }, { "epoch": 2.373066424021838, "grad_norm": 1.7907697135989358, "learning_rate": 2.6998878532180378e-06, "loss": 0.0626, "step": 5216 }, { "epoch": 2.373521383075523, "grad_norm": 1.588221531978981, "learning_rate": 2.6991754850226143e-06, "loss": 0.0308, "step": 5217 }, { "epoch": 2.3739763421292084, "grad_norm": 1.3306418742061252, "learning_rate": 2.6984631005514685e-06, "loss": 0.0477, "step": 5218 }, { "epoch": 2.3744313011828937, "grad_norm": 1.1514145134012803, "learning_rate": 2.697750699862815e-06, "loss": 0.033, "step": 5219 }, { "epoch": 2.3748862602365786, "grad_norm": 1.3304879662631028, "learning_rate": 2.6970382830148665e-06, "loss": 0.0373, "step": 5220 }, { "epoch": 2.375341219290264, "grad_norm": 1.118380185198624, "learning_rate": 2.6963258500658406e-06, "loss": 0.0328, "step": 5221 }, { "epoch": 2.375796178343949, "grad_norm": 1.4740628544535623, "learning_rate": 2.695613401073952e-06, "loss": 0.0512, "step": 5222 }, { "epoch": 2.376251137397634, "grad_norm": 1.5478119102810735, "learning_rate": 2.69490093609742e-06, "loss": 0.0264, "step": 5223 }, { "epoch": 2.3767060964513194, "grad_norm": 1.3817378583383193, "learning_rate": 2.694188455194464e-06, "loss": 0.0402, "step": 5224 }, { "epoch": 2.3771610555050047, "grad_norm": 1.5243816730490878, "learning_rate": 2.693475958423304e-06, "loss": 0.072, "step": 5225 }, { "epoch": 2.3776160145586895, "grad_norm": 2.5701282640600054, "learning_rate": 2.692763445842162e-06, "loss": 0.0775, "step": 5226 }, { "epoch": 2.378070973612375, "grad_norm": 1.4261097545345478, "learning_rate": 2.6920509175092622e-06, "loss": 0.0344, "step": 5227 }, { "epoch": 2.37852593266606, "grad_norm": 3.8620782029724285, "learning_rate": 2.6913383734828293e-06, "loss": 0.0646, "step": 5228 }, { "epoch": 2.3789808917197455, "grad_norm": 1.0968313953975415, "learning_rate": 2.690625813821087e-06, "loss": 0.0439, "step": 5229 }, { "epoch": 2.3794358507734303, "grad_norm": 0.9123587385762045, "learning_rate": 2.689913238582265e-06, "loss": 0.0516, "step": 5230 }, { "epoch": 2.3798908098271156, "grad_norm": 1.3031530657261257, "learning_rate": 2.689200647824591e-06, "loss": 0.0856, "step": 5231 }, { "epoch": 2.380345768880801, "grad_norm": 1.2775756783708034, "learning_rate": 2.6884880416062942e-06, "loss": 0.0532, "step": 5232 }, { "epoch": 2.380800727934486, "grad_norm": 0.9073886532034023, "learning_rate": 2.687775419985606e-06, "loss": 0.0265, "step": 5233 }, { "epoch": 2.381255686988171, "grad_norm": 1.7178301995551801, "learning_rate": 2.6870627830207585e-06, "loss": 0.0523, "step": 5234 }, { "epoch": 2.3817106460418564, "grad_norm": 1.9306981251250406, "learning_rate": 2.686350130769985e-06, "loss": 0.0593, "step": 5235 }, { "epoch": 2.3821656050955413, "grad_norm": 0.9510969497449144, "learning_rate": 2.68563746329152e-06, "loss": 0.0209, "step": 5236 }, { "epoch": 2.3826205641492266, "grad_norm": 1.056784953683312, "learning_rate": 2.6849247806436e-06, "loss": 0.0313, "step": 5237 }, { "epoch": 2.383075523202912, "grad_norm": 1.1854129630358927, "learning_rate": 2.6842120828844625e-06, "loss": 0.0655, "step": 5238 }, { "epoch": 2.383530482256597, "grad_norm": 1.3075596108428733, "learning_rate": 2.6834993700723454e-06, "loss": 0.0409, "step": 5239 }, { "epoch": 2.383985441310282, "grad_norm": 1.0932183613730486, "learning_rate": 2.682786642265488e-06, "loss": 0.029, "step": 5240 }, { "epoch": 2.3844404003639674, "grad_norm": 1.316834163087933, "learning_rate": 2.6820738995221323e-06, "loss": 0.0295, "step": 5241 }, { "epoch": 2.3848953594176523, "grad_norm": 1.2127672688982216, "learning_rate": 2.681361141900519e-06, "loss": 0.0521, "step": 5242 }, { "epoch": 2.3853503184713376, "grad_norm": 1.3919821097030576, "learning_rate": 2.6806483694588926e-06, "loss": 0.0517, "step": 5243 }, { "epoch": 2.385805277525023, "grad_norm": 1.804192035259728, "learning_rate": 2.6799355822554974e-06, "loss": 0.0529, "step": 5244 }, { "epoch": 2.386260236578708, "grad_norm": 1.77689071906947, "learning_rate": 2.6792227803485788e-06, "loss": 0.0522, "step": 5245 }, { "epoch": 2.386715195632393, "grad_norm": 1.6392100631994284, "learning_rate": 2.6785099637963847e-06, "loss": 0.0323, "step": 5246 }, { "epoch": 2.3871701546860784, "grad_norm": 1.2344281756431172, "learning_rate": 2.6777971326571605e-06, "loss": 0.0593, "step": 5247 }, { "epoch": 2.3876251137397633, "grad_norm": 1.4438396141461232, "learning_rate": 2.6770842869891593e-06, "loss": 0.0404, "step": 5248 }, { "epoch": 2.3880800727934486, "grad_norm": 1.5207238414212165, "learning_rate": 2.6763714268506297e-06, "loss": 0.0442, "step": 5249 }, { "epoch": 2.388535031847134, "grad_norm": 1.306279909252367, "learning_rate": 2.675658552299823e-06, "loss": 0.0214, "step": 5250 }, { "epoch": 2.3889899909008188, "grad_norm": 1.6298217788001161, "learning_rate": 2.6749456633949932e-06, "loss": 0.0702, "step": 5251 }, { "epoch": 2.389444949954504, "grad_norm": 1.2532350256117335, "learning_rate": 2.6742327601943936e-06, "loss": 0.0391, "step": 5252 }, { "epoch": 2.3898999090081894, "grad_norm": 1.3655531389520348, "learning_rate": 2.67351984275628e-06, "loss": 0.0581, "step": 5253 }, { "epoch": 2.3903548680618742, "grad_norm": 0.9563603890179186, "learning_rate": 2.6728069111389073e-06, "loss": 0.0313, "step": 5254 }, { "epoch": 2.3908098271155596, "grad_norm": 1.086629836840935, "learning_rate": 2.672093965400536e-06, "loss": 0.0325, "step": 5255 }, { "epoch": 2.391264786169245, "grad_norm": 1.1197006151874265, "learning_rate": 2.6713810055994215e-06, "loss": 0.0238, "step": 5256 }, { "epoch": 2.3917197452229297, "grad_norm": 1.4277782899293696, "learning_rate": 2.6706680317938256e-06, "loss": 0.0601, "step": 5257 }, { "epoch": 2.392174704276615, "grad_norm": 1.5516595769685968, "learning_rate": 2.6699550440420093e-06, "loss": 0.0461, "step": 5258 }, { "epoch": 2.3926296633303004, "grad_norm": 2.015510009639456, "learning_rate": 2.6692420424022335e-06, "loss": 0.0535, "step": 5259 }, { "epoch": 2.3930846223839852, "grad_norm": 2.805137594950723, "learning_rate": 2.6685290269327637e-06, "loss": 0.0961, "step": 5260 }, { "epoch": 2.3935395814376705, "grad_norm": 1.84945163578645, "learning_rate": 2.667815997691861e-06, "loss": 0.0512, "step": 5261 }, { "epoch": 2.393994540491356, "grad_norm": 1.600370712687317, "learning_rate": 2.6671029547377943e-06, "loss": 0.0622, "step": 5262 }, { "epoch": 2.3944494995450407, "grad_norm": 1.4950781503440156, "learning_rate": 2.666389898128828e-06, "loss": 0.0286, "step": 5263 }, { "epoch": 2.394904458598726, "grad_norm": 1.155575473006123, "learning_rate": 2.665676827923231e-06, "loss": 0.0478, "step": 5264 }, { "epoch": 2.3953594176524113, "grad_norm": 1.3958573508546097, "learning_rate": 2.664963744179272e-06, "loss": 0.0733, "step": 5265 }, { "epoch": 2.395814376706096, "grad_norm": 0.7798996098145348, "learning_rate": 2.6642506469552198e-06, "loss": 0.0263, "step": 5266 }, { "epoch": 2.3962693357597815, "grad_norm": 1.098665456503385, "learning_rate": 2.663537536309348e-06, "loss": 0.044, "step": 5267 }, { "epoch": 2.396724294813467, "grad_norm": 1.5510597332231475, "learning_rate": 2.6628244122999265e-06, "loss": 0.0588, "step": 5268 }, { "epoch": 2.397179253867152, "grad_norm": 1.6299071007832437, "learning_rate": 2.662111274985229e-06, "loss": 0.1006, "step": 5269 }, { "epoch": 2.397634212920837, "grad_norm": 1.4464012223317138, "learning_rate": 2.661398124423531e-06, "loss": 0.0359, "step": 5270 }, { "epoch": 2.3980891719745223, "grad_norm": 1.1745334060076575, "learning_rate": 2.6606849606731056e-06, "loss": 0.0318, "step": 5271 }, { "epoch": 2.3985441310282076, "grad_norm": 1.4948739564130398, "learning_rate": 2.6599717837922322e-06, "loss": 0.0332, "step": 5272 }, { "epoch": 2.3989990900818925, "grad_norm": 1.1511346886797893, "learning_rate": 2.659258593839187e-06, "loss": 0.0484, "step": 5273 }, { "epoch": 2.399454049135578, "grad_norm": 1.216374413325874, "learning_rate": 2.6585453908722484e-06, "loss": 0.0439, "step": 5274 }, { "epoch": 2.399909008189263, "grad_norm": 1.2232512470452126, "learning_rate": 2.6578321749496965e-06, "loss": 0.0604, "step": 5275 }, { "epoch": 2.400363967242948, "grad_norm": 1.1087040999277555, "learning_rate": 2.657118946129812e-06, "loss": 0.0291, "step": 5276 }, { "epoch": 2.4008189262966333, "grad_norm": 1.5770102977586127, "learning_rate": 2.6564057044708767e-06, "loss": 0.0294, "step": 5277 }, { "epoch": 2.4012738853503186, "grad_norm": 1.082191305205626, "learning_rate": 2.655692450031173e-06, "loss": 0.0357, "step": 5278 }, { "epoch": 2.4017288444040035, "grad_norm": 1.9885589973117188, "learning_rate": 2.6549791828689864e-06, "loss": 0.0547, "step": 5279 }, { "epoch": 2.402183803457689, "grad_norm": 2.2535946290293305, "learning_rate": 2.654265903042601e-06, "loss": 0.1317, "step": 5280 }, { "epoch": 2.402638762511374, "grad_norm": 1.3001900086743903, "learning_rate": 2.653552610610302e-06, "loss": 0.0535, "step": 5281 }, { "epoch": 2.403093721565059, "grad_norm": 1.37117653612899, "learning_rate": 2.6528393056303767e-06, "loss": 0.0515, "step": 5282 }, { "epoch": 2.4035486806187443, "grad_norm": 1.3518358863094737, "learning_rate": 2.6521259881611144e-06, "loss": 0.0294, "step": 5283 }, { "epoch": 2.4040036396724296, "grad_norm": 2.0719887859788133, "learning_rate": 2.6514126582608037e-06, "loss": 0.052, "step": 5284 }, { "epoch": 2.404458598726115, "grad_norm": 2.531056366643776, "learning_rate": 2.650699315987733e-06, "loss": 0.0556, "step": 5285 }, { "epoch": 2.4049135577797998, "grad_norm": 1.5854866554085887, "learning_rate": 2.6499859614001954e-06, "loss": 0.0431, "step": 5286 }, { "epoch": 2.405368516833485, "grad_norm": 2.1862126446111114, "learning_rate": 2.649272594556483e-06, "loss": 0.0432, "step": 5287 }, { "epoch": 2.4058234758871704, "grad_norm": 1.4575982548881428, "learning_rate": 2.6485592155148875e-06, "loss": 0.0606, "step": 5288 }, { "epoch": 2.4062784349408552, "grad_norm": 1.1321384622754644, "learning_rate": 2.6478458243337035e-06, "loss": 0.0224, "step": 5289 }, { "epoch": 2.4067333939945406, "grad_norm": 1.2504123054806682, "learning_rate": 2.647132421071227e-06, "loss": 0.0392, "step": 5290 }, { "epoch": 2.407188353048226, "grad_norm": 1.5824647801351837, "learning_rate": 2.6464190057857535e-06, "loss": 0.0312, "step": 5291 }, { "epoch": 2.4076433121019107, "grad_norm": 1.360455128740188, "learning_rate": 2.6457055785355802e-06, "loss": 0.0497, "step": 5292 }, { "epoch": 2.408098271155596, "grad_norm": 1.36627771765421, "learning_rate": 2.6449921393790045e-06, "loss": 0.0527, "step": 5293 }, { "epoch": 2.4085532302092814, "grad_norm": 1.1315936642467672, "learning_rate": 2.6442786883743267e-06, "loss": 0.0204, "step": 5294 }, { "epoch": 2.4090081892629662, "grad_norm": 1.012576515337893, "learning_rate": 2.643565225579845e-06, "loss": 0.0341, "step": 5295 }, { "epoch": 2.4094631483166515, "grad_norm": 1.5518043578549927, "learning_rate": 2.642851751053862e-06, "loss": 0.0326, "step": 5296 }, { "epoch": 2.409918107370337, "grad_norm": 1.3227198006026681, "learning_rate": 2.642138264854679e-06, "loss": 0.0371, "step": 5297 }, { "epoch": 2.4103730664240217, "grad_norm": 1.2170317918602016, "learning_rate": 2.641424767040599e-06, "loss": 0.0358, "step": 5298 }, { "epoch": 2.410828025477707, "grad_norm": 2.1069718812025973, "learning_rate": 2.640711257669925e-06, "loss": 0.063, "step": 5299 }, { "epoch": 2.4112829845313923, "grad_norm": 1.2679637409147577, "learning_rate": 2.6399977368009632e-06, "loss": 0.0462, "step": 5300 }, { "epoch": 2.411737943585077, "grad_norm": 1.7521579592828735, "learning_rate": 2.6392842044920187e-06, "loss": 0.0491, "step": 5301 }, { "epoch": 2.4121929026387625, "grad_norm": 1.4889353009072546, "learning_rate": 2.6385706608013977e-06, "loss": 0.071, "step": 5302 }, { "epoch": 2.412647861692448, "grad_norm": 1.0904011138441683, "learning_rate": 2.637857105787408e-06, "loss": 0.0279, "step": 5303 }, { "epoch": 2.4131028207461327, "grad_norm": 0.8195551334548937, "learning_rate": 2.6371435395083585e-06, "loss": 0.0237, "step": 5304 }, { "epoch": 2.413557779799818, "grad_norm": 1.1667178253125152, "learning_rate": 2.636429962022558e-06, "loss": 0.0326, "step": 5305 }, { "epoch": 2.4140127388535033, "grad_norm": 1.1998262490597045, "learning_rate": 2.6357163733883168e-06, "loss": 0.0616, "step": 5306 }, { "epoch": 2.414467697907188, "grad_norm": 1.3338340006584133, "learning_rate": 2.6350027736639467e-06, "loss": 0.0531, "step": 5307 }, { "epoch": 2.4149226569608735, "grad_norm": 1.4493607925528078, "learning_rate": 2.6342891629077603e-06, "loss": 0.043, "step": 5308 }, { "epoch": 2.415377616014559, "grad_norm": 1.1522354534495378, "learning_rate": 2.633575541178069e-06, "loss": 0.0331, "step": 5309 }, { "epoch": 2.4158325750682437, "grad_norm": 1.6109941537922234, "learning_rate": 2.632861908533188e-06, "loss": 0.037, "step": 5310 }, { "epoch": 2.416287534121929, "grad_norm": 2.2229572739629635, "learning_rate": 2.6321482650314324e-06, "loss": 0.0446, "step": 5311 }, { "epoch": 2.4167424931756143, "grad_norm": 2.2701980126262944, "learning_rate": 2.631434610731117e-06, "loss": 0.0379, "step": 5312 }, { "epoch": 2.417197452229299, "grad_norm": 0.9749083715638894, "learning_rate": 2.630720945690558e-06, "loss": 0.0155, "step": 5313 }, { "epoch": 2.4176524112829845, "grad_norm": 1.3187967333305466, "learning_rate": 2.630007269968074e-06, "loss": 0.0282, "step": 5314 }, { "epoch": 2.41810737033667, "grad_norm": 1.4431398857626616, "learning_rate": 2.629293583621984e-06, "loss": 0.0295, "step": 5315 }, { "epoch": 2.4185623293903546, "grad_norm": 1.3802821089680821, "learning_rate": 2.6285798867106054e-06, "loss": 0.0584, "step": 5316 }, { "epoch": 2.41901728844404, "grad_norm": 1.3435442552027141, "learning_rate": 2.6278661792922587e-06, "loss": 0.0591, "step": 5317 }, { "epoch": 2.4194722474977253, "grad_norm": 0.9814470492629613, "learning_rate": 2.6271524614252663e-06, "loss": 0.0261, "step": 5318 }, { "epoch": 2.41992720655141, "grad_norm": 1.645065202453022, "learning_rate": 2.6264387331679486e-06, "loss": 0.0538, "step": 5319 }, { "epoch": 2.4203821656050954, "grad_norm": 1.8432160834481668, "learning_rate": 2.6257249945786285e-06, "loss": 0.0507, "step": 5320 }, { "epoch": 2.4208371246587808, "grad_norm": 1.2049873048863138, "learning_rate": 2.6250112457156296e-06, "loss": 0.0699, "step": 5321 }, { "epoch": 2.421292083712466, "grad_norm": 1.116399902853374, "learning_rate": 2.6242974866372762e-06, "loss": 0.0178, "step": 5322 }, { "epoch": 2.421747042766151, "grad_norm": 0.9883682103550543, "learning_rate": 2.6235837174018937e-06, "loss": 0.0265, "step": 5323 }, { "epoch": 2.4222020018198362, "grad_norm": 1.84168326397131, "learning_rate": 2.6228699380678074e-06, "loss": 0.0523, "step": 5324 }, { "epoch": 2.4226569608735216, "grad_norm": 0.9660132958996044, "learning_rate": 2.6221561486933454e-06, "loss": 0.0585, "step": 5325 }, { "epoch": 2.4231119199272064, "grad_norm": 2.071450615531615, "learning_rate": 2.621442349336834e-06, "loss": 0.0595, "step": 5326 }, { "epoch": 2.4235668789808917, "grad_norm": 0.8568900377544397, "learning_rate": 2.6207285400566025e-06, "loss": 0.0156, "step": 5327 }, { "epoch": 2.424021838034577, "grad_norm": 1.4131345914733615, "learning_rate": 2.62001472091098e-06, "loss": 0.0469, "step": 5328 }, { "epoch": 2.424476797088262, "grad_norm": 1.0095626319094675, "learning_rate": 2.6193008919582962e-06, "loss": 0.0242, "step": 5329 }, { "epoch": 2.4249317561419472, "grad_norm": 1.1528082083131772, "learning_rate": 2.618587053256882e-06, "loss": 0.025, "step": 5330 }, { "epoch": 2.4253867151956325, "grad_norm": 1.4145461066006544, "learning_rate": 2.6178732048650694e-06, "loss": 0.0537, "step": 5331 }, { "epoch": 2.4258416742493174, "grad_norm": 2.2456303112754394, "learning_rate": 2.617159346841192e-06, "loss": 0.0681, "step": 5332 }, { "epoch": 2.4262966333030027, "grad_norm": 1.4006196450664905, "learning_rate": 2.616445479243581e-06, "loss": 0.0324, "step": 5333 }, { "epoch": 2.426751592356688, "grad_norm": 1.2769868137247284, "learning_rate": 2.615731602130571e-06, "loss": 0.0522, "step": 5334 }, { "epoch": 2.427206551410373, "grad_norm": 1.0702146156874695, "learning_rate": 2.6150177155604977e-06, "loss": 0.0653, "step": 5335 }, { "epoch": 2.427661510464058, "grad_norm": 1.239585797873716, "learning_rate": 2.614303819591696e-06, "loss": 0.0852, "step": 5336 }, { "epoch": 2.4281164695177435, "grad_norm": 1.897051184566686, "learning_rate": 2.6135899142825015e-06, "loss": 0.0392, "step": 5337 }, { "epoch": 2.4285714285714284, "grad_norm": 1.0758757351631232, "learning_rate": 2.6128759996912533e-06, "loss": 0.0223, "step": 5338 }, { "epoch": 2.4290263876251137, "grad_norm": 1.4650373458204449, "learning_rate": 2.6121620758762877e-06, "loss": 0.0414, "step": 5339 }, { "epoch": 2.429481346678799, "grad_norm": 1.128781723998616, "learning_rate": 2.6114481428959445e-06, "loss": 0.0476, "step": 5340 }, { "epoch": 2.4299363057324843, "grad_norm": 1.9632208961353903, "learning_rate": 2.6107342008085605e-06, "loss": 0.0253, "step": 5341 }, { "epoch": 2.430391264786169, "grad_norm": 1.673620096938587, "learning_rate": 2.610020249672479e-06, "loss": 0.0757, "step": 5342 }, { "epoch": 2.4308462238398545, "grad_norm": 1.8787054409726727, "learning_rate": 2.6093062895460398e-06, "loss": 0.0486, "step": 5343 }, { "epoch": 2.43130118289354, "grad_norm": 1.1584716691259955, "learning_rate": 2.6085923204875835e-06, "loss": 0.02, "step": 5344 }, { "epoch": 2.4317561419472247, "grad_norm": 1.2800336451161494, "learning_rate": 2.6078783425554538e-06, "loss": 0.0271, "step": 5345 }, { "epoch": 2.43221110100091, "grad_norm": 1.2634078534011128, "learning_rate": 2.607164355807992e-06, "loss": 0.0363, "step": 5346 }, { "epoch": 2.4326660600545953, "grad_norm": 1.523754930880825, "learning_rate": 2.6064503603035447e-06, "loss": 0.0248, "step": 5347 }, { "epoch": 2.43312101910828, "grad_norm": 1.3017086975370538, "learning_rate": 2.6057363561004527e-06, "loss": 0.0534, "step": 5348 }, { "epoch": 2.4335759781619655, "grad_norm": 1.2508444433965238, "learning_rate": 2.6050223432570646e-06, "loss": 0.0737, "step": 5349 }, { "epoch": 2.434030937215651, "grad_norm": 2.082555734500323, "learning_rate": 2.604308321831725e-06, "loss": 0.0441, "step": 5350 }, { "epoch": 2.4344858962693356, "grad_norm": 1.026279237839995, "learning_rate": 2.6035942918827795e-06, "loss": 0.0258, "step": 5351 }, { "epoch": 2.434940855323021, "grad_norm": 1.178081455201079, "learning_rate": 2.6028802534685773e-06, "loss": 0.0453, "step": 5352 }, { "epoch": 2.4353958143767063, "grad_norm": 1.4880164904214115, "learning_rate": 2.6021662066474646e-06, "loss": 0.0456, "step": 5353 }, { "epoch": 2.435850773430391, "grad_norm": 1.4379157547653634, "learning_rate": 2.601452151477791e-06, "loss": 0.0274, "step": 5354 }, { "epoch": 2.4363057324840764, "grad_norm": 1.5126209926254583, "learning_rate": 2.6007380880179063e-06, "loss": 0.0299, "step": 5355 }, { "epoch": 2.4367606915377618, "grad_norm": 1.5174673537967835, "learning_rate": 2.6000240163261593e-06, "loss": 0.0365, "step": 5356 }, { "epoch": 2.4372156505914466, "grad_norm": 2.6811166950708007, "learning_rate": 2.599309936460902e-06, "loss": 0.0517, "step": 5357 }, { "epoch": 2.437670609645132, "grad_norm": 1.6710633928259102, "learning_rate": 2.5985958484804843e-06, "loss": 0.0521, "step": 5358 }, { "epoch": 2.4381255686988172, "grad_norm": 1.3738298280816192, "learning_rate": 2.597881752443259e-06, "loss": 0.0595, "step": 5359 }, { "epoch": 2.438580527752502, "grad_norm": 1.3820521097558847, "learning_rate": 2.59716764840758e-06, "loss": 0.0639, "step": 5360 }, { "epoch": 2.4390354868061874, "grad_norm": 1.1133460581344794, "learning_rate": 2.5964535364317992e-06, "loss": 0.0303, "step": 5361 }, { "epoch": 2.4394904458598727, "grad_norm": 1.7127349786595853, "learning_rate": 2.5957394165742712e-06, "loss": 0.0586, "step": 5362 }, { "epoch": 2.4399454049135576, "grad_norm": 2.219689278209333, "learning_rate": 2.5950252888933495e-06, "loss": 0.0537, "step": 5363 }, { "epoch": 2.440400363967243, "grad_norm": 1.3182833425033842, "learning_rate": 2.5943111534473914e-06, "loss": 0.0566, "step": 5364 }, { "epoch": 2.4408553230209282, "grad_norm": 1.5974052225361874, "learning_rate": 2.5935970102947505e-06, "loss": 0.0579, "step": 5365 }, { "epoch": 2.441310282074613, "grad_norm": 1.1044215722393804, "learning_rate": 2.5928828594937854e-06, "loss": 0.0178, "step": 5366 }, { "epoch": 2.4417652411282984, "grad_norm": 2.0345112903416838, "learning_rate": 2.5921687011028525e-06, "loss": 0.064, "step": 5367 }, { "epoch": 2.4422202001819837, "grad_norm": 0.9998744757512545, "learning_rate": 2.59145453518031e-06, "loss": 0.0203, "step": 5368 }, { "epoch": 2.4426751592356686, "grad_norm": 1.1319097258094928, "learning_rate": 2.590740361784515e-06, "loss": 0.0181, "step": 5369 }, { "epoch": 2.443130118289354, "grad_norm": 1.4581654225769303, "learning_rate": 2.590026180973828e-06, "loss": 0.0322, "step": 5370 }, { "epoch": 2.443585077343039, "grad_norm": 1.6481493214217668, "learning_rate": 2.589311992806608e-06, "loss": 0.0671, "step": 5371 }, { "epoch": 2.444040036396724, "grad_norm": 1.1976474015606398, "learning_rate": 2.5885977973412154e-06, "loss": 0.0401, "step": 5372 }, { "epoch": 2.4444949954504094, "grad_norm": 1.3011000063762603, "learning_rate": 2.58788359463601e-06, "loss": 0.0511, "step": 5373 }, { "epoch": 2.4449499545040947, "grad_norm": 1.1296000309521215, "learning_rate": 2.5871693847493555e-06, "loss": 0.0229, "step": 5374 }, { "epoch": 2.4454049135577796, "grad_norm": 1.4223040383828625, "learning_rate": 2.5864551677396116e-06, "loss": 0.086, "step": 5375 }, { "epoch": 2.445859872611465, "grad_norm": 1.157993190951171, "learning_rate": 2.5857409436651416e-06, "loss": 0.057, "step": 5376 }, { "epoch": 2.44631483166515, "grad_norm": 1.525744560442962, "learning_rate": 2.5850267125843093e-06, "loss": 0.0527, "step": 5377 }, { "epoch": 2.4467697907188355, "grad_norm": 0.8627445056375942, "learning_rate": 2.584312474555478e-06, "loss": 0.0272, "step": 5378 }, { "epoch": 2.4472247497725204, "grad_norm": 1.32438820015048, "learning_rate": 2.583598229637012e-06, "loss": 0.0592, "step": 5379 }, { "epoch": 2.4476797088262057, "grad_norm": 1.4665824664534308, "learning_rate": 2.582883977887277e-06, "loss": 0.034, "step": 5380 }, { "epoch": 2.448134667879891, "grad_norm": 0.8676159402861846, "learning_rate": 2.5821697193646367e-06, "loss": 0.0332, "step": 5381 }, { "epoch": 2.448589626933576, "grad_norm": 0.9615725664973912, "learning_rate": 2.5814554541274583e-06, "loss": 0.0446, "step": 5382 }, { "epoch": 2.449044585987261, "grad_norm": 1.5499529943663009, "learning_rate": 2.580741182234108e-06, "loss": 0.0438, "step": 5383 }, { "epoch": 2.4494995450409465, "grad_norm": 1.157099641641353, "learning_rate": 2.5800269037429522e-06, "loss": 0.0261, "step": 5384 }, { "epoch": 2.4499545040946313, "grad_norm": 1.11799376593568, "learning_rate": 2.57931261871236e-06, "loss": 0.0441, "step": 5385 }, { "epoch": 2.4504094631483166, "grad_norm": 1.2405412989894733, "learning_rate": 2.5785983272006987e-06, "loss": 0.0433, "step": 5386 }, { "epoch": 2.450864422202002, "grad_norm": 0.7807847318253236, "learning_rate": 2.577884029266337e-06, "loss": 0.0547, "step": 5387 }, { "epoch": 2.451319381255687, "grad_norm": 1.21460675558352, "learning_rate": 2.577169724967645e-06, "loss": 0.0322, "step": 5388 }, { "epoch": 2.451774340309372, "grad_norm": 1.3682594480852435, "learning_rate": 2.57645541436299e-06, "loss": 0.0207, "step": 5389 }, { "epoch": 2.4522292993630574, "grad_norm": 1.1468495875835198, "learning_rate": 2.5757410975107444e-06, "loss": 0.0288, "step": 5390 }, { "epoch": 2.4526842584167423, "grad_norm": 1.5459027707710604, "learning_rate": 2.5750267744692785e-06, "loss": 0.0563, "step": 5391 }, { "epoch": 2.4531392174704276, "grad_norm": 1.341265152594787, "learning_rate": 2.5743124452969636e-06, "loss": 0.0346, "step": 5392 }, { "epoch": 2.453594176524113, "grad_norm": 1.0264565490404551, "learning_rate": 2.573598110052171e-06, "loss": 0.0433, "step": 5393 }, { "epoch": 2.4540491355777982, "grad_norm": 2.41463468911341, "learning_rate": 2.572883768793273e-06, "loss": 0.0483, "step": 5394 }, { "epoch": 2.454504094631483, "grad_norm": 1.930056538983476, "learning_rate": 2.572169421578643e-06, "loss": 0.032, "step": 5395 }, { "epoch": 2.4549590536851684, "grad_norm": 1.7959534393293912, "learning_rate": 2.5714550684666532e-06, "loss": 0.0697, "step": 5396 }, { "epoch": 2.4554140127388537, "grad_norm": 1.5258781320611614, "learning_rate": 2.5707407095156783e-06, "loss": 0.0326, "step": 5397 }, { "epoch": 2.4558689717925386, "grad_norm": 0.9323018716742659, "learning_rate": 2.5700263447840927e-06, "loss": 0.0394, "step": 5398 }, { "epoch": 2.456323930846224, "grad_norm": 1.6092700520212921, "learning_rate": 2.5693119743302697e-06, "loss": 0.0292, "step": 5399 }, { "epoch": 2.4567788898999092, "grad_norm": 1.3041315795376156, "learning_rate": 2.5685975982125848e-06, "loss": 0.0269, "step": 5400 }, { "epoch": 2.457233848953594, "grad_norm": 1.1443521683193907, "learning_rate": 2.5678832164894145e-06, "loss": 0.0329, "step": 5401 }, { "epoch": 2.4576888080072794, "grad_norm": 1.0817087182834504, "learning_rate": 2.5671688292191347e-06, "loss": 0.022, "step": 5402 }, { "epoch": 2.4581437670609647, "grad_norm": 1.6582149937330346, "learning_rate": 2.566454436460121e-06, "loss": 0.0336, "step": 5403 }, { "epoch": 2.4585987261146496, "grad_norm": 1.778351428897476, "learning_rate": 2.5657400382707507e-06, "loss": 0.0496, "step": 5404 }, { "epoch": 2.459053685168335, "grad_norm": 1.361054981660227, "learning_rate": 2.565025634709402e-06, "loss": 0.0431, "step": 5405 }, { "epoch": 2.45950864422202, "grad_norm": 1.7553299589432358, "learning_rate": 2.5643112258344517e-06, "loss": 0.0715, "step": 5406 }, { "epoch": 2.459963603275705, "grad_norm": 1.271245918074581, "learning_rate": 2.563596811704278e-06, "loss": 0.026, "step": 5407 }, { "epoch": 2.4604185623293904, "grad_norm": 1.234453601224146, "learning_rate": 2.5628823923772606e-06, "loss": 0.0333, "step": 5408 }, { "epoch": 2.4608735213830757, "grad_norm": 1.0423922894593614, "learning_rate": 2.5621679679117778e-06, "loss": 0.0226, "step": 5409 }, { "epoch": 2.4613284804367606, "grad_norm": 1.4597246596625342, "learning_rate": 2.56145353836621e-06, "loss": 0.0604, "step": 5410 }, { "epoch": 2.461783439490446, "grad_norm": 1.963990740327164, "learning_rate": 2.5607391037989354e-06, "loss": 0.0513, "step": 5411 }, { "epoch": 2.462238398544131, "grad_norm": 1.0330834888926703, "learning_rate": 2.560024664268337e-06, "loss": 0.0388, "step": 5412 }, { "epoch": 2.462693357597816, "grad_norm": 1.4640005265002423, "learning_rate": 2.5593102198327927e-06, "loss": 0.0603, "step": 5413 }, { "epoch": 2.4631483166515014, "grad_norm": 1.8982326535645346, "learning_rate": 2.558595770550686e-06, "loss": 0.0787, "step": 5414 }, { "epoch": 2.4636032757051867, "grad_norm": 1.1268453859237941, "learning_rate": 2.5578813164803974e-06, "loss": 0.0367, "step": 5415 }, { "epoch": 2.4640582347588715, "grad_norm": 1.3990054445896076, "learning_rate": 2.5571668576803087e-06, "loss": 0.0602, "step": 5416 }, { "epoch": 2.464513193812557, "grad_norm": 0.9962370342961491, "learning_rate": 2.5564523942088033e-06, "loss": 0.0223, "step": 5417 }, { "epoch": 2.464968152866242, "grad_norm": 1.2941088412148987, "learning_rate": 2.5557379261242615e-06, "loss": 0.0813, "step": 5418 }, { "epoch": 2.465423111919927, "grad_norm": 1.9910732295561582, "learning_rate": 2.55502345348507e-06, "loss": 0.0366, "step": 5419 }, { "epoch": 2.4658780709736123, "grad_norm": 1.6462728323114084, "learning_rate": 2.5543089763496092e-06, "loss": 0.0388, "step": 5420 }, { "epoch": 2.4663330300272976, "grad_norm": 1.0342494383231198, "learning_rate": 2.5535944947762643e-06, "loss": 0.0367, "step": 5421 }, { "epoch": 2.4667879890809825, "grad_norm": 2.1240886032657924, "learning_rate": 2.5528800088234194e-06, "loss": 0.0442, "step": 5422 }, { "epoch": 2.467242948134668, "grad_norm": 1.1705219357731595, "learning_rate": 2.5521655185494592e-06, "loss": 0.0726, "step": 5423 }, { "epoch": 2.467697907188353, "grad_norm": 1.7779374361305362, "learning_rate": 2.551451024012769e-06, "loss": 0.0552, "step": 5424 }, { "epoch": 2.468152866242038, "grad_norm": 1.4376109031400903, "learning_rate": 2.550736525271732e-06, "loss": 0.0395, "step": 5425 }, { "epoch": 2.4686078252957233, "grad_norm": 1.187080031658505, "learning_rate": 2.550022022384736e-06, "loss": 0.0347, "step": 5426 }, { "epoch": 2.4690627843494086, "grad_norm": 1.1580215340533189, "learning_rate": 2.5493075154101665e-06, "loss": 0.0401, "step": 5427 }, { "epoch": 2.4695177434030935, "grad_norm": 1.7255964178096097, "learning_rate": 2.548593004406409e-06, "loss": 0.0519, "step": 5428 }, { "epoch": 2.469972702456779, "grad_norm": 0.874117884386352, "learning_rate": 2.547878489431851e-06, "loss": 0.0213, "step": 5429 }, { "epoch": 2.470427661510464, "grad_norm": 1.577294289324264, "learning_rate": 2.547163970544879e-06, "loss": 0.0248, "step": 5430 }, { "epoch": 2.470882620564149, "grad_norm": 0.9818352388356933, "learning_rate": 2.5464494478038802e-06, "loss": 0.0376, "step": 5431 }, { "epoch": 2.4713375796178343, "grad_norm": 1.523780964271709, "learning_rate": 2.5457349212672423e-06, "loss": 0.0349, "step": 5432 }, { "epoch": 2.4717925386715196, "grad_norm": 0.7888207959477409, "learning_rate": 2.545020390993353e-06, "loss": 0.0409, "step": 5433 }, { "epoch": 2.472247497725205, "grad_norm": 1.4535507762672755, "learning_rate": 2.5443058570406016e-06, "loss": 0.0808, "step": 5434 }, { "epoch": 2.47270245677889, "grad_norm": 1.6762977343403134, "learning_rate": 2.5435913194673738e-06, "loss": 0.0805, "step": 5435 }, { "epoch": 2.473157415832575, "grad_norm": 1.3328043063578014, "learning_rate": 2.542876778332062e-06, "loss": 0.0608, "step": 5436 }, { "epoch": 2.4736123748862604, "grad_norm": 1.5431075684510465, "learning_rate": 2.542162233693053e-06, "loss": 0.0418, "step": 5437 }, { "epoch": 2.4740673339399453, "grad_norm": 1.2819827795073264, "learning_rate": 2.5414476856087367e-06, "loss": 0.0313, "step": 5438 }, { "epoch": 2.4745222929936306, "grad_norm": 1.6013929095312942, "learning_rate": 2.5407331341375025e-06, "loss": 0.0325, "step": 5439 }, { "epoch": 2.474977252047316, "grad_norm": 1.949051780617635, "learning_rate": 2.5400185793377404e-06, "loss": 0.0673, "step": 5440 }, { "epoch": 2.4754322111010008, "grad_norm": 0.9142299008252769, "learning_rate": 2.539304021267841e-06, "loss": 0.0165, "step": 5441 }, { "epoch": 2.475887170154686, "grad_norm": 1.9507365311279543, "learning_rate": 2.538589459986194e-06, "loss": 0.0634, "step": 5442 }, { "epoch": 2.4763421292083714, "grad_norm": 2.801478076226403, "learning_rate": 2.537874895551191e-06, "loss": 0.0863, "step": 5443 }, { "epoch": 2.4767970882620562, "grad_norm": 1.4817859651573162, "learning_rate": 2.537160328021223e-06, "loss": 0.0557, "step": 5444 }, { "epoch": 2.4772520473157416, "grad_norm": 1.8421011613065237, "learning_rate": 2.5364457574546803e-06, "loss": 0.0516, "step": 5445 }, { "epoch": 2.477707006369427, "grad_norm": 1.5235066945137232, "learning_rate": 2.5357311839099546e-06, "loss": 0.0412, "step": 5446 }, { "epoch": 2.4781619654231117, "grad_norm": 1.693820106957003, "learning_rate": 2.535016607445438e-06, "loss": 0.0412, "step": 5447 }, { "epoch": 2.478616924476797, "grad_norm": 1.2502475890921039, "learning_rate": 2.534302028119523e-06, "loss": 0.044, "step": 5448 }, { "epoch": 2.4790718835304824, "grad_norm": 1.0756242924931223, "learning_rate": 2.5335874459906007e-06, "loss": 0.0335, "step": 5449 }, { "epoch": 2.4795268425841677, "grad_norm": 1.6138176582906918, "learning_rate": 2.532872861117064e-06, "loss": 0.0634, "step": 5450 }, { "epoch": 2.4799818016378525, "grad_norm": 1.0948641962968078, "learning_rate": 2.532158273557306e-06, "loss": 0.014, "step": 5451 }, { "epoch": 2.480436760691538, "grad_norm": 1.1978243808106384, "learning_rate": 2.5314436833697182e-06, "loss": 0.0435, "step": 5452 }, { "epoch": 2.480891719745223, "grad_norm": 1.3903968898212553, "learning_rate": 2.5307290906126954e-06, "loss": 0.0311, "step": 5453 }, { "epoch": 2.481346678798908, "grad_norm": 0.9170338866514459, "learning_rate": 2.5300144953446294e-06, "loss": 0.0379, "step": 5454 }, { "epoch": 2.4818016378525933, "grad_norm": 1.1028146805239103, "learning_rate": 2.529299897623915e-06, "loss": 0.0597, "step": 5455 }, { "epoch": 2.4822565969062786, "grad_norm": 1.0784466582906767, "learning_rate": 2.5285852975089454e-06, "loss": 0.0347, "step": 5456 }, { "epoch": 2.4827115559599635, "grad_norm": 1.2173073237729186, "learning_rate": 2.5278706950581133e-06, "loss": 0.0172, "step": 5457 }, { "epoch": 2.483166515013649, "grad_norm": 1.3261573159903088, "learning_rate": 2.5271560903298154e-06, "loss": 0.0395, "step": 5458 }, { "epoch": 2.483621474067334, "grad_norm": 1.4550839789761847, "learning_rate": 2.5264414833824437e-06, "loss": 0.0319, "step": 5459 }, { "epoch": 2.484076433121019, "grad_norm": 1.5464936749995448, "learning_rate": 2.525726874274393e-06, "loss": 0.043, "step": 5460 }, { "epoch": 2.4845313921747043, "grad_norm": 1.1686378653027176, "learning_rate": 2.525012263064059e-06, "loss": 0.0367, "step": 5461 }, { "epoch": 2.4849863512283896, "grad_norm": 1.2804298369915839, "learning_rate": 2.5242976498098355e-06, "loss": 0.0419, "step": 5462 }, { "epoch": 2.4854413102820745, "grad_norm": 1.5308047905625415, "learning_rate": 2.5235830345701175e-06, "loss": 0.0474, "step": 5463 }, { "epoch": 2.48589626933576, "grad_norm": 1.202605584310228, "learning_rate": 2.5228684174033e-06, "loss": 0.0405, "step": 5464 }, { "epoch": 2.486351228389445, "grad_norm": 1.3930936912097045, "learning_rate": 2.52215379836778e-06, "loss": 0.0631, "step": 5465 }, { "epoch": 2.48680618744313, "grad_norm": 1.5715958839386848, "learning_rate": 2.521439177521951e-06, "loss": 0.0547, "step": 5466 }, { "epoch": 2.4872611464968153, "grad_norm": 0.914720322057875, "learning_rate": 2.520724554924209e-06, "loss": 0.0162, "step": 5467 }, { "epoch": 2.4877161055505006, "grad_norm": 0.946503443269331, "learning_rate": 2.5200099306329507e-06, "loss": 0.0463, "step": 5468 }, { "epoch": 2.4881710646041855, "grad_norm": 1.5736511046944706, "learning_rate": 2.5192953047065704e-06, "loss": 0.0278, "step": 5469 }, { "epoch": 2.488626023657871, "grad_norm": 1.0609541786269119, "learning_rate": 2.518580677203465e-06, "loss": 0.0342, "step": 5470 }, { "epoch": 2.489080982711556, "grad_norm": 1.544082428360054, "learning_rate": 2.5178660481820305e-06, "loss": 0.0607, "step": 5471 }, { "epoch": 2.489535941765241, "grad_norm": 1.2955020267506296, "learning_rate": 2.517151417700664e-06, "loss": 0.023, "step": 5472 }, { "epoch": 2.4899909008189263, "grad_norm": 1.7476939610667268, "learning_rate": 2.516436785817761e-06, "loss": 0.0717, "step": 5473 }, { "epoch": 2.4904458598726116, "grad_norm": 1.1545954031978594, "learning_rate": 2.5157221525917175e-06, "loss": 0.043, "step": 5474 }, { "epoch": 2.4909008189262964, "grad_norm": 1.190812745405171, "learning_rate": 2.5150075180809315e-06, "loss": 0.0465, "step": 5475 }, { "epoch": 2.4913557779799818, "grad_norm": 1.1619002024901555, "learning_rate": 2.514292882343798e-06, "loss": 0.0269, "step": 5476 }, { "epoch": 2.491810737033667, "grad_norm": 1.1809070390954601, "learning_rate": 2.513578245438715e-06, "loss": 0.0236, "step": 5477 }, { "epoch": 2.492265696087352, "grad_norm": 1.7242229989955054, "learning_rate": 2.512863607424079e-06, "loss": 0.0495, "step": 5478 }, { "epoch": 2.4927206551410372, "grad_norm": 1.7943488156526437, "learning_rate": 2.512148968358287e-06, "loss": 0.0581, "step": 5479 }, { "epoch": 2.4931756141947226, "grad_norm": 1.4357572265334344, "learning_rate": 2.5114343282997372e-06, "loss": 0.0337, "step": 5480 }, { "epoch": 2.4936305732484074, "grad_norm": 1.8357879680914697, "learning_rate": 2.510719687306824e-06, "loss": 0.044, "step": 5481 }, { "epoch": 2.4940855323020927, "grad_norm": 1.243631453885575, "learning_rate": 2.5100050454379475e-06, "loss": 0.0345, "step": 5482 }, { "epoch": 2.494540491355778, "grad_norm": 1.6027293674670227, "learning_rate": 2.5092904027515037e-06, "loss": 0.053, "step": 5483 }, { "epoch": 2.494995450409463, "grad_norm": 1.3904763617007365, "learning_rate": 2.50857575930589e-06, "loss": 0.0631, "step": 5484 }, { "epoch": 2.4954504094631482, "grad_norm": 1.4608080267761756, "learning_rate": 2.5078611151595046e-06, "loss": 0.0481, "step": 5485 }, { "epoch": 2.4959053685168335, "grad_norm": 1.9004423132779207, "learning_rate": 2.5071464703707437e-06, "loss": 0.0336, "step": 5486 }, { "epoch": 2.496360327570519, "grad_norm": 1.7789887467365393, "learning_rate": 2.5064318249980065e-06, "loss": 0.0278, "step": 5487 }, { "epoch": 2.4968152866242037, "grad_norm": 1.48749636476749, "learning_rate": 2.5057171790996875e-06, "loss": 0.0751, "step": 5488 }, { "epoch": 2.497270245677889, "grad_norm": 1.214982475350953, "learning_rate": 2.5050025327341883e-06, "loss": 0.0604, "step": 5489 }, { "epoch": 2.4977252047315743, "grad_norm": 1.073401852198196, "learning_rate": 2.504287885959904e-06, "loss": 0.0231, "step": 5490 }, { "epoch": 2.498180163785259, "grad_norm": 1.3049060214639088, "learning_rate": 2.503573238835233e-06, "loss": 0.071, "step": 5491 }, { "epoch": 2.4986351228389445, "grad_norm": 1.398778413228875, "learning_rate": 2.5028585914185736e-06, "loss": 0.0281, "step": 5492 }, { "epoch": 2.49909008189263, "grad_norm": 1.0306385215056395, "learning_rate": 2.5021439437683224e-06, "loss": 0.0398, "step": 5493 }, { "epoch": 2.4995450409463147, "grad_norm": 1.2296470737346816, "learning_rate": 2.501429295942878e-06, "loss": 0.0255, "step": 5494 }, { "epoch": 2.5, "grad_norm": 1.386799643685056, "learning_rate": 2.5007146480006376e-06, "loss": 0.0601, "step": 5495 }, { "epoch": 2.5004549590536853, "grad_norm": 1.267420495507714, "learning_rate": 2.5e-06, "loss": 0.0258, "step": 5496 }, { "epoch": 2.50090991810737, "grad_norm": 1.48583792989226, "learning_rate": 2.4992853519993628e-06, "loss": 0.0468, "step": 5497 }, { "epoch": 2.5013648771610555, "grad_norm": 1.5119920557279545, "learning_rate": 2.4985707040571228e-06, "loss": 0.0802, "step": 5498 }, { "epoch": 2.501819836214741, "grad_norm": 0.9571387612984649, "learning_rate": 2.497856056231679e-06, "loss": 0.0279, "step": 5499 }, { "epoch": 2.502274795268426, "grad_norm": 1.4392578806126923, "learning_rate": 2.497141408581427e-06, "loss": 0.0263, "step": 5500 }, { "epoch": 2.502729754322111, "grad_norm": 1.1693368488922071, "learning_rate": 2.4964267611647673e-06, "loss": 0.0305, "step": 5501 }, { "epoch": 2.5031847133757963, "grad_norm": 1.1989490034502057, "learning_rate": 2.4957121140400966e-06, "loss": 0.0311, "step": 5502 }, { "epoch": 2.5036396724294816, "grad_norm": 1.5915144464325457, "learning_rate": 2.4949974672658126e-06, "loss": 0.0323, "step": 5503 }, { "epoch": 2.5040946314831665, "grad_norm": 1.1010252781536118, "learning_rate": 2.494282820900313e-06, "loss": 0.0363, "step": 5504 }, { "epoch": 2.5045495905368518, "grad_norm": 0.8235023686366892, "learning_rate": 2.493568175001995e-06, "loss": 0.0293, "step": 5505 }, { "epoch": 2.505004549590537, "grad_norm": 1.737722683342737, "learning_rate": 2.4928535296292576e-06, "loss": 0.0863, "step": 5506 }, { "epoch": 2.505459508644222, "grad_norm": 1.6440028108876434, "learning_rate": 2.4921388848404962e-06, "loss": 0.0407, "step": 5507 }, { "epoch": 2.5059144676979073, "grad_norm": 1.51740471717215, "learning_rate": 2.49142424069411e-06, "loss": 0.0286, "step": 5508 }, { "epoch": 2.5063694267515926, "grad_norm": 1.1951175250375952, "learning_rate": 2.4907095972484967e-06, "loss": 0.0255, "step": 5509 }, { "epoch": 2.5068243858052774, "grad_norm": 1.1197785485937601, "learning_rate": 2.489994954562053e-06, "loss": 0.0354, "step": 5510 }, { "epoch": 2.5072793448589628, "grad_norm": 2.089655190680738, "learning_rate": 2.489280312693177e-06, "loss": 0.0508, "step": 5511 }, { "epoch": 2.507734303912648, "grad_norm": 1.257760625326669, "learning_rate": 2.488565671700264e-06, "loss": 0.0252, "step": 5512 }, { "epoch": 2.508189262966333, "grad_norm": 1.2948630100072744, "learning_rate": 2.487851031641714e-06, "loss": 0.0255, "step": 5513 }, { "epoch": 2.5086442220200182, "grad_norm": 1.6378442298434481, "learning_rate": 2.4871363925759216e-06, "loss": 0.046, "step": 5514 }, { "epoch": 2.5090991810737036, "grad_norm": 1.2000141399850974, "learning_rate": 2.4864217545612855e-06, "loss": 0.0283, "step": 5515 }, { "epoch": 2.5095541401273884, "grad_norm": 1.3927287274523177, "learning_rate": 2.485707117656203e-06, "loss": 0.0347, "step": 5516 }, { "epoch": 2.5100090991810737, "grad_norm": 1.792107592972426, "learning_rate": 2.4849924819190698e-06, "loss": 0.0503, "step": 5517 }, { "epoch": 2.510464058234759, "grad_norm": 1.401271420059988, "learning_rate": 2.4842778474082833e-06, "loss": 0.0402, "step": 5518 }, { "epoch": 2.510919017288444, "grad_norm": 1.4296397331256754, "learning_rate": 2.48356321418224e-06, "loss": 0.0349, "step": 5519 }, { "epoch": 2.511373976342129, "grad_norm": 1.7491032467573402, "learning_rate": 2.482848582299337e-06, "loss": 0.0565, "step": 5520 }, { "epoch": 2.5118289353958145, "grad_norm": 1.7455526103708328, "learning_rate": 2.4821339518179695e-06, "loss": 0.0525, "step": 5521 }, { "epoch": 2.5122838944494994, "grad_norm": 2.0601830195027833, "learning_rate": 2.481419322796535e-06, "loss": 0.0491, "step": 5522 }, { "epoch": 2.5127388535031847, "grad_norm": 1.1934175326363499, "learning_rate": 2.48070469529343e-06, "loss": 0.0152, "step": 5523 }, { "epoch": 2.51319381255687, "grad_norm": 1.2865213990527746, "learning_rate": 2.47999006936705e-06, "loss": 0.0277, "step": 5524 }, { "epoch": 2.513648771610555, "grad_norm": 1.6559152432587987, "learning_rate": 2.479275445075792e-06, "loss": 0.038, "step": 5525 }, { "epoch": 2.51410373066424, "grad_norm": 1.4330891461128488, "learning_rate": 2.47856082247805e-06, "loss": 0.0402, "step": 5526 }, { "epoch": 2.5145586897179255, "grad_norm": 1.4733185144421408, "learning_rate": 2.477846201632221e-06, "loss": 0.0729, "step": 5527 }, { "epoch": 2.5150136487716104, "grad_norm": 1.3024866145240448, "learning_rate": 2.4771315825967e-06, "loss": 0.0218, "step": 5528 }, { "epoch": 2.5154686078252957, "grad_norm": 1.7035582966532716, "learning_rate": 2.476416965429883e-06, "loss": 0.0414, "step": 5529 }, { "epoch": 2.515923566878981, "grad_norm": 1.016784116413894, "learning_rate": 2.4757023501901654e-06, "loss": 0.0152, "step": 5530 }, { "epoch": 2.516378525932666, "grad_norm": 1.3443107548037645, "learning_rate": 2.4749877369359418e-06, "loss": 0.0476, "step": 5531 }, { "epoch": 2.516833484986351, "grad_norm": 2.00586606282255, "learning_rate": 2.474273125725608e-06, "loss": 0.0295, "step": 5532 }, { "epoch": 2.5172884440400365, "grad_norm": 0.8577487288134017, "learning_rate": 2.473558516617558e-06, "loss": 0.0355, "step": 5533 }, { "epoch": 2.5177434030937214, "grad_norm": 1.1358212757123347, "learning_rate": 2.472843909670186e-06, "loss": 0.0197, "step": 5534 }, { "epoch": 2.5181983621474067, "grad_norm": 2.3465238606264642, "learning_rate": 2.4721293049418867e-06, "loss": 0.0639, "step": 5535 }, { "epoch": 2.518653321201092, "grad_norm": 2.150396747083585, "learning_rate": 2.471414702491056e-06, "loss": 0.0785, "step": 5536 }, { "epoch": 2.519108280254777, "grad_norm": 1.1418550620026526, "learning_rate": 2.4707001023760852e-06, "loss": 0.0281, "step": 5537 }, { "epoch": 2.519563239308462, "grad_norm": 1.4400987286380247, "learning_rate": 2.4699855046553714e-06, "loss": 0.0406, "step": 5538 }, { "epoch": 2.5200181983621475, "grad_norm": 1.6154432759211126, "learning_rate": 2.4692709093873054e-06, "loss": 0.0335, "step": 5539 }, { "epoch": 2.5204731574158323, "grad_norm": 1.2537795783488135, "learning_rate": 2.468556316630283e-06, "loss": 0.0484, "step": 5540 }, { "epoch": 2.5209281164695176, "grad_norm": 1.1261268592767328, "learning_rate": 2.4678417264426953e-06, "loss": 0.0536, "step": 5541 }, { "epoch": 2.521383075523203, "grad_norm": 1.7923278739995467, "learning_rate": 2.467127138882936e-06, "loss": 0.0931, "step": 5542 }, { "epoch": 2.521838034576888, "grad_norm": 1.174366642396358, "learning_rate": 2.4664125540094e-06, "loss": 0.0263, "step": 5543 }, { "epoch": 2.522292993630573, "grad_norm": 1.0884888973601539, "learning_rate": 2.4656979718804775e-06, "loss": 0.034, "step": 5544 }, { "epoch": 2.5227479526842584, "grad_norm": 1.3125580783315782, "learning_rate": 2.4649833925545626e-06, "loss": 0.0392, "step": 5545 }, { "epoch": 2.5232029117379433, "grad_norm": 1.4987360079152638, "learning_rate": 2.464268816090046e-06, "loss": 0.0521, "step": 5546 }, { "epoch": 2.5236578707916286, "grad_norm": 1.84419002776008, "learning_rate": 2.4635542425453213e-06, "loss": 0.0399, "step": 5547 }, { "epoch": 2.524112829845314, "grad_norm": 1.6507620906152158, "learning_rate": 2.4628396719787783e-06, "loss": 0.0544, "step": 5548 }, { "epoch": 2.5245677888989992, "grad_norm": 1.1354151827603978, "learning_rate": 2.4621251044488094e-06, "loss": 0.0385, "step": 5549 }, { "epoch": 2.525022747952684, "grad_norm": 1.4107638223320063, "learning_rate": 2.4614105400138066e-06, "loss": 0.0231, "step": 5550 }, { "epoch": 2.5254777070063694, "grad_norm": 1.0217945571436333, "learning_rate": 2.4606959787321596e-06, "loss": 0.0271, "step": 5551 }, { "epoch": 2.5259326660600547, "grad_norm": 1.3871043416477609, "learning_rate": 2.4599814206622604e-06, "loss": 0.0465, "step": 5552 }, { "epoch": 2.5263876251137396, "grad_norm": 1.3754369674991378, "learning_rate": 2.4592668658624984e-06, "loss": 0.0514, "step": 5553 }, { "epoch": 2.526842584167425, "grad_norm": 0.9943194780392173, "learning_rate": 2.4585523143912645e-06, "loss": 0.0276, "step": 5554 }, { "epoch": 2.52729754322111, "grad_norm": 1.348223439167325, "learning_rate": 2.457837766306948e-06, "loss": 0.0287, "step": 5555 }, { "epoch": 2.5277525022747955, "grad_norm": 0.9731272977505819, "learning_rate": 2.457123221667938e-06, "loss": 0.0165, "step": 5556 }, { "epoch": 2.5282074613284804, "grad_norm": 1.576533702592406, "learning_rate": 2.4564086805326262e-06, "loss": 0.0594, "step": 5557 }, { "epoch": 2.5286624203821657, "grad_norm": 1.473468195160971, "learning_rate": 2.4556941429593993e-06, "loss": 0.0328, "step": 5558 }, { "epoch": 2.529117379435851, "grad_norm": 1.3444794989631332, "learning_rate": 2.4549796090066473e-06, "loss": 0.0594, "step": 5559 }, { "epoch": 2.529572338489536, "grad_norm": 1.3805390961872805, "learning_rate": 2.454265078732758e-06, "loss": 0.0375, "step": 5560 }, { "epoch": 2.530027297543221, "grad_norm": 0.9912320945883921, "learning_rate": 2.453550552196121e-06, "loss": 0.0266, "step": 5561 }, { "epoch": 2.5304822565969065, "grad_norm": 1.465583987400481, "learning_rate": 2.4528360294551216e-06, "loss": 0.0971, "step": 5562 }, { "epoch": 2.5309372156505914, "grad_norm": 1.3885640603312894, "learning_rate": 2.452121510568149e-06, "loss": 0.0251, "step": 5563 }, { "epoch": 2.5313921747042767, "grad_norm": 1.5988153281363364, "learning_rate": 2.4514069955935914e-06, "loss": 0.04, "step": 5564 }, { "epoch": 2.531847133757962, "grad_norm": 1.1169785491544262, "learning_rate": 2.450692484589834e-06, "loss": 0.0326, "step": 5565 }, { "epoch": 2.532302092811647, "grad_norm": 1.3418084829070873, "learning_rate": 2.4499779776152647e-06, "loss": 0.0622, "step": 5566 }, { "epoch": 2.532757051865332, "grad_norm": 0.9194955376171311, "learning_rate": 2.4492634747282686e-06, "loss": 0.0468, "step": 5567 }, { "epoch": 2.5332120109190175, "grad_norm": 1.8788139617626, "learning_rate": 2.4485489759872324e-06, "loss": 0.0379, "step": 5568 }, { "epoch": 2.5336669699727024, "grad_norm": 1.5499898077085326, "learning_rate": 2.447834481450542e-06, "loss": 0.0678, "step": 5569 }, { "epoch": 2.5341219290263877, "grad_norm": 2.381388348107702, "learning_rate": 2.447119991176581e-06, "loss": 0.0738, "step": 5570 }, { "epoch": 2.534576888080073, "grad_norm": 1.3795428110212382, "learning_rate": 2.446405505223736e-06, "loss": 0.0463, "step": 5571 }, { "epoch": 2.535031847133758, "grad_norm": 1.7501310296367991, "learning_rate": 2.4456910236503916e-06, "loss": 0.0369, "step": 5572 }, { "epoch": 2.535486806187443, "grad_norm": 1.2692424149553578, "learning_rate": 2.444976546514931e-06, "loss": 0.0423, "step": 5573 }, { "epoch": 2.5359417652411285, "grad_norm": 1.5777488818168321, "learning_rate": 2.4442620738757393e-06, "loss": 0.0545, "step": 5574 }, { "epoch": 2.5363967242948133, "grad_norm": 1.9304002557638231, "learning_rate": 2.4435476057911984e-06, "loss": 0.0509, "step": 5575 }, { "epoch": 2.5368516833484986, "grad_norm": 0.9459518397937714, "learning_rate": 2.4428331423196926e-06, "loss": 0.0454, "step": 5576 }, { "epoch": 2.537306642402184, "grad_norm": 2.134806437932298, "learning_rate": 2.4421186835196035e-06, "loss": 0.0526, "step": 5577 }, { "epoch": 2.537761601455869, "grad_norm": 1.3415061285350212, "learning_rate": 2.4414042294493146e-06, "loss": 0.0409, "step": 5578 }, { "epoch": 2.538216560509554, "grad_norm": 1.123528610318048, "learning_rate": 2.440689780167208e-06, "loss": 0.0248, "step": 5579 }, { "epoch": 2.5386715195632394, "grad_norm": 1.6146292023061026, "learning_rate": 2.439975335731664e-06, "loss": 0.0438, "step": 5580 }, { "epoch": 2.5391264786169243, "grad_norm": 1.69876128355802, "learning_rate": 2.4392608962010654e-06, "loss": 0.05, "step": 5581 }, { "epoch": 2.5395814376706096, "grad_norm": 1.1278575100593862, "learning_rate": 2.438546461633791e-06, "loss": 0.0296, "step": 5582 }, { "epoch": 2.540036396724295, "grad_norm": 1.2558764013056256, "learning_rate": 2.4378320320882235e-06, "loss": 0.0338, "step": 5583 }, { "epoch": 2.54049135577798, "grad_norm": 1.300336563501376, "learning_rate": 2.43711760762274e-06, "loss": 0.0519, "step": 5584 }, { "epoch": 2.540946314831665, "grad_norm": 1.4631411211550263, "learning_rate": 2.4364031882957223e-06, "loss": 0.0715, "step": 5585 }, { "epoch": 2.5414012738853504, "grad_norm": 1.2031861079539652, "learning_rate": 2.4356887741655496e-06, "loss": 0.0308, "step": 5586 }, { "epoch": 2.5418562329390353, "grad_norm": 1.3235358188353066, "learning_rate": 2.434974365290599e-06, "loss": 0.0363, "step": 5587 }, { "epoch": 2.5423111919927206, "grad_norm": 1.0032120034121397, "learning_rate": 2.43425996172925e-06, "loss": 0.0466, "step": 5588 }, { "epoch": 2.542766151046406, "grad_norm": 1.0510362235729844, "learning_rate": 2.4335455635398796e-06, "loss": 0.0401, "step": 5589 }, { "epoch": 2.5432211101000908, "grad_norm": 1.693628011531299, "learning_rate": 2.4328311707808666e-06, "loss": 0.0451, "step": 5590 }, { "epoch": 2.543676069153776, "grad_norm": 1.207855322605014, "learning_rate": 2.4321167835105855e-06, "loss": 0.0487, "step": 5591 }, { "epoch": 2.5441310282074614, "grad_norm": 1.5907401397905223, "learning_rate": 2.4314024017874152e-06, "loss": 0.0574, "step": 5592 }, { "epoch": 2.5445859872611463, "grad_norm": 1.2854299487701824, "learning_rate": 2.430688025669731e-06, "loss": 0.0348, "step": 5593 }, { "epoch": 2.5450409463148316, "grad_norm": 1.1839980470304428, "learning_rate": 2.429973655215908e-06, "loss": 0.0248, "step": 5594 }, { "epoch": 2.545495905368517, "grad_norm": 1.2881813772213309, "learning_rate": 2.429259290484322e-06, "loss": 0.0551, "step": 5595 }, { "epoch": 2.5459508644222018, "grad_norm": 0.8249892490304929, "learning_rate": 2.428544931533347e-06, "loss": 0.0152, "step": 5596 }, { "epoch": 2.546405823475887, "grad_norm": 1.3749800808350576, "learning_rate": 2.4278305784213583e-06, "loss": 0.059, "step": 5597 }, { "epoch": 2.5468607825295724, "grad_norm": 1.3180712241410177, "learning_rate": 2.4271162312067274e-06, "loss": 0.0727, "step": 5598 }, { "epoch": 2.5473157415832572, "grad_norm": 1.1343223371773048, "learning_rate": 2.426401889947829e-06, "loss": 0.0175, "step": 5599 }, { "epoch": 2.5477707006369426, "grad_norm": 1.1654452946510612, "learning_rate": 2.4256875547030372e-06, "loss": 0.0459, "step": 5600 }, { "epoch": 2.548225659690628, "grad_norm": 1.0954792947188483, "learning_rate": 2.424973225530722e-06, "loss": 0.0183, "step": 5601 }, { "epoch": 2.548680618744313, "grad_norm": 1.1150015879193569, "learning_rate": 2.4242589024892564e-06, "loss": 0.0338, "step": 5602 }, { "epoch": 2.549135577797998, "grad_norm": 1.0904708507238057, "learning_rate": 2.423544585637011e-06, "loss": 0.0439, "step": 5603 }, { "epoch": 2.5495905368516834, "grad_norm": 1.1011116796001776, "learning_rate": 2.422830275032357e-06, "loss": 0.0385, "step": 5604 }, { "epoch": 2.5500454959053687, "grad_norm": 1.075878115594318, "learning_rate": 2.4221159707336633e-06, "loss": 0.0447, "step": 5605 }, { "epoch": 2.5505004549590535, "grad_norm": 1.3691925724032799, "learning_rate": 2.421401672799302e-06, "loss": 0.0527, "step": 5606 }, { "epoch": 2.550955414012739, "grad_norm": 1.2459964225210791, "learning_rate": 2.4206873812876404e-06, "loss": 0.0263, "step": 5607 }, { "epoch": 2.551410373066424, "grad_norm": 1.295275670186209, "learning_rate": 2.419973096257048e-06, "loss": 0.0309, "step": 5608 }, { "epoch": 2.5518653321201095, "grad_norm": 1.67366669105733, "learning_rate": 2.4192588177658934e-06, "loss": 0.0469, "step": 5609 }, { "epoch": 2.5523202911737943, "grad_norm": 1.6262121529492062, "learning_rate": 2.418544545872543e-06, "loss": 0.0683, "step": 5610 }, { "epoch": 2.5527752502274796, "grad_norm": 1.31603376139169, "learning_rate": 2.4178302806353646e-06, "loss": 0.023, "step": 5611 }, { "epoch": 2.553230209281165, "grad_norm": 1.4942115583200535, "learning_rate": 2.4171160221127236e-06, "loss": 0.032, "step": 5612 }, { "epoch": 2.55368516833485, "grad_norm": 1.4902730824668702, "learning_rate": 2.4164017703629885e-06, "loss": 0.0343, "step": 5613 }, { "epoch": 2.554140127388535, "grad_norm": 0.8396662739506405, "learning_rate": 2.4156875254445224e-06, "loss": 0.0338, "step": 5614 }, { "epoch": 2.5545950864422204, "grad_norm": 1.4004132932784032, "learning_rate": 2.4149732874156915e-06, "loss": 0.0273, "step": 5615 }, { "epoch": 2.5550500454959053, "grad_norm": 1.388962233721564, "learning_rate": 2.414259056334859e-06, "loss": 0.0428, "step": 5616 }, { "epoch": 2.5555050045495906, "grad_norm": 1.178703191450467, "learning_rate": 2.4135448322603896e-06, "loss": 0.0603, "step": 5617 }, { "epoch": 2.555959963603276, "grad_norm": 1.4681983537694012, "learning_rate": 2.4128306152506457e-06, "loss": 0.0435, "step": 5618 }, { "epoch": 2.556414922656961, "grad_norm": 1.417904914246492, "learning_rate": 2.4121164053639902e-06, "loss": 0.0201, "step": 5619 }, { "epoch": 2.556869881710646, "grad_norm": 1.788076954984443, "learning_rate": 2.4114022026587854e-06, "loss": 0.0297, "step": 5620 }, { "epoch": 2.5573248407643314, "grad_norm": 1.300958666230986, "learning_rate": 2.4106880071933923e-06, "loss": 0.0426, "step": 5621 }, { "epoch": 2.5577797998180163, "grad_norm": 1.3951739838002077, "learning_rate": 2.4099738190261727e-06, "loss": 0.0417, "step": 5622 }, { "epoch": 2.5582347588717016, "grad_norm": 1.4208977128932119, "learning_rate": 2.4092596382154855e-06, "loss": 0.0454, "step": 5623 }, { "epoch": 2.558689717925387, "grad_norm": 1.3159118103202607, "learning_rate": 2.4085454648196912e-06, "loss": 0.0341, "step": 5624 }, { "epoch": 2.5591446769790718, "grad_norm": 1.868458328219338, "learning_rate": 2.407831298897148e-06, "loss": 0.0534, "step": 5625 }, { "epoch": 2.559599636032757, "grad_norm": 1.2704894789639154, "learning_rate": 2.4071171405062145e-06, "loss": 0.034, "step": 5626 }, { "epoch": 2.5600545950864424, "grad_norm": 1.018212466287081, "learning_rate": 2.4064029897052495e-06, "loss": 0.0377, "step": 5627 }, { "epoch": 2.5605095541401273, "grad_norm": 1.5856424908996767, "learning_rate": 2.4056888465526095e-06, "loss": 0.0771, "step": 5628 }, { "epoch": 2.5609645131938126, "grad_norm": 1.696265395015431, "learning_rate": 2.4049747111066513e-06, "loss": 0.0591, "step": 5629 }, { "epoch": 2.561419472247498, "grad_norm": 1.626139237978395, "learning_rate": 2.40426058342573e-06, "loss": 0.0463, "step": 5630 }, { "epoch": 2.5618744313011828, "grad_norm": 1.4501912250417655, "learning_rate": 2.403546463568202e-06, "loss": 0.0426, "step": 5631 }, { "epoch": 2.562329390354868, "grad_norm": 1.8631472567435432, "learning_rate": 2.402832351592421e-06, "loss": 0.0614, "step": 5632 }, { "epoch": 2.5627843494085534, "grad_norm": 1.0524538789515478, "learning_rate": 2.4021182475567404e-06, "loss": 0.0318, "step": 5633 }, { "epoch": 2.5632393084622382, "grad_norm": 0.9271748100035777, "learning_rate": 2.401404151519516e-06, "loss": 0.0391, "step": 5634 }, { "epoch": 2.5636942675159236, "grad_norm": 1.4498218681613977, "learning_rate": 2.400690063539099e-06, "loss": 0.0206, "step": 5635 }, { "epoch": 2.564149226569609, "grad_norm": 1.8266710500386767, "learning_rate": 2.3999759836738415e-06, "loss": 0.0608, "step": 5636 }, { "epoch": 2.5646041856232937, "grad_norm": 1.6559559579497194, "learning_rate": 2.3992619119820945e-06, "loss": 0.0314, "step": 5637 }, { "epoch": 2.565059144676979, "grad_norm": 1.525020818731364, "learning_rate": 2.39854784852221e-06, "loss": 0.0398, "step": 5638 }, { "epoch": 2.5655141037306644, "grad_norm": 2.777418873003795, "learning_rate": 2.3978337933525366e-06, "loss": 0.0427, "step": 5639 }, { "epoch": 2.565969062784349, "grad_norm": 1.3410612241622977, "learning_rate": 2.397119746531423e-06, "loss": 0.0321, "step": 5640 }, { "epoch": 2.5664240218380345, "grad_norm": 1.0457402660414623, "learning_rate": 2.3964057081172205e-06, "loss": 0.0412, "step": 5641 }, { "epoch": 2.56687898089172, "grad_norm": 1.7111410269040228, "learning_rate": 2.395691678168276e-06, "loss": 0.0886, "step": 5642 }, { "epoch": 2.5673339399454047, "grad_norm": 1.25764324873156, "learning_rate": 2.3949776567429358e-06, "loss": 0.0298, "step": 5643 }, { "epoch": 2.56778889899909, "grad_norm": 1.3669208030843554, "learning_rate": 2.3942636438995478e-06, "loss": 0.0257, "step": 5644 }, { "epoch": 2.5682438580527753, "grad_norm": 1.0696651183072534, "learning_rate": 2.3935496396964565e-06, "loss": 0.0514, "step": 5645 }, { "epoch": 2.56869881710646, "grad_norm": 1.7511344752956242, "learning_rate": 2.3928356441920087e-06, "loss": 0.0384, "step": 5646 }, { "epoch": 2.5691537761601455, "grad_norm": 1.604099119040206, "learning_rate": 2.392121657444547e-06, "loss": 0.0788, "step": 5647 }, { "epoch": 2.569608735213831, "grad_norm": 1.2303723757832998, "learning_rate": 2.391407679512417e-06, "loss": 0.0525, "step": 5648 }, { "epoch": 2.5700636942675157, "grad_norm": 1.0723298668254626, "learning_rate": 2.390693710453961e-06, "loss": 0.0285, "step": 5649 }, { "epoch": 2.570518653321201, "grad_norm": 0.874799469381408, "learning_rate": 2.3899797503275214e-06, "loss": 0.0306, "step": 5650 }, { "epoch": 2.5709736123748863, "grad_norm": 1.8329462568368757, "learning_rate": 2.38926579919144e-06, "loss": 0.0778, "step": 5651 }, { "epoch": 2.571428571428571, "grad_norm": 1.1220390002692775, "learning_rate": 2.388551857104057e-06, "loss": 0.0265, "step": 5652 }, { "epoch": 2.5718835304822565, "grad_norm": 0.9452557901718116, "learning_rate": 2.3878379241237136e-06, "loss": 0.0291, "step": 5653 }, { "epoch": 2.572338489535942, "grad_norm": 1.1511107949087234, "learning_rate": 2.387124000308747e-06, "loss": 0.0388, "step": 5654 }, { "epoch": 2.5727934485896267, "grad_norm": 1.137144152450151, "learning_rate": 2.3864100857174985e-06, "loss": 0.0869, "step": 5655 }, { "epoch": 2.573248407643312, "grad_norm": 1.0938670888525541, "learning_rate": 2.385696180408305e-06, "loss": 0.0221, "step": 5656 }, { "epoch": 2.5737033666969973, "grad_norm": 1.0225691131894297, "learning_rate": 2.384982284439503e-06, "loss": 0.0519, "step": 5657 }, { "epoch": 2.5741583257506826, "grad_norm": 1.224010513839223, "learning_rate": 2.3842683978694296e-06, "loss": 0.0279, "step": 5658 }, { "epoch": 2.5746132848043675, "grad_norm": 1.1235329964306733, "learning_rate": 2.38355452075642e-06, "loss": 0.0548, "step": 5659 }, { "epoch": 2.5750682438580528, "grad_norm": 1.8360653962014188, "learning_rate": 2.382840653158809e-06, "loss": 0.035, "step": 5660 }, { "epoch": 2.575523202911738, "grad_norm": 1.2030357532056195, "learning_rate": 2.3821267951349306e-06, "loss": 0.0529, "step": 5661 }, { "epoch": 2.575978161965423, "grad_norm": 1.6473691147322327, "learning_rate": 2.381412946743118e-06, "loss": 0.0568, "step": 5662 }, { "epoch": 2.5764331210191083, "grad_norm": 1.5147755901061837, "learning_rate": 2.3806991080417046e-06, "loss": 0.0492, "step": 5663 }, { "epoch": 2.5768880800727936, "grad_norm": 1.3993482296316013, "learning_rate": 2.3799852790890208e-06, "loss": 0.0623, "step": 5664 }, { "epoch": 2.577343039126479, "grad_norm": 1.5669824003406856, "learning_rate": 2.3792714599433988e-06, "loss": 0.0542, "step": 5665 }, { "epoch": 2.5777979981801638, "grad_norm": 1.3143150472424106, "learning_rate": 2.378557650663167e-06, "loss": 0.0383, "step": 5666 }, { "epoch": 2.578252957233849, "grad_norm": 1.8320784881555565, "learning_rate": 2.377843851306656e-06, "loss": 0.0698, "step": 5667 }, { "epoch": 2.5787079162875344, "grad_norm": 1.0465560528907596, "learning_rate": 2.377130061932193e-06, "loss": 0.0269, "step": 5668 }, { "epoch": 2.5791628753412192, "grad_norm": 0.9723393070641713, "learning_rate": 2.3764162825981067e-06, "loss": 0.0258, "step": 5669 }, { "epoch": 2.5796178343949046, "grad_norm": 1.2476841176866118, "learning_rate": 2.3757025133627246e-06, "loss": 0.0483, "step": 5670 }, { "epoch": 2.58007279344859, "grad_norm": 1.5657578740092402, "learning_rate": 2.374988754284371e-06, "loss": 0.0801, "step": 5671 }, { "epoch": 2.5805277525022747, "grad_norm": 1.2705522934095026, "learning_rate": 2.3742750054213728e-06, "loss": 0.0193, "step": 5672 }, { "epoch": 2.58098271155596, "grad_norm": 1.2676285163338399, "learning_rate": 2.3735612668320522e-06, "loss": 0.0562, "step": 5673 }, { "epoch": 2.5814376706096454, "grad_norm": 1.5899403453035004, "learning_rate": 2.372847538574735e-06, "loss": 0.0381, "step": 5674 }, { "epoch": 2.58189262966333, "grad_norm": 1.2886914896749688, "learning_rate": 2.3721338207077413e-06, "loss": 0.0539, "step": 5675 }, { "epoch": 2.5823475887170155, "grad_norm": 1.0598733279864154, "learning_rate": 2.371420113289395e-06, "loss": 0.0247, "step": 5676 }, { "epoch": 2.582802547770701, "grad_norm": 1.6664951213732633, "learning_rate": 2.370706416378017e-06, "loss": 0.062, "step": 5677 }, { "epoch": 2.5832575068243857, "grad_norm": 0.972427732804997, "learning_rate": 2.3699927300319262e-06, "loss": 0.0376, "step": 5678 }, { "epoch": 2.583712465878071, "grad_norm": 1.6803912097460523, "learning_rate": 2.3692790543094427e-06, "loss": 0.0657, "step": 5679 }, { "epoch": 2.5841674249317563, "grad_norm": 2.3427268890623294, "learning_rate": 2.3685653892688845e-06, "loss": 0.0773, "step": 5680 }, { "epoch": 2.584622383985441, "grad_norm": 1.5709734357855352, "learning_rate": 2.367851734968569e-06, "loss": 0.0597, "step": 5681 }, { "epoch": 2.5850773430391265, "grad_norm": 1.5082027168933945, "learning_rate": 2.367138091466812e-06, "loss": 0.0408, "step": 5682 }, { "epoch": 2.585532302092812, "grad_norm": 1.7894785035630558, "learning_rate": 2.3664244588219315e-06, "loss": 0.0487, "step": 5683 }, { "epoch": 2.5859872611464967, "grad_norm": 1.755154474255159, "learning_rate": 2.3657108370922405e-06, "loss": 0.0302, "step": 5684 }, { "epoch": 2.586442220200182, "grad_norm": 1.2015758890059645, "learning_rate": 2.364997226336054e-06, "loss": 0.0625, "step": 5685 }, { "epoch": 2.5868971792538673, "grad_norm": 1.2930620697608937, "learning_rate": 2.3642836266116836e-06, "loss": 0.0334, "step": 5686 }, { "epoch": 2.587352138307552, "grad_norm": 1.4456837093378156, "learning_rate": 2.3635700379774436e-06, "loss": 0.0469, "step": 5687 }, { "epoch": 2.5878070973612375, "grad_norm": 1.4078533542985612, "learning_rate": 2.362856460491643e-06, "loss": 0.0518, "step": 5688 }, { "epoch": 2.588262056414923, "grad_norm": 1.7297322339655012, "learning_rate": 2.3621428942125923e-06, "loss": 0.0346, "step": 5689 }, { "epoch": 2.5887170154686077, "grad_norm": 1.356358913742497, "learning_rate": 2.361429339198603e-06, "loss": 0.036, "step": 5690 }, { "epoch": 2.589171974522293, "grad_norm": 1.3383891317153027, "learning_rate": 2.3607157955079817e-06, "loss": 0.026, "step": 5691 }, { "epoch": 2.5896269335759783, "grad_norm": 1.6168494720367201, "learning_rate": 2.3600022631990376e-06, "loss": 0.0601, "step": 5692 }, { "epoch": 2.590081892629663, "grad_norm": 1.5472006653694907, "learning_rate": 2.3592887423300752e-06, "loss": 0.0493, "step": 5693 }, { "epoch": 2.5905368516833485, "grad_norm": 2.0311147374244474, "learning_rate": 2.3585752329594026e-06, "loss": 0.0646, "step": 5694 }, { "epoch": 2.5909918107370338, "grad_norm": 1.235238117801551, "learning_rate": 2.357861735145322e-06, "loss": 0.0284, "step": 5695 }, { "epoch": 2.5914467697907186, "grad_norm": 0.8945839289808152, "learning_rate": 2.3571482489461383e-06, "loss": 0.0139, "step": 5696 }, { "epoch": 2.591901728844404, "grad_norm": 1.1016567992735011, "learning_rate": 2.3564347744201556e-06, "loss": 0.0487, "step": 5697 }, { "epoch": 2.5923566878980893, "grad_norm": 1.8390793450365195, "learning_rate": 2.3557213116256745e-06, "loss": 0.0399, "step": 5698 }, { "epoch": 2.592811646951774, "grad_norm": 1.61444704008722, "learning_rate": 2.3550078606209963e-06, "loss": 0.0344, "step": 5699 }, { "epoch": 2.5932666060054594, "grad_norm": 1.0647051464289667, "learning_rate": 2.354294421464421e-06, "loss": 0.0385, "step": 5700 }, { "epoch": 2.5937215650591448, "grad_norm": 1.9284438731807443, "learning_rate": 2.3535809942142478e-06, "loss": 0.0449, "step": 5701 }, { "epoch": 2.5941765241128296, "grad_norm": 1.3411248008894583, "learning_rate": 2.352867578928774e-06, "loss": 0.0289, "step": 5702 }, { "epoch": 2.594631483166515, "grad_norm": 1.3227141783320804, "learning_rate": 2.3521541756662965e-06, "loss": 0.0293, "step": 5703 }, { "epoch": 2.5950864422202002, "grad_norm": 1.6591667052010635, "learning_rate": 2.3514407844851133e-06, "loss": 0.0372, "step": 5704 }, { "epoch": 2.595541401273885, "grad_norm": 1.3149320752656068, "learning_rate": 2.350727405443518e-06, "loss": 0.0556, "step": 5705 }, { "epoch": 2.5959963603275704, "grad_norm": 1.4622062804470484, "learning_rate": 2.350014038599805e-06, "loss": 0.0266, "step": 5706 }, { "epoch": 2.5964513193812557, "grad_norm": 1.7949855861209827, "learning_rate": 2.3493006840122676e-06, "loss": 0.0628, "step": 5707 }, { "epoch": 2.5969062784349406, "grad_norm": 2.39957028995805, "learning_rate": 2.348587341739198e-06, "loss": 0.0775, "step": 5708 }, { "epoch": 2.597361237488626, "grad_norm": 1.1163677144403035, "learning_rate": 2.3478740118388865e-06, "loss": 0.0207, "step": 5709 }, { "epoch": 2.597816196542311, "grad_norm": 1.3134700692549783, "learning_rate": 2.3471606943696232e-06, "loss": 0.0284, "step": 5710 }, { "epoch": 2.598271155595996, "grad_norm": 1.0327832486664614, "learning_rate": 2.346447389389699e-06, "loss": 0.0165, "step": 5711 }, { "epoch": 2.5987261146496814, "grad_norm": 1.1736485881393444, "learning_rate": 2.3457340969573995e-06, "loss": 0.0408, "step": 5712 }, { "epoch": 2.5991810737033667, "grad_norm": 1.3608495021871192, "learning_rate": 2.345020817131014e-06, "loss": 0.0506, "step": 5713 }, { "epoch": 2.599636032757052, "grad_norm": 1.1250526535646734, "learning_rate": 2.3443075499688277e-06, "loss": 0.0416, "step": 5714 }, { "epoch": 2.600090991810737, "grad_norm": 1.3183729989341642, "learning_rate": 2.343594295529124e-06, "loss": 0.0273, "step": 5715 }, { "epoch": 2.600545950864422, "grad_norm": 1.1353401462665478, "learning_rate": 2.34288105387019e-06, "loss": 0.0399, "step": 5716 }, { "epoch": 2.6010009099181075, "grad_norm": 1.7605677566518638, "learning_rate": 2.3421678250503043e-06, "loss": 0.025, "step": 5717 }, { "epoch": 2.6014558689717924, "grad_norm": 1.146001776437578, "learning_rate": 2.3414546091277524e-06, "loss": 0.0267, "step": 5718 }, { "epoch": 2.6019108280254777, "grad_norm": 1.0662320527680487, "learning_rate": 2.3407414061608142e-06, "loss": 0.0292, "step": 5719 }, { "epoch": 2.602365787079163, "grad_norm": 1.0048191183940594, "learning_rate": 2.340028216207768e-06, "loss": 0.0235, "step": 5720 }, { "epoch": 2.6028207461328483, "grad_norm": 1.76504640084546, "learning_rate": 2.3393150393268952e-06, "loss": 0.0481, "step": 5721 }, { "epoch": 2.603275705186533, "grad_norm": 0.867543118130427, "learning_rate": 2.3386018755764704e-06, "loss": 0.0187, "step": 5722 }, { "epoch": 2.6037306642402185, "grad_norm": 1.7730760545511253, "learning_rate": 2.3378887250147724e-06, "loss": 0.0449, "step": 5723 }, { "epoch": 2.604185623293904, "grad_norm": 1.7420066079050467, "learning_rate": 2.3371755877000747e-06, "loss": 0.0456, "step": 5724 }, { "epoch": 2.6046405823475887, "grad_norm": 1.163075229990015, "learning_rate": 2.3364624636906528e-06, "loss": 0.0373, "step": 5725 }, { "epoch": 2.605095541401274, "grad_norm": 1.3827435424537409, "learning_rate": 2.3357493530447806e-06, "loss": 0.0419, "step": 5726 }, { "epoch": 2.6055505004549593, "grad_norm": 1.1432617707541932, "learning_rate": 2.335036255820729e-06, "loss": 0.0494, "step": 5727 }, { "epoch": 2.606005459508644, "grad_norm": 1.373629667356988, "learning_rate": 2.33432317207677e-06, "loss": 0.0285, "step": 5728 }, { "epoch": 2.6064604185623295, "grad_norm": 1.089813923313114, "learning_rate": 2.3336101018711726e-06, "loss": 0.0291, "step": 5729 }, { "epoch": 2.6069153776160148, "grad_norm": 2.0837631560123273, "learning_rate": 2.332897045262207e-06, "loss": 0.0422, "step": 5730 }, { "epoch": 2.6073703366696996, "grad_norm": 1.6093149610968218, "learning_rate": 2.3321840023081392e-06, "loss": 0.04, "step": 5731 }, { "epoch": 2.607825295723385, "grad_norm": 1.340441189094497, "learning_rate": 2.331470973067237e-06, "loss": 0.0631, "step": 5732 }, { "epoch": 2.6082802547770703, "grad_norm": 1.2092692194844186, "learning_rate": 2.330757957597767e-06, "loss": 0.0553, "step": 5733 }, { "epoch": 2.608735213830755, "grad_norm": 0.9284889255529485, "learning_rate": 2.3300449559579916e-06, "loss": 0.0295, "step": 5734 }, { "epoch": 2.6091901728844404, "grad_norm": 1.1063884896090157, "learning_rate": 2.3293319682061752e-06, "loss": 0.047, "step": 5735 }, { "epoch": 2.6096451319381258, "grad_norm": 1.3015589939427092, "learning_rate": 2.3286189944005794e-06, "loss": 0.0293, "step": 5736 }, { "epoch": 2.6101000909918106, "grad_norm": 1.4582477428013585, "learning_rate": 2.327906034599466e-06, "loss": 0.0348, "step": 5737 }, { "epoch": 2.610555050045496, "grad_norm": 1.2985476312182662, "learning_rate": 2.3271930888610927e-06, "loss": 0.0364, "step": 5738 }, { "epoch": 2.6110100090991812, "grad_norm": 1.369166352020669, "learning_rate": 2.3264801572437206e-06, "loss": 0.0196, "step": 5739 }, { "epoch": 2.611464968152866, "grad_norm": 1.105016588930666, "learning_rate": 2.325767239805607e-06, "loss": 0.049, "step": 5740 }, { "epoch": 2.6119199272065514, "grad_norm": 1.1909203785201037, "learning_rate": 2.325054336605007e-06, "loss": 0.0306, "step": 5741 }, { "epoch": 2.6123748862602367, "grad_norm": 1.2965716911788712, "learning_rate": 2.324341447700178e-06, "loss": 0.0383, "step": 5742 }, { "epoch": 2.6128298453139216, "grad_norm": 1.762184390908751, "learning_rate": 2.323628573149371e-06, "loss": 0.0863, "step": 5743 }, { "epoch": 2.613284804367607, "grad_norm": 1.4814310361956273, "learning_rate": 2.322915713010842e-06, "loss": 0.0487, "step": 5744 }, { "epoch": 2.613739763421292, "grad_norm": 2.0820707021198963, "learning_rate": 2.3222028673428394e-06, "loss": 0.0672, "step": 5745 }, { "epoch": 2.614194722474977, "grad_norm": 1.3412567924199623, "learning_rate": 2.3214900362036165e-06, "loss": 0.0397, "step": 5746 }, { "epoch": 2.6146496815286624, "grad_norm": 1.5110740365697903, "learning_rate": 2.3207772196514216e-06, "loss": 0.0436, "step": 5747 }, { "epoch": 2.6151046405823477, "grad_norm": 1.139472485807002, "learning_rate": 2.3200644177445034e-06, "loss": 0.0404, "step": 5748 }, { "epoch": 2.6155595996360326, "grad_norm": 1.267665056584967, "learning_rate": 2.3193516305411082e-06, "loss": 0.0363, "step": 5749 }, { "epoch": 2.616014558689718, "grad_norm": 1.380652340261362, "learning_rate": 2.318638858099482e-06, "loss": 0.0396, "step": 5750 }, { "epoch": 2.616469517743403, "grad_norm": 1.7410070128951463, "learning_rate": 2.317926100477869e-06, "loss": 0.0441, "step": 5751 }, { "epoch": 2.616924476797088, "grad_norm": 1.1740774333060071, "learning_rate": 2.317213357734512e-06, "loss": 0.0213, "step": 5752 }, { "epoch": 2.6173794358507734, "grad_norm": 1.4084620990684478, "learning_rate": 2.3165006299276555e-06, "loss": 0.0228, "step": 5753 }, { "epoch": 2.6178343949044587, "grad_norm": 1.4684945236170786, "learning_rate": 2.315787917115538e-06, "loss": 0.0378, "step": 5754 }, { "epoch": 2.6182893539581436, "grad_norm": 1.4850143422963749, "learning_rate": 2.3150752193564006e-06, "loss": 0.0595, "step": 5755 }, { "epoch": 2.618744313011829, "grad_norm": 1.1780732233130256, "learning_rate": 2.3143625367084802e-06, "loss": 0.0291, "step": 5756 }, { "epoch": 2.619199272065514, "grad_norm": 1.4329148699630239, "learning_rate": 2.313649869230016e-06, "loss": 0.0434, "step": 5757 }, { "epoch": 2.619654231119199, "grad_norm": 1.5817660518337056, "learning_rate": 2.3129372169792427e-06, "loss": 0.0454, "step": 5758 }, { "epoch": 2.6201091901728844, "grad_norm": 1.43641833783825, "learning_rate": 2.3122245800143944e-06, "loss": 0.0417, "step": 5759 }, { "epoch": 2.6205641492265697, "grad_norm": 1.3617958814870417, "learning_rate": 2.311511958393706e-06, "loss": 0.0229, "step": 5760 }, { "epoch": 2.6210191082802545, "grad_norm": 1.1476068296597186, "learning_rate": 2.3107993521754092e-06, "loss": 0.0294, "step": 5761 }, { "epoch": 2.62147406733394, "grad_norm": 1.5687119062805126, "learning_rate": 2.3100867614177353e-06, "loss": 0.0628, "step": 5762 }, { "epoch": 2.621929026387625, "grad_norm": 0.8503048768107889, "learning_rate": 2.3093741861789133e-06, "loss": 0.0322, "step": 5763 }, { "epoch": 2.62238398544131, "grad_norm": 1.138296005881361, "learning_rate": 2.3086616265171724e-06, "loss": 0.0487, "step": 5764 }, { "epoch": 2.6228389444949953, "grad_norm": 1.1409219519199543, "learning_rate": 2.3079490824907386e-06, "loss": 0.0478, "step": 5765 }, { "epoch": 2.6232939035486806, "grad_norm": 1.4996774508666832, "learning_rate": 2.307236554157838e-06, "loss": 0.0777, "step": 5766 }, { "epoch": 2.623748862602366, "grad_norm": 0.8169288522923935, "learning_rate": 2.3065240415766966e-06, "loss": 0.0376, "step": 5767 }, { "epoch": 2.624203821656051, "grad_norm": 1.2783170718034171, "learning_rate": 2.3058115448055363e-06, "loss": 0.034, "step": 5768 }, { "epoch": 2.624658780709736, "grad_norm": 1.4909588374771514, "learning_rate": 2.3050990639025804e-06, "loss": 0.034, "step": 5769 }, { "epoch": 2.6251137397634214, "grad_norm": 1.132041750918534, "learning_rate": 2.304386598926048e-06, "loss": 0.0314, "step": 5770 }, { "epoch": 2.6255686988171063, "grad_norm": 1.689782811550132, "learning_rate": 2.3036741499341607e-06, "loss": 0.0375, "step": 5771 }, { "epoch": 2.6260236578707916, "grad_norm": 1.4261744506360678, "learning_rate": 2.302961716985134e-06, "loss": 0.0444, "step": 5772 }, { "epoch": 2.626478616924477, "grad_norm": 1.76393514406114, "learning_rate": 2.3022493001371853e-06, "loss": 0.0581, "step": 5773 }, { "epoch": 2.6269335759781622, "grad_norm": 1.4644386037950934, "learning_rate": 2.301536899448532e-06, "loss": 0.0442, "step": 5774 }, { "epoch": 2.627388535031847, "grad_norm": 0.9930866182082181, "learning_rate": 2.3008245149773865e-06, "loss": 0.0366, "step": 5775 }, { "epoch": 2.6278434940855324, "grad_norm": 1.3607403478633464, "learning_rate": 2.300112146781963e-06, "loss": 0.0526, "step": 5776 }, { "epoch": 2.6282984531392177, "grad_norm": 1.6141493298102616, "learning_rate": 2.2993997949204714e-06, "loss": 0.0707, "step": 5777 }, { "epoch": 2.6287534121929026, "grad_norm": 1.319129392322984, "learning_rate": 2.2986874594511234e-06, "loss": 0.0296, "step": 5778 }, { "epoch": 2.629208371246588, "grad_norm": 1.1292435578113582, "learning_rate": 2.297975140432126e-06, "loss": 0.0498, "step": 5779 }, { "epoch": 2.629663330300273, "grad_norm": 1.8839729684754156, "learning_rate": 2.2972628379216877e-06, "loss": 0.0547, "step": 5780 }, { "epoch": 2.630118289353958, "grad_norm": 1.693294310205762, "learning_rate": 2.2965505519780156e-06, "loss": 0.0261, "step": 5781 }, { "epoch": 2.6305732484076434, "grad_norm": 1.4844425701617447, "learning_rate": 2.295838282659313e-06, "loss": 0.0392, "step": 5782 }, { "epoch": 2.6310282074613287, "grad_norm": 1.5138157232438012, "learning_rate": 2.2951260300237847e-06, "loss": 0.0444, "step": 5783 }, { "epoch": 2.6314831665150136, "grad_norm": 1.954852958196234, "learning_rate": 2.2944137941296323e-06, "loss": 0.0835, "step": 5784 }, { "epoch": 2.631938125568699, "grad_norm": 1.4903421471032359, "learning_rate": 2.2937015750350554e-06, "loss": 0.0488, "step": 5785 }, { "epoch": 2.632393084622384, "grad_norm": 1.4869022558263778, "learning_rate": 2.2929893727982547e-06, "loss": 0.0641, "step": 5786 }, { "epoch": 2.632848043676069, "grad_norm": 0.8798033098959384, "learning_rate": 2.2922771874774263e-06, "loss": 0.0242, "step": 5787 }, { "epoch": 2.6333030027297544, "grad_norm": 1.474834637166476, "learning_rate": 2.291565019130769e-06, "loss": 0.0774, "step": 5788 }, { "epoch": 2.6337579617834397, "grad_norm": 1.3647930316827968, "learning_rate": 2.2908528678164773e-06, "loss": 0.0639, "step": 5789 }, { "epoch": 2.6342129208371245, "grad_norm": 1.389820092667736, "learning_rate": 2.290140733592744e-06, "loss": 0.0363, "step": 5790 }, { "epoch": 2.63466787989081, "grad_norm": 1.57833809224894, "learning_rate": 2.2894286165177634e-06, "loss": 0.0228, "step": 5791 }, { "epoch": 2.635122838944495, "grad_norm": 1.0830294000955387, "learning_rate": 2.2887165166497242e-06, "loss": 0.0463, "step": 5792 }, { "epoch": 2.63557779799818, "grad_norm": 1.2779551428095977, "learning_rate": 2.288004434046818e-06, "loss": 0.0554, "step": 5793 }, { "epoch": 2.6360327570518653, "grad_norm": 1.313388279221622, "learning_rate": 2.287292368767231e-06, "loss": 0.0458, "step": 5794 }, { "epoch": 2.6364877161055507, "grad_norm": 1.2411558983677518, "learning_rate": 2.2865803208691515e-06, "loss": 0.0298, "step": 5795 }, { "epoch": 2.6369426751592355, "grad_norm": 1.1345034636818117, "learning_rate": 2.285868290410765e-06, "loss": 0.06, "step": 5796 }, { "epoch": 2.637397634212921, "grad_norm": 1.1738303061500572, "learning_rate": 2.285156277450254e-06, "loss": 0.0509, "step": 5797 }, { "epoch": 2.637852593266606, "grad_norm": 1.335173560353498, "learning_rate": 2.284444282045803e-06, "loss": 0.07, "step": 5798 }, { "epoch": 2.638307552320291, "grad_norm": 1.7879760479939364, "learning_rate": 2.2837323042555906e-06, "loss": 0.0438, "step": 5799 }, { "epoch": 2.6387625113739763, "grad_norm": 2.669886840539805, "learning_rate": 2.2830203441377984e-06, "loss": 0.0672, "step": 5800 }, { "epoch": 2.6392174704276616, "grad_norm": 1.3164385563768088, "learning_rate": 2.2823084017506025e-06, "loss": 0.0532, "step": 5801 }, { "epoch": 2.6396724294813465, "grad_norm": 1.4727933632903587, "learning_rate": 2.281596477152182e-06, "loss": 0.0398, "step": 5802 }, { "epoch": 2.640127388535032, "grad_norm": 1.118821202883955, "learning_rate": 2.280884570400711e-06, "loss": 0.0497, "step": 5803 }, { "epoch": 2.640582347588717, "grad_norm": 1.4094131063598676, "learning_rate": 2.2801726815543633e-06, "loss": 0.1082, "step": 5804 }, { "epoch": 2.641037306642402, "grad_norm": 1.6801246766877964, "learning_rate": 2.2794608106713116e-06, "loss": 0.0648, "step": 5805 }, { "epoch": 2.6414922656960873, "grad_norm": 1.4471146516591855, "learning_rate": 2.278748957809726e-06, "loss": 0.0383, "step": 5806 }, { "epoch": 2.6419472247497726, "grad_norm": 1.0310476703298108, "learning_rate": 2.2780371230277773e-06, "loss": 0.0294, "step": 5807 }, { "epoch": 2.6424021838034575, "grad_norm": 1.0482153542423722, "learning_rate": 2.2773253063836313e-06, "loss": 0.0297, "step": 5808 }, { "epoch": 2.642857142857143, "grad_norm": 1.127955531407443, "learning_rate": 2.276613507935456e-06, "loss": 0.0184, "step": 5809 }, { "epoch": 2.643312101910828, "grad_norm": 1.6619962658377485, "learning_rate": 2.2759017277414165e-06, "loss": 0.0445, "step": 5810 }, { "epoch": 2.643767060964513, "grad_norm": 1.6117193329935218, "learning_rate": 2.2751899658596755e-06, "loss": 0.0421, "step": 5811 }, { "epoch": 2.6442220200181983, "grad_norm": 1.6250907575327147, "learning_rate": 2.2744782223483956e-06, "loss": 0.0441, "step": 5812 }, { "epoch": 2.6446769790718836, "grad_norm": 1.3227713328111046, "learning_rate": 2.2737664972657367e-06, "loss": 0.0354, "step": 5813 }, { "epoch": 2.6451319381255685, "grad_norm": 1.4940391095785879, "learning_rate": 2.2730547906698582e-06, "loss": 0.0737, "step": 5814 }, { "epoch": 2.6455868971792538, "grad_norm": 1.5949164731798229, "learning_rate": 2.2723431026189165e-06, "loss": 0.0267, "step": 5815 }, { "epoch": 2.646041856232939, "grad_norm": 1.2540774271619461, "learning_rate": 2.271631433171069e-06, "loss": 0.0515, "step": 5816 }, { "epoch": 2.646496815286624, "grad_norm": 1.2035281916695417, "learning_rate": 2.2709197823844696e-06, "loss": 0.0504, "step": 5817 }, { "epoch": 2.6469517743403093, "grad_norm": 1.6205519387690714, "learning_rate": 2.2702081503172706e-06, "loss": 0.0371, "step": 5818 }, { "epoch": 2.6474067333939946, "grad_norm": 1.665231767047539, "learning_rate": 2.2694965370276244e-06, "loss": 0.0595, "step": 5819 }, { "epoch": 2.6478616924476794, "grad_norm": 1.2195361638697408, "learning_rate": 2.2687849425736806e-06, "loss": 0.0456, "step": 5820 }, { "epoch": 2.6483166515013647, "grad_norm": 1.2136984438612404, "learning_rate": 2.2680733670135864e-06, "loss": 0.0448, "step": 5821 }, { "epoch": 2.64877161055505, "grad_norm": 2.032103037842563, "learning_rate": 2.2673618104054885e-06, "loss": 0.0776, "step": 5822 }, { "epoch": 2.6492265696087354, "grad_norm": 1.2358120212016412, "learning_rate": 2.266650272807534e-06, "loss": 0.0331, "step": 5823 }, { "epoch": 2.6496815286624202, "grad_norm": 1.2169892358013767, "learning_rate": 2.265938754277865e-06, "loss": 0.0216, "step": 5824 }, { "epoch": 2.6501364877161055, "grad_norm": 0.9835772775157626, "learning_rate": 2.2652272548746245e-06, "loss": 0.0614, "step": 5825 }, { "epoch": 2.650591446769791, "grad_norm": 1.1387845963635916, "learning_rate": 2.264515774655952e-06, "loss": 0.022, "step": 5826 }, { "epoch": 2.6510464058234757, "grad_norm": 1.0947745383201988, "learning_rate": 2.2638043136799876e-06, "loss": 0.0613, "step": 5827 }, { "epoch": 2.651501364877161, "grad_norm": 1.536286165277198, "learning_rate": 2.2630928720048674e-06, "loss": 0.0419, "step": 5828 }, { "epoch": 2.6519563239308463, "grad_norm": 1.2270364942398981, "learning_rate": 2.262381449688727e-06, "loss": 0.0243, "step": 5829 }, { "epoch": 2.6524112829845317, "grad_norm": 1.4534495992938212, "learning_rate": 2.261670046789703e-06, "loss": 0.0561, "step": 5830 }, { "epoch": 2.6528662420382165, "grad_norm": 1.0152888155951603, "learning_rate": 2.2609586633659256e-06, "loss": 0.023, "step": 5831 }, { "epoch": 2.653321201091902, "grad_norm": 1.1351692485227576, "learning_rate": 2.2602472994755274e-06, "loss": 0.0466, "step": 5832 }, { "epoch": 2.653776160145587, "grad_norm": 1.1038600848613107, "learning_rate": 2.2595359551766367e-06, "loss": 0.0339, "step": 5833 }, { "epoch": 2.654231119199272, "grad_norm": 0.8859688957015871, "learning_rate": 2.2588246305273823e-06, "loss": 0.028, "step": 5834 }, { "epoch": 2.6546860782529573, "grad_norm": 1.2867215702673611, "learning_rate": 2.2581133255858893e-06, "loss": 0.054, "step": 5835 }, { "epoch": 2.6551410373066426, "grad_norm": 1.2833773739008405, "learning_rate": 2.2574020404102824e-06, "loss": 0.0428, "step": 5836 }, { "epoch": 2.6555959963603275, "grad_norm": 1.6666191587464119, "learning_rate": 2.2566907750586867e-06, "loss": 0.0489, "step": 5837 }, { "epoch": 2.656050955414013, "grad_norm": 1.2985885110584603, "learning_rate": 2.2559795295892214e-06, "loss": 0.0369, "step": 5838 }, { "epoch": 2.656505914467698, "grad_norm": 1.6036229651990042, "learning_rate": 2.2552683040600072e-06, "loss": 0.0565, "step": 5839 }, { "epoch": 2.656960873521383, "grad_norm": 2.035932519891711, "learning_rate": 2.254557098529162e-06, "loss": 0.0497, "step": 5840 }, { "epoch": 2.6574158325750683, "grad_norm": 1.1068511415105486, "learning_rate": 2.253845913054802e-06, "loss": 0.0261, "step": 5841 }, { "epoch": 2.6578707916287536, "grad_norm": 1.1218051142461554, "learning_rate": 2.2531347476950422e-06, "loss": 0.0336, "step": 5842 }, { "epoch": 2.6583257506824385, "grad_norm": 1.0392054340276797, "learning_rate": 2.252423602507996e-06, "loss": 0.0381, "step": 5843 }, { "epoch": 2.658780709736124, "grad_norm": 1.3511054901772184, "learning_rate": 2.2517124775517753e-06, "loss": 0.0329, "step": 5844 }, { "epoch": 2.659235668789809, "grad_norm": 1.528512744688674, "learning_rate": 2.2510013728844896e-06, "loss": 0.0607, "step": 5845 }, { "epoch": 2.659690627843494, "grad_norm": 1.5257316109182997, "learning_rate": 2.2502902885642474e-06, "loss": 0.0399, "step": 5846 }, { "epoch": 2.6601455868971793, "grad_norm": 1.03589682949899, "learning_rate": 2.249579224649155e-06, "loss": 0.0388, "step": 5847 }, { "epoch": 2.6606005459508646, "grad_norm": 1.4078353550251166, "learning_rate": 2.248868181197318e-06, "loss": 0.0298, "step": 5848 }, { "epoch": 2.6610555050045495, "grad_norm": 1.36632851209019, "learning_rate": 2.2481571582668384e-06, "loss": 0.0271, "step": 5849 }, { "epoch": 2.6615104640582348, "grad_norm": 1.5224786037765872, "learning_rate": 2.2474461559158176e-06, "loss": 0.0238, "step": 5850 }, { "epoch": 2.66196542311192, "grad_norm": 1.676827226530445, "learning_rate": 2.246735174202358e-06, "loss": 0.0362, "step": 5851 }, { "epoch": 2.662420382165605, "grad_norm": 1.050188137237914, "learning_rate": 2.2460242131845556e-06, "loss": 0.0326, "step": 5852 }, { "epoch": 2.6628753412192903, "grad_norm": 1.3917068781354038, "learning_rate": 2.2453132729205078e-06, "loss": 0.0437, "step": 5853 }, { "epoch": 2.6633303002729756, "grad_norm": 1.1538080613717259, "learning_rate": 2.244602353468309e-06, "loss": 0.0622, "step": 5854 }, { "epoch": 2.6637852593266604, "grad_norm": 1.2415168757124517, "learning_rate": 2.243891454886053e-06, "loss": 0.0504, "step": 5855 }, { "epoch": 2.6642402183803457, "grad_norm": 1.304101463794975, "learning_rate": 2.243180577231831e-06, "loss": 0.0345, "step": 5856 }, { "epoch": 2.664695177434031, "grad_norm": 0.7926066101477782, "learning_rate": 2.242469720563731e-06, "loss": 0.0132, "step": 5857 }, { "epoch": 2.665150136487716, "grad_norm": 1.1270277762742966, "learning_rate": 2.241758884939843e-06, "loss": 0.031, "step": 5858 }, { "epoch": 2.6656050955414012, "grad_norm": 1.1315798857399417, "learning_rate": 2.241048070418253e-06, "loss": 0.0464, "step": 5859 }, { "epoch": 2.6660600545950865, "grad_norm": 1.6372328182121187, "learning_rate": 2.240337277057045e-06, "loss": 0.0495, "step": 5860 }, { "epoch": 2.6665150136487714, "grad_norm": 1.2527933894419219, "learning_rate": 2.2396265049143027e-06, "loss": 0.0548, "step": 5861 }, { "epoch": 2.6669699727024567, "grad_norm": 2.3453715638857346, "learning_rate": 2.238915754048106e-06, "loss": 0.0522, "step": 5862 }, { "epoch": 2.667424931756142, "grad_norm": 1.3287575543959989, "learning_rate": 2.2382050245165355e-06, "loss": 0.0416, "step": 5863 }, { "epoch": 2.667879890809827, "grad_norm": 1.006561682235034, "learning_rate": 2.2374943163776665e-06, "loss": 0.0244, "step": 5864 }, { "epoch": 2.668334849863512, "grad_norm": 1.5361624828998013, "learning_rate": 2.2367836296895777e-06, "loss": 0.0395, "step": 5865 }, { "epoch": 2.6687898089171975, "grad_norm": 1.3376433201290328, "learning_rate": 2.2360729645103423e-06, "loss": 0.0388, "step": 5866 }, { "epoch": 2.6692447679708824, "grad_norm": 1.509100803623229, "learning_rate": 2.235362320898032e-06, "loss": 0.0503, "step": 5867 }, { "epoch": 2.6696997270245677, "grad_norm": 1.494188143144183, "learning_rate": 2.234651698910718e-06, "loss": 0.077, "step": 5868 }, { "epoch": 2.670154686078253, "grad_norm": 1.7715489338926238, "learning_rate": 2.233941098606468e-06, "loss": 0.0696, "step": 5869 }, { "epoch": 2.670609645131938, "grad_norm": 1.185493653990117, "learning_rate": 2.2332305200433514e-06, "loss": 0.0353, "step": 5870 }, { "epoch": 2.671064604185623, "grad_norm": 2.41961146884144, "learning_rate": 2.23251996327943e-06, "loss": 0.0807, "step": 5871 }, { "epoch": 2.6715195632393085, "grad_norm": 1.4745092856132758, "learning_rate": 2.23180942837277e-06, "loss": 0.0217, "step": 5872 }, { "epoch": 2.6719745222929934, "grad_norm": 1.2053064070302153, "learning_rate": 2.2310989153814334e-06, "loss": 0.0277, "step": 5873 }, { "epoch": 2.6724294813466787, "grad_norm": 1.2959800216580866, "learning_rate": 2.230388424363478e-06, "loss": 0.0359, "step": 5874 }, { "epoch": 2.672884440400364, "grad_norm": 1.2362490815502882, "learning_rate": 2.229677955376964e-06, "loss": 0.0389, "step": 5875 }, { "epoch": 2.673339399454049, "grad_norm": 1.114865971713579, "learning_rate": 2.2289675084799463e-06, "loss": 0.0318, "step": 5876 }, { "epoch": 2.673794358507734, "grad_norm": 1.8744524340929873, "learning_rate": 2.2282570837304797e-06, "loss": 0.0661, "step": 5877 }, { "epoch": 2.6742493175614195, "grad_norm": 1.371382355636352, "learning_rate": 2.2275466811866163e-06, "loss": 0.0521, "step": 5878 }, { "epoch": 2.674704276615105, "grad_norm": 1.063976323707747, "learning_rate": 2.2268363009064082e-06, "loss": 0.0487, "step": 5879 }, { "epoch": 2.6751592356687897, "grad_norm": 0.8637385436320743, "learning_rate": 2.226125942947905e-06, "loss": 0.023, "step": 5880 }, { "epoch": 2.675614194722475, "grad_norm": 0.9358703801670021, "learning_rate": 2.2254156073691517e-06, "loss": 0.024, "step": 5881 }, { "epoch": 2.6760691537761603, "grad_norm": 1.620769336108698, "learning_rate": 2.2247052942281958e-06, "loss": 0.059, "step": 5882 }, { "epoch": 2.676524112829845, "grad_norm": 0.9565687333501728, "learning_rate": 2.2239950035830797e-06, "loss": 0.0338, "step": 5883 }, { "epoch": 2.6769790718835305, "grad_norm": 1.5422160326484022, "learning_rate": 2.223284735491846e-06, "loss": 0.0371, "step": 5884 }, { "epoch": 2.6774340309372158, "grad_norm": 1.3517782081174057, "learning_rate": 2.2225744900125324e-06, "loss": 0.0453, "step": 5885 }, { "epoch": 2.677888989990901, "grad_norm": 1.226339150203925, "learning_rate": 2.2218642672031794e-06, "loss": 0.0211, "step": 5886 }, { "epoch": 2.678343949044586, "grad_norm": 1.1460092884714945, "learning_rate": 2.2211540671218236e-06, "loss": 0.0336, "step": 5887 }, { "epoch": 2.6787989080982713, "grad_norm": 1.020759933000087, "learning_rate": 2.2204438898264973e-06, "loss": 0.0379, "step": 5888 }, { "epoch": 2.6792538671519566, "grad_norm": 1.2027823636587427, "learning_rate": 2.219733735375234e-06, "loss": 0.0286, "step": 5889 }, { "epoch": 2.6797088262056414, "grad_norm": 1.5500209222127528, "learning_rate": 2.2190236038260647e-06, "loss": 0.0784, "step": 5890 }, { "epoch": 2.6801637852593267, "grad_norm": 1.3708700711352408, "learning_rate": 2.2183134952370154e-06, "loss": 0.0506, "step": 5891 }, { "epoch": 2.680618744313012, "grad_norm": 0.9574483288025054, "learning_rate": 2.2176034096661174e-06, "loss": 0.0507, "step": 5892 }, { "epoch": 2.681073703366697, "grad_norm": 0.987600614689962, "learning_rate": 2.2168933471713933e-06, "loss": 0.018, "step": 5893 }, { "epoch": 2.6815286624203822, "grad_norm": 1.9813567005975228, "learning_rate": 2.2161833078108657e-06, "loss": 0.053, "step": 5894 }, { "epoch": 2.6819836214740675, "grad_norm": 1.4481128188269823, "learning_rate": 2.215473291642557e-06, "loss": 0.0222, "step": 5895 }, { "epoch": 2.6824385805277524, "grad_norm": 1.884587091475724, "learning_rate": 2.2147632987244855e-06, "loss": 0.063, "step": 5896 }, { "epoch": 2.6828935395814377, "grad_norm": 2.7864194234778057, "learning_rate": 2.2140533291146697e-06, "loss": 0.0393, "step": 5897 }, { "epoch": 2.683348498635123, "grad_norm": 1.1473881575380345, "learning_rate": 2.2133433828711235e-06, "loss": 0.0601, "step": 5898 }, { "epoch": 2.683803457688808, "grad_norm": 2.047623648675534, "learning_rate": 2.212633460051862e-06, "loss": 0.039, "step": 5899 }, { "epoch": 2.684258416742493, "grad_norm": 1.0576224730963764, "learning_rate": 2.211923560714897e-06, "loss": 0.0177, "step": 5900 }, { "epoch": 2.6847133757961785, "grad_norm": 1.0051439788424796, "learning_rate": 2.211213684918237e-06, "loss": 0.0314, "step": 5901 }, { "epoch": 2.6851683348498634, "grad_norm": 1.226008704807055, "learning_rate": 2.2105038327198914e-06, "loss": 0.0462, "step": 5902 }, { "epoch": 2.6856232939035487, "grad_norm": 1.6747395826791187, "learning_rate": 2.209794004177864e-06, "loss": 0.0502, "step": 5903 }, { "epoch": 2.686078252957234, "grad_norm": 1.2629195578121142, "learning_rate": 2.2090841993501614e-06, "loss": 0.0333, "step": 5904 }, { "epoch": 2.686533212010919, "grad_norm": 1.3017445936583356, "learning_rate": 2.2083744182947828e-06, "loss": 0.0254, "step": 5905 }, { "epoch": 2.686988171064604, "grad_norm": 1.1466562049182458, "learning_rate": 2.2076646610697304e-06, "loss": 0.0236, "step": 5906 }, { "epoch": 2.6874431301182895, "grad_norm": 1.469571713357236, "learning_rate": 2.206954927733002e-06, "loss": 0.0576, "step": 5907 }, { "epoch": 2.6878980891719744, "grad_norm": 1.4153953083044666, "learning_rate": 2.206245218342593e-06, "loss": 0.0694, "step": 5908 }, { "epoch": 2.6883530482256597, "grad_norm": 1.2529268810429626, "learning_rate": 2.205535532956499e-06, "loss": 0.049, "step": 5909 }, { "epoch": 2.688808007279345, "grad_norm": 1.456677336667436, "learning_rate": 2.2048258716327107e-06, "loss": 0.0485, "step": 5910 }, { "epoch": 2.68926296633303, "grad_norm": 1.4295623259944956, "learning_rate": 2.20411623442922e-06, "loss": 0.0622, "step": 5911 }, { "epoch": 2.689717925386715, "grad_norm": 1.5819969963480511, "learning_rate": 2.203406621404013e-06, "loss": 0.0359, "step": 5912 }, { "epoch": 2.6901728844404005, "grad_norm": 1.1864241953797379, "learning_rate": 2.202697032615078e-06, "loss": 0.0378, "step": 5913 }, { "epoch": 2.6906278434940853, "grad_norm": 1.618310887206484, "learning_rate": 2.2019874681204e-06, "loss": 0.0736, "step": 5914 }, { "epoch": 2.6910828025477707, "grad_norm": 1.2147388257238685, "learning_rate": 2.2012779279779593e-06, "loss": 0.0294, "step": 5915 }, { "epoch": 2.691537761601456, "grad_norm": 1.296026447230426, "learning_rate": 2.2005684122457377e-06, "loss": 0.0663, "step": 5916 }, { "epoch": 2.691992720655141, "grad_norm": 0.8666764868080868, "learning_rate": 2.199858920981713e-06, "loss": 0.022, "step": 5917 }, { "epoch": 2.692447679708826, "grad_norm": 0.9786601028083441, "learning_rate": 2.199149454243862e-06, "loss": 0.0338, "step": 5918 }, { "epoch": 2.6929026387625115, "grad_norm": 1.3529150472699571, "learning_rate": 2.1984400120901585e-06, "loss": 0.0327, "step": 5919 }, { "epoch": 2.6933575978161963, "grad_norm": 1.5830775882729409, "learning_rate": 2.1977305945785756e-06, "loss": 0.0363, "step": 5920 }, { "epoch": 2.6938125568698816, "grad_norm": 1.375261559530157, "learning_rate": 2.197021201767084e-06, "loss": 0.0214, "step": 5921 }, { "epoch": 2.694267515923567, "grad_norm": 1.2935567174612754, "learning_rate": 2.1963118337136508e-06, "loss": 0.0504, "step": 5922 }, { "epoch": 2.694722474977252, "grad_norm": 1.5885734242261698, "learning_rate": 2.195602490476244e-06, "loss": 0.0302, "step": 5923 }, { "epoch": 2.695177434030937, "grad_norm": 1.1986893531514495, "learning_rate": 2.1948931721128262e-06, "loss": 0.0405, "step": 5924 }, { "epoch": 2.6956323930846224, "grad_norm": 1.3821106174222006, "learning_rate": 2.194183878681361e-06, "loss": 0.0349, "step": 5925 }, { "epoch": 2.6960873521383073, "grad_norm": 1.98109251555538, "learning_rate": 2.1934746102398076e-06, "loss": 0.0585, "step": 5926 }, { "epoch": 2.6965423111919926, "grad_norm": 1.7424497685645426, "learning_rate": 2.1927653668461253e-06, "loss": 0.0468, "step": 5927 }, { "epoch": 2.696997270245678, "grad_norm": 1.2005329587126594, "learning_rate": 2.1920561485582696e-06, "loss": 0.0359, "step": 5928 }, { "epoch": 2.697452229299363, "grad_norm": 1.3549669052672246, "learning_rate": 2.1913469554341953e-06, "loss": 0.0702, "step": 5929 }, { "epoch": 2.697907188353048, "grad_norm": 1.0721048845246677, "learning_rate": 2.1906377875318535e-06, "loss": 0.03, "step": 5930 }, { "epoch": 2.6983621474067334, "grad_norm": 1.4293752832955167, "learning_rate": 2.189928644909195e-06, "loss": 0.0666, "step": 5931 }, { "epoch": 2.6988171064604187, "grad_norm": 1.2293039351859312, "learning_rate": 2.1892195276241667e-06, "loss": 0.034, "step": 5932 }, { "epoch": 2.6992720655141036, "grad_norm": 1.329393795638654, "learning_rate": 2.188510435734715e-06, "loss": 0.0603, "step": 5933 }, { "epoch": 2.699727024567789, "grad_norm": 1.3741780182235417, "learning_rate": 2.1878013692987848e-06, "loss": 0.0321, "step": 5934 }, { "epoch": 2.700181983621474, "grad_norm": 1.330937003472593, "learning_rate": 2.1870923283743156e-06, "loss": 0.0551, "step": 5935 }, { "epoch": 2.700636942675159, "grad_norm": 1.6674869504809813, "learning_rate": 2.1863833130192495e-06, "loss": 0.0406, "step": 5936 }, { "epoch": 2.7010919017288444, "grad_norm": 1.288052725755841, "learning_rate": 2.185674323291522e-06, "loss": 0.1035, "step": 5937 }, { "epoch": 2.7015468607825297, "grad_norm": 2.5725124933824954, "learning_rate": 2.18496535924907e-06, "loss": 0.0461, "step": 5938 }, { "epoch": 2.702001819836215, "grad_norm": 1.4020608438469275, "learning_rate": 2.1842564209498254e-06, "loss": 0.0284, "step": 5939 }, { "epoch": 2.7024567788899, "grad_norm": 1.6963939680571734, "learning_rate": 2.18354750845172e-06, "loss": 0.0371, "step": 5940 }, { "epoch": 2.702911737943585, "grad_norm": 2.85706937974312, "learning_rate": 2.1828386218126835e-06, "loss": 0.0475, "step": 5941 }, { "epoch": 2.7033666969972705, "grad_norm": 1.3835381832648936, "learning_rate": 2.182129761090643e-06, "loss": 0.0421, "step": 5942 }, { "epoch": 2.7038216560509554, "grad_norm": 1.6051961733259574, "learning_rate": 2.1814209263435226e-06, "loss": 0.0497, "step": 5943 }, { "epoch": 2.7042766151046407, "grad_norm": 1.2971828917325257, "learning_rate": 2.1807121176292455e-06, "loss": 0.0314, "step": 5944 }, { "epoch": 2.704731574158326, "grad_norm": 1.2758176070089022, "learning_rate": 2.180003335005732e-06, "loss": 0.0305, "step": 5945 }, { "epoch": 2.705186533212011, "grad_norm": 1.211370478368446, "learning_rate": 2.1792945785309013e-06, "loss": 0.0381, "step": 5946 }, { "epoch": 2.705641492265696, "grad_norm": 1.2379857722667746, "learning_rate": 2.178585848262668e-06, "loss": 0.0305, "step": 5947 }, { "epoch": 2.7060964513193815, "grad_norm": 2.9135340518085333, "learning_rate": 2.177877144258949e-06, "loss": 0.1203, "step": 5948 }, { "epoch": 2.7065514103730663, "grad_norm": 1.434617595092852, "learning_rate": 2.1771684665776547e-06, "loss": 0.0645, "step": 5949 }, { "epoch": 2.7070063694267517, "grad_norm": 1.8581621898766885, "learning_rate": 2.1764598152766963e-06, "loss": 0.038, "step": 5950 }, { "epoch": 2.707461328480437, "grad_norm": 1.3424778676943725, "learning_rate": 2.1757511904139795e-06, "loss": 0.0327, "step": 5951 }, { "epoch": 2.707916287534122, "grad_norm": 1.2582879916472438, "learning_rate": 2.175042592047412e-06, "loss": 0.0458, "step": 5952 }, { "epoch": 2.708371246587807, "grad_norm": 1.2233593474101425, "learning_rate": 2.1743340202348956e-06, "loss": 0.0464, "step": 5953 }, { "epoch": 2.7088262056414925, "grad_norm": 0.9825416999785169, "learning_rate": 2.1736254750343324e-06, "loss": 0.011, "step": 5954 }, { "epoch": 2.7092811646951773, "grad_norm": 1.1282824424234352, "learning_rate": 2.1729169565036217e-06, "loss": 0.046, "step": 5955 }, { "epoch": 2.7097361237488626, "grad_norm": 1.1986056551219735, "learning_rate": 2.17220846470066e-06, "loss": 0.0695, "step": 5956 }, { "epoch": 2.710191082802548, "grad_norm": 1.6002362317200316, "learning_rate": 2.1714999996833434e-06, "loss": 0.0486, "step": 5957 }, { "epoch": 2.710646041856233, "grad_norm": 1.6107785151226182, "learning_rate": 2.170791561509562e-06, "loss": 0.0585, "step": 5958 }, { "epoch": 2.711101000909918, "grad_norm": 1.5281051272214985, "learning_rate": 2.170083150237209e-06, "loss": 0.0416, "step": 5959 }, { "epoch": 2.7115559599636034, "grad_norm": 1.0271008483670074, "learning_rate": 2.1693747659241695e-06, "loss": 0.0208, "step": 5960 }, { "epoch": 2.7120109190172883, "grad_norm": 1.4464482076193692, "learning_rate": 2.168666408628331e-06, "loss": 0.0412, "step": 5961 }, { "epoch": 2.7124658780709736, "grad_norm": 1.3639551970634356, "learning_rate": 2.1679580784075783e-06, "loss": 0.0496, "step": 5962 }, { "epoch": 2.712920837124659, "grad_norm": 1.4496895560984584, "learning_rate": 2.1672497753197914e-06, "loss": 0.0359, "step": 5963 }, { "epoch": 2.713375796178344, "grad_norm": 1.7262159452223929, "learning_rate": 2.1665414994228505e-06, "loss": 0.0711, "step": 5964 }, { "epoch": 2.713830755232029, "grad_norm": 1.3020281586144393, "learning_rate": 2.165833250774633e-06, "loss": 0.0594, "step": 5965 }, { "epoch": 2.7142857142857144, "grad_norm": 1.187660225518449, "learning_rate": 2.1651250294330124e-06, "loss": 0.044, "step": 5966 }, { "epoch": 2.7147406733393993, "grad_norm": 1.408382102265263, "learning_rate": 2.1644168354558623e-06, "loss": 0.0504, "step": 5967 }, { "epoch": 2.7151956323930846, "grad_norm": 1.0887479610108044, "learning_rate": 2.163708668901052e-06, "loss": 0.0256, "step": 5968 }, { "epoch": 2.71565059144677, "grad_norm": 1.0164759132645707, "learning_rate": 2.1630005298264513e-06, "loss": 0.0341, "step": 5969 }, { "epoch": 2.7161055505004548, "grad_norm": 0.9285079697646706, "learning_rate": 2.1622924182899257e-06, "loss": 0.0354, "step": 5970 }, { "epoch": 2.71656050955414, "grad_norm": 1.1579766493842227, "learning_rate": 2.1615843343493383e-06, "loss": 0.0767, "step": 5971 }, { "epoch": 2.7170154686078254, "grad_norm": 1.1465225688764018, "learning_rate": 2.1608762780625513e-06, "loss": 0.0554, "step": 5972 }, { "epoch": 2.7174704276615103, "grad_norm": 1.9984901391127963, "learning_rate": 2.1601682494874226e-06, "loss": 0.0672, "step": 5973 }, { "epoch": 2.7179253867151956, "grad_norm": 1.3021913544184491, "learning_rate": 2.1594602486818107e-06, "loss": 0.028, "step": 5974 }, { "epoch": 2.718380345768881, "grad_norm": 1.436896091960354, "learning_rate": 2.158752275703568e-06, "loss": 0.0201, "step": 5975 }, { "epoch": 2.7188353048225657, "grad_norm": 0.9065292828482833, "learning_rate": 2.1580443306105494e-06, "loss": 0.038, "step": 5976 }, { "epoch": 2.719290263876251, "grad_norm": 1.1393869379046295, "learning_rate": 2.157336413460604e-06, "loss": 0.0493, "step": 5977 }, { "epoch": 2.7197452229299364, "grad_norm": 0.7147402477259625, "learning_rate": 2.156628524311579e-06, "loss": 0.0166, "step": 5978 }, { "epoch": 2.7202001819836212, "grad_norm": 1.0450342490594267, "learning_rate": 2.155920663221321e-06, "loss": 0.0267, "step": 5979 }, { "epoch": 2.7206551410373065, "grad_norm": 1.212035355781631, "learning_rate": 2.1552128302476715e-06, "loss": 0.034, "step": 5980 }, { "epoch": 2.721110100090992, "grad_norm": 1.3171917616867153, "learning_rate": 2.1545050254484732e-06, "loss": 0.0539, "step": 5981 }, { "epoch": 2.7215650591446767, "grad_norm": 1.1852251125057824, "learning_rate": 2.1537972488815633e-06, "loss": 0.0472, "step": 5982 }, { "epoch": 2.722020018198362, "grad_norm": 1.15771628863432, "learning_rate": 2.153089500604779e-06, "loss": 0.0198, "step": 5983 }, { "epoch": 2.7224749772520473, "grad_norm": 1.6045727103814618, "learning_rate": 2.1523817806759546e-06, "loss": 0.0265, "step": 5984 }, { "epoch": 2.722929936305732, "grad_norm": 1.0625457946823418, "learning_rate": 2.151674089152921e-06, "loss": 0.035, "step": 5985 }, { "epoch": 2.7233848953594175, "grad_norm": 1.158553036959002, "learning_rate": 2.150966426093508e-06, "loss": 0.0259, "step": 5986 }, { "epoch": 2.723839854413103, "grad_norm": 1.4184034915701138, "learning_rate": 2.1502587915555423e-06, "loss": 0.0629, "step": 5987 }, { "epoch": 2.724294813466788, "grad_norm": 1.1580006537425855, "learning_rate": 2.149551185596849e-06, "loss": 0.0205, "step": 5988 }, { "epoch": 2.724749772520473, "grad_norm": 0.9559838942341601, "learning_rate": 2.148843608275249e-06, "loss": 0.0334, "step": 5989 }, { "epoch": 2.7252047315741583, "grad_norm": 1.5443105823837129, "learning_rate": 2.148136059648564e-06, "loss": 0.0356, "step": 5990 }, { "epoch": 2.7256596906278436, "grad_norm": 1.7354225374725594, "learning_rate": 2.1474285397746123e-06, "loss": 0.0595, "step": 5991 }, { "epoch": 2.7261146496815285, "grad_norm": 1.2937862690089617, "learning_rate": 2.1467210487112072e-06, "loss": 0.0394, "step": 5992 }, { "epoch": 2.726569608735214, "grad_norm": 1.3695303435782027, "learning_rate": 2.146013586516163e-06, "loss": 0.0357, "step": 5993 }, { "epoch": 2.727024567788899, "grad_norm": 1.367959849509894, "learning_rate": 2.14530615324729e-06, "loss": 0.0715, "step": 5994 }, { "epoch": 2.7274795268425844, "grad_norm": 1.3708058587157819, "learning_rate": 2.144598748962396e-06, "loss": 0.0685, "step": 5995 }, { "epoch": 2.7279344858962693, "grad_norm": 1.4444636711317322, "learning_rate": 2.1438913737192867e-06, "loss": 0.0661, "step": 5996 }, { "epoch": 2.7283894449499546, "grad_norm": 1.3440437655310615, "learning_rate": 2.143184027575767e-06, "loss": 0.0533, "step": 5997 }, { "epoch": 2.72884440400364, "grad_norm": 1.066006720764387, "learning_rate": 2.1424767105896372e-06, "loss": 0.031, "step": 5998 }, { "epoch": 2.729299363057325, "grad_norm": 1.0457262298068613, "learning_rate": 2.1417694228186957e-06, "loss": 0.0324, "step": 5999 }, { "epoch": 2.72975432211101, "grad_norm": 1.063120019056367, "learning_rate": 2.14106216432074e-06, "loss": 0.0523, "step": 6000 }, { "epoch": 2.7302092811646954, "grad_norm": 1.3861781675818878, "learning_rate": 2.140354935153563e-06, "loss": 0.0524, "step": 6001 }, { "epoch": 2.7306642402183803, "grad_norm": 1.2783901032395795, "learning_rate": 2.1396477353749564e-06, "loss": 0.0404, "step": 6002 }, { "epoch": 2.7311191992720656, "grad_norm": 3.211875327224294, "learning_rate": 2.1389405650427083e-06, "loss": 0.0539, "step": 6003 }, { "epoch": 2.731574158325751, "grad_norm": 1.1327759154893093, "learning_rate": 2.138233424214608e-06, "loss": 0.0186, "step": 6004 }, { "epoch": 2.7320291173794358, "grad_norm": 1.1041312254910824, "learning_rate": 2.1375263129484385e-06, "loss": 0.0524, "step": 6005 }, { "epoch": 2.732484076433121, "grad_norm": 1.2983293365956527, "learning_rate": 2.1368192313019817e-06, "loss": 0.0668, "step": 6006 }, { "epoch": 2.7329390354868064, "grad_norm": 1.040200770005113, "learning_rate": 2.136112179333017e-06, "loss": 0.0321, "step": 6007 }, { "epoch": 2.7333939945404913, "grad_norm": 1.75853837317855, "learning_rate": 2.135405157099322e-06, "loss": 0.0616, "step": 6008 }, { "epoch": 2.7338489535941766, "grad_norm": 2.0456525093924625, "learning_rate": 2.134698164658671e-06, "loss": 0.0793, "step": 6009 }, { "epoch": 2.734303912647862, "grad_norm": 1.21385587735698, "learning_rate": 2.1339912020688353e-06, "loss": 0.0334, "step": 6010 }, { "epoch": 2.7347588717015467, "grad_norm": 0.835887975582969, "learning_rate": 2.133284269387587e-06, "loss": 0.0304, "step": 6011 }, { "epoch": 2.735213830755232, "grad_norm": 1.4387113475981197, "learning_rate": 2.1325773666726915e-06, "loss": 0.0244, "step": 6012 }, { "epoch": 2.7356687898089174, "grad_norm": 1.4855703274118006, "learning_rate": 2.1318704939819148e-06, "loss": 0.0407, "step": 6013 }, { "epoch": 2.7361237488626022, "grad_norm": 1.1230615680781095, "learning_rate": 2.1311636513730185e-06, "loss": 0.0333, "step": 6014 }, { "epoch": 2.7365787079162875, "grad_norm": 1.0964588735344405, "learning_rate": 2.1304568389037635e-06, "loss": 0.0284, "step": 6015 }, { "epoch": 2.737033666969973, "grad_norm": 1.39748161332601, "learning_rate": 2.129750056631906e-06, "loss": 0.0497, "step": 6016 }, { "epoch": 2.7374886260236577, "grad_norm": 1.278327121646064, "learning_rate": 2.1290433046152015e-06, "loss": 0.0297, "step": 6017 }, { "epoch": 2.737943585077343, "grad_norm": 1.0813940525259995, "learning_rate": 2.128336582911404e-06, "loss": 0.0231, "step": 6018 }, { "epoch": 2.7383985441310283, "grad_norm": 1.269707205187869, "learning_rate": 2.127629891578262e-06, "loss": 0.0479, "step": 6019 }, { "epoch": 2.738853503184713, "grad_norm": 1.5474068969812709, "learning_rate": 2.1269232306735243e-06, "loss": 0.0574, "step": 6020 }, { "epoch": 2.7393084622383985, "grad_norm": 1.086233463345767, "learning_rate": 2.1262166002549346e-06, "loss": 0.0459, "step": 6021 }, { "epoch": 2.739763421292084, "grad_norm": 1.1665058161847206, "learning_rate": 2.125510000380237e-06, "loss": 0.0322, "step": 6022 }, { "epoch": 2.7402183803457687, "grad_norm": 1.2010732596888103, "learning_rate": 2.1248034311071702e-06, "loss": 0.0259, "step": 6023 }, { "epoch": 2.740673339399454, "grad_norm": 1.7074980745995874, "learning_rate": 2.1240968924934724e-06, "loss": 0.0818, "step": 6024 }, { "epoch": 2.7411282984531393, "grad_norm": 1.3975322190196071, "learning_rate": 2.12339038459688e-06, "loss": 0.0228, "step": 6025 }, { "epoch": 2.741583257506824, "grad_norm": 1.3918176472811772, "learning_rate": 2.1226839074751243e-06, "loss": 0.051, "step": 6026 }, { "epoch": 2.7420382165605095, "grad_norm": 1.2125209257760363, "learning_rate": 2.121977461185936e-06, "loss": 0.0266, "step": 6027 }, { "epoch": 2.742493175614195, "grad_norm": 1.5261332104315009, "learning_rate": 2.1212710457870416e-06, "loss": 0.0688, "step": 6028 }, { "epoch": 2.7429481346678797, "grad_norm": 1.2807810106752058, "learning_rate": 2.120564661336168e-06, "loss": 0.0482, "step": 6029 }, { "epoch": 2.743403093721565, "grad_norm": 1.4059078156447078, "learning_rate": 2.119858307891036e-06, "loss": 0.0198, "step": 6030 }, { "epoch": 2.7438580527752503, "grad_norm": 1.048390117973357, "learning_rate": 2.119151985509366e-06, "loss": 0.0435, "step": 6031 }, { "epoch": 2.744313011828935, "grad_norm": 1.7111094842806829, "learning_rate": 2.118445694248877e-06, "loss": 0.056, "step": 6032 }, { "epoch": 2.7447679708826205, "grad_norm": 2.2019712322126694, "learning_rate": 2.117739434167282e-06, "loss": 0.0715, "step": 6033 }, { "epoch": 2.745222929936306, "grad_norm": 1.1792882362508463, "learning_rate": 2.117033205322295e-06, "loss": 0.0232, "step": 6034 }, { "epoch": 2.7456778889899907, "grad_norm": 1.2561656324548196, "learning_rate": 2.1163270077716248e-06, "loss": 0.0205, "step": 6035 }, { "epoch": 2.746132848043676, "grad_norm": 1.206181035428752, "learning_rate": 2.1156208415729786e-06, "loss": 0.05, "step": 6036 }, { "epoch": 2.7465878070973613, "grad_norm": 1.9685548281875023, "learning_rate": 2.1149147067840616e-06, "loss": 0.0338, "step": 6037 }, { "epoch": 2.747042766151046, "grad_norm": 0.9407996913738015, "learning_rate": 2.1142086034625744e-06, "loss": 0.0232, "step": 6038 }, { "epoch": 2.7474977252047315, "grad_norm": 1.5066607894443202, "learning_rate": 2.113502531666219e-06, "loss": 0.0294, "step": 6039 }, { "epoch": 2.7479526842584168, "grad_norm": 1.3211882727405444, "learning_rate": 2.1127964914526917e-06, "loss": 0.0387, "step": 6040 }, { "epoch": 2.7484076433121016, "grad_norm": 1.1525231709197636, "learning_rate": 2.1120904828796857e-06, "loss": 0.0496, "step": 6041 }, { "epoch": 2.748862602365787, "grad_norm": 1.201307545534009, "learning_rate": 2.1113845060048943e-06, "loss": 0.0541, "step": 6042 }, { "epoch": 2.7493175614194723, "grad_norm": 0.8849933467700286, "learning_rate": 2.110678560886006e-06, "loss": 0.0247, "step": 6043 }, { "epoch": 2.7497725204731576, "grad_norm": 1.3063365766432882, "learning_rate": 2.1099726475807077e-06, "loss": 0.0465, "step": 6044 }, { "epoch": 2.7502274795268424, "grad_norm": 1.2908511498556678, "learning_rate": 2.109266766146682e-06, "loss": 0.0422, "step": 6045 }, { "epoch": 2.7506824385805277, "grad_norm": 1.1326058356267628, "learning_rate": 2.1085609166416128e-06, "loss": 0.0699, "step": 6046 }, { "epoch": 2.751137397634213, "grad_norm": 1.31325427602823, "learning_rate": 2.1078550991231777e-06, "loss": 0.0542, "step": 6047 }, { "epoch": 2.7515923566878984, "grad_norm": 1.6452287771605802, "learning_rate": 2.1071493136490527e-06, "loss": 0.0351, "step": 6048 }, { "epoch": 2.7520473157415832, "grad_norm": 1.6059309658074055, "learning_rate": 2.106443560276912e-06, "loss": 0.0419, "step": 6049 }, { "epoch": 2.7525022747952685, "grad_norm": 1.9243883564755209, "learning_rate": 2.1057378390644263e-06, "loss": 0.0676, "step": 6050 }, { "epoch": 2.752957233848954, "grad_norm": 1.2274873870913423, "learning_rate": 2.1050321500692637e-06, "loss": 0.0446, "step": 6051 }, { "epoch": 2.7534121929026387, "grad_norm": 1.5131811341286128, "learning_rate": 2.1043264933490896e-06, "loss": 0.0592, "step": 6052 }, { "epoch": 2.753867151956324, "grad_norm": 1.3208730564429154, "learning_rate": 2.1036208689615683e-06, "loss": 0.0285, "step": 6053 }, { "epoch": 2.7543221110100093, "grad_norm": 1.0918145968935975, "learning_rate": 2.1029152769643595e-06, "loss": 0.0166, "step": 6054 }, { "epoch": 2.754777070063694, "grad_norm": 1.1062653655554564, "learning_rate": 2.102209717415121e-06, "loss": 0.0305, "step": 6055 }, { "epoch": 2.7552320291173795, "grad_norm": 1.4396771563010529, "learning_rate": 2.1015041903715085e-06, "loss": 0.0657, "step": 6056 }, { "epoch": 2.755686988171065, "grad_norm": 1.4573060239378821, "learning_rate": 2.100798695891173e-06, "loss": 0.0589, "step": 6057 }, { "epoch": 2.7561419472247497, "grad_norm": 2.4396354740127393, "learning_rate": 2.100093234031766e-06, "loss": 0.0412, "step": 6058 }, { "epoch": 2.756596906278435, "grad_norm": 1.3619994402313926, "learning_rate": 2.099387804850933e-06, "loss": 0.0405, "step": 6059 }, { "epoch": 2.7570518653321203, "grad_norm": 1.5199604056611247, "learning_rate": 2.09868240840632e-06, "loss": 0.0348, "step": 6060 }, { "epoch": 2.757506824385805, "grad_norm": 1.5130047923258474, "learning_rate": 2.097977044755569e-06, "loss": 0.0586, "step": 6061 }, { "epoch": 2.7579617834394905, "grad_norm": 1.5425234571508364, "learning_rate": 2.0972717139563176e-06, "loss": 0.0524, "step": 6062 }, { "epoch": 2.758416742493176, "grad_norm": 1.5723956045087994, "learning_rate": 2.0965664160662038e-06, "loss": 0.0521, "step": 6063 }, { "epoch": 2.7588717015468607, "grad_norm": 1.8578453609400156, "learning_rate": 2.0958611511428596e-06, "loss": 0.0599, "step": 6064 }, { "epoch": 2.759326660600546, "grad_norm": 1.078722649063515, "learning_rate": 2.0951559192439177e-06, "loss": 0.0227, "step": 6065 }, { "epoch": 2.7597816196542313, "grad_norm": 1.7438477470858293, "learning_rate": 2.094450720427005e-06, "loss": 0.0446, "step": 6066 }, { "epoch": 2.760236578707916, "grad_norm": 1.2354746118132574, "learning_rate": 2.093745554749748e-06, "loss": 0.0509, "step": 6067 }, { "epoch": 2.7606915377616015, "grad_norm": 1.8089091873909036, "learning_rate": 2.0930404222697707e-06, "loss": 0.0294, "step": 6068 }, { "epoch": 2.761146496815287, "grad_norm": 1.088180649213693, "learning_rate": 2.0923353230446918e-06, "loss": 0.0538, "step": 6069 }, { "epoch": 2.7616014558689717, "grad_norm": 1.354337311298728, "learning_rate": 2.0916302571321295e-06, "loss": 0.0349, "step": 6070 }, { "epoch": 2.762056414922657, "grad_norm": 1.6043121120008572, "learning_rate": 2.0909252245896986e-06, "loss": 0.0472, "step": 6071 }, { "epoch": 2.7625113739763423, "grad_norm": 1.2199934225165563, "learning_rate": 2.0902202254750105e-06, "loss": 0.0386, "step": 6072 }, { "epoch": 2.762966333030027, "grad_norm": 1.4839304293779376, "learning_rate": 2.0895152598456744e-06, "loss": 0.0432, "step": 6073 }, { "epoch": 2.7634212920837125, "grad_norm": 1.370420199040153, "learning_rate": 2.0888103277592982e-06, "loss": 0.0287, "step": 6074 }, { "epoch": 2.7638762511373978, "grad_norm": 1.2532896455563338, "learning_rate": 2.088105429273485e-06, "loss": 0.0308, "step": 6075 }, { "epoch": 2.7643312101910826, "grad_norm": 1.6938843381263569, "learning_rate": 2.0874005644458367e-06, "loss": 0.0426, "step": 6076 }, { "epoch": 2.764786169244768, "grad_norm": 1.6385136263212323, "learning_rate": 2.0866957333339503e-06, "loss": 0.0602, "step": 6077 }, { "epoch": 2.7652411282984533, "grad_norm": 1.3329071035674496, "learning_rate": 2.0859909359954224e-06, "loss": 0.0623, "step": 6078 }, { "epoch": 2.765696087352138, "grad_norm": 1.6543258003753887, "learning_rate": 2.0852861724878452e-06, "loss": 0.0518, "step": 6079 }, { "epoch": 2.7661510464058234, "grad_norm": 1.1208084701229104, "learning_rate": 2.0845814428688086e-06, "loss": 0.0543, "step": 6080 }, { "epoch": 2.7666060054595087, "grad_norm": 1.293238383854207, "learning_rate": 2.0838767471959015e-06, "loss": 0.0395, "step": 6081 }, { "epoch": 2.7670609645131936, "grad_norm": 2.2839142638868872, "learning_rate": 2.083172085526707e-06, "loss": 0.0818, "step": 6082 }, { "epoch": 2.767515923566879, "grad_norm": 1.6424150183675432, "learning_rate": 2.082467457918808e-06, "loss": 0.0467, "step": 6083 }, { "epoch": 2.7679708826205642, "grad_norm": 1.1400843408257069, "learning_rate": 2.081762864429782e-06, "loss": 0.0302, "step": 6084 }, { "epoch": 2.768425841674249, "grad_norm": 0.996375400169196, "learning_rate": 2.0810583051172066e-06, "loss": 0.0432, "step": 6085 }, { "epoch": 2.7688808007279344, "grad_norm": 0.9747040105519118, "learning_rate": 2.080353780038654e-06, "loss": 0.022, "step": 6086 }, { "epoch": 2.7693357597816197, "grad_norm": 1.1327537607972527, "learning_rate": 2.079649289251695e-06, "loss": 0.0471, "step": 6087 }, { "epoch": 2.7697907188353046, "grad_norm": 0.8712420692322036, "learning_rate": 2.0789448328138984e-06, "loss": 0.0172, "step": 6088 }, { "epoch": 2.77024567788899, "grad_norm": 1.031558457626875, "learning_rate": 2.0782404107828284e-06, "loss": 0.0329, "step": 6089 }, { "epoch": 2.770700636942675, "grad_norm": 1.0647009247919499, "learning_rate": 2.077536023216048e-06, "loss": 0.0778, "step": 6090 }, { "epoch": 2.77115559599636, "grad_norm": 0.991825837008359, "learning_rate": 2.076831670171115e-06, "loss": 0.0279, "step": 6091 }, { "epoch": 2.7716105550500454, "grad_norm": 1.282805362393132, "learning_rate": 2.076127351705588e-06, "loss": 0.0427, "step": 6092 }, { "epoch": 2.7720655141037307, "grad_norm": 1.4233731007645227, "learning_rate": 2.0754230678770188e-06, "loss": 0.0592, "step": 6093 }, { "epoch": 2.7725204731574156, "grad_norm": 0.9817133373149859, "learning_rate": 2.0747188187429585e-06, "loss": 0.0175, "step": 6094 }, { "epoch": 2.772975432211101, "grad_norm": 1.5479223304048602, "learning_rate": 2.074014604360957e-06, "loss": 0.0352, "step": 6095 }, { "epoch": 2.773430391264786, "grad_norm": 1.0342782151984329, "learning_rate": 2.073310424788558e-06, "loss": 0.0336, "step": 6096 }, { "epoch": 2.7738853503184715, "grad_norm": 0.9700949868129407, "learning_rate": 2.072606280083304e-06, "loss": 0.0234, "step": 6097 }, { "epoch": 2.7743403093721564, "grad_norm": 1.200361569403955, "learning_rate": 2.071902170302735e-06, "loss": 0.022, "step": 6098 }, { "epoch": 2.7747952684258417, "grad_norm": 1.2869499689454122, "learning_rate": 2.0711980955043875e-06, "loss": 0.0281, "step": 6099 }, { "epoch": 2.775250227479527, "grad_norm": 1.2435385143520776, "learning_rate": 2.0704940557457948e-06, "loss": 0.0797, "step": 6100 }, { "epoch": 2.775705186533212, "grad_norm": 1.279290570841041, "learning_rate": 2.0697900510844874e-06, "loss": 0.0324, "step": 6101 }, { "epoch": 2.776160145586897, "grad_norm": 1.5238008344517995, "learning_rate": 2.0690860815779954e-06, "loss": 0.0966, "step": 6102 }, { "epoch": 2.7766151046405825, "grad_norm": 1.047159227561751, "learning_rate": 2.068382147283842e-06, "loss": 0.0329, "step": 6103 }, { "epoch": 2.777070063694268, "grad_norm": 1.003024239377609, "learning_rate": 2.0676782482595516e-06, "loss": 0.0427, "step": 6104 }, { "epoch": 2.7775250227479527, "grad_norm": 1.7961382478189862, "learning_rate": 2.0669743845626417e-06, "loss": 0.05, "step": 6105 }, { "epoch": 2.777979981801638, "grad_norm": 1.1696303714684526, "learning_rate": 2.0662705562506298e-06, "loss": 0.0506, "step": 6106 }, { "epoch": 2.7784349408553233, "grad_norm": 1.2930375634194877, "learning_rate": 2.0655667633810293e-06, "loss": 0.049, "step": 6107 }, { "epoch": 2.778889899909008, "grad_norm": 1.0837839251145458, "learning_rate": 2.0648630060113496e-06, "loss": 0.0175, "step": 6108 }, { "epoch": 2.7793448589626935, "grad_norm": 1.6112720426331102, "learning_rate": 2.0641592841991016e-06, "loss": 0.0271, "step": 6109 }, { "epoch": 2.7797998180163788, "grad_norm": 1.6005048972486284, "learning_rate": 2.0634555980017884e-06, "loss": 0.0249, "step": 6110 }, { "epoch": 2.7802547770700636, "grad_norm": 1.9565583850339532, "learning_rate": 2.062751947476912e-06, "loss": 0.0682, "step": 6111 }, { "epoch": 2.780709736123749, "grad_norm": 1.1665873694460687, "learning_rate": 2.0620483326819724e-06, "loss": 0.0529, "step": 6112 }, { "epoch": 2.7811646951774343, "grad_norm": 1.233042908489579, "learning_rate": 2.0613447536744645e-06, "loss": 0.048, "step": 6113 }, { "epoch": 2.781619654231119, "grad_norm": 1.6708971174905043, "learning_rate": 2.060641210511883e-06, "loss": 0.0411, "step": 6114 }, { "epoch": 2.7820746132848044, "grad_norm": 1.438276391533111, "learning_rate": 2.059937703251717e-06, "loss": 0.037, "step": 6115 }, { "epoch": 2.7825295723384897, "grad_norm": 1.2415404479678898, "learning_rate": 2.059234231951455e-06, "loss": 0.0639, "step": 6116 }, { "epoch": 2.7829845313921746, "grad_norm": 0.8944344365629783, "learning_rate": 2.0585307966685815e-06, "loss": 0.0304, "step": 6117 }, { "epoch": 2.78343949044586, "grad_norm": 1.1640082549013726, "learning_rate": 2.057827397460577e-06, "loss": 0.0247, "step": 6118 }, { "epoch": 2.7838944494995452, "grad_norm": 1.168440536123649, "learning_rate": 2.0571240343849213e-06, "loss": 0.0599, "step": 6119 }, { "epoch": 2.78434940855323, "grad_norm": 1.2130211786439764, "learning_rate": 2.056420707499089e-06, "loss": 0.0462, "step": 6120 }, { "epoch": 2.7848043676069154, "grad_norm": 1.4172488189811647, "learning_rate": 2.055717416860554e-06, "loss": 0.0804, "step": 6121 }, { "epoch": 2.7852593266606007, "grad_norm": 1.4374363921089373, "learning_rate": 2.055014162526784e-06, "loss": 0.0484, "step": 6122 }, { "epoch": 2.7857142857142856, "grad_norm": 2.2413063273227025, "learning_rate": 2.0543109445552477e-06, "loss": 0.0438, "step": 6123 }, { "epoch": 2.786169244767971, "grad_norm": 1.6002515748537596, "learning_rate": 2.053607763003409e-06, "loss": 0.0618, "step": 6124 }, { "epoch": 2.786624203821656, "grad_norm": 1.5322331966643439, "learning_rate": 2.052904617928727e-06, "loss": 0.0476, "step": 6125 }, { "epoch": 2.787079162875341, "grad_norm": 1.2719876790103073, "learning_rate": 2.0522015093886614e-06, "loss": 0.0241, "step": 6126 }, { "epoch": 2.7875341219290264, "grad_norm": 1.5364117342679728, "learning_rate": 2.051498437440666e-06, "loss": 0.0391, "step": 6127 }, { "epoch": 2.7879890809827117, "grad_norm": 1.3353010042920748, "learning_rate": 2.050795402142193e-06, "loss": 0.0651, "step": 6128 }, { "epoch": 2.7884440400363966, "grad_norm": 1.5271248189891602, "learning_rate": 2.05009240355069e-06, "loss": 0.039, "step": 6129 }, { "epoch": 2.788898999090082, "grad_norm": 1.094905384494984, "learning_rate": 2.049389441723605e-06, "loss": 0.0446, "step": 6130 }, { "epoch": 2.789353958143767, "grad_norm": 1.3921639758759843, "learning_rate": 2.04868651671838e-06, "loss": 0.0401, "step": 6131 }, { "epoch": 2.789808917197452, "grad_norm": 0.9160328840173109, "learning_rate": 2.0479836285924543e-06, "loss": 0.0426, "step": 6132 }, { "epoch": 2.7902638762511374, "grad_norm": 1.4953740430677789, "learning_rate": 2.047280777403266e-06, "loss": 0.0544, "step": 6133 }, { "epoch": 2.7907188353048227, "grad_norm": 0.9647953513061686, "learning_rate": 2.046577963208247e-06, "loss": 0.0144, "step": 6134 }, { "epoch": 2.7911737943585075, "grad_norm": 1.3732351489967298, "learning_rate": 2.0458751860648304e-06, "loss": 0.0394, "step": 6135 }, { "epoch": 2.791628753412193, "grad_norm": 1.5264469528596059, "learning_rate": 2.0451724460304416e-06, "loss": 0.045, "step": 6136 }, { "epoch": 2.792083712465878, "grad_norm": 1.1221248963672408, "learning_rate": 2.0444697431625068e-06, "loss": 0.0265, "step": 6137 }, { "epoch": 2.792538671519563, "grad_norm": 1.455864834258983, "learning_rate": 2.043767077518448e-06, "loss": 0.0274, "step": 6138 }, { "epoch": 2.7929936305732483, "grad_norm": 1.7585677595662816, "learning_rate": 2.0430644491556826e-06, "loss": 0.0406, "step": 6139 }, { "epoch": 2.7934485896269337, "grad_norm": 1.5116444274446916, "learning_rate": 2.0423618581316277e-06, "loss": 0.0576, "step": 6140 }, { "epoch": 2.7939035486806185, "grad_norm": 1.0056858325394578, "learning_rate": 2.041659304503695e-06, "loss": 0.0305, "step": 6141 }, { "epoch": 2.794358507734304, "grad_norm": 1.3861135272886598, "learning_rate": 2.0409567883292938e-06, "loss": 0.0487, "step": 6142 }, { "epoch": 2.794813466787989, "grad_norm": 0.9883492391207411, "learning_rate": 2.04025430966583e-06, "loss": 0.0365, "step": 6143 }, { "epoch": 2.795268425841674, "grad_norm": 1.2627571782932614, "learning_rate": 2.0395518685707086e-06, "loss": 0.0457, "step": 6144 }, { "epoch": 2.7957233848953593, "grad_norm": 1.5340714599980936, "learning_rate": 2.0388494651013293e-06, "loss": 0.034, "step": 6145 }, { "epoch": 2.7961783439490446, "grad_norm": 1.4569398974412813, "learning_rate": 2.0381470993150894e-06, "loss": 0.0456, "step": 6146 }, { "epoch": 2.7966333030027295, "grad_norm": 1.5023185025622463, "learning_rate": 2.0374447712693824e-06, "loss": 0.0654, "step": 6147 }, { "epoch": 2.797088262056415, "grad_norm": 1.3533231432226556, "learning_rate": 2.0367424810216004e-06, "loss": 0.0523, "step": 6148 }, { "epoch": 2.7975432211101, "grad_norm": 1.2609605582769747, "learning_rate": 2.0360402286291302e-06, "loss": 0.04, "step": 6149 }, { "epoch": 2.797998180163785, "grad_norm": 1.344561040475176, "learning_rate": 2.0353380141493563e-06, "loss": 0.042, "step": 6150 }, { "epoch": 2.7984531392174703, "grad_norm": 1.0944207039414982, "learning_rate": 2.034635837639663e-06, "loss": 0.0289, "step": 6151 }, { "epoch": 2.7989080982711556, "grad_norm": 1.1636096953827157, "learning_rate": 2.0339336991574267e-06, "loss": 0.0925, "step": 6152 }, { "epoch": 2.799363057324841, "grad_norm": 1.595478189383148, "learning_rate": 2.033231598760025e-06, "loss": 0.0465, "step": 6153 }, { "epoch": 2.799818016378526, "grad_norm": 1.1998764226934406, "learning_rate": 2.032529536504828e-06, "loss": 0.0189, "step": 6154 }, { "epoch": 2.800272975432211, "grad_norm": 1.2723759554646534, "learning_rate": 2.0318275124492066e-06, "loss": 0.0277, "step": 6155 }, { "epoch": 2.8007279344858964, "grad_norm": 1.7106405288003104, "learning_rate": 2.0311255266505264e-06, "loss": 0.056, "step": 6156 }, { "epoch": 2.8011828935395813, "grad_norm": 1.2636410551449602, "learning_rate": 2.03042357916615e-06, "loss": 0.0443, "step": 6157 }, { "epoch": 2.8016378525932666, "grad_norm": 2.1527595361595115, "learning_rate": 2.0297216700534396e-06, "loss": 0.0749, "step": 6158 }, { "epoch": 2.802092811646952, "grad_norm": 1.4609131693508923, "learning_rate": 2.0290197993697493e-06, "loss": 0.0437, "step": 6159 }, { "epoch": 2.802547770700637, "grad_norm": 1.53133562065683, "learning_rate": 2.028317967172435e-06, "loss": 0.0526, "step": 6160 }, { "epoch": 2.803002729754322, "grad_norm": 1.426493303874008, "learning_rate": 2.0276161735188458e-06, "loss": 0.0404, "step": 6161 }, { "epoch": 2.8034576888080074, "grad_norm": 1.5235019439971766, "learning_rate": 2.02691441846633e-06, "loss": 0.0661, "step": 6162 }, { "epoch": 2.8039126478616927, "grad_norm": 1.0530549824808195, "learning_rate": 2.0262127020722315e-06, "loss": 0.0358, "step": 6163 }, { "epoch": 2.8043676069153776, "grad_norm": 1.363546168495423, "learning_rate": 2.0255110243938903e-06, "loss": 0.0526, "step": 6164 }, { "epoch": 2.804822565969063, "grad_norm": 1.9401483751193371, "learning_rate": 2.024809385488647e-06, "loss": 0.0741, "step": 6165 }, { "epoch": 2.805277525022748, "grad_norm": 0.9483452064619643, "learning_rate": 2.024107785413834e-06, "loss": 0.0196, "step": 6166 }, { "epoch": 2.805732484076433, "grad_norm": 1.227092179844755, "learning_rate": 2.023406224226784e-06, "loss": 0.023, "step": 6167 }, { "epoch": 2.8061874431301184, "grad_norm": 2.0117103960222957, "learning_rate": 2.0227047019848246e-06, "loss": 0.0513, "step": 6168 }, { "epoch": 2.8066424021838037, "grad_norm": 1.139324494952597, "learning_rate": 2.022003218745282e-06, "loss": 0.0451, "step": 6169 }, { "epoch": 2.8070973612374885, "grad_norm": 1.5044738092276697, "learning_rate": 2.0213017745654774e-06, "loss": 0.066, "step": 6170 }, { "epoch": 2.807552320291174, "grad_norm": 0.6056735730433279, "learning_rate": 2.0206003695027294e-06, "loss": 0.0112, "step": 6171 }, { "epoch": 2.808007279344859, "grad_norm": 1.248841826660653, "learning_rate": 2.0198990036143553e-06, "loss": 0.0256, "step": 6172 }, { "epoch": 2.808462238398544, "grad_norm": 1.2866830273445369, "learning_rate": 2.019197676957666e-06, "loss": 0.037, "step": 6173 }, { "epoch": 2.8089171974522293, "grad_norm": 1.2340801992407044, "learning_rate": 2.018496389589972e-06, "loss": 0.0401, "step": 6174 }, { "epoch": 2.8093721565059147, "grad_norm": 1.426223719146641, "learning_rate": 2.0177951415685777e-06, "loss": 0.0409, "step": 6175 }, { "epoch": 2.8098271155595995, "grad_norm": 1.3638482777305185, "learning_rate": 2.017093932950788e-06, "loss": 0.0378, "step": 6176 }, { "epoch": 2.810282074613285, "grad_norm": 1.343341377990012, "learning_rate": 2.0163927637939002e-06, "loss": 0.055, "step": 6177 }, { "epoch": 2.81073703366697, "grad_norm": 1.0076410242516645, "learning_rate": 2.015691634155211e-06, "loss": 0.0373, "step": 6178 }, { "epoch": 2.811191992720655, "grad_norm": 1.2582128008209512, "learning_rate": 2.0149905440920155e-06, "loss": 0.0238, "step": 6179 }, { "epoch": 2.8116469517743403, "grad_norm": 1.542476682770642, "learning_rate": 2.014289493661603e-06, "loss": 0.0311, "step": 6180 }, { "epoch": 2.8121019108280256, "grad_norm": 0.7510584641751509, "learning_rate": 2.013588482921259e-06, "loss": 0.0239, "step": 6181 }, { "epoch": 2.8125568698817105, "grad_norm": 2.1310844139258953, "learning_rate": 2.0128875119282676e-06, "loss": 0.0204, "step": 6182 }, { "epoch": 2.813011828935396, "grad_norm": 0.9165558663078011, "learning_rate": 2.0121865807399087e-06, "loss": 0.025, "step": 6183 }, { "epoch": 2.813466787989081, "grad_norm": 0.9200795314929799, "learning_rate": 2.01148568941346e-06, "loss": 0.0426, "step": 6184 }, { "epoch": 2.813921747042766, "grad_norm": 1.2407036117047634, "learning_rate": 2.0107848380061932e-06, "loss": 0.041, "step": 6185 }, { "epoch": 2.8143767060964513, "grad_norm": 1.7717150080089104, "learning_rate": 2.0100840265753813e-06, "loss": 0.0375, "step": 6186 }, { "epoch": 2.8148316651501366, "grad_norm": 1.828642040939532, "learning_rate": 2.009383255178291e-06, "loss": 0.0844, "step": 6187 }, { "epoch": 2.8152866242038215, "grad_norm": 1.6571190304398047, "learning_rate": 2.008682523872184e-06, "loss": 0.0812, "step": 6188 }, { "epoch": 2.815741583257507, "grad_norm": 0.9791085254951238, "learning_rate": 2.0079818327143235e-06, "loss": 0.0259, "step": 6189 }, { "epoch": 2.816196542311192, "grad_norm": 1.0515141473308895, "learning_rate": 2.0072811817619655e-06, "loss": 0.0285, "step": 6190 }, { "epoch": 2.816651501364877, "grad_norm": 1.457684018883722, "learning_rate": 2.0065805710723645e-06, "loss": 0.0339, "step": 6191 }, { "epoch": 2.8171064604185623, "grad_norm": 1.653151064812303, "learning_rate": 2.0058800007027697e-06, "loss": 0.051, "step": 6192 }, { "epoch": 2.8175614194722476, "grad_norm": 1.4883863216946513, "learning_rate": 2.0051794707104304e-06, "loss": 0.0383, "step": 6193 }, { "epoch": 2.8180163785259325, "grad_norm": 1.2194133600607318, "learning_rate": 2.0044789811525904e-06, "loss": 0.0296, "step": 6194 }, { "epoch": 2.8184713375796178, "grad_norm": 1.2692100119423522, "learning_rate": 2.0037785320864904e-06, "loss": 0.0697, "step": 6195 }, { "epoch": 2.818926296633303, "grad_norm": 1.1672079140816054, "learning_rate": 2.0030781235693682e-06, "loss": 0.0265, "step": 6196 }, { "epoch": 2.819381255686988, "grad_norm": 1.5017283299144701, "learning_rate": 2.0023777556584567e-06, "loss": 0.0534, "step": 6197 }, { "epoch": 2.8198362147406733, "grad_norm": 1.3142327428456237, "learning_rate": 2.001677428410989e-06, "loss": 0.0414, "step": 6198 }, { "epoch": 2.8202911737943586, "grad_norm": 1.1454995701593005, "learning_rate": 2.0009771418841897e-06, "loss": 0.0587, "step": 6199 }, { "epoch": 2.8207461328480434, "grad_norm": 1.1040974344283294, "learning_rate": 2.0002768961352858e-06, "loss": 0.0575, "step": 6200 }, { "epoch": 2.8212010919017287, "grad_norm": 1.1708289977497515, "learning_rate": 1.9995766912214976e-06, "loss": 0.0365, "step": 6201 }, { "epoch": 2.821656050955414, "grad_norm": 1.2958421744162556, "learning_rate": 1.9988765272000414e-06, "loss": 0.0346, "step": 6202 }, { "epoch": 2.822111010009099, "grad_norm": 1.1210638143732485, "learning_rate": 1.9981764041281334e-06, "loss": 0.0413, "step": 6203 }, { "epoch": 2.8225659690627842, "grad_norm": 1.8888151706967589, "learning_rate": 1.9974763220629826e-06, "loss": 0.0624, "step": 6204 }, { "epoch": 2.8230209281164695, "grad_norm": 1.3257597095423412, "learning_rate": 1.996776281061798e-06, "loss": 0.0323, "step": 6205 }, { "epoch": 2.823475887170155, "grad_norm": 0.9031111903896074, "learning_rate": 1.9960762811817822e-06, "loss": 0.0196, "step": 6206 }, { "epoch": 2.8239308462238397, "grad_norm": 1.5776830596064464, "learning_rate": 1.9953763224801375e-06, "loss": 0.0498, "step": 6207 }, { "epoch": 2.824385805277525, "grad_norm": 0.8012515087269647, "learning_rate": 1.9946764050140616e-06, "loss": 0.0125, "step": 6208 }, { "epoch": 2.8248407643312103, "grad_norm": 1.2751197190562968, "learning_rate": 1.993976528840747e-06, "loss": 0.0481, "step": 6209 }, { "epoch": 2.825295723384895, "grad_norm": 1.134932985762207, "learning_rate": 1.993276694017386e-06, "loss": 0.0222, "step": 6210 }, { "epoch": 2.8257506824385805, "grad_norm": 1.2279674835022985, "learning_rate": 1.9925769006011645e-06, "loss": 0.0402, "step": 6211 }, { "epoch": 2.826205641492266, "grad_norm": 1.0501112777500208, "learning_rate": 1.991877148649268e-06, "loss": 0.0215, "step": 6212 }, { "epoch": 2.826660600545951, "grad_norm": 1.1672810236971256, "learning_rate": 1.991177438218875e-06, "loss": 0.0309, "step": 6213 }, { "epoch": 2.827115559599636, "grad_norm": 1.4522919102653145, "learning_rate": 1.9904777693671646e-06, "loss": 0.0529, "step": 6214 }, { "epoch": 2.8275705186533213, "grad_norm": 1.402150716607109, "learning_rate": 1.9897781421513103e-06, "loss": 0.0363, "step": 6215 }, { "epoch": 2.8280254777070066, "grad_norm": 2.059658264879804, "learning_rate": 1.9890785566284822e-06, "loss": 0.033, "step": 6216 }, { "epoch": 2.8284804367606915, "grad_norm": 0.7724948624922748, "learning_rate": 1.9883790128558463e-06, "loss": 0.02, "step": 6217 }, { "epoch": 2.828935395814377, "grad_norm": 1.1724004296318575, "learning_rate": 1.987679510890568e-06, "loss": 0.0358, "step": 6218 }, { "epoch": 2.829390354868062, "grad_norm": 2.5646898406357654, "learning_rate": 1.9869800507898053e-06, "loss": 0.0699, "step": 6219 }, { "epoch": 2.829845313921747, "grad_norm": 1.0578315215414993, "learning_rate": 1.9862806326107162e-06, "loss": 0.0321, "step": 6220 }, { "epoch": 2.8303002729754323, "grad_norm": 1.4007717655114462, "learning_rate": 1.9855812564104547e-06, "loss": 0.0381, "step": 6221 }, { "epoch": 2.8307552320291176, "grad_norm": 0.9634985429357497, "learning_rate": 1.984881922246169e-06, "loss": 0.0403, "step": 6222 }, { "epoch": 2.8312101910828025, "grad_norm": 1.4872079678815902, "learning_rate": 1.984182630175007e-06, "loss": 0.0466, "step": 6223 }, { "epoch": 2.831665150136488, "grad_norm": 1.1897131417699658, "learning_rate": 1.9834833802541107e-06, "loss": 0.0358, "step": 6224 }, { "epoch": 2.832120109190173, "grad_norm": 1.952011224968798, "learning_rate": 1.982784172540621e-06, "loss": 0.0433, "step": 6225 }, { "epoch": 2.832575068243858, "grad_norm": 1.054563644306939, "learning_rate": 1.982085007091672e-06, "loss": 0.0561, "step": 6226 }, { "epoch": 2.8330300272975433, "grad_norm": 1.4883216622237627, "learning_rate": 1.9813858839643965e-06, "loss": 0.0413, "step": 6227 }, { "epoch": 2.8334849863512286, "grad_norm": 0.9924472336746436, "learning_rate": 1.980686803215926e-06, "loss": 0.0395, "step": 6228 }, { "epoch": 2.8339399454049135, "grad_norm": 1.0289421630312963, "learning_rate": 1.9799877649033837e-06, "loss": 0.0297, "step": 6229 }, { "epoch": 2.8343949044585988, "grad_norm": 1.7411353453912826, "learning_rate": 1.979288769083894e-06, "loss": 0.0508, "step": 6230 }, { "epoch": 2.834849863512284, "grad_norm": 1.797384498982944, "learning_rate": 1.978589815814574e-06, "loss": 0.0428, "step": 6231 }, { "epoch": 2.835304822565969, "grad_norm": 2.4495918292286034, "learning_rate": 1.9778909051525397e-06, "loss": 0.0687, "step": 6232 }, { "epoch": 2.8357597816196543, "grad_norm": 2.526932329464511, "learning_rate": 1.9771920371549025e-06, "loss": 0.037, "step": 6233 }, { "epoch": 2.8362147406733396, "grad_norm": 1.3018139125236101, "learning_rate": 1.9764932118787705e-06, "loss": 0.0411, "step": 6234 }, { "epoch": 2.8366696997270244, "grad_norm": 0.9853917591915696, "learning_rate": 1.97579442938125e-06, "loss": 0.0202, "step": 6235 }, { "epoch": 2.8371246587807097, "grad_norm": 1.282119012665977, "learning_rate": 1.9750956897194413e-06, "loss": 0.0652, "step": 6236 }, { "epoch": 2.837579617834395, "grad_norm": 0.7470264722298448, "learning_rate": 1.9743969929504427e-06, "loss": 0.0266, "step": 6237 }, { "epoch": 2.83803457688808, "grad_norm": 1.157835809442877, "learning_rate": 1.973698339131348e-06, "loss": 0.0533, "step": 6238 }, { "epoch": 2.8384895359417652, "grad_norm": 1.3713191985112188, "learning_rate": 1.9729997283192483e-06, "loss": 0.0292, "step": 6239 }, { "epoch": 2.8389444949954505, "grad_norm": 1.164274584435465, "learning_rate": 1.9723011605712307e-06, "loss": 0.0657, "step": 6240 }, { "epoch": 2.8393994540491354, "grad_norm": 2.7017857790126993, "learning_rate": 1.9716026359443784e-06, "loss": 0.086, "step": 6241 }, { "epoch": 2.8398544131028207, "grad_norm": 1.1005402525224681, "learning_rate": 1.970904154495774e-06, "loss": 0.0438, "step": 6242 }, { "epoch": 2.840309372156506, "grad_norm": 1.15804616045248, "learning_rate": 1.9702057162824916e-06, "loss": 0.0273, "step": 6243 }, { "epoch": 2.840764331210191, "grad_norm": 1.436143938755856, "learning_rate": 1.9695073213616066e-06, "loss": 0.0274, "step": 6244 }, { "epoch": 2.841219290263876, "grad_norm": 1.1343221809194568, "learning_rate": 1.968808969790187e-06, "loss": 0.0247, "step": 6245 }, { "epoch": 2.8416742493175615, "grad_norm": 1.348358961361138, "learning_rate": 1.9681106616252995e-06, "loss": 0.0389, "step": 6246 }, { "epoch": 2.8421292083712464, "grad_norm": 1.1032491461611846, "learning_rate": 1.9674123969240067e-06, "loss": 0.0383, "step": 6247 }, { "epoch": 2.8425841674249317, "grad_norm": 1.2666439201615758, "learning_rate": 1.9667141757433667e-06, "loss": 0.0324, "step": 6248 }, { "epoch": 2.843039126478617, "grad_norm": 1.516472864890303, "learning_rate": 1.9660159981404373e-06, "loss": 0.0253, "step": 6249 }, { "epoch": 2.843494085532302, "grad_norm": 1.092557533529047, "learning_rate": 1.9653178641722688e-06, "loss": 0.0542, "step": 6250 }, { "epoch": 2.843949044585987, "grad_norm": 1.2427282733558598, "learning_rate": 1.9646197738959104e-06, "loss": 0.0182, "step": 6251 }, { "epoch": 2.8444040036396725, "grad_norm": 1.3759959670430653, "learning_rate": 1.963921727368406e-06, "loss": 0.0441, "step": 6252 }, { "epoch": 2.8448589626933574, "grad_norm": 1.4093200587907806, "learning_rate": 1.9632237246467967e-06, "loss": 0.0866, "step": 6253 }, { "epoch": 2.8453139217470427, "grad_norm": 1.1790248825408893, "learning_rate": 1.962525765788121e-06, "loss": 0.0514, "step": 6254 }, { "epoch": 2.845768880800728, "grad_norm": 1.4536316296524603, "learning_rate": 1.9618278508494114e-06, "loss": 0.0452, "step": 6255 }, { "epoch": 2.846223839854413, "grad_norm": 1.2095141916738423, "learning_rate": 1.9611299798877004e-06, "loss": 0.019, "step": 6256 }, { "epoch": 2.846678798908098, "grad_norm": 1.4317651438364072, "learning_rate": 1.960432152960014e-06, "loss": 0.0895, "step": 6257 }, { "epoch": 2.8471337579617835, "grad_norm": 1.2775498914719436, "learning_rate": 1.9597343701233754e-06, "loss": 0.0415, "step": 6258 }, { "epoch": 2.8475887170154683, "grad_norm": 0.9932914576420461, "learning_rate": 1.9590366314348043e-06, "loss": 0.0144, "step": 6259 }, { "epoch": 2.8480436760691537, "grad_norm": 1.1463923439864574, "learning_rate": 1.9583389369513164e-06, "loss": 0.0432, "step": 6260 }, { "epoch": 2.848498635122839, "grad_norm": 1.2537069882323892, "learning_rate": 1.957641286729925e-06, "loss": 0.0408, "step": 6261 }, { "epoch": 2.8489535941765243, "grad_norm": 1.1264228008804082, "learning_rate": 1.956943680827637e-06, "loss": 0.0215, "step": 6262 }, { "epoch": 2.849408553230209, "grad_norm": 0.9831732778384115, "learning_rate": 1.95624611930146e-06, "loss": 0.0162, "step": 6263 }, { "epoch": 2.8498635122838945, "grad_norm": 1.2895863763940243, "learning_rate": 1.9555486022083947e-06, "loss": 0.053, "step": 6264 }, { "epoch": 2.8503184713375798, "grad_norm": 1.5102288381107094, "learning_rate": 1.9548511296054386e-06, "loss": 0.0333, "step": 6265 }, { "epoch": 2.8507734303912646, "grad_norm": 1.0860522593630832, "learning_rate": 1.9541537015495867e-06, "loss": 0.0552, "step": 6266 }, { "epoch": 2.85122838944495, "grad_norm": 0.9786907315397415, "learning_rate": 1.953456318097829e-06, "loss": 0.0189, "step": 6267 }, { "epoch": 2.8516833484986353, "grad_norm": 1.5991348708000075, "learning_rate": 1.952758979307153e-06, "loss": 0.0577, "step": 6268 }, { "epoch": 2.8521383075523206, "grad_norm": 1.6023458643282593, "learning_rate": 1.952061685234541e-06, "loss": 0.0455, "step": 6269 }, { "epoch": 2.8525932666060054, "grad_norm": 1.6357068340001077, "learning_rate": 1.951364435936974e-06, "loss": 0.0336, "step": 6270 }, { "epoch": 2.8530482256596907, "grad_norm": 1.3114123030529674, "learning_rate": 1.9506672314714285e-06, "loss": 0.0356, "step": 6271 }, { "epoch": 2.853503184713376, "grad_norm": 1.3600289422573908, "learning_rate": 1.9499700718948754e-06, "loss": 0.1262, "step": 6272 }, { "epoch": 2.853958143767061, "grad_norm": 1.2131089092201695, "learning_rate": 1.9492729572642846e-06, "loss": 0.0339, "step": 6273 }, { "epoch": 2.8544131028207462, "grad_norm": 1.315628086113743, "learning_rate": 1.94857588763662e-06, "loss": 0.0728, "step": 6274 }, { "epoch": 2.8548680618744315, "grad_norm": 1.4820229094932449, "learning_rate": 1.9478788630688444e-06, "loss": 0.0497, "step": 6275 }, { "epoch": 2.8553230209281164, "grad_norm": 0.8815916326150468, "learning_rate": 1.9471818836179137e-06, "loss": 0.0431, "step": 6276 }, { "epoch": 2.8557779799818017, "grad_norm": 1.7962076077859424, "learning_rate": 1.9464849493407836e-06, "loss": 0.051, "step": 6277 }, { "epoch": 2.856232939035487, "grad_norm": 1.2471879293495818, "learning_rate": 1.945788060294404e-06, "loss": 0.0313, "step": 6278 }, { "epoch": 2.856687898089172, "grad_norm": 1.3750119401327192, "learning_rate": 1.945091216535721e-06, "loss": 0.0478, "step": 6279 }, { "epoch": 2.857142857142857, "grad_norm": 1.535873006332577, "learning_rate": 1.9443944181216782e-06, "loss": 0.0399, "step": 6280 }, { "epoch": 2.8575978161965425, "grad_norm": 1.3285953331824087, "learning_rate": 1.9436976651092143e-06, "loss": 0.0609, "step": 6281 }, { "epoch": 2.8580527752502274, "grad_norm": 1.7161946511799746, "learning_rate": 1.943000957555265e-06, "loss": 0.0877, "step": 6282 }, { "epoch": 2.8585077343039127, "grad_norm": 1.348034324877862, "learning_rate": 1.9423042955167613e-06, "loss": 0.0374, "step": 6283 }, { "epoch": 2.858962693357598, "grad_norm": 1.1106847506588307, "learning_rate": 1.941607679050633e-06, "loss": 0.0271, "step": 6284 }, { "epoch": 2.859417652411283, "grad_norm": 1.2330709894857865, "learning_rate": 1.9409111082138034e-06, "loss": 0.0216, "step": 6285 }, { "epoch": 2.859872611464968, "grad_norm": 1.1160092838822422, "learning_rate": 1.9402145830631926e-06, "loss": 0.0294, "step": 6286 }, { "epoch": 2.8603275705186535, "grad_norm": 1.203461271832311, "learning_rate": 1.939518103655719e-06, "loss": 0.0372, "step": 6287 }, { "epoch": 2.8607825295723384, "grad_norm": 1.2629579680481686, "learning_rate": 1.938821670048295e-06, "loss": 0.0522, "step": 6288 }, { "epoch": 2.8612374886260237, "grad_norm": 1.368670374716161, "learning_rate": 1.938125282297829e-06, "loss": 0.0329, "step": 6289 }, { "epoch": 2.861692447679709, "grad_norm": 1.6808870420426625, "learning_rate": 1.9374289404612266e-06, "loss": 0.036, "step": 6290 }, { "epoch": 2.862147406733394, "grad_norm": 0.9217415354725401, "learning_rate": 1.9367326445953924e-06, "loss": 0.0613, "step": 6291 }, { "epoch": 2.862602365787079, "grad_norm": 1.2491979466820617, "learning_rate": 1.936036394757222e-06, "loss": 0.0344, "step": 6292 }, { "epoch": 2.8630573248407645, "grad_norm": 1.2590731861087066, "learning_rate": 1.9353401910036115e-06, "loss": 0.0534, "step": 6293 }, { "epoch": 2.8635122838944493, "grad_norm": 1.6472625892130635, "learning_rate": 1.934644033391449e-06, "loss": 0.0427, "step": 6294 }, { "epoch": 2.8639672429481347, "grad_norm": 1.3218035208961143, "learning_rate": 1.9339479219776246e-06, "loss": 0.0298, "step": 6295 }, { "epoch": 2.86442220200182, "grad_norm": 0.9817788417251342, "learning_rate": 1.9332518568190186e-06, "loss": 0.0223, "step": 6296 }, { "epoch": 2.864877161055505, "grad_norm": 1.1669107190208303, "learning_rate": 1.9325558379725113e-06, "loss": 0.0471, "step": 6297 }, { "epoch": 2.86533212010919, "grad_norm": 1.2216379317896175, "learning_rate": 1.931859865494979e-06, "loss": 0.0746, "step": 6298 }, { "epoch": 2.8657870791628755, "grad_norm": 1.1969887864935669, "learning_rate": 1.9311639394432926e-06, "loss": 0.0219, "step": 6299 }, { "epoch": 2.8662420382165603, "grad_norm": 1.7692361966630663, "learning_rate": 1.930468059874321e-06, "loss": 0.0382, "step": 6300 }, { "epoch": 2.8666969972702456, "grad_norm": 1.840377777493732, "learning_rate": 1.9297722268449264e-06, "loss": 0.0322, "step": 6301 }, { "epoch": 2.867151956323931, "grad_norm": 0.9268434824144577, "learning_rate": 1.9290764404119714e-06, "loss": 0.0668, "step": 6302 }, { "epoch": 2.867606915377616, "grad_norm": 1.7857890290034186, "learning_rate": 1.9283807006323104e-06, "loss": 0.1393, "step": 6303 }, { "epoch": 2.868061874431301, "grad_norm": 1.1851621638370684, "learning_rate": 1.9276850075627968e-06, "loss": 0.0416, "step": 6304 }, { "epoch": 2.8685168334849864, "grad_norm": 1.202189431541165, "learning_rate": 1.926989361260281e-06, "loss": 0.1045, "step": 6305 }, { "epoch": 2.8689717925386713, "grad_norm": 1.0959001464206397, "learning_rate": 1.9262937617816062e-06, "loss": 0.0295, "step": 6306 }, { "epoch": 2.8694267515923566, "grad_norm": 1.3733079832755208, "learning_rate": 1.925598209183615e-06, "loss": 0.0585, "step": 6307 }, { "epoch": 2.869881710646042, "grad_norm": 1.5235886662086287, "learning_rate": 1.924902703523144e-06, "loss": 0.0426, "step": 6308 }, { "epoch": 2.870336669699727, "grad_norm": 0.9383250145519952, "learning_rate": 1.924207244857027e-06, "loss": 0.0265, "step": 6309 }, { "epoch": 2.870791628753412, "grad_norm": 1.1650939407896763, "learning_rate": 1.9235118332420934e-06, "loss": 0.0412, "step": 6310 }, { "epoch": 2.8712465878070974, "grad_norm": 1.1250244926880153, "learning_rate": 1.9228164687351688e-06, "loss": 0.0528, "step": 6311 }, { "epoch": 2.8717015468607823, "grad_norm": 1.1497215473455082, "learning_rate": 1.9221211513930766e-06, "loss": 0.0573, "step": 6312 }, { "epoch": 2.8721565059144676, "grad_norm": 1.5109635265418992, "learning_rate": 1.9214258812726338e-06, "loss": 0.0856, "step": 6313 }, { "epoch": 2.872611464968153, "grad_norm": 1.7944079085493259, "learning_rate": 1.920730658430656e-06, "loss": 0.0522, "step": 6314 }, { "epoch": 2.8730664240218378, "grad_norm": 1.0174543541406718, "learning_rate": 1.920035482923952e-06, "loss": 0.0189, "step": 6315 }, { "epoch": 2.873521383075523, "grad_norm": 1.1690073999334192, "learning_rate": 1.91934035480933e-06, "loss": 0.0794, "step": 6316 }, { "epoch": 2.8739763421292084, "grad_norm": 1.0508820209715204, "learning_rate": 1.9186452741435914e-06, "loss": 0.0617, "step": 6317 }, { "epoch": 2.8744313011828937, "grad_norm": 1.6449654903921085, "learning_rate": 1.917950240983535e-06, "loss": 0.045, "step": 6318 }, { "epoch": 2.8748862602365786, "grad_norm": 1.747437085087716, "learning_rate": 1.917255255385957e-06, "loss": 0.0979, "step": 6319 }, { "epoch": 2.875341219290264, "grad_norm": 1.4117498676909022, "learning_rate": 1.916560317407648e-06, "loss": 0.0452, "step": 6320 }, { "epoch": 2.875796178343949, "grad_norm": 1.0568775439388487, "learning_rate": 1.9158654271053957e-06, "loss": 0.0253, "step": 6321 }, { "epoch": 2.876251137397634, "grad_norm": 0.8969398138470651, "learning_rate": 1.9151705845359825e-06, "loss": 0.0315, "step": 6322 }, { "epoch": 2.8767060964513194, "grad_norm": 1.8644524268588996, "learning_rate": 1.914475789756187e-06, "loss": 0.0398, "step": 6323 }, { "epoch": 2.8771610555050047, "grad_norm": 1.1908699831031493, "learning_rate": 1.913781042822787e-06, "loss": 0.0297, "step": 6324 }, { "epoch": 2.87761601455869, "grad_norm": 1.1470973864721496, "learning_rate": 1.913086343792552e-06, "loss": 0.0416, "step": 6325 }, { "epoch": 2.878070973612375, "grad_norm": 1.129534738719781, "learning_rate": 1.9123916927222506e-06, "loss": 0.0526, "step": 6326 }, { "epoch": 2.87852593266606, "grad_norm": 1.1690922455193518, "learning_rate": 1.9116970896686467e-06, "loss": 0.0504, "step": 6327 }, { "epoch": 2.8789808917197455, "grad_norm": 1.353922665441383, "learning_rate": 1.9110025346885e-06, "loss": 0.0321, "step": 6328 }, { "epoch": 2.8794358507734303, "grad_norm": 1.069213039368295, "learning_rate": 1.910308027838566e-06, "loss": 0.0557, "step": 6329 }, { "epoch": 2.8798908098271156, "grad_norm": 1.487269349425608, "learning_rate": 1.909613569175597e-06, "loss": 0.0638, "step": 6330 }, { "epoch": 2.880345768880801, "grad_norm": 1.3662055537269033, "learning_rate": 1.9089191587563414e-06, "loss": 0.0463, "step": 6331 }, { "epoch": 2.880800727934486, "grad_norm": 1.4867564869187215, "learning_rate": 1.9082247966375417e-06, "loss": 0.0491, "step": 6332 }, { "epoch": 2.881255686988171, "grad_norm": 1.2720141681248112, "learning_rate": 1.90753048287594e-06, "loss": 0.0224, "step": 6333 }, { "epoch": 2.8817106460418564, "grad_norm": 1.1995348921643425, "learning_rate": 1.906836217528272e-06, "loss": 0.0415, "step": 6334 }, { "epoch": 2.8821656050955413, "grad_norm": 1.6573800809190056, "learning_rate": 1.906142000651269e-06, "loss": 0.0873, "step": 6335 }, { "epoch": 2.8826205641492266, "grad_norm": 1.0814036961556814, "learning_rate": 1.9054478323016607e-06, "loss": 0.046, "step": 6336 }, { "epoch": 2.883075523202912, "grad_norm": 1.6689982485761452, "learning_rate": 1.9047537125361695e-06, "loss": 0.039, "step": 6337 }, { "epoch": 2.883530482256597, "grad_norm": 2.0177578278039476, "learning_rate": 1.9040596414115175e-06, "loss": 0.0655, "step": 6338 }, { "epoch": 2.883985441310282, "grad_norm": 1.5749442139724794, "learning_rate": 1.9033656189844196e-06, "loss": 0.0576, "step": 6339 }, { "epoch": 2.8844404003639674, "grad_norm": 0.912009435601261, "learning_rate": 1.9026716453115893e-06, "loss": 0.0311, "step": 6340 }, { "epoch": 2.8848953594176523, "grad_norm": 1.4992177493134904, "learning_rate": 1.9019777204497353e-06, "loss": 0.032, "step": 6341 }, { "epoch": 2.8853503184713376, "grad_norm": 2.3786153871538054, "learning_rate": 1.9012838444555605e-06, "loss": 0.0451, "step": 6342 }, { "epoch": 2.885805277525023, "grad_norm": 1.3620247417386477, "learning_rate": 1.900590017385767e-06, "loss": 0.0443, "step": 6343 }, { "epoch": 2.886260236578708, "grad_norm": 1.23931575273074, "learning_rate": 1.8998962392970496e-06, "loss": 0.0527, "step": 6344 }, { "epoch": 2.886715195632393, "grad_norm": 1.4006475131863647, "learning_rate": 1.899202510246102e-06, "loss": 0.0511, "step": 6345 }, { "epoch": 2.8871701546860784, "grad_norm": 0.9425720617345986, "learning_rate": 1.8985088302896113e-06, "loss": 0.0298, "step": 6346 }, { "epoch": 2.8876251137397633, "grad_norm": 1.120759421966426, "learning_rate": 1.8978151994842632e-06, "loss": 0.0231, "step": 6347 }, { "epoch": 2.8880800727934486, "grad_norm": 1.285843161305239, "learning_rate": 1.8971216178867378e-06, "loss": 0.0643, "step": 6348 }, { "epoch": 2.888535031847134, "grad_norm": 0.949070177125858, "learning_rate": 1.8964280855537106e-06, "loss": 0.0237, "step": 6349 }, { "epoch": 2.8889899909008188, "grad_norm": 1.0575025886614047, "learning_rate": 1.8957346025418555e-06, "loss": 0.0438, "step": 6350 }, { "epoch": 2.889444949954504, "grad_norm": 1.2182719950286616, "learning_rate": 1.895041168907839e-06, "loss": 0.0484, "step": 6351 }, { "epoch": 2.8898999090081894, "grad_norm": 1.6808154109751547, "learning_rate": 1.894347784708327e-06, "loss": 0.0355, "step": 6352 }, { "epoch": 2.8903548680618742, "grad_norm": 1.3357734587303143, "learning_rate": 1.8936544499999777e-06, "loss": 0.038, "step": 6353 }, { "epoch": 2.8908098271155596, "grad_norm": 1.430621791498753, "learning_rate": 1.892961164839449e-06, "loss": 0.0372, "step": 6354 }, { "epoch": 2.891264786169245, "grad_norm": 1.3393118802816406, "learning_rate": 1.892267929283393e-06, "loss": 0.0348, "step": 6355 }, { "epoch": 2.8917197452229297, "grad_norm": 1.7244820292819039, "learning_rate": 1.8915747433884567e-06, "loss": 0.0673, "step": 6356 }, { "epoch": 2.892174704276615, "grad_norm": 1.4186566914156615, "learning_rate": 1.8908816072112857e-06, "loss": 0.042, "step": 6357 }, { "epoch": 2.8926296633303004, "grad_norm": 1.6739141001929443, "learning_rate": 1.8901885208085186e-06, "loss": 0.0269, "step": 6358 }, { "epoch": 2.8930846223839852, "grad_norm": 0.9051300759352677, "learning_rate": 1.8894954842367912e-06, "loss": 0.0206, "step": 6359 }, { "epoch": 2.8935395814376705, "grad_norm": 0.8539185815476124, "learning_rate": 1.8888024975527359e-06, "loss": 0.032, "step": 6360 }, { "epoch": 2.893994540491356, "grad_norm": 1.6501893677741197, "learning_rate": 1.8881095608129807e-06, "loss": 0.0438, "step": 6361 }, { "epoch": 2.8944494995450407, "grad_norm": 1.006643793730625, "learning_rate": 1.8874166740741487e-06, "loss": 0.0187, "step": 6362 }, { "epoch": 2.894904458598726, "grad_norm": 1.3927634684085275, "learning_rate": 1.88672383739286e-06, "loss": 0.0748, "step": 6363 }, { "epoch": 2.8953594176524113, "grad_norm": 1.5465107175059818, "learning_rate": 1.8860310508257297e-06, "loss": 0.0405, "step": 6364 }, { "epoch": 2.895814376706096, "grad_norm": 1.0790205583140318, "learning_rate": 1.8853383144293693e-06, "loss": 0.0673, "step": 6365 }, { "epoch": 2.8962693357597815, "grad_norm": 2.3828150772662324, "learning_rate": 1.8846456282603858e-06, "loss": 0.0484, "step": 6366 }, { "epoch": 2.896724294813467, "grad_norm": 1.361738887827398, "learning_rate": 1.8839529923753822e-06, "loss": 0.0625, "step": 6367 }, { "epoch": 2.8971792538671517, "grad_norm": 1.1774442237707887, "learning_rate": 1.8832604068309588e-06, "loss": 0.0182, "step": 6368 }, { "epoch": 2.897634212920837, "grad_norm": 1.2722652096655345, "learning_rate": 1.8825678716837093e-06, "loss": 0.0393, "step": 6369 }, { "epoch": 2.8980891719745223, "grad_norm": 1.0034915829678432, "learning_rate": 1.8818753869902256e-06, "loss": 0.0191, "step": 6370 }, { "epoch": 2.8985441310282076, "grad_norm": 2.4252251205670037, "learning_rate": 1.8811829528070935e-06, "loss": 0.0903, "step": 6371 }, { "epoch": 2.8989990900818925, "grad_norm": 1.2022177534105134, "learning_rate": 1.8804905691908965e-06, "loss": 0.0461, "step": 6372 }, { "epoch": 2.899454049135578, "grad_norm": 0.9769488272906166, "learning_rate": 1.8797982361982118e-06, "loss": 0.0181, "step": 6373 }, { "epoch": 2.899909008189263, "grad_norm": 0.8762506984205896, "learning_rate": 1.8791059538856138e-06, "loss": 0.0212, "step": 6374 }, { "epoch": 2.900363967242948, "grad_norm": 1.507162325283675, "learning_rate": 1.8784137223096743e-06, "loss": 0.0599, "step": 6375 }, { "epoch": 2.9008189262966333, "grad_norm": 0.8881275648568184, "learning_rate": 1.8777215415269582e-06, "loss": 0.0319, "step": 6376 }, { "epoch": 2.9012738853503186, "grad_norm": 1.3152398636201532, "learning_rate": 1.8770294115940279e-06, "loss": 0.0182, "step": 6377 }, { "epoch": 2.901728844404004, "grad_norm": 1.4852616965203091, "learning_rate": 1.87633733256744e-06, "loss": 0.0505, "step": 6378 }, { "epoch": 2.902183803457689, "grad_norm": 2.285441581794636, "learning_rate": 1.8756453045037499e-06, "loss": 0.0569, "step": 6379 }, { "epoch": 2.902638762511374, "grad_norm": 1.3871292657715832, "learning_rate": 1.874953327459505e-06, "loss": 0.0498, "step": 6380 }, { "epoch": 2.9030937215650594, "grad_norm": 1.4950025200683998, "learning_rate": 1.874261401491251e-06, "loss": 0.0251, "step": 6381 }, { "epoch": 2.9035486806187443, "grad_norm": 1.3045662177991533, "learning_rate": 1.8735695266555306e-06, "loss": 0.03, "step": 6382 }, { "epoch": 2.9040036396724296, "grad_norm": 1.110991374287391, "learning_rate": 1.872877703008879e-06, "loss": 0.0335, "step": 6383 }, { "epoch": 2.904458598726115, "grad_norm": 1.4215839283342968, "learning_rate": 1.87218593060783e-06, "loss": 0.0507, "step": 6384 }, { "epoch": 2.9049135577797998, "grad_norm": 1.3246098008416807, "learning_rate": 1.8714942095089112e-06, "loss": 0.0756, "step": 6385 }, { "epoch": 2.905368516833485, "grad_norm": 2.130284179400489, "learning_rate": 1.8708025397686474e-06, "loss": 0.0333, "step": 6386 }, { "epoch": 2.9058234758871704, "grad_norm": 1.7874671866191363, "learning_rate": 1.8701109214435586e-06, "loss": 0.0292, "step": 6387 }, { "epoch": 2.9062784349408552, "grad_norm": 1.1995600400405613, "learning_rate": 1.8694193545901602e-06, "loss": 0.0653, "step": 6388 }, { "epoch": 2.9067333939945406, "grad_norm": 1.4642292117356566, "learning_rate": 1.868727839264965e-06, "loss": 0.0412, "step": 6389 }, { "epoch": 2.907188353048226, "grad_norm": 1.2175741159501654, "learning_rate": 1.86803637552448e-06, "loss": 0.0314, "step": 6390 }, { "epoch": 2.9076433121019107, "grad_norm": 1.6900241441225388, "learning_rate": 1.8673449634252087e-06, "loss": 0.029, "step": 6391 }, { "epoch": 2.908098271155596, "grad_norm": 1.117984193777972, "learning_rate": 1.8666536030236494e-06, "loss": 0.0333, "step": 6392 }, { "epoch": 2.9085532302092814, "grad_norm": 1.2849729382751411, "learning_rate": 1.8659622943762978e-06, "loss": 0.0499, "step": 6393 }, { "epoch": 2.9090081892629662, "grad_norm": 1.4731332313622902, "learning_rate": 1.865271037539645e-06, "loss": 0.0484, "step": 6394 }, { "epoch": 2.9094631483166515, "grad_norm": 1.5333705782629112, "learning_rate": 1.864579832570174e-06, "loss": 0.041, "step": 6395 }, { "epoch": 2.909918107370337, "grad_norm": 1.6609053613406775, "learning_rate": 1.8638886795243718e-06, "loss": 0.0489, "step": 6396 }, { "epoch": 2.9103730664240217, "grad_norm": 1.5418185108897091, "learning_rate": 1.863197578458714e-06, "loss": 0.0393, "step": 6397 }, { "epoch": 2.910828025477707, "grad_norm": 1.3724752767772408, "learning_rate": 1.8625065294296734e-06, "loss": 0.0211, "step": 6398 }, { "epoch": 2.9112829845313923, "grad_norm": 1.2130807621544626, "learning_rate": 1.8618155324937214e-06, "loss": 0.0354, "step": 6399 }, { "epoch": 2.911737943585077, "grad_norm": 1.3793944613939886, "learning_rate": 1.8611245877073214e-06, "loss": 0.0229, "step": 6400 }, { "epoch": 2.9121929026387625, "grad_norm": 1.020883418044874, "learning_rate": 1.8604336951269352e-06, "loss": 0.0214, "step": 6401 }, { "epoch": 2.912647861692448, "grad_norm": 1.2890040299070564, "learning_rate": 1.8597428548090183e-06, "loss": 0.044, "step": 6402 }, { "epoch": 2.9131028207461327, "grad_norm": 1.372679299321148, "learning_rate": 1.8590520668100243e-06, "loss": 0.0238, "step": 6403 }, { "epoch": 2.913557779799818, "grad_norm": 1.1229600385322536, "learning_rate": 1.8583613311864018e-06, "loss": 0.0332, "step": 6404 }, { "epoch": 2.9140127388535033, "grad_norm": 1.3699289599492959, "learning_rate": 1.8576706479945928e-06, "loss": 0.0338, "step": 6405 }, { "epoch": 2.914467697907188, "grad_norm": 1.6752313297137755, "learning_rate": 1.8569800172910384e-06, "loss": 0.0702, "step": 6406 }, { "epoch": 2.9149226569608735, "grad_norm": 1.0526611393524796, "learning_rate": 1.8562894391321725e-06, "loss": 0.0474, "step": 6407 }, { "epoch": 2.915377616014559, "grad_norm": 1.5306771716252712, "learning_rate": 1.8555989135744273e-06, "loss": 0.0325, "step": 6408 }, { "epoch": 2.9158325750682437, "grad_norm": 1.6885341440635415, "learning_rate": 1.854908440674228e-06, "loss": 0.0381, "step": 6409 }, { "epoch": 2.916287534121929, "grad_norm": 1.4216094407526791, "learning_rate": 1.8542180204879978e-06, "loss": 0.0492, "step": 6410 }, { "epoch": 2.9167424931756143, "grad_norm": 1.0950993164007932, "learning_rate": 1.8535276530721553e-06, "loss": 0.024, "step": 6411 }, { "epoch": 2.917197452229299, "grad_norm": 1.1242868000210768, "learning_rate": 1.852837338483113e-06, "loss": 0.024, "step": 6412 }, { "epoch": 2.9176524112829845, "grad_norm": 1.8323610650026383, "learning_rate": 1.8521470767772814e-06, "loss": 0.0504, "step": 6413 }, { "epoch": 2.91810737033667, "grad_norm": 1.1385274479199978, "learning_rate": 1.8514568680110646e-06, "loss": 0.0458, "step": 6414 }, { "epoch": 2.9185623293903546, "grad_norm": 1.0135728827496362, "learning_rate": 1.850766712240864e-06, "loss": 0.0177, "step": 6415 }, { "epoch": 2.91901728844404, "grad_norm": 0.7445065674641337, "learning_rate": 1.8500766095230749e-06, "loss": 0.0104, "step": 6416 }, { "epoch": 2.9194722474977253, "grad_norm": 1.1674411828452067, "learning_rate": 1.849386559914091e-06, "loss": 0.0444, "step": 6417 }, { "epoch": 2.91992720655141, "grad_norm": 1.578486502915576, "learning_rate": 1.8486965634702997e-06, "loss": 0.0361, "step": 6418 }, { "epoch": 2.9203821656050954, "grad_norm": 1.208262139974905, "learning_rate": 1.848006620248083e-06, "loss": 0.0499, "step": 6419 }, { "epoch": 2.9208371246587808, "grad_norm": 0.9514121870240937, "learning_rate": 1.847316730303822e-06, "loss": 0.0138, "step": 6420 }, { "epoch": 2.9212920837124656, "grad_norm": 0.8508934502807499, "learning_rate": 1.8466268936938895e-06, "loss": 0.0158, "step": 6421 }, { "epoch": 2.921747042766151, "grad_norm": 1.5559286052496935, "learning_rate": 1.845937110474657e-06, "loss": 0.032, "step": 6422 }, { "epoch": 2.9222020018198362, "grad_norm": 0.8280650968972749, "learning_rate": 1.8452473807024896e-06, "loss": 0.0232, "step": 6423 }, { "epoch": 2.922656960873521, "grad_norm": 1.1071725311616134, "learning_rate": 1.8445577044337492e-06, "loss": 0.0302, "step": 6424 }, { "epoch": 2.9231119199272064, "grad_norm": 1.8219988496745025, "learning_rate": 1.8438680817247944e-06, "loss": 0.0767, "step": 6425 }, { "epoch": 2.9235668789808917, "grad_norm": 1.3329206050642093, "learning_rate": 1.8431785126319761e-06, "loss": 0.0682, "step": 6426 }, { "epoch": 2.924021838034577, "grad_norm": 1.273344941425383, "learning_rate": 1.8424889972116442e-06, "loss": 0.0251, "step": 6427 }, { "epoch": 2.924476797088262, "grad_norm": 1.0665665668256277, "learning_rate": 1.8417995355201415e-06, "loss": 0.0393, "step": 6428 }, { "epoch": 2.9249317561419472, "grad_norm": 1.3409658888794163, "learning_rate": 1.8411101276138088e-06, "loss": 0.0548, "step": 6429 }, { "epoch": 2.9253867151956325, "grad_norm": 2.5247870088650544, "learning_rate": 1.8404207735489801e-06, "loss": 0.0481, "step": 6430 }, { "epoch": 2.9258416742493174, "grad_norm": 2.575716512709149, "learning_rate": 1.8397314733819876e-06, "loss": 0.0464, "step": 6431 }, { "epoch": 2.9262966333030027, "grad_norm": 2.0493306697807534, "learning_rate": 1.839042227169158e-06, "loss": 0.0556, "step": 6432 }, { "epoch": 2.926751592356688, "grad_norm": 1.220588520446331, "learning_rate": 1.8383530349668127e-06, "loss": 0.0526, "step": 6433 }, { "epoch": 2.9272065514103733, "grad_norm": 1.4622571644145574, "learning_rate": 1.8376638968312687e-06, "loss": 0.0191, "step": 6434 }, { "epoch": 2.927661510464058, "grad_norm": 1.8065129511367526, "learning_rate": 1.8369748128188408e-06, "loss": 0.0437, "step": 6435 }, { "epoch": 2.9281164695177435, "grad_norm": 1.572332823005696, "learning_rate": 1.836285782985836e-06, "loss": 0.0301, "step": 6436 }, { "epoch": 2.928571428571429, "grad_norm": 1.2384549409502843, "learning_rate": 1.8355968073885594e-06, "loss": 0.0578, "step": 6437 }, { "epoch": 2.9290263876251137, "grad_norm": 1.7907246647648518, "learning_rate": 1.8349078860833125e-06, "loss": 0.0407, "step": 6438 }, { "epoch": 2.929481346678799, "grad_norm": 1.0529419605013375, "learning_rate": 1.8342190191263892e-06, "loss": 0.0297, "step": 6439 }, { "epoch": 2.9299363057324843, "grad_norm": 1.2639376109230231, "learning_rate": 1.8335302065740812e-06, "loss": 0.0527, "step": 6440 }, { "epoch": 2.930391264786169, "grad_norm": 1.2222525191743743, "learning_rate": 1.8328414484826746e-06, "loss": 0.0274, "step": 6441 }, { "epoch": 2.9308462238398545, "grad_norm": 1.967000028282946, "learning_rate": 1.8321527449084525e-06, "loss": 0.0583, "step": 6442 }, { "epoch": 2.93130118289354, "grad_norm": 1.1995838134927836, "learning_rate": 1.8314640959076916e-06, "loss": 0.024, "step": 6443 }, { "epoch": 2.9317561419472247, "grad_norm": 1.335813956466345, "learning_rate": 1.8307755015366651e-06, "loss": 0.0212, "step": 6444 }, { "epoch": 2.93221110100091, "grad_norm": 1.2528422892446045, "learning_rate": 1.8300869618516434e-06, "loss": 0.0256, "step": 6445 }, { "epoch": 2.9326660600545953, "grad_norm": 1.0565175520794763, "learning_rate": 1.8293984769088896e-06, "loss": 0.0192, "step": 6446 }, { "epoch": 2.93312101910828, "grad_norm": 1.4404400377426394, "learning_rate": 1.828710046764664e-06, "loss": 0.0678, "step": 6447 }, { "epoch": 2.9335759781619655, "grad_norm": 0.7813404735197201, "learning_rate": 1.8280216714752215e-06, "loss": 0.0369, "step": 6448 }, { "epoch": 2.934030937215651, "grad_norm": 1.334215763627233, "learning_rate": 1.8273333510968142e-06, "loss": 0.0432, "step": 6449 }, { "epoch": 2.9344858962693356, "grad_norm": 1.1694727266290939, "learning_rate": 1.8266450856856871e-06, "loss": 0.0397, "step": 6450 }, { "epoch": 2.934940855323021, "grad_norm": 1.349401059294145, "learning_rate": 1.8259568752980818e-06, "loss": 0.0306, "step": 6451 }, { "epoch": 2.9353958143767063, "grad_norm": 1.0627152570897775, "learning_rate": 1.825268719990238e-06, "loss": 0.0289, "step": 6452 }, { "epoch": 2.935850773430391, "grad_norm": 1.3675490420485394, "learning_rate": 1.824580619818387e-06, "loss": 0.065, "step": 6453 }, { "epoch": 2.9363057324840764, "grad_norm": 2.016206109464844, "learning_rate": 1.823892574838758e-06, "loss": 0.0549, "step": 6454 }, { "epoch": 2.9367606915377618, "grad_norm": 1.259205374736832, "learning_rate": 1.8232045851075742e-06, "loss": 0.0254, "step": 6455 }, { "epoch": 2.9372156505914466, "grad_norm": 1.953418473809822, "learning_rate": 1.8225166506810555e-06, "loss": 0.0714, "step": 6456 }, { "epoch": 2.937670609645132, "grad_norm": 1.2887770627128665, "learning_rate": 1.821828771615416e-06, "loss": 0.0212, "step": 6457 }, { "epoch": 2.9381255686988172, "grad_norm": 2.2251132218295635, "learning_rate": 1.8211409479668663e-06, "loss": 0.0635, "step": 6458 }, { "epoch": 2.938580527752502, "grad_norm": 1.1393990534440006, "learning_rate": 1.820453179791614e-06, "loss": 0.0578, "step": 6459 }, { "epoch": 2.9390354868061874, "grad_norm": 1.144073019256667, "learning_rate": 1.8197654671458581e-06, "loss": 0.0202, "step": 6460 }, { "epoch": 2.9394904458598727, "grad_norm": 1.8892506981296877, "learning_rate": 1.819077810085797e-06, "loss": 0.0991, "step": 6461 }, { "epoch": 2.9399454049135576, "grad_norm": 1.5123033867412035, "learning_rate": 1.8183902086676217e-06, "loss": 0.0602, "step": 6462 }, { "epoch": 2.940400363967243, "grad_norm": 1.4949147664422306, "learning_rate": 1.8177026629475208e-06, "loss": 0.0474, "step": 6463 }, { "epoch": 2.9408553230209282, "grad_norm": 1.0091604326164765, "learning_rate": 1.8170151729816776e-06, "loss": 0.0303, "step": 6464 }, { "epoch": 2.941310282074613, "grad_norm": 1.0840854539236011, "learning_rate": 1.8163277388262678e-06, "loss": 0.0334, "step": 6465 }, { "epoch": 2.9417652411282984, "grad_norm": 1.4090577341673047, "learning_rate": 1.81564036053747e-06, "loss": 0.0565, "step": 6466 }, { "epoch": 2.9422202001819837, "grad_norm": 1.2412995727375888, "learning_rate": 1.8149530381714508e-06, "loss": 0.0297, "step": 6467 }, { "epoch": 2.9426751592356686, "grad_norm": 2.202566825026762, "learning_rate": 1.8142657717843756e-06, "loss": 0.0419, "step": 6468 }, { "epoch": 2.943130118289354, "grad_norm": 1.6812501417782229, "learning_rate": 1.8135785614324054e-06, "loss": 0.0323, "step": 6469 }, { "epoch": 2.943585077343039, "grad_norm": 1.1799658791888972, "learning_rate": 1.8128914071716943e-06, "loss": 0.0412, "step": 6470 }, { "epoch": 2.944040036396724, "grad_norm": 1.1854932034275336, "learning_rate": 1.8122043090583951e-06, "loss": 0.0491, "step": 6471 }, { "epoch": 2.9444949954504094, "grad_norm": 1.2940809401495506, "learning_rate": 1.811517267148653e-06, "loss": 0.0623, "step": 6472 }, { "epoch": 2.9449499545040947, "grad_norm": 0.9436733296411024, "learning_rate": 1.810830281498611e-06, "loss": 0.0339, "step": 6473 }, { "epoch": 2.9454049135577796, "grad_norm": 1.3006319302513918, "learning_rate": 1.8101433521644063e-06, "loss": 0.0423, "step": 6474 }, { "epoch": 2.945859872611465, "grad_norm": 1.2657378950998217, "learning_rate": 1.8094564792021713e-06, "loss": 0.0393, "step": 6475 }, { "epoch": 2.94631483166515, "grad_norm": 0.908333769545082, "learning_rate": 1.8087696626680352e-06, "loss": 0.0322, "step": 6476 }, { "epoch": 2.946769790718835, "grad_norm": 2.4177155638473775, "learning_rate": 1.8080829026181197e-06, "loss": 0.0689, "step": 6477 }, { "epoch": 2.9472247497725204, "grad_norm": 1.671868451532666, "learning_rate": 1.8073961991085453e-06, "loss": 0.076, "step": 6478 }, { "epoch": 2.9476797088262057, "grad_norm": 1.2921194789676902, "learning_rate": 1.8067095521954248e-06, "loss": 0.0655, "step": 6479 }, { "epoch": 2.9481346678798905, "grad_norm": 1.4539415004197282, "learning_rate": 1.8060229619348693e-06, "loss": 0.0376, "step": 6480 }, { "epoch": 2.948589626933576, "grad_norm": 1.2402812565296462, "learning_rate": 1.805336428382984e-06, "loss": 0.0392, "step": 6481 }, { "epoch": 2.949044585987261, "grad_norm": 1.1232274032633751, "learning_rate": 1.8046499515958683e-06, "loss": 0.0413, "step": 6482 }, { "epoch": 2.9494995450409465, "grad_norm": 1.3426806809915857, "learning_rate": 1.8039635316296184e-06, "loss": 0.0742, "step": 6483 }, { "epoch": 2.9499545040946313, "grad_norm": 1.7896024292088544, "learning_rate": 1.8032771685403252e-06, "loss": 0.0674, "step": 6484 }, { "epoch": 2.9504094631483166, "grad_norm": 1.1839950343387071, "learning_rate": 1.802590862384076e-06, "loss": 0.023, "step": 6485 }, { "epoch": 2.950864422202002, "grad_norm": 1.3388091196372573, "learning_rate": 1.801904613216951e-06, "loss": 0.0741, "step": 6486 }, { "epoch": 2.951319381255687, "grad_norm": 1.1361664065905532, "learning_rate": 1.801218421095029e-06, "loss": 0.0358, "step": 6487 }, { "epoch": 2.951774340309372, "grad_norm": 0.8800343998523913, "learning_rate": 1.8005322860743824e-06, "loss": 0.0273, "step": 6488 }, { "epoch": 2.9522292993630574, "grad_norm": 2.9567407581343543, "learning_rate": 1.7998462082110779e-06, "loss": 0.0386, "step": 6489 }, { "epoch": 2.9526842584167428, "grad_norm": 1.3966592736962737, "learning_rate": 1.7991601875611803e-06, "loss": 0.0288, "step": 6490 }, { "epoch": 2.9531392174704276, "grad_norm": 1.0952929234880837, "learning_rate": 1.7984742241807461e-06, "loss": 0.0172, "step": 6491 }, { "epoch": 2.953594176524113, "grad_norm": 1.3993181040525213, "learning_rate": 1.7977883181258316e-06, "loss": 0.04, "step": 6492 }, { "epoch": 2.9540491355777982, "grad_norm": 1.3939940514161064, "learning_rate": 1.797102469452483e-06, "loss": 0.0465, "step": 6493 }, { "epoch": 2.954504094631483, "grad_norm": 0.7734567215544216, "learning_rate": 1.7964166782167468e-06, "loss": 0.0178, "step": 6494 }, { "epoch": 2.9549590536851684, "grad_norm": 1.250385429497852, "learning_rate": 1.795730944474663e-06, "loss": 0.0289, "step": 6495 }, { "epoch": 2.9554140127388537, "grad_norm": 1.338179092990669, "learning_rate": 1.7950452682822655e-06, "loss": 0.0398, "step": 6496 }, { "epoch": 2.9558689717925386, "grad_norm": 1.1516867687692396, "learning_rate": 1.7943596496955856e-06, "loss": 0.0492, "step": 6497 }, { "epoch": 2.956323930846224, "grad_norm": 1.4590913208264673, "learning_rate": 1.7936740887706478e-06, "loss": 0.0518, "step": 6498 }, { "epoch": 2.9567788898999092, "grad_norm": 1.52718638999095, "learning_rate": 1.7929885855634743e-06, "loss": 0.0841, "step": 6499 }, { "epoch": 2.957233848953594, "grad_norm": 1.1573575396234783, "learning_rate": 1.79230314013008e-06, "loss": 0.0372, "step": 6500 }, { "epoch": 2.9576888080072794, "grad_norm": 1.65109114238502, "learning_rate": 1.7916177525264775e-06, "loss": 0.0718, "step": 6501 }, { "epoch": 2.9581437670609647, "grad_norm": 1.2976473336080638, "learning_rate": 1.790932422808674e-06, "loss": 0.0567, "step": 6502 }, { "epoch": 2.9585987261146496, "grad_norm": 1.362230893942166, "learning_rate": 1.7902471510326701e-06, "loss": 0.0349, "step": 6503 }, { "epoch": 2.959053685168335, "grad_norm": 1.3388268898852798, "learning_rate": 1.7895619372544636e-06, "loss": 0.0775, "step": 6504 }, { "epoch": 2.95950864422202, "grad_norm": 1.0166008263289066, "learning_rate": 1.7888767815300481e-06, "loss": 0.0496, "step": 6505 }, { "epoch": 2.959963603275705, "grad_norm": 1.5699098652329357, "learning_rate": 1.78819168391541e-06, "loss": 0.041, "step": 6506 }, { "epoch": 2.9604185623293904, "grad_norm": 1.1189905920273184, "learning_rate": 1.7875066444665324e-06, "loss": 0.0468, "step": 6507 }, { "epoch": 2.9608735213830757, "grad_norm": 1.1885568380961138, "learning_rate": 1.7868216632393951e-06, "loss": 0.0356, "step": 6508 }, { "epoch": 2.9613284804367606, "grad_norm": 1.087515797392964, "learning_rate": 1.7861367402899705e-06, "loss": 0.0338, "step": 6509 }, { "epoch": 2.961783439490446, "grad_norm": 0.704359193830936, "learning_rate": 1.7854518756742278e-06, "loss": 0.0283, "step": 6510 }, { "epoch": 2.962238398544131, "grad_norm": 1.1521104478146131, "learning_rate": 1.784767069448131e-06, "loss": 0.039, "step": 6511 }, { "epoch": 2.962693357597816, "grad_norm": 2.282322353472128, "learning_rate": 1.7840823216676395e-06, "loss": 0.0846, "step": 6512 }, { "epoch": 2.9631483166515014, "grad_norm": 1.3183786998186893, "learning_rate": 1.783397632388707e-06, "loss": 0.0335, "step": 6513 }, { "epoch": 2.9636032757051867, "grad_norm": 1.3859161845362749, "learning_rate": 1.7827130016672836e-06, "loss": 0.0641, "step": 6514 }, { "epoch": 2.9640582347588715, "grad_norm": 1.516108168823944, "learning_rate": 1.7820284295593155e-06, "loss": 0.0531, "step": 6515 }, { "epoch": 2.964513193812557, "grad_norm": 1.1709680516071004, "learning_rate": 1.7813439161207412e-06, "loss": 0.0407, "step": 6516 }, { "epoch": 2.964968152866242, "grad_norm": 1.3702535680861332, "learning_rate": 1.7806594614074973e-06, "loss": 0.0239, "step": 6517 }, { "epoch": 2.965423111919927, "grad_norm": 1.237433158228125, "learning_rate": 1.7799750654755126e-06, "loss": 0.0334, "step": 6518 }, { "epoch": 2.9658780709736123, "grad_norm": 2.024607590433574, "learning_rate": 1.7792907283807154e-06, "loss": 0.0547, "step": 6519 }, { "epoch": 2.9663330300272976, "grad_norm": 1.3803856061957827, "learning_rate": 1.778606450179024e-06, "loss": 0.0531, "step": 6520 }, { "epoch": 2.9667879890809825, "grad_norm": 1.1755136134219866, "learning_rate": 1.7779222309263556e-06, "loss": 0.0349, "step": 6521 }, { "epoch": 2.967242948134668, "grad_norm": 1.4012678416935043, "learning_rate": 1.7772380706786222e-06, "loss": 0.0712, "step": 6522 }, { "epoch": 2.967697907188353, "grad_norm": 1.2535151201576178, "learning_rate": 1.7765539694917294e-06, "loss": 0.0317, "step": 6523 }, { "epoch": 2.968152866242038, "grad_norm": 0.9617633668346146, "learning_rate": 1.7758699274215796e-06, "loss": 0.022, "step": 6524 }, { "epoch": 2.9686078252957233, "grad_norm": 1.6415944788953625, "learning_rate": 1.7751859445240688e-06, "loss": 0.0469, "step": 6525 }, { "epoch": 2.9690627843494086, "grad_norm": 1.3520261413085852, "learning_rate": 1.7745020208550897e-06, "loss": 0.0592, "step": 6526 }, { "epoch": 2.9695177434030935, "grad_norm": 1.2807964282359827, "learning_rate": 1.7738181564705288e-06, "loss": 0.0542, "step": 6527 }, { "epoch": 2.969972702456779, "grad_norm": 1.1267685341905755, "learning_rate": 1.7731343514262683e-06, "loss": 0.0498, "step": 6528 }, { "epoch": 2.970427661510464, "grad_norm": 1.6317550695780536, "learning_rate": 1.772450605778187e-06, "loss": 0.0718, "step": 6529 }, { "epoch": 2.970882620564149, "grad_norm": 1.2324165979358044, "learning_rate": 1.771766919582156e-06, "loss": 0.0259, "step": 6530 }, { "epoch": 2.9713375796178343, "grad_norm": 0.7681588310287301, "learning_rate": 1.7710832928940444e-06, "loss": 0.0228, "step": 6531 }, { "epoch": 2.9717925386715196, "grad_norm": 1.451632751551344, "learning_rate": 1.7703997257697136e-06, "loss": 0.0612, "step": 6532 }, { "epoch": 2.9722474977252045, "grad_norm": 1.5428696526101813, "learning_rate": 1.769716218265023e-06, "loss": 0.0412, "step": 6533 }, { "epoch": 2.97270245677889, "grad_norm": 1.5837701049320498, "learning_rate": 1.7690327704358245e-06, "loss": 0.0629, "step": 6534 }, { "epoch": 2.973157415832575, "grad_norm": 1.7579335748795921, "learning_rate": 1.7683493823379666e-06, "loss": 0.0746, "step": 6535 }, { "epoch": 2.9736123748862604, "grad_norm": 1.343315284536529, "learning_rate": 1.7676660540272945e-06, "loss": 0.086, "step": 6536 }, { "epoch": 2.9740673339399453, "grad_norm": 1.5241842554555087, "learning_rate": 1.7669827855596439e-06, "loss": 0.0326, "step": 6537 }, { "epoch": 2.9745222929936306, "grad_norm": 1.4193332434898622, "learning_rate": 1.766299576990851e-06, "loss": 0.0453, "step": 6538 }, { "epoch": 2.974977252047316, "grad_norm": 1.1754801177660499, "learning_rate": 1.765616428376743e-06, "loss": 0.0226, "step": 6539 }, { "epoch": 2.9754322111010008, "grad_norm": 1.409525065507084, "learning_rate": 1.7649333397731433e-06, "loss": 0.0295, "step": 6540 }, { "epoch": 2.975887170154686, "grad_norm": 1.0554611558452034, "learning_rate": 1.7642503112358725e-06, "loss": 0.0096, "step": 6541 }, { "epoch": 2.9763421292083714, "grad_norm": 1.417940216681696, "learning_rate": 1.7635673428207424e-06, "loss": 0.0422, "step": 6542 }, { "epoch": 2.9767970882620567, "grad_norm": 0.6603776009321584, "learning_rate": 1.762884434583564e-06, "loss": 0.0336, "step": 6543 }, { "epoch": 2.9772520473157416, "grad_norm": 1.2427927953607554, "learning_rate": 1.7622015865801412e-06, "loss": 0.0246, "step": 6544 }, { "epoch": 2.977707006369427, "grad_norm": 1.291793802476831, "learning_rate": 1.7615187988662724e-06, "loss": 0.0306, "step": 6545 }, { "epoch": 2.978161965423112, "grad_norm": 1.3795030599337146, "learning_rate": 1.760836071497753e-06, "loss": 0.0409, "step": 6546 }, { "epoch": 2.978616924476797, "grad_norm": 1.0946782229338219, "learning_rate": 1.7601534045303708e-06, "loss": 0.0173, "step": 6547 }, { "epoch": 2.9790718835304824, "grad_norm": 1.216451401551823, "learning_rate": 1.7594707980199125e-06, "loss": 0.0864, "step": 6548 }, { "epoch": 2.9795268425841677, "grad_norm": 1.4894551125558462, "learning_rate": 1.758788252022155e-06, "loss": 0.0652, "step": 6549 }, { "epoch": 2.9799818016378525, "grad_norm": 1.794702614469512, "learning_rate": 1.7581057665928747e-06, "loss": 0.0629, "step": 6550 }, { "epoch": 2.980436760691538, "grad_norm": 1.0576503620191264, "learning_rate": 1.7574233417878414e-06, "loss": 0.039, "step": 6551 }, { "epoch": 2.980891719745223, "grad_norm": 1.5898842222907013, "learning_rate": 1.7567409776628187e-06, "loss": 0.0617, "step": 6552 }, { "epoch": 2.981346678798908, "grad_norm": 0.9023347603978038, "learning_rate": 1.756058674273567e-06, "loss": 0.0394, "step": 6553 }, { "epoch": 2.9818016378525933, "grad_norm": 1.3755549484654794, "learning_rate": 1.755376431675841e-06, "loss": 0.03, "step": 6554 }, { "epoch": 2.9822565969062786, "grad_norm": 1.183005831587578, "learning_rate": 1.75469424992539e-06, "loss": 0.0403, "step": 6555 }, { "epoch": 2.9827115559599635, "grad_norm": 1.3766814844958197, "learning_rate": 1.754012129077959e-06, "loss": 0.0386, "step": 6556 }, { "epoch": 2.983166515013649, "grad_norm": 1.6235530212453555, "learning_rate": 1.7533300691892874e-06, "loss": 0.041, "step": 6557 }, { "epoch": 2.983621474067334, "grad_norm": 1.2607962164958089, "learning_rate": 1.752648070315112e-06, "loss": 0.0525, "step": 6558 }, { "epoch": 2.984076433121019, "grad_norm": 1.042801479318249, "learning_rate": 1.7519661325111603e-06, "loss": 0.0348, "step": 6559 }, { "epoch": 2.9845313921747043, "grad_norm": 1.2674796196132017, "learning_rate": 1.7512842558331588e-06, "loss": 0.056, "step": 6560 }, { "epoch": 2.9849863512283896, "grad_norm": 1.882851185780373, "learning_rate": 1.7506024403368262e-06, "loss": 0.0674, "step": 6561 }, { "epoch": 2.9854413102820745, "grad_norm": 1.7473257949517715, "learning_rate": 1.7499206860778786e-06, "loss": 0.0265, "step": 6562 }, { "epoch": 2.98589626933576, "grad_norm": 1.5254410276898396, "learning_rate": 1.7492389931120241e-06, "loss": 0.0389, "step": 6563 }, { "epoch": 2.986351228389445, "grad_norm": 1.4215686645827523, "learning_rate": 1.748557361494969e-06, "loss": 0.0526, "step": 6564 }, { "epoch": 2.98680618744313, "grad_norm": 1.067583190350828, "learning_rate": 1.7478757912824135e-06, "loss": 0.029, "step": 6565 }, { "epoch": 2.9872611464968153, "grad_norm": 1.0678145547583475, "learning_rate": 1.7471942825300514e-06, "loss": 0.0283, "step": 6566 }, { "epoch": 2.9877161055505006, "grad_norm": 1.496154088702822, "learning_rate": 1.7465128352935734e-06, "loss": 0.0358, "step": 6567 }, { "epoch": 2.9881710646041855, "grad_norm": 1.4399214697139529, "learning_rate": 1.7458314496286633e-06, "loss": 0.0244, "step": 6568 }, { "epoch": 2.988626023657871, "grad_norm": 0.8453009369510988, "learning_rate": 1.7451501255910014e-06, "loss": 0.0157, "step": 6569 }, { "epoch": 2.989080982711556, "grad_norm": 1.3955447470625273, "learning_rate": 1.7444688632362616e-06, "loss": 0.023, "step": 6570 }, { "epoch": 2.989535941765241, "grad_norm": 1.1745865595512803, "learning_rate": 1.743787662620115e-06, "loss": 0.0396, "step": 6571 }, { "epoch": 2.9899909008189263, "grad_norm": 1.4084912380486838, "learning_rate": 1.7431065237982258e-06, "loss": 0.0472, "step": 6572 }, { "epoch": 2.9904458598726116, "grad_norm": 1.260179212218826, "learning_rate": 1.7424254468262531e-06, "loss": 0.0551, "step": 6573 }, { "epoch": 2.9909008189262964, "grad_norm": 1.259582276297131, "learning_rate": 1.7417444317598522e-06, "loss": 0.0618, "step": 6574 }, { "epoch": 2.9913557779799818, "grad_norm": 1.4632755796054802, "learning_rate": 1.741063478654672e-06, "loss": 0.0882, "step": 6575 }, { "epoch": 2.991810737033667, "grad_norm": 1.277670749823898, "learning_rate": 1.7403825875663567e-06, "loss": 0.0498, "step": 6576 }, { "epoch": 2.992265696087352, "grad_norm": 1.425457231866722, "learning_rate": 1.7397017585505454e-06, "loss": 0.0322, "step": 6577 }, { "epoch": 2.9927206551410372, "grad_norm": 1.5845641611956964, "learning_rate": 1.7390209916628736e-06, "loss": 0.0708, "step": 6578 }, { "epoch": 2.9931756141947226, "grad_norm": 1.164097606797873, "learning_rate": 1.7383402869589696e-06, "loss": 0.0573, "step": 6579 }, { "epoch": 2.9936305732484074, "grad_norm": 1.4492128297604512, "learning_rate": 1.7376596444944583e-06, "loss": 0.0442, "step": 6580 }, { "epoch": 2.9940855323020927, "grad_norm": 1.0404048068474192, "learning_rate": 1.7369790643249573e-06, "loss": 0.0179, "step": 6581 }, { "epoch": 2.994540491355778, "grad_norm": 0.7873114688959021, "learning_rate": 1.7362985465060823e-06, "loss": 0.0112, "step": 6582 }, { "epoch": 2.994995450409463, "grad_norm": 1.49434953601512, "learning_rate": 1.7356180910934407e-06, "loss": 0.0298, "step": 6583 }, { "epoch": 2.9954504094631482, "grad_norm": 1.1921547491338333, "learning_rate": 1.7349376981426358e-06, "loss": 0.0416, "step": 6584 }, { "epoch": 2.9959053685168335, "grad_norm": 1.719800608107341, "learning_rate": 1.7342573677092684e-06, "loss": 0.0301, "step": 6585 }, { "epoch": 2.9963603275705184, "grad_norm": 1.2089215224119132, "learning_rate": 1.7335770998489304e-06, "loss": 0.0329, "step": 6586 }, { "epoch": 2.9968152866242037, "grad_norm": 1.1040209005084678, "learning_rate": 1.7328968946172114e-06, "loss": 0.0358, "step": 6587 }, { "epoch": 2.997270245677889, "grad_norm": 1.487780532456455, "learning_rate": 1.7322167520696933e-06, "loss": 0.0501, "step": 6588 }, { "epoch": 2.997725204731574, "grad_norm": 1.4336972326941086, "learning_rate": 1.7315366722619554e-06, "loss": 0.0463, "step": 6589 }, { "epoch": 2.998180163785259, "grad_norm": 1.2905577799675605, "learning_rate": 1.7308566552495698e-06, "loss": 0.039, "step": 6590 }, { "epoch": 2.9986351228389445, "grad_norm": 0.45917813626624837, "learning_rate": 1.7301767010881044e-06, "loss": 0.0106, "step": 6591 }, { "epoch": 2.99909008189263, "grad_norm": 1.6223189679198124, "learning_rate": 1.729496809833124e-06, "loss": 0.0464, "step": 6592 }, { "epoch": 2.9995450409463147, "grad_norm": 1.6963019898076352, "learning_rate": 1.7288169815401833e-06, "loss": 0.0385, "step": 6593 }, { "epoch": 3.0, "grad_norm": 0.8596105312896911, "learning_rate": 1.7281372162648375e-06, "loss": 0.0268, "step": 6594 }, { "epoch": 3.0004549590536853, "grad_norm": 0.8450929659665005, "learning_rate": 1.7274575140626318e-06, "loss": 0.007, "step": 6595 }, { "epoch": 3.00090991810737, "grad_norm": 1.0347077841424577, "learning_rate": 1.7267778749891097e-06, "loss": 0.0117, "step": 6596 }, { "epoch": 3.0013648771610555, "grad_norm": 0.9934564403343767, "learning_rate": 1.7260982990998075e-06, "loss": 0.0122, "step": 6597 }, { "epoch": 3.001819836214741, "grad_norm": 0.6037398453389746, "learning_rate": 1.7254187864502569e-06, "loss": 0.0154, "step": 6598 }, { "epoch": 3.0022747952684257, "grad_norm": 0.6764993930172275, "learning_rate": 1.724739337095986e-06, "loss": 0.0162, "step": 6599 }, { "epoch": 3.002729754322111, "grad_norm": 0.7920830267307468, "learning_rate": 1.724059951092515e-06, "loss": 0.008, "step": 6600 }, { "epoch": 3.0031847133757963, "grad_norm": 1.1791749026979832, "learning_rate": 1.7233806284953613e-06, "loss": 0.0298, "step": 6601 }, { "epoch": 3.003639672429481, "grad_norm": 0.6497200728706654, "learning_rate": 1.7227013693600348e-06, "loss": 0.0231, "step": 6602 }, { "epoch": 3.0040946314831665, "grad_norm": 0.7834715554379803, "learning_rate": 1.7220221737420428e-06, "loss": 0.0113, "step": 6603 }, { "epoch": 3.0045495905368518, "grad_norm": 0.6018642513065381, "learning_rate": 1.7213430416968848e-06, "loss": 0.0096, "step": 6604 }, { "epoch": 3.0050045495905366, "grad_norm": 1.2281258491108884, "learning_rate": 1.7206639732800568e-06, "loss": 0.029, "step": 6605 }, { "epoch": 3.005459508644222, "grad_norm": 0.6840678738239503, "learning_rate": 1.7199849685470498e-06, "loss": 0.0102, "step": 6606 }, { "epoch": 3.0059144676979073, "grad_norm": 1.035884263615007, "learning_rate": 1.7193060275533488e-06, "loss": 0.0176, "step": 6607 }, { "epoch": 3.0063694267515926, "grad_norm": 0.8044579648904372, "learning_rate": 1.718627150354434e-06, "loss": 0.0096, "step": 6608 }, { "epoch": 3.0068243858052774, "grad_norm": 0.6494355105866253, "learning_rate": 1.7179483370057797e-06, "loss": 0.0093, "step": 6609 }, { "epoch": 3.0072793448589628, "grad_norm": 0.8957268307514487, "learning_rate": 1.7172695875628553e-06, "loss": 0.016, "step": 6610 }, { "epoch": 3.007734303912648, "grad_norm": 1.6768217416438715, "learning_rate": 1.7165909020811255e-06, "loss": 0.028, "step": 6611 }, { "epoch": 3.008189262966333, "grad_norm": 0.8256028457218672, "learning_rate": 1.7159122806160488e-06, "loss": 0.0155, "step": 6612 }, { "epoch": 3.0086442220200182, "grad_norm": 1.3724174804558966, "learning_rate": 1.7152337232230798e-06, "loss": 0.0254, "step": 6613 }, { "epoch": 3.0090991810737036, "grad_norm": 0.5494582613145093, "learning_rate": 1.714555229957668e-06, "loss": 0.0194, "step": 6614 }, { "epoch": 3.0095541401273884, "grad_norm": 0.950724040036526, "learning_rate": 1.7138768008752545e-06, "loss": 0.0148, "step": 6615 }, { "epoch": 3.0100090991810737, "grad_norm": 0.7668026898966924, "learning_rate": 1.7131984360312799e-06, "loss": 0.0057, "step": 6616 }, { "epoch": 3.010464058234759, "grad_norm": 1.2993474822865427, "learning_rate": 1.7125201354811749e-06, "loss": 0.0043, "step": 6617 }, { "epoch": 3.010919017288444, "grad_norm": 1.0631606408933136, "learning_rate": 1.711841899280369e-06, "loss": 0.0181, "step": 6618 }, { "epoch": 3.011373976342129, "grad_norm": 0.9157784653077969, "learning_rate": 1.7111637274842827e-06, "loss": 0.018, "step": 6619 }, { "epoch": 3.0118289353958145, "grad_norm": 0.5997471915420164, "learning_rate": 1.7104856201483346e-06, "loss": 0.0061, "step": 6620 }, { "epoch": 3.0122838944494994, "grad_norm": 0.8025506818963243, "learning_rate": 1.709807577327937e-06, "loss": 0.026, "step": 6621 }, { "epoch": 3.0127388535031847, "grad_norm": 0.7058672496033535, "learning_rate": 1.7091295990784952e-06, "loss": 0.0134, "step": 6622 }, { "epoch": 3.01319381255687, "grad_norm": 1.1425768475304325, "learning_rate": 1.708451685455411e-06, "loss": 0.0246, "step": 6623 }, { "epoch": 3.013648771610555, "grad_norm": 1.8291440751180574, "learning_rate": 1.7077738365140805e-06, "loss": 0.0197, "step": 6624 }, { "epoch": 3.01410373066424, "grad_norm": 1.8407133746958415, "learning_rate": 1.707096052309895e-06, "loss": 0.032, "step": 6625 }, { "epoch": 3.0145586897179255, "grad_norm": 1.7054435208125864, "learning_rate": 1.706418332898238e-06, "loss": 0.0286, "step": 6626 }, { "epoch": 3.0150136487716104, "grad_norm": 0.4826674578713323, "learning_rate": 1.7057406783344918e-06, "loss": 0.0043, "step": 6627 }, { "epoch": 3.0154686078252957, "grad_norm": 0.6534844005402582, "learning_rate": 1.705063088674031e-06, "loss": 0.0112, "step": 6628 }, { "epoch": 3.015923566878981, "grad_norm": 0.7657525579018354, "learning_rate": 1.704385563972224e-06, "loss": 0.0063, "step": 6629 }, { "epoch": 3.016378525932666, "grad_norm": 1.0520146138627524, "learning_rate": 1.7037081042844367e-06, "loss": 0.0095, "step": 6630 }, { "epoch": 3.016833484986351, "grad_norm": 1.0609600907438999, "learning_rate": 1.7030307096660262e-06, "loss": 0.0118, "step": 6631 }, { "epoch": 3.0172884440400365, "grad_norm": 1.2959467297350045, "learning_rate": 1.7023533801723474e-06, "loss": 0.0411, "step": 6632 }, { "epoch": 3.0177434030937214, "grad_norm": 0.9912304468016923, "learning_rate": 1.7016761158587474e-06, "loss": 0.0198, "step": 6633 }, { "epoch": 3.0181983621474067, "grad_norm": 0.6681184077581709, "learning_rate": 1.7009989167805707e-06, "loss": 0.0171, "step": 6634 }, { "epoch": 3.018653321201092, "grad_norm": 0.8423342557301313, "learning_rate": 1.7003217829931545e-06, "loss": 0.0055, "step": 6635 }, { "epoch": 3.0191082802547773, "grad_norm": 0.8280781806985265, "learning_rate": 1.6996447145518307e-06, "loss": 0.024, "step": 6636 }, { "epoch": 3.019563239308462, "grad_norm": 1.2712898741435057, "learning_rate": 1.6989677115119268e-06, "loss": 0.0132, "step": 6637 }, { "epoch": 3.0200181983621475, "grad_norm": 1.3212906158508197, "learning_rate": 1.6982907739287636e-06, "loss": 0.012, "step": 6638 }, { "epoch": 3.0204731574158328, "grad_norm": 0.4795615505234215, "learning_rate": 1.6976139018576581e-06, "loss": 0.0063, "step": 6639 }, { "epoch": 3.0209281164695176, "grad_norm": 1.3566368673110294, "learning_rate": 1.6969370953539202e-06, "loss": 0.0258, "step": 6640 }, { "epoch": 3.021383075523203, "grad_norm": 0.9795900012768285, "learning_rate": 1.6962603544728567e-06, "loss": 0.0198, "step": 6641 }, { "epoch": 3.0218380345768883, "grad_norm": 0.8535682675137541, "learning_rate": 1.695583679269768e-06, "loss": 0.0126, "step": 6642 }, { "epoch": 3.022292993630573, "grad_norm": 0.5123868198468435, "learning_rate": 1.6949070697999479e-06, "loss": 0.0079, "step": 6643 }, { "epoch": 3.0227479526842584, "grad_norm": 1.1653260816774855, "learning_rate": 1.6942305261186865e-06, "loss": 0.0557, "step": 6644 }, { "epoch": 3.0232029117379438, "grad_norm": 1.0281588838528546, "learning_rate": 1.6935540482812678e-06, "loss": 0.0071, "step": 6645 }, { "epoch": 3.0236578707916286, "grad_norm": 0.7588554586888987, "learning_rate": 1.6928776363429699e-06, "loss": 0.0046, "step": 6646 }, { "epoch": 3.024112829845314, "grad_norm": 1.554362522616789, "learning_rate": 1.6922012903590663e-06, "loss": 0.0295, "step": 6647 }, { "epoch": 3.0245677888989992, "grad_norm": 1.161736490885813, "learning_rate": 1.691525010384826e-06, "loss": 0.0208, "step": 6648 }, { "epoch": 3.025022747952684, "grad_norm": 0.9085941737657202, "learning_rate": 1.6908487964755105e-06, "loss": 0.0062, "step": 6649 }, { "epoch": 3.0254777070063694, "grad_norm": 1.0458226290781643, "learning_rate": 1.690172648686378e-06, "loss": 0.0211, "step": 6650 }, { "epoch": 3.0259326660600547, "grad_norm": 0.7632912154739573, "learning_rate": 1.6894965670726782e-06, "loss": 0.013, "step": 6651 }, { "epoch": 3.0263876251137396, "grad_norm": 0.9337432746401524, "learning_rate": 1.6888205516896599e-06, "loss": 0.0136, "step": 6652 }, { "epoch": 3.026842584167425, "grad_norm": 0.6657879044504245, "learning_rate": 1.6881446025925624e-06, "loss": 0.0186, "step": 6653 }, { "epoch": 3.02729754322111, "grad_norm": 0.6980149954950136, "learning_rate": 1.6874687198366207e-06, "loss": 0.0104, "step": 6654 }, { "epoch": 3.027752502274795, "grad_norm": 0.5795905849015976, "learning_rate": 1.6867929034770672e-06, "loss": 0.0149, "step": 6655 }, { "epoch": 3.0282074613284804, "grad_norm": 0.9313874254005436, "learning_rate": 1.6861171535691245e-06, "loss": 0.0092, "step": 6656 }, { "epoch": 3.0286624203821657, "grad_norm": 0.761175270213566, "learning_rate": 1.6854414701680133e-06, "loss": 0.0072, "step": 6657 }, { "epoch": 3.0291173794358506, "grad_norm": 1.1955236815762362, "learning_rate": 1.684765853328946e-06, "loss": 0.023, "step": 6658 }, { "epoch": 3.029572338489536, "grad_norm": 1.2598827393227265, "learning_rate": 1.684090303107132e-06, "loss": 0.0196, "step": 6659 }, { "epoch": 3.030027297543221, "grad_norm": 1.1460055164938023, "learning_rate": 1.6834148195577737e-06, "loss": 0.0208, "step": 6660 }, { "epoch": 3.030482256596906, "grad_norm": 1.1611833606080506, "learning_rate": 1.6827394027360678e-06, "loss": 0.0366, "step": 6661 }, { "epoch": 3.0309372156505914, "grad_norm": 0.5411446248193297, "learning_rate": 1.6820640526972083e-06, "loss": 0.0127, "step": 6662 }, { "epoch": 3.0313921747042767, "grad_norm": 0.9672919541290518, "learning_rate": 1.68138876949638e-06, "loss": 0.0277, "step": 6663 }, { "epoch": 3.031847133757962, "grad_norm": 1.2280099073756496, "learning_rate": 1.6807135531887653e-06, "loss": 0.0098, "step": 6664 }, { "epoch": 3.032302092811647, "grad_norm": 0.8291349905793312, "learning_rate": 1.6800384038295386e-06, "loss": 0.0126, "step": 6665 }, { "epoch": 3.032757051865332, "grad_norm": 1.2205341763887938, "learning_rate": 1.6793633214738713e-06, "loss": 0.0169, "step": 6666 }, { "epoch": 3.0332120109190175, "grad_norm": 0.7307183925757023, "learning_rate": 1.6786883061769268e-06, "loss": 0.0049, "step": 6667 }, { "epoch": 3.0336669699727024, "grad_norm": 1.2732361845879236, "learning_rate": 1.6780133579938646e-06, "loss": 0.0415, "step": 6668 }, { "epoch": 3.0341219290263877, "grad_norm": 0.6475454815811565, "learning_rate": 1.6773384769798395e-06, "loss": 0.0102, "step": 6669 }, { "epoch": 3.034576888080073, "grad_norm": 0.779274929341675, "learning_rate": 1.6766636631899986e-06, "loss": 0.0056, "step": 6670 }, { "epoch": 3.035031847133758, "grad_norm": 0.7576420223637901, "learning_rate": 1.6759889166794851e-06, "loss": 0.0186, "step": 6671 }, { "epoch": 3.035486806187443, "grad_norm": 1.1514060942899789, "learning_rate": 1.6753142375034359e-06, "loss": 0.0345, "step": 6672 }, { "epoch": 3.0359417652411285, "grad_norm": 0.43433264605019306, "learning_rate": 1.6746396257169836e-06, "loss": 0.0036, "step": 6673 }, { "epoch": 3.0363967242948133, "grad_norm": 1.246262594507008, "learning_rate": 1.6739650813752526e-06, "loss": 0.0162, "step": 6674 }, { "epoch": 3.0368516833484986, "grad_norm": 1.7251378966697013, "learning_rate": 1.6732906045333651e-06, "loss": 0.027, "step": 6675 }, { "epoch": 3.037306642402184, "grad_norm": 0.9930950202200053, "learning_rate": 1.6726161952464371e-06, "loss": 0.0145, "step": 6676 }, { "epoch": 3.037761601455869, "grad_norm": 0.9392088135952996, "learning_rate": 1.6719418535695764e-06, "loss": 0.0158, "step": 6677 }, { "epoch": 3.038216560509554, "grad_norm": 0.6311894090301353, "learning_rate": 1.6712675795578883e-06, "loss": 0.0246, "step": 6678 }, { "epoch": 3.0386715195632394, "grad_norm": 1.3688291252585874, "learning_rate": 1.6705933732664708e-06, "loss": 0.0258, "step": 6679 }, { "epoch": 3.0391264786169243, "grad_norm": 0.6166305445638867, "learning_rate": 1.6699192347504178e-06, "loss": 0.015, "step": 6680 }, { "epoch": 3.0395814376706096, "grad_norm": 1.2402281195680713, "learning_rate": 1.669245164064815e-06, "loss": 0.035, "step": 6681 }, { "epoch": 3.040036396724295, "grad_norm": 0.932272326089519, "learning_rate": 1.6685711612647466e-06, "loss": 0.0133, "step": 6682 }, { "epoch": 3.04049135577798, "grad_norm": 0.9172821345276787, "learning_rate": 1.6678972264052884e-06, "loss": 0.0164, "step": 6683 }, { "epoch": 3.040946314831665, "grad_norm": 0.8750182827606666, "learning_rate": 1.667223359541511e-06, "loss": 0.0227, "step": 6684 }, { "epoch": 3.0414012738853504, "grad_norm": 0.7118178968955645, "learning_rate": 1.66654956072848e-06, "loss": 0.0198, "step": 6685 }, { "epoch": 3.0418562329390353, "grad_norm": 0.9310567799562325, "learning_rate": 1.6658758300212552e-06, "loss": 0.0338, "step": 6686 }, { "epoch": 3.0423111919927206, "grad_norm": 0.8050716544088155, "learning_rate": 1.66520216747489e-06, "loss": 0.0107, "step": 6687 }, { "epoch": 3.042766151046406, "grad_norm": 0.8787722838000573, "learning_rate": 1.6645285731444332e-06, "loss": 0.0105, "step": 6688 }, { "epoch": 3.0432211101000908, "grad_norm": 0.8513553752649614, "learning_rate": 1.6638550470849298e-06, "loss": 0.0163, "step": 6689 }, { "epoch": 3.043676069153776, "grad_norm": 0.7620474631670393, "learning_rate": 1.6631815893514154e-06, "loss": 0.0181, "step": 6690 }, { "epoch": 3.0441310282074614, "grad_norm": 1.1842706773700475, "learning_rate": 1.6625081999989228e-06, "loss": 0.0133, "step": 6691 }, { "epoch": 3.0445859872611467, "grad_norm": 0.8088964928895361, "learning_rate": 1.6618348790824778e-06, "loss": 0.012, "step": 6692 }, { "epoch": 3.0450409463148316, "grad_norm": 0.9490610230328843, "learning_rate": 1.6611616266571017e-06, "loss": 0.0394, "step": 6693 }, { "epoch": 3.045495905368517, "grad_norm": 1.1777088537203837, "learning_rate": 1.660488442777809e-06, "loss": 0.0117, "step": 6694 }, { "epoch": 3.045950864422202, "grad_norm": 1.1690496755428184, "learning_rate": 1.6598153274996088e-06, "loss": 0.0204, "step": 6695 }, { "epoch": 3.046405823475887, "grad_norm": 1.5475429077322242, "learning_rate": 1.6591422808775068e-06, "loss": 0.0153, "step": 6696 }, { "epoch": 3.0468607825295724, "grad_norm": 0.7259840668727896, "learning_rate": 1.6584693029665e-06, "loss": 0.0142, "step": 6697 }, { "epoch": 3.0473157415832577, "grad_norm": 0.6963564033020999, "learning_rate": 1.657796393821582e-06, "loss": 0.0354, "step": 6698 }, { "epoch": 3.0477707006369426, "grad_norm": 0.7267424930498577, "learning_rate": 1.6571235534977383e-06, "loss": 0.0098, "step": 6699 }, { "epoch": 3.048225659690628, "grad_norm": 1.0088562484195005, "learning_rate": 1.6564507820499526e-06, "loss": 0.0169, "step": 6700 }, { "epoch": 3.048680618744313, "grad_norm": 0.9175464428936948, "learning_rate": 1.6557780795331984e-06, "loss": 0.0157, "step": 6701 }, { "epoch": 3.049135577797998, "grad_norm": 0.8172318728922892, "learning_rate": 1.6551054460024468e-06, "loss": 0.0264, "step": 6702 }, { "epoch": 3.0495905368516834, "grad_norm": 1.4372246126605657, "learning_rate": 1.6544328815126639e-06, "loss": 0.0085, "step": 6703 }, { "epoch": 3.0500454959053687, "grad_norm": 0.8481595367855065, "learning_rate": 1.6537603861188068e-06, "loss": 0.0162, "step": 6704 }, { "epoch": 3.0505004549590535, "grad_norm": 1.3613541029644705, "learning_rate": 1.6530879598758299e-06, "loss": 0.028, "step": 6705 }, { "epoch": 3.050955414012739, "grad_norm": 0.741722505182829, "learning_rate": 1.6524156028386796e-06, "loss": 0.0191, "step": 6706 }, { "epoch": 3.051410373066424, "grad_norm": 0.5700766848941015, "learning_rate": 1.6517433150622992e-06, "loss": 0.0063, "step": 6707 }, { "epoch": 3.051865332120109, "grad_norm": 1.2687442257313741, "learning_rate": 1.651071096601624e-06, "loss": 0.0277, "step": 6708 }, { "epoch": 3.0523202911737943, "grad_norm": 1.1437349195130555, "learning_rate": 1.6503989475115842e-06, "loss": 0.0267, "step": 6709 }, { "epoch": 3.0527752502274796, "grad_norm": 0.9498961967828254, "learning_rate": 1.6497268678471069e-06, "loss": 0.0056, "step": 6710 }, { "epoch": 3.0532302092811645, "grad_norm": 0.5939965985390613, "learning_rate": 1.6490548576631095e-06, "loss": 0.0077, "step": 6711 }, { "epoch": 3.05368516833485, "grad_norm": 1.0001775646420945, "learning_rate": 1.648382917014507e-06, "loss": 0.0165, "step": 6712 }, { "epoch": 3.054140127388535, "grad_norm": 0.845024912571423, "learning_rate": 1.6477110459562062e-06, "loss": 0.0176, "step": 6713 }, { "epoch": 3.05459508644222, "grad_norm": 0.9459441019539542, "learning_rate": 1.64703924454311e-06, "loss": 0.0111, "step": 6714 }, { "epoch": 3.0550500454959053, "grad_norm": 0.6063711413384143, "learning_rate": 1.6463675128301146e-06, "loss": 0.0089, "step": 6715 }, { "epoch": 3.0555050045495906, "grad_norm": 0.8566158589919646, "learning_rate": 1.6456958508721106e-06, "loss": 0.01, "step": 6716 }, { "epoch": 3.055959963603276, "grad_norm": 1.6454214801419333, "learning_rate": 1.6450242587239845e-06, "loss": 0.0495, "step": 6717 }, { "epoch": 3.056414922656961, "grad_norm": 1.0787509268860203, "learning_rate": 1.6443527364406142e-06, "loss": 0.0175, "step": 6718 }, { "epoch": 3.056869881710646, "grad_norm": 0.79309001600755, "learning_rate": 1.6436812840768751e-06, "loss": 0.0154, "step": 6719 }, { "epoch": 3.0573248407643314, "grad_norm": 0.7867281810988929, "learning_rate": 1.6430099016876345e-06, "loss": 0.0115, "step": 6720 }, { "epoch": 3.0577797998180163, "grad_norm": 0.8778890228067188, "learning_rate": 1.6423385893277537e-06, "loss": 0.0176, "step": 6721 }, { "epoch": 3.0582347588717016, "grad_norm": 1.0601817397132967, "learning_rate": 1.6416673470520912e-06, "loss": 0.0218, "step": 6722 }, { "epoch": 3.058689717925387, "grad_norm": 0.8295387860236446, "learning_rate": 1.6409961749154952e-06, "loss": 0.0052, "step": 6723 }, { "epoch": 3.0591446769790718, "grad_norm": 1.8743570687483877, "learning_rate": 1.6403250729728134e-06, "loss": 0.0243, "step": 6724 }, { "epoch": 3.059599636032757, "grad_norm": 1.2187628235866097, "learning_rate": 1.639654041278885e-06, "loss": 0.0174, "step": 6725 }, { "epoch": 3.0600545950864424, "grad_norm": 0.9618550509348607, "learning_rate": 1.6389830798885425e-06, "loss": 0.0123, "step": 6726 }, { "epoch": 3.0605095541401273, "grad_norm": 0.7603110016650619, "learning_rate": 1.638312188856615e-06, "loss": 0.0051, "step": 6727 }, { "epoch": 3.0609645131938126, "grad_norm": 0.6854944378075257, "learning_rate": 1.6376413682379232e-06, "loss": 0.0049, "step": 6728 }, { "epoch": 3.061419472247498, "grad_norm": 0.6749098108240004, "learning_rate": 1.6369706180872851e-06, "loss": 0.0136, "step": 6729 }, { "epoch": 3.0618744313011828, "grad_norm": 0.8033394261045683, "learning_rate": 1.63629993845951e-06, "loss": 0.0064, "step": 6730 }, { "epoch": 3.062329390354868, "grad_norm": 1.4217447175343614, "learning_rate": 1.6356293294094037e-06, "loss": 0.01, "step": 6731 }, { "epoch": 3.0627843494085534, "grad_norm": 1.2563973950338267, "learning_rate": 1.6349587909917655e-06, "loss": 0.005, "step": 6732 }, { "epoch": 3.0632393084622382, "grad_norm": 1.1162399221836268, "learning_rate": 1.6342883232613883e-06, "loss": 0.0274, "step": 6733 }, { "epoch": 3.0636942675159236, "grad_norm": 0.7425836319926284, "learning_rate": 1.63361792627306e-06, "loss": 0.0098, "step": 6734 }, { "epoch": 3.064149226569609, "grad_norm": 1.431387858004974, "learning_rate": 1.6329476000815616e-06, "loss": 0.0385, "step": 6735 }, { "epoch": 3.0646041856232937, "grad_norm": 1.0741739751149875, "learning_rate": 1.6322773447416707e-06, "loss": 0.0089, "step": 6736 }, { "epoch": 3.065059144676979, "grad_norm": 1.5305036740339377, "learning_rate": 1.6316071603081551e-06, "loss": 0.0131, "step": 6737 }, { "epoch": 3.0655141037306644, "grad_norm": 1.179800516125893, "learning_rate": 1.6309370468357816e-06, "loss": 0.0205, "step": 6738 }, { "epoch": 3.065969062784349, "grad_norm": 1.7818318356538472, "learning_rate": 1.6302670043793084e-06, "loss": 0.0108, "step": 6739 }, { "epoch": 3.0664240218380345, "grad_norm": 1.8062586691496543, "learning_rate": 1.6295970329934873e-06, "loss": 0.0255, "step": 6740 }, { "epoch": 3.06687898089172, "grad_norm": 1.116919398349887, "learning_rate": 1.6289271327330663e-06, "loss": 0.0204, "step": 6741 }, { "epoch": 3.0673339399454047, "grad_norm": 1.2943012776479366, "learning_rate": 1.628257303652786e-06, "loss": 0.0321, "step": 6742 }, { "epoch": 3.06778889899909, "grad_norm": 0.8713414553428358, "learning_rate": 1.6275875458073828e-06, "loss": 0.0341, "step": 6743 }, { "epoch": 3.0682438580527753, "grad_norm": 3.530549207774594, "learning_rate": 1.6269178592515844e-06, "loss": 0.021, "step": 6744 }, { "epoch": 3.06869881710646, "grad_norm": 0.7626578768952453, "learning_rate": 1.6262482440401162e-06, "loss": 0.0102, "step": 6745 }, { "epoch": 3.0691537761601455, "grad_norm": 0.6088350614090168, "learning_rate": 1.6255787002276962e-06, "loss": 0.0053, "step": 6746 }, { "epoch": 3.069608735213831, "grad_norm": 0.7969218239738025, "learning_rate": 1.6249092278690353e-06, "loss": 0.0195, "step": 6747 }, { "epoch": 3.070063694267516, "grad_norm": 0.7633311263843767, "learning_rate": 1.6242398270188412e-06, "loss": 0.0071, "step": 6748 }, { "epoch": 3.070518653321201, "grad_norm": 0.9409471012855168, "learning_rate": 1.6235704977318128e-06, "loss": 0.0136, "step": 6749 }, { "epoch": 3.0709736123748863, "grad_norm": 1.1658155669706312, "learning_rate": 1.622901240062646e-06, "loss": 0.035, "step": 6750 }, { "epoch": 3.0714285714285716, "grad_norm": 0.8906406356577609, "learning_rate": 1.622232054066028e-06, "loss": 0.015, "step": 6751 }, { "epoch": 3.0718835304822565, "grad_norm": 2.5564701197276234, "learning_rate": 1.6215629397966432e-06, "loss": 0.0418, "step": 6752 }, { "epoch": 3.072338489535942, "grad_norm": 1.0159486915540066, "learning_rate": 1.620893897309168e-06, "loss": 0.012, "step": 6753 }, { "epoch": 3.072793448589627, "grad_norm": 0.8390181569065237, "learning_rate": 1.620224926658274e-06, "loss": 0.0121, "step": 6754 }, { "epoch": 3.073248407643312, "grad_norm": 0.9782543710930848, "learning_rate": 1.619556027898625e-06, "loss": 0.0167, "step": 6755 }, { "epoch": 3.0737033666969973, "grad_norm": 1.3471697664897195, "learning_rate": 1.6188872010848821e-06, "loss": 0.0101, "step": 6756 }, { "epoch": 3.0741583257506826, "grad_norm": 1.2384031422429242, "learning_rate": 1.6182184462716977e-06, "loss": 0.0302, "step": 6757 }, { "epoch": 3.0746132848043675, "grad_norm": 1.1421063564877423, "learning_rate": 1.617549763513719e-06, "loss": 0.037, "step": 6758 }, { "epoch": 3.0750682438580528, "grad_norm": 1.0107287837220154, "learning_rate": 1.6168811528655897e-06, "loss": 0.0124, "step": 6759 }, { "epoch": 3.075523202911738, "grad_norm": 0.21293497064312586, "learning_rate": 1.616212614381944e-06, "loss": 0.0008, "step": 6760 }, { "epoch": 3.075978161965423, "grad_norm": 0.4787183225052584, "learning_rate": 1.6155441481174128e-06, "loss": 0.0054, "step": 6761 }, { "epoch": 3.0764331210191083, "grad_norm": 1.2681875175911026, "learning_rate": 1.614875754126619e-06, "loss": 0.0139, "step": 6762 }, { "epoch": 3.0768880800727936, "grad_norm": 0.7019731367890785, "learning_rate": 1.614207432464182e-06, "loss": 0.0092, "step": 6763 }, { "epoch": 3.0773430391264784, "grad_norm": 1.0910213629465615, "learning_rate": 1.6135391831847127e-06, "loss": 0.0096, "step": 6764 }, { "epoch": 3.0777979981801638, "grad_norm": 1.0649835386120086, "learning_rate": 1.6128710063428179e-06, "loss": 0.0185, "step": 6765 }, { "epoch": 3.078252957233849, "grad_norm": 1.0925005008114888, "learning_rate": 1.612202901993099e-06, "loss": 0.0309, "step": 6766 }, { "epoch": 3.078707916287534, "grad_norm": 0.858909865315797, "learning_rate": 1.6115348701901496e-06, "loss": 0.0132, "step": 6767 }, { "epoch": 3.0791628753412192, "grad_norm": 1.0128984256449127, "learning_rate": 1.6108669109885583e-06, "loss": 0.0198, "step": 6768 }, { "epoch": 3.0796178343949046, "grad_norm": 1.3972934270294168, "learning_rate": 1.6101990244429077e-06, "loss": 0.0189, "step": 6769 }, { "epoch": 3.0800727934485894, "grad_norm": 0.8656446189357758, "learning_rate": 1.6095312106077749e-06, "loss": 0.0089, "step": 6770 }, { "epoch": 3.0805277525022747, "grad_norm": 1.3808347798340106, "learning_rate": 1.6088634695377294e-06, "loss": 0.0259, "step": 6771 }, { "epoch": 3.08098271155596, "grad_norm": 1.5699905762726054, "learning_rate": 1.6081958012873367e-06, "loss": 0.0123, "step": 6772 }, { "epoch": 3.0814376706096454, "grad_norm": 0.8280292982415257, "learning_rate": 1.6075282059111565e-06, "loss": 0.0103, "step": 6773 }, { "epoch": 3.08189262966333, "grad_norm": 0.9195083904865452, "learning_rate": 1.6068606834637406e-06, "loss": 0.0166, "step": 6774 }, { "epoch": 3.0823475887170155, "grad_norm": 1.2050405131287232, "learning_rate": 1.6061932339996366e-06, "loss": 0.0091, "step": 6775 }, { "epoch": 3.082802547770701, "grad_norm": 0.37619309567170367, "learning_rate": 1.605525857573385e-06, "loss": 0.0035, "step": 6776 }, { "epoch": 3.0832575068243857, "grad_norm": 1.48739364212107, "learning_rate": 1.604858554239521e-06, "loss": 0.0155, "step": 6777 }, { "epoch": 3.083712465878071, "grad_norm": 1.8936591345736504, "learning_rate": 1.6041913240525735e-06, "loss": 0.0369, "step": 6778 }, { "epoch": 3.0841674249317563, "grad_norm": 1.0506530202098834, "learning_rate": 1.6035241670670648e-06, "loss": 0.0131, "step": 6779 }, { "epoch": 3.084622383985441, "grad_norm": 0.729436050816002, "learning_rate": 1.6028570833375134e-06, "loss": 0.0102, "step": 6780 }, { "epoch": 3.0850773430391265, "grad_norm": 0.9777720191051373, "learning_rate": 1.6021900729184299e-06, "loss": 0.0061, "step": 6781 }, { "epoch": 3.085532302092812, "grad_norm": 0.6878654470260818, "learning_rate": 1.601523135864319e-06, "loss": 0.0171, "step": 6782 }, { "epoch": 3.0859872611464967, "grad_norm": 0.9110847442866048, "learning_rate": 1.6008562722296797e-06, "loss": 0.0154, "step": 6783 }, { "epoch": 3.086442220200182, "grad_norm": 1.0913670892534941, "learning_rate": 1.6001894820690058e-06, "loss": 0.0191, "step": 6784 }, { "epoch": 3.0868971792538673, "grad_norm": 2.105104416301491, "learning_rate": 1.5995227654367833e-06, "loss": 0.0108, "step": 6785 }, { "epoch": 3.087352138307552, "grad_norm": 1.0419423330648991, "learning_rate": 1.5988561223874938e-06, "loss": 0.031, "step": 6786 }, { "epoch": 3.0878070973612375, "grad_norm": 1.0678163389072868, "learning_rate": 1.598189552975613e-06, "loss": 0.0204, "step": 6787 }, { "epoch": 3.088262056414923, "grad_norm": 0.9003324347960118, "learning_rate": 1.5975230572556094e-06, "loss": 0.0141, "step": 6788 }, { "epoch": 3.0887170154686077, "grad_norm": 2.5482273764944545, "learning_rate": 1.596856635281946e-06, "loss": 0.0191, "step": 6789 }, { "epoch": 3.089171974522293, "grad_norm": 1.2869606159184679, "learning_rate": 1.5961902871090801e-06, "loss": 0.0288, "step": 6790 }, { "epoch": 3.0896269335759783, "grad_norm": 0.5516605319203002, "learning_rate": 1.5955240127914617e-06, "loss": 0.0034, "step": 6791 }, { "epoch": 3.090081892629663, "grad_norm": 1.1444559349943701, "learning_rate": 1.594857812383537e-06, "loss": 0.0221, "step": 6792 }, { "epoch": 3.0905368516833485, "grad_norm": 2.2159795799511297, "learning_rate": 1.5941916859397432e-06, "loss": 0.0331, "step": 6793 }, { "epoch": 3.0909918107370338, "grad_norm": 1.947119713145705, "learning_rate": 1.593525633514515e-06, "loss": 0.0394, "step": 6794 }, { "epoch": 3.0914467697907186, "grad_norm": 1.2817045276896206, "learning_rate": 1.5928596551622785e-06, "loss": 0.0492, "step": 6795 }, { "epoch": 3.091901728844404, "grad_norm": 1.9053878767288515, "learning_rate": 1.592193750937454e-06, "loss": 0.0172, "step": 6796 }, { "epoch": 3.0923566878980893, "grad_norm": 0.9643710329678407, "learning_rate": 1.5915279208944572e-06, "loss": 0.018, "step": 6797 }, { "epoch": 3.092811646951774, "grad_norm": 0.9194090405419986, "learning_rate": 1.5908621650876956e-06, "loss": 0.0143, "step": 6798 }, { "epoch": 3.0932666060054594, "grad_norm": 1.4100529411440688, "learning_rate": 1.5901964835715728e-06, "loss": 0.025, "step": 6799 }, { "epoch": 3.0937215650591448, "grad_norm": 0.8138477451628192, "learning_rate": 1.5895308764004835e-06, "loss": 0.0079, "step": 6800 }, { "epoch": 3.0941765241128296, "grad_norm": 0.864140794895651, "learning_rate": 1.5888653436288198e-06, "loss": 0.0096, "step": 6801 }, { "epoch": 3.094631483166515, "grad_norm": 0.7495923013789474, "learning_rate": 1.5881998853109665e-06, "loss": 0.0084, "step": 6802 }, { "epoch": 3.0950864422202002, "grad_norm": 1.7656733224740215, "learning_rate": 1.5875345015012999e-06, "loss": 0.0334, "step": 6803 }, { "epoch": 3.0955414012738856, "grad_norm": 0.7383551907323553, "learning_rate": 1.586869192254194e-06, "loss": 0.0178, "step": 6804 }, { "epoch": 3.0959963603275704, "grad_norm": 0.9634677149216688, "learning_rate": 1.5862039576240134e-06, "loss": 0.0111, "step": 6805 }, { "epoch": 3.0964513193812557, "grad_norm": 1.471138467148381, "learning_rate": 1.5855387976651194e-06, "loss": 0.0142, "step": 6806 }, { "epoch": 3.096906278434941, "grad_norm": 1.5183922434610342, "learning_rate": 1.584873712431864e-06, "loss": 0.0168, "step": 6807 }, { "epoch": 3.097361237488626, "grad_norm": 1.4405331364039804, "learning_rate": 1.5842087019785966e-06, "loss": 0.0182, "step": 6808 }, { "epoch": 3.097816196542311, "grad_norm": 0.7157471872355154, "learning_rate": 1.583543766359659e-06, "loss": 0.0082, "step": 6809 }, { "epoch": 3.0982711555959965, "grad_norm": 1.1376928994790576, "learning_rate": 1.5828789056293857e-06, "loss": 0.0159, "step": 6810 }, { "epoch": 3.0987261146496814, "grad_norm": 1.1505880099910708, "learning_rate": 1.5822141198421068e-06, "loss": 0.0237, "step": 6811 }, { "epoch": 3.0991810737033667, "grad_norm": 0.8694515830390649, "learning_rate": 1.581549409052145e-06, "loss": 0.0193, "step": 6812 }, { "epoch": 3.099636032757052, "grad_norm": 1.0476155128949531, "learning_rate": 1.5808847733138182e-06, "loss": 0.0118, "step": 6813 }, { "epoch": 3.100090991810737, "grad_norm": 1.497344416764289, "learning_rate": 1.5802202126814365e-06, "loss": 0.0423, "step": 6814 }, { "epoch": 3.100545950864422, "grad_norm": 0.8196907402150779, "learning_rate": 1.5795557272093053e-06, "loss": 0.0154, "step": 6815 }, { "epoch": 3.1010009099181075, "grad_norm": 0.9183340532789931, "learning_rate": 1.578891316951724e-06, "loss": 0.0073, "step": 6816 }, { "epoch": 3.1014558689717924, "grad_norm": 0.6524199879713473, "learning_rate": 1.5782269819629843e-06, "loss": 0.0083, "step": 6817 }, { "epoch": 3.1019108280254777, "grad_norm": 0.7017209359874211, "learning_rate": 1.5775627222973734e-06, "loss": 0.0099, "step": 6818 }, { "epoch": 3.102365787079163, "grad_norm": 0.7718135608945799, "learning_rate": 1.5768985380091703e-06, "loss": 0.0123, "step": 6819 }, { "epoch": 3.102820746132848, "grad_norm": 0.84186046803478, "learning_rate": 1.5762344291526507e-06, "loss": 0.0184, "step": 6820 }, { "epoch": 3.103275705186533, "grad_norm": 0.727398097107073, "learning_rate": 1.575570395782081e-06, "loss": 0.0098, "step": 6821 }, { "epoch": 3.1037306642402185, "grad_norm": 1.785920237127084, "learning_rate": 1.5749064379517242e-06, "loss": 0.0508, "step": 6822 }, { "epoch": 3.1041856232939034, "grad_norm": 1.0751109913886137, "learning_rate": 1.5742425557158362e-06, "loss": 0.0104, "step": 6823 }, { "epoch": 3.1046405823475887, "grad_norm": 1.6412858938506705, "learning_rate": 1.5735787491286653e-06, "loss": 0.0356, "step": 6824 }, { "epoch": 3.105095541401274, "grad_norm": 0.8152813715854906, "learning_rate": 1.5729150182444559e-06, "loss": 0.0149, "step": 6825 }, { "epoch": 3.105550500454959, "grad_norm": 0.7731715843572944, "learning_rate": 1.5722513631174445e-06, "loss": 0.0111, "step": 6826 }, { "epoch": 3.106005459508644, "grad_norm": 0.8366672655762668, "learning_rate": 1.5715877838018615e-06, "loss": 0.0118, "step": 6827 }, { "epoch": 3.1064604185623295, "grad_norm": 1.058647157502258, "learning_rate": 1.5709242803519314e-06, "loss": 0.0166, "step": 6828 }, { "epoch": 3.1069153776160148, "grad_norm": 1.0678009207792802, "learning_rate": 1.570260852821875e-06, "loss": 0.0097, "step": 6829 }, { "epoch": 3.1073703366696996, "grad_norm": 1.0986743071530403, "learning_rate": 1.569597501265902e-06, "loss": 0.0125, "step": 6830 }, { "epoch": 3.107825295723385, "grad_norm": 1.3684334778576193, "learning_rate": 1.5689342257382206e-06, "loss": 0.0208, "step": 6831 }, { "epoch": 3.1082802547770703, "grad_norm": 0.9897523102593918, "learning_rate": 1.5682710262930287e-06, "loss": 0.0162, "step": 6832 }, { "epoch": 3.108735213830755, "grad_norm": 0.9092361319063863, "learning_rate": 1.5676079029845215e-06, "loss": 0.0176, "step": 6833 }, { "epoch": 3.1091901728844404, "grad_norm": 1.1752465698981858, "learning_rate": 1.5669448558668855e-06, "loss": 0.0173, "step": 6834 }, { "epoch": 3.1096451319381258, "grad_norm": 1.6999023712910912, "learning_rate": 1.5662818849943011e-06, "loss": 0.0162, "step": 6835 }, { "epoch": 3.1101000909918106, "grad_norm": 0.6111044239065148, "learning_rate": 1.5656189904209463e-06, "loss": 0.0347, "step": 6836 }, { "epoch": 3.110555050045496, "grad_norm": 1.1791035379641472, "learning_rate": 1.5649561722009868e-06, "loss": 0.0151, "step": 6837 }, { "epoch": 3.1110100090991812, "grad_norm": 0.9252623530668961, "learning_rate": 1.564293430388587e-06, "loss": 0.0133, "step": 6838 }, { "epoch": 3.111464968152866, "grad_norm": 0.9055184831091005, "learning_rate": 1.563630765037902e-06, "loss": 0.0421, "step": 6839 }, { "epoch": 3.1119199272065514, "grad_norm": 0.5890183906689261, "learning_rate": 1.562968176203083e-06, "loss": 0.0098, "step": 6840 }, { "epoch": 3.1123748862602367, "grad_norm": 1.4759712988538831, "learning_rate": 1.5623056639382721e-06, "loss": 0.0164, "step": 6841 }, { "epoch": 3.1128298453139216, "grad_norm": 0.8236202658685771, "learning_rate": 1.5616432282976075e-06, "loss": 0.0214, "step": 6842 }, { "epoch": 3.113284804367607, "grad_norm": 0.8931575393392439, "learning_rate": 1.5609808693352217e-06, "loss": 0.0274, "step": 6843 }, { "epoch": 3.113739763421292, "grad_norm": 1.068840709617675, "learning_rate": 1.5603185871052378e-06, "loss": 0.0303, "step": 6844 }, { "epoch": 3.114194722474977, "grad_norm": 1.0915148776447108, "learning_rate": 1.5596563816617766e-06, "loss": 0.0171, "step": 6845 }, { "epoch": 3.1146496815286624, "grad_norm": 0.5805912396267162, "learning_rate": 1.5589942530589482e-06, "loss": 0.0053, "step": 6846 }, { "epoch": 3.1151046405823477, "grad_norm": 0.9057346742710283, "learning_rate": 1.5583322013508605e-06, "loss": 0.0144, "step": 6847 }, { "epoch": 3.1155595996360326, "grad_norm": 0.6167670690510106, "learning_rate": 1.5576702265916126e-06, "loss": 0.0093, "step": 6848 }, { "epoch": 3.116014558689718, "grad_norm": 0.8911814477690756, "learning_rate": 1.5570083288352977e-06, "loss": 0.0174, "step": 6849 }, { "epoch": 3.116469517743403, "grad_norm": 0.7121317876787063, "learning_rate": 1.5563465081360047e-06, "loss": 0.0118, "step": 6850 }, { "epoch": 3.116924476797088, "grad_norm": 0.39754572408202316, "learning_rate": 1.5556847645478128e-06, "loss": 0.0035, "step": 6851 }, { "epoch": 3.1173794358507734, "grad_norm": 0.5672313766013756, "learning_rate": 1.5550230981247983e-06, "loss": 0.0126, "step": 6852 }, { "epoch": 3.1178343949044587, "grad_norm": 1.3434837575680543, "learning_rate": 1.5543615089210279e-06, "loss": 0.0129, "step": 6853 }, { "epoch": 3.1182893539581436, "grad_norm": 1.4852799262784222, "learning_rate": 1.553699996990565e-06, "loss": 0.0278, "step": 6854 }, { "epoch": 3.118744313011829, "grad_norm": 0.9757775829095106, "learning_rate": 1.5530385623874643e-06, "loss": 0.0129, "step": 6855 }, { "epoch": 3.119199272065514, "grad_norm": 0.8144000690281554, "learning_rate": 1.5523772051657757e-06, "loss": 0.0223, "step": 6856 }, { "epoch": 3.1196542311191995, "grad_norm": 1.009991359844638, "learning_rate": 1.5517159253795434e-06, "loss": 0.0214, "step": 6857 }, { "epoch": 3.1201091901728844, "grad_norm": 1.2852412584914283, "learning_rate": 1.5510547230828026e-06, "loss": 0.0341, "step": 6858 }, { "epoch": 3.1205641492265697, "grad_norm": 0.6651650645913721, "learning_rate": 1.550393598329585e-06, "loss": 0.0084, "step": 6859 }, { "epoch": 3.121019108280255, "grad_norm": 0.7583512807543024, "learning_rate": 1.5497325511739136e-06, "loss": 0.0078, "step": 6860 }, { "epoch": 3.12147406733394, "grad_norm": 1.3461326750563023, "learning_rate": 1.5490715816698077e-06, "loss": 0.0382, "step": 6861 }, { "epoch": 3.121929026387625, "grad_norm": 1.075104247993585, "learning_rate": 1.5484106898712771e-06, "loss": 0.0191, "step": 6862 }, { "epoch": 3.1223839854413105, "grad_norm": 0.6005434603978939, "learning_rate": 1.5477498758323268e-06, "loss": 0.0139, "step": 6863 }, { "epoch": 3.1228389444949953, "grad_norm": 1.6595143048873662, "learning_rate": 1.547089139606957e-06, "loss": 0.0238, "step": 6864 }, { "epoch": 3.1232939035486806, "grad_norm": 1.1523632215004413, "learning_rate": 1.54642848124916e-06, "loss": 0.0109, "step": 6865 }, { "epoch": 3.123748862602366, "grad_norm": 0.9461426794745755, "learning_rate": 1.5457679008129205e-06, "loss": 0.0164, "step": 6866 }, { "epoch": 3.124203821656051, "grad_norm": 0.7893522781185096, "learning_rate": 1.5451073983522196e-06, "loss": 0.0088, "step": 6867 }, { "epoch": 3.124658780709736, "grad_norm": 1.9817803086420613, "learning_rate": 1.5444469739210291e-06, "loss": 0.0216, "step": 6868 }, { "epoch": 3.1251137397634214, "grad_norm": 0.7383444265918763, "learning_rate": 1.543786627573317e-06, "loss": 0.009, "step": 6869 }, { "epoch": 3.1255686988171063, "grad_norm": 1.442334756371306, "learning_rate": 1.543126359363043e-06, "loss": 0.0143, "step": 6870 }, { "epoch": 3.1260236578707916, "grad_norm": 1.112522475298696, "learning_rate": 1.5424661693441618e-06, "loss": 0.0176, "step": 6871 }, { "epoch": 3.126478616924477, "grad_norm": 1.0671950425985754, "learning_rate": 1.5418060575706218e-06, "loss": 0.0203, "step": 6872 }, { "epoch": 3.126933575978162, "grad_norm": 1.025795271184022, "learning_rate": 1.5411460240963627e-06, "loss": 0.0405, "step": 6873 }, { "epoch": 3.127388535031847, "grad_norm": 2.0977605535351698, "learning_rate": 1.5404860689753216e-06, "loss": 0.0401, "step": 6874 }, { "epoch": 3.1278434940855324, "grad_norm": 1.248473750418072, "learning_rate": 1.5398261922614244e-06, "loss": 0.0378, "step": 6875 }, { "epoch": 3.1282984531392173, "grad_norm": 1.744085668410383, "learning_rate": 1.5391663940085958e-06, "loss": 0.034, "step": 6876 }, { "epoch": 3.1287534121929026, "grad_norm": 0.482600937133836, "learning_rate": 1.538506674270749e-06, "loss": 0.0039, "step": 6877 }, { "epoch": 3.129208371246588, "grad_norm": 1.118894646494006, "learning_rate": 1.5378470331017955e-06, "loss": 0.0078, "step": 6878 }, { "epoch": 3.1296633303002728, "grad_norm": 0.6997549284036975, "learning_rate": 1.5371874705556377e-06, "loss": 0.0152, "step": 6879 }, { "epoch": 3.130118289353958, "grad_norm": 1.4138556490334668, "learning_rate": 1.5365279866861716e-06, "loss": 0.0158, "step": 6880 }, { "epoch": 3.1305732484076434, "grad_norm": 0.7018032329403289, "learning_rate": 1.535868581547288e-06, "loss": 0.0177, "step": 6881 }, { "epoch": 3.1310282074613287, "grad_norm": 0.8503489543514207, "learning_rate": 1.5352092551928691e-06, "loss": 0.0147, "step": 6882 }, { "epoch": 3.1314831665150136, "grad_norm": 1.477338646020259, "learning_rate": 1.5345500076767932e-06, "loss": 0.022, "step": 6883 }, { "epoch": 3.131938125568699, "grad_norm": 1.5844280387673884, "learning_rate": 1.5338908390529302e-06, "loss": 0.0374, "step": 6884 }, { "epoch": 3.132393084622384, "grad_norm": 2.296058702477279, "learning_rate": 1.5332317493751452e-06, "loss": 0.0253, "step": 6885 }, { "epoch": 3.132848043676069, "grad_norm": 0.4824828136249776, "learning_rate": 1.5325727386972963e-06, "loss": 0.0048, "step": 6886 }, { "epoch": 3.1333030027297544, "grad_norm": 0.629505027865736, "learning_rate": 1.531913807073234e-06, "loss": 0.0068, "step": 6887 }, { "epoch": 3.1337579617834397, "grad_norm": 1.3413876809002014, "learning_rate": 1.531254954556804e-06, "loss": 0.0126, "step": 6888 }, { "epoch": 3.1342129208371245, "grad_norm": 0.9709124060524114, "learning_rate": 1.5305961812018435e-06, "loss": 0.0179, "step": 6889 }, { "epoch": 3.13466787989081, "grad_norm": 1.0187868783249074, "learning_rate": 1.5299374870621859e-06, "loss": 0.0307, "step": 6890 }, { "epoch": 3.135122838944495, "grad_norm": 1.3928784453707155, "learning_rate": 1.529278872191655e-06, "loss": 0.0372, "step": 6891 }, { "epoch": 3.13557779799818, "grad_norm": 0.6433585319892282, "learning_rate": 1.528620336644072e-06, "loss": 0.014, "step": 6892 }, { "epoch": 3.1360327570518653, "grad_norm": 1.1771282449005798, "learning_rate": 1.5279618804732481e-06, "loss": 0.0293, "step": 6893 }, { "epoch": 3.1364877161055507, "grad_norm": 0.8298089325325243, "learning_rate": 1.5273035037329898e-06, "loss": 0.0108, "step": 6894 }, { "epoch": 3.1369426751592355, "grad_norm": 0.9439697198187752, "learning_rate": 1.5266452064770964e-06, "loss": 0.0308, "step": 6895 }, { "epoch": 3.137397634212921, "grad_norm": 0.7968658697383201, "learning_rate": 1.5259869887593618e-06, "loss": 0.0171, "step": 6896 }, { "epoch": 3.137852593266606, "grad_norm": 1.1492535906057653, "learning_rate": 1.525328850633571e-06, "loss": 0.0148, "step": 6897 }, { "epoch": 3.138307552320291, "grad_norm": 0.8360880725308368, "learning_rate": 1.5246707921535043e-06, "loss": 0.0087, "step": 6898 }, { "epoch": 3.1387625113739763, "grad_norm": 0.9515498500873363, "learning_rate": 1.524012813372937e-06, "loss": 0.0224, "step": 6899 }, { "epoch": 3.1392174704276616, "grad_norm": 0.8281937980553093, "learning_rate": 1.5233549143456348e-06, "loss": 0.0204, "step": 6900 }, { "epoch": 3.1396724294813465, "grad_norm": 1.1728063376151743, "learning_rate": 1.522697095125359e-06, "loss": 0.0195, "step": 6901 }, { "epoch": 3.140127388535032, "grad_norm": 1.1920412872701194, "learning_rate": 1.5220393557658621e-06, "loss": 0.0115, "step": 6902 }, { "epoch": 3.140582347588717, "grad_norm": 0.7949597645115948, "learning_rate": 1.5213816963208938e-06, "loss": 0.0245, "step": 6903 }, { "epoch": 3.141037306642402, "grad_norm": 1.011808445541511, "learning_rate": 1.5207241168441928e-06, "loss": 0.0144, "step": 6904 }, { "epoch": 3.1414922656960873, "grad_norm": 0.7466560729522829, "learning_rate": 1.5200666173894945e-06, "loss": 0.0087, "step": 6905 }, { "epoch": 3.1419472247497726, "grad_norm": 1.2788216799572278, "learning_rate": 1.5194091980105277e-06, "loss": 0.0373, "step": 6906 }, { "epoch": 3.1424021838034575, "grad_norm": 1.0649489581256397, "learning_rate": 1.5187518587610123e-06, "loss": 0.0229, "step": 6907 }, { "epoch": 3.142857142857143, "grad_norm": 1.2117682392425317, "learning_rate": 1.5180945996946643e-06, "loss": 0.0138, "step": 6908 }, { "epoch": 3.143312101910828, "grad_norm": 1.0025371842682131, "learning_rate": 1.5174374208651913e-06, "loss": 0.0137, "step": 6909 }, { "epoch": 3.143767060964513, "grad_norm": 1.1484144097427569, "learning_rate": 1.516780322326295e-06, "loss": 0.0436, "step": 6910 }, { "epoch": 3.1442220200181983, "grad_norm": 1.0723583278394966, "learning_rate": 1.5161233041316702e-06, "loss": 0.0205, "step": 6911 }, { "epoch": 3.1446769790718836, "grad_norm": 0.7711109063437087, "learning_rate": 1.5154663663350055e-06, "loss": 0.0106, "step": 6912 }, { "epoch": 3.145131938125569, "grad_norm": 1.2957481765683991, "learning_rate": 1.5148095089899844e-06, "loss": 0.037, "step": 6913 }, { "epoch": 3.1455868971792538, "grad_norm": 1.8296129785989006, "learning_rate": 1.5141527321502803e-06, "loss": 0.0088, "step": 6914 }, { "epoch": 3.146041856232939, "grad_norm": 0.8902441762761633, "learning_rate": 1.5134960358695635e-06, "loss": 0.0135, "step": 6915 }, { "epoch": 3.1464968152866244, "grad_norm": 0.9127412051750272, "learning_rate": 1.5128394202014952e-06, "loss": 0.0112, "step": 6916 }, { "epoch": 3.1469517743403093, "grad_norm": 1.0155213840616524, "learning_rate": 1.512182885199732e-06, "loss": 0.0238, "step": 6917 }, { "epoch": 3.1474067333939946, "grad_norm": 0.5836419969418394, "learning_rate": 1.5115264309179218e-06, "loss": 0.0057, "step": 6918 }, { "epoch": 3.14786169244768, "grad_norm": 1.270011346944163, "learning_rate": 1.5108700574097074e-06, "loss": 0.0229, "step": 6919 }, { "epoch": 3.1483166515013647, "grad_norm": 1.6104586457112533, "learning_rate": 1.5102137647287263e-06, "loss": 0.0146, "step": 6920 }, { "epoch": 3.14877161055505, "grad_norm": 0.8330085306429167, "learning_rate": 1.5095575529286055e-06, "loss": 0.0146, "step": 6921 }, { "epoch": 3.1492265696087354, "grad_norm": 0.8145571024815246, "learning_rate": 1.5089014220629694e-06, "loss": 0.0148, "step": 6922 }, { "epoch": 3.1496815286624202, "grad_norm": 0.7287933155817448, "learning_rate": 1.508245372185433e-06, "loss": 0.018, "step": 6923 }, { "epoch": 3.1501364877161055, "grad_norm": 0.9571683716094657, "learning_rate": 1.5075894033496063e-06, "loss": 0.0055, "step": 6924 }, { "epoch": 3.150591446769791, "grad_norm": 1.3232196344760694, "learning_rate": 1.5069335156090915e-06, "loss": 0.0155, "step": 6925 }, { "epoch": 3.1510464058234757, "grad_norm": 0.9078313750374666, "learning_rate": 1.5062777090174847e-06, "loss": 0.0065, "step": 6926 }, { "epoch": 3.151501364877161, "grad_norm": 1.4086141405777237, "learning_rate": 1.5056219836283763e-06, "loss": 0.0277, "step": 6927 }, { "epoch": 3.1519563239308463, "grad_norm": 1.4299390657983793, "learning_rate": 1.504966339495349e-06, "loss": 0.0364, "step": 6928 }, { "epoch": 3.152411282984531, "grad_norm": 0.986165181550568, "learning_rate": 1.5043107766719795e-06, "loss": 0.0127, "step": 6929 }, { "epoch": 3.1528662420382165, "grad_norm": 0.763006107807447, "learning_rate": 1.503655295211836e-06, "loss": 0.0126, "step": 6930 }, { "epoch": 3.153321201091902, "grad_norm": 0.7025420544054664, "learning_rate": 1.5029998951684829e-06, "loss": 0.0038, "step": 6931 }, { "epoch": 3.1537761601455867, "grad_norm": 1.125445067384293, "learning_rate": 1.502344576595476e-06, "loss": 0.0296, "step": 6932 }, { "epoch": 3.154231119199272, "grad_norm": 1.0462568734429938, "learning_rate": 1.5016893395463633e-06, "loss": 0.0156, "step": 6933 }, { "epoch": 3.1546860782529573, "grad_norm": 1.0482224327959062, "learning_rate": 1.5010341840746912e-06, "loss": 0.0172, "step": 6934 }, { "epoch": 3.1551410373066426, "grad_norm": 1.4470870164873089, "learning_rate": 1.500379110233994e-06, "loss": 0.0329, "step": 6935 }, { "epoch": 3.1555959963603275, "grad_norm": 0.7326934048690441, "learning_rate": 1.4997241180778013e-06, "loss": 0.0066, "step": 6936 }, { "epoch": 3.156050955414013, "grad_norm": 1.0113510893466038, "learning_rate": 1.4990692076596368e-06, "loss": 0.0218, "step": 6937 }, { "epoch": 3.156505914467698, "grad_norm": 0.9526619393467849, "learning_rate": 1.4984143790330164e-06, "loss": 0.0146, "step": 6938 }, { "epoch": 3.156960873521383, "grad_norm": 0.860096984793379, "learning_rate": 1.4977596322514498e-06, "loss": 0.0219, "step": 6939 }, { "epoch": 3.1574158325750683, "grad_norm": 0.9615759996900262, "learning_rate": 1.4971049673684396e-06, "loss": 0.0146, "step": 6940 }, { "epoch": 3.1578707916287536, "grad_norm": 1.4017538628364217, "learning_rate": 1.4964503844374824e-06, "loss": 0.0345, "step": 6941 }, { "epoch": 3.1583257506824385, "grad_norm": 0.5615033454549452, "learning_rate": 1.4957958835120684e-06, "loss": 0.0034, "step": 6942 }, { "epoch": 3.158780709736124, "grad_norm": 1.080169440024345, "learning_rate": 1.4951414646456794e-06, "loss": 0.0188, "step": 6943 }, { "epoch": 3.159235668789809, "grad_norm": 1.407875319208736, "learning_rate": 1.4944871278917928e-06, "loss": 0.013, "step": 6944 }, { "epoch": 3.159690627843494, "grad_norm": 1.242728051905288, "learning_rate": 1.4938328733038762e-06, "loss": 0.0284, "step": 6945 }, { "epoch": 3.1601455868971793, "grad_norm": 1.1728299065231482, "learning_rate": 1.4931787009353943e-06, "loss": 0.0223, "step": 6946 }, { "epoch": 3.1606005459508646, "grad_norm": 1.5614313987162112, "learning_rate": 1.4925246108398008e-06, "loss": 0.0219, "step": 6947 }, { "epoch": 3.1610555050045495, "grad_norm": 1.2460160837485417, "learning_rate": 1.491870603070547e-06, "loss": 0.036, "step": 6948 }, { "epoch": 3.1615104640582348, "grad_norm": 1.862040836447372, "learning_rate": 1.4912166776810757e-06, "loss": 0.0153, "step": 6949 }, { "epoch": 3.16196542311192, "grad_norm": 1.7493944668254269, "learning_rate": 1.4905628347248214e-06, "loss": 0.0133, "step": 6950 }, { "epoch": 3.162420382165605, "grad_norm": 1.4041780303473708, "learning_rate": 1.4899090742552136e-06, "loss": 0.0263, "step": 6951 }, { "epoch": 3.1628753412192903, "grad_norm": 0.5813409962823143, "learning_rate": 1.4892553963256745e-06, "loss": 0.0153, "step": 6952 }, { "epoch": 3.1633303002729756, "grad_norm": 1.6494911274416078, "learning_rate": 1.4886018009896208e-06, "loss": 0.0172, "step": 6953 }, { "epoch": 3.1637852593266604, "grad_norm": 1.242148673225626, "learning_rate": 1.4879482883004593e-06, "loss": 0.036, "step": 6954 }, { "epoch": 3.1642402183803457, "grad_norm": 0.914731598201084, "learning_rate": 1.4872948583115935e-06, "loss": 0.0214, "step": 6955 }, { "epoch": 3.164695177434031, "grad_norm": 0.687616459622606, "learning_rate": 1.4866415110764193e-06, "loss": 0.005, "step": 6956 }, { "epoch": 3.165150136487716, "grad_norm": 0.9677756162007703, "learning_rate": 1.4859882466483239e-06, "loss": 0.0166, "step": 6957 }, { "epoch": 3.1656050955414012, "grad_norm": 1.489101655897015, "learning_rate": 1.4853350650806903e-06, "loss": 0.0243, "step": 6958 }, { "epoch": 3.1660600545950865, "grad_norm": 1.3965961487237, "learning_rate": 1.4846819664268925e-06, "loss": 0.0188, "step": 6959 }, { "epoch": 3.1665150136487714, "grad_norm": 1.007612995555781, "learning_rate": 1.4840289507402995e-06, "loss": 0.0295, "step": 6960 }, { "epoch": 3.1669699727024567, "grad_norm": 0.7416781342982406, "learning_rate": 1.4833760180742718e-06, "loss": 0.0356, "step": 6961 }, { "epoch": 3.167424931756142, "grad_norm": 1.576722174966557, "learning_rate": 1.4827231684821652e-06, "loss": 0.0545, "step": 6962 }, { "epoch": 3.167879890809827, "grad_norm": 0.7178547289130046, "learning_rate": 1.4820704020173281e-06, "loss": 0.0196, "step": 6963 }, { "epoch": 3.168334849863512, "grad_norm": 1.65719577866767, "learning_rate": 1.4814177187331003e-06, "loss": 0.0137, "step": 6964 }, { "epoch": 3.1687898089171975, "grad_norm": 1.2609006134162093, "learning_rate": 1.480765118682817e-06, "loss": 0.0233, "step": 6965 }, { "epoch": 3.1692447679708824, "grad_norm": 0.9289295023823317, "learning_rate": 1.4801126019198048e-06, "loss": 0.0127, "step": 6966 }, { "epoch": 3.1696997270245677, "grad_norm": 0.649737079200339, "learning_rate": 1.479460168497386e-06, "loss": 0.019, "step": 6967 }, { "epoch": 3.170154686078253, "grad_norm": 1.1233089728729342, "learning_rate": 1.478807818468872e-06, "loss": 0.0234, "step": 6968 }, { "epoch": 3.1706096451319383, "grad_norm": 0.7514283421866631, "learning_rate": 1.4781555518875718e-06, "loss": 0.0042, "step": 6969 }, { "epoch": 3.171064604185623, "grad_norm": 1.070645121364496, "learning_rate": 1.4775033688067862e-06, "loss": 0.0192, "step": 6970 }, { "epoch": 3.1715195632393085, "grad_norm": 1.0391625226016254, "learning_rate": 1.4768512692798075e-06, "loss": 0.0329, "step": 6971 }, { "epoch": 3.171974522292994, "grad_norm": 0.6994377876146016, "learning_rate": 1.476199253359922e-06, "loss": 0.0132, "step": 6972 }, { "epoch": 3.1724294813466787, "grad_norm": 1.324964823369201, "learning_rate": 1.4755473211004106e-06, "loss": 0.0202, "step": 6973 }, { "epoch": 3.172884440400364, "grad_norm": 1.1789563154308742, "learning_rate": 1.4748954725545456e-06, "loss": 0.0097, "step": 6974 }, { "epoch": 3.1733393994540493, "grad_norm": 0.9219130329184093, "learning_rate": 1.4742437077755925e-06, "loss": 0.0186, "step": 6975 }, { "epoch": 3.173794358507734, "grad_norm": 1.013926027405009, "learning_rate": 1.4735920268168126e-06, "loss": 0.0085, "step": 6976 }, { "epoch": 3.1742493175614195, "grad_norm": 1.190576583235572, "learning_rate": 1.4729404297314559e-06, "loss": 0.0403, "step": 6977 }, { "epoch": 3.174704276615105, "grad_norm": 0.9078057134591003, "learning_rate": 1.47228891657277e-06, "loss": 0.0105, "step": 6978 }, { "epoch": 3.1751592356687897, "grad_norm": 1.3242801764757277, "learning_rate": 1.4716374873939922e-06, "loss": 0.0122, "step": 6979 }, { "epoch": 3.175614194722475, "grad_norm": 1.1264678592878412, "learning_rate": 1.4709861422483557e-06, "loss": 0.0151, "step": 6980 }, { "epoch": 3.1760691537761603, "grad_norm": 0.713464046193387, "learning_rate": 1.470334881189084e-06, "loss": 0.0202, "step": 6981 }, { "epoch": 3.176524112829845, "grad_norm": 1.1684298921680405, "learning_rate": 1.469683704269395e-06, "loss": 0.0153, "step": 6982 }, { "epoch": 3.1769790718835305, "grad_norm": 0.9589310447486246, "learning_rate": 1.4690326115425018e-06, "loss": 0.0083, "step": 6983 }, { "epoch": 3.1774340309372158, "grad_norm": 0.8326952687540903, "learning_rate": 1.4683816030616077e-06, "loss": 0.0153, "step": 6984 }, { "epoch": 3.1778889899909006, "grad_norm": 0.9373839723439515, "learning_rate": 1.4677306788799106e-06, "loss": 0.0053, "step": 6985 }, { "epoch": 3.178343949044586, "grad_norm": 1.0804864158441274, "learning_rate": 1.4670798390506002e-06, "loss": 0.0086, "step": 6986 }, { "epoch": 3.1787989080982713, "grad_norm": 0.9561331344284631, "learning_rate": 1.4664290836268613e-06, "loss": 0.0175, "step": 6987 }, { "epoch": 3.179253867151956, "grad_norm": 1.0107474103223144, "learning_rate": 1.4657784126618697e-06, "loss": 0.0147, "step": 6988 }, { "epoch": 3.1797088262056414, "grad_norm": 1.0869347909480989, "learning_rate": 1.4651278262087954e-06, "loss": 0.0115, "step": 6989 }, { "epoch": 3.1801637852593267, "grad_norm": 1.3232894759566396, "learning_rate": 1.4644773243208021e-06, "loss": 0.0197, "step": 6990 }, { "epoch": 3.180618744313012, "grad_norm": 0.8849179652829284, "learning_rate": 1.4638269070510453e-06, "loss": 0.0264, "step": 6991 }, { "epoch": 3.181073703366697, "grad_norm": 1.2098044195573474, "learning_rate": 1.463176574452675e-06, "loss": 0.0362, "step": 6992 }, { "epoch": 3.1815286624203822, "grad_norm": 1.1928352927640806, "learning_rate": 1.462526326578832e-06, "loss": 0.0353, "step": 6993 }, { "epoch": 3.1819836214740675, "grad_norm": 1.9975917937357324, "learning_rate": 1.461876163482653e-06, "loss": 0.0569, "step": 6994 }, { "epoch": 3.1824385805277524, "grad_norm": 0.9929518897398405, "learning_rate": 1.4612260852172656e-06, "loss": 0.0178, "step": 6995 }, { "epoch": 3.1828935395814377, "grad_norm": 1.422519126884096, "learning_rate": 1.4605760918357903e-06, "loss": 0.025, "step": 6996 }, { "epoch": 3.183348498635123, "grad_norm": 0.8343370115448746, "learning_rate": 1.4599261833913443e-06, "loss": 0.0221, "step": 6997 }, { "epoch": 3.183803457688808, "grad_norm": 0.6825389778160977, "learning_rate": 1.4592763599370336e-06, "loss": 0.0207, "step": 6998 }, { "epoch": 3.184258416742493, "grad_norm": 1.2686468136133213, "learning_rate": 1.4586266215259575e-06, "loss": 0.0361, "step": 6999 }, { "epoch": 3.1847133757961785, "grad_norm": 0.5608038314549577, "learning_rate": 1.4579769682112127e-06, "loss": 0.0095, "step": 7000 }, { "epoch": 3.1851683348498634, "grad_norm": 0.8617615435051782, "learning_rate": 1.457327400045884e-06, "loss": 0.0195, "step": 7001 }, { "epoch": 3.1856232939035487, "grad_norm": 0.98969169937357, "learning_rate": 1.4566779170830514e-06, "loss": 0.0201, "step": 7002 }, { "epoch": 3.186078252957234, "grad_norm": 0.853709224593858, "learning_rate": 1.456028519375787e-06, "loss": 0.0062, "step": 7003 }, { "epoch": 3.186533212010919, "grad_norm": 1.416607523329389, "learning_rate": 1.4553792069771574e-06, "loss": 0.0279, "step": 7004 }, { "epoch": 3.186988171064604, "grad_norm": 1.2318160387578847, "learning_rate": 1.4547299799402225e-06, "loss": 0.0154, "step": 7005 }, { "epoch": 3.1874431301182895, "grad_norm": 0.8229211817589824, "learning_rate": 1.4540808383180333e-06, "loss": 0.0111, "step": 7006 }, { "epoch": 3.1878980891719744, "grad_norm": 1.0617302792397103, "learning_rate": 1.4534317821636345e-06, "loss": 0.0109, "step": 7007 }, { "epoch": 3.1883530482256597, "grad_norm": 1.19685078706583, "learning_rate": 1.4527828115300646e-06, "loss": 0.0079, "step": 7008 }, { "epoch": 3.188808007279345, "grad_norm": 1.2183419690048582, "learning_rate": 1.4521339264703526e-06, "loss": 0.0208, "step": 7009 }, { "epoch": 3.18926296633303, "grad_norm": 1.008899045059162, "learning_rate": 1.4514851270375246e-06, "loss": 0.0063, "step": 7010 }, { "epoch": 3.189717925386715, "grad_norm": 1.353215357609719, "learning_rate": 1.4508364132845976e-06, "loss": 0.057, "step": 7011 }, { "epoch": 3.1901728844404005, "grad_norm": 1.416109273448669, "learning_rate": 1.450187785264581e-06, "loss": 0.0291, "step": 7012 }, { "epoch": 3.1906278434940853, "grad_norm": 1.114001712310235, "learning_rate": 1.4495392430304777e-06, "loss": 0.0136, "step": 7013 }, { "epoch": 3.1910828025477707, "grad_norm": 0.7326310680792785, "learning_rate": 1.4488907866352826e-06, "loss": 0.0125, "step": 7014 }, { "epoch": 3.191537761601456, "grad_norm": 0.6441599129712274, "learning_rate": 1.4482424161319865e-06, "loss": 0.0079, "step": 7015 }, { "epoch": 3.191992720655141, "grad_norm": 0.559343130245902, "learning_rate": 1.4475941315735706e-06, "loss": 0.0057, "step": 7016 }, { "epoch": 3.192447679708826, "grad_norm": 1.0729410865684348, "learning_rate": 1.4469459330130087e-06, "loss": 0.0208, "step": 7017 }, { "epoch": 3.1929026387625115, "grad_norm": 1.0749121990272135, "learning_rate": 1.4462978205032707e-06, "loss": 0.0215, "step": 7018 }, { "epoch": 3.1933575978161963, "grad_norm": 1.5309168313404518, "learning_rate": 1.4456497940973152e-06, "loss": 0.0413, "step": 7019 }, { "epoch": 3.1938125568698816, "grad_norm": 1.166139021897145, "learning_rate": 1.445001853848098e-06, "loss": 0.014, "step": 7020 }, { "epoch": 3.194267515923567, "grad_norm": 0.7459424407582054, "learning_rate": 1.444353999808565e-06, "loss": 0.0063, "step": 7021 }, { "epoch": 3.194722474977252, "grad_norm": 0.9381817538873644, "learning_rate": 1.4437062320316557e-06, "loss": 0.0204, "step": 7022 }, { "epoch": 3.195177434030937, "grad_norm": 1.174616102384314, "learning_rate": 1.4430585505703026e-06, "loss": 0.0119, "step": 7023 }, { "epoch": 3.1956323930846224, "grad_norm": 1.1513203959393878, "learning_rate": 1.4424109554774312e-06, "loss": 0.0285, "step": 7024 }, { "epoch": 3.1960873521383077, "grad_norm": 2.310512947486867, "learning_rate": 1.4417634468059617e-06, "loss": 0.0327, "step": 7025 }, { "epoch": 3.1965423111919926, "grad_norm": 0.8838146989485933, "learning_rate": 1.441116024608804e-06, "loss": 0.0115, "step": 7026 }, { "epoch": 3.196997270245678, "grad_norm": 0.6952453786816865, "learning_rate": 1.4404686889388631e-06, "loss": 0.0069, "step": 7027 }, { "epoch": 3.1974522292993632, "grad_norm": 0.3197560115251115, "learning_rate": 1.439821439849035e-06, "loss": 0.002, "step": 7028 }, { "epoch": 3.197907188353048, "grad_norm": 0.8241759386221584, "learning_rate": 1.4391742773922124e-06, "loss": 0.0095, "step": 7029 }, { "epoch": 3.1983621474067334, "grad_norm": 0.7959986157139589, "learning_rate": 1.438527201621277e-06, "loss": 0.0078, "step": 7030 }, { "epoch": 3.1988171064604187, "grad_norm": 1.5084093982692153, "learning_rate": 1.4378802125891038e-06, "loss": 0.011, "step": 7031 }, { "epoch": 3.1992720655141036, "grad_norm": 1.5319692332564498, "learning_rate": 1.4372333103485648e-06, "loss": 0.0088, "step": 7032 }, { "epoch": 3.199727024567789, "grad_norm": 1.335152099576636, "learning_rate": 1.4365864949525187e-06, "loss": 0.015, "step": 7033 }, { "epoch": 3.200181983621474, "grad_norm": 1.0808919132831154, "learning_rate": 1.4359397664538232e-06, "loss": 0.0042, "step": 7034 }, { "epoch": 3.200636942675159, "grad_norm": 1.3212609674520628, "learning_rate": 1.4352931249053248e-06, "loss": 0.0415, "step": 7035 }, { "epoch": 3.2010919017288444, "grad_norm": 1.366177019579045, "learning_rate": 1.4346465703598638e-06, "loss": 0.0238, "step": 7036 }, { "epoch": 3.2015468607825297, "grad_norm": 1.2017587127230291, "learning_rate": 1.4340001028702733e-06, "loss": 0.0132, "step": 7037 }, { "epoch": 3.2020018198362146, "grad_norm": 1.012313396256444, "learning_rate": 1.43335372248938e-06, "loss": 0.0108, "step": 7038 }, { "epoch": 3.2024567788899, "grad_norm": 0.5643147510546298, "learning_rate": 1.432707429270005e-06, "loss": 0.014, "step": 7039 }, { "epoch": 3.202911737943585, "grad_norm": 0.8257202103950368, "learning_rate": 1.432061223264959e-06, "loss": 0.0138, "step": 7040 }, { "epoch": 3.20336669699727, "grad_norm": 0.9714954779888508, "learning_rate": 1.4314151045270469e-06, "loss": 0.0113, "step": 7041 }, { "epoch": 3.2038216560509554, "grad_norm": 1.2205047371073638, "learning_rate": 1.4307690731090666e-06, "loss": 0.0257, "step": 7042 }, { "epoch": 3.2042766151046407, "grad_norm": 1.066369054018426, "learning_rate": 1.4301231290638083e-06, "loss": 0.0038, "step": 7043 }, { "epoch": 3.2047315741583255, "grad_norm": 1.1809639275692836, "learning_rate": 1.429477272444057e-06, "loss": 0.0156, "step": 7044 }, { "epoch": 3.205186533212011, "grad_norm": 0.7824412822602509, "learning_rate": 1.428831503302588e-06, "loss": 0.0058, "step": 7045 }, { "epoch": 3.205641492265696, "grad_norm": 1.3665031623958837, "learning_rate": 1.4281858216921719e-06, "loss": 0.0197, "step": 7046 }, { "epoch": 3.2060964513193815, "grad_norm": 0.990403143195734, "learning_rate": 1.4275402276655703e-06, "loss": 0.0292, "step": 7047 }, { "epoch": 3.2065514103730663, "grad_norm": 1.17006287638636, "learning_rate": 1.4268947212755371e-06, "loss": 0.0188, "step": 7048 }, { "epoch": 3.2070063694267517, "grad_norm": 1.230715579647923, "learning_rate": 1.4262493025748219e-06, "loss": 0.0211, "step": 7049 }, { "epoch": 3.207461328480437, "grad_norm": 1.2046401068665114, "learning_rate": 1.425603971616165e-06, "loss": 0.0187, "step": 7050 }, { "epoch": 3.207916287534122, "grad_norm": 0.7391482500466002, "learning_rate": 1.4249587284522998e-06, "loss": 0.0077, "step": 7051 }, { "epoch": 3.208371246587807, "grad_norm": 0.6638116542963773, "learning_rate": 1.4243135731359512e-06, "loss": 0.0079, "step": 7052 }, { "epoch": 3.2088262056414925, "grad_norm": 1.2870717254716537, "learning_rate": 1.4236685057198395e-06, "loss": 0.0063, "step": 7053 }, { "epoch": 3.2092811646951773, "grad_norm": 1.0135024117651361, "learning_rate": 1.4230235262566783e-06, "loss": 0.0098, "step": 7054 }, { "epoch": 3.2097361237488626, "grad_norm": 1.3032273874280678, "learning_rate": 1.4223786347991706e-06, "loss": 0.0262, "step": 7055 }, { "epoch": 3.210191082802548, "grad_norm": 0.8816257813429638, "learning_rate": 1.4217338314000146e-06, "loss": 0.0187, "step": 7056 }, { "epoch": 3.210646041856233, "grad_norm": 0.6498476994740212, "learning_rate": 1.4210891161118991e-06, "loss": 0.0082, "step": 7057 }, { "epoch": 3.211101000909918, "grad_norm": 1.4599540382751501, "learning_rate": 1.4204444889875102e-06, "loss": 0.0177, "step": 7058 }, { "epoch": 3.2115559599636034, "grad_norm": 1.0697796666991644, "learning_rate": 1.419799950079521e-06, "loss": 0.0133, "step": 7059 }, { "epoch": 3.2120109190172883, "grad_norm": 0.8903713684010225, "learning_rate": 1.419155499440603e-06, "loss": 0.0167, "step": 7060 }, { "epoch": 3.2124658780709736, "grad_norm": 2.1590554033513647, "learning_rate": 1.4185111371234162e-06, "loss": 0.0076, "step": 7061 }, { "epoch": 3.212920837124659, "grad_norm": 0.9152028724275051, "learning_rate": 1.4178668631806147e-06, "loss": 0.0175, "step": 7062 }, { "epoch": 3.213375796178344, "grad_norm": 1.2049417827182807, "learning_rate": 1.4172226776648471e-06, "loss": 0.0084, "step": 7063 }, { "epoch": 3.213830755232029, "grad_norm": 0.8243238500098167, "learning_rate": 1.4165785806287525e-06, "loss": 0.0132, "step": 7064 }, { "epoch": 3.2142857142857144, "grad_norm": 1.1899613986519615, "learning_rate": 1.4159345721249637e-06, "loss": 0.0126, "step": 7065 }, { "epoch": 3.2147406733393993, "grad_norm": 0.6202623834204618, "learning_rate": 1.415290652206105e-06, "loss": 0.004, "step": 7066 }, { "epoch": 3.2151956323930846, "grad_norm": 1.3409950131956463, "learning_rate": 1.4146468209247956e-06, "loss": 0.0697, "step": 7067 }, { "epoch": 3.21565059144677, "grad_norm": 1.1098556219088944, "learning_rate": 1.4140030783336478e-06, "loss": 0.0149, "step": 7068 }, { "epoch": 3.2161055505004548, "grad_norm": 0.6977138597999201, "learning_rate": 1.4133594244852638e-06, "loss": 0.0075, "step": 7069 }, { "epoch": 3.21656050955414, "grad_norm": 1.0107281280835927, "learning_rate": 1.412715859432241e-06, "loss": 0.0098, "step": 7070 }, { "epoch": 3.2170154686078254, "grad_norm": 1.1127697908701237, "learning_rate": 1.4120723832271665e-06, "loss": 0.0284, "step": 7071 }, { "epoch": 3.2174704276615103, "grad_norm": 1.2489734090665319, "learning_rate": 1.411428995922625e-06, "loss": 0.0111, "step": 7072 }, { "epoch": 3.2179253867151956, "grad_norm": 0.693499982803166, "learning_rate": 1.4107856975711886e-06, "loss": 0.0092, "step": 7073 }, { "epoch": 3.218380345768881, "grad_norm": 0.8948299933198073, "learning_rate": 1.4101424882254277e-06, "loss": 0.0189, "step": 7074 }, { "epoch": 3.2188353048225657, "grad_norm": 0.6306085788616652, "learning_rate": 1.4094993679379009e-06, "loss": 0.0224, "step": 7075 }, { "epoch": 3.219290263876251, "grad_norm": 0.9473488818181557, "learning_rate": 1.4088563367611597e-06, "loss": 0.0239, "step": 7076 }, { "epoch": 3.2197452229299364, "grad_norm": 0.5623350821712613, "learning_rate": 1.4082133947477522e-06, "loss": 0.0272, "step": 7077 }, { "epoch": 3.2202001819836217, "grad_norm": 1.7250257867242453, "learning_rate": 1.4075705419502162e-06, "loss": 0.0205, "step": 7078 }, { "epoch": 3.2206551410373065, "grad_norm": 1.7220652304594493, "learning_rate": 1.4069277784210813e-06, "loss": 0.0129, "step": 7079 }, { "epoch": 3.221110100090992, "grad_norm": 0.6018513644549814, "learning_rate": 1.4062851042128716e-06, "loss": 0.0107, "step": 7080 }, { "epoch": 3.221565059144677, "grad_norm": 0.7505432648583616, "learning_rate": 1.4056425193781048e-06, "loss": 0.0154, "step": 7081 }, { "epoch": 3.222020018198362, "grad_norm": 1.1019261143210253, "learning_rate": 1.4050000239692885e-06, "loss": 0.0142, "step": 7082 }, { "epoch": 3.2224749772520473, "grad_norm": 1.1402570202882885, "learning_rate": 1.4043576180389257e-06, "loss": 0.0126, "step": 7083 }, { "epoch": 3.2229299363057327, "grad_norm": 1.5274024158573536, "learning_rate": 1.403715301639511e-06, "loss": 0.0214, "step": 7084 }, { "epoch": 3.2233848953594175, "grad_norm": 1.198053095955022, "learning_rate": 1.403073074823531e-06, "loss": 0.0139, "step": 7085 }, { "epoch": 3.223839854413103, "grad_norm": 1.2380808770742653, "learning_rate": 1.4024309376434645e-06, "loss": 0.0143, "step": 7086 }, { "epoch": 3.224294813466788, "grad_norm": 1.1117970740844039, "learning_rate": 1.4017888901517851e-06, "loss": 0.0245, "step": 7087 }, { "epoch": 3.224749772520473, "grad_norm": 1.0881322056512301, "learning_rate": 1.4011469324009594e-06, "loss": 0.01, "step": 7088 }, { "epoch": 3.2252047315741583, "grad_norm": 0.4887983238480007, "learning_rate": 1.400505064443444e-06, "loss": 0.0045, "step": 7089 }, { "epoch": 3.2256596906278436, "grad_norm": 1.069572004304911, "learning_rate": 1.3998632863316892e-06, "loss": 0.0359, "step": 7090 }, { "epoch": 3.2261146496815285, "grad_norm": 0.4086165574303455, "learning_rate": 1.3992215981181379e-06, "loss": 0.0022, "step": 7091 }, { "epoch": 3.226569608735214, "grad_norm": 0.6565652865080412, "learning_rate": 1.398579999855227e-06, "loss": 0.0064, "step": 7092 }, { "epoch": 3.227024567788899, "grad_norm": 0.4344379765571783, "learning_rate": 1.3979384915953847e-06, "loss": 0.0031, "step": 7093 }, { "epoch": 3.227479526842584, "grad_norm": 1.056724278619332, "learning_rate": 1.3972970733910313e-06, "loss": 0.0098, "step": 7094 }, { "epoch": 3.2279344858962693, "grad_norm": 0.8239837332359908, "learning_rate": 1.396655745294582e-06, "loss": 0.0147, "step": 7095 }, { "epoch": 3.2283894449499546, "grad_norm": 1.335059913130574, "learning_rate": 1.3960145073584415e-06, "loss": 0.0209, "step": 7096 }, { "epoch": 3.2288444040036395, "grad_norm": 1.4086173858783717, "learning_rate": 1.3953733596350111e-06, "loss": 0.018, "step": 7097 }, { "epoch": 3.229299363057325, "grad_norm": 1.1048801045546064, "learning_rate": 1.3947323021766812e-06, "loss": 0.0112, "step": 7098 }, { "epoch": 3.22975432211101, "grad_norm": 0.8578932927739985, "learning_rate": 1.3940913350358362e-06, "loss": 0.0061, "step": 7099 }, { "epoch": 3.2302092811646954, "grad_norm": 1.481787633734573, "learning_rate": 1.3934504582648523e-06, "loss": 0.0188, "step": 7100 }, { "epoch": 3.2306642402183803, "grad_norm": 0.8464914728946613, "learning_rate": 1.3928096719160994e-06, "loss": 0.0054, "step": 7101 }, { "epoch": 3.2311191992720656, "grad_norm": 1.2131836677136416, "learning_rate": 1.3921689760419416e-06, "loss": 0.0241, "step": 7102 }, { "epoch": 3.231574158325751, "grad_norm": 2.13247004174433, "learning_rate": 1.391528370694732e-06, "loss": 0.0316, "step": 7103 }, { "epoch": 3.2320291173794358, "grad_norm": 0.979659829510678, "learning_rate": 1.3908878559268177e-06, "loss": 0.0119, "step": 7104 }, { "epoch": 3.232484076433121, "grad_norm": 0.7101613038409313, "learning_rate": 1.3902474317905384e-06, "loss": 0.0238, "step": 7105 }, { "epoch": 3.2329390354868064, "grad_norm": 1.3562395729632197, "learning_rate": 1.3896070983382284e-06, "loss": 0.0195, "step": 7106 }, { "epoch": 3.2333939945404913, "grad_norm": 0.9814773456735375, "learning_rate": 1.3889668556222119e-06, "loss": 0.0187, "step": 7107 }, { "epoch": 3.2338489535941766, "grad_norm": 0.6476284766630567, "learning_rate": 1.3883267036948056e-06, "loss": 0.013, "step": 7108 }, { "epoch": 3.234303912647862, "grad_norm": 1.0867345116037677, "learning_rate": 1.3876866426083214e-06, "loss": 0.0162, "step": 7109 }, { "epoch": 3.2347588717015467, "grad_norm": 0.5239737961880028, "learning_rate": 1.387046672415061e-06, "loss": 0.0033, "step": 7110 }, { "epoch": 3.235213830755232, "grad_norm": 0.9894038492050778, "learning_rate": 1.3864067931673214e-06, "loss": 0.0249, "step": 7111 }, { "epoch": 3.2356687898089174, "grad_norm": 0.8442169006609916, "learning_rate": 1.3857670049173897e-06, "loss": 0.0281, "step": 7112 }, { "epoch": 3.2361237488626022, "grad_norm": 1.0711772306359602, "learning_rate": 1.3851273077175465e-06, "loss": 0.025, "step": 7113 }, { "epoch": 3.2365787079162875, "grad_norm": 0.5731270847520525, "learning_rate": 1.384487701620065e-06, "loss": 0.0063, "step": 7114 }, { "epoch": 3.237033666969973, "grad_norm": 0.4468830623213079, "learning_rate": 1.38384818667721e-06, "loss": 0.0079, "step": 7115 }, { "epoch": 3.2374886260236577, "grad_norm": 0.41007051634410674, "learning_rate": 1.3832087629412406e-06, "loss": 0.0033, "step": 7116 }, { "epoch": 3.237943585077343, "grad_norm": 0.5045441569787975, "learning_rate": 1.3825694304644089e-06, "loss": 0.0068, "step": 7117 }, { "epoch": 3.2383985441310283, "grad_norm": 0.9917612273481944, "learning_rate": 1.3819301892989567e-06, "loss": 0.0205, "step": 7118 }, { "epoch": 3.238853503184713, "grad_norm": 1.23627619905563, "learning_rate": 1.3812910394971205e-06, "loss": 0.0218, "step": 7119 }, { "epoch": 3.2393084622383985, "grad_norm": 1.3216624357820799, "learning_rate": 1.3806519811111275e-06, "loss": 0.005, "step": 7120 }, { "epoch": 3.239763421292084, "grad_norm": 1.5348214395060766, "learning_rate": 1.3800130141932005e-06, "loss": 0.0316, "step": 7121 }, { "epoch": 3.2402183803457687, "grad_norm": 1.2264796406692187, "learning_rate": 1.3793741387955512e-06, "loss": 0.0542, "step": 7122 }, { "epoch": 3.240673339399454, "grad_norm": 0.9468355570359155, "learning_rate": 1.378735354970388e-06, "loss": 0.0232, "step": 7123 }, { "epoch": 3.2411282984531393, "grad_norm": 1.4079667267888258, "learning_rate": 1.3780966627699078e-06, "loss": 0.0265, "step": 7124 }, { "epoch": 3.241583257506824, "grad_norm": 0.964398175163656, "learning_rate": 1.3774580622463005e-06, "loss": 0.0109, "step": 7125 }, { "epoch": 3.2420382165605095, "grad_norm": 0.7338478463458145, "learning_rate": 1.3768195534517523e-06, "loss": 0.0093, "step": 7126 }, { "epoch": 3.242493175614195, "grad_norm": 0.6233212208863096, "learning_rate": 1.3761811364384378e-06, "loss": 0.0193, "step": 7127 }, { "epoch": 3.2429481346678797, "grad_norm": 0.6445944864937828, "learning_rate": 1.3755428112585257e-06, "loss": 0.0125, "step": 7128 }, { "epoch": 3.243403093721565, "grad_norm": 0.5956437125596798, "learning_rate": 1.3749045779641763e-06, "loss": 0.0111, "step": 7129 }, { "epoch": 3.2438580527752503, "grad_norm": 1.3333581147606823, "learning_rate": 1.3742664366075436e-06, "loss": 0.0143, "step": 7130 }, { "epoch": 3.244313011828935, "grad_norm": 1.2021633418878717, "learning_rate": 1.3736283872407753e-06, "loss": 0.0173, "step": 7131 }, { "epoch": 3.2447679708826205, "grad_norm": 0.8007339611996417, "learning_rate": 1.3729904299160083e-06, "loss": 0.0329, "step": 7132 }, { "epoch": 3.245222929936306, "grad_norm": 0.6590561537237312, "learning_rate": 1.3723525646853738e-06, "loss": 0.0051, "step": 7133 }, { "epoch": 3.245677888989991, "grad_norm": 0.542374396132093, "learning_rate": 1.3717147916009943e-06, "loss": 0.0089, "step": 7134 }, { "epoch": 3.246132848043676, "grad_norm": 1.0710685641710802, "learning_rate": 1.3710771107149878e-06, "loss": 0.0546, "step": 7135 }, { "epoch": 3.2465878070973613, "grad_norm": 1.167961646968647, "learning_rate": 1.3704395220794608e-06, "loss": 0.0113, "step": 7136 }, { "epoch": 3.2470427661510466, "grad_norm": 1.220608566418892, "learning_rate": 1.3698020257465158e-06, "loss": 0.0212, "step": 7137 }, { "epoch": 3.2474977252047315, "grad_norm": 1.3629979430338308, "learning_rate": 1.3691646217682454e-06, "loss": 0.0214, "step": 7138 }, { "epoch": 3.2479526842584168, "grad_norm": 0.8610483832281862, "learning_rate": 1.3685273101967345e-06, "loss": 0.0073, "step": 7139 }, { "epoch": 3.248407643312102, "grad_norm": 0.9234540671864837, "learning_rate": 1.3678900910840627e-06, "loss": 0.0057, "step": 7140 }, { "epoch": 3.248862602365787, "grad_norm": 0.7653840134378624, "learning_rate": 1.3672529644823004e-06, "loss": 0.0105, "step": 7141 }, { "epoch": 3.2493175614194723, "grad_norm": 0.6668136993397836, "learning_rate": 1.3666159304435104e-06, "loss": 0.0088, "step": 7142 }, { "epoch": 3.2497725204731576, "grad_norm": 0.28194864580987755, "learning_rate": 1.3659789890197471e-06, "loss": 0.0014, "step": 7143 }, { "epoch": 3.2502274795268424, "grad_norm": 1.385830189472334, "learning_rate": 1.3653421402630595e-06, "loss": 0.0146, "step": 7144 }, { "epoch": 3.2506824385805277, "grad_norm": 0.33886690816290266, "learning_rate": 1.3647053842254896e-06, "loss": 0.0023, "step": 7145 }, { "epoch": 3.251137397634213, "grad_norm": 1.21091034990347, "learning_rate": 1.3640687209590683e-06, "loss": 0.0341, "step": 7146 }, { "epoch": 3.251592356687898, "grad_norm": 1.2232777866853826, "learning_rate": 1.3634321505158216e-06, "loss": 0.0294, "step": 7147 }, { "epoch": 3.2520473157415832, "grad_norm": 1.2093172358801012, "learning_rate": 1.3627956729477664e-06, "loss": 0.0392, "step": 7148 }, { "epoch": 3.2525022747952685, "grad_norm": 1.620378026547906, "learning_rate": 1.3621592883069128e-06, "loss": 0.0272, "step": 7149 }, { "epoch": 3.2529572338489534, "grad_norm": 0.6993636037277924, "learning_rate": 1.3615229966452638e-06, "loss": 0.0248, "step": 7150 }, { "epoch": 3.2534121929026387, "grad_norm": 1.2497757493904433, "learning_rate": 1.3608867980148147e-06, "loss": 0.0075, "step": 7151 }, { "epoch": 3.253867151956324, "grad_norm": 0.42621249895893076, "learning_rate": 1.3602506924675524e-06, "loss": 0.0171, "step": 7152 }, { "epoch": 3.2543221110100093, "grad_norm": 0.9712841623096715, "learning_rate": 1.3596146800554567e-06, "loss": 0.0202, "step": 7153 }, { "epoch": 3.254777070063694, "grad_norm": 1.2011375447661319, "learning_rate": 1.358978760830498e-06, "loss": 0.0475, "step": 7154 }, { "epoch": 3.2552320291173795, "grad_norm": 0.8591304612796659, "learning_rate": 1.3583429348446433e-06, "loss": 0.0138, "step": 7155 }, { "epoch": 3.255686988171065, "grad_norm": 0.8969984657689867, "learning_rate": 1.3577072021498484e-06, "loss": 0.0116, "step": 7156 }, { "epoch": 3.2561419472247497, "grad_norm": 0.960027270842959, "learning_rate": 1.3570715627980614e-06, "loss": 0.01, "step": 7157 }, { "epoch": 3.256596906278435, "grad_norm": 0.9566371702858006, "learning_rate": 1.3564360168412262e-06, "loss": 0.0198, "step": 7158 }, { "epoch": 3.2570518653321203, "grad_norm": 0.6420507926210972, "learning_rate": 1.3558005643312739e-06, "loss": 0.0077, "step": 7159 }, { "epoch": 3.257506824385805, "grad_norm": 0.9099852997660002, "learning_rate": 1.3551652053201334e-06, "loss": 0.0093, "step": 7160 }, { "epoch": 3.2579617834394905, "grad_norm": 1.4108519780589643, "learning_rate": 1.3545299398597223e-06, "loss": 0.0359, "step": 7161 }, { "epoch": 3.258416742493176, "grad_norm": 0.8459210337729043, "learning_rate": 1.3538947680019515e-06, "loss": 0.012, "step": 7162 }, { "epoch": 3.2588717015468607, "grad_norm": 1.5135489500817372, "learning_rate": 1.3532596897987237e-06, "loss": 0.0111, "step": 7163 }, { "epoch": 3.259326660600546, "grad_norm": 0.7377964803381473, "learning_rate": 1.3526247053019354e-06, "loss": 0.0218, "step": 7164 }, { "epoch": 3.2597816196542313, "grad_norm": 0.6287431336020588, "learning_rate": 1.3519898145634758e-06, "loss": 0.0111, "step": 7165 }, { "epoch": 3.260236578707916, "grad_norm": 1.5316425515472047, "learning_rate": 1.3513550176352242e-06, "loss": 0.0122, "step": 7166 }, { "epoch": 3.2606915377616015, "grad_norm": 0.9208697143723562, "learning_rate": 1.3507203145690529e-06, "loss": 0.0088, "step": 7167 }, { "epoch": 3.261146496815287, "grad_norm": 1.5837914435106897, "learning_rate": 1.3500857054168267e-06, "loss": 0.0244, "step": 7168 }, { "epoch": 3.2616014558689717, "grad_norm": 1.339747412446492, "learning_rate": 1.3494511902304047e-06, "loss": 0.0163, "step": 7169 }, { "epoch": 3.262056414922657, "grad_norm": 1.1245270608576041, "learning_rate": 1.3488167690616355e-06, "loss": 0.0089, "step": 7170 }, { "epoch": 3.2625113739763423, "grad_norm": 1.635665188556439, "learning_rate": 1.3481824419623605e-06, "loss": 0.0278, "step": 7171 }, { "epoch": 3.262966333030027, "grad_norm": 1.3951727879053535, "learning_rate": 1.3475482089844155e-06, "loss": 0.018, "step": 7172 }, { "epoch": 3.2634212920837125, "grad_norm": 0.9707503003486665, "learning_rate": 1.3469140701796254e-06, "loss": 0.0084, "step": 7173 }, { "epoch": 3.2638762511373978, "grad_norm": 1.1315420960048395, "learning_rate": 1.3462800255998116e-06, "loss": 0.009, "step": 7174 }, { "epoch": 3.2643312101910826, "grad_norm": 1.5361197419123502, "learning_rate": 1.3456460752967834e-06, "loss": 0.0327, "step": 7175 }, { "epoch": 3.264786169244768, "grad_norm": 0.9399834236781344, "learning_rate": 1.3450122193223452e-06, "loss": 0.0084, "step": 7176 }, { "epoch": 3.2652411282984533, "grad_norm": 0.6136951412040854, "learning_rate": 1.3443784577282915e-06, "loss": 0.0047, "step": 7177 }, { "epoch": 3.265696087352138, "grad_norm": 1.0975768586489363, "learning_rate": 1.3437447905664114e-06, "loss": 0.0429, "step": 7178 }, { "epoch": 3.2661510464058234, "grad_norm": 1.0737156518663467, "learning_rate": 1.3431112178884868e-06, "loss": 0.0178, "step": 7179 }, { "epoch": 3.2666060054595087, "grad_norm": 0.9590337795663121, "learning_rate": 1.3424777397462884e-06, "loss": 0.0108, "step": 7180 }, { "epoch": 3.2670609645131936, "grad_norm": 0.3637910801752254, "learning_rate": 1.3418443561915823e-06, "loss": 0.002, "step": 7181 }, { "epoch": 3.267515923566879, "grad_norm": 1.2680089971127235, "learning_rate": 1.3412110672761243e-06, "loss": 0.0106, "step": 7182 }, { "epoch": 3.2679708826205642, "grad_norm": 1.7557426075105178, "learning_rate": 1.3405778730516656e-06, "loss": 0.0145, "step": 7183 }, { "epoch": 3.268425841674249, "grad_norm": 1.166172500096561, "learning_rate": 1.3399447735699473e-06, "loss": 0.0133, "step": 7184 }, { "epoch": 3.2688808007279344, "grad_norm": 1.4302170181555272, "learning_rate": 1.339311768882702e-06, "loss": 0.0346, "step": 7185 }, { "epoch": 3.2693357597816197, "grad_norm": 1.030015861256858, "learning_rate": 1.3386788590416586e-06, "loss": 0.0142, "step": 7186 }, { "epoch": 3.2697907188353046, "grad_norm": 0.9808905199710511, "learning_rate": 1.3380460440985344e-06, "loss": 0.0093, "step": 7187 }, { "epoch": 3.27024567788899, "grad_norm": 0.8119454438933927, "learning_rate": 1.337413324105039e-06, "loss": 0.0113, "step": 7188 }, { "epoch": 3.270700636942675, "grad_norm": 1.003279601539296, "learning_rate": 1.3367806991128775e-06, "loss": 0.0508, "step": 7189 }, { "epoch": 3.2711555959963605, "grad_norm": 0.9391989937289991, "learning_rate": 1.3361481691737444e-06, "loss": 0.0102, "step": 7190 }, { "epoch": 3.2716105550500454, "grad_norm": 1.171361133808406, "learning_rate": 1.3355157343393272e-06, "loss": 0.0159, "step": 7191 }, { "epoch": 3.2720655141037307, "grad_norm": 0.8623671125355323, "learning_rate": 1.3348833946613039e-06, "loss": 0.0088, "step": 7192 }, { "epoch": 3.272520473157416, "grad_norm": 0.9828587309157427, "learning_rate": 1.3342511501913483e-06, "loss": 0.0152, "step": 7193 }, { "epoch": 3.272975432211101, "grad_norm": 0.7823991158595773, "learning_rate": 1.3336190009811252e-06, "loss": 0.0065, "step": 7194 }, { "epoch": 3.273430391264786, "grad_norm": 0.6680067389494546, "learning_rate": 1.3329869470822898e-06, "loss": 0.0048, "step": 7195 }, { "epoch": 3.2738853503184715, "grad_norm": 1.6131835680711342, "learning_rate": 1.3323549885464912e-06, "loss": 0.0213, "step": 7196 }, { "epoch": 3.2743403093721564, "grad_norm": 1.1193409691139753, "learning_rate": 1.3317231254253687e-06, "loss": 0.006, "step": 7197 }, { "epoch": 3.2747952684258417, "grad_norm": 1.0494485302451761, "learning_rate": 1.3310913577705575e-06, "loss": 0.0238, "step": 7198 }, { "epoch": 3.275250227479527, "grad_norm": 0.9658747368024386, "learning_rate": 1.330459685633681e-06, "loss": 0.0061, "step": 7199 }, { "epoch": 3.275705186533212, "grad_norm": 0.8854848738937331, "learning_rate": 1.3298281090663584e-06, "loss": 0.0161, "step": 7200 }, { "epoch": 3.276160145586897, "grad_norm": 0.8562960383554103, "learning_rate": 1.329196628120198e-06, "loss": 0.0159, "step": 7201 }, { "epoch": 3.2766151046405825, "grad_norm": 0.8619672763559862, "learning_rate": 1.328565242846801e-06, "loss": 0.0197, "step": 7202 }, { "epoch": 3.2770700636942673, "grad_norm": 1.1517327592076785, "learning_rate": 1.327933953297763e-06, "loss": 0.013, "step": 7203 }, { "epoch": 3.2775250227479527, "grad_norm": 0.8261769224035421, "learning_rate": 1.32730275952467e-06, "loss": 0.0065, "step": 7204 }, { "epoch": 3.277979981801638, "grad_norm": 0.7840862139895779, "learning_rate": 1.326671661579099e-06, "loss": 0.0101, "step": 7205 }, { "epoch": 3.278434940855323, "grad_norm": 1.1907999796673796, "learning_rate": 1.3260406595126202e-06, "loss": 0.0173, "step": 7206 }, { "epoch": 3.278889899909008, "grad_norm": 0.6854808280819521, "learning_rate": 1.3254097533767973e-06, "loss": 0.0061, "step": 7207 }, { "epoch": 3.2793448589626935, "grad_norm": 0.8603518084094921, "learning_rate": 1.324778943223186e-06, "loss": 0.0088, "step": 7208 }, { "epoch": 3.2797998180163788, "grad_norm": 0.7840319038362785, "learning_rate": 1.324148229103332e-06, "loss": 0.0074, "step": 7209 }, { "epoch": 3.2802547770700636, "grad_norm": 1.017013108080027, "learning_rate": 1.3235176110687748e-06, "loss": 0.0233, "step": 7210 }, { "epoch": 3.280709736123749, "grad_norm": 0.9824737348651515, "learning_rate": 1.3228870891710443e-06, "loss": 0.0109, "step": 7211 }, { "epoch": 3.2811646951774343, "grad_norm": 0.5092930245845696, "learning_rate": 1.3222566634616663e-06, "loss": 0.0045, "step": 7212 }, { "epoch": 3.281619654231119, "grad_norm": 0.859751528277341, "learning_rate": 1.3216263339921537e-06, "loss": 0.0087, "step": 7213 }, { "epoch": 3.2820746132848044, "grad_norm": 1.0954516278374302, "learning_rate": 1.320996100814017e-06, "loss": 0.0215, "step": 7214 }, { "epoch": 3.2825295723384897, "grad_norm": 0.8471948560612581, "learning_rate": 1.3203659639787544e-06, "loss": 0.011, "step": 7215 }, { "epoch": 3.2829845313921746, "grad_norm": 1.3666714891712912, "learning_rate": 1.319735923537857e-06, "loss": 0.0105, "step": 7216 }, { "epoch": 3.28343949044586, "grad_norm": 0.8916104504996843, "learning_rate": 1.3191059795428113e-06, "loss": 0.0172, "step": 7217 }, { "epoch": 3.2838944494995452, "grad_norm": 0.8267965411008622, "learning_rate": 1.3184761320450918e-06, "loss": 0.024, "step": 7218 }, { "epoch": 3.28434940855323, "grad_norm": 1.210571071820678, "learning_rate": 1.3178463810961672e-06, "loss": 0.0284, "step": 7219 }, { "epoch": 3.2848043676069154, "grad_norm": 1.2089542951006211, "learning_rate": 1.3172167267474966e-06, "loss": 0.0362, "step": 7220 }, { "epoch": 3.2852593266606007, "grad_norm": 0.7343464350846485, "learning_rate": 1.316587169050534e-06, "loss": 0.0089, "step": 7221 }, { "epoch": 3.2857142857142856, "grad_norm": 0.6448350036865802, "learning_rate": 1.3159577080567242e-06, "loss": 0.011, "step": 7222 }, { "epoch": 3.286169244767971, "grad_norm": 0.3776897288077131, "learning_rate": 1.3153283438175036e-06, "loss": 0.0034, "step": 7223 }, { "epoch": 3.286624203821656, "grad_norm": 0.7697768063724554, "learning_rate": 1.3146990763843009e-06, "loss": 0.0103, "step": 7224 }, { "epoch": 3.287079162875341, "grad_norm": 0.8865805025422063, "learning_rate": 1.3140699058085368e-06, "loss": 0.0271, "step": 7225 }, { "epoch": 3.2875341219290264, "grad_norm": 1.78450594069202, "learning_rate": 1.3134408321416236e-06, "loss": 0.0225, "step": 7226 }, { "epoch": 3.2879890809827117, "grad_norm": 0.7779735900679363, "learning_rate": 1.312811855434967e-06, "loss": 0.0043, "step": 7227 }, { "epoch": 3.2884440400363966, "grad_norm": 0.9889266024349289, "learning_rate": 1.312182975739965e-06, "loss": 0.0069, "step": 7228 }, { "epoch": 3.288898999090082, "grad_norm": 0.29672893303590747, "learning_rate": 1.3115541931080067e-06, "loss": 0.0025, "step": 7229 }, { "epoch": 3.289353958143767, "grad_norm": 1.089670666612609, "learning_rate": 1.3109255075904725e-06, "loss": 0.0194, "step": 7230 }, { "epoch": 3.289808917197452, "grad_norm": 1.8684346734562391, "learning_rate": 1.3102969192387349e-06, "loss": 0.0275, "step": 7231 }, { "epoch": 3.2902638762511374, "grad_norm": 0.7704360992234441, "learning_rate": 1.3096684281041613e-06, "loss": 0.0173, "step": 7232 }, { "epoch": 3.2907188353048227, "grad_norm": 0.9141548636355631, "learning_rate": 1.3090400342381084e-06, "loss": 0.0292, "step": 7233 }, { "epoch": 3.2911737943585075, "grad_norm": 0.6844354809421923, "learning_rate": 1.3084117376919249e-06, "loss": 0.0186, "step": 7234 }, { "epoch": 3.291628753412193, "grad_norm": 0.7821804356695097, "learning_rate": 1.3077835385169535e-06, "loss": 0.0137, "step": 7235 }, { "epoch": 3.292083712465878, "grad_norm": 0.617256110101671, "learning_rate": 1.3071554367645267e-06, "loss": 0.0043, "step": 7236 }, { "epoch": 3.292538671519563, "grad_norm": 1.0584801273337188, "learning_rate": 1.3065274324859717e-06, "loss": 0.0133, "step": 7237 }, { "epoch": 3.2929936305732483, "grad_norm": 1.7499425992052593, "learning_rate": 1.305899525732605e-06, "loss": 0.031, "step": 7238 }, { "epoch": 3.2934485896269337, "grad_norm": 1.1405911559747581, "learning_rate": 1.3052717165557365e-06, "loss": 0.0324, "step": 7239 }, { "epoch": 3.2939035486806185, "grad_norm": 0.9464867769961921, "learning_rate": 1.3046440050066675e-06, "loss": 0.0198, "step": 7240 }, { "epoch": 3.294358507734304, "grad_norm": 0.8048701397386541, "learning_rate": 1.3040163911366918e-06, "loss": 0.0084, "step": 7241 }, { "epoch": 3.294813466787989, "grad_norm": 0.9087793811559542, "learning_rate": 1.3033888749970969e-06, "loss": 0.0088, "step": 7242 }, { "epoch": 3.295268425841674, "grad_norm": 0.9647948430543872, "learning_rate": 1.3027614566391588e-06, "loss": 0.0319, "step": 7243 }, { "epoch": 3.2957233848953593, "grad_norm": 0.9064629317637867, "learning_rate": 1.3021341361141482e-06, "loss": 0.0173, "step": 7244 }, { "epoch": 3.2961783439490446, "grad_norm": 0.9463347037579698, "learning_rate": 1.3015069134733255e-06, "loss": 0.0194, "step": 7245 }, { "epoch": 3.29663330300273, "grad_norm": 1.158728231930721, "learning_rate": 1.3008797887679464e-06, "loss": 0.0479, "step": 7246 }, { "epoch": 3.297088262056415, "grad_norm": 1.0982977111641523, "learning_rate": 1.3002527620492556e-06, "loss": 0.0076, "step": 7247 }, { "epoch": 3.2975432211101, "grad_norm": 1.153700538902321, "learning_rate": 1.2996258333684903e-06, "loss": 0.0382, "step": 7248 }, { "epoch": 3.2979981801637854, "grad_norm": 1.5617932014273164, "learning_rate": 1.298999002776882e-06, "loss": 0.0196, "step": 7249 }, { "epoch": 3.2984531392174703, "grad_norm": 1.178113793348873, "learning_rate": 1.2983722703256506e-06, "loss": 0.0299, "step": 7250 }, { "epoch": 3.2989080982711556, "grad_norm": 1.2446109400716547, "learning_rate": 1.2977456360660119e-06, "loss": 0.0053, "step": 7251 }, { "epoch": 3.299363057324841, "grad_norm": 0.9489197124453541, "learning_rate": 1.2971191000491701e-06, "loss": 0.0234, "step": 7252 }, { "epoch": 3.299818016378526, "grad_norm": 1.3735840153236836, "learning_rate": 1.2964926623263233e-06, "loss": 0.0289, "step": 7253 }, { "epoch": 3.300272975432211, "grad_norm": 1.2972788375420368, "learning_rate": 1.2958663229486612e-06, "loss": 0.0302, "step": 7254 }, { "epoch": 3.3007279344858964, "grad_norm": 1.0762237532057986, "learning_rate": 1.2952400819673636e-06, "loss": 0.0269, "step": 7255 }, { "epoch": 3.3011828935395813, "grad_norm": 0.986263273840619, "learning_rate": 1.2946139394336077e-06, "loss": 0.0365, "step": 7256 }, { "epoch": 3.3016378525932666, "grad_norm": 1.2508344296001053, "learning_rate": 1.2939878953985572e-06, "loss": 0.0299, "step": 7257 }, { "epoch": 3.302092811646952, "grad_norm": 0.9352679142491748, "learning_rate": 1.2933619499133693e-06, "loss": 0.0103, "step": 7258 }, { "epoch": 3.3025477707006368, "grad_norm": 1.712519136238566, "learning_rate": 1.292736103029194e-06, "loss": 0.0102, "step": 7259 }, { "epoch": 3.303002729754322, "grad_norm": 0.7897630833080888, "learning_rate": 1.2921103547971715e-06, "loss": 0.0197, "step": 7260 }, { "epoch": 3.3034576888080074, "grad_norm": 0.7048400693789791, "learning_rate": 1.291484705268437e-06, "loss": 0.0124, "step": 7261 }, { "epoch": 3.3039126478616927, "grad_norm": 0.7425469398382967, "learning_rate": 1.2908591544941138e-06, "loss": 0.0105, "step": 7262 }, { "epoch": 3.3043676069153776, "grad_norm": 1.872263993614512, "learning_rate": 1.290233702525321e-06, "loss": 0.0348, "step": 7263 }, { "epoch": 3.304822565969063, "grad_norm": 1.0103519008828974, "learning_rate": 1.2896083494131668e-06, "loss": 0.0131, "step": 7264 }, { "epoch": 3.305277525022748, "grad_norm": 1.1007498941091696, "learning_rate": 1.2889830952087511e-06, "loss": 0.0149, "step": 7265 }, { "epoch": 3.305732484076433, "grad_norm": 0.8864942845026035, "learning_rate": 1.288357939963169e-06, "loss": 0.0125, "step": 7266 }, { "epoch": 3.3061874431301184, "grad_norm": 0.7661422195317444, "learning_rate": 1.2877328837275045e-06, "loss": 0.0213, "step": 7267 }, { "epoch": 3.3066424021838037, "grad_norm": 0.5059720461865929, "learning_rate": 1.2871079265528335e-06, "loss": 0.0038, "step": 7268 }, { "epoch": 3.3070973612374885, "grad_norm": 0.8431621947325294, "learning_rate": 1.2864830684902253e-06, "loss": 0.0235, "step": 7269 }, { "epoch": 3.307552320291174, "grad_norm": 0.9034818825769863, "learning_rate": 1.2858583095907402e-06, "loss": 0.0148, "step": 7270 }, { "epoch": 3.308007279344859, "grad_norm": 0.708804787727126, "learning_rate": 1.2852336499054318e-06, "loss": 0.0112, "step": 7271 }, { "epoch": 3.308462238398544, "grad_norm": 0.44343393431430916, "learning_rate": 1.284609089485344e-06, "loss": 0.0033, "step": 7272 }, { "epoch": 3.3089171974522293, "grad_norm": 1.2064201903749183, "learning_rate": 1.2839846283815124e-06, "loss": 0.0214, "step": 7273 }, { "epoch": 3.3093721565059147, "grad_norm": 1.2456334168154939, "learning_rate": 1.2833602666449647e-06, "loss": 0.0231, "step": 7274 }, { "epoch": 3.3098271155595995, "grad_norm": 0.762420187386553, "learning_rate": 1.2827360043267228e-06, "loss": 0.0095, "step": 7275 }, { "epoch": 3.310282074613285, "grad_norm": 0.8176545661581822, "learning_rate": 1.2821118414777963e-06, "loss": 0.0173, "step": 7276 }, { "epoch": 3.31073703366697, "grad_norm": 1.5267369643044957, "learning_rate": 1.2814877781491914e-06, "loss": 0.0164, "step": 7277 }, { "epoch": 3.311191992720655, "grad_norm": 1.5953390972878707, "learning_rate": 1.2808638143919021e-06, "loss": 0.0113, "step": 7278 }, { "epoch": 3.3116469517743403, "grad_norm": 0.8613657543834387, "learning_rate": 1.280239950256916e-06, "loss": 0.0096, "step": 7279 }, { "epoch": 3.3121019108280256, "grad_norm": 0.7174530620379678, "learning_rate": 1.2796161857952133e-06, "loss": 0.0176, "step": 7280 }, { "epoch": 3.3125568698817105, "grad_norm": 1.0869763298135788, "learning_rate": 1.2789925210577647e-06, "loss": 0.0167, "step": 7281 }, { "epoch": 3.313011828935396, "grad_norm": 0.9668203179502445, "learning_rate": 1.2783689560955336e-06, "loss": 0.0177, "step": 7282 }, { "epoch": 3.313466787989081, "grad_norm": 0.9180872119097331, "learning_rate": 1.2777454909594733e-06, "loss": 0.0306, "step": 7283 }, { "epoch": 3.313921747042766, "grad_norm": 0.5600000181329456, "learning_rate": 1.2771221257005317e-06, "loss": 0.0057, "step": 7284 }, { "epoch": 3.3143767060964513, "grad_norm": 1.2121620845305172, "learning_rate": 1.2764988603696489e-06, "loss": 0.0085, "step": 7285 }, { "epoch": 3.3148316651501366, "grad_norm": 1.1738189870948255, "learning_rate": 1.2758756950177536e-06, "loss": 0.0277, "step": 7286 }, { "epoch": 3.3152866242038215, "grad_norm": 1.801997127048802, "learning_rate": 1.2752526296957684e-06, "loss": 0.0275, "step": 7287 }, { "epoch": 3.315741583257507, "grad_norm": 1.0823076120248578, "learning_rate": 1.274629664454607e-06, "loss": 0.0264, "step": 7288 }, { "epoch": 3.316196542311192, "grad_norm": 1.1031845898462043, "learning_rate": 1.274006799345176e-06, "loss": 0.0114, "step": 7289 }, { "epoch": 3.316651501364877, "grad_norm": 1.122198142915479, "learning_rate": 1.2733840344183719e-06, "loss": 0.0159, "step": 7290 }, { "epoch": 3.3171064604185623, "grad_norm": 1.431147829077079, "learning_rate": 1.2727613697250863e-06, "loss": 0.0188, "step": 7291 }, { "epoch": 3.3175614194722476, "grad_norm": 2.043480064749315, "learning_rate": 1.2721388053161992e-06, "loss": 0.015, "step": 7292 }, { "epoch": 3.3180163785259325, "grad_norm": 0.5887079969957104, "learning_rate": 1.2715163412425846e-06, "loss": 0.0076, "step": 7293 }, { "epoch": 3.3184713375796178, "grad_norm": 0.6853223749832688, "learning_rate": 1.2708939775551052e-06, "loss": 0.0101, "step": 7294 }, { "epoch": 3.318926296633303, "grad_norm": 0.8826429064866624, "learning_rate": 1.2702717143046206e-06, "loss": 0.006, "step": 7295 }, { "epoch": 3.319381255686988, "grad_norm": 1.7915770572057528, "learning_rate": 1.269649551541978e-06, "loss": 0.021, "step": 7296 }, { "epoch": 3.3198362147406733, "grad_norm": 1.0297906942773376, "learning_rate": 1.2690274893180167e-06, "loss": 0.0198, "step": 7297 }, { "epoch": 3.3202911737943586, "grad_norm": 1.4276209944261973, "learning_rate": 1.2684055276835713e-06, "loss": 0.0213, "step": 7298 }, { "epoch": 3.3207461328480434, "grad_norm": 0.6992817574495818, "learning_rate": 1.2677836666894632e-06, "loss": 0.0059, "step": 7299 }, { "epoch": 3.3212010919017287, "grad_norm": 1.0058565494896554, "learning_rate": 1.26716190638651e-06, "loss": 0.011, "step": 7300 }, { "epoch": 3.321656050955414, "grad_norm": 0.9476663714097732, "learning_rate": 1.2665402468255187e-06, "loss": 0.0177, "step": 7301 }, { "epoch": 3.3221110100090994, "grad_norm": 0.7857813588759723, "learning_rate": 1.2659186880572879e-06, "loss": 0.0122, "step": 7302 }, { "epoch": 3.3225659690627842, "grad_norm": 0.3560220829168529, "learning_rate": 1.2652972301326084e-06, "loss": 0.0081, "step": 7303 }, { "epoch": 3.3230209281164695, "grad_norm": 1.1740426207642785, "learning_rate": 1.2646758731022627e-06, "loss": 0.0375, "step": 7304 }, { "epoch": 3.323475887170155, "grad_norm": 1.0976796930172668, "learning_rate": 1.264054617017027e-06, "loss": 0.0091, "step": 7305 }, { "epoch": 3.3239308462238397, "grad_norm": 0.8009583303844279, "learning_rate": 1.2634334619276669e-06, "loss": 0.0093, "step": 7306 }, { "epoch": 3.324385805277525, "grad_norm": 0.7977453908727318, "learning_rate": 1.26281240788494e-06, "loss": 0.0149, "step": 7307 }, { "epoch": 3.3248407643312103, "grad_norm": 1.0206177782263182, "learning_rate": 1.2621914549395947e-06, "loss": 0.0088, "step": 7308 }, { "epoch": 3.325295723384895, "grad_norm": 1.492145364987483, "learning_rate": 1.2615706031423751e-06, "loss": 0.0206, "step": 7309 }, { "epoch": 3.3257506824385805, "grad_norm": 0.9481242495949631, "learning_rate": 1.2609498525440131e-06, "loss": 0.0082, "step": 7310 }, { "epoch": 3.326205641492266, "grad_norm": 1.6714268334345213, "learning_rate": 1.2603292031952324e-06, "loss": 0.0265, "step": 7311 }, { "epoch": 3.3266606005459507, "grad_norm": 1.7297074879945344, "learning_rate": 1.2597086551467522e-06, "loss": 0.0316, "step": 7312 }, { "epoch": 3.327115559599636, "grad_norm": 0.9182806485473802, "learning_rate": 1.2590882084492783e-06, "loss": 0.0189, "step": 7313 }, { "epoch": 3.3275705186533213, "grad_norm": 0.8192249250498257, "learning_rate": 1.2584678631535136e-06, "loss": 0.0125, "step": 7314 }, { "epoch": 3.328025477707006, "grad_norm": 0.9678334740404074, "learning_rate": 1.257847619310148e-06, "loss": 0.0154, "step": 7315 }, { "epoch": 3.3284804367606915, "grad_norm": 0.7925137667190734, "learning_rate": 1.2572274769698656e-06, "loss": 0.0177, "step": 7316 }, { "epoch": 3.328935395814377, "grad_norm": 1.3705505303404393, "learning_rate": 1.2566074361833403e-06, "loss": 0.0172, "step": 7317 }, { "epoch": 3.329390354868062, "grad_norm": 1.4423260751105411, "learning_rate": 1.2559874970012403e-06, "loss": 0.0496, "step": 7318 }, { "epoch": 3.329845313921747, "grad_norm": 1.7688679031843042, "learning_rate": 1.2553676594742251e-06, "loss": 0.0501, "step": 7319 }, { "epoch": 3.3303002729754323, "grad_norm": 0.8201330253505975, "learning_rate": 1.2547479236529442e-06, "loss": 0.0109, "step": 7320 }, { "epoch": 3.3307552320291176, "grad_norm": 1.223885851719845, "learning_rate": 1.254128289588039e-06, "loss": 0.0326, "step": 7321 }, { "epoch": 3.3312101910828025, "grad_norm": 1.2278973519479706, "learning_rate": 1.2535087573301432e-06, "loss": 0.0168, "step": 7322 }, { "epoch": 3.331665150136488, "grad_norm": 0.9103989288123862, "learning_rate": 1.2528893269298837e-06, "loss": 0.0036, "step": 7323 }, { "epoch": 3.332120109190173, "grad_norm": 0.3972845549490509, "learning_rate": 1.252269998437876e-06, "loss": 0.006, "step": 7324 }, { "epoch": 3.332575068243858, "grad_norm": 1.0728879020733006, "learning_rate": 1.2516507719047289e-06, "loss": 0.0164, "step": 7325 }, { "epoch": 3.3330300272975433, "grad_norm": 0.6474173104912273, "learning_rate": 1.2510316473810436e-06, "loss": 0.0081, "step": 7326 }, { "epoch": 3.3334849863512286, "grad_norm": 1.4066202656538762, "learning_rate": 1.2504126249174114e-06, "loss": 0.0127, "step": 7327 }, { "epoch": 3.3339399454049135, "grad_norm": 1.391436489774183, "learning_rate": 1.2497937045644171e-06, "loss": 0.031, "step": 7328 }, { "epoch": 3.3343949044585988, "grad_norm": 1.1285650488117163, "learning_rate": 1.2491748863726352e-06, "loss": 0.026, "step": 7329 }, { "epoch": 3.334849863512284, "grad_norm": 1.8969771157878081, "learning_rate": 1.2485561703926333e-06, "loss": 0.0368, "step": 7330 }, { "epoch": 3.335304822565969, "grad_norm": 1.8961261454585308, "learning_rate": 1.2479375566749694e-06, "loss": 0.0276, "step": 7331 }, { "epoch": 3.3357597816196543, "grad_norm": 1.7628014501872014, "learning_rate": 1.2473190452701934e-06, "loss": 0.0294, "step": 7332 }, { "epoch": 3.3362147406733396, "grad_norm": 0.5133985287490694, "learning_rate": 1.2467006362288476e-06, "loss": 0.0027, "step": 7333 }, { "epoch": 3.3366696997270244, "grad_norm": 0.7768520187011414, "learning_rate": 1.246082329601467e-06, "loss": 0.0146, "step": 7334 }, { "epoch": 3.3371246587807097, "grad_norm": 0.7822448815099561, "learning_rate": 1.245464125438576e-06, "loss": 0.012, "step": 7335 }, { "epoch": 3.337579617834395, "grad_norm": 0.894367632686374, "learning_rate": 1.2448460237906912e-06, "loss": 0.024, "step": 7336 }, { "epoch": 3.33803457688808, "grad_norm": 1.6066982100887512, "learning_rate": 1.24422802470832e-06, "loss": 0.0273, "step": 7337 }, { "epoch": 3.3384895359417652, "grad_norm": 1.1332571177813728, "learning_rate": 1.2436101282419646e-06, "loss": 0.0305, "step": 7338 }, { "epoch": 3.3389444949954505, "grad_norm": 1.6997843721106107, "learning_rate": 1.242992334442115e-06, "loss": 0.0078, "step": 7339 }, { "epoch": 3.3393994540491354, "grad_norm": 1.2189610883650435, "learning_rate": 1.2423746433592557e-06, "loss": 0.0247, "step": 7340 }, { "epoch": 3.3398544131028207, "grad_norm": 1.0813652948771437, "learning_rate": 1.2417570550438616e-06, "loss": 0.0122, "step": 7341 }, { "epoch": 3.340309372156506, "grad_norm": 1.371913434582463, "learning_rate": 1.2411395695463976e-06, "loss": 0.0264, "step": 7342 }, { "epoch": 3.340764331210191, "grad_norm": 1.639268719077998, "learning_rate": 1.240522186917324e-06, "loss": 0.0263, "step": 7343 }, { "epoch": 3.341219290263876, "grad_norm": 1.332695536061651, "learning_rate": 1.2399049072070895e-06, "loss": 0.0372, "step": 7344 }, { "epoch": 3.3416742493175615, "grad_norm": 1.0536556978194858, "learning_rate": 1.2392877304661357e-06, "loss": 0.0084, "step": 7345 }, { "epoch": 3.3421292083712464, "grad_norm": 1.1069299571935522, "learning_rate": 1.238670656744894e-06, "loss": 0.0307, "step": 7346 }, { "epoch": 3.3425841674249317, "grad_norm": 0.8789519669611527, "learning_rate": 1.2380536860937902e-06, "loss": 0.01, "step": 7347 }, { "epoch": 3.343039126478617, "grad_norm": 0.6855386700200069, "learning_rate": 1.2374368185632413e-06, "loss": 0.0069, "step": 7348 }, { "epoch": 3.343494085532302, "grad_norm": 1.012474692352138, "learning_rate": 1.2368200542036537e-06, "loss": 0.0277, "step": 7349 }, { "epoch": 3.343949044585987, "grad_norm": 1.132832297446787, "learning_rate": 1.2362033930654272e-06, "loss": 0.0218, "step": 7350 }, { "epoch": 3.3444040036396725, "grad_norm": 1.2790687800144656, "learning_rate": 1.2355868351989507e-06, "loss": 0.0124, "step": 7351 }, { "epoch": 3.3448589626933574, "grad_norm": 2.1546785551572136, "learning_rate": 1.2349703806546092e-06, "loss": 0.0453, "step": 7352 }, { "epoch": 3.3453139217470427, "grad_norm": 2.5958010587916736, "learning_rate": 1.2343540294827747e-06, "loss": 0.0534, "step": 7353 }, { "epoch": 3.345768880800728, "grad_norm": 1.432239882641429, "learning_rate": 1.233737781733814e-06, "loss": 0.0437, "step": 7354 }, { "epoch": 3.3462238398544133, "grad_norm": 0.7127509462773024, "learning_rate": 1.2331216374580832e-06, "loss": 0.0073, "step": 7355 }, { "epoch": 3.346678798908098, "grad_norm": 0.9783494605345466, "learning_rate": 1.2325055967059302e-06, "loss": 0.0076, "step": 7356 }, { "epoch": 3.3471337579617835, "grad_norm": 0.5642066366470367, "learning_rate": 1.231889659527697e-06, "loss": 0.0043, "step": 7357 }, { "epoch": 3.347588717015469, "grad_norm": 1.4029809828686617, "learning_rate": 1.231273825973714e-06, "loss": 0.0177, "step": 7358 }, { "epoch": 3.3480436760691537, "grad_norm": 0.4588405928424404, "learning_rate": 1.2306580960943044e-06, "loss": 0.0046, "step": 7359 }, { "epoch": 3.348498635122839, "grad_norm": 1.0990091885955027, "learning_rate": 1.2300424699397817e-06, "loss": 0.0218, "step": 7360 }, { "epoch": 3.3489535941765243, "grad_norm": 1.2161936208506565, "learning_rate": 1.2294269475604536e-06, "loss": 0.0356, "step": 7361 }, { "epoch": 3.349408553230209, "grad_norm": 0.3990758206896544, "learning_rate": 1.2288115290066183e-06, "loss": 0.0062, "step": 7362 }, { "epoch": 3.3498635122838945, "grad_norm": 0.6773835937545216, "learning_rate": 1.2281962143285643e-06, "loss": 0.0122, "step": 7363 }, { "epoch": 3.3503184713375798, "grad_norm": 1.2003242609952558, "learning_rate": 1.227581003576572e-06, "loss": 0.0171, "step": 7364 }, { "epoch": 3.3507734303912646, "grad_norm": 0.49674102984573915, "learning_rate": 1.2269658968009144e-06, "loss": 0.008, "step": 7365 }, { "epoch": 3.35122838944495, "grad_norm": 0.6272896621123213, "learning_rate": 1.2263508940518534e-06, "loss": 0.0098, "step": 7366 }, { "epoch": 3.3516833484986353, "grad_norm": 1.0377525850609268, "learning_rate": 1.2257359953796455e-06, "loss": 0.0164, "step": 7367 }, { "epoch": 3.35213830755232, "grad_norm": 0.9211605337551543, "learning_rate": 1.2251212008345387e-06, "loss": 0.0269, "step": 7368 }, { "epoch": 3.3525932666060054, "grad_norm": 0.9850106222165228, "learning_rate": 1.22450651046677e-06, "loss": 0.0102, "step": 7369 }, { "epoch": 3.3530482256596907, "grad_norm": 2.01996761760414, "learning_rate": 1.2238919243265693e-06, "loss": 0.0227, "step": 7370 }, { "epoch": 3.3535031847133756, "grad_norm": 1.104446307885531, "learning_rate": 1.2232774424641566e-06, "loss": 0.0224, "step": 7371 }, { "epoch": 3.353958143767061, "grad_norm": 1.1437153170081686, "learning_rate": 1.2226630649297466e-06, "loss": 0.0121, "step": 7372 }, { "epoch": 3.3544131028207462, "grad_norm": 1.1259378754055989, "learning_rate": 1.2220487917735426e-06, "loss": 0.0287, "step": 7373 }, { "epoch": 3.3548680618744315, "grad_norm": 1.1208778454591233, "learning_rate": 1.2214346230457391e-06, "loss": 0.0296, "step": 7374 }, { "epoch": 3.3553230209281164, "grad_norm": 1.8723531625032563, "learning_rate": 1.2208205587965255e-06, "loss": 0.0108, "step": 7375 }, { "epoch": 3.3557779799818017, "grad_norm": 0.9786874754713979, "learning_rate": 1.220206599076078e-06, "loss": 0.0134, "step": 7376 }, { "epoch": 3.356232939035487, "grad_norm": 0.7676387811994745, "learning_rate": 1.2195927439345687e-06, "loss": 0.0199, "step": 7377 }, { "epoch": 3.356687898089172, "grad_norm": 0.525195520351395, "learning_rate": 1.218978993422158e-06, "loss": 0.0033, "step": 7378 }, { "epoch": 3.357142857142857, "grad_norm": 0.7366105765871134, "learning_rate": 1.218365347588999e-06, "loss": 0.0045, "step": 7379 }, { "epoch": 3.3575978161965425, "grad_norm": 0.8937137678772822, "learning_rate": 1.217751806485235e-06, "loss": 0.0233, "step": 7380 }, { "epoch": 3.3580527752502274, "grad_norm": 1.2224649315007534, "learning_rate": 1.2171383701610026e-06, "loss": 0.0205, "step": 7381 }, { "epoch": 3.3585077343039127, "grad_norm": 0.9796708803895046, "learning_rate": 1.2165250386664304e-06, "loss": 0.0251, "step": 7382 }, { "epoch": 3.358962693357598, "grad_norm": 0.889067797694893, "learning_rate": 1.2159118120516361e-06, "loss": 0.0066, "step": 7383 }, { "epoch": 3.359417652411283, "grad_norm": 1.5932109179528566, "learning_rate": 1.2152986903667294e-06, "loss": 0.0103, "step": 7384 }, { "epoch": 3.359872611464968, "grad_norm": 1.0375007347151761, "learning_rate": 1.214685673661811e-06, "loss": 0.003, "step": 7385 }, { "epoch": 3.3603275705186535, "grad_norm": 1.6241944214473798, "learning_rate": 1.214072761986976e-06, "loss": 0.0214, "step": 7386 }, { "epoch": 3.3607825295723384, "grad_norm": 1.5255542941575173, "learning_rate": 1.2134599553923076e-06, "loss": 0.0341, "step": 7387 }, { "epoch": 3.3612374886260237, "grad_norm": 1.378735344831735, "learning_rate": 1.212847253927881e-06, "loss": 0.016, "step": 7388 }, { "epoch": 3.361692447679709, "grad_norm": 1.7887882696838662, "learning_rate": 1.212234657643765e-06, "loss": 0.0186, "step": 7389 }, { "epoch": 3.362147406733394, "grad_norm": 0.936782998797403, "learning_rate": 1.211622166590016e-06, "loss": 0.0113, "step": 7390 }, { "epoch": 3.362602365787079, "grad_norm": 2.718640783593097, "learning_rate": 1.2110097808166865e-06, "loss": 0.0277, "step": 7391 }, { "epoch": 3.3630573248407645, "grad_norm": 1.199474453590393, "learning_rate": 1.2103975003738167e-06, "loss": 0.0192, "step": 7392 }, { "epoch": 3.3635122838944493, "grad_norm": 0.7456735796298126, "learning_rate": 1.2097853253114393e-06, "loss": 0.016, "step": 7393 }, { "epoch": 3.3639672429481347, "grad_norm": 1.4459314915016817, "learning_rate": 1.2091732556795774e-06, "loss": 0.022, "step": 7394 }, { "epoch": 3.36442220200182, "grad_norm": 1.6218976851751525, "learning_rate": 1.208561291528248e-06, "loss": 0.0187, "step": 7395 }, { "epoch": 3.364877161055505, "grad_norm": 0.9924050165277052, "learning_rate": 1.2079494329074587e-06, "loss": 0.0324, "step": 7396 }, { "epoch": 3.36533212010919, "grad_norm": 1.4000490145224642, "learning_rate": 1.2073376798672068e-06, "loss": 0.0113, "step": 7397 }, { "epoch": 3.3657870791628755, "grad_norm": 1.0439307784146343, "learning_rate": 1.2067260324574823e-06, "loss": 0.0212, "step": 7398 }, { "epoch": 3.3662420382165603, "grad_norm": 0.7243576291502799, "learning_rate": 1.2061144907282656e-06, "loss": 0.01, "step": 7399 }, { "epoch": 3.3666969972702456, "grad_norm": 0.880893942462934, "learning_rate": 1.2055030547295292e-06, "loss": 0.005, "step": 7400 }, { "epoch": 3.367151956323931, "grad_norm": 0.8925726060991405, "learning_rate": 1.2048917245112377e-06, "loss": 0.0104, "step": 7401 }, { "epoch": 3.367606915377616, "grad_norm": 1.3251379602704496, "learning_rate": 1.2042805001233452e-06, "loss": 0.0151, "step": 7402 }, { "epoch": 3.368061874431301, "grad_norm": 0.8929125126440054, "learning_rate": 1.2036693816157995e-06, "loss": 0.0292, "step": 7403 }, { "epoch": 3.3685168334849864, "grad_norm": 1.4059094114536188, "learning_rate": 1.2030583690385381e-06, "loss": 0.0261, "step": 7404 }, { "epoch": 3.3689717925386713, "grad_norm": 0.7552905929657446, "learning_rate": 1.2024474624414885e-06, "loss": 0.009, "step": 7405 }, { "epoch": 3.3694267515923566, "grad_norm": 1.137875029540633, "learning_rate": 1.2018366618745738e-06, "loss": 0.0164, "step": 7406 }, { "epoch": 3.369881710646042, "grad_norm": 1.1571762200620517, "learning_rate": 1.2012259673877047e-06, "loss": 0.0219, "step": 7407 }, { "epoch": 3.370336669699727, "grad_norm": 1.4782254146252654, "learning_rate": 1.2006153790307843e-06, "loss": 0.0109, "step": 7408 }, { "epoch": 3.370791628753412, "grad_norm": 1.568256820808801, "learning_rate": 1.200004896853706e-06, "loss": 0.0253, "step": 7409 }, { "epoch": 3.3712465878070974, "grad_norm": 0.7499900179706327, "learning_rate": 1.1993945209063567e-06, "loss": 0.0232, "step": 7410 }, { "epoch": 3.3717015468607827, "grad_norm": 1.493494514365395, "learning_rate": 1.198784251238615e-06, "loss": 0.0159, "step": 7411 }, { "epoch": 3.3721565059144676, "grad_norm": 0.725013296029611, "learning_rate": 1.1981740879003479e-06, "loss": 0.0163, "step": 7412 }, { "epoch": 3.372611464968153, "grad_norm": 0.6239709126372447, "learning_rate": 1.1975640309414152e-06, "loss": 0.011, "step": 7413 }, { "epoch": 3.373066424021838, "grad_norm": 1.0409021055514054, "learning_rate": 1.1969540804116676e-06, "loss": 0.0236, "step": 7414 }, { "epoch": 3.373521383075523, "grad_norm": 1.1169400531127067, "learning_rate": 1.1963442363609487e-06, "loss": 0.036, "step": 7415 }, { "epoch": 3.3739763421292084, "grad_norm": 1.0828602216954653, "learning_rate": 1.1957344988390904e-06, "loss": 0.0254, "step": 7416 }, { "epoch": 3.3744313011828937, "grad_norm": 0.830004024083808, "learning_rate": 1.1951248678959202e-06, "loss": 0.0182, "step": 7417 }, { "epoch": 3.3748862602365786, "grad_norm": 0.9137494317794648, "learning_rate": 1.1945153435812529e-06, "loss": 0.0159, "step": 7418 }, { "epoch": 3.375341219290264, "grad_norm": 1.336291977513701, "learning_rate": 1.1939059259448952e-06, "loss": 0.0145, "step": 7419 }, { "epoch": 3.375796178343949, "grad_norm": 0.7357466665845688, "learning_rate": 1.1932966150366477e-06, "loss": 0.005, "step": 7420 }, { "epoch": 3.376251137397634, "grad_norm": 1.1215002437541193, "learning_rate": 1.1926874109063e-06, "loss": 0.0367, "step": 7421 }, { "epoch": 3.3767060964513194, "grad_norm": 1.5861328779518893, "learning_rate": 1.1920783136036336e-06, "loss": 0.008, "step": 7422 }, { "epoch": 3.3771610555050047, "grad_norm": 0.9980195140721819, "learning_rate": 1.1914693231784194e-06, "loss": 0.0392, "step": 7423 }, { "epoch": 3.3776160145586895, "grad_norm": 1.1796481194972992, "learning_rate": 1.1908604396804233e-06, "loss": 0.0236, "step": 7424 }, { "epoch": 3.378070973612375, "grad_norm": 1.6246999902646397, "learning_rate": 1.1902516631594005e-06, "loss": 0.031, "step": 7425 }, { "epoch": 3.37852593266606, "grad_norm": 0.5084581037880519, "learning_rate": 1.1896429936650975e-06, "loss": 0.0045, "step": 7426 }, { "epoch": 3.3789808917197455, "grad_norm": 0.9957871477406596, "learning_rate": 1.1890344312472513e-06, "loss": 0.014, "step": 7427 }, { "epoch": 3.3794358507734303, "grad_norm": 0.9749831545880819, "learning_rate": 1.1884259759555902e-06, "loss": 0.0207, "step": 7428 }, { "epoch": 3.3798908098271156, "grad_norm": 1.5697309442737977, "learning_rate": 1.1878176278398363e-06, "loss": 0.0121, "step": 7429 }, { "epoch": 3.380345768880801, "grad_norm": 1.5791325998625816, "learning_rate": 1.187209386949699e-06, "loss": 0.0377, "step": 7430 }, { "epoch": 3.380800727934486, "grad_norm": 1.4102256465891287, "learning_rate": 1.1866012533348834e-06, "loss": 0.0133, "step": 7431 }, { "epoch": 3.381255686988171, "grad_norm": 1.011548238251301, "learning_rate": 1.1859932270450817e-06, "loss": 0.0245, "step": 7432 }, { "epoch": 3.3817106460418564, "grad_norm": 1.2152554973934917, "learning_rate": 1.1853853081299787e-06, "loss": 0.0163, "step": 7433 }, { "epoch": 3.3821656050955413, "grad_norm": 0.9008741310556732, "learning_rate": 1.1847774966392526e-06, "loss": 0.0069, "step": 7434 }, { "epoch": 3.3826205641492266, "grad_norm": 1.275580657368717, "learning_rate": 1.1841697926225698e-06, "loss": 0.0133, "step": 7435 }, { "epoch": 3.383075523202912, "grad_norm": 1.0521678399164966, "learning_rate": 1.1835621961295895e-06, "loss": 0.0178, "step": 7436 }, { "epoch": 3.383530482256597, "grad_norm": 0.7729793289452848, "learning_rate": 1.1829547072099607e-06, "loss": 0.0087, "step": 7437 }, { "epoch": 3.383985441310282, "grad_norm": 1.1599956270917364, "learning_rate": 1.1823473259133261e-06, "loss": 0.0132, "step": 7438 }, { "epoch": 3.3844404003639674, "grad_norm": 1.619058979261084, "learning_rate": 1.1817400522893169e-06, "loss": 0.0204, "step": 7439 }, { "epoch": 3.3848953594176523, "grad_norm": 0.8017983045132376, "learning_rate": 1.181132886387558e-06, "loss": 0.0091, "step": 7440 }, { "epoch": 3.3853503184713376, "grad_norm": 1.1239016513985083, "learning_rate": 1.180525828257664e-06, "loss": 0.0248, "step": 7441 }, { "epoch": 3.385805277525023, "grad_norm": 0.6792097079203006, "learning_rate": 1.1799188779492407e-06, "loss": 0.0099, "step": 7442 }, { "epoch": 3.386260236578708, "grad_norm": 1.4156624488617202, "learning_rate": 1.1793120355118843e-06, "loss": 0.0277, "step": 7443 }, { "epoch": 3.386715195632393, "grad_norm": 1.1445429657738784, "learning_rate": 1.1787053009951837e-06, "loss": 0.0078, "step": 7444 }, { "epoch": 3.3871701546860784, "grad_norm": 1.5521806361164303, "learning_rate": 1.1780986744487204e-06, "loss": 0.0237, "step": 7445 }, { "epoch": 3.3876251137397633, "grad_norm": 1.2284132571230602, "learning_rate": 1.1774921559220637e-06, "loss": 0.0246, "step": 7446 }, { "epoch": 3.3880800727934486, "grad_norm": 1.0733393397929343, "learning_rate": 1.1768857454647756e-06, "loss": 0.013, "step": 7447 }, { "epoch": 3.388535031847134, "grad_norm": 1.1705886631124898, "learning_rate": 1.1762794431264082e-06, "loss": 0.0158, "step": 7448 }, { "epoch": 3.3889899909008188, "grad_norm": 0.8049920206491054, "learning_rate": 1.175673248956508e-06, "loss": 0.0113, "step": 7449 }, { "epoch": 3.389444949954504, "grad_norm": 1.1189355082028385, "learning_rate": 1.175067163004609e-06, "loss": 0.0299, "step": 7450 }, { "epoch": 3.3898999090081894, "grad_norm": 1.2170248953200788, "learning_rate": 1.1744611853202376e-06, "loss": 0.0196, "step": 7451 }, { "epoch": 3.3903548680618742, "grad_norm": 1.1172234519579525, "learning_rate": 1.1738553159529126e-06, "loss": 0.0091, "step": 7452 }, { "epoch": 3.3908098271155596, "grad_norm": 3.7934175908920125, "learning_rate": 1.1732495549521413e-06, "loss": 0.038, "step": 7453 }, { "epoch": 3.391264786169245, "grad_norm": 0.6556204122493325, "learning_rate": 1.172643902367426e-06, "loss": 0.0101, "step": 7454 }, { "epoch": 3.3917197452229297, "grad_norm": 0.9250289427159699, "learning_rate": 1.1720383582482569e-06, "loss": 0.0053, "step": 7455 }, { "epoch": 3.392174704276615, "grad_norm": 1.8419787107700543, "learning_rate": 1.171432922644116e-06, "loss": 0.0349, "step": 7456 }, { "epoch": 3.3926296633303004, "grad_norm": 1.2457850514888216, "learning_rate": 1.1708275956044757e-06, "loss": 0.035, "step": 7457 }, { "epoch": 3.3930846223839852, "grad_norm": 0.7533567431814873, "learning_rate": 1.170222377178802e-06, "loss": 0.0083, "step": 7458 }, { "epoch": 3.3935395814376705, "grad_norm": 1.1915938040046463, "learning_rate": 1.1696172674165516e-06, "loss": 0.0123, "step": 7459 }, { "epoch": 3.393994540491356, "grad_norm": 0.9617718315992385, "learning_rate": 1.16901226636717e-06, "loss": 0.0137, "step": 7460 }, { "epoch": 3.3944494995450407, "grad_norm": 0.8636069256158052, "learning_rate": 1.168407374080095e-06, "loss": 0.0084, "step": 7461 }, { "epoch": 3.394904458598726, "grad_norm": 1.6531753119286525, "learning_rate": 1.1678025906047552e-06, "loss": 0.0307, "step": 7462 }, { "epoch": 3.3953594176524113, "grad_norm": 0.38725150595139907, "learning_rate": 1.1671979159905724e-06, "loss": 0.0034, "step": 7463 }, { "epoch": 3.395814376706096, "grad_norm": 0.8585547539637095, "learning_rate": 1.1665933502869563e-06, "loss": 0.0124, "step": 7464 }, { "epoch": 3.3962693357597815, "grad_norm": 0.9595777749963582, "learning_rate": 1.1659888935433108e-06, "loss": 0.01, "step": 7465 }, { "epoch": 3.396724294813467, "grad_norm": 0.75141595170107, "learning_rate": 1.1653845458090287e-06, "loss": 0.0189, "step": 7466 }, { "epoch": 3.397179253867152, "grad_norm": 1.430529279734458, "learning_rate": 1.1647803071334935e-06, "loss": 0.0123, "step": 7467 }, { "epoch": 3.397634212920837, "grad_norm": 1.6147212482576765, "learning_rate": 1.1641761775660826e-06, "loss": 0.0181, "step": 7468 }, { "epoch": 3.3980891719745223, "grad_norm": 1.202192338291703, "learning_rate": 1.163572157156162e-06, "loss": 0.0093, "step": 7469 }, { "epoch": 3.3985441310282076, "grad_norm": 0.852382395505312, "learning_rate": 1.1629682459530898e-06, "loss": 0.0078, "step": 7470 }, { "epoch": 3.3989990900818925, "grad_norm": 1.0306559160729576, "learning_rate": 1.1623644440062133e-06, "loss": 0.0143, "step": 7471 }, { "epoch": 3.399454049135578, "grad_norm": 0.6715036558986726, "learning_rate": 1.1617607513648735e-06, "loss": 0.0068, "step": 7472 }, { "epoch": 3.399909008189263, "grad_norm": 0.9502651146861935, "learning_rate": 1.161157168078403e-06, "loss": 0.0115, "step": 7473 }, { "epoch": 3.400363967242948, "grad_norm": 0.9037925201943289, "learning_rate": 1.1605536941961223e-06, "loss": 0.0194, "step": 7474 }, { "epoch": 3.4008189262966333, "grad_norm": 1.0796608191349382, "learning_rate": 1.159950329767345e-06, "loss": 0.0126, "step": 7475 }, { "epoch": 3.4012738853503186, "grad_norm": 0.6397196251495686, "learning_rate": 1.159347074841375e-06, "loss": 0.0095, "step": 7476 }, { "epoch": 3.4017288444040035, "grad_norm": 0.7787546277670884, "learning_rate": 1.1587439294675067e-06, "loss": 0.0072, "step": 7477 }, { "epoch": 3.402183803457689, "grad_norm": 1.0244092386979164, "learning_rate": 1.1581408936950278e-06, "loss": 0.0305, "step": 7478 }, { "epoch": 3.402638762511374, "grad_norm": 1.1186047240103383, "learning_rate": 1.157537967573216e-06, "loss": 0.011, "step": 7479 }, { "epoch": 3.403093721565059, "grad_norm": 0.7356775921031583, "learning_rate": 1.1569351511513388e-06, "loss": 0.0073, "step": 7480 }, { "epoch": 3.4035486806187443, "grad_norm": 1.9599719864087823, "learning_rate": 1.1563324444786562e-06, "loss": 0.0144, "step": 7481 }, { "epoch": 3.4040036396724296, "grad_norm": 1.210438440443029, "learning_rate": 1.155729847604417e-06, "loss": 0.0414, "step": 7482 }, { "epoch": 3.404458598726115, "grad_norm": 0.7005409386643594, "learning_rate": 1.155127360577865e-06, "loss": 0.0045, "step": 7483 }, { "epoch": 3.4049135577797998, "grad_norm": 1.208451756014488, "learning_rate": 1.1545249834482319e-06, "loss": 0.0225, "step": 7484 }, { "epoch": 3.405368516833485, "grad_norm": 0.8696721682762517, "learning_rate": 1.1539227162647398e-06, "loss": 0.0079, "step": 7485 }, { "epoch": 3.4058234758871704, "grad_norm": 1.351383404284545, "learning_rate": 1.1533205590766056e-06, "loss": 0.0524, "step": 7486 }, { "epoch": 3.4062784349408552, "grad_norm": 0.9707375289584066, "learning_rate": 1.1527185119330327e-06, "loss": 0.0202, "step": 7487 }, { "epoch": 3.4067333939945406, "grad_norm": 1.110941234310674, "learning_rate": 1.15211657488322e-06, "loss": 0.0142, "step": 7488 }, { "epoch": 3.407188353048226, "grad_norm": 1.711370012223175, "learning_rate": 1.1515147479763536e-06, "loss": 0.016, "step": 7489 }, { "epoch": 3.4076433121019107, "grad_norm": 1.2813489475854343, "learning_rate": 1.1509130312616123e-06, "loss": 0.0442, "step": 7490 }, { "epoch": 3.408098271155596, "grad_norm": 1.445868751093827, "learning_rate": 1.1503114247881648e-06, "loss": 0.0246, "step": 7491 }, { "epoch": 3.4085532302092814, "grad_norm": 1.1472295242489716, "learning_rate": 1.1497099286051724e-06, "loss": 0.029, "step": 7492 }, { "epoch": 3.4090081892629662, "grad_norm": 1.0191601385004936, "learning_rate": 1.149108542761788e-06, "loss": 0.0097, "step": 7493 }, { "epoch": 3.4094631483166515, "grad_norm": 1.0336951111503287, "learning_rate": 1.1485072673071522e-06, "loss": 0.0192, "step": 7494 }, { "epoch": 3.409918107370337, "grad_norm": 2.656133725924073, "learning_rate": 1.1479061022904001e-06, "loss": 0.0211, "step": 7495 }, { "epoch": 3.4103730664240217, "grad_norm": 0.8036292948821893, "learning_rate": 1.147305047760654e-06, "loss": 0.0118, "step": 7496 }, { "epoch": 3.410828025477707, "grad_norm": 0.6332223749855916, "learning_rate": 1.1467041037670315e-06, "loss": 0.0055, "step": 7497 }, { "epoch": 3.4112829845313923, "grad_norm": 2.362165180003215, "learning_rate": 1.1461032703586383e-06, "loss": 0.0175, "step": 7498 }, { "epoch": 3.411737943585077, "grad_norm": 1.0682808643200905, "learning_rate": 1.1455025475845708e-06, "loss": 0.0199, "step": 7499 }, { "epoch": 3.4121929026387625, "grad_norm": 0.4022475925009477, "learning_rate": 1.1449019354939193e-06, "loss": 0.0022, "step": 7500 }, { "epoch": 3.412647861692448, "grad_norm": 1.339742823177407, "learning_rate": 1.1443014341357609e-06, "loss": 0.0104, "step": 7501 }, { "epoch": 3.4131028207461327, "grad_norm": 0.9722478362833145, "learning_rate": 1.143701043559168e-06, "loss": 0.0208, "step": 7502 }, { "epoch": 3.413557779799818, "grad_norm": 1.2471436666052076, "learning_rate": 1.1431007638132008e-06, "loss": 0.054, "step": 7503 }, { "epoch": 3.4140127388535033, "grad_norm": 1.1629613993775725, "learning_rate": 1.1425005949469118e-06, "loss": 0.0146, "step": 7504 }, { "epoch": 3.414467697907188, "grad_norm": 1.1940141108710431, "learning_rate": 1.1419005370093425e-06, "loss": 0.0415, "step": 7505 }, { "epoch": 3.4149226569608735, "grad_norm": 1.0663367831926178, "learning_rate": 1.1413005900495284e-06, "loss": 0.0503, "step": 7506 }, { "epoch": 3.415377616014559, "grad_norm": 0.9362988610794477, "learning_rate": 1.140700754116495e-06, "loss": 0.0178, "step": 7507 }, { "epoch": 3.4158325750682437, "grad_norm": 1.8523289006207317, "learning_rate": 1.1401010292592574e-06, "loss": 0.04, "step": 7508 }, { "epoch": 3.416287534121929, "grad_norm": 1.665488625462918, "learning_rate": 1.1395014155268225e-06, "loss": 0.0404, "step": 7509 }, { "epoch": 3.4167424931756143, "grad_norm": 1.5559218314251668, "learning_rate": 1.138901912968188e-06, "loss": 0.0099, "step": 7510 }, { "epoch": 3.417197452229299, "grad_norm": 1.1124294617889434, "learning_rate": 1.1383025216323418e-06, "loss": 0.0277, "step": 7511 }, { "epoch": 3.4176524112829845, "grad_norm": 0.7197888657147834, "learning_rate": 1.1377032415682648e-06, "loss": 0.0057, "step": 7512 }, { "epoch": 3.41810737033667, "grad_norm": 0.7193998967677554, "learning_rate": 1.1371040728249258e-06, "loss": 0.0146, "step": 7513 }, { "epoch": 3.4185623293903546, "grad_norm": 1.2765660592263472, "learning_rate": 1.1365050154512883e-06, "loss": 0.0255, "step": 7514 }, { "epoch": 3.41901728844404, "grad_norm": 1.336085842418711, "learning_rate": 1.1359060694963036e-06, "loss": 0.0278, "step": 7515 }, { "epoch": 3.4194722474977253, "grad_norm": 0.7947123111984269, "learning_rate": 1.1353072350089136e-06, "loss": 0.0073, "step": 7516 }, { "epoch": 3.41992720655141, "grad_norm": 0.6084383363510923, "learning_rate": 1.1347085120380543e-06, "loss": 0.0069, "step": 7517 }, { "epoch": 3.4203821656050954, "grad_norm": 1.4091395112259244, "learning_rate": 1.13410990063265e-06, "loss": 0.0197, "step": 7518 }, { "epoch": 3.4208371246587808, "grad_norm": 0.3799683285165179, "learning_rate": 1.1335114008416163e-06, "loss": 0.0034, "step": 7519 }, { "epoch": 3.421292083712466, "grad_norm": 0.8186864814484323, "learning_rate": 1.1329130127138588e-06, "loss": 0.0279, "step": 7520 }, { "epoch": 3.421747042766151, "grad_norm": 0.852314477097122, "learning_rate": 1.1323147362982761e-06, "loss": 0.0094, "step": 7521 }, { "epoch": 3.4222020018198362, "grad_norm": 1.079268764075382, "learning_rate": 1.1317165716437581e-06, "loss": 0.0117, "step": 7522 }, { "epoch": 3.4226569608735216, "grad_norm": 1.2080006195988457, "learning_rate": 1.1311185187991825e-06, "loss": 0.0144, "step": 7523 }, { "epoch": 3.4231119199272064, "grad_norm": 1.065636702960828, "learning_rate": 1.1305205778134195e-06, "loss": 0.0464, "step": 7524 }, { "epoch": 3.4235668789808917, "grad_norm": 1.2621265321170223, "learning_rate": 1.1299227487353297e-06, "loss": 0.032, "step": 7525 }, { "epoch": 3.424021838034577, "grad_norm": 0.778295292836877, "learning_rate": 1.1293250316137666e-06, "loss": 0.0064, "step": 7526 }, { "epoch": 3.424476797088262, "grad_norm": 1.1158000019033023, "learning_rate": 1.1287274264975711e-06, "loss": 0.032, "step": 7527 }, { "epoch": 3.4249317561419472, "grad_norm": 1.6263756520315245, "learning_rate": 1.1281299334355785e-06, "loss": 0.0311, "step": 7528 }, { "epoch": 3.4253867151956325, "grad_norm": 1.8932215747166534, "learning_rate": 1.1275325524766127e-06, "loss": 0.0174, "step": 7529 }, { "epoch": 3.4258416742493174, "grad_norm": 0.8757768664350393, "learning_rate": 1.1269352836694874e-06, "loss": 0.009, "step": 7530 }, { "epoch": 3.4262966333030027, "grad_norm": 1.1916951905969495, "learning_rate": 1.126338127063011e-06, "loss": 0.0053, "step": 7531 }, { "epoch": 3.426751592356688, "grad_norm": 1.5129255920278408, "learning_rate": 1.1257410827059795e-06, "loss": 0.0122, "step": 7532 }, { "epoch": 3.427206551410373, "grad_norm": 1.0269620410797726, "learning_rate": 1.1251441506471807e-06, "loss": 0.0142, "step": 7533 }, { "epoch": 3.427661510464058, "grad_norm": 1.3643047091562142, "learning_rate": 1.1245473309353922e-06, "loss": 0.0275, "step": 7534 }, { "epoch": 3.4281164695177435, "grad_norm": 0.2935488875480498, "learning_rate": 1.1239506236193843e-06, "loss": 0.0028, "step": 7535 }, { "epoch": 3.4285714285714284, "grad_norm": 0.897665873217888, "learning_rate": 1.1233540287479182e-06, "loss": 0.0224, "step": 7536 }, { "epoch": 3.4290263876251137, "grad_norm": 0.6770812288346897, "learning_rate": 1.122757546369744e-06, "loss": 0.0088, "step": 7537 }, { "epoch": 3.429481346678799, "grad_norm": 1.2286113456232162, "learning_rate": 1.1221611765336035e-06, "loss": 0.0203, "step": 7538 }, { "epoch": 3.4299363057324843, "grad_norm": 0.6426126453985682, "learning_rate": 1.1215649192882283e-06, "loss": 0.0171, "step": 7539 }, { "epoch": 3.430391264786169, "grad_norm": 0.8452797892274212, "learning_rate": 1.120968774682344e-06, "loss": 0.0181, "step": 7540 }, { "epoch": 3.4308462238398545, "grad_norm": 1.5729653816908136, "learning_rate": 1.120372742764663e-06, "loss": 0.0405, "step": 7541 }, { "epoch": 3.43130118289354, "grad_norm": 0.7419102653408403, "learning_rate": 1.1197768235838917e-06, "loss": 0.0247, "step": 7542 }, { "epoch": 3.4317561419472247, "grad_norm": 1.3342137807873393, "learning_rate": 1.1191810171887258e-06, "loss": 0.0248, "step": 7543 }, { "epoch": 3.43221110100091, "grad_norm": 0.9661288095714657, "learning_rate": 1.1185853236278513e-06, "loss": 0.0066, "step": 7544 }, { "epoch": 3.4326660600545953, "grad_norm": 0.9238860256588091, "learning_rate": 1.1179897429499447e-06, "loss": 0.0086, "step": 7545 }, { "epoch": 3.43312101910828, "grad_norm": 0.5409813772099663, "learning_rate": 1.1173942752036762e-06, "loss": 0.0065, "step": 7546 }, { "epoch": 3.4335759781619655, "grad_norm": 1.9728648308878758, "learning_rate": 1.1167989204377036e-06, "loss": 0.0522, "step": 7547 }, { "epoch": 3.434030937215651, "grad_norm": 0.9639081579824792, "learning_rate": 1.116203678700676e-06, "loss": 0.0306, "step": 7548 }, { "epoch": 3.4344858962693356, "grad_norm": 0.564248289391494, "learning_rate": 1.1156085500412355e-06, "loss": 0.0067, "step": 7549 }, { "epoch": 3.434940855323021, "grad_norm": 0.6771371362384193, "learning_rate": 1.1150135345080115e-06, "loss": 0.0086, "step": 7550 }, { "epoch": 3.4353958143767063, "grad_norm": 1.0315086293965683, "learning_rate": 1.1144186321496279e-06, "loss": 0.0147, "step": 7551 }, { "epoch": 3.435850773430391, "grad_norm": 1.021234880531866, "learning_rate": 1.113823843014696e-06, "loss": 0.0377, "step": 7552 }, { "epoch": 3.4363057324840764, "grad_norm": 1.9083271834338622, "learning_rate": 1.1132291671518203e-06, "loss": 0.0157, "step": 7553 }, { "epoch": 3.4367606915377618, "grad_norm": 1.1803774549724608, "learning_rate": 1.1126346046095932e-06, "loss": 0.0196, "step": 7554 }, { "epoch": 3.4372156505914466, "grad_norm": 1.2594437800166554, "learning_rate": 1.1120401554366012e-06, "loss": 0.0134, "step": 7555 }, { "epoch": 3.437670609645132, "grad_norm": 1.1023786272694245, "learning_rate": 1.1114458196814204e-06, "loss": 0.0185, "step": 7556 }, { "epoch": 3.4381255686988172, "grad_norm": 1.142739561106916, "learning_rate": 1.1108515973926168e-06, "loss": 0.0297, "step": 7557 }, { "epoch": 3.438580527752502, "grad_norm": 0.8457234802604334, "learning_rate": 1.110257488618747e-06, "loss": 0.0134, "step": 7558 }, { "epoch": 3.4390354868061874, "grad_norm": 1.555975474060645, "learning_rate": 1.1096634934083586e-06, "loss": 0.0492, "step": 7559 }, { "epoch": 3.4394904458598727, "grad_norm": 0.547462876682005, "learning_rate": 1.1090696118099914e-06, "loss": 0.0191, "step": 7560 }, { "epoch": 3.4399454049135576, "grad_norm": 0.6198949993333976, "learning_rate": 1.1084758438721744e-06, "loss": 0.0124, "step": 7561 }, { "epoch": 3.440400363967243, "grad_norm": 1.0396095598466795, "learning_rate": 1.1078821896434264e-06, "loss": 0.0215, "step": 7562 }, { "epoch": 3.4408553230209282, "grad_norm": 0.9281829149834743, "learning_rate": 1.10728864917226e-06, "loss": 0.0037, "step": 7563 }, { "epoch": 3.441310282074613, "grad_norm": 1.3583899463697107, "learning_rate": 1.1066952225071751e-06, "loss": 0.0078, "step": 7564 }, { "epoch": 3.4417652411282984, "grad_norm": 0.8043057267845712, "learning_rate": 1.1061019096966648e-06, "loss": 0.0145, "step": 7565 }, { "epoch": 3.4422202001819837, "grad_norm": 1.0017489839066425, "learning_rate": 1.1055087107892124e-06, "loss": 0.0053, "step": 7566 }, { "epoch": 3.4426751592356686, "grad_norm": 1.736474185842419, "learning_rate": 1.1049156258332903e-06, "loss": 0.0525, "step": 7567 }, { "epoch": 3.443130118289354, "grad_norm": 0.5257107017086458, "learning_rate": 1.1043226548773622e-06, "loss": 0.0059, "step": 7568 }, { "epoch": 3.443585077343039, "grad_norm": 1.0055803898936413, "learning_rate": 1.1037297979698837e-06, "loss": 0.0231, "step": 7569 }, { "epoch": 3.444040036396724, "grad_norm": 0.7489153966958186, "learning_rate": 1.1031370551593018e-06, "loss": 0.0162, "step": 7570 }, { "epoch": 3.4444949954504094, "grad_norm": 0.8152462829060104, "learning_rate": 1.1025444264940515e-06, "loss": 0.0081, "step": 7571 }, { "epoch": 3.4449499545040947, "grad_norm": 0.43314852142174576, "learning_rate": 1.10195191202256e-06, "loss": 0.0044, "step": 7572 }, { "epoch": 3.4454049135577796, "grad_norm": 1.3561838746936177, "learning_rate": 1.1013595117932437e-06, "loss": 0.0276, "step": 7573 }, { "epoch": 3.445859872611465, "grad_norm": 0.7640199012017233, "learning_rate": 1.1007672258545126e-06, "loss": 0.0141, "step": 7574 }, { "epoch": 3.44631483166515, "grad_norm": 2.12699712070388, "learning_rate": 1.100175054254765e-06, "loss": 0.0217, "step": 7575 }, { "epoch": 3.4467697907188355, "grad_norm": 1.121888384450576, "learning_rate": 1.0995829970423898e-06, "loss": 0.0356, "step": 7576 }, { "epoch": 3.4472247497725204, "grad_norm": 1.056297608596252, "learning_rate": 1.0989910542657686e-06, "loss": 0.03, "step": 7577 }, { "epoch": 3.4476797088262057, "grad_norm": 1.2318332670508079, "learning_rate": 1.0983992259732707e-06, "loss": 0.0234, "step": 7578 }, { "epoch": 3.448134667879891, "grad_norm": 1.3604619087054108, "learning_rate": 1.0978075122132592e-06, "loss": 0.0329, "step": 7579 }, { "epoch": 3.448589626933576, "grad_norm": 0.7264179350795699, "learning_rate": 1.0972159130340857e-06, "loss": 0.0039, "step": 7580 }, { "epoch": 3.449044585987261, "grad_norm": 1.5784113291971014, "learning_rate": 1.0966244284840926e-06, "loss": 0.0148, "step": 7581 }, { "epoch": 3.4494995450409465, "grad_norm": 0.9407083859068337, "learning_rate": 1.096033058611614e-06, "loss": 0.0417, "step": 7582 }, { "epoch": 3.4499545040946313, "grad_norm": 1.1028431203007958, "learning_rate": 1.0954418034649724e-06, "loss": 0.0101, "step": 7583 }, { "epoch": 3.4504094631483166, "grad_norm": 0.6724007485039663, "learning_rate": 1.0948506630924839e-06, "loss": 0.0104, "step": 7584 }, { "epoch": 3.450864422202002, "grad_norm": 1.9383738226065332, "learning_rate": 1.0942596375424544e-06, "loss": 0.0222, "step": 7585 }, { "epoch": 3.451319381255687, "grad_norm": 0.5708670024706882, "learning_rate": 1.093668726863179e-06, "loss": 0.0064, "step": 7586 }, { "epoch": 3.451774340309372, "grad_norm": 1.4358209913310123, "learning_rate": 1.0930779311029444e-06, "loss": 0.0056, "step": 7587 }, { "epoch": 3.4522292993630574, "grad_norm": 1.083546145314089, "learning_rate": 1.0924872503100268e-06, "loss": 0.0204, "step": 7588 }, { "epoch": 3.4526842584167423, "grad_norm": 1.1688331873145483, "learning_rate": 1.0918966845326955e-06, "loss": 0.0088, "step": 7589 }, { "epoch": 3.4531392174704276, "grad_norm": 1.2741470290161339, "learning_rate": 1.0913062338192076e-06, "loss": 0.0163, "step": 7590 }, { "epoch": 3.453594176524113, "grad_norm": 1.0137991173061025, "learning_rate": 1.0907158982178135e-06, "loss": 0.0155, "step": 7591 }, { "epoch": 3.4540491355777982, "grad_norm": 0.4400289001096548, "learning_rate": 1.0901256777767519e-06, "loss": 0.0044, "step": 7592 }, { "epoch": 3.454504094631483, "grad_norm": 1.5286846430395327, "learning_rate": 1.0895355725442519e-06, "loss": 0.0217, "step": 7593 }, { "epoch": 3.4549590536851684, "grad_norm": 1.1687773336237477, "learning_rate": 1.0889455825685364e-06, "loss": 0.0202, "step": 7594 }, { "epoch": 3.4554140127388537, "grad_norm": 0.8736527206583483, "learning_rate": 1.0883557078978155e-06, "loss": 0.0202, "step": 7595 }, { "epoch": 3.4558689717925386, "grad_norm": 1.2123948307403718, "learning_rate": 1.0877659485802914e-06, "loss": 0.0246, "step": 7596 }, { "epoch": 3.456323930846224, "grad_norm": 1.5374115486151494, "learning_rate": 1.0871763046641553e-06, "loss": 0.0491, "step": 7597 }, { "epoch": 3.4567788898999092, "grad_norm": 0.9336661516757412, "learning_rate": 1.0865867761975916e-06, "loss": 0.0293, "step": 7598 }, { "epoch": 3.457233848953594, "grad_norm": 0.9180597807575865, "learning_rate": 1.0859973632287742e-06, "loss": 0.0064, "step": 7599 }, { "epoch": 3.4576888080072794, "grad_norm": 1.3175079115874222, "learning_rate": 1.0854080658058669e-06, "loss": 0.0302, "step": 7600 }, { "epoch": 3.4581437670609647, "grad_norm": 1.181078086557783, "learning_rate": 1.084818883977024e-06, "loss": 0.0205, "step": 7601 }, { "epoch": 3.4585987261146496, "grad_norm": 0.9466874270124895, "learning_rate": 1.0842298177903904e-06, "loss": 0.0108, "step": 7602 }, { "epoch": 3.459053685168335, "grad_norm": 1.2714127528533903, "learning_rate": 1.0836408672941034e-06, "loss": 0.0183, "step": 7603 }, { "epoch": 3.45950864422202, "grad_norm": 0.7662604330651837, "learning_rate": 1.0830520325362876e-06, "loss": 0.0189, "step": 7604 }, { "epoch": 3.459963603275705, "grad_norm": 0.7506148375187652, "learning_rate": 1.0824633135650614e-06, "loss": 0.0123, "step": 7605 }, { "epoch": 3.4604185623293904, "grad_norm": 0.6705705364650779, "learning_rate": 1.081874710428532e-06, "loss": 0.009, "step": 7606 }, { "epoch": 3.4608735213830757, "grad_norm": 0.7737895773574421, "learning_rate": 1.081286223174796e-06, "loss": 0.0275, "step": 7607 }, { "epoch": 3.4613284804367606, "grad_norm": 2.1793165307462927, "learning_rate": 1.080697851851944e-06, "loss": 0.0109, "step": 7608 }, { "epoch": 3.461783439490446, "grad_norm": 1.3492741813622435, "learning_rate": 1.080109596508054e-06, "loss": 0.0148, "step": 7609 }, { "epoch": 3.462238398544131, "grad_norm": 1.396282488385764, "learning_rate": 1.0795214571911955e-06, "loss": 0.0211, "step": 7610 }, { "epoch": 3.462693357597816, "grad_norm": 0.48215166232524004, "learning_rate": 1.0789334339494278e-06, "loss": 0.0065, "step": 7611 }, { "epoch": 3.4631483166515014, "grad_norm": 1.718783490547032, "learning_rate": 1.0783455268308026e-06, "loss": 0.0492, "step": 7612 }, { "epoch": 3.4636032757051867, "grad_norm": 1.1473820629255156, "learning_rate": 1.0777577358833615e-06, "loss": 0.0441, "step": 7613 }, { "epoch": 3.4640582347588715, "grad_norm": 0.8998564604606949, "learning_rate": 1.0771700611551355e-06, "loss": 0.0116, "step": 7614 }, { "epoch": 3.464513193812557, "grad_norm": 0.7601557840452411, "learning_rate": 1.0765825026941467e-06, "loss": 0.0099, "step": 7615 }, { "epoch": 3.464968152866242, "grad_norm": 0.9142509384548829, "learning_rate": 1.075995060548408e-06, "loss": 0.0201, "step": 7616 }, { "epoch": 3.465423111919927, "grad_norm": 0.7487678170352101, "learning_rate": 1.0754077347659209e-06, "loss": 0.0195, "step": 7617 }, { "epoch": 3.4658780709736123, "grad_norm": 0.9856139199140111, "learning_rate": 1.0748205253946804e-06, "loss": 0.0212, "step": 7618 }, { "epoch": 3.4663330300272976, "grad_norm": 1.1061319799530613, "learning_rate": 1.0742334324826715e-06, "loss": 0.0105, "step": 7619 }, { "epoch": 3.4667879890809825, "grad_norm": 0.9630523475766942, "learning_rate": 1.0736464560778675e-06, "loss": 0.0269, "step": 7620 }, { "epoch": 3.467242948134668, "grad_norm": 1.7981743430567623, "learning_rate": 1.073059596228234e-06, "loss": 0.0294, "step": 7621 }, { "epoch": 3.467697907188353, "grad_norm": 1.735616561940244, "learning_rate": 1.0724728529817253e-06, "loss": 0.0091, "step": 7622 }, { "epoch": 3.468152866242038, "grad_norm": 0.7211017897946917, "learning_rate": 1.0718862263862892e-06, "loss": 0.0066, "step": 7623 }, { "epoch": 3.4686078252957233, "grad_norm": 1.0724074175195653, "learning_rate": 1.0712997164898616e-06, "loss": 0.0135, "step": 7624 }, { "epoch": 3.4690627843494086, "grad_norm": 0.7931169399051015, "learning_rate": 1.0707133233403682e-06, "loss": 0.0068, "step": 7625 }, { "epoch": 3.4695177434030935, "grad_norm": 0.9538079412890421, "learning_rate": 1.0701270469857282e-06, "loss": 0.0054, "step": 7626 }, { "epoch": 3.469972702456779, "grad_norm": 1.1008210945971122, "learning_rate": 1.069540887473848e-06, "loss": 0.0135, "step": 7627 }, { "epoch": 3.470427661510464, "grad_norm": 1.1837926318271457, "learning_rate": 1.0689548448526273e-06, "loss": 0.0261, "step": 7628 }, { "epoch": 3.470882620564149, "grad_norm": 0.9334458031290314, "learning_rate": 1.0683689191699544e-06, "loss": 0.0139, "step": 7629 }, { "epoch": 3.4713375796178343, "grad_norm": 0.9544200758885456, "learning_rate": 1.067783110473708e-06, "loss": 0.0191, "step": 7630 }, { "epoch": 3.4717925386715196, "grad_norm": 0.7051596476403937, "learning_rate": 1.0671974188117573e-06, "loss": 0.0088, "step": 7631 }, { "epoch": 3.472247497725205, "grad_norm": 0.5436967448130244, "learning_rate": 1.0666118442319628e-06, "loss": 0.0028, "step": 7632 }, { "epoch": 3.47270245677889, "grad_norm": 1.274911178482299, "learning_rate": 1.0660263867821763e-06, "loss": 0.0274, "step": 7633 }, { "epoch": 3.473157415832575, "grad_norm": 0.43913936154960076, "learning_rate": 1.0654410465102376e-06, "loss": 0.0136, "step": 7634 }, { "epoch": 3.4736123748862604, "grad_norm": 1.4013606943948484, "learning_rate": 1.0648558234639783e-06, "loss": 0.0097, "step": 7635 }, { "epoch": 3.4740673339399453, "grad_norm": 0.608365879366347, "learning_rate": 1.064270717691219e-06, "loss": 0.0328, "step": 7636 }, { "epoch": 3.4745222929936306, "grad_norm": 1.33568201619224, "learning_rate": 1.063685729239774e-06, "loss": 0.0172, "step": 7637 }, { "epoch": 3.474977252047316, "grad_norm": 0.8742969628768843, "learning_rate": 1.0631008581574448e-06, "loss": 0.0097, "step": 7638 }, { "epoch": 3.4754322111010008, "grad_norm": 1.0695038632969838, "learning_rate": 1.0625161044920238e-06, "loss": 0.0378, "step": 7639 }, { "epoch": 3.475887170154686, "grad_norm": 0.9768445810846939, "learning_rate": 1.0619314682912956e-06, "loss": 0.0215, "step": 7640 }, { "epoch": 3.4763421292083714, "grad_norm": 0.9582819102509673, "learning_rate": 1.0613469496030329e-06, "loss": 0.0182, "step": 7641 }, { "epoch": 3.4767970882620562, "grad_norm": 1.0027808434677195, "learning_rate": 1.0607625484750014e-06, "loss": 0.027, "step": 7642 }, { "epoch": 3.4772520473157416, "grad_norm": 1.0215469582043004, "learning_rate": 1.060178264954955e-06, "loss": 0.0227, "step": 7643 }, { "epoch": 3.477707006369427, "grad_norm": 0.9698754905467374, "learning_rate": 1.0595940990906387e-06, "loss": 0.0216, "step": 7644 }, { "epoch": 3.4781619654231117, "grad_norm": 0.47385934095437143, "learning_rate": 1.0590100509297866e-06, "loss": 0.0029, "step": 7645 }, { "epoch": 3.478616924476797, "grad_norm": 1.1176049853863823, "learning_rate": 1.058426120520126e-06, "loss": 0.0065, "step": 7646 }, { "epoch": 3.4790718835304824, "grad_norm": 1.2412097096472487, "learning_rate": 1.0578423079093734e-06, "loss": 0.0172, "step": 7647 }, { "epoch": 3.4795268425841677, "grad_norm": 1.1855130585761646, "learning_rate": 1.0572586131452347e-06, "loss": 0.0141, "step": 7648 }, { "epoch": 3.4799818016378525, "grad_norm": 1.5816483067947957, "learning_rate": 1.0566750362754069e-06, "loss": 0.0139, "step": 7649 }, { "epoch": 3.480436760691538, "grad_norm": 1.080577218601834, "learning_rate": 1.0560915773475761e-06, "loss": 0.0172, "step": 7650 }, { "epoch": 3.480891719745223, "grad_norm": 0.6230082059639668, "learning_rate": 1.0555082364094222e-06, "loss": 0.0047, "step": 7651 }, { "epoch": 3.481346678798908, "grad_norm": 3.3816962683379233, "learning_rate": 1.0549250135086114e-06, "loss": 0.0483, "step": 7652 }, { "epoch": 3.4818016378525933, "grad_norm": 1.3106493204760516, "learning_rate": 1.054341908692802e-06, "loss": 0.0076, "step": 7653 }, { "epoch": 3.4822565969062786, "grad_norm": 0.9589065274997015, "learning_rate": 1.0537589220096441e-06, "loss": 0.0241, "step": 7654 }, { "epoch": 3.4827115559599635, "grad_norm": 1.0424977955931523, "learning_rate": 1.0531760535067762e-06, "loss": 0.0167, "step": 7655 }, { "epoch": 3.483166515013649, "grad_norm": 1.333565394916533, "learning_rate": 1.0525933032318264e-06, "loss": 0.0375, "step": 7656 }, { "epoch": 3.483621474067334, "grad_norm": 1.5383047080993406, "learning_rate": 1.052010671232416e-06, "loss": 0.0528, "step": 7657 }, { "epoch": 3.484076433121019, "grad_norm": 1.2332256640266677, "learning_rate": 1.051428157556155e-06, "loss": 0.0106, "step": 7658 }, { "epoch": 3.4845313921747043, "grad_norm": 1.0257674320747174, "learning_rate": 1.050845762250643e-06, "loss": 0.0247, "step": 7659 }, { "epoch": 3.4849863512283896, "grad_norm": 1.4980019307985468, "learning_rate": 1.05026348536347e-06, "loss": 0.0246, "step": 7660 }, { "epoch": 3.4854413102820745, "grad_norm": 1.1465821754182801, "learning_rate": 1.049681326942218e-06, "loss": 0.0165, "step": 7661 }, { "epoch": 3.48589626933576, "grad_norm": 1.231825246747117, "learning_rate": 1.0490992870344593e-06, "loss": 0.0166, "step": 7662 }, { "epoch": 3.486351228389445, "grad_norm": 1.0992466287263791, "learning_rate": 1.0485173656877547e-06, "loss": 0.0093, "step": 7663 }, { "epoch": 3.48680618744313, "grad_norm": 0.6608297971823364, "learning_rate": 1.0479355629496563e-06, "loss": 0.0079, "step": 7664 }, { "epoch": 3.4872611464968153, "grad_norm": 0.8887126679550241, "learning_rate": 1.0473538788677051e-06, "loss": 0.0076, "step": 7665 }, { "epoch": 3.4877161055505006, "grad_norm": 0.9737065446886256, "learning_rate": 1.0467723134894359e-06, "loss": 0.0083, "step": 7666 }, { "epoch": 3.4881710646041855, "grad_norm": 0.6576888667925646, "learning_rate": 1.0461908668623697e-06, "loss": 0.004, "step": 7667 }, { "epoch": 3.488626023657871, "grad_norm": 0.7538476136142855, "learning_rate": 1.0456095390340213e-06, "loss": 0.0064, "step": 7668 }, { "epoch": 3.489080982711556, "grad_norm": 1.0810843490198643, "learning_rate": 1.0450283300518933e-06, "loss": 0.015, "step": 7669 }, { "epoch": 3.489535941765241, "grad_norm": 1.1558948858647722, "learning_rate": 1.0444472399634786e-06, "loss": 0.0204, "step": 7670 }, { "epoch": 3.4899909008189263, "grad_norm": 0.8841678074315, "learning_rate": 1.0438662688162635e-06, "loss": 0.007, "step": 7671 }, { "epoch": 3.4904458598726116, "grad_norm": 1.5007405164941998, "learning_rate": 1.0432854166577207e-06, "loss": 0.0167, "step": 7672 }, { "epoch": 3.4909008189262964, "grad_norm": 0.9730361955436946, "learning_rate": 1.0427046835353154e-06, "loss": 0.0077, "step": 7673 }, { "epoch": 3.4913557779799818, "grad_norm": 1.0494249575339112, "learning_rate": 1.0421240694965012e-06, "loss": 0.013, "step": 7674 }, { "epoch": 3.491810737033667, "grad_norm": 0.9315880181612078, "learning_rate": 1.0415435745887245e-06, "loss": 0.0095, "step": 7675 }, { "epoch": 3.492265696087352, "grad_norm": 1.1897652811127588, "learning_rate": 1.0409631988594216e-06, "loss": 0.0083, "step": 7676 }, { "epoch": 3.4927206551410372, "grad_norm": 0.7245849277052571, "learning_rate": 1.0403829423560168e-06, "loss": 0.0242, "step": 7677 }, { "epoch": 3.4931756141947226, "grad_norm": 1.2556427030036703, "learning_rate": 1.0398028051259266e-06, "loss": 0.0211, "step": 7678 }, { "epoch": 3.4936305732484074, "grad_norm": 1.9221613771589314, "learning_rate": 1.0392227872165557e-06, "loss": 0.0195, "step": 7679 }, { "epoch": 3.4940855323020927, "grad_norm": 0.8344162586338595, "learning_rate": 1.038642888675303e-06, "loss": 0.0081, "step": 7680 }, { "epoch": 3.494540491355778, "grad_norm": 0.9858997330015975, "learning_rate": 1.0380631095495532e-06, "loss": 0.0178, "step": 7681 }, { "epoch": 3.494995450409463, "grad_norm": 1.3689331779067377, "learning_rate": 1.037483449886685e-06, "loss": 0.005, "step": 7682 }, { "epoch": 3.4954504094631482, "grad_norm": 2.143134684655348, "learning_rate": 1.0369039097340644e-06, "loss": 0.0084, "step": 7683 }, { "epoch": 3.4959053685168335, "grad_norm": 1.4747613114665976, "learning_rate": 1.036324489139048e-06, "loss": 0.0218, "step": 7684 }, { "epoch": 3.496360327570519, "grad_norm": 1.158843355278959, "learning_rate": 1.0357451881489858e-06, "loss": 0.0242, "step": 7685 }, { "epoch": 3.4968152866242037, "grad_norm": 0.8044638678328448, "learning_rate": 1.0351660068112138e-06, "loss": 0.0185, "step": 7686 }, { "epoch": 3.497270245677889, "grad_norm": 1.3727399538676173, "learning_rate": 1.0345869451730609e-06, "loss": 0.0211, "step": 7687 }, { "epoch": 3.4977252047315743, "grad_norm": 0.6553374396089223, "learning_rate": 1.0340080032818442e-06, "loss": 0.0157, "step": 7688 }, { "epoch": 3.498180163785259, "grad_norm": 0.7651683629378062, "learning_rate": 1.0334291811848736e-06, "loss": 0.0121, "step": 7689 }, { "epoch": 3.4986351228389445, "grad_norm": 0.344766360602028, "learning_rate": 1.032850478929447e-06, "loss": 0.0033, "step": 7690 }, { "epoch": 3.49909008189263, "grad_norm": 0.9305468774854447, "learning_rate": 1.0322718965628542e-06, "loss": 0.018, "step": 7691 }, { "epoch": 3.4995450409463147, "grad_norm": 1.2396210831864773, "learning_rate": 1.031693434132374e-06, "loss": 0.0191, "step": 7692 }, { "epoch": 3.5, "grad_norm": 1.220410263207243, "learning_rate": 1.0311150916852755e-06, "loss": 0.0378, "step": 7693 }, { "epoch": 3.5004549590536853, "grad_norm": 1.954840033956282, "learning_rate": 1.0305368692688175e-06, "loss": 0.0087, "step": 7694 }, { "epoch": 3.50090991810737, "grad_norm": 0.8049267810027169, "learning_rate": 1.0299587669302501e-06, "loss": 0.0116, "step": 7695 }, { "epoch": 3.5013648771610555, "grad_norm": 0.9511345074107113, "learning_rate": 1.029380784716815e-06, "loss": 0.0131, "step": 7696 }, { "epoch": 3.501819836214741, "grad_norm": 1.5598034218237522, "learning_rate": 1.0288029226757407e-06, "loss": 0.065, "step": 7697 }, { "epoch": 3.502274795268426, "grad_norm": 1.0513234784011467, "learning_rate": 1.0282251808542476e-06, "loss": 0.012, "step": 7698 }, { "epoch": 3.502729754322111, "grad_norm": 0.7538599950899834, "learning_rate": 1.0276475592995455e-06, "loss": 0.0122, "step": 7699 }, { "epoch": 3.5031847133757963, "grad_norm": 1.4843540789543792, "learning_rate": 1.0270700580588367e-06, "loss": 0.0299, "step": 7700 }, { "epoch": 3.5036396724294816, "grad_norm": 1.681108900024952, "learning_rate": 1.026492677179311e-06, "loss": 0.0143, "step": 7701 }, { "epoch": 3.5040946314831665, "grad_norm": 1.1565822667303305, "learning_rate": 1.0259154167081484e-06, "loss": 0.0076, "step": 7702 }, { "epoch": 3.5045495905368518, "grad_norm": 0.9533356926042086, "learning_rate": 1.0253382766925222e-06, "loss": 0.0078, "step": 7703 }, { "epoch": 3.505004549590537, "grad_norm": 0.801580898924158, "learning_rate": 1.0247612571795914e-06, "loss": 0.0067, "step": 7704 }, { "epoch": 3.505459508644222, "grad_norm": 0.5013000441989857, "learning_rate": 1.0241843582165095e-06, "loss": 0.007, "step": 7705 }, { "epoch": 3.5059144676979073, "grad_norm": 1.2671952813494636, "learning_rate": 1.0236075798504172e-06, "loss": 0.0254, "step": 7706 }, { "epoch": 3.5063694267515926, "grad_norm": 0.7090946474861222, "learning_rate": 1.023030922128446e-06, "loss": 0.0155, "step": 7707 }, { "epoch": 3.5068243858052774, "grad_norm": 1.394955358793103, "learning_rate": 1.022454385097717e-06, "loss": 0.0255, "step": 7708 }, { "epoch": 3.5072793448589628, "grad_norm": 0.7074739261565995, "learning_rate": 1.021877968805343e-06, "loss": 0.008, "step": 7709 }, { "epoch": 3.507734303912648, "grad_norm": 1.2293289162421774, "learning_rate": 1.0213016732984276e-06, "loss": 0.0058, "step": 7710 }, { "epoch": 3.508189262966333, "grad_norm": 0.7168579698697297, "learning_rate": 1.0207254986240615e-06, "loss": 0.0066, "step": 7711 }, { "epoch": 3.5086442220200182, "grad_norm": 0.8550777438203389, "learning_rate": 1.0201494448293272e-06, "loss": 0.0097, "step": 7712 }, { "epoch": 3.5090991810737036, "grad_norm": 1.032010574896032, "learning_rate": 1.0195735119612965e-06, "loss": 0.0199, "step": 7713 }, { "epoch": 3.5095541401273884, "grad_norm": 0.9946250560942277, "learning_rate": 1.0189977000670338e-06, "loss": 0.0137, "step": 7714 }, { "epoch": 3.5100090991810737, "grad_norm": 0.7785055283759919, "learning_rate": 1.0184220091935906e-06, "loss": 0.0117, "step": 7715 }, { "epoch": 3.510464058234759, "grad_norm": 1.4324817328912762, "learning_rate": 1.0178464393880095e-06, "loss": 0.0151, "step": 7716 }, { "epoch": 3.510919017288444, "grad_norm": 1.1738441491379117, "learning_rate": 1.017270990697325e-06, "loss": 0.0085, "step": 7717 }, { "epoch": 3.511373976342129, "grad_norm": 1.1474628170829864, "learning_rate": 1.0166956631685578e-06, "loss": 0.015, "step": 7718 }, { "epoch": 3.5118289353958145, "grad_norm": 0.6238007890583428, "learning_rate": 1.016120456848724e-06, "loss": 0.0055, "step": 7719 }, { "epoch": 3.5122838944494994, "grad_norm": 1.9164035809335525, "learning_rate": 1.015545371784825e-06, "loss": 0.0169, "step": 7720 }, { "epoch": 3.5127388535031847, "grad_norm": 0.8176357827228384, "learning_rate": 1.0149704080238542e-06, "loss": 0.009, "step": 7721 }, { "epoch": 3.51319381255687, "grad_norm": 0.805818806924173, "learning_rate": 1.0143955656127958e-06, "loss": 0.0087, "step": 7722 }, { "epoch": 3.513648771610555, "grad_norm": 0.9398012338766545, "learning_rate": 1.0138208445986208e-06, "loss": 0.0144, "step": 7723 }, { "epoch": 3.51410373066424, "grad_norm": 0.6559698273249314, "learning_rate": 1.0132462450282969e-06, "loss": 0.0062, "step": 7724 }, { "epoch": 3.5145586897179255, "grad_norm": 1.7629048920949333, "learning_rate": 1.0126717669487753e-06, "loss": 0.0086, "step": 7725 }, { "epoch": 3.5150136487716104, "grad_norm": 1.2064301759580822, "learning_rate": 1.0120974104070005e-06, "loss": 0.0235, "step": 7726 }, { "epoch": 3.5154686078252957, "grad_norm": 1.24844426096374, "learning_rate": 1.011523175449906e-06, "loss": 0.0104, "step": 7727 }, { "epoch": 3.515923566878981, "grad_norm": 0.6948756435755419, "learning_rate": 1.0109490621244148e-06, "loss": 0.0157, "step": 7728 }, { "epoch": 3.516378525932666, "grad_norm": 1.2997748981107677, "learning_rate": 1.0103750704774427e-06, "loss": 0.0329, "step": 7729 }, { "epoch": 3.516833484986351, "grad_norm": 1.3931486004169293, "learning_rate": 1.0098012005558916e-06, "loss": 0.0304, "step": 7730 }, { "epoch": 3.5172884440400365, "grad_norm": 1.1322725090010088, "learning_rate": 1.0092274524066578e-06, "loss": 0.0153, "step": 7731 }, { "epoch": 3.5177434030937214, "grad_norm": 0.9432593713575499, "learning_rate": 1.0086538260766243e-06, "loss": 0.0084, "step": 7732 }, { "epoch": 3.5181983621474067, "grad_norm": 1.6212433285786017, "learning_rate": 1.0080803216126644e-06, "loss": 0.0099, "step": 7733 }, { "epoch": 3.518653321201092, "grad_norm": 1.161877111686535, "learning_rate": 1.007506939061644e-06, "loss": 0.0294, "step": 7734 }, { "epoch": 3.519108280254777, "grad_norm": 0.863535522321409, "learning_rate": 1.0069336784704165e-06, "loss": 0.0242, "step": 7735 }, { "epoch": 3.519563239308462, "grad_norm": 0.7412040987141346, "learning_rate": 1.0063605398858261e-06, "loss": 0.0055, "step": 7736 }, { "epoch": 3.5200181983621475, "grad_norm": 1.1678953782121888, "learning_rate": 1.0057875233547066e-06, "loss": 0.0371, "step": 7737 }, { "epoch": 3.5204731574158323, "grad_norm": 0.9174388430719123, "learning_rate": 1.0052146289238826e-06, "loss": 0.0166, "step": 7738 }, { "epoch": 3.5209281164695176, "grad_norm": 1.3304556978151538, "learning_rate": 1.0046418566401698e-06, "loss": 0.0132, "step": 7739 }, { "epoch": 3.521383075523203, "grad_norm": 0.9619118862395893, "learning_rate": 1.0040692065503712e-06, "loss": 0.0093, "step": 7740 }, { "epoch": 3.521838034576888, "grad_norm": 0.963569873624518, "learning_rate": 1.0034966787012817e-06, "loss": 0.0116, "step": 7741 }, { "epoch": 3.522292993630573, "grad_norm": 1.1032718832851574, "learning_rate": 1.0029242731396847e-06, "loss": 0.0326, "step": 7742 }, { "epoch": 3.5227479526842584, "grad_norm": 1.2749158646892182, "learning_rate": 1.002351989912356e-06, "loss": 0.0485, "step": 7743 }, { "epoch": 3.5232029117379433, "grad_norm": 1.2228894103453491, "learning_rate": 1.0017798290660585e-06, "loss": 0.0309, "step": 7744 }, { "epoch": 3.5236578707916286, "grad_norm": 0.9714596797508632, "learning_rate": 1.0012077906475484e-06, "loss": 0.0328, "step": 7745 }, { "epoch": 3.524112829845314, "grad_norm": 1.5637713756802458, "learning_rate": 1.0006358747035692e-06, "loss": 0.0117, "step": 7746 }, { "epoch": 3.5245677888989992, "grad_norm": 1.1671534542690294, "learning_rate": 1.0000640812808543e-06, "loss": 0.0181, "step": 7747 }, { "epoch": 3.525022747952684, "grad_norm": 0.9599031552646553, "learning_rate": 9.9949241042613e-07, "loss": 0.0123, "step": 7748 }, { "epoch": 3.5254777070063694, "grad_norm": 1.3277823651114902, "learning_rate": 9.989208621861096e-07, "loss": 0.0421, "step": 7749 }, { "epoch": 3.5259326660600547, "grad_norm": 0.8719279462481612, "learning_rate": 9.983494366074975e-07, "loss": 0.0159, "step": 7750 }, { "epoch": 3.5263876251137396, "grad_norm": 1.4955898779667576, "learning_rate": 9.977781337369875e-07, "loss": 0.0323, "step": 7751 }, { "epoch": 3.526842584167425, "grad_norm": 1.3890706628306566, "learning_rate": 9.972069536212638e-07, "loss": 0.0316, "step": 7752 }, { "epoch": 3.52729754322111, "grad_norm": 1.0013076921228485, "learning_rate": 9.966358963070027e-07, "loss": 0.0262, "step": 7753 }, { "epoch": 3.5277525022747955, "grad_norm": 1.1825546837607122, "learning_rate": 9.96064961840867e-07, "loss": 0.0146, "step": 7754 }, { "epoch": 3.5282074613284804, "grad_norm": 1.2429644656579366, "learning_rate": 9.954941502695106e-07, "loss": 0.023, "step": 7755 }, { "epoch": 3.5286624203821657, "grad_norm": 0.7610671542235654, "learning_rate": 9.949234616395773e-07, "loss": 0.0115, "step": 7756 }, { "epoch": 3.529117379435851, "grad_norm": 1.153447591324199, "learning_rate": 9.943528959977028e-07, "loss": 0.0308, "step": 7757 }, { "epoch": 3.529572338489536, "grad_norm": 0.664059432388529, "learning_rate": 9.937824533905092e-07, "loss": 0.0044, "step": 7758 }, { "epoch": 3.530027297543221, "grad_norm": 1.4218939106516548, "learning_rate": 9.932121338646122e-07, "loss": 0.0282, "step": 7759 }, { "epoch": 3.5304822565969065, "grad_norm": 0.9721892136660684, "learning_rate": 9.926419374666152e-07, "loss": 0.0165, "step": 7760 }, { "epoch": 3.5309372156505914, "grad_norm": 1.091087884037172, "learning_rate": 9.92071864243112e-07, "loss": 0.0434, "step": 7761 }, { "epoch": 3.5313921747042767, "grad_norm": 1.5357649883011046, "learning_rate": 9.915019142406854e-07, "loss": 0.0111, "step": 7762 }, { "epoch": 3.531847133757962, "grad_norm": 1.63137831156848, "learning_rate": 9.90932087505911e-07, "loss": 0.0478, "step": 7763 }, { "epoch": 3.532302092811647, "grad_norm": 0.6923371848901749, "learning_rate": 9.90362384085351e-07, "loss": 0.0079, "step": 7764 }, { "epoch": 3.532757051865332, "grad_norm": 1.1298655005296938, "learning_rate": 9.897928040255592e-07, "loss": 0.0161, "step": 7765 }, { "epoch": 3.5332120109190175, "grad_norm": 0.7930144092768824, "learning_rate": 9.8922334737308e-07, "loss": 0.0186, "step": 7766 }, { "epoch": 3.5336669699727024, "grad_norm": 0.7699108634973774, "learning_rate": 9.886540141744456e-07, "loss": 0.0076, "step": 7767 }, { "epoch": 3.5341219290263877, "grad_norm": 0.716710878748384, "learning_rate": 9.880848044761806e-07, "loss": 0.0105, "step": 7768 }, { "epoch": 3.534576888080073, "grad_norm": 0.9373815048737973, "learning_rate": 9.875157183247977e-07, "loss": 0.0149, "step": 7769 }, { "epoch": 3.535031847133758, "grad_norm": 1.1759647962953055, "learning_rate": 9.869467557668002e-07, "loss": 0.0321, "step": 7770 }, { "epoch": 3.535486806187443, "grad_norm": 0.7231821486492475, "learning_rate": 9.863779168486797e-07, "loss": 0.0074, "step": 7771 }, { "epoch": 3.5359417652411285, "grad_norm": 0.9676953102958026, "learning_rate": 9.858092016169207e-07, "loss": 0.0201, "step": 7772 }, { "epoch": 3.5363967242948133, "grad_norm": 0.7469678260568903, "learning_rate": 9.852406101179964e-07, "loss": 0.0112, "step": 7773 }, { "epoch": 3.5368516833484986, "grad_norm": 1.222737143731747, "learning_rate": 9.846721423983692e-07, "loss": 0.0186, "step": 7774 }, { "epoch": 3.537306642402184, "grad_norm": 1.1358345864804333, "learning_rate": 9.841037985044907e-07, "loss": 0.0354, "step": 7775 }, { "epoch": 3.537761601455869, "grad_norm": 1.2853091317372318, "learning_rate": 9.835355784828038e-07, "loss": 0.0347, "step": 7776 }, { "epoch": 3.538216560509554, "grad_norm": 1.196143367039387, "learning_rate": 9.829674823797417e-07, "loss": 0.0335, "step": 7777 }, { "epoch": 3.5386715195632394, "grad_norm": 1.0976808277189318, "learning_rate": 9.82399510241726e-07, "loss": 0.0141, "step": 7778 }, { "epoch": 3.5391264786169243, "grad_norm": 0.6049034186296854, "learning_rate": 9.818316621151683e-07, "loss": 0.0071, "step": 7779 }, { "epoch": 3.5395814376706096, "grad_norm": 1.323457936498297, "learning_rate": 9.81263938046472e-07, "loss": 0.0255, "step": 7780 }, { "epoch": 3.540036396724295, "grad_norm": 0.7619059501873704, "learning_rate": 9.806963380820271e-07, "loss": 0.0244, "step": 7781 }, { "epoch": 3.54049135577798, "grad_norm": 0.8704199216158572, "learning_rate": 9.801288622682172e-07, "loss": 0.0275, "step": 7782 }, { "epoch": 3.540946314831665, "grad_norm": 0.29461004439455396, "learning_rate": 9.795615106514133e-07, "loss": 0.0023, "step": 7783 }, { "epoch": 3.5414012738853504, "grad_norm": 0.6561849562803252, "learning_rate": 9.789942832779765e-07, "loss": 0.0056, "step": 7784 }, { "epoch": 3.5418562329390353, "grad_norm": 0.9876665036035394, "learning_rate": 9.784271801942568e-07, "loss": 0.0092, "step": 7785 }, { "epoch": 3.5423111919927206, "grad_norm": 1.4603967885606148, "learning_rate": 9.778602014465968e-07, "loss": 0.0184, "step": 7786 }, { "epoch": 3.542766151046406, "grad_norm": 0.5381536476927186, "learning_rate": 9.772933470813281e-07, "loss": 0.0063, "step": 7787 }, { "epoch": 3.5432211101000908, "grad_norm": 1.273589762176134, "learning_rate": 9.767266171447706e-07, "loss": 0.0195, "step": 7788 }, { "epoch": 3.543676069153776, "grad_norm": 0.9884570377605886, "learning_rate": 9.761600116832347e-07, "loss": 0.0196, "step": 7789 }, { "epoch": 3.5441310282074614, "grad_norm": 0.6962434836949343, "learning_rate": 9.755935307430203e-07, "loss": 0.0105, "step": 7790 }, { "epoch": 3.5445859872611463, "grad_norm": 1.0839533560712697, "learning_rate": 9.750271743704195e-07, "loss": 0.0066, "step": 7791 }, { "epoch": 3.5450409463148316, "grad_norm": 1.447256633469542, "learning_rate": 9.74460942611711e-07, "loss": 0.0136, "step": 7792 }, { "epoch": 3.545495905368517, "grad_norm": 1.6807030310165316, "learning_rate": 9.738948355131642e-07, "loss": 0.0187, "step": 7793 }, { "epoch": 3.5459508644222018, "grad_norm": 1.2004314605177084, "learning_rate": 9.733288531210406e-07, "loss": 0.0164, "step": 7794 }, { "epoch": 3.546405823475887, "grad_norm": 1.0450933560773255, "learning_rate": 9.72762995481588e-07, "loss": 0.0105, "step": 7795 }, { "epoch": 3.5468607825295724, "grad_norm": 1.317028704389055, "learning_rate": 9.72197262641047e-07, "loss": 0.0539, "step": 7796 }, { "epoch": 3.5473157415832572, "grad_norm": 1.080643473761697, "learning_rate": 9.716316546456462e-07, "loss": 0.0187, "step": 7797 }, { "epoch": 3.5477707006369426, "grad_norm": 1.3621619234966162, "learning_rate": 9.710661715416048e-07, "loss": 0.0186, "step": 7798 }, { "epoch": 3.548225659690628, "grad_norm": 0.6479791699616033, "learning_rate": 9.70500813375131e-07, "loss": 0.0252, "step": 7799 }, { "epoch": 3.548680618744313, "grad_norm": 1.7492732810144815, "learning_rate": 9.69935580192423e-07, "loss": 0.0305, "step": 7800 }, { "epoch": 3.549135577797998, "grad_norm": 1.3772640359999744, "learning_rate": 9.693704720396693e-07, "loss": 0.0226, "step": 7801 }, { "epoch": 3.5495905368516834, "grad_norm": 0.9331787390935692, "learning_rate": 9.688054889630493e-07, "loss": 0.0139, "step": 7802 }, { "epoch": 3.5500454959053687, "grad_norm": 0.8404188932810707, "learning_rate": 9.682406310087304e-07, "loss": 0.0206, "step": 7803 }, { "epoch": 3.5505004549590535, "grad_norm": 1.3460393329028353, "learning_rate": 9.676758982228693e-07, "loss": 0.0444, "step": 7804 }, { "epoch": 3.550955414012739, "grad_norm": 0.772788178343164, "learning_rate": 9.67111290651613e-07, "loss": 0.019, "step": 7805 }, { "epoch": 3.551410373066424, "grad_norm": 1.0008086744355513, "learning_rate": 9.665468083411005e-07, "loss": 0.0102, "step": 7806 }, { "epoch": 3.5518653321201095, "grad_norm": 0.5573363946539817, "learning_rate": 9.659824513374572e-07, "loss": 0.0051, "step": 7807 }, { "epoch": 3.5523202911737943, "grad_norm": 0.8931039711616251, "learning_rate": 9.654182196868012e-07, "loss": 0.0117, "step": 7808 }, { "epoch": 3.5527752502274796, "grad_norm": 0.9675677939637315, "learning_rate": 9.648541134352379e-07, "loss": 0.0186, "step": 7809 }, { "epoch": 3.553230209281165, "grad_norm": 0.5198082842427189, "learning_rate": 9.642901326288631e-07, "loss": 0.0043, "step": 7810 }, { "epoch": 3.55368516833485, "grad_norm": 0.8301891840510133, "learning_rate": 9.637262773137642e-07, "loss": 0.0211, "step": 7811 }, { "epoch": 3.554140127388535, "grad_norm": 0.9540044583133775, "learning_rate": 9.631625475360166e-07, "loss": 0.0223, "step": 7812 }, { "epoch": 3.5545950864422204, "grad_norm": 0.8962736322921374, "learning_rate": 9.625989433416848e-07, "loss": 0.0039, "step": 7813 }, { "epoch": 3.5550500454959053, "grad_norm": 0.5438833206752809, "learning_rate": 9.62035464776824e-07, "loss": 0.0027, "step": 7814 }, { "epoch": 3.5555050045495906, "grad_norm": 1.15212007754663, "learning_rate": 9.614721118874796e-07, "loss": 0.0233, "step": 7815 }, { "epoch": 3.555959963603276, "grad_norm": 1.221956177229352, "learning_rate": 9.609088847196869e-07, "loss": 0.0216, "step": 7816 }, { "epoch": 3.556414922656961, "grad_norm": 1.1598308009722635, "learning_rate": 9.603457833194698e-07, "loss": 0.0448, "step": 7817 }, { "epoch": 3.556869881710646, "grad_norm": 1.147803936361469, "learning_rate": 9.597828077328422e-07, "loss": 0.0213, "step": 7818 }, { "epoch": 3.5573248407643314, "grad_norm": 0.9409827623267729, "learning_rate": 9.592199580058073e-07, "loss": 0.0142, "step": 7819 }, { "epoch": 3.5577797998180163, "grad_norm": 0.22302078239042936, "learning_rate": 9.5865723418436e-07, "loss": 0.001, "step": 7820 }, { "epoch": 3.5582347588717016, "grad_norm": 0.6100587722390832, "learning_rate": 9.580946363144822e-07, "loss": 0.0052, "step": 7821 }, { "epoch": 3.558689717925387, "grad_norm": 0.862927372793996, "learning_rate": 9.575321644421482e-07, "loss": 0.0181, "step": 7822 }, { "epoch": 3.5591446769790718, "grad_norm": 1.0454432921017172, "learning_rate": 9.569698186133204e-07, "loss": 0.0487, "step": 7823 }, { "epoch": 3.559599636032757, "grad_norm": 1.351041486558695, "learning_rate": 9.564075988739494e-07, "loss": 0.0183, "step": 7824 }, { "epoch": 3.5600545950864424, "grad_norm": 0.9630946146677668, "learning_rate": 9.558455052699797e-07, "loss": 0.013, "step": 7825 }, { "epoch": 3.5605095541401273, "grad_norm": 0.924497963177177, "learning_rate": 9.552835378473418e-07, "loss": 0.0199, "step": 7826 }, { "epoch": 3.5609645131938126, "grad_norm": 1.2411743552879084, "learning_rate": 9.547216966519575e-07, "loss": 0.0267, "step": 7827 }, { "epoch": 3.561419472247498, "grad_norm": 0.918505608389065, "learning_rate": 9.54159981729737e-07, "loss": 0.0385, "step": 7828 }, { "epoch": 3.5618744313011828, "grad_norm": 0.5540340172509823, "learning_rate": 9.535983931265816e-07, "loss": 0.0044, "step": 7829 }, { "epoch": 3.562329390354868, "grad_norm": 1.4917964778269868, "learning_rate": 9.53036930888383e-07, "loss": 0.0174, "step": 7830 }, { "epoch": 3.5627843494085534, "grad_norm": 0.485404446150419, "learning_rate": 9.524755950610204e-07, "loss": 0.0045, "step": 7831 }, { "epoch": 3.5632393084622382, "grad_norm": 0.709207699421339, "learning_rate": 9.519143856903634e-07, "loss": 0.0141, "step": 7832 }, { "epoch": 3.5636942675159236, "grad_norm": 0.6470269407717185, "learning_rate": 9.513533028222719e-07, "loss": 0.008, "step": 7833 }, { "epoch": 3.564149226569609, "grad_norm": 0.8649028178731235, "learning_rate": 9.507923465025939e-07, "loss": 0.008, "step": 7834 }, { "epoch": 3.5646041856232937, "grad_norm": 0.8811276541717554, "learning_rate": 9.502315167771695e-07, "loss": 0.011, "step": 7835 }, { "epoch": 3.565059144676979, "grad_norm": 1.3536352431151075, "learning_rate": 9.496708136918273e-07, "loss": 0.0275, "step": 7836 }, { "epoch": 3.5655141037306644, "grad_norm": 2.116713213713124, "learning_rate": 9.491102372923852e-07, "loss": 0.0181, "step": 7837 }, { "epoch": 3.565969062784349, "grad_norm": 1.634084549668978, "learning_rate": 9.485497876246508e-07, "loss": 0.0108, "step": 7838 }, { "epoch": 3.5664240218380345, "grad_norm": 0.6891324460247287, "learning_rate": 9.479894647344204e-07, "loss": 0.012, "step": 7839 }, { "epoch": 3.56687898089172, "grad_norm": 0.8688955403832235, "learning_rate": 9.474292686674832e-07, "loss": 0.0037, "step": 7840 }, { "epoch": 3.5673339399454047, "grad_norm": 0.5638917195188292, "learning_rate": 9.468691994696147e-07, "loss": 0.0075, "step": 7841 }, { "epoch": 3.56778889899909, "grad_norm": 0.7217378089943544, "learning_rate": 9.463092571865804e-07, "loss": 0.0156, "step": 7842 }, { "epoch": 3.5682438580527753, "grad_norm": 1.185538355200751, "learning_rate": 9.457494418641383e-07, "loss": 0.022, "step": 7843 }, { "epoch": 3.56869881710646, "grad_norm": 0.42185498644971076, "learning_rate": 9.451897535480318e-07, "loss": 0.0042, "step": 7844 }, { "epoch": 3.5691537761601455, "grad_norm": 1.506420011631079, "learning_rate": 9.446301922839981e-07, "loss": 0.0221, "step": 7845 }, { "epoch": 3.569608735213831, "grad_norm": 0.9827016398049638, "learning_rate": 9.440707581177611e-07, "loss": 0.0077, "step": 7846 }, { "epoch": 3.5700636942675157, "grad_norm": 1.1586969442384674, "learning_rate": 9.435114510950353e-07, "loss": 0.0168, "step": 7847 }, { "epoch": 3.570518653321201, "grad_norm": 1.2634829875832247, "learning_rate": 9.429522712615238e-07, "loss": 0.0105, "step": 7848 }, { "epoch": 3.5709736123748863, "grad_norm": 0.47586430539057256, "learning_rate": 9.423932186629208e-07, "loss": 0.0055, "step": 7849 }, { "epoch": 3.571428571428571, "grad_norm": 0.6123307915215965, "learning_rate": 9.418342933449112e-07, "loss": 0.004, "step": 7850 }, { "epoch": 3.5718835304822565, "grad_norm": 1.4783665550165894, "learning_rate": 9.412754953531664e-07, "loss": 0.0277, "step": 7851 }, { "epoch": 3.572338489535942, "grad_norm": 1.1259856381421558, "learning_rate": 9.407168247333489e-07, "loss": 0.0187, "step": 7852 }, { "epoch": 3.5727934485896267, "grad_norm": 1.3228760231361176, "learning_rate": 9.4015828153111e-07, "loss": 0.0215, "step": 7853 }, { "epoch": 3.573248407643312, "grad_norm": 1.3847432904671384, "learning_rate": 9.395998657920932e-07, "loss": 0.0466, "step": 7854 }, { "epoch": 3.5737033666969973, "grad_norm": 1.0560032406452209, "learning_rate": 9.390415775619283e-07, "loss": 0.0105, "step": 7855 }, { "epoch": 3.5741583257506826, "grad_norm": 1.109701408165435, "learning_rate": 9.384834168862358e-07, "loss": 0.0094, "step": 7856 }, { "epoch": 3.5746132848043675, "grad_norm": 1.0655278358421465, "learning_rate": 9.379253838106275e-07, "loss": 0.0152, "step": 7857 }, { "epoch": 3.5750682438580528, "grad_norm": 3.742164648024442, "learning_rate": 9.373674783807018e-07, "loss": 0.0252, "step": 7858 }, { "epoch": 3.575523202911738, "grad_norm": 1.3278100217744648, "learning_rate": 9.368097006420498e-07, "loss": 0.034, "step": 7859 }, { "epoch": 3.575978161965423, "grad_norm": 0.914025091616374, "learning_rate": 9.362520506402497e-07, "loss": 0.0022, "step": 7860 }, { "epoch": 3.5764331210191083, "grad_norm": 0.778132118538466, "learning_rate": 9.356945284208704e-07, "loss": 0.0158, "step": 7861 }, { "epoch": 3.5768880800727936, "grad_norm": 0.9870624833874688, "learning_rate": 9.35137134029469e-07, "loss": 0.0089, "step": 7862 }, { "epoch": 3.577343039126479, "grad_norm": 1.2192232982123943, "learning_rate": 9.345798675115939e-07, "loss": 0.022, "step": 7863 }, { "epoch": 3.5777979981801638, "grad_norm": 0.4697352096406583, "learning_rate": 9.340227289127837e-07, "loss": 0.0054, "step": 7864 }, { "epoch": 3.578252957233849, "grad_norm": 1.627739697162146, "learning_rate": 9.334657182785642e-07, "loss": 0.0086, "step": 7865 }, { "epoch": 3.5787079162875344, "grad_norm": 0.5452065820341387, "learning_rate": 9.329088356544519e-07, "loss": 0.0069, "step": 7866 }, { "epoch": 3.5791628753412192, "grad_norm": 0.9421581329426438, "learning_rate": 9.323520810859523e-07, "loss": 0.0146, "step": 7867 }, { "epoch": 3.5796178343949046, "grad_norm": 0.8639573200579959, "learning_rate": 9.317954546185607e-07, "loss": 0.0062, "step": 7868 }, { "epoch": 3.58007279344859, "grad_norm": 0.8368241819721607, "learning_rate": 9.31238956297763e-07, "loss": 0.0048, "step": 7869 }, { "epoch": 3.5805277525022747, "grad_norm": 1.0924229828934786, "learning_rate": 9.30682586169033e-07, "loss": 0.0061, "step": 7870 }, { "epoch": 3.58098271155596, "grad_norm": 3.877151727140727, "learning_rate": 9.301263442778358e-07, "loss": 0.0457, "step": 7871 }, { "epoch": 3.5814376706096454, "grad_norm": 1.38007353665375, "learning_rate": 9.295702306696239e-07, "loss": 0.0289, "step": 7872 }, { "epoch": 3.58189262966333, "grad_norm": 0.7834452323872453, "learning_rate": 9.290142453898402e-07, "loss": 0.0035, "step": 7873 }, { "epoch": 3.5823475887170155, "grad_norm": 1.2545432815917716, "learning_rate": 9.284583884839187e-07, "loss": 0.0164, "step": 7874 }, { "epoch": 3.582802547770701, "grad_norm": 1.134500209199805, "learning_rate": 9.279026599972807e-07, "loss": 0.0187, "step": 7875 }, { "epoch": 3.5832575068243857, "grad_norm": 1.380997155576024, "learning_rate": 9.273470599753376e-07, "loss": 0.0234, "step": 7876 }, { "epoch": 3.583712465878071, "grad_norm": 1.1812230999674014, "learning_rate": 9.267915884634901e-07, "loss": 0.014, "step": 7877 }, { "epoch": 3.5841674249317563, "grad_norm": 0.8300449639551227, "learning_rate": 9.262362455071294e-07, "loss": 0.0089, "step": 7878 }, { "epoch": 3.584622383985441, "grad_norm": 1.7303725209931928, "learning_rate": 9.256810311516365e-07, "loss": 0.0397, "step": 7879 }, { "epoch": 3.5850773430391265, "grad_norm": 1.3431802463039362, "learning_rate": 9.2512594544238e-07, "loss": 0.0108, "step": 7880 }, { "epoch": 3.585532302092812, "grad_norm": 0.7577913391268346, "learning_rate": 9.245709884247195e-07, "loss": 0.0094, "step": 7881 }, { "epoch": 3.5859872611464967, "grad_norm": 0.8131300243578831, "learning_rate": 9.24016160144002e-07, "loss": 0.022, "step": 7882 }, { "epoch": 3.586442220200182, "grad_norm": 0.780190865759865, "learning_rate": 9.234614606455681e-07, "loss": 0.0069, "step": 7883 }, { "epoch": 3.5868971792538673, "grad_norm": 0.9536406886431206, "learning_rate": 9.229068899747428e-07, "loss": 0.0219, "step": 7884 }, { "epoch": 3.587352138307552, "grad_norm": 1.4564084150288035, "learning_rate": 9.223524481768454e-07, "loss": 0.0168, "step": 7885 }, { "epoch": 3.5878070973612375, "grad_norm": 0.798949204504729, "learning_rate": 9.217981352971814e-07, "loss": 0.0045, "step": 7886 }, { "epoch": 3.588262056414923, "grad_norm": 0.8465370494351792, "learning_rate": 9.212439513810457e-07, "loss": 0.015, "step": 7887 }, { "epoch": 3.5887170154686077, "grad_norm": 1.1996244342886677, "learning_rate": 9.206898964737257e-07, "loss": 0.018, "step": 7888 }, { "epoch": 3.589171974522293, "grad_norm": 1.1354475265364403, "learning_rate": 9.201359706204952e-07, "loss": 0.0511, "step": 7889 }, { "epoch": 3.5896269335759783, "grad_norm": 0.8151109814115137, "learning_rate": 9.195821738666183e-07, "loss": 0.0123, "step": 7890 }, { "epoch": 3.590081892629663, "grad_norm": 0.578214704294651, "learning_rate": 9.190285062573484e-07, "loss": 0.0051, "step": 7891 }, { "epoch": 3.5905368516833485, "grad_norm": 0.5379046286098681, "learning_rate": 9.184749678379296e-07, "loss": 0.0034, "step": 7892 }, { "epoch": 3.5909918107370338, "grad_norm": 0.9992803377205075, "learning_rate": 9.17921558653595e-07, "loss": 0.014, "step": 7893 }, { "epoch": 3.5914467697907186, "grad_norm": 1.1510474679816225, "learning_rate": 9.173682787495658e-07, "loss": 0.0148, "step": 7894 }, { "epoch": 3.591901728844404, "grad_norm": 0.9349343815897105, "learning_rate": 9.168151281710542e-07, "loss": 0.0109, "step": 7895 }, { "epoch": 3.5923566878980893, "grad_norm": 1.4317710539441806, "learning_rate": 9.162621069632596e-07, "loss": 0.0271, "step": 7896 }, { "epoch": 3.592811646951774, "grad_norm": 1.0631052487660484, "learning_rate": 9.157092151713742e-07, "loss": 0.0123, "step": 7897 }, { "epoch": 3.5932666060054594, "grad_norm": 1.074161850933214, "learning_rate": 9.151564528405765e-07, "loss": 0.0268, "step": 7898 }, { "epoch": 3.5937215650591448, "grad_norm": 1.5412963336133736, "learning_rate": 9.146038200160373e-07, "loss": 0.0175, "step": 7899 }, { "epoch": 3.5941765241128296, "grad_norm": 0.8409549039294753, "learning_rate": 9.140513167429144e-07, "loss": 0.0219, "step": 7900 }, { "epoch": 3.594631483166515, "grad_norm": 0.8670467997214022, "learning_rate": 9.134989430663549e-07, "loss": 0.0141, "step": 7901 }, { "epoch": 3.5950864422202002, "grad_norm": 0.8887698417966831, "learning_rate": 9.129466990314978e-07, "loss": 0.0076, "step": 7902 }, { "epoch": 3.595541401273885, "grad_norm": 1.175362815196912, "learning_rate": 9.123945846834697e-07, "loss": 0.0351, "step": 7903 }, { "epoch": 3.5959963603275704, "grad_norm": 0.5509100825262075, "learning_rate": 9.118426000673864e-07, "loss": 0.0055, "step": 7904 }, { "epoch": 3.5964513193812557, "grad_norm": 0.7932043985978411, "learning_rate": 9.112907452283528e-07, "loss": 0.0313, "step": 7905 }, { "epoch": 3.5969062784349406, "grad_norm": 0.6670486008292993, "learning_rate": 9.10739020211466e-07, "loss": 0.0234, "step": 7906 }, { "epoch": 3.597361237488626, "grad_norm": 0.7422158229374406, "learning_rate": 9.101874250618086e-07, "loss": 0.0145, "step": 7907 }, { "epoch": 3.597816196542311, "grad_norm": 0.6091171049417888, "learning_rate": 9.096359598244562e-07, "loss": 0.0119, "step": 7908 }, { "epoch": 3.598271155595996, "grad_norm": 1.1726502010251245, "learning_rate": 9.090846245444709e-07, "loss": 0.026, "step": 7909 }, { "epoch": 3.5987261146496814, "grad_norm": 0.8837495842141715, "learning_rate": 9.085334192669057e-07, "loss": 0.0162, "step": 7910 }, { "epoch": 3.5991810737033667, "grad_norm": 1.3175932319235473, "learning_rate": 9.079823440368018e-07, "loss": 0.0156, "step": 7911 }, { "epoch": 3.599636032757052, "grad_norm": 0.8434898030225454, "learning_rate": 9.074313988991909e-07, "loss": 0.0114, "step": 7912 }, { "epoch": 3.600090991810737, "grad_norm": 0.9449784852168329, "learning_rate": 9.068805838990952e-07, "loss": 0.0295, "step": 7913 }, { "epoch": 3.600545950864422, "grad_norm": 1.1620558234943796, "learning_rate": 9.063298990815237e-07, "loss": 0.0285, "step": 7914 }, { "epoch": 3.6010009099181075, "grad_norm": 1.7532621583587558, "learning_rate": 9.057793444914758e-07, "loss": 0.0109, "step": 7915 }, { "epoch": 3.6014558689717924, "grad_norm": 0.8462689179312916, "learning_rate": 9.052289201739397e-07, "loss": 0.0089, "step": 7916 }, { "epoch": 3.6019108280254777, "grad_norm": 0.776976807399508, "learning_rate": 9.046786261738952e-07, "loss": 0.0086, "step": 7917 }, { "epoch": 3.602365787079163, "grad_norm": 0.6449445391688674, "learning_rate": 9.041284625363089e-07, "loss": 0.005, "step": 7918 }, { "epoch": 3.6028207461328483, "grad_norm": 1.1019457786426115, "learning_rate": 9.035784293061367e-07, "loss": 0.0381, "step": 7919 }, { "epoch": 3.603275705186533, "grad_norm": 0.8981146214348563, "learning_rate": 9.03028526528327e-07, "loss": 0.0152, "step": 7920 }, { "epoch": 3.6037306642402185, "grad_norm": 0.8907245330644379, "learning_rate": 9.024787542478133e-07, "loss": 0.0066, "step": 7921 }, { "epoch": 3.604185623293904, "grad_norm": 0.7539276632042256, "learning_rate": 9.019291125095222e-07, "loss": 0.0264, "step": 7922 }, { "epoch": 3.6046405823475887, "grad_norm": 1.6357531725517371, "learning_rate": 9.013796013583675e-07, "loss": 0.0414, "step": 7923 }, { "epoch": 3.605095541401274, "grad_norm": 0.8238012486645919, "learning_rate": 9.008302208392522e-07, "loss": 0.0161, "step": 7924 }, { "epoch": 3.6055505004549593, "grad_norm": 1.1557541731396292, "learning_rate": 9.002809709970686e-07, "loss": 0.017, "step": 7925 }, { "epoch": 3.606005459508644, "grad_norm": 1.7172265096933252, "learning_rate": 8.997318518767001e-07, "loss": 0.0224, "step": 7926 }, { "epoch": 3.6064604185623295, "grad_norm": 1.2384365505086836, "learning_rate": 8.991828635230185e-07, "loss": 0.0309, "step": 7927 }, { "epoch": 3.6069153776160148, "grad_norm": 0.7524298382179778, "learning_rate": 8.98634005980884e-07, "loss": 0.0108, "step": 7928 }, { "epoch": 3.6073703366696996, "grad_norm": 0.9972158600478777, "learning_rate": 8.980852792951472e-07, "loss": 0.0155, "step": 7929 }, { "epoch": 3.607825295723385, "grad_norm": 0.6428298066888714, "learning_rate": 8.975366835106461e-07, "loss": 0.0151, "step": 7930 }, { "epoch": 3.6082802547770703, "grad_norm": 1.038958478516155, "learning_rate": 8.969882186722112e-07, "loss": 0.0122, "step": 7931 }, { "epoch": 3.608735213830755, "grad_norm": 1.2401391858541155, "learning_rate": 8.964398848246602e-07, "loss": 0.0236, "step": 7932 }, { "epoch": 3.6091901728844404, "grad_norm": 0.693583756602261, "learning_rate": 8.958916820127994e-07, "loss": 0.0086, "step": 7933 }, { "epoch": 3.6096451319381258, "grad_norm": 0.6129188696284765, "learning_rate": 8.95343610281427e-07, "loss": 0.004, "step": 7934 }, { "epoch": 3.6101000909918106, "grad_norm": 0.6656702784250963, "learning_rate": 8.947956696753274e-07, "loss": 0.0044, "step": 7935 }, { "epoch": 3.610555050045496, "grad_norm": 1.2254422230486648, "learning_rate": 8.942478602392773e-07, "loss": 0.0184, "step": 7936 }, { "epoch": 3.6110100090991812, "grad_norm": 0.49369824194740475, "learning_rate": 8.937001820180408e-07, "loss": 0.0064, "step": 7937 }, { "epoch": 3.611464968152866, "grad_norm": 0.5950707453111485, "learning_rate": 8.931526350563713e-07, "loss": 0.0057, "step": 7938 }, { "epoch": 3.6119199272065514, "grad_norm": 1.7915173389537864, "learning_rate": 8.92605219399012e-07, "loss": 0.0234, "step": 7939 }, { "epoch": 3.6123748862602367, "grad_norm": 1.0808924278808167, "learning_rate": 8.920579350906936e-07, "loss": 0.0095, "step": 7940 }, { "epoch": 3.6128298453139216, "grad_norm": 1.2978749320415215, "learning_rate": 8.915107821761409e-07, "loss": 0.0098, "step": 7941 }, { "epoch": 3.613284804367607, "grad_norm": 0.6451405636548729, "learning_rate": 8.909637607000632e-07, "loss": 0.0043, "step": 7942 }, { "epoch": 3.613739763421292, "grad_norm": 0.8192147025555999, "learning_rate": 8.904168707071609e-07, "loss": 0.0069, "step": 7943 }, { "epoch": 3.614194722474977, "grad_norm": 2.261267444258353, "learning_rate": 8.89870112242123e-07, "loss": 0.0297, "step": 7944 }, { "epoch": 3.6146496815286624, "grad_norm": 2.7315476842722894, "learning_rate": 8.893234853496271e-07, "loss": 0.0229, "step": 7945 }, { "epoch": 3.6151046405823477, "grad_norm": 1.5087394986396927, "learning_rate": 8.887769900743434e-07, "loss": 0.0346, "step": 7946 }, { "epoch": 3.6155595996360326, "grad_norm": 1.7762666623260113, "learning_rate": 8.882306264609269e-07, "loss": 0.037, "step": 7947 }, { "epoch": 3.616014558689718, "grad_norm": 1.1929123589699502, "learning_rate": 8.876843945540259e-07, "loss": 0.0232, "step": 7948 }, { "epoch": 3.616469517743403, "grad_norm": 1.303753112657488, "learning_rate": 8.87138294398275e-07, "loss": 0.0193, "step": 7949 }, { "epoch": 3.616924476797088, "grad_norm": 1.0447891480680496, "learning_rate": 8.865923260382981e-07, "loss": 0.0146, "step": 7950 }, { "epoch": 3.6173794358507734, "grad_norm": 1.332486052555615, "learning_rate": 8.860464895187113e-07, "loss": 0.0133, "step": 7951 }, { "epoch": 3.6178343949044587, "grad_norm": 0.9506982567441117, "learning_rate": 8.855007848841166e-07, "loss": 0.0172, "step": 7952 }, { "epoch": 3.6182893539581436, "grad_norm": 1.04117194869197, "learning_rate": 8.849552121791067e-07, "loss": 0.0219, "step": 7953 }, { "epoch": 3.618744313011829, "grad_norm": 0.7859735150365906, "learning_rate": 8.844097714482625e-07, "loss": 0.0115, "step": 7954 }, { "epoch": 3.619199272065514, "grad_norm": 1.7125260261887427, "learning_rate": 8.838644627361562e-07, "loss": 0.0261, "step": 7955 }, { "epoch": 3.619654231119199, "grad_norm": 0.5905321709685497, "learning_rate": 8.83319286087348e-07, "loss": 0.0063, "step": 7956 }, { "epoch": 3.6201091901728844, "grad_norm": 0.8729541400475664, "learning_rate": 8.827742415463872e-07, "loss": 0.0243, "step": 7957 }, { "epoch": 3.6205641492265697, "grad_norm": 1.1585258556308984, "learning_rate": 8.822293291578119e-07, "loss": 0.0095, "step": 7958 }, { "epoch": 3.6210191082802545, "grad_norm": 1.7041643710291536, "learning_rate": 8.816845489661493e-07, "loss": 0.0189, "step": 7959 }, { "epoch": 3.62147406733394, "grad_norm": 1.3204105806423263, "learning_rate": 8.811399010159177e-07, "loss": 0.0245, "step": 7960 }, { "epoch": 3.621929026387625, "grad_norm": 0.5258481758777129, "learning_rate": 8.805953853516222e-07, "loss": 0.0032, "step": 7961 }, { "epoch": 3.62238398544131, "grad_norm": 0.963381883086469, "learning_rate": 8.800510020177591e-07, "loss": 0.014, "step": 7962 }, { "epoch": 3.6228389444949953, "grad_norm": 1.2311974909496015, "learning_rate": 8.795067510588129e-07, "loss": 0.0134, "step": 7963 }, { "epoch": 3.6232939035486806, "grad_norm": 0.5500434508376162, "learning_rate": 8.789626325192557e-07, "loss": 0.0046, "step": 7964 }, { "epoch": 3.623748862602366, "grad_norm": 0.8443784578183441, "learning_rate": 8.784186464435526e-07, "loss": 0.0237, "step": 7965 }, { "epoch": 3.624203821656051, "grad_norm": 0.6193809476316912, "learning_rate": 8.778747928761549e-07, "loss": 0.0066, "step": 7966 }, { "epoch": 3.624658780709736, "grad_norm": 0.23655686353711575, "learning_rate": 8.773310718615036e-07, "loss": 0.0017, "step": 7967 }, { "epoch": 3.6251137397634214, "grad_norm": 1.0832161825627922, "learning_rate": 8.767874834440282e-07, "loss": 0.0072, "step": 7968 }, { "epoch": 3.6255686988171063, "grad_norm": 1.4888899528514261, "learning_rate": 8.762440276681494e-07, "loss": 0.0305, "step": 7969 }, { "epoch": 3.6260236578707916, "grad_norm": 0.9105759233104014, "learning_rate": 8.757007045782768e-07, "loss": 0.0158, "step": 7970 }, { "epoch": 3.626478616924477, "grad_norm": 1.20001277579189, "learning_rate": 8.751575142188071e-07, "loss": 0.0091, "step": 7971 }, { "epoch": 3.6269335759781622, "grad_norm": 1.6508780066232305, "learning_rate": 8.746144566341277e-07, "loss": 0.023, "step": 7972 }, { "epoch": 3.627388535031847, "grad_norm": 0.7842467157993954, "learning_rate": 8.740715318686149e-07, "loss": 0.0093, "step": 7973 }, { "epoch": 3.6278434940855324, "grad_norm": 1.2305855247950173, "learning_rate": 8.735287399666329e-07, "loss": 0.0201, "step": 7974 }, { "epoch": 3.6282984531392177, "grad_norm": 1.699361454290843, "learning_rate": 8.729860809725371e-07, "loss": 0.0125, "step": 7975 }, { "epoch": 3.6287534121929026, "grad_norm": 1.6858133600349565, "learning_rate": 8.724435549306723e-07, "loss": 0.0248, "step": 7976 }, { "epoch": 3.629208371246588, "grad_norm": 0.8937532439990024, "learning_rate": 8.719011618853701e-07, "loss": 0.0186, "step": 7977 }, { "epoch": 3.629663330300273, "grad_norm": 1.110188339694041, "learning_rate": 8.713589018809523e-07, "loss": 0.0107, "step": 7978 }, { "epoch": 3.630118289353958, "grad_norm": 1.3074733135770074, "learning_rate": 8.708167749617296e-07, "loss": 0.0172, "step": 7979 }, { "epoch": 3.6305732484076434, "grad_norm": 1.1872953921725176, "learning_rate": 8.702747811720035e-07, "loss": 0.0261, "step": 7980 }, { "epoch": 3.6310282074613287, "grad_norm": 1.020487867100934, "learning_rate": 8.697329205560625e-07, "loss": 0.0188, "step": 7981 }, { "epoch": 3.6314831665150136, "grad_norm": 0.7747801418810322, "learning_rate": 8.691911931581843e-07, "loss": 0.0239, "step": 7982 }, { "epoch": 3.631938125568699, "grad_norm": 0.85958130673634, "learning_rate": 8.686495990226377e-07, "loss": 0.0093, "step": 7983 }, { "epoch": 3.632393084622384, "grad_norm": 1.1436881012130118, "learning_rate": 8.681081381936779e-07, "loss": 0.0102, "step": 7984 }, { "epoch": 3.632848043676069, "grad_norm": 1.3318838584456447, "learning_rate": 8.675668107155527e-07, "loss": 0.0108, "step": 7985 }, { "epoch": 3.6333030027297544, "grad_norm": 0.701931489207476, "learning_rate": 8.670256166324953e-07, "loss": 0.0079, "step": 7986 }, { "epoch": 3.6337579617834397, "grad_norm": 0.7408909999279194, "learning_rate": 8.664845559887303e-07, "loss": 0.0082, "step": 7987 }, { "epoch": 3.6342129208371245, "grad_norm": 0.9496662072275952, "learning_rate": 8.659436288284698e-07, "loss": 0.0041, "step": 7988 }, { "epoch": 3.63466787989081, "grad_norm": 2.06996598821233, "learning_rate": 8.654028351959162e-07, "loss": 0.009, "step": 7989 }, { "epoch": 3.635122838944495, "grad_norm": 1.5220098194334175, "learning_rate": 8.648621751352624e-07, "loss": 0.0136, "step": 7990 }, { "epoch": 3.63557779799818, "grad_norm": 1.431904737115889, "learning_rate": 8.643216486906872e-07, "loss": 0.0333, "step": 7991 }, { "epoch": 3.6360327570518653, "grad_norm": 0.7600446006457456, "learning_rate": 8.637812559063602e-07, "loss": 0.0077, "step": 7992 }, { "epoch": 3.6364877161055507, "grad_norm": 1.1181193907086873, "learning_rate": 8.63240996826439e-07, "loss": 0.0439, "step": 7993 }, { "epoch": 3.6369426751592355, "grad_norm": 0.9720724854013117, "learning_rate": 8.62700871495073e-07, "loss": 0.0281, "step": 7994 }, { "epoch": 3.637397634212921, "grad_norm": 0.8206806144511175, "learning_rate": 8.621608799563977e-07, "loss": 0.0137, "step": 7995 }, { "epoch": 3.637852593266606, "grad_norm": 0.744652361634002, "learning_rate": 8.616210222545382e-07, "loss": 0.0049, "step": 7996 }, { "epoch": 3.638307552320291, "grad_norm": 1.010140162500639, "learning_rate": 8.610812984336106e-07, "loss": 0.0119, "step": 7997 }, { "epoch": 3.6387625113739763, "grad_norm": 0.8082795791667803, "learning_rate": 8.605417085377171e-07, "loss": 0.0158, "step": 7998 }, { "epoch": 3.6392174704276616, "grad_norm": 0.9284707061342413, "learning_rate": 8.600022526109522e-07, "loss": 0.0132, "step": 7999 }, { "epoch": 3.6396724294813465, "grad_norm": 0.7950437632351541, "learning_rate": 8.594629306973973e-07, "loss": 0.0201, "step": 8000 }, { "epoch": 3.640127388535032, "grad_norm": 1.152942995477859, "learning_rate": 8.589237428411229e-07, "loss": 0.0339, "step": 8001 }, { "epoch": 3.640582347588717, "grad_norm": 0.8201480511881102, "learning_rate": 8.583846890861885e-07, "loss": 0.0138, "step": 8002 }, { "epoch": 3.641037306642402, "grad_norm": 1.2111732134181852, "learning_rate": 8.57845769476644e-07, "loss": 0.0058, "step": 8003 }, { "epoch": 3.6414922656960873, "grad_norm": 0.6338066154294342, "learning_rate": 8.573069840565279e-07, "loss": 0.0134, "step": 8004 }, { "epoch": 3.6419472247497726, "grad_norm": 0.9702614408757292, "learning_rate": 8.567683328698667e-07, "loss": 0.0139, "step": 8005 }, { "epoch": 3.6424021838034575, "grad_norm": 0.9717131038735561, "learning_rate": 8.562298159606766e-07, "loss": 0.0248, "step": 8006 }, { "epoch": 3.642857142857143, "grad_norm": 1.2640436692497927, "learning_rate": 8.556914333729621e-07, "loss": 0.0228, "step": 8007 }, { "epoch": 3.643312101910828, "grad_norm": 1.3398977310646707, "learning_rate": 8.551531851507186e-07, "loss": 0.0226, "step": 8008 }, { "epoch": 3.643767060964513, "grad_norm": 0.5485213342415916, "learning_rate": 8.54615071337929e-07, "loss": 0.003, "step": 8009 }, { "epoch": 3.6442220200181983, "grad_norm": 1.1046985467789374, "learning_rate": 8.540770919785643e-07, "loss": 0.0256, "step": 8010 }, { "epoch": 3.6446769790718836, "grad_norm": 0.9460257319135054, "learning_rate": 8.535392471165877e-07, "loss": 0.0375, "step": 8011 }, { "epoch": 3.6451319381255685, "grad_norm": 1.0145909886615814, "learning_rate": 8.530015367959482e-07, "loss": 0.0107, "step": 8012 }, { "epoch": 3.6455868971792538, "grad_norm": 0.5264400167506763, "learning_rate": 8.524639610605848e-07, "loss": 0.0072, "step": 8013 }, { "epoch": 3.646041856232939, "grad_norm": 2.200810426832457, "learning_rate": 8.519265199544269e-07, "loss": 0.0082, "step": 8014 }, { "epoch": 3.646496815286624, "grad_norm": 1.3867910394816225, "learning_rate": 8.513892135213911e-07, "loss": 0.0225, "step": 8015 }, { "epoch": 3.6469517743403093, "grad_norm": 1.1628272568019085, "learning_rate": 8.50852041805384e-07, "loss": 0.0083, "step": 8016 }, { "epoch": 3.6474067333939946, "grad_norm": 0.9443407889999814, "learning_rate": 8.503150048502995e-07, "loss": 0.0138, "step": 8017 }, { "epoch": 3.6478616924476794, "grad_norm": 0.6137697005370077, "learning_rate": 8.497781027000229e-07, "loss": 0.0058, "step": 8018 }, { "epoch": 3.6483166515013647, "grad_norm": 1.3958282565132698, "learning_rate": 8.492413353984283e-07, "loss": 0.0254, "step": 8019 }, { "epoch": 3.64877161055505, "grad_norm": 1.4631505598387837, "learning_rate": 8.487047029893772e-07, "loss": 0.0366, "step": 8020 }, { "epoch": 3.6492265696087354, "grad_norm": 0.604106046320339, "learning_rate": 8.481682055167203e-07, "loss": 0.0087, "step": 8021 }, { "epoch": 3.6496815286624202, "grad_norm": 0.8459098786908889, "learning_rate": 8.476318430242972e-07, "loss": 0.0177, "step": 8022 }, { "epoch": 3.6501364877161055, "grad_norm": 0.6949148213344816, "learning_rate": 8.47095615555939e-07, "loss": 0.0075, "step": 8023 }, { "epoch": 3.650591446769791, "grad_norm": 1.3776754486055194, "learning_rate": 8.465595231554616e-07, "loss": 0.0271, "step": 8024 }, { "epoch": 3.6510464058234757, "grad_norm": 0.8535020960730262, "learning_rate": 8.460235658666738e-07, "loss": 0.0158, "step": 8025 }, { "epoch": 3.651501364877161, "grad_norm": 1.6601720171550316, "learning_rate": 8.454877437333711e-07, "loss": 0.0368, "step": 8026 }, { "epoch": 3.6519563239308463, "grad_norm": 0.9235792893848497, "learning_rate": 8.449520567993375e-07, "loss": 0.0219, "step": 8027 }, { "epoch": 3.6524112829845317, "grad_norm": 0.8876577481731414, "learning_rate": 8.444165051083483e-07, "loss": 0.0145, "step": 8028 }, { "epoch": 3.6528662420382165, "grad_norm": 1.0741219751956863, "learning_rate": 8.43881088704166e-07, "loss": 0.029, "step": 8029 }, { "epoch": 3.653321201091902, "grad_norm": 1.272391225096877, "learning_rate": 8.433458076305418e-07, "loss": 0.0113, "step": 8030 }, { "epoch": 3.653776160145587, "grad_norm": 0.7531563575601997, "learning_rate": 8.428106619312162e-07, "loss": 0.0141, "step": 8031 }, { "epoch": 3.654231119199272, "grad_norm": 1.1460949375144378, "learning_rate": 8.422756516499194e-07, "loss": 0.023, "step": 8032 }, { "epoch": 3.6546860782529573, "grad_norm": 0.18357805417563028, "learning_rate": 8.417407768303712e-07, "loss": 0.0009, "step": 8033 }, { "epoch": 3.6551410373066426, "grad_norm": 0.5893451955192246, "learning_rate": 8.412060375162781e-07, "loss": 0.0066, "step": 8034 }, { "epoch": 3.6555959963603275, "grad_norm": 1.310498796314294, "learning_rate": 8.406714337513364e-07, "loss": 0.0179, "step": 8035 }, { "epoch": 3.656050955414013, "grad_norm": 0.9206283989095281, "learning_rate": 8.401369655792307e-07, "loss": 0.0199, "step": 8036 }, { "epoch": 3.656505914467698, "grad_norm": 0.860637204799522, "learning_rate": 8.396026330436374e-07, "loss": 0.013, "step": 8037 }, { "epoch": 3.656960873521383, "grad_norm": 1.925586916481033, "learning_rate": 8.390684361882176e-07, "loss": 0.0208, "step": 8038 }, { "epoch": 3.6574158325750683, "grad_norm": 1.1319045387202615, "learning_rate": 8.385343750566255e-07, "loss": 0.0084, "step": 8039 }, { "epoch": 3.6578707916287536, "grad_norm": 0.958115890283917, "learning_rate": 8.380004496925012e-07, "loss": 0.0293, "step": 8040 }, { "epoch": 3.6583257506824385, "grad_norm": 1.0309278525043164, "learning_rate": 8.374666601394737e-07, "loss": 0.0174, "step": 8041 }, { "epoch": 3.658780709736124, "grad_norm": 1.0245747742911995, "learning_rate": 8.369330064411635e-07, "loss": 0.0069, "step": 8042 }, { "epoch": 3.659235668789809, "grad_norm": 1.0389276870038906, "learning_rate": 8.363994886411778e-07, "loss": 0.0336, "step": 8043 }, { "epoch": 3.659690627843494, "grad_norm": 1.3876910034675276, "learning_rate": 8.358661067831131e-07, "loss": 0.0183, "step": 8044 }, { "epoch": 3.6601455868971793, "grad_norm": 1.5663192053290709, "learning_rate": 8.353328609105543e-07, "loss": 0.0088, "step": 8045 }, { "epoch": 3.6606005459508646, "grad_norm": 1.216017124646405, "learning_rate": 8.347997510670763e-07, "loss": 0.0132, "step": 8046 }, { "epoch": 3.6610555050045495, "grad_norm": 0.8545556827790144, "learning_rate": 8.342667772962437e-07, "loss": 0.0096, "step": 8047 }, { "epoch": 3.6615104640582348, "grad_norm": 0.6529426342585322, "learning_rate": 8.337339396416075e-07, "loss": 0.0091, "step": 8048 }, { "epoch": 3.66196542311192, "grad_norm": 1.3197505849405757, "learning_rate": 8.332012381467091e-07, "loss": 0.0136, "step": 8049 }, { "epoch": 3.662420382165605, "grad_norm": 1.0998703807548145, "learning_rate": 8.326686728550781e-07, "loss": 0.0343, "step": 8050 }, { "epoch": 3.6628753412192903, "grad_norm": 0.5919414868589075, "learning_rate": 8.321362438102329e-07, "loss": 0.005, "step": 8051 }, { "epoch": 3.6633303002729756, "grad_norm": 1.3095860851969188, "learning_rate": 8.31603951055682e-07, "loss": 0.0329, "step": 8052 }, { "epoch": 3.6637852593266604, "grad_norm": 0.8767058473953897, "learning_rate": 8.310717946349226e-07, "loss": 0.007, "step": 8053 }, { "epoch": 3.6642402183803457, "grad_norm": 0.7474897352972312, "learning_rate": 8.30539774591439e-07, "loss": 0.0054, "step": 8054 }, { "epoch": 3.664695177434031, "grad_norm": 0.9717428518720151, "learning_rate": 8.30007890968706e-07, "loss": 0.0136, "step": 8055 }, { "epoch": 3.665150136487716, "grad_norm": 1.0550324679164, "learning_rate": 8.294761438101859e-07, "loss": 0.024, "step": 8056 }, { "epoch": 3.6656050955414012, "grad_norm": 1.391904403930039, "learning_rate": 8.289445331593319e-07, "loss": 0.0112, "step": 8057 }, { "epoch": 3.6660600545950865, "grad_norm": 1.2260997724546663, "learning_rate": 8.284130590595843e-07, "loss": 0.0242, "step": 8058 }, { "epoch": 3.6665150136487714, "grad_norm": 1.0548551121670038, "learning_rate": 8.278817215543717e-07, "loss": 0.0053, "step": 8059 }, { "epoch": 3.6669699727024567, "grad_norm": 1.0930253405058488, "learning_rate": 8.273505206871146e-07, "loss": 0.0273, "step": 8060 }, { "epoch": 3.667424931756142, "grad_norm": 1.0997610902018078, "learning_rate": 8.268194565012185e-07, "loss": 0.0086, "step": 8061 }, { "epoch": 3.667879890809827, "grad_norm": 1.0265712630615447, "learning_rate": 8.262885290400813e-07, "loss": 0.0242, "step": 8062 }, { "epoch": 3.668334849863512, "grad_norm": 0.9965762919668885, "learning_rate": 8.257577383470869e-07, "loss": 0.012, "step": 8063 }, { "epoch": 3.6687898089171975, "grad_norm": 2.633091828408031, "learning_rate": 8.252270844656093e-07, "loss": 0.0142, "step": 8064 }, { "epoch": 3.6692447679708824, "grad_norm": 0.9455163615241313, "learning_rate": 8.246965674390106e-07, "loss": 0.0078, "step": 8065 }, { "epoch": 3.6696997270245677, "grad_norm": 1.0167204246024764, "learning_rate": 8.241661873106427e-07, "loss": 0.0228, "step": 8066 }, { "epoch": 3.670154686078253, "grad_norm": 1.0929302744513554, "learning_rate": 8.236359441238467e-07, "loss": 0.0066, "step": 8067 }, { "epoch": 3.670609645131938, "grad_norm": 1.1032835101420904, "learning_rate": 8.231058379219509e-07, "loss": 0.0053, "step": 8068 }, { "epoch": 3.671064604185623, "grad_norm": 1.3220962779579895, "learning_rate": 8.225758687482732e-07, "loss": 0.0097, "step": 8069 }, { "epoch": 3.6715195632393085, "grad_norm": 1.0818547754120251, "learning_rate": 8.220460366461197e-07, "loss": 0.0354, "step": 8070 }, { "epoch": 3.6719745222929934, "grad_norm": 0.48470748620290116, "learning_rate": 8.215163416587874e-07, "loss": 0.0044, "step": 8071 }, { "epoch": 3.6724294813466787, "grad_norm": 1.9885712913789044, "learning_rate": 8.209867838295596e-07, "loss": 0.0203, "step": 8072 }, { "epoch": 3.672884440400364, "grad_norm": 1.2603703522906315, "learning_rate": 8.204573632017084e-07, "loss": 0.0181, "step": 8073 }, { "epoch": 3.673339399454049, "grad_norm": 1.06289284915797, "learning_rate": 8.199280798184978e-07, "loss": 0.0242, "step": 8074 }, { "epoch": 3.673794358507734, "grad_norm": 0.89851873753254, "learning_rate": 8.193989337231764e-07, "loss": 0.0107, "step": 8075 }, { "epoch": 3.6742493175614195, "grad_norm": 1.499474261734751, "learning_rate": 8.188699249589857e-07, "loss": 0.019, "step": 8076 }, { "epoch": 3.674704276615105, "grad_norm": 1.3780033042348767, "learning_rate": 8.183410535691527e-07, "loss": 0.0172, "step": 8077 }, { "epoch": 3.6751592356687897, "grad_norm": 1.0083491045561612, "learning_rate": 8.178123195968943e-07, "loss": 0.0282, "step": 8078 }, { "epoch": 3.675614194722475, "grad_norm": 0.9296778714008781, "learning_rate": 8.172837230854158e-07, "loss": 0.023, "step": 8079 }, { "epoch": 3.6760691537761603, "grad_norm": 1.234261277432219, "learning_rate": 8.167552640779125e-07, "loss": 0.0485, "step": 8080 }, { "epoch": 3.676524112829845, "grad_norm": 0.7841366391585245, "learning_rate": 8.162269426175681e-07, "loss": 0.0052, "step": 8081 }, { "epoch": 3.6769790718835305, "grad_norm": 1.892591101693828, "learning_rate": 8.156987587475542e-07, "loss": 0.0106, "step": 8082 }, { "epoch": 3.6774340309372158, "grad_norm": 0.9188160578271203, "learning_rate": 8.151707125110317e-07, "loss": 0.0151, "step": 8083 }, { "epoch": 3.677888989990901, "grad_norm": 0.8628792990275591, "learning_rate": 8.146428039511498e-07, "loss": 0.018, "step": 8084 }, { "epoch": 3.678343949044586, "grad_norm": 0.9843113481598225, "learning_rate": 8.141150331110459e-07, "loss": 0.0144, "step": 8085 }, { "epoch": 3.6787989080982713, "grad_norm": 1.5432577945792845, "learning_rate": 8.135874000338492e-07, "loss": 0.0379, "step": 8086 }, { "epoch": 3.6792538671519566, "grad_norm": 0.596637826804328, "learning_rate": 8.130599047626736e-07, "loss": 0.0049, "step": 8087 }, { "epoch": 3.6797088262056414, "grad_norm": 1.0733530273089904, "learning_rate": 8.12532547340625e-07, "loss": 0.0284, "step": 8088 }, { "epoch": 3.6801637852593267, "grad_norm": 0.9028046900098784, "learning_rate": 8.120053278107964e-07, "loss": 0.0127, "step": 8089 }, { "epoch": 3.680618744313012, "grad_norm": 0.5870362428861751, "learning_rate": 8.114782462162684e-07, "loss": 0.0056, "step": 8090 }, { "epoch": 3.681073703366697, "grad_norm": 1.157009885129847, "learning_rate": 8.10951302600114e-07, "loss": 0.0055, "step": 8091 }, { "epoch": 3.6815286624203822, "grad_norm": 1.0397150506808177, "learning_rate": 8.104244970053912e-07, "loss": 0.0287, "step": 8092 }, { "epoch": 3.6819836214740675, "grad_norm": 1.0208280679115853, "learning_rate": 8.098978294751484e-07, "loss": 0.0107, "step": 8093 }, { "epoch": 3.6824385805277524, "grad_norm": 1.4245967065402418, "learning_rate": 8.093713000524217e-07, "loss": 0.0254, "step": 8094 }, { "epoch": 3.6828935395814377, "grad_norm": 0.7734968013541829, "learning_rate": 8.088449087802378e-07, "loss": 0.0111, "step": 8095 }, { "epoch": 3.683348498635123, "grad_norm": 1.1342843152326045, "learning_rate": 8.083186557016115e-07, "loss": 0.0122, "step": 8096 }, { "epoch": 3.683803457688808, "grad_norm": 0.32739963479995776, "learning_rate": 8.07792540859545e-07, "loss": 0.0036, "step": 8097 }, { "epoch": 3.684258416742493, "grad_norm": 1.1029895906510163, "learning_rate": 8.072665642970301e-07, "loss": 0.0177, "step": 8098 }, { "epoch": 3.6847133757961785, "grad_norm": 1.0414674433003621, "learning_rate": 8.067407260570465e-07, "loss": 0.0189, "step": 8099 }, { "epoch": 3.6851683348498634, "grad_norm": 1.1733559407829661, "learning_rate": 8.062150261825649e-07, "loss": 0.0209, "step": 8100 }, { "epoch": 3.6856232939035487, "grad_norm": 1.6440291218954424, "learning_rate": 8.056894647165415e-07, "loss": 0.0233, "step": 8101 }, { "epoch": 3.686078252957234, "grad_norm": 1.3525149079443815, "learning_rate": 8.051640417019244e-07, "loss": 0.0322, "step": 8102 }, { "epoch": 3.686533212010919, "grad_norm": 1.5809906135508256, "learning_rate": 8.04638757181648e-07, "loss": 0.0216, "step": 8103 }, { "epoch": 3.686988171064604, "grad_norm": 1.1779190723135506, "learning_rate": 8.041136111986352e-07, "loss": 0.0113, "step": 8104 }, { "epoch": 3.6874431301182895, "grad_norm": 0.9488829499982296, "learning_rate": 8.035886037958008e-07, "loss": 0.0083, "step": 8105 }, { "epoch": 3.6878980891719744, "grad_norm": 1.0083737974238036, "learning_rate": 8.030637350160442e-07, "loss": 0.0254, "step": 8106 }, { "epoch": 3.6883530482256597, "grad_norm": 0.7840981106102425, "learning_rate": 8.025390049022563e-07, "loss": 0.0112, "step": 8107 }, { "epoch": 3.688808007279345, "grad_norm": 0.7951108283312102, "learning_rate": 8.020144134973143e-07, "loss": 0.0112, "step": 8108 }, { "epoch": 3.68926296633303, "grad_norm": 1.0273024090723675, "learning_rate": 8.014899608440863e-07, "loss": 0.0161, "step": 8109 }, { "epoch": 3.689717925386715, "grad_norm": 1.1369905920322538, "learning_rate": 8.009656469854294e-07, "loss": 0.0103, "step": 8110 }, { "epoch": 3.6901728844404005, "grad_norm": 0.5553761662997223, "learning_rate": 8.004414719641868e-07, "loss": 0.0054, "step": 8111 }, { "epoch": 3.6906278434940853, "grad_norm": 1.5174813398954405, "learning_rate": 7.999174358231917e-07, "loss": 0.0105, "step": 8112 }, { "epoch": 3.6910828025477707, "grad_norm": 1.4961812075429943, "learning_rate": 7.993935386052659e-07, "loss": 0.0212, "step": 8113 }, { "epoch": 3.691537761601456, "grad_norm": 1.4597828039556873, "learning_rate": 7.988697803532208e-07, "loss": 0.0134, "step": 8114 }, { "epoch": 3.691992720655141, "grad_norm": 1.0479710692083863, "learning_rate": 7.983461611098545e-07, "loss": 0.0146, "step": 8115 }, { "epoch": 3.692447679708826, "grad_norm": 1.2079640329681058, "learning_rate": 7.97822680917956e-07, "loss": 0.025, "step": 8116 }, { "epoch": 3.6929026387625115, "grad_norm": 0.3520684601381012, "learning_rate": 7.972993398203008e-07, "loss": 0.0036, "step": 8117 }, { "epoch": 3.6933575978161963, "grad_norm": 1.3916667055864362, "learning_rate": 7.967761378596545e-07, "loss": 0.0257, "step": 8118 }, { "epoch": 3.6938125568698816, "grad_norm": 0.639523678770514, "learning_rate": 7.962530750787698e-07, "loss": 0.0057, "step": 8119 }, { "epoch": 3.694267515923567, "grad_norm": 2.122423861761964, "learning_rate": 7.957301515203902e-07, "loss": 0.011, "step": 8120 }, { "epoch": 3.694722474977252, "grad_norm": 0.8513772442069306, "learning_rate": 7.952073672272464e-07, "loss": 0.0071, "step": 8121 }, { "epoch": 3.695177434030937, "grad_norm": 1.2556762284499867, "learning_rate": 7.94684722242057e-07, "loss": 0.0474, "step": 8122 }, { "epoch": 3.6956323930846224, "grad_norm": 0.47748861871345244, "learning_rate": 7.941622166075316e-07, "loss": 0.0055, "step": 8123 }, { "epoch": 3.6960873521383073, "grad_norm": 0.5634346537060365, "learning_rate": 7.936398503663658e-07, "loss": 0.0049, "step": 8124 }, { "epoch": 3.6965423111919926, "grad_norm": 1.3501965851425497, "learning_rate": 7.931176235612462e-07, "loss": 0.031, "step": 8125 }, { "epoch": 3.696997270245678, "grad_norm": 0.8779667051670639, "learning_rate": 7.925955362348464e-07, "loss": 0.0135, "step": 8126 }, { "epoch": 3.697452229299363, "grad_norm": 1.4788816401297284, "learning_rate": 7.920735884298286e-07, "loss": 0.017, "step": 8127 }, { "epoch": 3.697907188353048, "grad_norm": 0.895244149526211, "learning_rate": 7.915517801888434e-07, "loss": 0.0242, "step": 8128 }, { "epoch": 3.6983621474067334, "grad_norm": 1.6206080193586248, "learning_rate": 7.910301115545316e-07, "loss": 0.0351, "step": 8129 }, { "epoch": 3.6988171064604187, "grad_norm": 0.797643153708099, "learning_rate": 7.905085825695222e-07, "loss": 0.0138, "step": 8130 }, { "epoch": 3.6992720655141036, "grad_norm": 1.0783276148616272, "learning_rate": 7.899871932764314e-07, "loss": 0.0176, "step": 8131 }, { "epoch": 3.699727024567789, "grad_norm": 1.5123609680081593, "learning_rate": 7.894659437178648e-07, "loss": 0.0217, "step": 8132 }, { "epoch": 3.700181983621474, "grad_norm": 2.3381116743191876, "learning_rate": 7.889448339364159e-07, "loss": 0.0284, "step": 8133 }, { "epoch": 3.700636942675159, "grad_norm": 1.1071402553093241, "learning_rate": 7.884238639746685e-07, "loss": 0.0134, "step": 8134 }, { "epoch": 3.7010919017288444, "grad_norm": 0.9004703339533969, "learning_rate": 7.879030338751939e-07, "loss": 0.0063, "step": 8135 }, { "epoch": 3.7015468607825297, "grad_norm": 0.9411988952797516, "learning_rate": 7.873823436805508e-07, "loss": 0.01, "step": 8136 }, { "epoch": 3.702001819836215, "grad_norm": 0.9194374102020312, "learning_rate": 7.868617934332893e-07, "loss": 0.0251, "step": 8137 }, { "epoch": 3.7024567788899, "grad_norm": 0.7904490377352819, "learning_rate": 7.863413831759448e-07, "loss": 0.005, "step": 8138 }, { "epoch": 3.702911737943585, "grad_norm": 1.4259354780926565, "learning_rate": 7.858211129510443e-07, "loss": 0.0069, "step": 8139 }, { "epoch": 3.7033666969972705, "grad_norm": 1.2230386745410815, "learning_rate": 7.853009828011013e-07, "loss": 0.0303, "step": 8140 }, { "epoch": 3.7038216560509554, "grad_norm": 0.3810042183516033, "learning_rate": 7.847809927686184e-07, "loss": 0.0025, "step": 8141 }, { "epoch": 3.7042766151046407, "grad_norm": 1.622342453415607, "learning_rate": 7.842611428960861e-07, "loss": 0.0285, "step": 8142 }, { "epoch": 3.704731574158326, "grad_norm": 1.8796069713071688, "learning_rate": 7.837414332259852e-07, "loss": 0.0321, "step": 8143 }, { "epoch": 3.705186533212011, "grad_norm": 1.620722632870381, "learning_rate": 7.832218638007846e-07, "loss": 0.0096, "step": 8144 }, { "epoch": 3.705641492265696, "grad_norm": 2.380046440830263, "learning_rate": 7.827024346629403e-07, "loss": 0.0292, "step": 8145 }, { "epoch": 3.7060964513193815, "grad_norm": 1.0444505838388076, "learning_rate": 7.821831458548978e-07, "loss": 0.0207, "step": 8146 }, { "epoch": 3.7065514103730663, "grad_norm": 1.2239214321189478, "learning_rate": 7.816639974190901e-07, "loss": 0.0559, "step": 8147 }, { "epoch": 3.7070063694267517, "grad_norm": 0.9964223093574078, "learning_rate": 7.811449893979416e-07, "loss": 0.0091, "step": 8148 }, { "epoch": 3.707461328480437, "grad_norm": 0.9994704561119732, "learning_rate": 7.806261218338623e-07, "loss": 0.0227, "step": 8149 }, { "epoch": 3.707916287534122, "grad_norm": 1.1457300367269354, "learning_rate": 7.801073947692508e-07, "loss": 0.0068, "step": 8150 }, { "epoch": 3.708371246587807, "grad_norm": 0.907110571847765, "learning_rate": 7.795888082464967e-07, "loss": 0.0182, "step": 8151 }, { "epoch": 3.7088262056414925, "grad_norm": 0.9662691181917747, "learning_rate": 7.790703623079754e-07, "loss": 0.0206, "step": 8152 }, { "epoch": 3.7092811646951773, "grad_norm": 0.8152823490795528, "learning_rate": 7.78552056996053e-07, "loss": 0.0062, "step": 8153 }, { "epoch": 3.7097361237488626, "grad_norm": 0.80359040563423, "learning_rate": 7.780338923530825e-07, "loss": 0.0099, "step": 8154 }, { "epoch": 3.710191082802548, "grad_norm": 1.0446521592129756, "learning_rate": 7.775158684214062e-07, "loss": 0.0167, "step": 8155 }, { "epoch": 3.710646041856233, "grad_norm": 1.1435839503506788, "learning_rate": 7.769979852433543e-07, "loss": 0.016, "step": 8156 }, { "epoch": 3.711101000909918, "grad_norm": 1.4430948094050886, "learning_rate": 7.764802428612453e-07, "loss": 0.019, "step": 8157 }, { "epoch": 3.7115559599636034, "grad_norm": 0.5705374265933254, "learning_rate": 7.759626413173873e-07, "loss": 0.0169, "step": 8158 }, { "epoch": 3.7120109190172883, "grad_norm": 1.0194556134349353, "learning_rate": 7.754451806540778e-07, "loss": 0.0119, "step": 8159 }, { "epoch": 3.7124658780709736, "grad_norm": 0.5403880550076176, "learning_rate": 7.749278609135996e-07, "loss": 0.0048, "step": 8160 }, { "epoch": 3.712920837124659, "grad_norm": 1.0016062715621807, "learning_rate": 7.744106821382266e-07, "loss": 0.0149, "step": 8161 }, { "epoch": 3.713375796178344, "grad_norm": 0.8930964741082748, "learning_rate": 7.738936443702191e-07, "loss": 0.0193, "step": 8162 }, { "epoch": 3.713830755232029, "grad_norm": 0.9981660119994334, "learning_rate": 7.733767476518286e-07, "loss": 0.0164, "step": 8163 }, { "epoch": 3.7142857142857144, "grad_norm": 0.9103567940092974, "learning_rate": 7.728599920252925e-07, "loss": 0.011, "step": 8164 }, { "epoch": 3.7147406733393993, "grad_norm": 1.3107545425937148, "learning_rate": 7.723433775328385e-07, "loss": 0.0232, "step": 8165 }, { "epoch": 3.7151956323930846, "grad_norm": 0.8705567352890671, "learning_rate": 7.718269042166818e-07, "loss": 0.0308, "step": 8166 }, { "epoch": 3.71565059144677, "grad_norm": 1.6125338018416673, "learning_rate": 7.713105721190257e-07, "loss": 0.029, "step": 8167 }, { "epoch": 3.7161055505004548, "grad_norm": 1.2237764325451086, "learning_rate": 7.707943812820632e-07, "loss": 0.0196, "step": 8168 }, { "epoch": 3.71656050955414, "grad_norm": 1.3437048628139412, "learning_rate": 7.702783317479751e-07, "loss": 0.0143, "step": 8169 }, { "epoch": 3.7170154686078254, "grad_norm": 1.3214581903471712, "learning_rate": 7.697624235589304e-07, "loss": 0.0223, "step": 8170 }, { "epoch": 3.7174704276615103, "grad_norm": 0.9165144004096566, "learning_rate": 7.692466567570858e-07, "loss": 0.0093, "step": 8171 }, { "epoch": 3.7179253867151956, "grad_norm": 0.5375567355764077, "learning_rate": 7.687310313845886e-07, "loss": 0.0026, "step": 8172 }, { "epoch": 3.718380345768881, "grad_norm": 0.47497970497233205, "learning_rate": 7.682155474835739e-07, "loss": 0.0102, "step": 8173 }, { "epoch": 3.7188353048225657, "grad_norm": 1.173012779176592, "learning_rate": 7.67700205096164e-07, "loss": 0.0234, "step": 8174 }, { "epoch": 3.719290263876251, "grad_norm": 0.573569822245123, "learning_rate": 7.671850042644702e-07, "loss": 0.0047, "step": 8175 }, { "epoch": 3.7197452229299364, "grad_norm": 0.7137466668351119, "learning_rate": 7.66669945030592e-07, "loss": 0.0179, "step": 8176 }, { "epoch": 3.7202001819836212, "grad_norm": 0.8820780873661533, "learning_rate": 7.661550274366189e-07, "loss": 0.0365, "step": 8177 }, { "epoch": 3.7206551410373065, "grad_norm": 1.168177574335348, "learning_rate": 7.656402515246261e-07, "loss": 0.0166, "step": 8178 }, { "epoch": 3.721110100090992, "grad_norm": 2.0283927468113343, "learning_rate": 7.651256173366805e-07, "loss": 0.0271, "step": 8179 }, { "epoch": 3.7215650591446767, "grad_norm": 0.8803669668804288, "learning_rate": 7.646111249148349e-07, "loss": 0.0059, "step": 8180 }, { "epoch": 3.722020018198362, "grad_norm": 2.0981610884501167, "learning_rate": 7.640967743011304e-07, "loss": 0.0146, "step": 8181 }, { "epoch": 3.7224749772520473, "grad_norm": 1.1366445930445248, "learning_rate": 7.635825655375989e-07, "loss": 0.0437, "step": 8182 }, { "epoch": 3.722929936305732, "grad_norm": 1.1908571092363478, "learning_rate": 7.630684986662587e-07, "loss": 0.0194, "step": 8183 }, { "epoch": 3.7233848953594175, "grad_norm": 0.9658616244579158, "learning_rate": 7.625545737291168e-07, "loss": 0.0064, "step": 8184 }, { "epoch": 3.723839854413103, "grad_norm": 1.4417953527403624, "learning_rate": 7.620407907681682e-07, "loss": 0.0156, "step": 8185 }, { "epoch": 3.724294813466788, "grad_norm": 1.0654696139021465, "learning_rate": 7.615271498253976e-07, "loss": 0.0212, "step": 8186 }, { "epoch": 3.724749772520473, "grad_norm": 0.48131217483812927, "learning_rate": 7.610136509427782e-07, "loss": 0.0041, "step": 8187 }, { "epoch": 3.7252047315741583, "grad_norm": 0.7906828264744988, "learning_rate": 7.6050029416227e-07, "loss": 0.0038, "step": 8188 }, { "epoch": 3.7256596906278436, "grad_norm": 0.9033085433798823, "learning_rate": 7.599870795258224e-07, "loss": 0.0067, "step": 8189 }, { "epoch": 3.7261146496815285, "grad_norm": 1.0826524957137855, "learning_rate": 7.594740070753725e-07, "loss": 0.0167, "step": 8190 }, { "epoch": 3.726569608735214, "grad_norm": 1.3492001893324697, "learning_rate": 7.58961076852846e-07, "loss": 0.0249, "step": 8191 }, { "epoch": 3.727024567788899, "grad_norm": 1.2585084102526478, "learning_rate": 7.58448288900158e-07, "loss": 0.023, "step": 8192 }, { "epoch": 3.7274795268425844, "grad_norm": 0.302131376393728, "learning_rate": 7.579356432592117e-07, "loss": 0.0017, "step": 8193 }, { "epoch": 3.7279344858962693, "grad_norm": 0.5563997947409505, "learning_rate": 7.574231399718976e-07, "loss": 0.0046, "step": 8194 }, { "epoch": 3.7283894449499546, "grad_norm": 1.163334397115874, "learning_rate": 7.56910779080095e-07, "loss": 0.0248, "step": 8195 }, { "epoch": 3.72884440400364, "grad_norm": 0.9235131194141017, "learning_rate": 7.56398560625671e-07, "loss": 0.021, "step": 8196 }, { "epoch": 3.729299363057325, "grad_norm": 0.6125792289367459, "learning_rate": 7.558864846504834e-07, "loss": 0.0052, "step": 8197 }, { "epoch": 3.72975432211101, "grad_norm": 1.5500217962787601, "learning_rate": 7.553745511963761e-07, "loss": 0.0167, "step": 8198 }, { "epoch": 3.7302092811646954, "grad_norm": 1.054271475386214, "learning_rate": 7.548627603051809e-07, "loss": 0.0077, "step": 8199 }, { "epoch": 3.7306642402183803, "grad_norm": 1.1950356802090645, "learning_rate": 7.543511120187208e-07, "loss": 0.0229, "step": 8200 }, { "epoch": 3.7311191992720656, "grad_norm": 0.7437930661683735, "learning_rate": 7.538396063788037e-07, "loss": 0.0099, "step": 8201 }, { "epoch": 3.731574158325751, "grad_norm": 0.5830466054585307, "learning_rate": 7.533282434272294e-07, "loss": 0.0047, "step": 8202 }, { "epoch": 3.7320291173794358, "grad_norm": 1.260975718999625, "learning_rate": 7.528170232057827e-07, "loss": 0.0154, "step": 8203 }, { "epoch": 3.732484076433121, "grad_norm": 0.858674960900998, "learning_rate": 7.52305945756239e-07, "loss": 0.0265, "step": 8204 }, { "epoch": 3.7329390354868064, "grad_norm": 0.6793014106460655, "learning_rate": 7.517950111203598e-07, "loss": 0.0155, "step": 8205 }, { "epoch": 3.7333939945404913, "grad_norm": 1.1931266817699566, "learning_rate": 7.512842193398979e-07, "loss": 0.0157, "step": 8206 }, { "epoch": 3.7338489535941766, "grad_norm": 0.9373141278533387, "learning_rate": 7.50773570456593e-07, "loss": 0.011, "step": 8207 }, { "epoch": 3.734303912647862, "grad_norm": 1.3560388560220304, "learning_rate": 7.502630645121722e-07, "loss": 0.0175, "step": 8208 }, { "epoch": 3.7347588717015467, "grad_norm": 1.7775431316435006, "learning_rate": 7.497527015483525e-07, "loss": 0.0092, "step": 8209 }, { "epoch": 3.735213830755232, "grad_norm": 0.9013815612843408, "learning_rate": 7.49242481606837e-07, "loss": 0.0096, "step": 8210 }, { "epoch": 3.7356687898089174, "grad_norm": 0.7194325808696166, "learning_rate": 7.487324047293204e-07, "loss": 0.0036, "step": 8211 }, { "epoch": 3.7361237488626022, "grad_norm": 2.6345987039985985, "learning_rate": 7.482224709574828e-07, "loss": 0.0134, "step": 8212 }, { "epoch": 3.7365787079162875, "grad_norm": 1.6390739116274675, "learning_rate": 7.477126803329934e-07, "loss": 0.0381, "step": 8213 }, { "epoch": 3.737033666969973, "grad_norm": 0.9586342819338665, "learning_rate": 7.472030328975114e-07, "loss": 0.0212, "step": 8214 }, { "epoch": 3.7374886260236577, "grad_norm": 1.4886154532468394, "learning_rate": 7.466935286926808e-07, "loss": 0.0289, "step": 8215 }, { "epoch": 3.737943585077343, "grad_norm": 1.138263962471556, "learning_rate": 7.461841677601381e-07, "loss": 0.0245, "step": 8216 }, { "epoch": 3.7383985441310283, "grad_norm": 1.1809008995454002, "learning_rate": 7.456749501415053e-07, "loss": 0.0143, "step": 8217 }, { "epoch": 3.738853503184713, "grad_norm": 1.1724457098119956, "learning_rate": 7.451658758783928e-07, "loss": 0.0196, "step": 8218 }, { "epoch": 3.7393084622383985, "grad_norm": 1.1636095318444104, "learning_rate": 7.446569450123994e-07, "loss": 0.0103, "step": 8219 }, { "epoch": 3.739763421292084, "grad_norm": 0.9014108248518008, "learning_rate": 7.441481575851136e-07, "loss": 0.0077, "step": 8220 }, { "epoch": 3.7402183803457687, "grad_norm": 0.8315290481903961, "learning_rate": 7.436395136381117e-07, "loss": 0.0202, "step": 8221 }, { "epoch": 3.740673339399454, "grad_norm": 0.2038958904201513, "learning_rate": 7.431310132129571e-07, "loss": 0.0005, "step": 8222 }, { "epoch": 3.7411282984531393, "grad_norm": 0.8354163293625796, "learning_rate": 7.426226563512021e-07, "loss": 0.0074, "step": 8223 }, { "epoch": 3.741583257506824, "grad_norm": 0.993228874239246, "learning_rate": 7.421144430943866e-07, "loss": 0.0167, "step": 8224 }, { "epoch": 3.7420382165605095, "grad_norm": 1.8198736402836566, "learning_rate": 7.416063734840412e-07, "loss": 0.0279, "step": 8225 }, { "epoch": 3.742493175614195, "grad_norm": 0.5054454188256415, "learning_rate": 7.41098447561682e-07, "loss": 0.0039, "step": 8226 }, { "epoch": 3.7429481346678797, "grad_norm": 1.4204547005052117, "learning_rate": 7.405906653688136e-07, "loss": 0.0108, "step": 8227 }, { "epoch": 3.743403093721565, "grad_norm": 0.6412967257308027, "learning_rate": 7.400830269469317e-07, "loss": 0.0154, "step": 8228 }, { "epoch": 3.7438580527752503, "grad_norm": 1.1015983753956227, "learning_rate": 7.39575532337517e-07, "loss": 0.0102, "step": 8229 }, { "epoch": 3.744313011828935, "grad_norm": 0.8041762661229317, "learning_rate": 7.390681815820388e-07, "loss": 0.0081, "step": 8230 }, { "epoch": 3.7447679708826205, "grad_norm": 0.5671873948299966, "learning_rate": 7.385609747219574e-07, "loss": 0.0127, "step": 8231 }, { "epoch": 3.745222929936306, "grad_norm": 1.444338361284641, "learning_rate": 7.380539117987187e-07, "loss": 0.0403, "step": 8232 }, { "epoch": 3.7456778889899907, "grad_norm": 1.1104266470760757, "learning_rate": 7.375469928537574e-07, "loss": 0.0123, "step": 8233 }, { "epoch": 3.746132848043676, "grad_norm": 1.1463754023630275, "learning_rate": 7.370402179284958e-07, "loss": 0.0304, "step": 8234 }, { "epoch": 3.7465878070973613, "grad_norm": 1.327569744231558, "learning_rate": 7.365335870643462e-07, "loss": 0.0317, "step": 8235 }, { "epoch": 3.747042766151046, "grad_norm": 1.4561842136030523, "learning_rate": 7.360271003027089e-07, "loss": 0.0123, "step": 8236 }, { "epoch": 3.7474977252047315, "grad_norm": 0.8280453542322732, "learning_rate": 7.35520757684971e-07, "loss": 0.0087, "step": 8237 }, { "epoch": 3.7479526842584168, "grad_norm": 0.8378001927218176, "learning_rate": 7.350145592525082e-07, "loss": 0.022, "step": 8238 }, { "epoch": 3.7484076433121016, "grad_norm": 1.0681150488966236, "learning_rate": 7.345085050466846e-07, "loss": 0.0086, "step": 8239 }, { "epoch": 3.748862602365787, "grad_norm": 1.007909031784699, "learning_rate": 7.340025951088537e-07, "loss": 0.0373, "step": 8240 }, { "epoch": 3.7493175614194723, "grad_norm": 1.0946587976709838, "learning_rate": 7.334968294803546e-07, "loss": 0.019, "step": 8241 }, { "epoch": 3.7497725204731576, "grad_norm": 1.573831077938959, "learning_rate": 7.329912082025182e-07, "loss": 0.0429, "step": 8242 }, { "epoch": 3.7502274795268424, "grad_norm": 1.0312435110932927, "learning_rate": 7.324857313166603e-07, "loss": 0.0188, "step": 8243 }, { "epoch": 3.7506824385805277, "grad_norm": 0.9976165396463154, "learning_rate": 7.319803988640858e-07, "loss": 0.0115, "step": 8244 }, { "epoch": 3.751137397634213, "grad_norm": 0.5367722539111381, "learning_rate": 7.314752108860895e-07, "loss": 0.0077, "step": 8245 }, { "epoch": 3.7515923566878984, "grad_norm": 1.747295700178061, "learning_rate": 7.309701674239522e-07, "loss": 0.0304, "step": 8246 }, { "epoch": 3.7520473157415832, "grad_norm": 1.1299926210962132, "learning_rate": 7.304652685189434e-07, "loss": 0.0403, "step": 8247 }, { "epoch": 3.7525022747952685, "grad_norm": 1.4459460460215623, "learning_rate": 7.299605142123226e-07, "loss": 0.0184, "step": 8248 }, { "epoch": 3.752957233848954, "grad_norm": 0.7843391269308279, "learning_rate": 7.294559045453342e-07, "loss": 0.0072, "step": 8249 }, { "epoch": 3.7534121929026387, "grad_norm": 0.889635414803977, "learning_rate": 7.289514395592143e-07, "loss": 0.0161, "step": 8250 }, { "epoch": 3.753867151956324, "grad_norm": 0.7022132125003895, "learning_rate": 7.284471192951848e-07, "loss": 0.0136, "step": 8251 }, { "epoch": 3.7543221110100093, "grad_norm": 1.4706109393941051, "learning_rate": 7.279429437944565e-07, "loss": 0.0105, "step": 8252 }, { "epoch": 3.754777070063694, "grad_norm": 0.7310665373234202, "learning_rate": 7.274389130982276e-07, "loss": 0.0099, "step": 8253 }, { "epoch": 3.7552320291173795, "grad_norm": 0.8696292149271273, "learning_rate": 7.269350272476858e-07, "loss": 0.0136, "step": 8254 }, { "epoch": 3.755686988171065, "grad_norm": 0.935096356269044, "learning_rate": 7.264312862840073e-07, "loss": 0.0099, "step": 8255 }, { "epoch": 3.7561419472247497, "grad_norm": 1.0760035743723741, "learning_rate": 7.259276902483547e-07, "loss": 0.016, "step": 8256 }, { "epoch": 3.756596906278435, "grad_norm": 1.5499629150814533, "learning_rate": 7.254242391818794e-07, "loss": 0.0266, "step": 8257 }, { "epoch": 3.7570518653321203, "grad_norm": 1.6897491610112998, "learning_rate": 7.249209331257209e-07, "loss": 0.0105, "step": 8258 }, { "epoch": 3.757506824385805, "grad_norm": 1.0677935715354379, "learning_rate": 7.244177721210083e-07, "loss": 0.0132, "step": 8259 }, { "epoch": 3.7579617834394905, "grad_norm": 0.5396489164329316, "learning_rate": 7.239147562088566e-07, "loss": 0.0066, "step": 8260 }, { "epoch": 3.758416742493176, "grad_norm": 0.3750213396426127, "learning_rate": 7.234118854303699e-07, "loss": 0.0027, "step": 8261 }, { "epoch": 3.7588717015468607, "grad_norm": 1.0365313410215262, "learning_rate": 7.229091598266417e-07, "loss": 0.0159, "step": 8262 }, { "epoch": 3.759326660600546, "grad_norm": 0.5272702684657992, "learning_rate": 7.224065794387513e-07, "loss": 0.0042, "step": 8263 }, { "epoch": 3.7597816196542313, "grad_norm": 1.1404353875351814, "learning_rate": 7.219041443077673e-07, "loss": 0.0065, "step": 8264 }, { "epoch": 3.760236578707916, "grad_norm": 1.7266743696959586, "learning_rate": 7.214018544747473e-07, "loss": 0.0364, "step": 8265 }, { "epoch": 3.7606915377616015, "grad_norm": 1.216642070677178, "learning_rate": 7.208997099807358e-07, "loss": 0.0289, "step": 8266 }, { "epoch": 3.761146496815287, "grad_norm": 1.2803169199832058, "learning_rate": 7.203977108667656e-07, "loss": 0.0133, "step": 8267 }, { "epoch": 3.7616014558689717, "grad_norm": 1.3366303054797226, "learning_rate": 7.198958571738573e-07, "loss": 0.023, "step": 8268 }, { "epoch": 3.762056414922657, "grad_norm": 0.9415553455514774, "learning_rate": 7.193941489430206e-07, "loss": 0.011, "step": 8269 }, { "epoch": 3.7625113739763423, "grad_norm": 1.4609200334330463, "learning_rate": 7.188925862152535e-07, "loss": 0.0108, "step": 8270 }, { "epoch": 3.762966333030027, "grad_norm": 1.09979800363846, "learning_rate": 7.18391169031541e-07, "loss": 0.0094, "step": 8271 }, { "epoch": 3.7634212920837125, "grad_norm": 0.8604316284976868, "learning_rate": 7.178898974328563e-07, "loss": 0.0099, "step": 8272 }, { "epoch": 3.7638762511373978, "grad_norm": 1.2042889416258016, "learning_rate": 7.173887714601607e-07, "loss": 0.0554, "step": 8273 }, { "epoch": 3.7643312101910826, "grad_norm": 0.8980016575608877, "learning_rate": 7.16887791154405e-07, "loss": 0.0178, "step": 8274 }, { "epoch": 3.764786169244768, "grad_norm": 1.179570103857568, "learning_rate": 7.16386956556526e-07, "loss": 0.0108, "step": 8275 }, { "epoch": 3.7652411282984533, "grad_norm": 1.25069710230668, "learning_rate": 7.15886267707451e-07, "loss": 0.0355, "step": 8276 }, { "epoch": 3.765696087352138, "grad_norm": 2.0278113125781334, "learning_rate": 7.15385724648093e-07, "loss": 0.0237, "step": 8277 }, { "epoch": 3.7661510464058234, "grad_norm": 1.1224619176504749, "learning_rate": 7.148853274193537e-07, "loss": 0.0157, "step": 8278 }, { "epoch": 3.7666060054595087, "grad_norm": 0.7598064248243543, "learning_rate": 7.143850760621246e-07, "loss": 0.0094, "step": 8279 }, { "epoch": 3.7670609645131936, "grad_norm": 1.1563323959494167, "learning_rate": 7.138849706172835e-07, "loss": 0.0411, "step": 8280 }, { "epoch": 3.767515923566879, "grad_norm": 1.5580621538364112, "learning_rate": 7.133850111256965e-07, "loss": 0.0104, "step": 8281 }, { "epoch": 3.7679708826205642, "grad_norm": 1.6865141582271073, "learning_rate": 7.128851976282172e-07, "loss": 0.0121, "step": 8282 }, { "epoch": 3.768425841674249, "grad_norm": 0.8548463082190557, "learning_rate": 7.123855301656893e-07, "loss": 0.0105, "step": 8283 }, { "epoch": 3.7688808007279344, "grad_norm": 1.0696788063163039, "learning_rate": 7.118860087789436e-07, "loss": 0.0335, "step": 8284 }, { "epoch": 3.7693357597816197, "grad_norm": 0.5525279937954748, "learning_rate": 7.113866335087982e-07, "loss": 0.0048, "step": 8285 }, { "epoch": 3.7697907188353046, "grad_norm": 1.4294528369863484, "learning_rate": 7.108874043960601e-07, "loss": 0.0252, "step": 8286 }, { "epoch": 3.77024567788899, "grad_norm": 1.1378283139639525, "learning_rate": 7.103883214815227e-07, "loss": 0.0075, "step": 8287 }, { "epoch": 3.770700636942675, "grad_norm": 0.9951447263921236, "learning_rate": 7.098893848059707e-07, "loss": 0.0304, "step": 8288 }, { "epoch": 3.77115559599636, "grad_norm": 1.5322242939435051, "learning_rate": 7.093905944101734e-07, "loss": 0.0118, "step": 8289 }, { "epoch": 3.7716105550500454, "grad_norm": 0.9892319857561496, "learning_rate": 7.088919503348909e-07, "loss": 0.026, "step": 8290 }, { "epoch": 3.7720655141037307, "grad_norm": 0.7778986553077333, "learning_rate": 7.0839345262087e-07, "loss": 0.0132, "step": 8291 }, { "epoch": 3.7725204731574156, "grad_norm": 0.7892197982980101, "learning_rate": 7.078951013088445e-07, "loss": 0.0259, "step": 8292 }, { "epoch": 3.772975432211101, "grad_norm": 1.9607023835166517, "learning_rate": 7.073968964395389e-07, "loss": 0.0458, "step": 8293 }, { "epoch": 3.773430391264786, "grad_norm": 0.7177649297286528, "learning_rate": 7.068988380536634e-07, "loss": 0.0148, "step": 8294 }, { "epoch": 3.7738853503184715, "grad_norm": 0.6912945601784891, "learning_rate": 7.064009261919178e-07, "loss": 0.0086, "step": 8295 }, { "epoch": 3.7743403093721564, "grad_norm": 0.9229312920640103, "learning_rate": 7.059031608949873e-07, "loss": 0.0104, "step": 8296 }, { "epoch": 3.7747952684258417, "grad_norm": 1.07844472167591, "learning_rate": 7.054055422035488e-07, "loss": 0.0207, "step": 8297 }, { "epoch": 3.775250227479527, "grad_norm": 2.0064441908882125, "learning_rate": 7.049080701582658e-07, "loss": 0.0108, "step": 8298 }, { "epoch": 3.775705186533212, "grad_norm": 1.0229081006191194, "learning_rate": 7.044107447997888e-07, "loss": 0.017, "step": 8299 }, { "epoch": 3.776160145586897, "grad_norm": 1.3526287650198578, "learning_rate": 7.039135661687568e-07, "loss": 0.0088, "step": 8300 }, { "epoch": 3.7766151046405825, "grad_norm": 1.2044121822153186, "learning_rate": 7.034165343057972e-07, "loss": 0.0272, "step": 8301 }, { "epoch": 3.777070063694268, "grad_norm": 0.747055147974664, "learning_rate": 7.029196492515244e-07, "loss": 0.0106, "step": 8302 }, { "epoch": 3.7775250227479527, "grad_norm": 0.9282914432650254, "learning_rate": 7.024229110465422e-07, "loss": 0.0167, "step": 8303 }, { "epoch": 3.777979981801638, "grad_norm": 0.6389367003552956, "learning_rate": 7.019263197314427e-07, "loss": 0.022, "step": 8304 }, { "epoch": 3.7784349408553233, "grad_norm": 0.9187519289976793, "learning_rate": 7.014298753468043e-07, "loss": 0.0317, "step": 8305 }, { "epoch": 3.778889899909008, "grad_norm": 0.6166902371808161, "learning_rate": 7.009335779331944e-07, "loss": 0.011, "step": 8306 }, { "epoch": 3.7793448589626935, "grad_norm": 1.0559544358350508, "learning_rate": 7.004374275311671e-07, "loss": 0.0165, "step": 8307 }, { "epoch": 3.7797998180163788, "grad_norm": 1.4581453743211905, "learning_rate": 6.999414241812672e-07, "loss": 0.0115, "step": 8308 }, { "epoch": 3.7802547770700636, "grad_norm": 0.5535558849041227, "learning_rate": 6.994455679240253e-07, "loss": 0.0144, "step": 8309 }, { "epoch": 3.780709736123749, "grad_norm": 1.2206193525338969, "learning_rate": 6.989498587999593e-07, "loss": 0.0169, "step": 8310 }, { "epoch": 3.7811646951774343, "grad_norm": 0.6230130325733032, "learning_rate": 6.984542968495784e-07, "loss": 0.0046, "step": 8311 }, { "epoch": 3.781619654231119, "grad_norm": 1.366090323072301, "learning_rate": 6.979588821133756e-07, "loss": 0.0216, "step": 8312 }, { "epoch": 3.7820746132848044, "grad_norm": 1.6957780213855542, "learning_rate": 6.974636146318361e-07, "loss": 0.0156, "step": 8313 }, { "epoch": 3.7825295723384897, "grad_norm": 1.1743364215645333, "learning_rate": 6.969684944454297e-07, "loss": 0.0294, "step": 8314 }, { "epoch": 3.7829845313921746, "grad_norm": 0.7279794937041764, "learning_rate": 6.964735215946155e-07, "loss": 0.0132, "step": 8315 }, { "epoch": 3.78343949044586, "grad_norm": 1.6831680429496851, "learning_rate": 6.959786961198398e-07, "loss": 0.0176, "step": 8316 }, { "epoch": 3.7838944494995452, "grad_norm": 0.8679721620791506, "learning_rate": 6.95484018061538e-07, "loss": 0.0136, "step": 8317 }, { "epoch": 3.78434940855323, "grad_norm": 0.7452715241618637, "learning_rate": 6.949894874601337e-07, "loss": 0.0098, "step": 8318 }, { "epoch": 3.7848043676069154, "grad_norm": 0.8438360753430805, "learning_rate": 6.944951043560375e-07, "loss": 0.0086, "step": 8319 }, { "epoch": 3.7852593266606007, "grad_norm": 1.6371920166827865, "learning_rate": 6.940008687896476e-07, "loss": 0.0155, "step": 8320 }, { "epoch": 3.7857142857142856, "grad_norm": 1.1558954883720374, "learning_rate": 6.935067808013502e-07, "loss": 0.0138, "step": 8321 }, { "epoch": 3.786169244767971, "grad_norm": 0.5986130651790827, "learning_rate": 6.930128404315214e-07, "loss": 0.0063, "step": 8322 }, { "epoch": 3.786624203821656, "grad_norm": 1.3939148532582704, "learning_rate": 6.92519047720523e-07, "loss": 0.0078, "step": 8323 }, { "epoch": 3.787079162875341, "grad_norm": 1.0149155416438023, "learning_rate": 6.920254027087048e-07, "loss": 0.0122, "step": 8324 }, { "epoch": 3.7875341219290264, "grad_norm": 0.8972352631801547, "learning_rate": 6.915319054364064e-07, "loss": 0.0253, "step": 8325 }, { "epoch": 3.7879890809827117, "grad_norm": 0.9419921600133985, "learning_rate": 6.910385559439533e-07, "loss": 0.0075, "step": 8326 }, { "epoch": 3.7884440400363966, "grad_norm": 0.8309963679294444, "learning_rate": 6.905453542716608e-07, "loss": 0.0088, "step": 8327 }, { "epoch": 3.788898999090082, "grad_norm": 1.374888409248873, "learning_rate": 6.900523004598306e-07, "loss": 0.0529, "step": 8328 }, { "epoch": 3.789353958143767, "grad_norm": 1.2323817757292717, "learning_rate": 6.895593945487527e-07, "loss": 0.0143, "step": 8329 }, { "epoch": 3.789808917197452, "grad_norm": 1.313328968753526, "learning_rate": 6.890666365787043e-07, "loss": 0.0203, "step": 8330 }, { "epoch": 3.7902638762511374, "grad_norm": 1.140117892543799, "learning_rate": 6.885740265899527e-07, "loss": 0.015, "step": 8331 }, { "epoch": 3.7907188353048227, "grad_norm": 0.5831849267325965, "learning_rate": 6.880815646227518e-07, "loss": 0.0067, "step": 8332 }, { "epoch": 3.7911737943585075, "grad_norm": 0.8528200406255405, "learning_rate": 6.875892507173426e-07, "loss": 0.0307, "step": 8333 }, { "epoch": 3.791628753412193, "grad_norm": 0.8639019359046864, "learning_rate": 6.870970849139555e-07, "loss": 0.021, "step": 8334 }, { "epoch": 3.792083712465878, "grad_norm": 0.5417440893492098, "learning_rate": 6.866050672528074e-07, "loss": 0.0077, "step": 8335 }, { "epoch": 3.792538671519563, "grad_norm": 1.5930347911954883, "learning_rate": 6.861131977741034e-07, "loss": 0.0173, "step": 8336 }, { "epoch": 3.7929936305732483, "grad_norm": 0.933317065795566, "learning_rate": 6.85621476518038e-07, "loss": 0.0181, "step": 8337 }, { "epoch": 3.7934485896269337, "grad_norm": 0.802532239052224, "learning_rate": 6.851299035247913e-07, "loss": 0.0171, "step": 8338 }, { "epoch": 3.7939035486806185, "grad_norm": 1.2019591641561729, "learning_rate": 6.846384788345337e-07, "loss": 0.0083, "step": 8339 }, { "epoch": 3.794358507734304, "grad_norm": 1.0709666617311044, "learning_rate": 6.841472024874213e-07, "loss": 0.0241, "step": 8340 }, { "epoch": 3.794813466787989, "grad_norm": 0.5898732595818524, "learning_rate": 6.836560745235987e-07, "loss": 0.0069, "step": 8341 }, { "epoch": 3.795268425841674, "grad_norm": 0.6197087631981659, "learning_rate": 6.831650949831997e-07, "loss": 0.0091, "step": 8342 }, { "epoch": 3.7957233848953593, "grad_norm": 0.6825523882623872, "learning_rate": 6.826742639063447e-07, "loss": 0.0101, "step": 8343 }, { "epoch": 3.7961783439490446, "grad_norm": 0.986242517821951, "learning_rate": 6.821835813331415e-07, "loss": 0.0115, "step": 8344 }, { "epoch": 3.7966333030027295, "grad_norm": 0.6552070944778979, "learning_rate": 6.816930473036865e-07, "loss": 0.0091, "step": 8345 }, { "epoch": 3.797088262056415, "grad_norm": 2.0425884005362795, "learning_rate": 6.812026618580639e-07, "loss": 0.0115, "step": 8346 }, { "epoch": 3.7975432211101, "grad_norm": 0.9950402120786972, "learning_rate": 6.80712425036347e-07, "loss": 0.0223, "step": 8347 }, { "epoch": 3.797998180163785, "grad_norm": 1.542174982420801, "learning_rate": 6.802223368785951e-07, "loss": 0.0336, "step": 8348 }, { "epoch": 3.7984531392174703, "grad_norm": 1.528554845950007, "learning_rate": 6.797323974248557e-07, "loss": 0.0258, "step": 8349 }, { "epoch": 3.7989080982711556, "grad_norm": 1.148784622188022, "learning_rate": 6.792426067151636e-07, "loss": 0.0133, "step": 8350 }, { "epoch": 3.799363057324841, "grad_norm": 0.8528458170173915, "learning_rate": 6.787529647895441e-07, "loss": 0.0111, "step": 8351 }, { "epoch": 3.799818016378526, "grad_norm": 0.855662443302283, "learning_rate": 6.782634716880068e-07, "loss": 0.012, "step": 8352 }, { "epoch": 3.800272975432211, "grad_norm": 0.9063102041038658, "learning_rate": 6.777741274505525e-07, "loss": 0.0229, "step": 8353 }, { "epoch": 3.8007279344858964, "grad_norm": 0.960846822415347, "learning_rate": 6.772849321171676e-07, "loss": 0.0127, "step": 8354 }, { "epoch": 3.8011828935395813, "grad_norm": 1.4562553495063384, "learning_rate": 6.767958857278256e-07, "loss": 0.0167, "step": 8355 }, { "epoch": 3.8016378525932666, "grad_norm": 1.263821860710131, "learning_rate": 6.763069883224915e-07, "loss": 0.0213, "step": 8356 }, { "epoch": 3.802092811646952, "grad_norm": 0.6195902617180091, "learning_rate": 6.758182399411142e-07, "loss": 0.0087, "step": 8357 }, { "epoch": 3.802547770700637, "grad_norm": 1.0019131989036207, "learning_rate": 6.753296406236326e-07, "loss": 0.0205, "step": 8358 }, { "epoch": 3.803002729754322, "grad_norm": 0.7549144958676883, "learning_rate": 6.748411904099719e-07, "loss": 0.0125, "step": 8359 }, { "epoch": 3.8034576888080074, "grad_norm": 1.0974350341523929, "learning_rate": 6.743528893400466e-07, "loss": 0.0184, "step": 8360 }, { "epoch": 3.8039126478616927, "grad_norm": 0.559220145336633, "learning_rate": 6.738647374537597e-07, "loss": 0.0221, "step": 8361 }, { "epoch": 3.8043676069153776, "grad_norm": 0.8306170015969598, "learning_rate": 6.733767347909995e-07, "loss": 0.0129, "step": 8362 }, { "epoch": 3.804822565969063, "grad_norm": 1.0348555055399948, "learning_rate": 6.728888813916434e-07, "loss": 0.0332, "step": 8363 }, { "epoch": 3.805277525022748, "grad_norm": 0.7309460605832583, "learning_rate": 6.724011772955563e-07, "loss": 0.0095, "step": 8364 }, { "epoch": 3.805732484076433, "grad_norm": 1.0742712307958076, "learning_rate": 6.719136225425923e-07, "loss": 0.0099, "step": 8365 }, { "epoch": 3.8061874431301184, "grad_norm": 0.777144330503626, "learning_rate": 6.714262171725904e-07, "loss": 0.004, "step": 8366 }, { "epoch": 3.8066424021838037, "grad_norm": 1.313876974383607, "learning_rate": 6.709389612253817e-07, "loss": 0.019, "step": 8367 }, { "epoch": 3.8070973612374885, "grad_norm": 1.209340966002983, "learning_rate": 6.704518547407806e-07, "loss": 0.0216, "step": 8368 }, { "epoch": 3.807552320291174, "grad_norm": 1.0007596157567487, "learning_rate": 6.699648977585912e-07, "loss": 0.0385, "step": 8369 }, { "epoch": 3.808007279344859, "grad_norm": 1.0389472501609793, "learning_rate": 6.694780903186065e-07, "loss": 0.0277, "step": 8370 }, { "epoch": 3.808462238398544, "grad_norm": 1.1846226968988873, "learning_rate": 6.689914324606062e-07, "loss": 0.022, "step": 8371 }, { "epoch": 3.8089171974522293, "grad_norm": 0.6042077427458994, "learning_rate": 6.685049242243569e-07, "loss": 0.0106, "step": 8372 }, { "epoch": 3.8093721565059147, "grad_norm": 1.033401360567049, "learning_rate": 6.680185656496135e-07, "loss": 0.009, "step": 8373 }, { "epoch": 3.8098271155595995, "grad_norm": 1.4417127669708198, "learning_rate": 6.675323567761205e-07, "loss": 0.0182, "step": 8374 }, { "epoch": 3.810282074613285, "grad_norm": 1.0400863484943088, "learning_rate": 6.670462976436073e-07, "loss": 0.0394, "step": 8375 }, { "epoch": 3.81073703366697, "grad_norm": 1.1049263288941316, "learning_rate": 6.665603882917937e-07, "loss": 0.0409, "step": 8376 }, { "epoch": 3.811191992720655, "grad_norm": 1.3016403791967854, "learning_rate": 6.660746287603855e-07, "loss": 0.0373, "step": 8377 }, { "epoch": 3.8116469517743403, "grad_norm": 0.8584243260626157, "learning_rate": 6.655890190890769e-07, "loss": 0.0204, "step": 8378 }, { "epoch": 3.8121019108280256, "grad_norm": 0.38794142631583783, "learning_rate": 6.651035593175486e-07, "loss": 0.0027, "step": 8379 }, { "epoch": 3.8125568698817105, "grad_norm": 0.724118457385495, "learning_rate": 6.646182494854711e-07, "loss": 0.0102, "step": 8380 }, { "epoch": 3.813011828935396, "grad_norm": 1.3636202225780243, "learning_rate": 6.641330896325027e-07, "loss": 0.0144, "step": 8381 }, { "epoch": 3.813466787989081, "grad_norm": 0.8732682081026947, "learning_rate": 6.636480797982872e-07, "loss": 0.0091, "step": 8382 }, { "epoch": 3.813921747042766, "grad_norm": 0.7841920345989467, "learning_rate": 6.631632200224581e-07, "loss": 0.0304, "step": 8383 }, { "epoch": 3.8143767060964513, "grad_norm": 0.41267269629427794, "learning_rate": 6.626785103446345e-07, "loss": 0.0019, "step": 8384 }, { "epoch": 3.8148316651501366, "grad_norm": 0.93478500615721, "learning_rate": 6.621939508044267e-07, "loss": 0.01, "step": 8385 }, { "epoch": 3.8152866242038215, "grad_norm": 1.481767735744944, "learning_rate": 6.617095414414296e-07, "loss": 0.0199, "step": 8386 }, { "epoch": 3.815741583257507, "grad_norm": 0.9975144818999699, "learning_rate": 6.612252822952267e-07, "loss": 0.0064, "step": 8387 }, { "epoch": 3.816196542311192, "grad_norm": 1.1044870032514984, "learning_rate": 6.607411734053903e-07, "loss": 0.0146, "step": 8388 }, { "epoch": 3.816651501364877, "grad_norm": 1.9363314889032632, "learning_rate": 6.602572148114786e-07, "loss": 0.0132, "step": 8389 }, { "epoch": 3.8171064604185623, "grad_norm": 0.8856973622765788, "learning_rate": 6.597734065530398e-07, "loss": 0.0116, "step": 8390 }, { "epoch": 3.8175614194722476, "grad_norm": 1.4250853237180274, "learning_rate": 6.592897486696079e-07, "loss": 0.0357, "step": 8391 }, { "epoch": 3.8180163785259325, "grad_norm": 0.6785155304553901, "learning_rate": 6.588062412007051e-07, "loss": 0.0043, "step": 8392 }, { "epoch": 3.8184713375796178, "grad_norm": 0.588648212729418, "learning_rate": 6.583228841858407e-07, "loss": 0.0177, "step": 8393 }, { "epoch": 3.818926296633303, "grad_norm": 0.8386875306738923, "learning_rate": 6.578396776645136e-07, "loss": 0.0467, "step": 8394 }, { "epoch": 3.819381255686988, "grad_norm": 0.8231133472931068, "learning_rate": 6.573566216762092e-07, "loss": 0.0053, "step": 8395 }, { "epoch": 3.8198362147406733, "grad_norm": 0.7850474852861391, "learning_rate": 6.568737162604005e-07, "loss": 0.0154, "step": 8396 }, { "epoch": 3.8202911737943586, "grad_norm": 1.193857014876366, "learning_rate": 6.563909614565483e-07, "loss": 0.0123, "step": 8397 }, { "epoch": 3.8207461328480434, "grad_norm": 0.9021376202731588, "learning_rate": 6.559083573041003e-07, "loss": 0.0169, "step": 8398 }, { "epoch": 3.8212010919017287, "grad_norm": 1.1236369795731194, "learning_rate": 6.554259038424943e-07, "loss": 0.0122, "step": 8399 }, { "epoch": 3.821656050955414, "grad_norm": 1.4628078420592776, "learning_rate": 6.549436011111534e-07, "loss": 0.0588, "step": 8400 }, { "epoch": 3.822111010009099, "grad_norm": 0.9060317115652905, "learning_rate": 6.544614491494886e-07, "loss": 0.0122, "step": 8401 }, { "epoch": 3.8225659690627842, "grad_norm": 1.0618583601122447, "learning_rate": 6.539794479969003e-07, "loss": 0.0085, "step": 8402 }, { "epoch": 3.8230209281164695, "grad_norm": 0.6880359575291832, "learning_rate": 6.534975976927743e-07, "loss": 0.0084, "step": 8403 }, { "epoch": 3.823475887170155, "grad_norm": 0.46812025413988295, "learning_rate": 6.530158982764867e-07, "loss": 0.0059, "step": 8404 }, { "epoch": 3.8239308462238397, "grad_norm": 1.4699713980523084, "learning_rate": 6.52534349787399e-07, "loss": 0.0222, "step": 8405 }, { "epoch": 3.824385805277525, "grad_norm": 0.8080395713291659, "learning_rate": 6.520529522648608e-07, "loss": 0.0044, "step": 8406 }, { "epoch": 3.8248407643312103, "grad_norm": 1.1027662366630449, "learning_rate": 6.515717057482105e-07, "loss": 0.0143, "step": 8407 }, { "epoch": 3.825295723384895, "grad_norm": 0.9602865074994915, "learning_rate": 6.510906102767722e-07, "loss": 0.0155, "step": 8408 }, { "epoch": 3.8257506824385805, "grad_norm": 1.0721371416205858, "learning_rate": 6.506096658898594e-07, "loss": 0.012, "step": 8409 }, { "epoch": 3.826205641492266, "grad_norm": 1.0403202229306958, "learning_rate": 6.501288726267737e-07, "loss": 0.0246, "step": 8410 }, { "epoch": 3.826660600545951, "grad_norm": 1.1246146185900892, "learning_rate": 6.496482305268029e-07, "loss": 0.0235, "step": 8411 }, { "epoch": 3.827115559599636, "grad_norm": 1.229019107771201, "learning_rate": 6.491677396292223e-07, "loss": 0.0389, "step": 8412 }, { "epoch": 3.8275705186533213, "grad_norm": 1.5450055772174498, "learning_rate": 6.486873999732951e-07, "loss": 0.0203, "step": 8413 }, { "epoch": 3.8280254777070066, "grad_norm": 0.8247481988413441, "learning_rate": 6.482072115982738e-07, "loss": 0.0173, "step": 8414 }, { "epoch": 3.8284804367606915, "grad_norm": 1.3998829235552575, "learning_rate": 6.477271745433958e-07, "loss": 0.0117, "step": 8415 }, { "epoch": 3.828935395814377, "grad_norm": 1.0073950699571204, "learning_rate": 6.472472888478889e-07, "loss": 0.0196, "step": 8416 }, { "epoch": 3.829390354868062, "grad_norm": 2.3948166836855265, "learning_rate": 6.467675545509669e-07, "loss": 0.0357, "step": 8417 }, { "epoch": 3.829845313921747, "grad_norm": 2.1537313299598906, "learning_rate": 6.462879716918302e-07, "loss": 0.0194, "step": 8418 }, { "epoch": 3.8303002729754323, "grad_norm": 0.8037996194691686, "learning_rate": 6.4580854030967e-07, "loss": 0.0109, "step": 8419 }, { "epoch": 3.8307552320291176, "grad_norm": 0.840008071886488, "learning_rate": 6.453292604436626e-07, "loss": 0.0099, "step": 8420 }, { "epoch": 3.8312101910828025, "grad_norm": 0.5095321426126506, "learning_rate": 6.448501321329722e-07, "loss": 0.0055, "step": 8421 }, { "epoch": 3.831665150136488, "grad_norm": 0.5939788628153908, "learning_rate": 6.443711554167506e-07, "loss": 0.0054, "step": 8422 }, { "epoch": 3.832120109190173, "grad_norm": 0.956613985752129, "learning_rate": 6.438923303341382e-07, "loss": 0.005, "step": 8423 }, { "epoch": 3.832575068243858, "grad_norm": 1.0044616897952967, "learning_rate": 6.434136569242632e-07, "loss": 0.0085, "step": 8424 }, { "epoch": 3.8330300272975433, "grad_norm": 1.1911170373588338, "learning_rate": 6.429351352262401e-07, "loss": 0.0347, "step": 8425 }, { "epoch": 3.8334849863512286, "grad_norm": 0.7948445129975544, "learning_rate": 6.42456765279171e-07, "loss": 0.0058, "step": 8426 }, { "epoch": 3.8339399454049135, "grad_norm": 1.4842438417717951, "learning_rate": 6.419785471221459e-07, "loss": 0.0107, "step": 8427 }, { "epoch": 3.8343949044585988, "grad_norm": 0.9204597471609871, "learning_rate": 6.415004807942438e-07, "loss": 0.0194, "step": 8428 }, { "epoch": 3.834849863512284, "grad_norm": 1.7834706727997287, "learning_rate": 6.410225663345288e-07, "loss": 0.0394, "step": 8429 }, { "epoch": 3.835304822565969, "grad_norm": 0.6261027982711851, "learning_rate": 6.405448037820553e-07, "loss": 0.0092, "step": 8430 }, { "epoch": 3.8357597816196543, "grad_norm": 1.3206267958156996, "learning_rate": 6.400671931758634e-07, "loss": 0.0191, "step": 8431 }, { "epoch": 3.8362147406733396, "grad_norm": 1.2927374603436466, "learning_rate": 6.395897345549801e-07, "loss": 0.0124, "step": 8432 }, { "epoch": 3.8366696997270244, "grad_norm": 0.7218485211918536, "learning_rate": 6.391124279584229e-07, "loss": 0.0041, "step": 8433 }, { "epoch": 3.8371246587807097, "grad_norm": 1.2679218629455555, "learning_rate": 6.386352734251946e-07, "loss": 0.0296, "step": 8434 }, { "epoch": 3.837579617834395, "grad_norm": 1.1293824052983272, "learning_rate": 6.381582709942857e-07, "loss": 0.0204, "step": 8435 }, { "epoch": 3.83803457688808, "grad_norm": 1.6325877891697453, "learning_rate": 6.376814207046744e-07, "loss": 0.0685, "step": 8436 }, { "epoch": 3.8384895359417652, "grad_norm": 0.9583919207852333, "learning_rate": 6.37204722595327e-07, "loss": 0.0129, "step": 8437 }, { "epoch": 3.8389444949954505, "grad_norm": 1.0401249765240854, "learning_rate": 6.367281767051984e-07, "loss": 0.0087, "step": 8438 }, { "epoch": 3.8393994540491354, "grad_norm": 1.0267745226060123, "learning_rate": 6.362517830732284e-07, "loss": 0.0102, "step": 8439 }, { "epoch": 3.8398544131028207, "grad_norm": 1.296463495282313, "learning_rate": 6.357755417383462e-07, "loss": 0.04, "step": 8440 }, { "epoch": 3.840309372156506, "grad_norm": 1.398465760308172, "learning_rate": 6.352994527394679e-07, "loss": 0.0124, "step": 8441 }, { "epoch": 3.840764331210191, "grad_norm": 0.9947923465606335, "learning_rate": 6.34823516115497e-07, "loss": 0.0277, "step": 8442 }, { "epoch": 3.841219290263876, "grad_norm": 0.8911004895765644, "learning_rate": 6.343477319053248e-07, "loss": 0.0384, "step": 8443 }, { "epoch": 3.8416742493175615, "grad_norm": 0.5877561895129768, "learning_rate": 6.338721001478318e-07, "loss": 0.0053, "step": 8444 }, { "epoch": 3.8421292083712464, "grad_norm": 1.3164535363849494, "learning_rate": 6.333966208818834e-07, "loss": 0.0098, "step": 8445 }, { "epoch": 3.8425841674249317, "grad_norm": 0.8778006937886873, "learning_rate": 6.329212941463336e-07, "loss": 0.0341, "step": 8446 }, { "epoch": 3.843039126478617, "grad_norm": 0.8272329534963268, "learning_rate": 6.324461199800233e-07, "loss": 0.024, "step": 8447 }, { "epoch": 3.843494085532302, "grad_norm": 0.7969651091293863, "learning_rate": 6.319710984217827e-07, "loss": 0.0057, "step": 8448 }, { "epoch": 3.843949044585987, "grad_norm": 0.9045571525898052, "learning_rate": 6.314962295104285e-07, "loss": 0.0165, "step": 8449 }, { "epoch": 3.8444040036396725, "grad_norm": 1.2129041608714581, "learning_rate": 6.310215132847633e-07, "loss": 0.0517, "step": 8450 }, { "epoch": 3.8448589626933574, "grad_norm": 1.5855315844348774, "learning_rate": 6.305469497835803e-07, "loss": 0.0573, "step": 8451 }, { "epoch": 3.8453139217470427, "grad_norm": 1.5804823406984285, "learning_rate": 6.300725390456581e-07, "loss": 0.0149, "step": 8452 }, { "epoch": 3.845768880800728, "grad_norm": 0.8723146395874012, "learning_rate": 6.295982811097637e-07, "loss": 0.0126, "step": 8453 }, { "epoch": 3.846223839854413, "grad_norm": 0.9925317419076297, "learning_rate": 6.291241760146513e-07, "loss": 0.0295, "step": 8454 }, { "epoch": 3.846678798908098, "grad_norm": 1.4440460485803799, "learning_rate": 6.286502237990622e-07, "loss": 0.0259, "step": 8455 }, { "epoch": 3.8471337579617835, "grad_norm": 0.9475569370079674, "learning_rate": 6.281764245017255e-07, "loss": 0.0095, "step": 8456 }, { "epoch": 3.8475887170154683, "grad_norm": 0.698382090255731, "learning_rate": 6.277027781613581e-07, "loss": 0.0154, "step": 8457 }, { "epoch": 3.8480436760691537, "grad_norm": 0.7664915655799399, "learning_rate": 6.272292848166653e-07, "loss": 0.0241, "step": 8458 }, { "epoch": 3.848498635122839, "grad_norm": 0.7465680019607391, "learning_rate": 6.267559445063379e-07, "loss": 0.0089, "step": 8459 }, { "epoch": 3.8489535941765243, "grad_norm": 0.7509676296766118, "learning_rate": 6.262827572690552e-07, "loss": 0.0233, "step": 8460 }, { "epoch": 3.849408553230209, "grad_norm": 0.7594397043659938, "learning_rate": 6.258097231434832e-07, "loss": 0.0105, "step": 8461 }, { "epoch": 3.8498635122838945, "grad_norm": 1.022584656908761, "learning_rate": 6.253368421682776e-07, "loss": 0.0111, "step": 8462 }, { "epoch": 3.8503184713375798, "grad_norm": 0.7923117158993088, "learning_rate": 6.248641143820794e-07, "loss": 0.0214, "step": 8463 }, { "epoch": 3.8507734303912646, "grad_norm": 1.0126280318072571, "learning_rate": 6.24391539823517e-07, "loss": 0.0089, "step": 8464 }, { "epoch": 3.85122838944495, "grad_norm": 0.7962130936551857, "learning_rate": 6.239191185312085e-07, "loss": 0.0084, "step": 8465 }, { "epoch": 3.8516833484986353, "grad_norm": 1.1230651856230371, "learning_rate": 6.234468505437566e-07, "loss": 0.0215, "step": 8466 }, { "epoch": 3.8521383075523206, "grad_norm": 1.1747157620658253, "learning_rate": 6.229747358997542e-07, "loss": 0.0093, "step": 8467 }, { "epoch": 3.8525932666060054, "grad_norm": 0.6382220025542397, "learning_rate": 6.225027746377801e-07, "loss": 0.013, "step": 8468 }, { "epoch": 3.8530482256596907, "grad_norm": 1.5182237513942787, "learning_rate": 6.220309667964005e-07, "loss": 0.0414, "step": 8469 }, { "epoch": 3.853503184713376, "grad_norm": 1.1066793438895972, "learning_rate": 6.215593124141686e-07, "loss": 0.0151, "step": 8470 }, { "epoch": 3.853958143767061, "grad_norm": 0.9431279736906082, "learning_rate": 6.210878115296267e-07, "loss": 0.0109, "step": 8471 }, { "epoch": 3.8544131028207462, "grad_norm": 1.0736887737476404, "learning_rate": 6.206164641813048e-07, "loss": 0.0137, "step": 8472 }, { "epoch": 3.8548680618744315, "grad_norm": 2.128282157885702, "learning_rate": 6.201452704077179e-07, "loss": 0.0344, "step": 8473 }, { "epoch": 3.8553230209281164, "grad_norm": 1.050597569996516, "learning_rate": 6.196742302473701e-07, "loss": 0.0091, "step": 8474 }, { "epoch": 3.8557779799818017, "grad_norm": 1.1946197194498005, "learning_rate": 6.192033437387524e-07, "loss": 0.0086, "step": 8475 }, { "epoch": 3.856232939035487, "grad_norm": 0.6979576464172372, "learning_rate": 6.187326109203442e-07, "loss": 0.0077, "step": 8476 }, { "epoch": 3.856687898089172, "grad_norm": 3.8022760367925814, "learning_rate": 6.182620318306115e-07, "loss": 0.0217, "step": 8477 }, { "epoch": 3.857142857142857, "grad_norm": 0.6896185200365719, "learning_rate": 6.177916065080067e-07, "loss": 0.0059, "step": 8478 }, { "epoch": 3.8575978161965425, "grad_norm": 0.8577954546395776, "learning_rate": 6.17321334990973e-07, "loss": 0.0182, "step": 8479 }, { "epoch": 3.8580527752502274, "grad_norm": 1.0035733152005735, "learning_rate": 6.168512173179372e-07, "loss": 0.0326, "step": 8480 }, { "epoch": 3.8585077343039127, "grad_norm": 0.7938171519468203, "learning_rate": 6.163812535273153e-07, "loss": 0.007, "step": 8481 }, { "epoch": 3.858962693357598, "grad_norm": 1.1557125848266128, "learning_rate": 6.159114436575117e-07, "loss": 0.0196, "step": 8482 }, { "epoch": 3.859417652411283, "grad_norm": 0.256582905958145, "learning_rate": 6.154417877469165e-07, "loss": 0.0024, "step": 8483 }, { "epoch": 3.859872611464968, "grad_norm": 0.7238468104719901, "learning_rate": 6.149722858339077e-07, "loss": 0.0064, "step": 8484 }, { "epoch": 3.8603275705186535, "grad_norm": 0.8793706485734804, "learning_rate": 6.145029379568504e-07, "loss": 0.0071, "step": 8485 }, { "epoch": 3.8607825295723384, "grad_norm": 1.107751076966341, "learning_rate": 6.14033744154098e-07, "loss": 0.0074, "step": 8486 }, { "epoch": 3.8612374886260237, "grad_norm": 1.170393837070652, "learning_rate": 6.13564704463992e-07, "loss": 0.0405, "step": 8487 }, { "epoch": 3.861692447679709, "grad_norm": 0.6973038845505565, "learning_rate": 6.130958189248593e-07, "loss": 0.0041, "step": 8488 }, { "epoch": 3.862147406733394, "grad_norm": 0.7865433373169557, "learning_rate": 6.126270875750148e-07, "loss": 0.0149, "step": 8489 }, { "epoch": 3.862602365787079, "grad_norm": 1.2930017820185649, "learning_rate": 6.121585104527608e-07, "loss": 0.0113, "step": 8490 }, { "epoch": 3.8630573248407645, "grad_norm": 1.3480263361125109, "learning_rate": 6.116900875963888e-07, "loss": 0.0297, "step": 8491 }, { "epoch": 3.8635122838944493, "grad_norm": 1.279959397653739, "learning_rate": 6.112218190441746e-07, "loss": 0.0295, "step": 8492 }, { "epoch": 3.8639672429481347, "grad_norm": 1.360629655151807, "learning_rate": 6.107537048343842e-07, "loss": 0.0272, "step": 8493 }, { "epoch": 3.86442220200182, "grad_norm": 1.0873849054170839, "learning_rate": 6.102857450052694e-07, "loss": 0.0271, "step": 8494 }, { "epoch": 3.864877161055505, "grad_norm": 1.7695086418691426, "learning_rate": 6.09817939595069e-07, "loss": 0.0162, "step": 8495 }, { "epoch": 3.86533212010919, "grad_norm": 0.7054026990785298, "learning_rate": 6.093502886420111e-07, "loss": 0.0114, "step": 8496 }, { "epoch": 3.8657870791628755, "grad_norm": 1.4971535197436232, "learning_rate": 6.088827921843097e-07, "loss": 0.0287, "step": 8497 }, { "epoch": 3.8662420382165603, "grad_norm": 1.1764264546914083, "learning_rate": 6.084154502601661e-07, "loss": 0.0259, "step": 8498 }, { "epoch": 3.8666969972702456, "grad_norm": 0.6623210017914173, "learning_rate": 6.07948262907769e-07, "loss": 0.0061, "step": 8499 }, { "epoch": 3.867151956323931, "grad_norm": 1.3915318593787178, "learning_rate": 6.074812301652955e-07, "loss": 0.0189, "step": 8500 }, { "epoch": 3.867606915377616, "grad_norm": 1.0525852886039333, "learning_rate": 6.070143520709101e-07, "loss": 0.0215, "step": 8501 }, { "epoch": 3.868061874431301, "grad_norm": 1.7924786746485037, "learning_rate": 6.065476286627631e-07, "loss": 0.0184, "step": 8502 }, { "epoch": 3.8685168334849864, "grad_norm": 1.3288324650959056, "learning_rate": 6.06081059978993e-07, "loss": 0.0153, "step": 8503 }, { "epoch": 3.8689717925386713, "grad_norm": 1.2146977002842787, "learning_rate": 6.056146460577253e-07, "loss": 0.0148, "step": 8504 }, { "epoch": 3.8694267515923566, "grad_norm": 1.1860184169605643, "learning_rate": 6.051483869370745e-07, "loss": 0.0306, "step": 8505 }, { "epoch": 3.869881710646042, "grad_norm": 1.2704681332053351, "learning_rate": 6.046822826551393e-07, "loss": 0.0143, "step": 8506 }, { "epoch": 3.870336669699727, "grad_norm": 1.0607970374991285, "learning_rate": 6.042163332500101e-07, "loss": 0.0177, "step": 8507 }, { "epoch": 3.870791628753412, "grad_norm": 1.6383677660314633, "learning_rate": 6.037505387597603e-07, "loss": 0.0361, "step": 8508 }, { "epoch": 3.8712465878070974, "grad_norm": 0.8618058657422824, "learning_rate": 6.032848992224527e-07, "loss": 0.0132, "step": 8509 }, { "epoch": 3.8717015468607823, "grad_norm": 0.6706420250444279, "learning_rate": 6.028194146761384e-07, "loss": 0.0094, "step": 8510 }, { "epoch": 3.8721565059144676, "grad_norm": 1.5215553832773256, "learning_rate": 6.023540851588539e-07, "loss": 0.0297, "step": 8511 }, { "epoch": 3.872611464968153, "grad_norm": 0.9725667709526524, "learning_rate": 6.018889107086238e-07, "loss": 0.0105, "step": 8512 }, { "epoch": 3.8730664240218378, "grad_norm": 0.7085702619465087, "learning_rate": 6.014238913634593e-07, "loss": 0.0182, "step": 8513 }, { "epoch": 3.873521383075523, "grad_norm": 0.9862133731990671, "learning_rate": 6.009590271613608e-07, "loss": 0.0174, "step": 8514 }, { "epoch": 3.8739763421292084, "grad_norm": 0.8780607018383083, "learning_rate": 6.00494318140315e-07, "loss": 0.0181, "step": 8515 }, { "epoch": 3.8744313011828937, "grad_norm": 0.3859245199470852, "learning_rate": 6.000297643382957e-07, "loss": 0.0015, "step": 8516 }, { "epoch": 3.8748862602365786, "grad_norm": 0.9631408113635125, "learning_rate": 5.995653657932637e-07, "loss": 0.0292, "step": 8517 }, { "epoch": 3.875341219290264, "grad_norm": 1.1722552337998866, "learning_rate": 5.991011225431679e-07, "loss": 0.0328, "step": 8518 }, { "epoch": 3.875796178343949, "grad_norm": 0.6817281387828497, "learning_rate": 5.986370346259429e-07, "loss": 0.0095, "step": 8519 }, { "epoch": 3.876251137397634, "grad_norm": 1.1803918293752733, "learning_rate": 5.981731020795131e-07, "loss": 0.0121, "step": 8520 }, { "epoch": 3.8767060964513194, "grad_norm": 1.3293453845072063, "learning_rate": 5.977093249417898e-07, "loss": 0.0462, "step": 8521 }, { "epoch": 3.8771610555050047, "grad_norm": 1.273484011637596, "learning_rate": 5.972457032506695e-07, "loss": 0.0121, "step": 8522 }, { "epoch": 3.87761601455869, "grad_norm": 0.5078208449932002, "learning_rate": 5.96782237044038e-07, "loss": 0.0023, "step": 8523 }, { "epoch": 3.878070973612375, "grad_norm": 0.9314932353948467, "learning_rate": 5.96318926359766e-07, "loss": 0.0142, "step": 8524 }, { "epoch": 3.87852593266606, "grad_norm": 1.7517133191008472, "learning_rate": 5.958557712357152e-07, "loss": 0.0155, "step": 8525 }, { "epoch": 3.8789808917197455, "grad_norm": 2.1711379585308412, "learning_rate": 5.953927717097319e-07, "loss": 0.0294, "step": 8526 }, { "epoch": 3.8794358507734303, "grad_norm": 0.7133087609613493, "learning_rate": 5.949299278196494e-07, "loss": 0.012, "step": 8527 }, { "epoch": 3.8798908098271156, "grad_norm": 2.3100332690136844, "learning_rate": 5.944672396032908e-07, "loss": 0.0407, "step": 8528 }, { "epoch": 3.880345768880801, "grad_norm": 1.6365055090483753, "learning_rate": 5.940047070984631e-07, "loss": 0.013, "step": 8529 }, { "epoch": 3.880800727934486, "grad_norm": 1.189632181729963, "learning_rate": 5.935423303429644e-07, "loss": 0.0119, "step": 8530 }, { "epoch": 3.881255686988171, "grad_norm": 0.8705928388960313, "learning_rate": 5.930801093745766e-07, "loss": 0.0086, "step": 8531 }, { "epoch": 3.8817106460418564, "grad_norm": 0.8192376136138599, "learning_rate": 5.926180442310709e-07, "loss": 0.037, "step": 8532 }, { "epoch": 3.8821656050955413, "grad_norm": 1.1790486923855066, "learning_rate": 5.921561349502041e-07, "loss": 0.0147, "step": 8533 }, { "epoch": 3.8826205641492266, "grad_norm": 1.0132428632386445, "learning_rate": 5.916943815697223e-07, "loss": 0.0207, "step": 8534 }, { "epoch": 3.883075523202912, "grad_norm": 1.2931532111872057, "learning_rate": 5.912327841273588e-07, "loss": 0.0137, "step": 8535 }, { "epoch": 3.883530482256597, "grad_norm": 0.629854576625441, "learning_rate": 5.90771342660832e-07, "loss": 0.0047, "step": 8536 }, { "epoch": 3.883985441310282, "grad_norm": 1.4204340196136567, "learning_rate": 5.90310057207849e-07, "loss": 0.0133, "step": 8537 }, { "epoch": 3.8844404003639674, "grad_norm": 1.0925191049929126, "learning_rate": 5.898489278061034e-07, "loss": 0.0348, "step": 8538 }, { "epoch": 3.8848953594176523, "grad_norm": 0.55700395051143, "learning_rate": 5.89387954493278e-07, "loss": 0.0066, "step": 8539 }, { "epoch": 3.8853503184713376, "grad_norm": 1.2000372262422327, "learning_rate": 5.889271373070407e-07, "loss": 0.0264, "step": 8540 }, { "epoch": 3.885805277525023, "grad_norm": 1.8895026901251857, "learning_rate": 5.884664762850467e-07, "loss": 0.0206, "step": 8541 }, { "epoch": 3.886260236578708, "grad_norm": 1.0206338027153419, "learning_rate": 5.880059714649405e-07, "loss": 0.0147, "step": 8542 }, { "epoch": 3.886715195632393, "grad_norm": 0.6424473001570956, "learning_rate": 5.875456228843512e-07, "loss": 0.01, "step": 8543 }, { "epoch": 3.8871701546860784, "grad_norm": 0.9461039381438637, "learning_rate": 5.870854305808976e-07, "loss": 0.0098, "step": 8544 }, { "epoch": 3.8876251137397633, "grad_norm": 1.0381494710461558, "learning_rate": 5.866253945921841e-07, "loss": 0.0092, "step": 8545 }, { "epoch": 3.8880800727934486, "grad_norm": 1.438485929355662, "learning_rate": 5.861655149558026e-07, "loss": 0.019, "step": 8546 }, { "epoch": 3.888535031847134, "grad_norm": 0.9095223056946032, "learning_rate": 5.857057917093323e-07, "loss": 0.0125, "step": 8547 }, { "epoch": 3.8889899909008188, "grad_norm": 0.8417497748362587, "learning_rate": 5.852462248903388e-07, "loss": 0.0215, "step": 8548 }, { "epoch": 3.889444949954504, "grad_norm": 0.8108294860589268, "learning_rate": 5.847868145363777e-07, "loss": 0.0075, "step": 8549 }, { "epoch": 3.8898999090081894, "grad_norm": 0.6696804310125452, "learning_rate": 5.843275606849894e-07, "loss": 0.0083, "step": 8550 }, { "epoch": 3.8903548680618742, "grad_norm": 0.8426993094083803, "learning_rate": 5.838684633737018e-07, "loss": 0.0084, "step": 8551 }, { "epoch": 3.8908098271155596, "grad_norm": 0.9332144185812206, "learning_rate": 5.834095226400302e-07, "loss": 0.0051, "step": 8552 }, { "epoch": 3.891264786169245, "grad_norm": 1.0323847004276896, "learning_rate": 5.829507385214764e-07, "loss": 0.0348, "step": 8553 }, { "epoch": 3.8917197452229297, "grad_norm": 1.3402063389677088, "learning_rate": 5.824921110555315e-07, "loss": 0.0306, "step": 8554 }, { "epoch": 3.892174704276615, "grad_norm": 0.9065059896604516, "learning_rate": 5.820336402796712e-07, "loss": 0.0102, "step": 8555 }, { "epoch": 3.8926296633303004, "grad_norm": 0.8780817895379683, "learning_rate": 5.815753262313612e-07, "loss": 0.0062, "step": 8556 }, { "epoch": 3.8930846223839852, "grad_norm": 0.9093325355461187, "learning_rate": 5.811171689480518e-07, "loss": 0.0259, "step": 8557 }, { "epoch": 3.8935395814376705, "grad_norm": 1.1523167479240966, "learning_rate": 5.806591684671814e-07, "loss": 0.0159, "step": 8558 }, { "epoch": 3.893994540491356, "grad_norm": 1.167690708945315, "learning_rate": 5.802013248261768e-07, "loss": 0.0138, "step": 8559 }, { "epoch": 3.8944494995450407, "grad_norm": 1.0231175694877948, "learning_rate": 5.7974363806245e-07, "loss": 0.0209, "step": 8560 }, { "epoch": 3.894904458598726, "grad_norm": 0.9660679342136465, "learning_rate": 5.792861082134011e-07, "loss": 0.0124, "step": 8561 }, { "epoch": 3.8953594176524113, "grad_norm": 0.8164869392741999, "learning_rate": 5.788287353164171e-07, "loss": 0.008, "step": 8562 }, { "epoch": 3.895814376706096, "grad_norm": 1.0937354127876695, "learning_rate": 5.783715194088729e-07, "loss": 0.0099, "step": 8563 }, { "epoch": 3.8962693357597815, "grad_norm": 1.0913860296987647, "learning_rate": 5.779144605281309e-07, "loss": 0.0347, "step": 8564 }, { "epoch": 3.896724294813467, "grad_norm": 0.7681831595574331, "learning_rate": 5.774575587115389e-07, "loss": 0.0235, "step": 8565 }, { "epoch": 3.8971792538671517, "grad_norm": 1.068553938298465, "learning_rate": 5.770008139964334e-07, "loss": 0.0085, "step": 8566 }, { "epoch": 3.897634212920837, "grad_norm": 0.8619575621503434, "learning_rate": 5.765442264201362e-07, "loss": 0.0098, "step": 8567 }, { "epoch": 3.8980891719745223, "grad_norm": 1.084469625427511, "learning_rate": 5.760877960199596e-07, "loss": 0.0417, "step": 8568 }, { "epoch": 3.8985441310282076, "grad_norm": 1.4172558977413772, "learning_rate": 5.756315228331988e-07, "loss": 0.0282, "step": 8569 }, { "epoch": 3.8989990900818925, "grad_norm": 0.745285368634781, "learning_rate": 5.751754068971407e-07, "loss": 0.0083, "step": 8570 }, { "epoch": 3.899454049135578, "grad_norm": 1.3269807624725367, "learning_rate": 5.747194482490559e-07, "loss": 0.0325, "step": 8571 }, { "epoch": 3.899909008189263, "grad_norm": 1.3663153591662889, "learning_rate": 5.742636469262023e-07, "loss": 0.0142, "step": 8572 }, { "epoch": 3.900363967242948, "grad_norm": 1.7963633834619246, "learning_rate": 5.73808002965828e-07, "loss": 0.0601, "step": 8573 }, { "epoch": 3.9008189262966333, "grad_norm": 1.3740556636525063, "learning_rate": 5.733525164051648e-07, "loss": 0.0173, "step": 8574 }, { "epoch": 3.9012738853503186, "grad_norm": 1.1542953161134941, "learning_rate": 5.728971872814335e-07, "loss": 0.0216, "step": 8575 }, { "epoch": 3.901728844404004, "grad_norm": 0.9404310781414172, "learning_rate": 5.724420156318406e-07, "loss": 0.0055, "step": 8576 }, { "epoch": 3.902183803457689, "grad_norm": 0.9410650036124314, "learning_rate": 5.719870014935811e-07, "loss": 0.0283, "step": 8577 }, { "epoch": 3.902638762511374, "grad_norm": 1.1133580959108893, "learning_rate": 5.71532144903838e-07, "loss": 0.0088, "step": 8578 }, { "epoch": 3.9030937215650594, "grad_norm": 1.393415290747547, "learning_rate": 5.710774458997792e-07, "loss": 0.0152, "step": 8579 }, { "epoch": 3.9035486806187443, "grad_norm": 1.1036834767042005, "learning_rate": 5.706229045185604e-07, "loss": 0.0173, "step": 8580 }, { "epoch": 3.9040036396724296, "grad_norm": 0.710908371951688, "learning_rate": 5.701685207973243e-07, "loss": 0.0167, "step": 8581 }, { "epoch": 3.904458598726115, "grad_norm": 1.551942923445616, "learning_rate": 5.697142947732021e-07, "loss": 0.0356, "step": 8582 }, { "epoch": 3.9049135577797998, "grad_norm": 1.5163854927005915, "learning_rate": 5.692602264833103e-07, "loss": 0.0397, "step": 8583 }, { "epoch": 3.905368516833485, "grad_norm": 1.1757455427456647, "learning_rate": 5.688063159647539e-07, "loss": 0.0189, "step": 8584 }, { "epoch": 3.9058234758871704, "grad_norm": 1.0322111758278507, "learning_rate": 5.683525632546244e-07, "loss": 0.011, "step": 8585 }, { "epoch": 3.9062784349408552, "grad_norm": 1.162107152442943, "learning_rate": 5.678989683900002e-07, "loss": 0.0273, "step": 8586 }, { "epoch": 3.9067333939945406, "grad_norm": 0.9899948505438753, "learning_rate": 5.674455314079464e-07, "loss": 0.0144, "step": 8587 }, { "epoch": 3.907188353048226, "grad_norm": 1.4616908809535827, "learning_rate": 5.669922523455171e-07, "loss": 0.0291, "step": 8588 }, { "epoch": 3.9076433121019107, "grad_norm": 0.6524983238288512, "learning_rate": 5.665391312397514e-07, "loss": 0.0041, "step": 8589 }, { "epoch": 3.908098271155596, "grad_norm": 1.0833020064203143, "learning_rate": 5.660861681276758e-07, "loss": 0.009, "step": 8590 }, { "epoch": 3.9085532302092814, "grad_norm": 0.5716126199210907, "learning_rate": 5.65633363046306e-07, "loss": 0.0088, "step": 8591 }, { "epoch": 3.9090081892629662, "grad_norm": 2.276067081870815, "learning_rate": 5.651807160326414e-07, "loss": 0.0286, "step": 8592 }, { "epoch": 3.9094631483166515, "grad_norm": 1.1769175218991232, "learning_rate": 5.647282271236718e-07, "loss": 0.0196, "step": 8593 }, { "epoch": 3.909918107370337, "grad_norm": 0.3053133868923715, "learning_rate": 5.642758963563719e-07, "loss": 0.0066, "step": 8594 }, { "epoch": 3.9103730664240217, "grad_norm": 1.02405929958041, "learning_rate": 5.638237237677038e-07, "loss": 0.0156, "step": 8595 }, { "epoch": 3.910828025477707, "grad_norm": 0.6570781165199041, "learning_rate": 5.63371709394617e-07, "loss": 0.006, "step": 8596 }, { "epoch": 3.9112829845313923, "grad_norm": 0.48445040129218775, "learning_rate": 5.629198532740481e-07, "loss": 0.0032, "step": 8597 }, { "epoch": 3.911737943585077, "grad_norm": 0.6729205010591711, "learning_rate": 5.62468155442922e-07, "loss": 0.0102, "step": 8598 }, { "epoch": 3.9121929026387625, "grad_norm": 1.655469022619931, "learning_rate": 5.620166159381482e-07, "loss": 0.0165, "step": 8599 }, { "epoch": 3.912647861692448, "grad_norm": 0.3233827257620349, "learning_rate": 5.615652347966247e-07, "loss": 0.0031, "step": 8600 }, { "epoch": 3.9131028207461327, "grad_norm": 1.7987892634769487, "learning_rate": 5.611140120552358e-07, "loss": 0.0144, "step": 8601 }, { "epoch": 3.913557779799818, "grad_norm": 0.9933910281076908, "learning_rate": 5.606629477508543e-07, "loss": 0.0158, "step": 8602 }, { "epoch": 3.9140127388535033, "grad_norm": 0.8383559676909116, "learning_rate": 5.602120419203391e-07, "loss": 0.0349, "step": 8603 }, { "epoch": 3.914467697907188, "grad_norm": 0.8364755168239001, "learning_rate": 5.597612946005348e-07, "loss": 0.0114, "step": 8604 }, { "epoch": 3.9149226569608735, "grad_norm": 1.3491978186726388, "learning_rate": 5.593107058282765e-07, "loss": 0.0144, "step": 8605 }, { "epoch": 3.915377616014559, "grad_norm": 0.7075698969444528, "learning_rate": 5.588602756403822e-07, "loss": 0.0053, "step": 8606 }, { "epoch": 3.9158325750682437, "grad_norm": 0.5188664147675368, "learning_rate": 5.584100040736609e-07, "loss": 0.0106, "step": 8607 }, { "epoch": 3.916287534121929, "grad_norm": 1.1885950794920275, "learning_rate": 5.579598911649059e-07, "loss": 0.0114, "step": 8608 }, { "epoch": 3.9167424931756143, "grad_norm": 1.2097939745623765, "learning_rate": 5.575099369508985e-07, "loss": 0.0211, "step": 8609 }, { "epoch": 3.917197452229299, "grad_norm": 0.7035613777558455, "learning_rate": 5.570601414684062e-07, "loss": 0.0139, "step": 8610 }, { "epoch": 3.9176524112829845, "grad_norm": 1.51686482815332, "learning_rate": 5.566105047541848e-07, "loss": 0.0091, "step": 8611 }, { "epoch": 3.91810737033667, "grad_norm": 0.9786922439749565, "learning_rate": 5.561610268449775e-07, "loss": 0.0205, "step": 8612 }, { "epoch": 3.9185623293903546, "grad_norm": 0.6415869381101763, "learning_rate": 5.557117077775125e-07, "loss": 0.0115, "step": 8613 }, { "epoch": 3.91901728844404, "grad_norm": 0.8854428493890332, "learning_rate": 5.552625475885065e-07, "loss": 0.019, "step": 8614 }, { "epoch": 3.9194722474977253, "grad_norm": 0.8993375682760628, "learning_rate": 5.548135463146622e-07, "loss": 0.0048, "step": 8615 }, { "epoch": 3.91992720655141, "grad_norm": 1.2434453600335211, "learning_rate": 5.543647039926712e-07, "loss": 0.0267, "step": 8616 }, { "epoch": 3.9203821656050954, "grad_norm": 0.6930624054089322, "learning_rate": 5.5391602065921e-07, "loss": 0.0056, "step": 8617 }, { "epoch": 3.9208371246587808, "grad_norm": 0.8482498975522991, "learning_rate": 5.534674963509429e-07, "loss": 0.0217, "step": 8618 }, { "epoch": 3.9212920837124656, "grad_norm": 0.896605220266131, "learning_rate": 5.530191311045218e-07, "loss": 0.0114, "step": 8619 }, { "epoch": 3.921747042766151, "grad_norm": 0.5333648125333689, "learning_rate": 5.525709249565842e-07, "loss": 0.0045, "step": 8620 }, { "epoch": 3.9222020018198362, "grad_norm": 1.0184247114608702, "learning_rate": 5.521228779437568e-07, "loss": 0.0313, "step": 8621 }, { "epoch": 3.922656960873521, "grad_norm": 0.525314383913808, "learning_rate": 5.516749901026514e-07, "loss": 0.0171, "step": 8622 }, { "epoch": 3.9231119199272064, "grad_norm": 0.8001851017089467, "learning_rate": 5.512272614698672e-07, "loss": 0.0243, "step": 8623 }, { "epoch": 3.9235668789808917, "grad_norm": 0.9618863642454606, "learning_rate": 5.507796920819905e-07, "loss": 0.0179, "step": 8624 }, { "epoch": 3.924021838034577, "grad_norm": 1.033319512329814, "learning_rate": 5.503322819755941e-07, "loss": 0.0146, "step": 8625 }, { "epoch": 3.924476797088262, "grad_norm": 0.7696746892693678, "learning_rate": 5.498850311872392e-07, "loss": 0.0384, "step": 8626 }, { "epoch": 3.9249317561419472, "grad_norm": 0.6260619756400455, "learning_rate": 5.494379397534733e-07, "loss": 0.006, "step": 8627 }, { "epoch": 3.9253867151956325, "grad_norm": 1.015232125072497, "learning_rate": 5.489910077108304e-07, "loss": 0.0146, "step": 8628 }, { "epoch": 3.9258416742493174, "grad_norm": 1.0663966606117505, "learning_rate": 5.485442350958317e-07, "loss": 0.0067, "step": 8629 }, { "epoch": 3.9262966333030027, "grad_norm": 1.108365078022967, "learning_rate": 5.480976219449849e-07, "loss": 0.0136, "step": 8630 }, { "epoch": 3.926751592356688, "grad_norm": 0.8178975792920549, "learning_rate": 5.476511682947861e-07, "loss": 0.0136, "step": 8631 }, { "epoch": 3.9272065514103733, "grad_norm": 0.47994907400391373, "learning_rate": 5.472048741817165e-07, "loss": 0.013, "step": 8632 }, { "epoch": 3.927661510464058, "grad_norm": 1.221572800353289, "learning_rate": 5.467587396422467e-07, "loss": 0.0191, "step": 8633 }, { "epoch": 3.9281164695177435, "grad_norm": 0.8194417672032338, "learning_rate": 5.463127647128319e-07, "loss": 0.0148, "step": 8634 }, { "epoch": 3.928571428571429, "grad_norm": 0.6960703421062906, "learning_rate": 5.458669494299143e-07, "loss": 0.01, "step": 8635 }, { "epoch": 3.9290263876251137, "grad_norm": 1.1607477641241595, "learning_rate": 5.454212938299256e-07, "loss": 0.02, "step": 8636 }, { "epoch": 3.929481346678799, "grad_norm": 1.4930951668484196, "learning_rate": 5.449757979492821e-07, "loss": 0.0272, "step": 8637 }, { "epoch": 3.9299363057324843, "grad_norm": 1.2653038263229932, "learning_rate": 5.445304618243874e-07, "loss": 0.06, "step": 8638 }, { "epoch": 3.930391264786169, "grad_norm": 1.3165556886101375, "learning_rate": 5.44085285491632e-07, "loss": 0.0108, "step": 8639 }, { "epoch": 3.9308462238398545, "grad_norm": 0.8752352130245407, "learning_rate": 5.436402689873941e-07, "loss": 0.0091, "step": 8640 }, { "epoch": 3.93130118289354, "grad_norm": 1.0185213100120385, "learning_rate": 5.431954123480393e-07, "loss": 0.0156, "step": 8641 }, { "epoch": 3.9317561419472247, "grad_norm": 1.1698550809717092, "learning_rate": 5.427507156099185e-07, "loss": 0.038, "step": 8642 }, { "epoch": 3.93221110100091, "grad_norm": 1.0614865737401613, "learning_rate": 5.423061788093706e-07, "loss": 0.0226, "step": 8643 }, { "epoch": 3.9326660600545953, "grad_norm": 0.7991315002370278, "learning_rate": 5.418618019827199e-07, "loss": 0.0145, "step": 8644 }, { "epoch": 3.93312101910828, "grad_norm": 0.9635265984101691, "learning_rate": 5.414175851662806e-07, "loss": 0.0152, "step": 8645 }, { "epoch": 3.9335759781619655, "grad_norm": 1.0976069051072204, "learning_rate": 5.409735283963511e-07, "loss": 0.0361, "step": 8646 }, { "epoch": 3.934030937215651, "grad_norm": 0.653534688092675, "learning_rate": 5.405296317092182e-07, "loss": 0.006, "step": 8647 }, { "epoch": 3.9344858962693356, "grad_norm": 0.5961496937715537, "learning_rate": 5.40085895141155e-07, "loss": 0.012, "step": 8648 }, { "epoch": 3.934940855323021, "grad_norm": 0.595609458995723, "learning_rate": 5.396423187284208e-07, "loss": 0.0044, "step": 8649 }, { "epoch": 3.9353958143767063, "grad_norm": 1.1454261798278125, "learning_rate": 5.391989025072644e-07, "loss": 0.0126, "step": 8650 }, { "epoch": 3.935850773430391, "grad_norm": 1.4008591513366349, "learning_rate": 5.387556465139185e-07, "loss": 0.0317, "step": 8651 }, { "epoch": 3.9363057324840764, "grad_norm": 0.9971771861784312, "learning_rate": 5.383125507846043e-07, "loss": 0.0376, "step": 8652 }, { "epoch": 3.9367606915377618, "grad_norm": 0.8894748780053554, "learning_rate": 5.37869615355529e-07, "loss": 0.0141, "step": 8653 }, { "epoch": 3.9372156505914466, "grad_norm": 1.4530568566013289, "learning_rate": 5.374268402628877e-07, "loss": 0.0323, "step": 8654 }, { "epoch": 3.937670609645132, "grad_norm": 1.1582422690762895, "learning_rate": 5.369842255428628e-07, "loss": 0.011, "step": 8655 }, { "epoch": 3.9381255686988172, "grad_norm": 0.553407626958086, "learning_rate": 5.365417712316223e-07, "loss": 0.006, "step": 8656 }, { "epoch": 3.938580527752502, "grad_norm": 0.9278085663379685, "learning_rate": 5.360994773653211e-07, "loss": 0.0058, "step": 8657 }, { "epoch": 3.9390354868061874, "grad_norm": 1.3339019145614541, "learning_rate": 5.356573439801019e-07, "loss": 0.0167, "step": 8658 }, { "epoch": 3.9394904458598727, "grad_norm": 0.8024575972907306, "learning_rate": 5.352153711120928e-07, "loss": 0.0084, "step": 8659 }, { "epoch": 3.9399454049135576, "grad_norm": 1.5601907944890299, "learning_rate": 5.347735587974106e-07, "loss": 0.0224, "step": 8660 }, { "epoch": 3.940400363967243, "grad_norm": 1.0164131348341345, "learning_rate": 5.343319070721592e-07, "loss": 0.0417, "step": 8661 }, { "epoch": 3.9408553230209282, "grad_norm": 1.2925228165672114, "learning_rate": 5.338904159724275e-07, "loss": 0.0198, "step": 8662 }, { "epoch": 3.941310282074613, "grad_norm": 1.277305727316255, "learning_rate": 5.334490855342922e-07, "loss": 0.0304, "step": 8663 }, { "epoch": 3.9417652411282984, "grad_norm": 1.4476897234876094, "learning_rate": 5.330079157938159e-07, "loss": 0.0104, "step": 8664 }, { "epoch": 3.9422202001819837, "grad_norm": 0.928093819779046, "learning_rate": 5.325669067870503e-07, "loss": 0.0065, "step": 8665 }, { "epoch": 3.9426751592356686, "grad_norm": 0.6129503480209934, "learning_rate": 5.321260585500326e-07, "loss": 0.0109, "step": 8666 }, { "epoch": 3.943130118289354, "grad_norm": 1.358014010924264, "learning_rate": 5.316853711187858e-07, "loss": 0.0181, "step": 8667 }, { "epoch": 3.943585077343039, "grad_norm": 0.8407939988091633, "learning_rate": 5.312448445293225e-07, "loss": 0.0125, "step": 8668 }, { "epoch": 3.944040036396724, "grad_norm": 0.8561321671666512, "learning_rate": 5.308044788176387e-07, "loss": 0.0083, "step": 8669 }, { "epoch": 3.9444949954504094, "grad_norm": 0.7472119761273164, "learning_rate": 5.30364274019721e-07, "loss": 0.008, "step": 8670 }, { "epoch": 3.9449499545040947, "grad_norm": 1.1136249596968422, "learning_rate": 5.299242301715399e-07, "loss": 0.0271, "step": 8671 }, { "epoch": 3.9454049135577796, "grad_norm": 1.3404123009527777, "learning_rate": 5.294843473090539e-07, "loss": 0.0083, "step": 8672 }, { "epoch": 3.945859872611465, "grad_norm": 0.8484361878837143, "learning_rate": 5.290446254682074e-07, "loss": 0.02, "step": 8673 }, { "epoch": 3.94631483166515, "grad_norm": 0.9747616062579944, "learning_rate": 5.286050646849336e-07, "loss": 0.0116, "step": 8674 }, { "epoch": 3.946769790718835, "grad_norm": 0.8147586439745078, "learning_rate": 5.28165664995152e-07, "loss": 0.0183, "step": 8675 }, { "epoch": 3.9472247497725204, "grad_norm": 0.8200185670591856, "learning_rate": 5.277264264347673e-07, "loss": 0.0111, "step": 8676 }, { "epoch": 3.9476797088262057, "grad_norm": 1.2538759222281057, "learning_rate": 5.272873490396723e-07, "loss": 0.0182, "step": 8677 }, { "epoch": 3.9481346678798905, "grad_norm": 0.5962332694036871, "learning_rate": 5.268484328457457e-07, "loss": 0.0147, "step": 8678 }, { "epoch": 3.948589626933576, "grad_norm": 1.1790693861113752, "learning_rate": 5.264096778888555e-07, "loss": 0.0191, "step": 8679 }, { "epoch": 3.949044585987261, "grad_norm": 0.9973139918929886, "learning_rate": 5.259710842048535e-07, "loss": 0.0376, "step": 8680 }, { "epoch": 3.9494995450409465, "grad_norm": 1.0821046090040507, "learning_rate": 5.255326518295791e-07, "loss": 0.0299, "step": 8681 }, { "epoch": 3.9499545040946313, "grad_norm": 0.48295948781883297, "learning_rate": 5.250943807988607e-07, "loss": 0.0037, "step": 8682 }, { "epoch": 3.9504094631483166, "grad_norm": 0.9523311344718588, "learning_rate": 5.246562711485101e-07, "loss": 0.0193, "step": 8683 }, { "epoch": 3.950864422202002, "grad_norm": 1.2918991763543979, "learning_rate": 5.242183229143294e-07, "loss": 0.0318, "step": 8684 }, { "epoch": 3.951319381255687, "grad_norm": 1.0965786839383047, "learning_rate": 5.237805361321044e-07, "loss": 0.0232, "step": 8685 }, { "epoch": 3.951774340309372, "grad_norm": 0.8990520103437764, "learning_rate": 5.233429108376098e-07, "loss": 0.0292, "step": 8686 }, { "epoch": 3.9522292993630574, "grad_norm": 0.9826044263519114, "learning_rate": 5.22905447066605e-07, "loss": 0.02, "step": 8687 }, { "epoch": 3.9526842584167428, "grad_norm": 0.8511423644133704, "learning_rate": 5.224681448548388e-07, "loss": 0.0177, "step": 8688 }, { "epoch": 3.9531392174704276, "grad_norm": 0.5594976108590228, "learning_rate": 5.220310042380461e-07, "loss": 0.0308, "step": 8689 }, { "epoch": 3.953594176524113, "grad_norm": 0.8750067222724341, "learning_rate": 5.215940252519472e-07, "loss": 0.0067, "step": 8690 }, { "epoch": 3.9540491355777982, "grad_norm": 1.6572295817886091, "learning_rate": 5.211572079322499e-07, "loss": 0.021, "step": 8691 }, { "epoch": 3.954504094631483, "grad_norm": 1.4583742481729218, "learning_rate": 5.207205523146497e-07, "loss": 0.0321, "step": 8692 }, { "epoch": 3.9549590536851684, "grad_norm": 0.9637740749453334, "learning_rate": 5.202840584348265e-07, "loss": 0.0047, "step": 8693 }, { "epoch": 3.9554140127388537, "grad_norm": 1.1166391049644935, "learning_rate": 5.198477263284507e-07, "loss": 0.0223, "step": 8694 }, { "epoch": 3.9558689717925386, "grad_norm": 1.0853090214188477, "learning_rate": 5.194115560311755e-07, "loss": 0.0106, "step": 8695 }, { "epoch": 3.956323930846224, "grad_norm": 1.0669808553254516, "learning_rate": 5.189755475786446e-07, "loss": 0.0343, "step": 8696 }, { "epoch": 3.9567788898999092, "grad_norm": 1.199411031033764, "learning_rate": 5.185397010064855e-07, "loss": 0.0119, "step": 8697 }, { "epoch": 3.957233848953594, "grad_norm": 1.1498810274855191, "learning_rate": 5.181040163503132e-07, "loss": 0.0065, "step": 8698 }, { "epoch": 3.9576888080072794, "grad_norm": 0.644000232968435, "learning_rate": 5.176684936457313e-07, "loss": 0.0079, "step": 8699 }, { "epoch": 3.9581437670609647, "grad_norm": 0.5926726501595319, "learning_rate": 5.172331329283281e-07, "loss": 0.0093, "step": 8700 }, { "epoch": 3.9585987261146496, "grad_norm": 1.3297220831207555, "learning_rate": 5.167979342336787e-07, "loss": 0.0352, "step": 8701 }, { "epoch": 3.959053685168335, "grad_norm": 1.2596693099148535, "learning_rate": 5.163628975973459e-07, "loss": 0.0101, "step": 8702 }, { "epoch": 3.95950864422202, "grad_norm": 0.9885244508346562, "learning_rate": 5.159280230548789e-07, "loss": 0.0261, "step": 8703 }, { "epoch": 3.959963603275705, "grad_norm": 1.122041503652115, "learning_rate": 5.154933106418145e-07, "loss": 0.03, "step": 8704 }, { "epoch": 3.9604185623293904, "grad_norm": 0.2427614977936144, "learning_rate": 5.150587603936746e-07, "loss": 0.0018, "step": 8705 }, { "epoch": 3.9608735213830757, "grad_norm": 0.42259459136804917, "learning_rate": 5.146243723459692e-07, "loss": 0.0035, "step": 8706 }, { "epoch": 3.9613284804367606, "grad_norm": 1.0461900616898785, "learning_rate": 5.141901465341933e-07, "loss": 0.0191, "step": 8707 }, { "epoch": 3.961783439490446, "grad_norm": 1.1808050998572959, "learning_rate": 5.137560829938318e-07, "loss": 0.0234, "step": 8708 }, { "epoch": 3.962238398544131, "grad_norm": 1.6342689446001322, "learning_rate": 5.133221817603526e-07, "loss": 0.0454, "step": 8709 }, { "epoch": 3.962693357597816, "grad_norm": 1.055584952130033, "learning_rate": 5.128884428692136e-07, "loss": 0.0291, "step": 8710 }, { "epoch": 3.9631483166515014, "grad_norm": 1.2050496068081014, "learning_rate": 5.124548663558571e-07, "loss": 0.0208, "step": 8711 }, { "epoch": 3.9636032757051867, "grad_norm": 2.199703182882373, "learning_rate": 5.120214522557129e-07, "loss": 0.0134, "step": 8712 }, { "epoch": 3.9640582347588715, "grad_norm": 0.9140014160532166, "learning_rate": 5.115882006041983e-07, "loss": 0.0102, "step": 8713 }, { "epoch": 3.964513193812557, "grad_norm": 0.8301932687459863, "learning_rate": 5.111551114367166e-07, "loss": 0.0196, "step": 8714 }, { "epoch": 3.964968152866242, "grad_norm": 0.814486301324695, "learning_rate": 5.107221847886576e-07, "loss": 0.0124, "step": 8715 }, { "epoch": 3.965423111919927, "grad_norm": 1.3360737176133506, "learning_rate": 5.102894206953976e-07, "loss": 0.0168, "step": 8716 }, { "epoch": 3.9658780709736123, "grad_norm": 0.9435869669173691, "learning_rate": 5.098568191923007e-07, "loss": 0.008, "step": 8717 }, { "epoch": 3.9663330300272976, "grad_norm": 1.1339049101206546, "learning_rate": 5.094243803147175e-07, "loss": 0.0093, "step": 8718 }, { "epoch": 3.9667879890809825, "grad_norm": 0.9825951207630396, "learning_rate": 5.089921040979847e-07, "loss": 0.0137, "step": 8719 }, { "epoch": 3.967242948134668, "grad_norm": 0.8111285570662514, "learning_rate": 5.085599905774261e-07, "loss": 0.02, "step": 8720 }, { "epoch": 3.967697907188353, "grad_norm": 1.2176499674359855, "learning_rate": 5.081280397883509e-07, "loss": 0.0106, "step": 8721 }, { "epoch": 3.968152866242038, "grad_norm": 1.2881763487285163, "learning_rate": 5.076962517660577e-07, "loss": 0.012, "step": 8722 }, { "epoch": 3.9686078252957233, "grad_norm": 0.9802128854977287, "learning_rate": 5.072646265458292e-07, "loss": 0.0081, "step": 8723 }, { "epoch": 3.9690627843494086, "grad_norm": 0.8221378326325012, "learning_rate": 5.068331641629367e-07, "loss": 0.011, "step": 8724 }, { "epoch": 3.9695177434030935, "grad_norm": 0.8583706348288953, "learning_rate": 5.064018646526372e-07, "loss": 0.0126, "step": 8725 }, { "epoch": 3.969972702456779, "grad_norm": 0.6390994886807762, "learning_rate": 5.059707280501736e-07, "loss": 0.0166, "step": 8726 }, { "epoch": 3.970427661510464, "grad_norm": 0.6609645476379415, "learning_rate": 5.055397543907778e-07, "loss": 0.0109, "step": 8727 }, { "epoch": 3.970882620564149, "grad_norm": 0.853597966434476, "learning_rate": 5.051089437096662e-07, "loss": 0.0078, "step": 8728 }, { "epoch": 3.9713375796178343, "grad_norm": 1.5173464586255239, "learning_rate": 5.046782960420432e-07, "loss": 0.0196, "step": 8729 }, { "epoch": 3.9717925386715196, "grad_norm": 1.3874528577439789, "learning_rate": 5.042478114230981e-07, "loss": 0.0265, "step": 8730 }, { "epoch": 3.9722474977252045, "grad_norm": 0.8958734927297398, "learning_rate": 5.038174898880099e-07, "loss": 0.0104, "step": 8731 }, { "epoch": 3.97270245677889, "grad_norm": 1.020994397849771, "learning_rate": 5.033873314719409e-07, "loss": 0.0077, "step": 8732 }, { "epoch": 3.973157415832575, "grad_norm": 0.751426907279519, "learning_rate": 5.029573362100434e-07, "loss": 0.0155, "step": 8733 }, { "epoch": 3.9736123748862604, "grad_norm": 0.7915351011670297, "learning_rate": 5.025275041374538e-07, "loss": 0.0164, "step": 8734 }, { "epoch": 3.9740673339399453, "grad_norm": 1.2655238875546937, "learning_rate": 5.020978352892961e-07, "loss": 0.0085, "step": 8735 }, { "epoch": 3.9745222929936306, "grad_norm": 1.2262899521907886, "learning_rate": 5.016683297006803e-07, "loss": 0.0264, "step": 8736 }, { "epoch": 3.974977252047316, "grad_norm": 0.9125420827643771, "learning_rate": 5.012389874067039e-07, "loss": 0.0083, "step": 8737 }, { "epoch": 3.9754322111010008, "grad_norm": 0.5745865557745252, "learning_rate": 5.00809808442452e-07, "loss": 0.0057, "step": 8738 }, { "epoch": 3.975887170154686, "grad_norm": 1.0793573063508652, "learning_rate": 5.003807928429941e-07, "loss": 0.0145, "step": 8739 }, { "epoch": 3.9763421292083714, "grad_norm": 0.9476882291133458, "learning_rate": 4.999519406433878e-07, "loss": 0.0126, "step": 8740 }, { "epoch": 3.9767970882620567, "grad_norm": 1.1130823555009306, "learning_rate": 4.995232518786761e-07, "loss": 0.0073, "step": 8741 }, { "epoch": 3.9772520473157416, "grad_norm": 1.2329388321971315, "learning_rate": 4.990947265838906e-07, "loss": 0.0108, "step": 8742 }, { "epoch": 3.977707006369427, "grad_norm": 0.9334664567855798, "learning_rate": 4.986663647940481e-07, "loss": 0.0192, "step": 8743 }, { "epoch": 3.978161965423112, "grad_norm": 0.6451050338753588, "learning_rate": 4.982381665441519e-07, "loss": 0.0042, "step": 8744 }, { "epoch": 3.978616924476797, "grad_norm": 1.0000862985788528, "learning_rate": 4.978101318691936e-07, "loss": 0.0436, "step": 8745 }, { "epoch": 3.9790718835304824, "grad_norm": 2.7814517349713936, "learning_rate": 4.973822608041484e-07, "loss": 0.0158, "step": 8746 }, { "epoch": 3.9795268425841677, "grad_norm": 0.8097592745484016, "learning_rate": 4.96954553383982e-07, "loss": 0.0147, "step": 8747 }, { "epoch": 3.9799818016378525, "grad_norm": 1.1614793950886328, "learning_rate": 4.965270096436439e-07, "loss": 0.0082, "step": 8748 }, { "epoch": 3.980436760691538, "grad_norm": 0.7909429687048624, "learning_rate": 4.960996296180709e-07, "loss": 0.0153, "step": 8749 }, { "epoch": 3.980891719745223, "grad_norm": 1.1285638594953207, "learning_rate": 4.956724133421861e-07, "loss": 0.0309, "step": 8750 }, { "epoch": 3.981346678798908, "grad_norm": 0.9654738794213935, "learning_rate": 4.952453608509e-07, "loss": 0.0084, "step": 8751 }, { "epoch": 3.9818016378525933, "grad_norm": 0.5128803042914035, "learning_rate": 4.948184721791105e-07, "loss": 0.0038, "step": 8752 }, { "epoch": 3.9822565969062786, "grad_norm": 1.0051693680248381, "learning_rate": 4.943917473616999e-07, "loss": 0.029, "step": 8753 }, { "epoch": 3.9827115559599635, "grad_norm": 1.580418049237971, "learning_rate": 4.939651864335384e-07, "loss": 0.0063, "step": 8754 }, { "epoch": 3.983166515013649, "grad_norm": 0.9190016927169106, "learning_rate": 4.935387894294825e-07, "loss": 0.0274, "step": 8755 }, { "epoch": 3.983621474067334, "grad_norm": 0.9750724752763313, "learning_rate": 4.931125563843758e-07, "loss": 0.0128, "step": 8756 }, { "epoch": 3.984076433121019, "grad_norm": 0.9016818598482951, "learning_rate": 4.926864873330483e-07, "loss": 0.0177, "step": 8757 }, { "epoch": 3.9845313921747043, "grad_norm": 0.7570816918337403, "learning_rate": 4.922605823103152e-07, "loss": 0.0058, "step": 8758 }, { "epoch": 3.9849863512283896, "grad_norm": 0.891840003431888, "learning_rate": 4.918348413509813e-07, "loss": 0.0032, "step": 8759 }, { "epoch": 3.9854413102820745, "grad_norm": 1.2639132898891172, "learning_rate": 4.914092644898347e-07, "loss": 0.0128, "step": 8760 }, { "epoch": 3.98589626933576, "grad_norm": 1.0502193486150637, "learning_rate": 4.909838517616528e-07, "loss": 0.0212, "step": 8761 }, { "epoch": 3.986351228389445, "grad_norm": 0.3767489765939362, "learning_rate": 4.90558603201198e-07, "loss": 0.0027, "step": 8762 }, { "epoch": 3.98680618744313, "grad_norm": 2.6747398526949033, "learning_rate": 4.901335188432194e-07, "loss": 0.0185, "step": 8763 }, { "epoch": 3.9872611464968153, "grad_norm": 0.4167693888293392, "learning_rate": 4.897085987224534e-07, "loss": 0.0029, "step": 8764 }, { "epoch": 3.9877161055505006, "grad_norm": 0.977149864825625, "learning_rate": 4.892838428736211e-07, "loss": 0.0108, "step": 8765 }, { "epoch": 3.9881710646041855, "grad_norm": 1.1431622581647092, "learning_rate": 4.888592513314338e-07, "loss": 0.0089, "step": 8766 }, { "epoch": 3.988626023657871, "grad_norm": 1.1340465255285326, "learning_rate": 4.884348241305864e-07, "loss": 0.0182, "step": 8767 }, { "epoch": 3.989080982711556, "grad_norm": 0.685356447330023, "learning_rate": 4.880105613057612e-07, "loss": 0.0079, "step": 8768 }, { "epoch": 3.989535941765241, "grad_norm": 0.9598753400970724, "learning_rate": 4.875864628916266e-07, "loss": 0.0291, "step": 8769 }, { "epoch": 3.9899909008189263, "grad_norm": 0.9172998253704834, "learning_rate": 4.871625289228376e-07, "loss": 0.0055, "step": 8770 }, { "epoch": 3.9904458598726116, "grad_norm": 1.5493723999735072, "learning_rate": 4.867387594340378e-07, "loss": 0.0302, "step": 8771 }, { "epoch": 3.9909008189262964, "grad_norm": 1.0926066310061102, "learning_rate": 4.863151544598541e-07, "loss": 0.0237, "step": 8772 }, { "epoch": 3.9913557779799818, "grad_norm": 0.6209254757731776, "learning_rate": 4.858917140349026e-07, "loss": 0.0146, "step": 8773 }, { "epoch": 3.991810737033667, "grad_norm": 1.3263027644855192, "learning_rate": 4.854684381937846e-07, "loss": 0.0185, "step": 8774 }, { "epoch": 3.992265696087352, "grad_norm": 1.5949751818748326, "learning_rate": 4.850453269710878e-07, "loss": 0.0323, "step": 8775 }, { "epoch": 3.9927206551410372, "grad_norm": 2.524499115728809, "learning_rate": 4.846223804013883e-07, "loss": 0.0256, "step": 8776 }, { "epoch": 3.9931756141947226, "grad_norm": 1.003462315012334, "learning_rate": 4.841995985192463e-07, "loss": 0.018, "step": 8777 }, { "epoch": 3.9936305732484074, "grad_norm": 0.697997635002052, "learning_rate": 4.837769813592097e-07, "loss": 0.0042, "step": 8778 }, { "epoch": 3.9940855323020927, "grad_norm": 0.9450102185035006, "learning_rate": 4.833545289558125e-07, "loss": 0.0408, "step": 8779 }, { "epoch": 3.994540491355778, "grad_norm": 1.2046533565509756, "learning_rate": 4.829322413435761e-07, "loss": 0.0133, "step": 8780 }, { "epoch": 3.994995450409463, "grad_norm": 1.2820332232680884, "learning_rate": 4.825101185570086e-07, "loss": 0.013, "step": 8781 }, { "epoch": 3.9954504094631482, "grad_norm": 0.4932256163534949, "learning_rate": 4.820881606306033e-07, "loss": 0.0029, "step": 8782 }, { "epoch": 3.9959053685168335, "grad_norm": 0.7855115168573326, "learning_rate": 4.816663675988406e-07, "loss": 0.0044, "step": 8783 }, { "epoch": 3.9963603275705184, "grad_norm": 0.9219709545806718, "learning_rate": 4.81244739496187e-07, "loss": 0.0105, "step": 8784 }, { "epoch": 3.9968152866242037, "grad_norm": 0.515185565386514, "learning_rate": 4.808232763570972e-07, "loss": 0.0049, "step": 8785 }, { "epoch": 3.997270245677889, "grad_norm": 1.2359226393577387, "learning_rate": 4.804019782160105e-07, "loss": 0.0186, "step": 8786 }, { "epoch": 3.997725204731574, "grad_norm": 0.7622182116477785, "learning_rate": 4.799808451073539e-07, "loss": 0.0087, "step": 8787 }, { "epoch": 3.998180163785259, "grad_norm": 1.1231111543772347, "learning_rate": 4.795598770655407e-07, "loss": 0.0419, "step": 8788 }, { "epoch": 3.9986351228389445, "grad_norm": 0.7976570620994488, "learning_rate": 4.791390741249691e-07, "loss": 0.0146, "step": 8789 }, { "epoch": 3.99909008189263, "grad_norm": 1.0488282659111805, "learning_rate": 4.787184363200273e-07, "loss": 0.0242, "step": 8790 }, { "epoch": 3.9995450409463147, "grad_norm": 1.4545215399426576, "learning_rate": 4.782979636850866e-07, "loss": 0.0251, "step": 8791 }, { "epoch": 4.0, "grad_norm": 0.9890160227463002, "learning_rate": 4.778776562545063e-07, "loss": 0.0096, "step": 8792 }, { "epoch": 4.000454959053685, "grad_norm": 0.4475630616153182, "learning_rate": 4.774575140626317e-07, "loss": 0.0045, "step": 8793 }, { "epoch": 4.000909918107371, "grad_norm": 0.18603767242511685, "learning_rate": 4.770375371437952e-07, "loss": 0.0017, "step": 8794 }, { "epoch": 4.0013648771610555, "grad_norm": 0.5940634020523442, "learning_rate": 4.766177255323162e-07, "loss": 0.0046, "step": 8795 }, { "epoch": 4.00181983621474, "grad_norm": 0.4439999234305639, "learning_rate": 4.76198079262499e-07, "loss": 0.0036, "step": 8796 }, { "epoch": 4.002274795268426, "grad_norm": 0.560460778444776, "learning_rate": 4.7577859836863555e-07, "loss": 0.007, "step": 8797 }, { "epoch": 4.002729754322111, "grad_norm": 0.28609965516701763, "learning_rate": 4.7535928288500314e-07, "loss": 0.0026, "step": 8798 }, { "epoch": 4.003184713375796, "grad_norm": 0.27603648331941555, "learning_rate": 4.749401328458675e-07, "loss": 0.0014, "step": 8799 }, { "epoch": 4.003639672429482, "grad_norm": 0.16974864485021643, "learning_rate": 4.7452114828547835e-07, "loss": 0.001, "step": 8800 }, { "epoch": 4.0040946314831665, "grad_norm": 0.3583711849079781, "learning_rate": 4.741023292380748e-07, "loss": 0.0121, "step": 8801 }, { "epoch": 4.004549590536851, "grad_norm": 0.5672372157400036, "learning_rate": 4.7368367573787987e-07, "loss": 0.0159, "step": 8802 }, { "epoch": 4.005004549590537, "grad_norm": 0.3254563350074784, "learning_rate": 4.7326518781910443e-07, "loss": 0.0036, "step": 8803 }, { "epoch": 4.005459508644222, "grad_norm": 0.594467536996485, "learning_rate": 4.7284686551594435e-07, "loss": 0.0063, "step": 8804 }, { "epoch": 4.005914467697907, "grad_norm": 0.8809881632363672, "learning_rate": 4.724287088625845e-07, "loss": 0.0142, "step": 8805 }, { "epoch": 4.006369426751593, "grad_norm": 0.22588899671993737, "learning_rate": 4.720107178931943e-07, "loss": 0.0017, "step": 8806 }, { "epoch": 4.0068243858052774, "grad_norm": 0.5789802630045991, "learning_rate": 4.7159289264192917e-07, "loss": 0.0102, "step": 8807 }, { "epoch": 4.007279344858962, "grad_norm": 0.45078819400274994, "learning_rate": 4.711752331429334e-07, "loss": 0.0112, "step": 8808 }, { "epoch": 4.007734303912648, "grad_norm": 0.5283647334233481, "learning_rate": 4.7075773943033474e-07, "loss": 0.0057, "step": 8809 }, { "epoch": 4.008189262966333, "grad_norm": 0.5046087495424525, "learning_rate": 4.7034041153825013e-07, "loss": 0.0099, "step": 8810 }, { "epoch": 4.008644222020018, "grad_norm": 0.21706592943650044, "learning_rate": 4.699232495007816e-07, "loss": 0.0013, "step": 8811 }, { "epoch": 4.0090991810737036, "grad_norm": 0.8097434190844917, "learning_rate": 4.69506253352017e-07, "loss": 0.0134, "step": 8812 }, { "epoch": 4.009554140127388, "grad_norm": 0.3472758003685839, "learning_rate": 4.6908942312603125e-07, "loss": 0.002, "step": 8813 }, { "epoch": 4.010009099181073, "grad_norm": 0.4623288852966439, "learning_rate": 4.686727588568865e-07, "loss": 0.0114, "step": 8814 }, { "epoch": 4.010464058234759, "grad_norm": 0.5189740624183826, "learning_rate": 4.682562605786309e-07, "loss": 0.0033, "step": 8815 }, { "epoch": 4.010919017288444, "grad_norm": 0.5602517894330693, "learning_rate": 4.678399283252985e-07, "loss": 0.0053, "step": 8816 }, { "epoch": 4.011373976342129, "grad_norm": 0.37450500189338787, "learning_rate": 4.6742376213091e-07, "loss": 0.004, "step": 8817 }, { "epoch": 4.0118289353958145, "grad_norm": 0.3036913203733925, "learning_rate": 4.670077620294719e-07, "loss": 0.0021, "step": 8818 }, { "epoch": 4.012283894449499, "grad_norm": 0.21205345927312072, "learning_rate": 4.665919280549794e-07, "loss": 0.0012, "step": 8819 }, { "epoch": 4.012738853503185, "grad_norm": 0.8829843409297297, "learning_rate": 4.661762602414116e-07, "loss": 0.0157, "step": 8820 }, { "epoch": 4.01319381255687, "grad_norm": 0.19557249917290023, "learning_rate": 4.6576075862273445e-07, "loss": 0.0016, "step": 8821 }, { "epoch": 4.013648771610555, "grad_norm": 0.3257939334683237, "learning_rate": 4.6534542323290244e-07, "loss": 0.0032, "step": 8822 }, { "epoch": 4.014103730664241, "grad_norm": 0.5331358895735017, "learning_rate": 4.649302541058531e-07, "loss": 0.004, "step": 8823 }, { "epoch": 4.0145586897179255, "grad_norm": 0.8405846295390302, "learning_rate": 4.645152512755141e-07, "loss": 0.014, "step": 8824 }, { "epoch": 4.01501364877161, "grad_norm": 0.48180458084705335, "learning_rate": 4.641004147757963e-07, "loss": 0.002, "step": 8825 }, { "epoch": 4.015468607825296, "grad_norm": 0.4005734769054017, "learning_rate": 4.6368574464059875e-07, "loss": 0.0064, "step": 8826 }, { "epoch": 4.015923566878981, "grad_norm": 0.5632894908282152, "learning_rate": 4.632712409038054e-07, "loss": 0.0135, "step": 8827 }, { "epoch": 4.016378525932666, "grad_norm": 0.3103001104170144, "learning_rate": 4.6285690359928856e-07, "loss": 0.0024, "step": 8828 }, { "epoch": 4.016833484986352, "grad_norm": 0.7223786889706539, "learning_rate": 4.6244273276090655e-07, "loss": 0.0125, "step": 8829 }, { "epoch": 4.0172884440400365, "grad_norm": 0.6134513456656842, "learning_rate": 4.620287284225028e-07, "loss": 0.0055, "step": 8830 }, { "epoch": 4.017743403093721, "grad_norm": 0.8708811882291376, "learning_rate": 4.616148906179083e-07, "loss": 0.0196, "step": 8831 }, { "epoch": 4.018198362147407, "grad_norm": 0.48015635657204603, "learning_rate": 4.612012193809387e-07, "loss": 0.0076, "step": 8832 }, { "epoch": 4.018653321201092, "grad_norm": 0.34518790094464086, "learning_rate": 4.60787714745399e-07, "loss": 0.0038, "step": 8833 }, { "epoch": 4.019108280254777, "grad_norm": 0.36891720134523337, "learning_rate": 4.6037437674507827e-07, "loss": 0.0039, "step": 8834 }, { "epoch": 4.019563239308463, "grad_norm": 0.7172850432351958, "learning_rate": 4.59961205413752e-07, "loss": 0.0177, "step": 8835 }, { "epoch": 4.0200181983621475, "grad_norm": 0.47474891917744666, "learning_rate": 4.5954820078518397e-07, "loss": 0.0031, "step": 8836 }, { "epoch": 4.020473157415832, "grad_norm": 0.6852005001365392, "learning_rate": 4.591353628931222e-07, "loss": 0.0196, "step": 8837 }, { "epoch": 4.020928116469518, "grad_norm": 1.3487941170021251, "learning_rate": 4.587226917713017e-07, "loss": 0.0102, "step": 8838 }, { "epoch": 4.021383075523203, "grad_norm": 0.12950750866893554, "learning_rate": 4.5831018745344487e-07, "loss": 0.0007, "step": 8839 }, { "epoch": 4.021838034576888, "grad_norm": 0.3447120311138188, "learning_rate": 4.5789784997325946e-07, "loss": 0.0028, "step": 8840 }, { "epoch": 4.022292993630574, "grad_norm": 0.16451928652981718, "learning_rate": 4.5748567936443974e-07, "loss": 0.001, "step": 8841 }, { "epoch": 4.022747952684258, "grad_norm": 0.6085637003878673, "learning_rate": 4.5707367566066584e-07, "loss": 0.0116, "step": 8842 }, { "epoch": 4.023202911737943, "grad_norm": 0.6913794001826817, "learning_rate": 4.566618388956054e-07, "loss": 0.0049, "step": 8843 }, { "epoch": 4.023657870791629, "grad_norm": 0.5586616617802772, "learning_rate": 4.5625016910291223e-07, "loss": 0.0039, "step": 8844 }, { "epoch": 4.024112829845314, "grad_norm": 0.3841517240339153, "learning_rate": 4.558386663162259e-07, "loss": 0.0025, "step": 8845 }, { "epoch": 4.024567788898999, "grad_norm": 0.49492544391705173, "learning_rate": 4.554273305691725e-07, "loss": 0.0069, "step": 8846 }, { "epoch": 4.0250227479526846, "grad_norm": 0.2795330064755246, "learning_rate": 4.550161618953636e-07, "loss": 0.0017, "step": 8847 }, { "epoch": 4.025477707006369, "grad_norm": 0.35488274519483193, "learning_rate": 4.5460516032839963e-07, "loss": 0.0036, "step": 8848 }, { "epoch": 4.025932666060054, "grad_norm": 0.11990652627185439, "learning_rate": 4.541943259018644e-07, "loss": 0.0007, "step": 8849 }, { "epoch": 4.02638762511374, "grad_norm": 0.7402266668683801, "learning_rate": 4.5378365864933076e-07, "loss": 0.0189, "step": 8850 }, { "epoch": 4.026842584167425, "grad_norm": 0.6225139459316867, "learning_rate": 4.5337315860435574e-07, "loss": 0.028, "step": 8851 }, { "epoch": 4.02729754322111, "grad_norm": 0.4963302105634239, "learning_rate": 4.529628258004831e-07, "loss": 0.0053, "step": 8852 }, { "epoch": 4.0277525022747955, "grad_norm": 0.2992654229270557, "learning_rate": 4.525526602712449e-07, "loss": 0.0043, "step": 8853 }, { "epoch": 4.02820746132848, "grad_norm": 0.8131619322444351, "learning_rate": 4.521426620501568e-07, "loss": 0.0147, "step": 8854 }, { "epoch": 4.028662420382165, "grad_norm": 0.16394509367689897, "learning_rate": 4.5173283117072254e-07, "loss": 0.0005, "step": 8855 }, { "epoch": 4.029117379435851, "grad_norm": 0.7910423252281373, "learning_rate": 4.5132316766643064e-07, "loss": 0.0096, "step": 8856 }, { "epoch": 4.029572338489536, "grad_norm": 0.26239889567688146, "learning_rate": 4.5091367157075794e-07, "loss": 0.002, "step": 8857 }, { "epoch": 4.030027297543221, "grad_norm": 0.19550373561091644, "learning_rate": 4.5050434291716684e-07, "loss": 0.0011, "step": 8858 }, { "epoch": 4.0304822565969065, "grad_norm": 0.6033027633315493, "learning_rate": 4.500951817391055e-07, "loss": 0.0076, "step": 8859 }, { "epoch": 4.030937215650591, "grad_norm": 0.3045203320042466, "learning_rate": 4.496861880700085e-07, "loss": 0.0026, "step": 8860 }, { "epoch": 4.031392174704276, "grad_norm": 0.14807583917618902, "learning_rate": 4.492773619432966e-07, "loss": 0.0009, "step": 8861 }, { "epoch": 4.031847133757962, "grad_norm": 0.7558163183134748, "learning_rate": 4.488687033923783e-07, "loss": 0.009, "step": 8862 }, { "epoch": 4.032302092811647, "grad_norm": 0.23030112000166197, "learning_rate": 4.48460212450646e-07, "loss": 0.0011, "step": 8863 }, { "epoch": 4.032757051865332, "grad_norm": 0.3214200072769125, "learning_rate": 4.4805188915148095e-07, "loss": 0.0016, "step": 8864 }, { "epoch": 4.0332120109190175, "grad_norm": 0.8247328613113796, "learning_rate": 4.4764373352824934e-07, "loss": 0.005, "step": 8865 }, { "epoch": 4.033666969972702, "grad_norm": 1.020403217390097, "learning_rate": 4.4723574561430254e-07, "loss": 0.0051, "step": 8866 }, { "epoch": 4.034121929026387, "grad_norm": 0.584433737313302, "learning_rate": 4.4682792544298137e-07, "loss": 0.0023, "step": 8867 }, { "epoch": 4.034576888080073, "grad_norm": 0.875174653120777, "learning_rate": 4.4642027304761e-07, "loss": 0.002, "step": 8868 }, { "epoch": 4.035031847133758, "grad_norm": 0.5132892227454089, "learning_rate": 4.460127884614998e-07, "loss": 0.0032, "step": 8869 }, { "epoch": 4.035486806187443, "grad_norm": 0.40981744418256116, "learning_rate": 4.4560547171794837e-07, "loss": 0.0035, "step": 8870 }, { "epoch": 4.0359417652411285, "grad_norm": 0.20429183397848633, "learning_rate": 4.451983228502402e-07, "loss": 0.0013, "step": 8871 }, { "epoch": 4.036396724294813, "grad_norm": 0.2829083074931466, "learning_rate": 4.447913418916464e-07, "loss": 0.0027, "step": 8872 }, { "epoch": 4.036851683348498, "grad_norm": 0.4931919041818383, "learning_rate": 4.4438452887542255e-07, "loss": 0.0021, "step": 8873 }, { "epoch": 4.037306642402184, "grad_norm": 0.534584318219316, "learning_rate": 4.4397788383481215e-07, "loss": 0.0092, "step": 8874 }, { "epoch": 4.037761601455869, "grad_norm": 0.5030443967211639, "learning_rate": 4.4357140680304416e-07, "loss": 0.0079, "step": 8875 }, { "epoch": 4.038216560509555, "grad_norm": 0.5876080785884027, "learning_rate": 4.4316509781333365e-07, "loss": 0.0077, "step": 8876 }, { "epoch": 4.038671519563239, "grad_norm": 0.3179090686399854, "learning_rate": 4.4275895689888243e-07, "loss": 0.0036, "step": 8877 }, { "epoch": 4.039126478616924, "grad_norm": 0.9637317042179895, "learning_rate": 4.4235298409287977e-07, "loss": 0.0068, "step": 8878 }, { "epoch": 4.03958143767061, "grad_norm": 0.6340269265157724, "learning_rate": 4.419471794284988e-07, "loss": 0.0035, "step": 8879 }, { "epoch": 4.040036396724295, "grad_norm": 0.40168209382751513, "learning_rate": 4.4154154293890003e-07, "loss": 0.0012, "step": 8880 }, { "epoch": 4.04049135577798, "grad_norm": 0.24126991544323742, "learning_rate": 4.4113607465723017e-07, "loss": 0.0018, "step": 8881 }, { "epoch": 4.0409463148316656, "grad_norm": 1.0452221857412196, "learning_rate": 4.407307746166231e-07, "loss": 0.0049, "step": 8882 }, { "epoch": 4.04140127388535, "grad_norm": 0.5516266739195607, "learning_rate": 4.4032564285019756e-07, "loss": 0.0053, "step": 8883 }, { "epoch": 4.041856232939035, "grad_norm": 0.6311443666867417, "learning_rate": 4.399206793910582e-07, "loss": 0.0099, "step": 8884 }, { "epoch": 4.042311191992721, "grad_norm": 0.660660991900591, "learning_rate": 4.3951588427229855e-07, "loss": 0.0108, "step": 8885 }, { "epoch": 4.042766151046406, "grad_norm": 0.8580245105643047, "learning_rate": 4.3911125752699513e-07, "loss": 0.0111, "step": 8886 }, { "epoch": 4.043221110100091, "grad_norm": 0.6231459039152066, "learning_rate": 4.387067991882135e-07, "loss": 0.003, "step": 8887 }, { "epoch": 4.0436760691537765, "grad_norm": 0.19674558007425363, "learning_rate": 4.3830250928900335e-07, "loss": 0.0016, "step": 8888 }, { "epoch": 4.044131028207461, "grad_norm": 0.21802086319465624, "learning_rate": 4.378983878624018e-07, "loss": 0.0014, "step": 8889 }, { "epoch": 4.044585987261146, "grad_norm": 0.4992944140356869, "learning_rate": 4.374944349414309e-07, "loss": 0.0066, "step": 8890 }, { "epoch": 4.045040946314832, "grad_norm": 0.3344435117990978, "learning_rate": 4.3709065055910075e-07, "loss": 0.0032, "step": 8891 }, { "epoch": 4.045495905368517, "grad_norm": 0.29219745209501075, "learning_rate": 4.36687034748407e-07, "loss": 0.0018, "step": 8892 }, { "epoch": 4.045950864422202, "grad_norm": 0.6475647421697943, "learning_rate": 4.3628358754233103e-07, "loss": 0.0333, "step": 8893 }, { "epoch": 4.0464058234758875, "grad_norm": 0.08932916528792785, "learning_rate": 4.3588030897384057e-07, "loss": 0.0005, "step": 8894 }, { "epoch": 4.046860782529572, "grad_norm": 0.4022565269401293, "learning_rate": 4.3547719907588937e-07, "loss": 0.0014, "step": 8895 }, { "epoch": 4.047315741583257, "grad_norm": 0.9758562940079167, "learning_rate": 4.350742578814185e-07, "loss": 0.0239, "step": 8896 }, { "epoch": 4.047770700636943, "grad_norm": 0.5736344529998822, "learning_rate": 4.346714854233544e-07, "loss": 0.0045, "step": 8897 }, { "epoch": 4.048225659690628, "grad_norm": 0.3057716639655835, "learning_rate": 4.3426888173460886e-07, "loss": 0.0022, "step": 8898 }, { "epoch": 4.048680618744313, "grad_norm": 0.10885320486024447, "learning_rate": 4.3386644684808214e-07, "loss": 0.0007, "step": 8899 }, { "epoch": 4.0491355777979985, "grad_norm": 0.14139816900548002, "learning_rate": 4.3346418079665803e-07, "loss": 0.0007, "step": 8900 }, { "epoch": 4.049590536851683, "grad_norm": 0.05982706888985871, "learning_rate": 4.3306208361320963e-07, "loss": 0.0003, "step": 8901 }, { "epoch": 4.050045495905368, "grad_norm": 0.37560155870028195, "learning_rate": 4.326601553305934e-07, "loss": 0.0044, "step": 8902 }, { "epoch": 4.050500454959054, "grad_norm": 0.4874677343821864, "learning_rate": 4.3225839598165315e-07, "loss": 0.004, "step": 8903 }, { "epoch": 4.050955414012739, "grad_norm": 1.9085879385530915, "learning_rate": 4.318568055992184e-07, "loss": 0.0066, "step": 8904 }, { "epoch": 4.051410373066424, "grad_norm": 0.8198822501989165, "learning_rate": 4.3145538421610564e-07, "loss": 0.0273, "step": 8905 }, { "epoch": 4.0518653321201095, "grad_norm": 0.6965972064679407, "learning_rate": 4.3105413186511847e-07, "loss": 0.0071, "step": 8906 }, { "epoch": 4.052320291173794, "grad_norm": 1.474661949272094, "learning_rate": 4.306530485790439e-07, "loss": 0.0111, "step": 8907 }, { "epoch": 4.052775250227479, "grad_norm": 0.33533488234128106, "learning_rate": 4.302521343906574e-07, "loss": 0.002, "step": 8908 }, { "epoch": 4.053230209281165, "grad_norm": 0.5210975338562948, "learning_rate": 4.298513893327194e-07, "loss": 0.009, "step": 8909 }, { "epoch": 4.05368516833485, "grad_norm": 0.178937361056195, "learning_rate": 4.2945081343797687e-07, "loss": 0.0008, "step": 8910 }, { "epoch": 4.054140127388535, "grad_norm": 0.5766558569967528, "learning_rate": 4.2905040673916376e-07, "loss": 0.0105, "step": 8911 }, { "epoch": 4.05459508644222, "grad_norm": 0.5349159068541721, "learning_rate": 4.2865016926899844e-07, "loss": 0.0111, "step": 8912 }, { "epoch": 4.055050045495905, "grad_norm": 0.7738314659711798, "learning_rate": 4.2825010106018776e-07, "loss": 0.0265, "step": 8913 }, { "epoch": 4.05550500454959, "grad_norm": 0.6038945339345899, "learning_rate": 4.2785020214542285e-07, "loss": 0.0112, "step": 8914 }, { "epoch": 4.055959963603276, "grad_norm": 0.6702246028396758, "learning_rate": 4.274504725573811e-07, "loss": 0.0143, "step": 8915 }, { "epoch": 4.056414922656961, "grad_norm": 0.35517569256599, "learning_rate": 4.270509123287278e-07, "loss": 0.0028, "step": 8916 }, { "epoch": 4.056869881710646, "grad_norm": 0.48478065467995984, "learning_rate": 4.266515214921127e-07, "loss": 0.0077, "step": 8917 }, { "epoch": 4.057324840764331, "grad_norm": 0.5375092104978245, "learning_rate": 4.2625230008017185e-07, "loss": 0.002, "step": 8918 }, { "epoch": 4.057779799818016, "grad_norm": 0.23906454751512357, "learning_rate": 4.258532481255276e-07, "loss": 0.002, "step": 8919 }, { "epoch": 4.058234758871701, "grad_norm": 0.268370395647064, "learning_rate": 4.254543656607893e-07, "loss": 0.0015, "step": 8920 }, { "epoch": 4.058689717925387, "grad_norm": 0.05382668741572471, "learning_rate": 4.2505565271855205e-07, "loss": 0.0002, "step": 8921 }, { "epoch": 4.059144676979072, "grad_norm": 0.10536834746085053, "learning_rate": 4.246571093313967e-07, "loss": 0.0006, "step": 8922 }, { "epoch": 4.059599636032757, "grad_norm": 0.9770614743306882, "learning_rate": 4.2425873553189003e-07, "loss": 0.0193, "step": 8923 }, { "epoch": 4.060054595086442, "grad_norm": 0.6523638155650862, "learning_rate": 4.238605313525851e-07, "loss": 0.0107, "step": 8924 }, { "epoch": 4.060509554140127, "grad_norm": 0.48975490318972303, "learning_rate": 4.234624968260223e-07, "loss": 0.0075, "step": 8925 }, { "epoch": 4.060964513193812, "grad_norm": 0.32205970007209417, "learning_rate": 4.2306463198472593e-07, "loss": 0.0026, "step": 8926 }, { "epoch": 4.061419472247498, "grad_norm": 0.1948084132867714, "learning_rate": 4.2266693686120933e-07, "loss": 0.0012, "step": 8927 }, { "epoch": 4.061874431301183, "grad_norm": 3.2547280271264594, "learning_rate": 4.222694114879694e-07, "loss": 0.011, "step": 8928 }, { "epoch": 4.0623293903548685, "grad_norm": 0.42014188333503155, "learning_rate": 4.2187205589748953e-07, "loss": 0.0074, "step": 8929 }, { "epoch": 4.062784349408553, "grad_norm": 0.7777753786490325, "learning_rate": 4.2147487012224127e-07, "loss": 0.004, "step": 8930 }, { "epoch": 4.063239308462238, "grad_norm": 0.43404097736010344, "learning_rate": 4.2107785419468e-07, "loss": 0.003, "step": 8931 }, { "epoch": 4.063694267515924, "grad_norm": 0.4489931746225123, "learning_rate": 4.2068100814724814e-07, "loss": 0.0008, "step": 8932 }, { "epoch": 4.064149226569609, "grad_norm": 0.8880793731532659, "learning_rate": 4.2028433201237366e-07, "loss": 0.0106, "step": 8933 }, { "epoch": 4.064604185623294, "grad_norm": 0.7609322968082919, "learning_rate": 4.1988782582247146e-07, "loss": 0.0066, "step": 8934 }, { "epoch": 4.0650591446769795, "grad_norm": 0.22014772950328887, "learning_rate": 4.1949148960994335e-07, "loss": 0.0014, "step": 8935 }, { "epoch": 4.065514103730664, "grad_norm": 0.3068636137831001, "learning_rate": 4.1909532340717484e-07, "loss": 0.0026, "step": 8936 }, { "epoch": 4.065969062784349, "grad_norm": 0.5879201336889759, "learning_rate": 4.186993272465395e-07, "loss": 0.0093, "step": 8937 }, { "epoch": 4.066424021838035, "grad_norm": 0.4951600519862137, "learning_rate": 4.183035011603953e-07, "loss": 0.0047, "step": 8938 }, { "epoch": 4.06687898089172, "grad_norm": 1.1774232062426935, "learning_rate": 4.179078451810889e-07, "loss": 0.0077, "step": 8939 }, { "epoch": 4.067333939945405, "grad_norm": 0.888464494214015, "learning_rate": 4.1751235934094994e-07, "loss": 0.0066, "step": 8940 }, { "epoch": 4.0677888989990905, "grad_norm": 0.4815391543748049, "learning_rate": 4.171170436722974e-07, "loss": 0.0051, "step": 8941 }, { "epoch": 4.068243858052775, "grad_norm": 0.7315496376180145, "learning_rate": 4.1672189820743365e-07, "loss": 0.0147, "step": 8942 }, { "epoch": 4.06869881710646, "grad_norm": 0.7274265191348468, "learning_rate": 4.1632692297864765e-07, "loss": 0.0106, "step": 8943 }, { "epoch": 4.069153776160146, "grad_norm": 0.32291736469853405, "learning_rate": 4.159321180182166e-07, "loss": 0.0037, "step": 8944 }, { "epoch": 4.069608735213831, "grad_norm": 0.4161233843266604, "learning_rate": 4.155374833584011e-07, "loss": 0.0027, "step": 8945 }, { "epoch": 4.070063694267516, "grad_norm": 1.0257194156037674, "learning_rate": 4.1514301903144926e-07, "loss": 0.0116, "step": 8946 }, { "epoch": 4.070518653321201, "grad_norm": 0.879353853650363, "learning_rate": 4.147487250695942e-07, "loss": 0.0095, "step": 8947 }, { "epoch": 4.070973612374886, "grad_norm": 0.5509542255569546, "learning_rate": 4.1435460150505675e-07, "loss": 0.0089, "step": 8948 }, { "epoch": 4.071428571428571, "grad_norm": 0.3263172844187778, "learning_rate": 4.139606483700423e-07, "loss": 0.0092, "step": 8949 }, { "epoch": 4.071883530482257, "grad_norm": 0.7110461873513519, "learning_rate": 4.1356686569674344e-07, "loss": 0.0169, "step": 8950 }, { "epoch": 4.072338489535942, "grad_norm": 1.042068407820813, "learning_rate": 4.1317325351733827e-07, "loss": 0.0094, "step": 8951 }, { "epoch": 4.072793448589627, "grad_norm": 0.957239618485897, "learning_rate": 4.1277981186399084e-07, "loss": 0.0152, "step": 8952 }, { "epoch": 4.073248407643312, "grad_norm": 0.4334678758506964, "learning_rate": 4.123865407688507e-07, "loss": 0.0016, "step": 8953 }, { "epoch": 4.073703366696997, "grad_norm": 0.31880755518695375, "learning_rate": 4.119934402640549e-07, "loss": 0.0013, "step": 8954 }, { "epoch": 4.074158325750682, "grad_norm": 0.3439040255155942, "learning_rate": 4.1160051038172636e-07, "loss": 0.0045, "step": 8955 }, { "epoch": 4.074613284804368, "grad_norm": 0.4582714073670656, "learning_rate": 4.1120775115397305e-07, "loss": 0.0144, "step": 8956 }, { "epoch": 4.075068243858053, "grad_norm": 0.4188655802012719, "learning_rate": 4.1081516261288953e-07, "loss": 0.002, "step": 8957 }, { "epoch": 4.075523202911738, "grad_norm": 0.3480222723495707, "learning_rate": 4.104227447905554e-07, "loss": 0.0005, "step": 8958 }, { "epoch": 4.075978161965423, "grad_norm": 0.6513791806731924, "learning_rate": 4.100304977190389e-07, "loss": 0.0205, "step": 8959 }, { "epoch": 4.076433121019108, "grad_norm": 0.5623280979905413, "learning_rate": 4.0963842143039194e-07, "loss": 0.0061, "step": 8960 }, { "epoch": 4.076888080072793, "grad_norm": 0.5829549299519512, "learning_rate": 4.092465159566525e-07, "loss": 0.0055, "step": 8961 }, { "epoch": 4.077343039126479, "grad_norm": 0.4074715062767335, "learning_rate": 4.088547813298466e-07, "loss": 0.0035, "step": 8962 }, { "epoch": 4.077797998180164, "grad_norm": 0.8573424340624157, "learning_rate": 4.084632175819836e-07, "loss": 0.0077, "step": 8963 }, { "epoch": 4.078252957233849, "grad_norm": 0.7360933549931776, "learning_rate": 4.080718247450621e-07, "loss": 0.0067, "step": 8964 }, { "epoch": 4.078707916287534, "grad_norm": 0.3796359570961542, "learning_rate": 4.076806028510638e-07, "loss": 0.0025, "step": 8965 }, { "epoch": 4.079162875341219, "grad_norm": 0.18633640295767898, "learning_rate": 4.0728955193195806e-07, "loss": 0.0019, "step": 8966 }, { "epoch": 4.079617834394904, "grad_norm": 0.7545604847596348, "learning_rate": 4.068986720196988e-07, "loss": 0.0089, "step": 8967 }, { "epoch": 4.08007279344859, "grad_norm": 0.2635886455230726, "learning_rate": 4.0650796314622767e-07, "loss": 0.0023, "step": 8968 }, { "epoch": 4.080527752502275, "grad_norm": 0.40271703515449986, "learning_rate": 4.061174253434724e-07, "loss": 0.0027, "step": 8969 }, { "epoch": 4.08098271155596, "grad_norm": 0.41664445122436705, "learning_rate": 4.057270586433451e-07, "loss": 0.0057, "step": 8970 }, { "epoch": 4.081437670609645, "grad_norm": 0.5620382453647341, "learning_rate": 4.0533686307774487e-07, "loss": 0.0035, "step": 8971 }, { "epoch": 4.08189262966333, "grad_norm": 0.40588876089405806, "learning_rate": 4.049468386785563e-07, "loss": 0.0013, "step": 8972 }, { "epoch": 4.082347588717015, "grad_norm": 1.1012057569443516, "learning_rate": 4.0455698547765155e-07, "loss": 0.0135, "step": 8973 }, { "epoch": 4.082802547770701, "grad_norm": 0.2474521194529223, "learning_rate": 4.0416730350688687e-07, "loss": 0.0042, "step": 8974 }, { "epoch": 4.083257506824386, "grad_norm": 0.40056719137537805, "learning_rate": 4.0377779279810485e-07, "loss": 0.0023, "step": 8975 }, { "epoch": 4.083712465878071, "grad_norm": 0.2628264293039178, "learning_rate": 4.033884533831359e-07, "loss": 0.0018, "step": 8976 }, { "epoch": 4.084167424931756, "grad_norm": 0.4100282199238081, "learning_rate": 4.0299928529379364e-07, "loss": 0.0075, "step": 8977 }, { "epoch": 4.084622383985441, "grad_norm": 0.3406159115746341, "learning_rate": 4.0261028856188017e-07, "loss": 0.0022, "step": 8978 }, { "epoch": 4.085077343039126, "grad_norm": 0.23619615178159387, "learning_rate": 4.022214632191826e-07, "loss": 0.0015, "step": 8979 }, { "epoch": 4.085532302092812, "grad_norm": 0.3243036094998844, "learning_rate": 4.018328092974733e-07, "loss": 0.0025, "step": 8980 }, { "epoch": 4.085987261146497, "grad_norm": 0.7360445976572362, "learning_rate": 4.014443268285118e-07, "loss": 0.0092, "step": 8981 }, { "epoch": 4.0864422202001816, "grad_norm": 0.4299812931147312, "learning_rate": 4.0105601584404214e-07, "loss": 0.0024, "step": 8982 }, { "epoch": 4.086897179253867, "grad_norm": 0.240905204764052, "learning_rate": 4.00667876375796e-07, "loss": 0.0011, "step": 8983 }, { "epoch": 4.087352138307552, "grad_norm": 0.2892746031158108, "learning_rate": 4.0027990845549146e-07, "loss": 0.0021, "step": 8984 }, { "epoch": 4.087807097361237, "grad_norm": 0.5481880065567188, "learning_rate": 3.9989211211483025e-07, "loss": 0.0106, "step": 8985 }, { "epoch": 4.088262056414923, "grad_norm": 0.3350059331145538, "learning_rate": 3.9950448738550166e-07, "loss": 0.003, "step": 8986 }, { "epoch": 4.088717015468608, "grad_norm": 0.9166154870438693, "learning_rate": 3.991170342991801e-07, "loss": 0.0093, "step": 8987 }, { "epoch": 4.089171974522293, "grad_norm": 0.28692218475619113, "learning_rate": 3.987297528875275e-07, "loss": 0.0016, "step": 8988 }, { "epoch": 4.089626933575978, "grad_norm": 0.5464734844605956, "learning_rate": 3.983426431821899e-07, "loss": 0.0082, "step": 8989 }, { "epoch": 4.090081892629663, "grad_norm": 0.6622253886184369, "learning_rate": 3.9795570521480087e-07, "loss": 0.0089, "step": 8990 }, { "epoch": 4.090536851683349, "grad_norm": 0.621791312552536, "learning_rate": 3.9756893901697904e-07, "loss": 0.0117, "step": 8991 }, { "epoch": 4.090991810737034, "grad_norm": 0.49512019829276144, "learning_rate": 3.971823446203282e-07, "loss": 0.0039, "step": 8992 }, { "epoch": 4.091446769790719, "grad_norm": 0.720035929029686, "learning_rate": 3.967959220564405e-07, "loss": 0.0197, "step": 8993 }, { "epoch": 4.091901728844404, "grad_norm": 0.7723527643819349, "learning_rate": 3.964096713568924e-07, "loss": 0.0134, "step": 8994 }, { "epoch": 4.092356687898089, "grad_norm": 1.155593039977891, "learning_rate": 3.9602359255324574e-07, "loss": 0.0331, "step": 8995 }, { "epoch": 4.092811646951774, "grad_norm": 0.479638479670972, "learning_rate": 3.956376856770494e-07, "loss": 0.0033, "step": 8996 }, { "epoch": 4.09326660600546, "grad_norm": 1.3998148556644323, "learning_rate": 3.952519507598382e-07, "loss": 0.0069, "step": 8997 }, { "epoch": 4.093721565059145, "grad_norm": 0.5587792802216957, "learning_rate": 3.94866387833133e-07, "loss": 0.0073, "step": 8998 }, { "epoch": 4.09417652411283, "grad_norm": 1.2339927317602266, "learning_rate": 3.9448099692843994e-07, "loss": 0.0209, "step": 8999 }, { "epoch": 4.094631483166515, "grad_norm": 0.6208271851210835, "learning_rate": 3.940957780772514e-07, "loss": 0.0037, "step": 9000 }, { "epoch": 4.0950864422202, "grad_norm": 0.35204550371652116, "learning_rate": 3.93710731311045e-07, "loss": 0.0026, "step": 9001 }, { "epoch": 4.095541401273885, "grad_norm": 1.1538698272134988, "learning_rate": 3.933258566612863e-07, "loss": 0.0082, "step": 9002 }, { "epoch": 4.095996360327571, "grad_norm": 0.732969906230072, "learning_rate": 3.929411541594247e-07, "loss": 0.0057, "step": 9003 }, { "epoch": 4.096451319381256, "grad_norm": 0.6785502962991478, "learning_rate": 3.925566238368969e-07, "loss": 0.0113, "step": 9004 }, { "epoch": 4.096906278434941, "grad_norm": 0.3307011846409374, "learning_rate": 3.9217226572512453e-07, "loss": 0.0028, "step": 9005 }, { "epoch": 4.097361237488626, "grad_norm": 0.4969657637518024, "learning_rate": 3.917880798555154e-07, "loss": 0.0113, "step": 9006 }, { "epoch": 4.097816196542311, "grad_norm": 0.5508339729763557, "learning_rate": 3.9140406625946425e-07, "loss": 0.0052, "step": 9007 }, { "epoch": 4.098271155595996, "grad_norm": 0.5524081369900411, "learning_rate": 3.910202249683506e-07, "loss": 0.0064, "step": 9008 }, { "epoch": 4.098726114649682, "grad_norm": 1.3554716135123293, "learning_rate": 3.9063655601354e-07, "loss": 0.0031, "step": 9009 }, { "epoch": 4.099181073703367, "grad_norm": 0.48776488891418956, "learning_rate": 3.902530594263837e-07, "loss": 0.014, "step": 9010 }, { "epoch": 4.099636032757052, "grad_norm": 0.5196051650092911, "learning_rate": 3.898697352382197e-07, "loss": 0.0053, "step": 9011 }, { "epoch": 4.100090991810737, "grad_norm": 0.8104408768279396, "learning_rate": 3.8948658348037236e-07, "loss": 0.025, "step": 9012 }, { "epoch": 4.100545950864422, "grad_norm": 0.080829962053949, "learning_rate": 3.891036041841506e-07, "loss": 0.0002, "step": 9013 }, { "epoch": 4.101000909918107, "grad_norm": 0.23012734609236848, "learning_rate": 3.8872079738084934e-07, "loss": 0.0019, "step": 9014 }, { "epoch": 4.101455868971793, "grad_norm": 0.3769635587691822, "learning_rate": 3.883381631017502e-07, "loss": 0.0023, "step": 9015 }, { "epoch": 4.101910828025478, "grad_norm": 0.32681628649444255, "learning_rate": 3.8795570137811933e-07, "loss": 0.0025, "step": 9016 }, { "epoch": 4.1023657870791626, "grad_norm": 0.6931893432711573, "learning_rate": 3.8757341224121085e-07, "loss": 0.0177, "step": 9017 }, { "epoch": 4.102820746132848, "grad_norm": 0.8992867540615863, "learning_rate": 3.8719129572226425e-07, "loss": 0.0038, "step": 9018 }, { "epoch": 4.103275705186533, "grad_norm": 0.1828982892802334, "learning_rate": 3.8680935185250344e-07, "loss": 0.0009, "step": 9019 }, { "epoch": 4.103730664240218, "grad_norm": 0.545795048810617, "learning_rate": 3.864275806631393e-07, "loss": 0.0056, "step": 9020 }, { "epoch": 4.104185623293904, "grad_norm": 0.2599465046041267, "learning_rate": 3.8604598218536795e-07, "loss": 0.0016, "step": 9021 }, { "epoch": 4.104640582347589, "grad_norm": 0.520207118719701, "learning_rate": 3.8566455645037275e-07, "loss": 0.004, "step": 9022 }, { "epoch": 4.1050955414012735, "grad_norm": 0.6123234762123698, "learning_rate": 3.852833034893219e-07, "loss": 0.0027, "step": 9023 }, { "epoch": 4.105550500454959, "grad_norm": 0.5012133491241545, "learning_rate": 3.8490222333336906e-07, "loss": 0.0048, "step": 9024 }, { "epoch": 4.106005459508644, "grad_norm": 0.3487341506556214, "learning_rate": 3.845213160136552e-07, "loss": 0.0025, "step": 9025 }, { "epoch": 4.106460418562329, "grad_norm": 0.397340406980482, "learning_rate": 3.841405815613056e-07, "loss": 0.0102, "step": 9026 }, { "epoch": 4.106915377616015, "grad_norm": 0.6786607917783007, "learning_rate": 3.837600200074329e-07, "loss": 0.0216, "step": 9027 }, { "epoch": 4.1073703366697, "grad_norm": 0.47906318206304693, "learning_rate": 3.833796313831345e-07, "loss": 0.0023, "step": 9028 }, { "epoch": 4.1078252957233845, "grad_norm": 0.9420783021698584, "learning_rate": 3.8299941571949437e-07, "loss": 0.0133, "step": 9029 }, { "epoch": 4.10828025477707, "grad_norm": 0.6758343084207502, "learning_rate": 3.826193730475808e-07, "loss": 0.0092, "step": 9030 }, { "epoch": 4.108735213830755, "grad_norm": 0.3401986163568087, "learning_rate": 3.8223950339845024e-07, "loss": 0.0028, "step": 9031 }, { "epoch": 4.10919017288444, "grad_norm": 0.643898040664257, "learning_rate": 3.818598068031443e-07, "loss": 0.005, "step": 9032 }, { "epoch": 4.109645131938126, "grad_norm": 0.6612896968147358, "learning_rate": 3.814802832926895e-07, "loss": 0.0057, "step": 9033 }, { "epoch": 4.110100090991811, "grad_norm": 0.6490167902649726, "learning_rate": 3.811009328980986e-07, "loss": 0.0062, "step": 9034 }, { "epoch": 4.1105550500454955, "grad_norm": 0.859696687407586, "learning_rate": 3.807217556503703e-07, "loss": 0.0194, "step": 9035 }, { "epoch": 4.111010009099181, "grad_norm": 0.870337028895151, "learning_rate": 3.8034275158049e-07, "loss": 0.0055, "step": 9036 }, { "epoch": 4.111464968152866, "grad_norm": 0.06537810637843695, "learning_rate": 3.799639207194272e-07, "loss": 0.0003, "step": 9037 }, { "epoch": 4.111919927206552, "grad_norm": 0.5773998698249474, "learning_rate": 3.795852630981392e-07, "loss": 0.014, "step": 9038 }, { "epoch": 4.112374886260237, "grad_norm": 1.2153899666846102, "learning_rate": 3.792067787475681e-07, "loss": 0.0063, "step": 9039 }, { "epoch": 4.112829845313922, "grad_norm": 1.2615298752646595, "learning_rate": 3.788284676986409e-07, "loss": 0.0157, "step": 9040 }, { "epoch": 4.113284804367607, "grad_norm": 1.0471558201851596, "learning_rate": 3.784503299822728e-07, "loss": 0.0152, "step": 9041 }, { "epoch": 4.113739763421292, "grad_norm": 1.2350541554099599, "learning_rate": 3.780723656293628e-07, "loss": 0.0072, "step": 9042 }, { "epoch": 4.114194722474977, "grad_norm": 0.26203170969509143, "learning_rate": 3.7769457467079664e-07, "loss": 0.0016, "step": 9043 }, { "epoch": 4.114649681528663, "grad_norm": 0.8085540409074006, "learning_rate": 3.7731695713744493e-07, "loss": 0.0121, "step": 9044 }, { "epoch": 4.115104640582348, "grad_norm": 0.8206477685030159, "learning_rate": 3.769395130601655e-07, "loss": 0.005, "step": 9045 }, { "epoch": 4.115559599636033, "grad_norm": 0.7569286400882071, "learning_rate": 3.7656224246980207e-07, "loss": 0.0043, "step": 9046 }, { "epoch": 4.116014558689718, "grad_norm": 0.5907737674556077, "learning_rate": 3.761851453971829e-07, "loss": 0.0126, "step": 9047 }, { "epoch": 4.116469517743403, "grad_norm": 0.5336426906768313, "learning_rate": 3.7580822187312265e-07, "loss": 0.0012, "step": 9048 }, { "epoch": 4.116924476797088, "grad_norm": 0.2000753816030492, "learning_rate": 3.7543147192842075e-07, "loss": 0.0012, "step": 9049 }, { "epoch": 4.117379435850774, "grad_norm": 0.4876473513798249, "learning_rate": 3.750548955938654e-07, "loss": 0.0053, "step": 9050 }, { "epoch": 4.117834394904459, "grad_norm": 1.1599361279001044, "learning_rate": 3.746784929002273e-07, "loss": 0.0079, "step": 9051 }, { "epoch": 4.1182893539581436, "grad_norm": 1.1077963424169583, "learning_rate": 3.7430226387826534e-07, "loss": 0.0037, "step": 9052 }, { "epoch": 4.118744313011829, "grad_norm": 0.4928502774420678, "learning_rate": 3.739262085587228e-07, "loss": 0.0039, "step": 9053 }, { "epoch": 4.119199272065514, "grad_norm": 0.8298155196923959, "learning_rate": 3.7355032697232926e-07, "loss": 0.0143, "step": 9054 }, { "epoch": 4.119654231119199, "grad_norm": 0.7618883936378483, "learning_rate": 3.731746191497995e-07, "loss": 0.0172, "step": 9055 }, { "epoch": 4.120109190172885, "grad_norm": 0.1811799225100392, "learning_rate": 3.7279908512183576e-07, "loss": 0.0007, "step": 9056 }, { "epoch": 4.12056414922657, "grad_norm": 0.5016620443798204, "learning_rate": 3.7242372491912456e-07, "loss": 0.0028, "step": 9057 }, { "epoch": 4.1210191082802545, "grad_norm": 0.17560055721231, "learning_rate": 3.720485385723377e-07, "loss": 0.0009, "step": 9058 }, { "epoch": 4.12147406733394, "grad_norm": 0.7960733222253396, "learning_rate": 3.716735261121351e-07, "loss": 0.0079, "step": 9059 }, { "epoch": 4.121929026387625, "grad_norm": 0.6715133333626871, "learning_rate": 3.7129868756916013e-07, "loss": 0.013, "step": 9060 }, { "epoch": 4.12238398544131, "grad_norm": 0.3513955692125617, "learning_rate": 3.709240229740435e-07, "loss": 0.002, "step": 9061 }, { "epoch": 4.122838944494996, "grad_norm": 0.6804506677270292, "learning_rate": 3.7054953235740125e-07, "loss": 0.0132, "step": 9062 }, { "epoch": 4.123293903548681, "grad_norm": 0.3945162658160078, "learning_rate": 3.701752157498345e-07, "loss": 0.0035, "step": 9063 }, { "epoch": 4.1237488626023655, "grad_norm": 0.46512264089908395, "learning_rate": 3.698010731819304e-07, "loss": 0.0013, "step": 9064 }, { "epoch": 4.124203821656051, "grad_norm": 0.4936371404357973, "learning_rate": 3.694271046842629e-07, "loss": 0.0044, "step": 9065 }, { "epoch": 4.124658780709736, "grad_norm": 0.5963487273092747, "learning_rate": 3.690533102873911e-07, "loss": 0.0081, "step": 9066 }, { "epoch": 4.125113739763421, "grad_norm": 0.8115703858707116, "learning_rate": 3.686796900218598e-07, "loss": 0.0023, "step": 9067 }, { "epoch": 4.125568698817107, "grad_norm": 0.36682462260163334, "learning_rate": 3.683062439181992e-07, "loss": 0.0033, "step": 9068 }, { "epoch": 4.126023657870792, "grad_norm": 0.4619664502452838, "learning_rate": 3.6793297200692494e-07, "loss": 0.0052, "step": 9069 }, { "epoch": 4.1264786169244765, "grad_norm": 0.5169070977802155, "learning_rate": 3.6755987431854046e-07, "loss": 0.0037, "step": 9070 }, { "epoch": 4.126933575978162, "grad_norm": 0.15767614293498328, "learning_rate": 3.6718695088353323e-07, "loss": 0.0005, "step": 9071 }, { "epoch": 4.127388535031847, "grad_norm": 0.30976397660479565, "learning_rate": 3.6681420173237585e-07, "loss": 0.0022, "step": 9072 }, { "epoch": 4.127843494085532, "grad_norm": 1.0727873969197255, "learning_rate": 3.6644162689552925e-07, "loss": 0.0184, "step": 9073 }, { "epoch": 4.128298453139218, "grad_norm": 0.16224689660356612, "learning_rate": 3.660692264034374e-07, "loss": 0.0009, "step": 9074 }, { "epoch": 4.128753412192903, "grad_norm": 0.36059486680096564, "learning_rate": 3.6569700028653205e-07, "loss": 0.0031, "step": 9075 }, { "epoch": 4.1292083712465875, "grad_norm": 0.3120321660638545, "learning_rate": 3.6532494857522944e-07, "loss": 0.0025, "step": 9076 }, { "epoch": 4.129663330300273, "grad_norm": 0.33945851571142655, "learning_rate": 3.649530712999319e-07, "loss": 0.0008, "step": 9077 }, { "epoch": 4.130118289353958, "grad_norm": 0.20607458220559233, "learning_rate": 3.645813684910271e-07, "loss": 0.0012, "step": 9078 }, { "epoch": 4.130573248407643, "grad_norm": 0.14382065307718642, "learning_rate": 3.6420984017888934e-07, "loss": 0.0009, "step": 9079 }, { "epoch": 4.131028207461329, "grad_norm": 0.6917656975728576, "learning_rate": 3.6383848639387876e-07, "loss": 0.0042, "step": 9080 }, { "epoch": 4.131483166515014, "grad_norm": 0.5147618492132571, "learning_rate": 3.6346730716634026e-07, "loss": 0.003, "step": 9081 }, { "epoch": 4.131938125568698, "grad_norm": 0.49139698007800653, "learning_rate": 3.6309630252660514e-07, "loss": 0.0022, "step": 9082 }, { "epoch": 4.132393084622384, "grad_norm": 2.284669350273881, "learning_rate": 3.627254725049892e-07, "loss": 0.0294, "step": 9083 }, { "epoch": 4.132848043676069, "grad_norm": 0.7935768861570938, "learning_rate": 3.6235481713179644e-07, "loss": 0.0051, "step": 9084 }, { "epoch": 4.133303002729754, "grad_norm": 1.5420142855161132, "learning_rate": 3.619843364373146e-07, "loss": 0.0112, "step": 9085 }, { "epoch": 4.13375796178344, "grad_norm": 0.7023849968520004, "learning_rate": 3.6161403045181704e-07, "loss": 0.0102, "step": 9086 }, { "epoch": 4.1342129208371245, "grad_norm": 0.5399492879091048, "learning_rate": 3.6124389920556445e-07, "loss": 0.0116, "step": 9087 }, { "epoch": 4.134667879890809, "grad_norm": 0.6379287057897552, "learning_rate": 3.608739427288013e-07, "loss": 0.0051, "step": 9088 }, { "epoch": 4.135122838944495, "grad_norm": 0.9246933070162452, "learning_rate": 3.605041610517601e-07, "loss": 0.0116, "step": 9089 }, { "epoch": 4.13557779799818, "grad_norm": 0.7417657699885851, "learning_rate": 3.601345542046569e-07, "loss": 0.0189, "step": 9090 }, { "epoch": 4.136032757051865, "grad_norm": 0.379851889243767, "learning_rate": 3.597651222176943e-07, "loss": 0.0046, "step": 9091 }, { "epoch": 4.136487716105551, "grad_norm": 0.25228437792793657, "learning_rate": 3.593958651210608e-07, "loss": 0.0012, "step": 9092 }, { "epoch": 4.1369426751592355, "grad_norm": 0.8819953965059972, "learning_rate": 3.590267829449298e-07, "loss": 0.0121, "step": 9093 }, { "epoch": 4.13739763421292, "grad_norm": 0.0982072337436747, "learning_rate": 3.586578757194614e-07, "loss": 0.0004, "step": 9094 }, { "epoch": 4.137852593266606, "grad_norm": 0.9108751013197662, "learning_rate": 3.5828914347480175e-07, "loss": 0.019, "step": 9095 }, { "epoch": 4.138307552320291, "grad_norm": 0.36933424536404486, "learning_rate": 3.5792058624108143e-07, "loss": 0.0134, "step": 9096 }, { "epoch": 4.138762511373977, "grad_norm": 0.7068494631963054, "learning_rate": 3.575522040484172e-07, "loss": 0.006, "step": 9097 }, { "epoch": 4.139217470427662, "grad_norm": 0.5048180841929737, "learning_rate": 3.571839969269114e-07, "loss": 0.0092, "step": 9098 }, { "epoch": 4.1396724294813465, "grad_norm": 0.571623174430884, "learning_rate": 3.568159649066527e-07, "loss": 0.0077, "step": 9099 }, { "epoch": 4.140127388535032, "grad_norm": 0.4676656823572997, "learning_rate": 3.5644810801771454e-07, "loss": 0.0025, "step": 9100 }, { "epoch": 4.140582347588717, "grad_norm": 2.4222235899363005, "learning_rate": 3.560804262901571e-07, "loss": 0.0118, "step": 9101 }, { "epoch": 4.141037306642402, "grad_norm": 0.9168014790148133, "learning_rate": 3.5571291975402545e-07, "loss": 0.0231, "step": 9102 }, { "epoch": 4.141492265696088, "grad_norm": 0.9287477036476876, "learning_rate": 3.5534558843935e-07, "loss": 0.0021, "step": 9103 }, { "epoch": 4.141947224749773, "grad_norm": 1.0141734510881188, "learning_rate": 3.549784323761485e-07, "loss": 0.0214, "step": 9104 }, { "epoch": 4.1424021838034575, "grad_norm": 0.7792812426019553, "learning_rate": 3.546114515944224e-07, "loss": 0.0082, "step": 9105 }, { "epoch": 4.142857142857143, "grad_norm": 0.4909049145706677, "learning_rate": 3.5424464612416025e-07, "loss": 0.0096, "step": 9106 }, { "epoch": 4.143312101910828, "grad_norm": 0.7247566619079496, "learning_rate": 3.538780159953348e-07, "loss": 0.0018, "step": 9107 }, { "epoch": 4.143767060964513, "grad_norm": 0.09812029809130293, "learning_rate": 3.5351156123790614e-07, "loss": 0.0004, "step": 9108 }, { "epoch": 4.144222020018199, "grad_norm": 0.1933668221678777, "learning_rate": 3.5314528188181984e-07, "loss": 0.0009, "step": 9109 }, { "epoch": 4.144676979071884, "grad_norm": 0.4837733580627216, "learning_rate": 3.527791779570058e-07, "loss": 0.0064, "step": 9110 }, { "epoch": 4.1451319381255685, "grad_norm": 0.5022215657110249, "learning_rate": 3.5241324949338075e-07, "loss": 0.0035, "step": 9111 }, { "epoch": 4.145586897179254, "grad_norm": 0.6668402199605945, "learning_rate": 3.520474965208459e-07, "loss": 0.0159, "step": 9112 }, { "epoch": 4.146041856232939, "grad_norm": 0.7867345467261244, "learning_rate": 3.516819190692902e-07, "loss": 0.0166, "step": 9113 }, { "epoch": 4.146496815286624, "grad_norm": 0.5011584937031608, "learning_rate": 3.513165171685856e-07, "loss": 0.0037, "step": 9114 }, { "epoch": 4.14695177434031, "grad_norm": 0.8760216222483778, "learning_rate": 3.509512908485926e-07, "loss": 0.0058, "step": 9115 }, { "epoch": 4.147406733393995, "grad_norm": 0.30486493553030436, "learning_rate": 3.505862401391552e-07, "loss": 0.0013, "step": 9116 }, { "epoch": 4.147861692447679, "grad_norm": 0.2357033991719677, "learning_rate": 3.5022136507010277e-07, "loss": 0.0016, "step": 9117 }, { "epoch": 4.148316651501365, "grad_norm": 0.7560795605030102, "learning_rate": 3.498566656712529e-07, "loss": 0.0016, "step": 9118 }, { "epoch": 4.14877161055505, "grad_norm": 0.826928435539528, "learning_rate": 3.4949214197240624e-07, "loss": 0.0081, "step": 9119 }, { "epoch": 4.149226569608735, "grad_norm": 0.7580698292767752, "learning_rate": 3.4912779400334996e-07, "loss": 0.0106, "step": 9120 }, { "epoch": 4.149681528662421, "grad_norm": 0.7417138128570855, "learning_rate": 3.487636217938567e-07, "loss": 0.0059, "step": 9121 }, { "epoch": 4.1501364877161055, "grad_norm": 0.33418429227702895, "learning_rate": 3.4839962537368514e-07, "loss": 0.0009, "step": 9122 }, { "epoch": 4.15059144676979, "grad_norm": 0.5256218822620315, "learning_rate": 3.480358047725804e-07, "loss": 0.0027, "step": 9123 }, { "epoch": 4.151046405823476, "grad_norm": 0.705808753103024, "learning_rate": 3.476721600202715e-07, "loss": 0.0069, "step": 9124 }, { "epoch": 4.151501364877161, "grad_norm": 0.5233609641960908, "learning_rate": 3.4730869114647404e-07, "loss": 0.0116, "step": 9125 }, { "epoch": 4.151956323930846, "grad_norm": 0.5156333047814919, "learning_rate": 3.4694539818088876e-07, "loss": 0.0114, "step": 9126 }, { "epoch": 4.152411282984532, "grad_norm": 0.32831397732083184, "learning_rate": 3.4658228115320157e-07, "loss": 0.0019, "step": 9127 }, { "epoch": 4.1528662420382165, "grad_norm": 0.32166299116057895, "learning_rate": 3.4621934009308604e-07, "loss": 0.0025, "step": 9128 }, { "epoch": 4.153321201091901, "grad_norm": 0.6764307953286968, "learning_rate": 3.458565750301998e-07, "loss": 0.0041, "step": 9129 }, { "epoch": 4.153776160145587, "grad_norm": 0.8591244890015549, "learning_rate": 3.4549398599418667e-07, "loss": 0.021, "step": 9130 }, { "epoch": 4.154231119199272, "grad_norm": 0.12706261249567183, "learning_rate": 3.4513157301467507e-07, "loss": 0.0004, "step": 9131 }, { "epoch": 4.154686078252957, "grad_norm": 0.7531784809712111, "learning_rate": 3.447693361212795e-07, "loss": 0.0046, "step": 9132 }, { "epoch": 4.155141037306643, "grad_norm": 0.7995179243118439, "learning_rate": 3.4440727534360147e-07, "loss": 0.0039, "step": 9133 }, { "epoch": 4.1555959963603275, "grad_norm": 0.7894225553926383, "learning_rate": 3.440453907112262e-07, "loss": 0.0054, "step": 9134 }, { "epoch": 4.156050955414012, "grad_norm": 0.7555309916358668, "learning_rate": 3.4368368225372484e-07, "loss": 0.0278, "step": 9135 }, { "epoch": 4.156505914467698, "grad_norm": 0.46127595513110214, "learning_rate": 3.4332215000065587e-07, "loss": 0.0062, "step": 9136 }, { "epoch": 4.156960873521383, "grad_norm": 0.17239535256869248, "learning_rate": 3.4296079398156074e-07, "loss": 0.0009, "step": 9137 }, { "epoch": 4.157415832575068, "grad_norm": 0.4821718715865408, "learning_rate": 3.4259961422596884e-07, "loss": 0.0025, "step": 9138 }, { "epoch": 4.157870791628754, "grad_norm": 0.3109847954797973, "learning_rate": 3.4223861076339375e-07, "loss": 0.0018, "step": 9139 }, { "epoch": 4.1583257506824385, "grad_norm": 0.4966298017547602, "learning_rate": 3.4187778362333503e-07, "loss": 0.004, "step": 9140 }, { "epoch": 4.158780709736123, "grad_norm": 0.5149943590712679, "learning_rate": 3.415171328352773e-07, "loss": 0.0043, "step": 9141 }, { "epoch": 4.159235668789809, "grad_norm": 0.3533763737329364, "learning_rate": 3.411566584286918e-07, "loss": 0.0034, "step": 9142 }, { "epoch": 4.159690627843494, "grad_norm": 0.9673248557871915, "learning_rate": 3.4079636043303555e-07, "loss": 0.0151, "step": 9143 }, { "epoch": 4.160145586897179, "grad_norm": 0.4628030771013315, "learning_rate": 3.404362388777499e-07, "loss": 0.0027, "step": 9144 }, { "epoch": 4.160600545950865, "grad_norm": 0.8167371599012814, "learning_rate": 3.400762937922622e-07, "loss": 0.011, "step": 9145 }, { "epoch": 4.1610555050045495, "grad_norm": 0.5852134645894841, "learning_rate": 3.397165252059853e-07, "loss": 0.005, "step": 9146 }, { "epoch": 4.161510464058235, "grad_norm": 0.601904282175205, "learning_rate": 3.3935693314831847e-07, "loss": 0.0073, "step": 9147 }, { "epoch": 4.16196542311192, "grad_norm": 0.3185747022315862, "learning_rate": 3.3899751764864597e-07, "loss": 0.0018, "step": 9148 }, { "epoch": 4.162420382165605, "grad_norm": 0.14060727250208133, "learning_rate": 3.386382787363365e-07, "loss": 0.0007, "step": 9149 }, { "epoch": 4.162875341219291, "grad_norm": 0.6077250790708328, "learning_rate": 3.38279216440747e-07, "loss": 0.0047, "step": 9150 }, { "epoch": 4.163330300272976, "grad_norm": 1.0991153759399597, "learning_rate": 3.379203307912171e-07, "loss": 0.0039, "step": 9151 }, { "epoch": 4.16378525932666, "grad_norm": 1.100851658474028, "learning_rate": 3.3756162181707436e-07, "loss": 0.016, "step": 9152 }, { "epoch": 4.164240218380346, "grad_norm": 0.37417532686764376, "learning_rate": 3.3720308954763053e-07, "loss": 0.0026, "step": 9153 }, { "epoch": 4.164695177434031, "grad_norm": 0.7683985644999398, "learning_rate": 3.3684473401218304e-07, "loss": 0.0121, "step": 9154 }, { "epoch": 4.165150136487716, "grad_norm": 0.8825947512629626, "learning_rate": 3.364865552400146e-07, "loss": 0.0147, "step": 9155 }, { "epoch": 4.165605095541402, "grad_norm": 0.825802195946474, "learning_rate": 3.3612855326039447e-07, "loss": 0.0113, "step": 9156 }, { "epoch": 4.1660600545950865, "grad_norm": 0.9404423015268455, "learning_rate": 3.3577072810257764e-07, "loss": 0.005, "step": 9157 }, { "epoch": 4.166515013648771, "grad_norm": 0.4137021302192791, "learning_rate": 3.3541307979580356e-07, "loss": 0.0038, "step": 9158 }, { "epoch": 4.166969972702457, "grad_norm": 0.4746060234586639, "learning_rate": 3.3505560836929714e-07, "loss": 0.013, "step": 9159 }, { "epoch": 4.167424931756142, "grad_norm": 0.7667958188595829, "learning_rate": 3.346983138522697e-07, "loss": 0.0049, "step": 9160 }, { "epoch": 4.167879890809827, "grad_norm": 0.7890623833062681, "learning_rate": 3.343411962739168e-07, "loss": 0.0071, "step": 9161 }, { "epoch": 4.168334849863513, "grad_norm": 0.3493103390173607, "learning_rate": 3.3398425566342236e-07, "loss": 0.0012, "step": 9162 }, { "epoch": 4.1687898089171975, "grad_norm": 0.41950568901806284, "learning_rate": 3.336274920499519e-07, "loss": 0.0024, "step": 9163 }, { "epoch": 4.169244767970882, "grad_norm": 0.26208487435573263, "learning_rate": 3.332709054626604e-07, "loss": 0.0019, "step": 9164 }, { "epoch": 4.169699727024568, "grad_norm": 0.048636721831574366, "learning_rate": 3.329144959306854e-07, "loss": 0.0002, "step": 9165 }, { "epoch": 4.170154686078253, "grad_norm": 0.4043332475149659, "learning_rate": 3.325582634831509e-07, "loss": 0.0031, "step": 9166 }, { "epoch": 4.170609645131938, "grad_norm": 0.37643600112398223, "learning_rate": 3.3220220814916773e-07, "loss": 0.002, "step": 9167 }, { "epoch": 4.171064604185624, "grad_norm": 0.6325758566489649, "learning_rate": 3.3184632995783007e-07, "loss": 0.0122, "step": 9168 }, { "epoch": 4.1715195632393085, "grad_norm": 1.8309321545138375, "learning_rate": 3.3149062893821945e-07, "loss": 0.0038, "step": 9169 }, { "epoch": 4.171974522292993, "grad_norm": 1.3873853145942066, "learning_rate": 3.311351051194009e-07, "loss": 0.006, "step": 9170 }, { "epoch": 4.172429481346679, "grad_norm": 0.6658463469917438, "learning_rate": 3.3077975853042704e-07, "loss": 0.0086, "step": 9171 }, { "epoch": 4.172884440400364, "grad_norm": 0.8495926899868449, "learning_rate": 3.3042458920033577e-07, "loss": 0.0053, "step": 9172 }, { "epoch": 4.173339399454049, "grad_norm": 0.3892193788545552, "learning_rate": 3.300695971581494e-07, "loss": 0.0021, "step": 9173 }, { "epoch": 4.173794358507735, "grad_norm": 0.440788225441829, "learning_rate": 3.297147824328764e-07, "loss": 0.0025, "step": 9174 }, { "epoch": 4.1742493175614195, "grad_norm": 0.8640796492938213, "learning_rate": 3.293601450535097e-07, "loss": 0.0054, "step": 9175 }, { "epoch": 4.174704276615104, "grad_norm": 0.578390269267532, "learning_rate": 3.2900568504903e-07, "loss": 0.0021, "step": 9176 }, { "epoch": 4.17515923566879, "grad_norm": 0.10646503079017486, "learning_rate": 3.286514024484011e-07, "loss": 0.0005, "step": 9177 }, { "epoch": 4.175614194722475, "grad_norm": 1.1348697023296268, "learning_rate": 3.2829729728057425e-07, "loss": 0.0055, "step": 9178 }, { "epoch": 4.17606915377616, "grad_norm": 0.9367283692537013, "learning_rate": 3.2794336957448517e-07, "loss": 0.0076, "step": 9179 }, { "epoch": 4.176524112829846, "grad_norm": 0.9968985080826686, "learning_rate": 3.2758961935905444e-07, "loss": 0.0255, "step": 9180 }, { "epoch": 4.1769790718835305, "grad_norm": 0.6562423884061814, "learning_rate": 3.272360466631899e-07, "loss": 0.008, "step": 9181 }, { "epoch": 4.177434030937215, "grad_norm": 0.9557158913987218, "learning_rate": 3.2688265151578357e-07, "loss": 0.0124, "step": 9182 }, { "epoch": 4.177888989990901, "grad_norm": 0.13209665792826591, "learning_rate": 3.2652943394571314e-07, "loss": 0.0004, "step": 9183 }, { "epoch": 4.178343949044586, "grad_norm": 0.5002488493069122, "learning_rate": 3.2617639398184186e-07, "loss": 0.0077, "step": 9184 }, { "epoch": 4.178798908098271, "grad_norm": 0.29631076512324445, "learning_rate": 3.258235316530184e-07, "loss": 0.002, "step": 9185 }, { "epoch": 4.179253867151957, "grad_norm": 0.575900249817844, "learning_rate": 3.2547084698807824e-07, "loss": 0.0109, "step": 9186 }, { "epoch": 4.179708826205641, "grad_norm": 0.822695623803017, "learning_rate": 3.2511834001584005e-07, "loss": 0.0117, "step": 9187 }, { "epoch": 4.180163785259326, "grad_norm": 0.4325845466217385, "learning_rate": 3.247660107651096e-07, "loss": 0.0081, "step": 9188 }, { "epoch": 4.180618744313012, "grad_norm": 0.15155563711184322, "learning_rate": 3.24413859264677e-07, "loss": 0.0007, "step": 9189 }, { "epoch": 4.181073703366697, "grad_norm": 0.6087919426404469, "learning_rate": 3.2406188554331945e-07, "loss": 0.005, "step": 9190 }, { "epoch": 4.181528662420382, "grad_norm": 1.2819191671335908, "learning_rate": 3.237100896297979e-07, "loss": 0.0118, "step": 9191 }, { "epoch": 4.1819836214740675, "grad_norm": 0.5506147089608737, "learning_rate": 3.233584715528601e-07, "loss": 0.0098, "step": 9192 }, { "epoch": 4.182438580527752, "grad_norm": 0.12674098041639834, "learning_rate": 3.2300703134123814e-07, "loss": 0.0006, "step": 9193 }, { "epoch": 4.182893539581437, "grad_norm": 0.28894180950072545, "learning_rate": 3.2265576902365007e-07, "loss": 0.0013, "step": 9194 }, { "epoch": 4.183348498635123, "grad_norm": 0.4820808756977568, "learning_rate": 3.223046846288003e-07, "loss": 0.0025, "step": 9195 }, { "epoch": 4.183803457688808, "grad_norm": 0.49421759761372047, "learning_rate": 3.219537781853774e-07, "loss": 0.0065, "step": 9196 }, { "epoch": 4.184258416742493, "grad_norm": 0.6244261055568395, "learning_rate": 3.216030497220557e-07, "loss": 0.0046, "step": 9197 }, { "epoch": 4.1847133757961785, "grad_norm": 0.36816611502298635, "learning_rate": 3.2125249926749455e-07, "loss": 0.0017, "step": 9198 }, { "epoch": 4.185168334849863, "grad_norm": 0.4492359351002564, "learning_rate": 3.2090212685034067e-07, "loss": 0.0046, "step": 9199 }, { "epoch": 4.185623293903548, "grad_norm": 1.5282372055321918, "learning_rate": 3.205519324992237e-07, "loss": 0.007, "step": 9200 }, { "epoch": 4.186078252957234, "grad_norm": 0.48765287297890186, "learning_rate": 3.202019162427611e-07, "loss": 0.005, "step": 9201 }, { "epoch": 4.186533212010919, "grad_norm": 0.51370876423683, "learning_rate": 3.1985207810955404e-07, "loss": 0.0016, "step": 9202 }, { "epoch": 4.186988171064604, "grad_norm": 0.5239834484051922, "learning_rate": 3.1950241812818944e-07, "loss": 0.0097, "step": 9203 }, { "epoch": 4.1874431301182895, "grad_norm": 1.7066421498941662, "learning_rate": 3.1915293632723996e-07, "loss": 0.0065, "step": 9204 }, { "epoch": 4.187898089171974, "grad_norm": 1.3584720336252756, "learning_rate": 3.188036327352637e-07, "loss": 0.0174, "step": 9205 }, { "epoch": 4.188353048225659, "grad_norm": 0.49935863023600996, "learning_rate": 3.1845450738080514e-07, "loss": 0.0024, "step": 9206 }, { "epoch": 4.188808007279345, "grad_norm": 0.7735522963800355, "learning_rate": 3.1810556029239214e-07, "loss": 0.0211, "step": 9207 }, { "epoch": 4.18926296633303, "grad_norm": 0.7718517436546878, "learning_rate": 3.177567914985397e-07, "loss": 0.0047, "step": 9208 }, { "epoch": 4.189717925386716, "grad_norm": 0.2969902840146813, "learning_rate": 3.174082010277468e-07, "loss": 0.0035, "step": 9209 }, { "epoch": 4.1901728844404005, "grad_norm": 0.7970724394036472, "learning_rate": 3.1705978890849946e-07, "loss": 0.0213, "step": 9210 }, { "epoch": 4.190627843494085, "grad_norm": 0.42238115473253535, "learning_rate": 3.1671155516926843e-07, "loss": 0.0031, "step": 9211 }, { "epoch": 4.191082802547771, "grad_norm": 0.4100170540832191, "learning_rate": 3.163634998385087e-07, "loss": 0.005, "step": 9212 }, { "epoch": 4.191537761601456, "grad_norm": 0.40905590557916366, "learning_rate": 3.160156229446631e-07, "loss": 0.0049, "step": 9213 }, { "epoch": 4.191992720655141, "grad_norm": 0.13364133815229404, "learning_rate": 3.156679245161576e-07, "loss": 0.0005, "step": 9214 }, { "epoch": 4.192447679708827, "grad_norm": 0.6038539281885951, "learning_rate": 3.153204045814054e-07, "loss": 0.0118, "step": 9215 }, { "epoch": 4.1929026387625115, "grad_norm": 0.4860083595599833, "learning_rate": 3.149730631688039e-07, "loss": 0.0054, "step": 9216 }, { "epoch": 4.193357597816196, "grad_norm": 0.15028962639182356, "learning_rate": 3.1462590030673616e-07, "loss": 0.0008, "step": 9217 }, { "epoch": 4.193812556869882, "grad_norm": 0.45118867555435277, "learning_rate": 3.1427891602357014e-07, "loss": 0.005, "step": 9218 }, { "epoch": 4.194267515923567, "grad_norm": 1.5428513642741064, "learning_rate": 3.139321103476606e-07, "loss": 0.0146, "step": 9219 }, { "epoch": 4.194722474977252, "grad_norm": 0.8512049451708603, "learning_rate": 3.1358548330734735e-07, "loss": 0.0239, "step": 9220 }, { "epoch": 4.195177434030938, "grad_norm": 0.1974275955798926, "learning_rate": 3.132390349309547e-07, "loss": 0.0009, "step": 9221 }, { "epoch": 4.195632393084622, "grad_norm": 0.40171190235558385, "learning_rate": 3.1289276524679254e-07, "loss": 0.002, "step": 9222 }, { "epoch": 4.196087352138307, "grad_norm": 0.6583545316033516, "learning_rate": 3.125466742831562e-07, "loss": 0.0129, "step": 9223 }, { "epoch": 4.196542311191993, "grad_norm": 1.1204290830480184, "learning_rate": 3.122007620683279e-07, "loss": 0.0171, "step": 9224 }, { "epoch": 4.196997270245678, "grad_norm": 0.6461649285335918, "learning_rate": 3.1185502863057326e-07, "loss": 0.004, "step": 9225 }, { "epoch": 4.197452229299363, "grad_norm": 0.7603660895525879, "learning_rate": 3.1150947399814363e-07, "loss": 0.0088, "step": 9226 }, { "epoch": 4.1979071883530485, "grad_norm": 0.7918075856478315, "learning_rate": 3.1116409819927697e-07, "loss": 0.0042, "step": 9227 }, { "epoch": 4.198362147406733, "grad_norm": 0.7537480588662263, "learning_rate": 3.108189012621951e-07, "loss": 0.0063, "step": 9228 }, { "epoch": 4.198817106460418, "grad_norm": 0.843050478945658, "learning_rate": 3.1047388321510693e-07, "loss": 0.0101, "step": 9229 }, { "epoch": 4.199272065514104, "grad_norm": 0.7106879941760049, "learning_rate": 3.1012904408620536e-07, "loss": 0.017, "step": 9230 }, { "epoch": 4.199727024567789, "grad_norm": 0.44514843960976685, "learning_rate": 3.097843839036688e-07, "loss": 0.0073, "step": 9231 }, { "epoch": 4.200181983621474, "grad_norm": 0.8357002355218163, "learning_rate": 3.094399026956613e-07, "loss": 0.0063, "step": 9232 }, { "epoch": 4.2006369426751595, "grad_norm": 0.588356132926905, "learning_rate": 3.0909560049033145e-07, "loss": 0.0101, "step": 9233 }, { "epoch": 4.201091901728844, "grad_norm": 0.3482790872753342, "learning_rate": 3.0875147731581625e-07, "loss": 0.0023, "step": 9234 }, { "epoch": 4.201546860782529, "grad_norm": 0.2677699330021114, "learning_rate": 3.084075332002348e-07, "loss": 0.0018, "step": 9235 }, { "epoch": 4.202001819836215, "grad_norm": 0.23156310518063766, "learning_rate": 3.080637681716925e-07, "loss": 0.0014, "step": 9236 }, { "epoch": 4.2024567788899, "grad_norm": 0.2543589191615957, "learning_rate": 3.077201822582804e-07, "loss": 0.0014, "step": 9237 }, { "epoch": 4.202911737943585, "grad_norm": 0.24208807848918842, "learning_rate": 3.0737677548807435e-07, "loss": 0.0002, "step": 9238 }, { "epoch": 4.2033666969972705, "grad_norm": 0.6474198865495667, "learning_rate": 3.0703354788913675e-07, "loss": 0.0046, "step": 9239 }, { "epoch": 4.203821656050955, "grad_norm": 0.5575563448138307, "learning_rate": 3.066904994895137e-07, "loss": 0.0025, "step": 9240 }, { "epoch": 4.20427661510464, "grad_norm": 0.37846758603305647, "learning_rate": 3.0634763031723885e-07, "loss": 0.0055, "step": 9241 }, { "epoch": 4.204731574158326, "grad_norm": 0.9864653460859852, "learning_rate": 3.0600494040032904e-07, "loss": 0.0088, "step": 9242 }, { "epoch": 4.205186533212011, "grad_norm": 0.7022029533018272, "learning_rate": 3.056624297667871e-07, "loss": 0.0076, "step": 9243 }, { "epoch": 4.205641492265696, "grad_norm": 0.8321432714632097, "learning_rate": 3.0532009844460227e-07, "loss": 0.0105, "step": 9244 }, { "epoch": 4.2060964513193815, "grad_norm": 0.9133185906748561, "learning_rate": 3.0497794646174803e-07, "loss": 0.0073, "step": 9245 }, { "epoch": 4.206551410373066, "grad_norm": 1.4742502038760457, "learning_rate": 3.046359738461832e-07, "loss": 0.0023, "step": 9246 }, { "epoch": 4.207006369426751, "grad_norm": 0.8278066332021435, "learning_rate": 3.0429418062585205e-07, "loss": 0.006, "step": 9247 }, { "epoch": 4.207461328480437, "grad_norm": 0.3007093904970632, "learning_rate": 3.039525668286847e-07, "loss": 0.0013, "step": 9248 }, { "epoch": 4.207916287534122, "grad_norm": 0.7596174045473961, "learning_rate": 3.036111324825969e-07, "loss": 0.0237, "step": 9249 }, { "epoch": 4.208371246587807, "grad_norm": 0.7184613509126948, "learning_rate": 3.0326987761548825e-07, "loss": 0.0131, "step": 9250 }, { "epoch": 4.2088262056414925, "grad_norm": 0.5900368000138793, "learning_rate": 3.0292880225524514e-07, "loss": 0.0015, "step": 9251 }, { "epoch": 4.209281164695177, "grad_norm": 0.1964734848808745, "learning_rate": 3.0258790642973797e-07, "loss": 0.001, "step": 9252 }, { "epoch": 4.209736123748862, "grad_norm": 0.747708838192542, "learning_rate": 3.022471901668239e-07, "loss": 0.0072, "step": 9253 }, { "epoch": 4.210191082802548, "grad_norm": 0.42849045064524416, "learning_rate": 3.019066534943443e-07, "loss": 0.0023, "step": 9254 }, { "epoch": 4.210646041856233, "grad_norm": 0.3403091483222645, "learning_rate": 3.015662964401267e-07, "loss": 0.0012, "step": 9255 }, { "epoch": 4.211101000909918, "grad_norm": 0.3996126054396111, "learning_rate": 3.0122611903198344e-07, "loss": 0.0023, "step": 9256 }, { "epoch": 4.211555959963603, "grad_norm": 0.9389410951991405, "learning_rate": 3.0088612129771154e-07, "loss": 0.0278, "step": 9257 }, { "epoch": 4.212010919017288, "grad_norm": 0.542367395456533, "learning_rate": 3.0054630326509544e-07, "loss": 0.0036, "step": 9258 }, { "epoch": 4.212465878070974, "grad_norm": 0.2148959665663888, "learning_rate": 3.002066649619026e-07, "loss": 0.0028, "step": 9259 }, { "epoch": 4.212920837124659, "grad_norm": 0.13471196170910954, "learning_rate": 2.9986720641588696e-07, "loss": 0.0006, "step": 9260 }, { "epoch": 4.213375796178344, "grad_norm": 0.3569205756555735, "learning_rate": 2.9952792765478715e-07, "loss": 0.0011, "step": 9261 }, { "epoch": 4.2138307552320295, "grad_norm": 0.6763691727777869, "learning_rate": 2.991888287063277e-07, "loss": 0.007, "step": 9262 }, { "epoch": 4.214285714285714, "grad_norm": 1.3838222796128776, "learning_rate": 2.988499095982189e-07, "loss": 0.0099, "step": 9263 }, { "epoch": 4.214740673339399, "grad_norm": 0.7692394937005496, "learning_rate": 2.98511170358155e-07, "loss": 0.0056, "step": 9264 }, { "epoch": 4.215195632393085, "grad_norm": 0.2397692686734173, "learning_rate": 2.9817261101381667e-07, "loss": 0.0013, "step": 9265 }, { "epoch": 4.21565059144677, "grad_norm": 0.5001497018338963, "learning_rate": 2.9783423159286923e-07, "loss": 0.0031, "step": 9266 }, { "epoch": 4.216105550500455, "grad_norm": 0.9109543918883428, "learning_rate": 2.974960321229628e-07, "loss": 0.0099, "step": 9267 }, { "epoch": 4.2165605095541405, "grad_norm": 1.4424206041023218, "learning_rate": 2.971580126317344e-07, "loss": 0.009, "step": 9268 }, { "epoch": 4.217015468607825, "grad_norm": 0.3313367280244002, "learning_rate": 2.9682017314680566e-07, "loss": 0.0016, "step": 9269 }, { "epoch": 4.21747042766151, "grad_norm": 0.47213072177922044, "learning_rate": 2.96482513695783e-07, "loss": 0.0015, "step": 9270 }, { "epoch": 4.217925386715196, "grad_norm": 0.8968895003922193, "learning_rate": 2.961450343062583e-07, "loss": 0.0071, "step": 9271 }, { "epoch": 4.218380345768881, "grad_norm": 0.1859250031787471, "learning_rate": 2.9580773500580804e-07, "loss": 0.001, "step": 9272 }, { "epoch": 4.218835304822566, "grad_norm": 0.5260641443143235, "learning_rate": 2.9547061582199666e-07, "loss": 0.0067, "step": 9273 }, { "epoch": 4.2192902638762515, "grad_norm": 1.005837339890751, "learning_rate": 2.9513367678237063e-07, "loss": 0.0245, "step": 9274 }, { "epoch": 4.219745222929936, "grad_norm": 0.39141181745320414, "learning_rate": 2.94796917914463e-07, "loss": 0.0021, "step": 9275 }, { "epoch": 4.220200181983621, "grad_norm": 0.1732359117001167, "learning_rate": 2.9446033924579315e-07, "loss": 0.0005, "step": 9276 }, { "epoch": 4.220655141037307, "grad_norm": 0.46487345170162647, "learning_rate": 2.9412394080386374e-07, "loss": 0.0032, "step": 9277 }, { "epoch": 4.221110100090992, "grad_norm": 0.42629865876242096, "learning_rate": 2.937877226161648e-07, "loss": 0.005, "step": 9278 }, { "epoch": 4.221565059144677, "grad_norm": 0.743844588398593, "learning_rate": 2.934516847101701e-07, "loss": 0.0153, "step": 9279 }, { "epoch": 4.2220200181983625, "grad_norm": 0.6488217183337317, "learning_rate": 2.9311582711333885e-07, "loss": 0.0109, "step": 9280 }, { "epoch": 4.222474977252047, "grad_norm": 0.5977824282414355, "learning_rate": 2.927801498531155e-07, "loss": 0.0092, "step": 9281 }, { "epoch": 4.222929936305732, "grad_norm": 0.5295061198307358, "learning_rate": 2.924446529569308e-07, "loss": 0.003, "step": 9282 }, { "epoch": 4.223384895359418, "grad_norm": 0.5614639390300001, "learning_rate": 2.9210933645220015e-07, "loss": 0.0025, "step": 9283 }, { "epoch": 4.223839854413103, "grad_norm": 0.1911478954337233, "learning_rate": 2.9177420036632376e-07, "loss": 0.0009, "step": 9284 }, { "epoch": 4.224294813466788, "grad_norm": 0.22063670053724865, "learning_rate": 2.9143924472668754e-07, "loss": 0.0012, "step": 9285 }, { "epoch": 4.2247497725204735, "grad_norm": 0.33560346829693666, "learning_rate": 2.9110446956066187e-07, "loss": 0.0011, "step": 9286 }, { "epoch": 4.225204731574158, "grad_norm": 1.2453774676690206, "learning_rate": 2.907698748956042e-07, "loss": 0.0261, "step": 9287 }, { "epoch": 4.225659690627843, "grad_norm": 0.09451237877724696, "learning_rate": 2.9043546075885554e-07, "loss": 0.0003, "step": 9288 }, { "epoch": 4.226114649681529, "grad_norm": 1.0997917246674491, "learning_rate": 2.901012271777423e-07, "loss": 0.0182, "step": 9289 }, { "epoch": 4.226569608735214, "grad_norm": 0.7286325257866918, "learning_rate": 2.897671741795774e-07, "loss": 0.0062, "step": 9290 }, { "epoch": 4.227024567788899, "grad_norm": 0.4464865835542279, "learning_rate": 2.894333017916573e-07, "loss": 0.005, "step": 9291 }, { "epoch": 4.227479526842584, "grad_norm": 0.8584440869073869, "learning_rate": 2.8909961004126546e-07, "loss": 0.0009, "step": 9292 }, { "epoch": 4.227934485896269, "grad_norm": 0.4747277647443826, "learning_rate": 2.88766098955669e-07, "loss": 0.0032, "step": 9293 }, { "epoch": 4.228389444949954, "grad_norm": 0.7066543128180379, "learning_rate": 2.8843276856212106e-07, "loss": 0.0165, "step": 9294 }, { "epoch": 4.22884440400364, "grad_norm": 0.7443695766535057, "learning_rate": 2.880996188878596e-07, "loss": 0.0123, "step": 9295 }, { "epoch": 4.229299363057325, "grad_norm": 0.3742285664316725, "learning_rate": 2.877666499601084e-07, "loss": 0.0025, "step": 9296 }, { "epoch": 4.22975432211101, "grad_norm": 0.1631129320336293, "learning_rate": 2.874338618060765e-07, "loss": 0.0008, "step": 9297 }, { "epoch": 4.230209281164695, "grad_norm": 0.1481496928085184, "learning_rate": 2.8710125445295777e-07, "loss": 0.0002, "step": 9298 }, { "epoch": 4.23066424021838, "grad_norm": 0.2300663656107947, "learning_rate": 2.8676882792793126e-07, "loss": 0.0009, "step": 9299 }, { "epoch": 4.231119199272065, "grad_norm": 0.7377197350357967, "learning_rate": 2.864365822581605e-07, "loss": 0.0067, "step": 9300 }, { "epoch": 4.231574158325751, "grad_norm": 0.15789684023168762, "learning_rate": 2.861045174707966e-07, "loss": 0.0007, "step": 9301 }, { "epoch": 4.232029117379436, "grad_norm": 1.0666344268412935, "learning_rate": 2.857726335929734e-07, "loss": 0.0209, "step": 9302 }, { "epoch": 4.232484076433121, "grad_norm": 0.19159205695488757, "learning_rate": 2.8544093065181105e-07, "loss": 0.0012, "step": 9303 }, { "epoch": 4.232939035486806, "grad_norm": 0.4234984102248458, "learning_rate": 2.851094086744152e-07, "loss": 0.0028, "step": 9304 }, { "epoch": 4.233393994540491, "grad_norm": 1.073062457968768, "learning_rate": 2.8477806768787616e-07, "loss": 0.0259, "step": 9305 }, { "epoch": 4.233848953594176, "grad_norm": 0.7666039793230076, "learning_rate": 2.844469077192691e-07, "loss": 0.0153, "step": 9306 }, { "epoch": 4.234303912647862, "grad_norm": 0.7649719710404642, "learning_rate": 2.8411592879565604e-07, "loss": 0.0017, "step": 9307 }, { "epoch": 4.234758871701547, "grad_norm": 0.3204032973984886, "learning_rate": 2.8378513094408227e-07, "loss": 0.0042, "step": 9308 }, { "epoch": 4.235213830755232, "grad_norm": 0.36851835536170463, "learning_rate": 2.8345451419157924e-07, "loss": 0.0019, "step": 9309 }, { "epoch": 4.235668789808917, "grad_norm": 0.5897412651462877, "learning_rate": 2.831240785651632e-07, "loss": 0.0043, "step": 9310 }, { "epoch": 4.236123748862602, "grad_norm": 1.0079293667522646, "learning_rate": 2.8279382409183596e-07, "loss": 0.0186, "step": 9311 }, { "epoch": 4.236578707916287, "grad_norm": 0.7002074038662439, "learning_rate": 2.824637507985853e-07, "loss": 0.0119, "step": 9312 }, { "epoch": 4.237033666969973, "grad_norm": 0.6535471612218284, "learning_rate": 2.821338587123823e-07, "loss": 0.0083, "step": 9313 }, { "epoch": 4.237488626023658, "grad_norm": 0.30987228722183957, "learning_rate": 2.818041478601849e-07, "loss": 0.0021, "step": 9314 }, { "epoch": 4.237943585077343, "grad_norm": 0.20335579181277977, "learning_rate": 2.8147461826893456e-07, "loss": 0.0011, "step": 9315 }, { "epoch": 4.238398544131028, "grad_norm": 0.3507256247040015, "learning_rate": 2.8114526996556006e-07, "loss": 0.002, "step": 9316 }, { "epoch": 4.238853503184713, "grad_norm": 0.6444175517730933, "learning_rate": 2.8081610297697346e-07, "loss": 0.0034, "step": 9317 }, { "epoch": 4.239308462238399, "grad_norm": 0.6078146397552509, "learning_rate": 2.8048711733007356e-07, "loss": 0.0087, "step": 9318 }, { "epoch": 4.239763421292084, "grad_norm": 1.582229787612465, "learning_rate": 2.8015831305174324e-07, "loss": 0.004, "step": 9319 }, { "epoch": 4.240218380345769, "grad_norm": 0.7319077933646089, "learning_rate": 2.798296901688505e-07, "loss": 0.0024, "step": 9320 }, { "epoch": 4.2406733393994545, "grad_norm": 0.5422943449679541, "learning_rate": 2.795012487082496e-07, "loss": 0.0208, "step": 9321 }, { "epoch": 4.241128298453139, "grad_norm": 2.199608855096781, "learning_rate": 2.791729886967792e-07, "loss": 0.0074, "step": 9322 }, { "epoch": 4.241583257506824, "grad_norm": 0.6235287982290946, "learning_rate": 2.788449101612628e-07, "loss": 0.0089, "step": 9323 }, { "epoch": 4.24203821656051, "grad_norm": 0.4452225371588477, "learning_rate": 2.785170131285092e-07, "loss": 0.0033, "step": 9324 }, { "epoch": 4.242493175614195, "grad_norm": 0.35391706769771925, "learning_rate": 2.7818929762531336e-07, "loss": 0.0028, "step": 9325 }, { "epoch": 4.24294813466788, "grad_norm": 0.7520579768481093, "learning_rate": 2.778617636784547e-07, "loss": 0.0056, "step": 9326 }, { "epoch": 4.243403093721565, "grad_norm": 0.4841003879945973, "learning_rate": 2.77534411314698e-07, "loss": 0.0056, "step": 9327 }, { "epoch": 4.24385805277525, "grad_norm": 0.183387291408306, "learning_rate": 2.7720724056079227e-07, "loss": 0.0006, "step": 9328 }, { "epoch": 4.244313011828935, "grad_norm": 0.6630255720374216, "learning_rate": 2.7688025144347266e-07, "loss": 0.009, "step": 9329 }, { "epoch": 4.244767970882621, "grad_norm": 0.16241825038153293, "learning_rate": 2.765534439894596e-07, "loss": 0.0007, "step": 9330 }, { "epoch": 4.245222929936306, "grad_norm": 0.654478990556985, "learning_rate": 2.7622681822545765e-07, "loss": 0.0139, "step": 9331 }, { "epoch": 4.245677888989991, "grad_norm": 0.5694783490156349, "learning_rate": 2.7590037417815825e-07, "loss": 0.0154, "step": 9332 }, { "epoch": 4.246132848043676, "grad_norm": 0.5907368331618346, "learning_rate": 2.755741118742361e-07, "loss": 0.0103, "step": 9333 }, { "epoch": 4.246587807097361, "grad_norm": 0.6481876237470008, "learning_rate": 2.752480313403519e-07, "loss": 0.0166, "step": 9334 }, { "epoch": 4.247042766151046, "grad_norm": 0.7466656462706402, "learning_rate": 2.74922132603152e-07, "loss": 0.0029, "step": 9335 }, { "epoch": 4.247497725204732, "grad_norm": 0.2708121583567563, "learning_rate": 2.745964156892672e-07, "loss": 0.0016, "step": 9336 }, { "epoch": 4.247952684258417, "grad_norm": 0.17503469441525363, "learning_rate": 2.7427088062531333e-07, "loss": 0.0012, "step": 9337 }, { "epoch": 4.248407643312102, "grad_norm": 0.41233339428657656, "learning_rate": 2.739455274378913e-07, "loss": 0.0074, "step": 9338 }, { "epoch": 4.248862602365787, "grad_norm": 1.7214992859021458, "learning_rate": 2.73620356153588e-07, "loss": 0.0129, "step": 9339 }, { "epoch": 4.249317561419472, "grad_norm": 4.1797259602025525, "learning_rate": 2.732953667989757e-07, "loss": 0.0236, "step": 9340 }, { "epoch": 4.249772520473157, "grad_norm": 0.7135018817110257, "learning_rate": 2.729705594006099e-07, "loss": 0.0158, "step": 9341 }, { "epoch": 4.250227479526843, "grad_norm": 0.3352585965945271, "learning_rate": 2.726459339850332e-07, "loss": 0.0086, "step": 9342 }, { "epoch": 4.250682438580528, "grad_norm": 0.5417026621597827, "learning_rate": 2.723214905787719e-07, "loss": 0.0098, "step": 9343 }, { "epoch": 4.251137397634213, "grad_norm": 0.1639399023823485, "learning_rate": 2.719972292083378e-07, "loss": 0.0004, "step": 9344 }, { "epoch": 4.251592356687898, "grad_norm": 1.1499363576920996, "learning_rate": 2.716731499002287e-07, "loss": 0.0055, "step": 9345 }, { "epoch": 4.252047315741583, "grad_norm": 0.15884641021153562, "learning_rate": 2.7134925268092724e-07, "loss": 0.0007, "step": 9346 }, { "epoch": 4.252502274795268, "grad_norm": 0.9610687034633633, "learning_rate": 2.7102553757690024e-07, "loss": 0.0162, "step": 9347 }, { "epoch": 4.252957233848954, "grad_norm": 1.1269208748140875, "learning_rate": 2.707020046146003e-07, "loss": 0.0097, "step": 9348 }, { "epoch": 4.253412192902639, "grad_norm": 0.4936141427973077, "learning_rate": 2.7037865382046475e-07, "loss": 0.0009, "step": 9349 }, { "epoch": 4.253867151956324, "grad_norm": 0.26293170538976524, "learning_rate": 2.7005548522091694e-07, "loss": 0.0006, "step": 9350 }, { "epoch": 4.254322111010009, "grad_norm": 1.073025753771524, "learning_rate": 2.69732498842365e-07, "loss": 0.0077, "step": 9351 }, { "epoch": 4.254777070063694, "grad_norm": 0.7719713855022057, "learning_rate": 2.694096947112007e-07, "loss": 0.0078, "step": 9352 }, { "epoch": 4.255232029117379, "grad_norm": 0.5942831790498595, "learning_rate": 2.690870728538034e-07, "loss": 0.0119, "step": 9353 }, { "epoch": 4.255686988171065, "grad_norm": 0.7998249204894701, "learning_rate": 2.687646332965352e-07, "loss": 0.0121, "step": 9354 }, { "epoch": 4.25614194722475, "grad_norm": 0.2975966688918182, "learning_rate": 2.684423760657456e-07, "loss": 0.0024, "step": 9355 }, { "epoch": 4.256596906278435, "grad_norm": 1.01127585868472, "learning_rate": 2.6812030118776754e-07, "loss": 0.0195, "step": 9356 }, { "epoch": 4.25705186533212, "grad_norm": 1.9106885830599236, "learning_rate": 2.677984086889193e-07, "loss": 0.0151, "step": 9357 }, { "epoch": 4.257506824385805, "grad_norm": 0.5373883095878089, "learning_rate": 2.674766985955041e-07, "loss": 0.0022, "step": 9358 }, { "epoch": 4.25796178343949, "grad_norm": 0.6483983100678385, "learning_rate": 2.6715517093381075e-07, "loss": 0.0101, "step": 9359 }, { "epoch": 4.258416742493176, "grad_norm": 0.9819601720391563, "learning_rate": 2.6683382573011426e-07, "loss": 0.0153, "step": 9360 }, { "epoch": 4.258871701546861, "grad_norm": 0.5592363784756628, "learning_rate": 2.665126630106726e-07, "loss": 0.0052, "step": 9361 }, { "epoch": 4.2593266606005455, "grad_norm": 0.5111946205337408, "learning_rate": 2.661916828017297e-07, "loss": 0.0011, "step": 9362 }, { "epoch": 4.259781619654231, "grad_norm": 0.4379673384825, "learning_rate": 2.6587088512951416e-07, "loss": 0.0033, "step": 9363 }, { "epoch": 4.260236578707916, "grad_norm": 0.7764604315216734, "learning_rate": 2.655502700202414e-07, "loss": 0.0165, "step": 9364 }, { "epoch": 4.260691537761602, "grad_norm": 0.9797946477331295, "learning_rate": 2.6522983750010974e-07, "loss": 0.0106, "step": 9365 }, { "epoch": 4.261146496815287, "grad_norm": 0.9022702623408249, "learning_rate": 2.6490958759530285e-07, "loss": 0.0078, "step": 9366 }, { "epoch": 4.261601455868972, "grad_norm": 0.24770922018188882, "learning_rate": 2.645895203319918e-07, "loss": 0.0013, "step": 9367 }, { "epoch": 4.262056414922657, "grad_norm": 0.7827326436307885, "learning_rate": 2.6426963573632947e-07, "loss": 0.0097, "step": 9368 }, { "epoch": 4.262511373976342, "grad_norm": 0.673061713448335, "learning_rate": 2.6394993383445647e-07, "loss": 0.0029, "step": 9369 }, { "epoch": 4.262966333030027, "grad_norm": 1.1525652100192096, "learning_rate": 2.6363041465249707e-07, "loss": 0.0131, "step": 9370 }, { "epoch": 4.263421292083713, "grad_norm": 0.4912369496361605, "learning_rate": 2.633110782165607e-07, "loss": 0.0073, "step": 9371 }, { "epoch": 4.263876251137398, "grad_norm": 0.7104054822040058, "learning_rate": 2.629919245527418e-07, "loss": 0.0086, "step": 9372 }, { "epoch": 4.264331210191083, "grad_norm": 0.49380585620703693, "learning_rate": 2.6267295368712057e-07, "loss": 0.0081, "step": 9373 }, { "epoch": 4.264786169244768, "grad_norm": 0.1785938493792216, "learning_rate": 2.6235416564576234e-07, "loss": 0.001, "step": 9374 }, { "epoch": 4.265241128298453, "grad_norm": 0.44076484992603726, "learning_rate": 2.6203556045471674e-07, "loss": 0.0026, "step": 9375 }, { "epoch": 4.265696087352138, "grad_norm": 1.1558091050367687, "learning_rate": 2.6171713814001824e-07, "loss": 0.0132, "step": 9376 }, { "epoch": 4.266151046405824, "grad_norm": 0.2803262730893479, "learning_rate": 2.6139889872768746e-07, "loss": 0.0015, "step": 9377 }, { "epoch": 4.266606005459509, "grad_norm": 0.2765087540714806, "learning_rate": 2.6108084224372885e-07, "loss": 0.0009, "step": 9378 }, { "epoch": 4.267060964513194, "grad_norm": 0.294568368046785, "learning_rate": 2.607629687141333e-07, "loss": 0.0017, "step": 9379 }, { "epoch": 4.267515923566879, "grad_norm": 0.46158196960877845, "learning_rate": 2.60445278164875e-07, "loss": 0.0011, "step": 9380 }, { "epoch": 4.267970882620564, "grad_norm": 1.0057063935798778, "learning_rate": 2.6012777062191546e-07, "loss": 0.0063, "step": 9381 }, { "epoch": 4.268425841674249, "grad_norm": 0.5667395759372718, "learning_rate": 2.598104461111994e-07, "loss": 0.0037, "step": 9382 }, { "epoch": 4.268880800727935, "grad_norm": 0.35995752070681397, "learning_rate": 2.5949330465865676e-07, "loss": 0.0052, "step": 9383 }, { "epoch": 4.26933575978162, "grad_norm": 0.4036853522140548, "learning_rate": 2.5917634629020334e-07, "loss": 0.0104, "step": 9384 }, { "epoch": 4.269790718835305, "grad_norm": 0.43557028661469, "learning_rate": 2.588595710317396e-07, "loss": 0.0016, "step": 9385 }, { "epoch": 4.27024567788899, "grad_norm": 0.45564069922484357, "learning_rate": 2.5854297890915094e-07, "loss": 0.0035, "step": 9386 }, { "epoch": 4.270700636942675, "grad_norm": 0.2849016938922637, "learning_rate": 2.582265699483069e-07, "loss": 0.0025, "step": 9387 }, { "epoch": 4.27115559599636, "grad_norm": 0.66255741637744, "learning_rate": 2.579103441750641e-07, "loss": 0.0082, "step": 9388 }, { "epoch": 4.271610555050046, "grad_norm": 0.14480158965002918, "learning_rate": 2.5759430161526324e-07, "loss": 0.0006, "step": 9389 }, { "epoch": 4.272065514103731, "grad_norm": 0.12993714858424113, "learning_rate": 2.5727844229472914e-07, "loss": 0.0004, "step": 9390 }, { "epoch": 4.272520473157416, "grad_norm": 0.7165598487293803, "learning_rate": 2.569627662392729e-07, "loss": 0.0097, "step": 9391 }, { "epoch": 4.272975432211101, "grad_norm": 0.555633769112161, "learning_rate": 2.566472734746894e-07, "loss": 0.0095, "step": 9392 }, { "epoch": 4.273430391264786, "grad_norm": 0.9772983730447893, "learning_rate": 2.563319640267606e-07, "loss": 0.0296, "step": 9393 }, { "epoch": 4.273885350318471, "grad_norm": 0.2716625626273673, "learning_rate": 2.560168379212505e-07, "loss": 0.0017, "step": 9394 }, { "epoch": 4.274340309372157, "grad_norm": 0.6646661651994485, "learning_rate": 2.557018951839113e-07, "loss": 0.0044, "step": 9395 }, { "epoch": 4.274795268425842, "grad_norm": 0.8271593982953452, "learning_rate": 2.553871358404783e-07, "loss": 0.0082, "step": 9396 }, { "epoch": 4.2752502274795265, "grad_norm": 0.7014008680166035, "learning_rate": 2.5507255991667116e-07, "loss": 0.0083, "step": 9397 }, { "epoch": 4.275705186533212, "grad_norm": 0.9458312195492896, "learning_rate": 2.5475816743819715e-07, "loss": 0.0141, "step": 9398 }, { "epoch": 4.276160145586897, "grad_norm": 0.43357110253633674, "learning_rate": 2.544439584307459e-07, "loss": 0.006, "step": 9399 }, { "epoch": 4.276615104640582, "grad_norm": 0.7039564125597411, "learning_rate": 2.5412993291999396e-07, "loss": 0.0069, "step": 9400 }, { "epoch": 4.277070063694268, "grad_norm": 0.4284002440862766, "learning_rate": 2.538160909316009e-07, "loss": 0.0023, "step": 9401 }, { "epoch": 4.277525022747953, "grad_norm": 0.46249574155991147, "learning_rate": 2.5350243249121333e-07, "loss": 0.0019, "step": 9402 }, { "epoch": 4.2779799818016375, "grad_norm": 0.692513801155582, "learning_rate": 2.5318895762446226e-07, "loss": 0.0086, "step": 9403 }, { "epoch": 4.278434940855323, "grad_norm": 2.8872848816364605, "learning_rate": 2.5287566635696316e-07, "loss": 0.0039, "step": 9404 }, { "epoch": 4.278889899909008, "grad_norm": 0.31750986437139384, "learning_rate": 2.5256255871431654e-07, "loss": 0.0022, "step": 9405 }, { "epoch": 4.279344858962693, "grad_norm": 1.2647220779564645, "learning_rate": 2.522496347221079e-07, "loss": 0.012, "step": 9406 }, { "epoch": 4.279799818016379, "grad_norm": 1.3625285349391865, "learning_rate": 2.519368944059089e-07, "loss": 0.0152, "step": 9407 }, { "epoch": 4.280254777070064, "grad_norm": 1.4301817693748855, "learning_rate": 2.516243377912742e-07, "loss": 0.0088, "step": 9408 }, { "epoch": 4.2807097361237485, "grad_norm": 0.29671448733523, "learning_rate": 2.513119649037454e-07, "loss": 0.001, "step": 9409 }, { "epoch": 4.281164695177434, "grad_norm": 0.7831524483152399, "learning_rate": 2.5099977576884814e-07, "loss": 0.0102, "step": 9410 }, { "epoch": 4.281619654231119, "grad_norm": 0.45733967884657967, "learning_rate": 2.506877704120925e-07, "loss": 0.004, "step": 9411 }, { "epoch": 4.282074613284804, "grad_norm": 0.5548276632915439, "learning_rate": 2.503759488589741e-07, "loss": 0.0109, "step": 9412 }, { "epoch": 4.28252957233849, "grad_norm": 0.1485882452268798, "learning_rate": 2.5006431113497457e-07, "loss": 0.0007, "step": 9413 }, { "epoch": 4.282984531392175, "grad_norm": 0.395211979802539, "learning_rate": 2.497528572655586e-07, "loss": 0.0034, "step": 9414 }, { "epoch": 4.2834394904458595, "grad_norm": 0.5984301894643417, "learning_rate": 2.4944158727617687e-07, "loss": 0.0045, "step": 9415 }, { "epoch": 4.283894449499545, "grad_norm": 0.12518244308112877, "learning_rate": 2.4913050119226565e-07, "loss": 0.0006, "step": 9416 }, { "epoch": 4.28434940855323, "grad_norm": 0.5885786238054344, "learning_rate": 2.4881959903924447e-07, "loss": 0.0094, "step": 9417 }, { "epoch": 4.284804367606915, "grad_norm": 0.256030803086381, "learning_rate": 2.4850888084251986e-07, "loss": 0.0012, "step": 9418 }, { "epoch": 4.285259326660601, "grad_norm": 0.29302825991327786, "learning_rate": 2.4819834662748205e-07, "loss": 0.0025, "step": 9419 }, { "epoch": 4.285714285714286, "grad_norm": 0.48396613797578203, "learning_rate": 2.4788799641950605e-07, "loss": 0.0029, "step": 9420 }, { "epoch": 4.2861692447679705, "grad_norm": 0.09912977891580249, "learning_rate": 2.4757783024395244e-07, "loss": 0.0003, "step": 9421 }, { "epoch": 4.286624203821656, "grad_norm": 0.7656378607790022, "learning_rate": 2.4726784812616645e-07, "loss": 0.0086, "step": 9422 }, { "epoch": 4.287079162875341, "grad_norm": 0.5726806413713623, "learning_rate": 2.469580500914789e-07, "loss": 0.0101, "step": 9423 }, { "epoch": 4.287534121929026, "grad_norm": 0.12605602856185735, "learning_rate": 2.4664843616520525e-07, "loss": 0.0007, "step": 9424 }, { "epoch": 4.287989080982712, "grad_norm": 0.29276737290917876, "learning_rate": 2.4633900637264507e-07, "loss": 0.0014, "step": 9425 }, { "epoch": 4.288444040036397, "grad_norm": 0.19035115863401092, "learning_rate": 2.4602976073908354e-07, "loss": 0.0015, "step": 9426 }, { "epoch": 4.288898999090081, "grad_norm": 0.544372247409013, "learning_rate": 2.4572069928979147e-07, "loss": 0.0027, "step": 9427 }, { "epoch": 4.289353958143767, "grad_norm": 0.9606352697974171, "learning_rate": 2.454118220500237e-07, "loss": 0.0054, "step": 9428 }, { "epoch": 4.289808917197452, "grad_norm": 0.3391843756211158, "learning_rate": 2.451031290450198e-07, "loss": 0.0012, "step": 9429 }, { "epoch": 4.290263876251138, "grad_norm": 0.30029177880870644, "learning_rate": 2.4479462030000566e-07, "loss": 0.002, "step": 9430 }, { "epoch": 4.290718835304823, "grad_norm": 0.1852185197787274, "learning_rate": 2.4448629584019003e-07, "loss": 0.0007, "step": 9431 }, { "epoch": 4.2911737943585075, "grad_norm": 0.3193457118778118, "learning_rate": 2.441781556907694e-07, "loss": 0.0024, "step": 9432 }, { "epoch": 4.291628753412193, "grad_norm": 0.49267517261193094, "learning_rate": 2.4387019987692215e-07, "loss": 0.0055, "step": 9433 }, { "epoch": 4.292083712465878, "grad_norm": 1.2382524727167499, "learning_rate": 2.435624284238139e-07, "loss": 0.0026, "step": 9434 }, { "epoch": 4.292538671519563, "grad_norm": 0.1855214584025504, "learning_rate": 2.4325484135659356e-07, "loss": 0.0007, "step": 9435 }, { "epoch": 4.292993630573249, "grad_norm": 0.9132373690726981, "learning_rate": 2.4294743870039604e-07, "loss": 0.0071, "step": 9436 }, { "epoch": 4.293448589626934, "grad_norm": 0.8161346029169962, "learning_rate": 2.426402204803416e-07, "loss": 0.0081, "step": 9437 }, { "epoch": 4.2939035486806185, "grad_norm": 1.1497737179487786, "learning_rate": 2.423331867215342e-07, "loss": 0.002, "step": 9438 }, { "epoch": 4.294358507734304, "grad_norm": 0.6860850321134196, "learning_rate": 2.4202633744906336e-07, "loss": 0.0069, "step": 9439 }, { "epoch": 4.294813466787989, "grad_norm": 0.6214746171768587, "learning_rate": 2.417196726880025e-07, "loss": 0.0081, "step": 9440 }, { "epoch": 4.295268425841674, "grad_norm": 0.39049677199858357, "learning_rate": 2.414131924634125e-07, "loss": 0.0016, "step": 9441 }, { "epoch": 4.29572338489536, "grad_norm": 0.5198364790897549, "learning_rate": 2.411068968003366e-07, "loss": 0.0034, "step": 9442 }, { "epoch": 4.296178343949045, "grad_norm": 0.11127727351140632, "learning_rate": 2.408007857238037e-07, "loss": 0.0006, "step": 9443 }, { "epoch": 4.2966333030027295, "grad_norm": 0.3504047874929669, "learning_rate": 2.404948592588283e-07, "loss": 0.004, "step": 9444 }, { "epoch": 4.297088262056415, "grad_norm": 0.7737016333961039, "learning_rate": 2.4018911743040884e-07, "loss": 0.0044, "step": 9445 }, { "epoch": 4.2975432211101, "grad_norm": 0.23422508031285758, "learning_rate": 2.398835602635302e-07, "loss": 0.0013, "step": 9446 }, { "epoch": 4.297998180163785, "grad_norm": 0.715386127101944, "learning_rate": 2.3957818778316017e-07, "loss": 0.004, "step": 9447 }, { "epoch": 4.298453139217471, "grad_norm": 0.8287166622667207, "learning_rate": 2.3927300001425263e-07, "loss": 0.0091, "step": 9448 }, { "epoch": 4.298908098271156, "grad_norm": 0.5550981973485504, "learning_rate": 2.389679969817463e-07, "loss": 0.0017, "step": 9449 }, { "epoch": 4.2993630573248405, "grad_norm": 0.9604227209876903, "learning_rate": 2.3866317871056396e-07, "loss": 0.0046, "step": 9450 }, { "epoch": 4.299818016378526, "grad_norm": 0.3475164198616045, "learning_rate": 2.3835854522561457e-07, "loss": 0.0025, "step": 9451 }, { "epoch": 4.300272975432211, "grad_norm": 1.0226198759481613, "learning_rate": 2.380540965517919e-07, "loss": 0.0058, "step": 9452 }, { "epoch": 4.300727934485896, "grad_norm": 0.7848283331655461, "learning_rate": 2.3774983271397352e-07, "loss": 0.0155, "step": 9453 }, { "epoch": 4.301182893539582, "grad_norm": 1.1542774789424333, "learning_rate": 2.3744575373702256e-07, "loss": 0.0101, "step": 9454 }, { "epoch": 4.301637852593267, "grad_norm": 1.9835582309272446, "learning_rate": 2.3714185964578667e-07, "loss": 0.0085, "step": 9455 }, { "epoch": 4.3020928116469515, "grad_norm": 0.8438329434631188, "learning_rate": 2.3683815046509934e-07, "loss": 0.0045, "step": 9456 }, { "epoch": 4.302547770700637, "grad_norm": 1.2226092022257395, "learning_rate": 2.3653462621977764e-07, "loss": 0.0106, "step": 9457 }, { "epoch": 4.303002729754322, "grad_norm": 0.8119785703109904, "learning_rate": 2.3623128693462505e-07, "loss": 0.0045, "step": 9458 }, { "epoch": 4.303457688808007, "grad_norm": 0.3618319113639415, "learning_rate": 2.359281326344287e-07, "loss": 0.0016, "step": 9459 }, { "epoch": 4.303912647861693, "grad_norm": 0.2768629857766232, "learning_rate": 2.3562516334396018e-07, "loss": 0.0016, "step": 9460 }, { "epoch": 4.304367606915378, "grad_norm": 0.1776672195730611, "learning_rate": 2.35322379087978e-07, "loss": 0.001, "step": 9461 }, { "epoch": 4.304822565969062, "grad_norm": 0.8393337714160379, "learning_rate": 2.3501977989122405e-07, "loss": 0.02, "step": 9462 }, { "epoch": 4.305277525022748, "grad_norm": 0.24136013834918835, "learning_rate": 2.3471736577842492e-07, "loss": 0.0015, "step": 9463 }, { "epoch": 4.305732484076433, "grad_norm": 0.5371591993929902, "learning_rate": 2.3441513677429223e-07, "loss": 0.0071, "step": 9464 }, { "epoch": 4.306187443130118, "grad_norm": 1.5032555749585257, "learning_rate": 2.3411309290352347e-07, "loss": 0.0059, "step": 9465 }, { "epoch": 4.306642402183804, "grad_norm": 0.48781777567833895, "learning_rate": 2.3381123419080026e-07, "loss": 0.0045, "step": 9466 }, { "epoch": 4.3070973612374885, "grad_norm": 0.8564528413709025, "learning_rate": 2.3350956066078927e-07, "loss": 0.0146, "step": 9467 }, { "epoch": 4.307552320291173, "grad_norm": 1.1024802259155655, "learning_rate": 2.3320807233814135e-07, "loss": 0.0132, "step": 9468 }, { "epoch": 4.308007279344859, "grad_norm": 0.5101601875860028, "learning_rate": 2.3290676924749288e-07, "loss": 0.006, "step": 9469 }, { "epoch": 4.308462238398544, "grad_norm": 0.5488208043208028, "learning_rate": 2.326056514134653e-07, "loss": 0.0071, "step": 9470 }, { "epoch": 4.308917197452229, "grad_norm": 0.44554321580031964, "learning_rate": 2.323047188606642e-07, "loss": 0.0017, "step": 9471 }, { "epoch": 4.309372156505915, "grad_norm": 0.31135368629699534, "learning_rate": 2.320039716136807e-07, "loss": 0.0031, "step": 9472 }, { "epoch": 4.3098271155595995, "grad_norm": 0.8226019675564654, "learning_rate": 2.3170340969709077e-07, "loss": 0.0284, "step": 9473 }, { "epoch": 4.310282074613285, "grad_norm": 1.2783109621821611, "learning_rate": 2.3140303313545416e-07, "loss": 0.0301, "step": 9474 }, { "epoch": 4.31073703366697, "grad_norm": 0.1792398605129933, "learning_rate": 2.3110284195331733e-07, "loss": 0.0004, "step": 9475 }, { "epoch": 4.311191992720655, "grad_norm": 0.7472674349343935, "learning_rate": 2.3080283617520987e-07, "loss": 0.0047, "step": 9476 }, { "epoch": 4.311646951774341, "grad_norm": 0.4210770789644459, "learning_rate": 2.3050301582564715e-07, "loss": 0.003, "step": 9477 }, { "epoch": 4.312101910828026, "grad_norm": 0.32768280630848323, "learning_rate": 2.3020338092912848e-07, "loss": 0.0018, "step": 9478 }, { "epoch": 4.3125568698817105, "grad_norm": 0.30516493685190776, "learning_rate": 2.2990393151013923e-07, "loss": 0.0025, "step": 9479 }, { "epoch": 4.313011828935396, "grad_norm": 0.4758645331728424, "learning_rate": 2.296046675931496e-07, "loss": 0.0009, "step": 9480 }, { "epoch": 4.313466787989081, "grad_norm": 1.134135172454976, "learning_rate": 2.2930558920261332e-07, "loss": 0.0135, "step": 9481 }, { "epoch": 4.313921747042766, "grad_norm": 0.6261351433176319, "learning_rate": 2.2900669636297002e-07, "loss": 0.0035, "step": 9482 }, { "epoch": 4.314376706096452, "grad_norm": 0.7941978265493396, "learning_rate": 2.287079890986438e-07, "loss": 0.0097, "step": 9483 }, { "epoch": 4.314831665150137, "grad_norm": 0.18443058773496565, "learning_rate": 2.2840946743404313e-07, "loss": 0.001, "step": 9484 }, { "epoch": 4.3152866242038215, "grad_norm": 0.5990748797243969, "learning_rate": 2.2811113139356245e-07, "loss": 0.0082, "step": 9485 }, { "epoch": 4.315741583257507, "grad_norm": 0.6279798383847466, "learning_rate": 2.2781298100158084e-07, "loss": 0.0023, "step": 9486 }, { "epoch": 4.316196542311192, "grad_norm": 0.4929002679009532, "learning_rate": 2.275150162824613e-07, "loss": 0.0021, "step": 9487 }, { "epoch": 4.316651501364877, "grad_norm": 0.571909449977339, "learning_rate": 2.272172372605519e-07, "loss": 0.0036, "step": 9488 }, { "epoch": 4.317106460418563, "grad_norm": 0.7037780961753833, "learning_rate": 2.269196439601859e-07, "loss": 0.0132, "step": 9489 }, { "epoch": 4.317561419472248, "grad_norm": 0.7049274268741239, "learning_rate": 2.2662223640568192e-07, "loss": 0.0101, "step": 9490 }, { "epoch": 4.3180163785259325, "grad_norm": 0.25492556989120607, "learning_rate": 2.2632501462134222e-07, "loss": 0.0009, "step": 9491 }, { "epoch": 4.318471337579618, "grad_norm": 0.8527308200400004, "learning_rate": 2.2602797863145397e-07, "loss": 0.0026, "step": 9492 }, { "epoch": 4.318926296633303, "grad_norm": 0.3676134595952648, "learning_rate": 2.2573112846029087e-07, "loss": 0.0062, "step": 9493 }, { "epoch": 4.319381255686988, "grad_norm": 0.8496150228704988, "learning_rate": 2.2543446413210879e-07, "loss": 0.0077, "step": 9494 }, { "epoch": 4.319836214740674, "grad_norm": 1.0006976352642767, "learning_rate": 2.251379856711508e-07, "loss": 0.0183, "step": 9495 }, { "epoch": 4.320291173794359, "grad_norm": 0.421378373014245, "learning_rate": 2.2484169310164366e-07, "loss": 0.0077, "step": 9496 }, { "epoch": 4.320746132848043, "grad_norm": 7.80670158119926, "learning_rate": 2.2454558644779856e-07, "loss": 0.0075, "step": 9497 }, { "epoch": 4.321201091901729, "grad_norm": 0.6175489645361679, "learning_rate": 2.2424966573381195e-07, "loss": 0.0042, "step": 9498 }, { "epoch": 4.321656050955414, "grad_norm": 0.16623704064904185, "learning_rate": 2.2395393098386565e-07, "loss": 0.0004, "step": 9499 }, { "epoch": 4.322111010009099, "grad_norm": 0.7396751896573991, "learning_rate": 2.2365838222212583e-07, "loss": 0.0233, "step": 9500 }, { "epoch": 4.322565969062785, "grad_norm": 0.47156262126415566, "learning_rate": 2.233630194727432e-07, "loss": 0.0068, "step": 9501 }, { "epoch": 4.3230209281164695, "grad_norm": 0.4670655857183652, "learning_rate": 2.2306784275985344e-07, "loss": 0.0044, "step": 9502 }, { "epoch": 4.323475887170154, "grad_norm": 0.3657382828405814, "learning_rate": 2.2277285210757644e-07, "loss": 0.0022, "step": 9503 }, { "epoch": 4.32393084622384, "grad_norm": 0.6685564296587082, "learning_rate": 2.2247804754001874e-07, "loss": 0.0078, "step": 9504 }, { "epoch": 4.324385805277525, "grad_norm": 0.5966284767924025, "learning_rate": 2.2218342908126965e-07, "loss": 0.0022, "step": 9505 }, { "epoch": 4.32484076433121, "grad_norm": 0.19866208179755834, "learning_rate": 2.2188899675540388e-07, "loss": 0.001, "step": 9506 }, { "epoch": 4.325295723384896, "grad_norm": 1.0325878384999023, "learning_rate": 2.2159475058648184e-07, "loss": 0.0062, "step": 9507 }, { "epoch": 4.3257506824385805, "grad_norm": 0.48838573872408964, "learning_rate": 2.213006905985471e-07, "loss": 0.0044, "step": 9508 }, { "epoch": 4.326205641492265, "grad_norm": 0.22369189389446545, "learning_rate": 2.2100681681562985e-07, "loss": 0.0012, "step": 9509 }, { "epoch": 4.326660600545951, "grad_norm": 0.5757588968581829, "learning_rate": 2.207131292617437e-07, "loss": 0.0085, "step": 9510 }, { "epoch": 4.327115559599636, "grad_norm": 0.7248346709071533, "learning_rate": 2.204196279608875e-07, "loss": 0.0087, "step": 9511 }, { "epoch": 4.327570518653321, "grad_norm": 0.12215024408209349, "learning_rate": 2.201263129370443e-07, "loss": 0.0006, "step": 9512 }, { "epoch": 4.328025477707007, "grad_norm": 0.28186326226351416, "learning_rate": 2.198331842141832e-07, "loss": 0.0041, "step": 9513 }, { "epoch": 4.3284804367606915, "grad_norm": 0.5604150303657685, "learning_rate": 2.1954024181625732e-07, "loss": 0.0067, "step": 9514 }, { "epoch": 4.328935395814376, "grad_norm": 0.2538700355397201, "learning_rate": 2.1924748576720445e-07, "loss": 0.0008, "step": 9515 }, { "epoch": 4.329390354868062, "grad_norm": 0.48937063275237347, "learning_rate": 2.189549160909474e-07, "loss": 0.0025, "step": 9516 }, { "epoch": 4.329845313921747, "grad_norm": 0.6927845648898731, "learning_rate": 2.186625328113931e-07, "loss": 0.0074, "step": 9517 }, { "epoch": 4.330300272975432, "grad_norm": 1.7537912193791618, "learning_rate": 2.1837033595243446e-07, "loss": 0.0122, "step": 9518 }, { "epoch": 4.330755232029118, "grad_norm": 0.7526609899311327, "learning_rate": 2.1807832553794815e-07, "loss": 0.0133, "step": 9519 }, { "epoch": 4.3312101910828025, "grad_norm": 0.7385374979507293, "learning_rate": 2.177865015917957e-07, "loss": 0.0174, "step": 9520 }, { "epoch": 4.331665150136487, "grad_norm": 1.3491770453722003, "learning_rate": 2.174948641378244e-07, "loss": 0.0031, "step": 9521 }, { "epoch": 4.332120109190173, "grad_norm": 0.9738256221602035, "learning_rate": 2.1720341319986516e-07, "loss": 0.0081, "step": 9522 }, { "epoch": 4.332575068243858, "grad_norm": 1.033204485368784, "learning_rate": 2.169121488017334e-07, "loss": 0.0043, "step": 9523 }, { "epoch": 4.333030027297543, "grad_norm": 0.3893880714847734, "learning_rate": 2.1662107096723116e-07, "loss": 0.0012, "step": 9524 }, { "epoch": 4.333484986351229, "grad_norm": 0.5570089096480714, "learning_rate": 2.163301797201431e-07, "loss": 0.0032, "step": 9525 }, { "epoch": 4.3339399454049135, "grad_norm": 0.6659273823084348, "learning_rate": 2.1603947508423983e-07, "loss": 0.0088, "step": 9526 }, { "epoch": 4.334394904458598, "grad_norm": 1.7354960408153683, "learning_rate": 2.1574895708327603e-07, "loss": 0.0095, "step": 9527 }, { "epoch": 4.334849863512284, "grad_norm": 0.9492430535142498, "learning_rate": 2.1545862574099185e-07, "loss": 0.009, "step": 9528 }, { "epoch": 4.335304822565969, "grad_norm": 0.7695610498898182, "learning_rate": 2.151684810811122e-07, "loss": 0.0048, "step": 9529 }, { "epoch": 4.335759781619654, "grad_norm": 0.1412963807100035, "learning_rate": 2.148785231273462e-07, "loss": 0.0004, "step": 9530 }, { "epoch": 4.33621474067334, "grad_norm": 0.27930072158465036, "learning_rate": 2.1458875190338792e-07, "loss": 0.0005, "step": 9531 }, { "epoch": 4.336669699727024, "grad_norm": 0.09006956652584609, "learning_rate": 2.1429916743291534e-07, "loss": 0.0003, "step": 9532 }, { "epoch": 4.337124658780709, "grad_norm": 0.707829118387955, "learning_rate": 2.140097697395932e-07, "loss": 0.0021, "step": 9533 }, { "epoch": 4.337579617834395, "grad_norm": 0.42754786376296305, "learning_rate": 2.1372055884706865e-07, "loss": 0.0036, "step": 9534 }, { "epoch": 4.33803457688808, "grad_norm": 0.6157811961158592, "learning_rate": 2.1343153477897587e-07, "loss": 0.0035, "step": 9535 }, { "epoch": 4.338489535941765, "grad_norm": 0.629024068078715, "learning_rate": 2.1314269755893209e-07, "loss": 0.0039, "step": 9536 }, { "epoch": 4.3389444949954505, "grad_norm": 0.6131214278457429, "learning_rate": 2.1285404721053893e-07, "loss": 0.0119, "step": 9537 }, { "epoch": 4.339399454049135, "grad_norm": 0.14788210833098886, "learning_rate": 2.1256558375738507e-07, "loss": 0.0007, "step": 9538 }, { "epoch": 4.339854413102821, "grad_norm": 0.554017628157928, "learning_rate": 2.122773072230419e-07, "loss": 0.0153, "step": 9539 }, { "epoch": 4.340309372156506, "grad_norm": 0.6531475361310309, "learning_rate": 2.1198921763106562e-07, "loss": 0.0075, "step": 9540 }, { "epoch": 4.340764331210191, "grad_norm": 0.4228175264290507, "learning_rate": 2.1170131500499763e-07, "loss": 0.0032, "step": 9541 }, { "epoch": 4.341219290263877, "grad_norm": 1.6013032351606624, "learning_rate": 2.1141359936836414e-07, "loss": 0.0046, "step": 9542 }, { "epoch": 4.3416742493175615, "grad_norm": 0.4960870037757189, "learning_rate": 2.111260707446769e-07, "loss": 0.0066, "step": 9543 }, { "epoch": 4.342129208371246, "grad_norm": 0.6568185748087135, "learning_rate": 2.108387291574304e-07, "loss": 0.0069, "step": 9544 }, { "epoch": 4.342584167424932, "grad_norm": 0.6176138469422263, "learning_rate": 2.1055157463010512e-07, "loss": 0.0085, "step": 9545 }, { "epoch": 4.343039126478617, "grad_norm": 0.4006701337338917, "learning_rate": 2.102646071861661e-07, "loss": 0.0004, "step": 9546 }, { "epoch": 4.343494085532302, "grad_norm": 0.5573120131276602, "learning_rate": 2.099778268490632e-07, "loss": 0.012, "step": 9547 }, { "epoch": 4.343949044585988, "grad_norm": 0.8265211464827064, "learning_rate": 2.0969123364222993e-07, "loss": 0.0061, "step": 9548 }, { "epoch": 4.3444040036396725, "grad_norm": 0.43462702347345583, "learning_rate": 2.0940482758908698e-07, "loss": 0.0091, "step": 9549 }, { "epoch": 4.344858962693357, "grad_norm": 0.4137200732411227, "learning_rate": 2.09118608713037e-07, "loss": 0.0025, "step": 9550 }, { "epoch": 4.345313921747043, "grad_norm": 1.6118258108310075, "learning_rate": 2.0883257703746856e-07, "loss": 0.0052, "step": 9551 }, { "epoch": 4.345768880800728, "grad_norm": 0.7924425433608572, "learning_rate": 2.0854673258575542e-07, "loss": 0.0051, "step": 9552 }, { "epoch": 4.346223839854413, "grad_norm": 1.5197154148818657, "learning_rate": 2.082610753812553e-07, "loss": 0.0079, "step": 9553 }, { "epoch": 4.346678798908099, "grad_norm": 0.6679287572057152, "learning_rate": 2.0797560544731061e-07, "loss": 0.0169, "step": 9554 }, { "epoch": 4.3471337579617835, "grad_norm": 0.27104928746179563, "learning_rate": 2.0769032280724827e-07, "loss": 0.0018, "step": 9555 }, { "epoch": 4.347588717015468, "grad_norm": 1.2058920567083182, "learning_rate": 2.074052274843813e-07, "loss": 0.0159, "step": 9556 }, { "epoch": 4.348043676069154, "grad_norm": 0.4140612419364471, "learning_rate": 2.0712031950200523e-07, "loss": 0.0018, "step": 9557 }, { "epoch": 4.348498635122839, "grad_norm": 0.590485923167175, "learning_rate": 2.0683559888340283e-07, "loss": 0.0021, "step": 9558 }, { "epoch": 4.348953594176524, "grad_norm": 0.6645384782404199, "learning_rate": 2.0655106565183934e-07, "loss": 0.0093, "step": 9559 }, { "epoch": 4.34940855323021, "grad_norm": 1.7198045092936984, "learning_rate": 2.0626671983056566e-07, "loss": 0.0454, "step": 9560 }, { "epoch": 4.3498635122838945, "grad_norm": 1.112841632627292, "learning_rate": 2.0598256144281654e-07, "loss": 0.0255, "step": 9561 }, { "epoch": 4.350318471337579, "grad_norm": 0.2122236049635532, "learning_rate": 2.0569859051181306e-07, "loss": 0.0011, "step": 9562 }, { "epoch": 4.350773430391265, "grad_norm": 0.4641714590798055, "learning_rate": 2.0541480706076033e-07, "loss": 0.0042, "step": 9563 }, { "epoch": 4.35122838944495, "grad_norm": 0.2931671175787406, "learning_rate": 2.05131211112847e-07, "loss": 0.0007, "step": 9564 }, { "epoch": 4.351683348498635, "grad_norm": 0.4733752484534316, "learning_rate": 2.0484780269124787e-07, "loss": 0.0056, "step": 9565 }, { "epoch": 4.352138307552321, "grad_norm": 0.7341303681472258, "learning_rate": 2.0456458181912082e-07, "loss": 0.0094, "step": 9566 }, { "epoch": 4.352593266606005, "grad_norm": 0.7969021419526019, "learning_rate": 2.0428154851961062e-07, "loss": 0.0038, "step": 9567 }, { "epoch": 4.35304822565969, "grad_norm": 0.10028420297511419, "learning_rate": 2.0399870281584467e-07, "loss": 0.0004, "step": 9568 }, { "epoch": 4.353503184713376, "grad_norm": 0.28111060837582236, "learning_rate": 2.037160447309358e-07, "loss": 0.0017, "step": 9569 }, { "epoch": 4.353958143767061, "grad_norm": 0.46627968424164323, "learning_rate": 2.0343357428798256e-07, "loss": 0.0056, "step": 9570 }, { "epoch": 4.354413102820746, "grad_norm": 0.5064828106613256, "learning_rate": 2.0315129151006557e-07, "loss": 0.0228, "step": 9571 }, { "epoch": 4.3548680618744315, "grad_norm": 0.5941199963395051, "learning_rate": 2.028691964202531e-07, "loss": 0.018, "step": 9572 }, { "epoch": 4.355323020928116, "grad_norm": 0.2500336375256369, "learning_rate": 2.0258728904159612e-07, "loss": 0.0015, "step": 9573 }, { "epoch": 4.355777979981801, "grad_norm": 0.45482862289357756, "learning_rate": 2.0230556939713098e-07, "loss": 0.0056, "step": 9574 }, { "epoch": 4.356232939035487, "grad_norm": 0.5324584885194383, "learning_rate": 2.0202403750987809e-07, "loss": 0.0059, "step": 9575 }, { "epoch": 4.356687898089172, "grad_norm": 1.4075883236666897, "learning_rate": 2.0174269340284297e-07, "loss": 0.0133, "step": 9576 }, { "epoch": 4.357142857142857, "grad_norm": 1.0631693561229703, "learning_rate": 2.0146153709901666e-07, "loss": 0.0034, "step": 9577 }, { "epoch": 4.3575978161965425, "grad_norm": 0.2500732427418824, "learning_rate": 2.0118056862137358e-07, "loss": 0.0009, "step": 9578 }, { "epoch": 4.358052775250227, "grad_norm": 0.13508685678907587, "learning_rate": 2.0089978799287286e-07, "loss": 0.0007, "step": 9579 }, { "epoch": 4.358507734303912, "grad_norm": 0.7067082358349016, "learning_rate": 2.0061919523645835e-07, "loss": 0.0104, "step": 9580 }, { "epoch": 4.358962693357598, "grad_norm": 0.5385799629974599, "learning_rate": 2.0033879037506003e-07, "loss": 0.0064, "step": 9581 }, { "epoch": 4.359417652411283, "grad_norm": 0.7636302663142198, "learning_rate": 2.0005857343159046e-07, "loss": 0.0112, "step": 9582 }, { "epoch": 4.359872611464968, "grad_norm": 0.2823245124350229, "learning_rate": 1.997785444289471e-07, "loss": 0.0011, "step": 9583 }, { "epoch": 4.3603275705186535, "grad_norm": 0.39496230962408213, "learning_rate": 1.9949870339001422e-07, "loss": 0.0046, "step": 9584 }, { "epoch": 4.360782529572338, "grad_norm": 0.4063023272170611, "learning_rate": 1.992190503376576e-07, "loss": 0.0063, "step": 9585 }, { "epoch": 4.361237488626024, "grad_norm": 0.7739200701817368, "learning_rate": 1.989395852947304e-07, "loss": 0.0035, "step": 9586 }, { "epoch": 4.361692447679709, "grad_norm": 0.3802657602736787, "learning_rate": 1.9866030828406908e-07, "loss": 0.002, "step": 9587 }, { "epoch": 4.362147406733394, "grad_norm": 0.6033258621635715, "learning_rate": 1.9838121932849423e-07, "loss": 0.0048, "step": 9588 }, { "epoch": 4.36260236578708, "grad_norm": 0.6327707777406835, "learning_rate": 1.981023184508124e-07, "loss": 0.0055, "step": 9589 }, { "epoch": 4.3630573248407645, "grad_norm": 0.31788380505448205, "learning_rate": 1.978236056738128e-07, "loss": 0.001, "step": 9590 }, { "epoch": 4.363512283894449, "grad_norm": 0.6435527843978914, "learning_rate": 1.9754508102027253e-07, "loss": 0.0035, "step": 9591 }, { "epoch": 4.363967242948135, "grad_norm": 0.8656448085366537, "learning_rate": 1.972667445129506e-07, "loss": 0.0246, "step": 9592 }, { "epoch": 4.36442220200182, "grad_norm": 0.9909982792685692, "learning_rate": 1.96988596174591e-07, "loss": 0.0181, "step": 9593 }, { "epoch": 4.364877161055505, "grad_norm": 0.5539878344802275, "learning_rate": 1.9671063602792307e-07, "loss": 0.004, "step": 9594 }, { "epoch": 4.365332120109191, "grad_norm": 0.6121996516787316, "learning_rate": 1.9643286409566004e-07, "loss": 0.0033, "step": 9595 }, { "epoch": 4.3657870791628755, "grad_norm": 0.5482000232068092, "learning_rate": 1.9615528040050098e-07, "loss": 0.0031, "step": 9596 }, { "epoch": 4.36624203821656, "grad_norm": 0.44787297218252586, "learning_rate": 1.95877884965128e-07, "loss": 0.0021, "step": 9597 }, { "epoch": 4.366696997270246, "grad_norm": 0.8230883415361642, "learning_rate": 1.956006778122091e-07, "loss": 0.0169, "step": 9598 }, { "epoch": 4.367151956323931, "grad_norm": 0.7033709180419427, "learning_rate": 1.9532365896439642e-07, "loss": 0.0041, "step": 9599 }, { "epoch": 4.367606915377616, "grad_norm": 0.5594121114999732, "learning_rate": 1.9504682844432603e-07, "loss": 0.0305, "step": 9600 }, { "epoch": 4.368061874431302, "grad_norm": 0.25626693876957424, "learning_rate": 1.9477018627462013e-07, "loss": 0.001, "step": 9601 }, { "epoch": 4.368516833484986, "grad_norm": 0.917464834594088, "learning_rate": 1.944937324778845e-07, "loss": 0.0341, "step": 9602 }, { "epoch": 4.368971792538671, "grad_norm": 1.083156094788485, "learning_rate": 1.9421746707670913e-07, "loss": 0.0024, "step": 9603 }, { "epoch": 4.369426751592357, "grad_norm": 0.637445262161518, "learning_rate": 1.939413900936693e-07, "loss": 0.0093, "step": 9604 }, { "epoch": 4.369881710646042, "grad_norm": 0.5619158237389463, "learning_rate": 1.93665501551325e-07, "loss": 0.0039, "step": 9605 }, { "epoch": 4.370336669699727, "grad_norm": 0.6083945943034871, "learning_rate": 1.9338980147222103e-07, "loss": 0.0068, "step": 9606 }, { "epoch": 4.3707916287534125, "grad_norm": 1.017649666621362, "learning_rate": 1.9311428987888597e-07, "loss": 0.0102, "step": 9607 }, { "epoch": 4.371246587807097, "grad_norm": 0.31597160512119615, "learning_rate": 1.9283896679383325e-07, "loss": 0.0018, "step": 9608 }, { "epoch": 4.371701546860782, "grad_norm": 0.489442124661251, "learning_rate": 1.9256383223956067e-07, "loss": 0.0026, "step": 9609 }, { "epoch": 4.372156505914468, "grad_norm": 1.3329604224032816, "learning_rate": 1.9228888623855192e-07, "loss": 0.0119, "step": 9610 }, { "epoch": 4.372611464968153, "grad_norm": 0.10863581476598971, "learning_rate": 1.9201412881327318e-07, "loss": 0.0005, "step": 9611 }, { "epoch": 4.373066424021838, "grad_norm": 0.3298600985336216, "learning_rate": 1.9173955998617794e-07, "loss": 0.0023, "step": 9612 }, { "epoch": 4.3735213830755235, "grad_norm": 0.358690561726219, "learning_rate": 1.9146517977970152e-07, "loss": 0.0122, "step": 9613 }, { "epoch": 4.373976342129208, "grad_norm": 0.6164302606999852, "learning_rate": 1.9119098821626492e-07, "loss": 0.0083, "step": 9614 }, { "epoch": 4.374431301182893, "grad_norm": 0.28791037921120155, "learning_rate": 1.909169853182749e-07, "loss": 0.0008, "step": 9615 }, { "epoch": 4.374886260236579, "grad_norm": 0.4100747827983027, "learning_rate": 1.906431711081211e-07, "loss": 0.0015, "step": 9616 }, { "epoch": 4.375341219290264, "grad_norm": 0.5072265885492898, "learning_rate": 1.9036954560817804e-07, "loss": 0.0168, "step": 9617 }, { "epoch": 4.375796178343949, "grad_norm": 1.2809488111982297, "learning_rate": 1.9009610884080543e-07, "loss": 0.0067, "step": 9618 }, { "epoch": 4.3762511373976345, "grad_norm": 0.4991499726549628, "learning_rate": 1.8982286082834728e-07, "loss": 0.0102, "step": 9619 }, { "epoch": 4.376706096451319, "grad_norm": 0.5119498326097054, "learning_rate": 1.895498015931327e-07, "loss": 0.0074, "step": 9620 }, { "epoch": 4.377161055505004, "grad_norm": 0.4743212778667676, "learning_rate": 1.8927693115747464e-07, "loss": 0.0082, "step": 9621 }, { "epoch": 4.37761601455869, "grad_norm": 1.6236115407718454, "learning_rate": 1.8900424954367031e-07, "loss": 0.0045, "step": 9622 }, { "epoch": 4.378070973612375, "grad_norm": 0.3080430212675028, "learning_rate": 1.8873175677400212e-07, "loss": 0.002, "step": 9623 }, { "epoch": 4.37852593266606, "grad_norm": 0.1050917443284625, "learning_rate": 1.884594528707376e-07, "loss": 0.0005, "step": 9624 }, { "epoch": 4.3789808917197455, "grad_norm": 0.6153836249507183, "learning_rate": 1.881873378561272e-07, "loss": 0.0108, "step": 9625 }, { "epoch": 4.37943585077343, "grad_norm": 0.18785053603560203, "learning_rate": 1.879154117524079e-07, "loss": 0.0008, "step": 9626 }, { "epoch": 4.379890809827115, "grad_norm": 0.3160849717137599, "learning_rate": 1.876436745818e-07, "loss": 0.0009, "step": 9627 }, { "epoch": 4.380345768880801, "grad_norm": 1.0566975633716893, "learning_rate": 1.8737212636650848e-07, "loss": 0.0344, "step": 9628 }, { "epoch": 4.380800727934486, "grad_norm": 0.4516457722426166, "learning_rate": 1.8710076712872254e-07, "loss": 0.0038, "step": 9629 }, { "epoch": 4.381255686988171, "grad_norm": 0.428314800652728, "learning_rate": 1.8682959689061753e-07, "loss": 0.0042, "step": 9630 }, { "epoch": 4.3817106460418564, "grad_norm": 0.4464672159079038, "learning_rate": 1.8655861567435152e-07, "loss": 0.0032, "step": 9631 }, { "epoch": 4.382165605095541, "grad_norm": 0.12899165524605963, "learning_rate": 1.8628782350206764e-07, "loss": 0.0005, "step": 9632 }, { "epoch": 4.382620564149226, "grad_norm": 0.8787760280330905, "learning_rate": 1.8601722039589488e-07, "loss": 0.0177, "step": 9633 }, { "epoch": 4.383075523202912, "grad_norm": 0.515121759348226, "learning_rate": 1.8574680637794413e-07, "loss": 0.0082, "step": 9634 }, { "epoch": 4.383530482256597, "grad_norm": 0.881262972054177, "learning_rate": 1.8547658147031412e-07, "loss": 0.0073, "step": 9635 }, { "epoch": 4.383985441310282, "grad_norm": 0.5534621256848222, "learning_rate": 1.8520654569508523e-07, "loss": 0.0037, "step": 9636 }, { "epoch": 4.384440400363967, "grad_norm": 0.5168093151237019, "learning_rate": 1.8493669907432426e-07, "loss": 0.0068, "step": 9637 }, { "epoch": 4.384895359417652, "grad_norm": 0.4579590784863398, "learning_rate": 1.8466704163008082e-07, "loss": 0.0013, "step": 9638 }, { "epoch": 4.385350318471337, "grad_norm": 0.46184902676731554, "learning_rate": 1.8439757338439085e-07, "loss": 0.0036, "step": 9639 }, { "epoch": 4.385805277525023, "grad_norm": 0.12450525017749033, "learning_rate": 1.8412829435927453e-07, "loss": 0.0005, "step": 9640 }, { "epoch": 4.386260236578708, "grad_norm": 0.7959218763354149, "learning_rate": 1.838592045767354e-07, "loss": 0.0052, "step": 9641 }, { "epoch": 4.386715195632393, "grad_norm": 0.5645888004827234, "learning_rate": 1.8359030405876276e-07, "loss": 0.0018, "step": 9642 }, { "epoch": 4.387170154686078, "grad_norm": 0.7260854059438131, "learning_rate": 1.833215928273291e-07, "loss": 0.0126, "step": 9643 }, { "epoch": 4.387625113739763, "grad_norm": 0.5191043670039629, "learning_rate": 1.8305307090439318e-07, "loss": 0.0031, "step": 9644 }, { "epoch": 4.388080072793448, "grad_norm": 0.5844194278172518, "learning_rate": 1.8278473831189718e-07, "loss": 0.0064, "step": 9645 }, { "epoch": 4.388535031847134, "grad_norm": 0.22854363922269422, "learning_rate": 1.825165950717675e-07, "loss": 0.0013, "step": 9646 }, { "epoch": 4.388989990900819, "grad_norm": 0.645132216707225, "learning_rate": 1.8224864120591628e-07, "loss": 0.0192, "step": 9647 }, { "epoch": 4.389444949954504, "grad_norm": 0.8390465412592926, "learning_rate": 1.819808767362391e-07, "loss": 0.0108, "step": 9648 }, { "epoch": 4.389899909008189, "grad_norm": 0.6292180153097627, "learning_rate": 1.8171330168461675e-07, "loss": 0.0056, "step": 9649 }, { "epoch": 4.390354868061874, "grad_norm": 0.8473047963866898, "learning_rate": 1.8144591607291427e-07, "loss": 0.0141, "step": 9650 }, { "epoch": 4.39080982711556, "grad_norm": 0.32508632635927676, "learning_rate": 1.8117871992298086e-07, "loss": 0.0021, "step": 9651 }, { "epoch": 4.391264786169245, "grad_norm": 0.5972129556851212, "learning_rate": 1.8091171325665042e-07, "loss": 0.0021, "step": 9652 }, { "epoch": 4.39171974522293, "grad_norm": 0.17687118732085555, "learning_rate": 1.8064489609574186e-07, "loss": 0.0006, "step": 9653 }, { "epoch": 4.3921747042766155, "grad_norm": 0.5877914946701797, "learning_rate": 1.8037826846205864e-07, "loss": 0.0082, "step": 9654 }, { "epoch": 4.3926296633303, "grad_norm": 0.5329114627596371, "learning_rate": 1.80111830377388e-07, "loss": 0.0094, "step": 9655 }, { "epoch": 4.393084622383985, "grad_norm": 1.185672843164627, "learning_rate": 1.7984558186350226e-07, "loss": 0.0178, "step": 9656 }, { "epoch": 4.393539581437671, "grad_norm": 0.7403192408390645, "learning_rate": 1.795795229421571e-07, "loss": 0.0009, "step": 9657 }, { "epoch": 4.393994540491356, "grad_norm": 0.1760503905896683, "learning_rate": 1.7931365363509506e-07, "loss": 0.0009, "step": 9658 }, { "epoch": 4.394449499545041, "grad_norm": 0.27076337304599335, "learning_rate": 1.7904797396404106e-07, "loss": 0.0016, "step": 9659 }, { "epoch": 4.3949044585987265, "grad_norm": 0.22881079333953852, "learning_rate": 1.787824839507049e-07, "loss": 0.0011, "step": 9660 }, { "epoch": 4.395359417652411, "grad_norm": 0.8987649600855361, "learning_rate": 1.7851718361678206e-07, "loss": 0.0028, "step": 9661 }, { "epoch": 4.395814376706096, "grad_norm": 0.7629085559345663, "learning_rate": 1.7825207298395068e-07, "loss": 0.0219, "step": 9662 }, { "epoch": 4.396269335759782, "grad_norm": 0.09175049309941848, "learning_rate": 1.7798715207387545e-07, "loss": 0.0003, "step": 9663 }, { "epoch": 4.396724294813467, "grad_norm": 0.3359965303308679, "learning_rate": 1.7772242090820402e-07, "loss": 0.0011, "step": 9664 }, { "epoch": 4.397179253867152, "grad_norm": 0.629385329668107, "learning_rate": 1.7745787950856907e-07, "loss": 0.0224, "step": 9665 }, { "epoch": 4.3976342129208374, "grad_norm": 0.39164601950174166, "learning_rate": 1.7719352789658779e-07, "loss": 0.0079, "step": 9666 }, { "epoch": 4.398089171974522, "grad_norm": 0.16019812371942735, "learning_rate": 1.769293660938612e-07, "loss": 0.0009, "step": 9667 }, { "epoch": 4.398544131028207, "grad_norm": 1.1679240745554984, "learning_rate": 1.7666539412197619e-07, "loss": 0.0076, "step": 9668 }, { "epoch": 4.398999090081893, "grad_norm": 0.7908016417201755, "learning_rate": 1.7640161200250305e-07, "loss": 0.0143, "step": 9669 }, { "epoch": 4.399454049135578, "grad_norm": 0.5976296810007278, "learning_rate": 1.761380197569973e-07, "loss": 0.0094, "step": 9670 }, { "epoch": 4.399909008189263, "grad_norm": 0.3286637568552253, "learning_rate": 1.758746174069978e-07, "loss": 0.0032, "step": 9671 }, { "epoch": 4.400363967242948, "grad_norm": 0.8996187926045167, "learning_rate": 1.7561140497402874e-07, "loss": 0.0017, "step": 9672 }, { "epoch": 4.400818926296633, "grad_norm": 0.552673724159962, "learning_rate": 1.75348382479599e-07, "loss": 0.0014, "step": 9673 }, { "epoch": 4.401273885350318, "grad_norm": 0.5790883324021376, "learning_rate": 1.7508554994520117e-07, "loss": 0.0022, "step": 9674 }, { "epoch": 4.401728844404004, "grad_norm": 0.25805703253330825, "learning_rate": 1.7482290739231327e-07, "loss": 0.0015, "step": 9675 }, { "epoch": 4.402183803457689, "grad_norm": 0.24519210346794518, "learning_rate": 1.7456045484239708e-07, "loss": 0.0017, "step": 9676 }, { "epoch": 4.402638762511374, "grad_norm": 0.9100955831229194, "learning_rate": 1.7429819231689848e-07, "loss": 0.005, "step": 9677 }, { "epoch": 4.403093721565059, "grad_norm": 0.8475249954444521, "learning_rate": 1.740361198372492e-07, "loss": 0.0125, "step": 9678 }, { "epoch": 4.403548680618744, "grad_norm": 1.1532319412217367, "learning_rate": 1.7377423742486439e-07, "loss": 0.003, "step": 9679 }, { "epoch": 4.404003639672429, "grad_norm": 0.5254298290462994, "learning_rate": 1.735125451011435e-07, "loss": 0.0037, "step": 9680 }, { "epoch": 4.404458598726115, "grad_norm": 0.29218877062446486, "learning_rate": 1.7325104288747092e-07, "loss": 0.0015, "step": 9681 }, { "epoch": 4.4049135577798, "grad_norm": 0.24054906572388296, "learning_rate": 1.7298973080521532e-07, "loss": 0.0023, "step": 9682 }, { "epoch": 4.405368516833485, "grad_norm": 0.5943491850840132, "learning_rate": 1.72728608875731e-07, "loss": 0.0058, "step": 9683 }, { "epoch": 4.40582347588717, "grad_norm": 0.9864419262332652, "learning_rate": 1.7246767712035457e-07, "loss": 0.0102, "step": 9684 }, { "epoch": 4.406278434940855, "grad_norm": 0.6096806578087617, "learning_rate": 1.7220693556040862e-07, "loss": 0.0078, "step": 9685 }, { "epoch": 4.40673339399454, "grad_norm": 0.8447394880247437, "learning_rate": 1.719463842171995e-07, "loss": 0.0087, "step": 9686 }, { "epoch": 4.407188353048226, "grad_norm": 0.540903334716079, "learning_rate": 1.7168602311201875e-07, "loss": 0.0176, "step": 9687 }, { "epoch": 4.407643312101911, "grad_norm": 0.6859006803338078, "learning_rate": 1.7142585226614107e-07, "loss": 0.0105, "step": 9688 }, { "epoch": 4.408098271155596, "grad_norm": 0.42513302179215456, "learning_rate": 1.7116587170082743e-07, "loss": 0.0026, "step": 9689 }, { "epoch": 4.408553230209281, "grad_norm": 0.8522958409010785, "learning_rate": 1.70906081437322e-07, "loss": 0.0163, "step": 9690 }, { "epoch": 4.409008189262966, "grad_norm": 0.5267564658995249, "learning_rate": 1.706464814968531e-07, "loss": 0.0023, "step": 9691 }, { "epoch": 4.409463148316651, "grad_norm": 0.1658204901596422, "learning_rate": 1.7038707190063454e-07, "loss": 0.0007, "step": 9692 }, { "epoch": 4.409918107370337, "grad_norm": 0.18646842156434953, "learning_rate": 1.7012785266986438e-07, "loss": 0.0013, "step": 9693 }, { "epoch": 4.410373066424022, "grad_norm": 0.30139529485337174, "learning_rate": 1.698688238257243e-07, "loss": 0.0027, "step": 9694 }, { "epoch": 4.4108280254777075, "grad_norm": 0.5684149116348419, "learning_rate": 1.6960998538938096e-07, "loss": 0.0065, "step": 9695 }, { "epoch": 4.411282984531392, "grad_norm": 0.46723870580743065, "learning_rate": 1.693513373819855e-07, "loss": 0.0044, "step": 9696 }, { "epoch": 4.411737943585077, "grad_norm": 1.2518893945515488, "learning_rate": 1.6909287982467382e-07, "loss": 0.0087, "step": 9697 }, { "epoch": 4.412192902638763, "grad_norm": 1.033881787685317, "learning_rate": 1.688346127385662e-07, "loss": 0.0074, "step": 9698 }, { "epoch": 4.412647861692448, "grad_norm": 0.8496201925327224, "learning_rate": 1.685765361447661e-07, "loss": 0.0107, "step": 9699 }, { "epoch": 4.413102820746133, "grad_norm": 0.6775019133314827, "learning_rate": 1.6831865006436327e-07, "loss": 0.0059, "step": 9700 }, { "epoch": 4.4135577797998184, "grad_norm": 0.10985236613430738, "learning_rate": 1.6806095451843e-07, "loss": 0.0004, "step": 9701 }, { "epoch": 4.414012738853503, "grad_norm": 0.23814069599000554, "learning_rate": 1.678034495280245e-07, "loss": 0.0005, "step": 9702 }, { "epoch": 4.414467697907188, "grad_norm": 0.6587831085401895, "learning_rate": 1.6754613511418938e-07, "loss": 0.0039, "step": 9703 }, { "epoch": 4.414922656960874, "grad_norm": 1.132290625993569, "learning_rate": 1.6728901129795082e-07, "loss": 0.0035, "step": 9704 }, { "epoch": 4.415377616014559, "grad_norm": 0.8741924409127341, "learning_rate": 1.6703207810032012e-07, "loss": 0.0183, "step": 9705 }, { "epoch": 4.415832575068244, "grad_norm": 0.5799170657609555, "learning_rate": 1.6677533554229186e-07, "loss": 0.0154, "step": 9706 }, { "epoch": 4.416287534121929, "grad_norm": 0.5096568583864268, "learning_rate": 1.6651878364484676e-07, "loss": 0.0146, "step": 9707 }, { "epoch": 4.416742493175614, "grad_norm": 0.8129566795972774, "learning_rate": 1.6626242242894858e-07, "loss": 0.0078, "step": 9708 }, { "epoch": 4.417197452229299, "grad_norm": 0.8341497041701079, "learning_rate": 1.6600625191554616e-07, "loss": 0.0243, "step": 9709 }, { "epoch": 4.417652411282985, "grad_norm": 0.2915098906664663, "learning_rate": 1.657502721255727e-07, "loss": 0.0015, "step": 9710 }, { "epoch": 4.41810737033667, "grad_norm": 0.5685263052137177, "learning_rate": 1.6549448307994542e-07, "loss": 0.0092, "step": 9711 }, { "epoch": 4.418562329390355, "grad_norm": 0.3675310995674599, "learning_rate": 1.6523888479956675e-07, "loss": 0.0057, "step": 9712 }, { "epoch": 4.41901728844404, "grad_norm": 0.6244767208240298, "learning_rate": 1.649834773053227e-07, "loss": 0.0129, "step": 9713 }, { "epoch": 4.419472247497725, "grad_norm": 0.6739874876901008, "learning_rate": 1.6472826061808416e-07, "loss": 0.0132, "step": 9714 }, { "epoch": 4.41992720655141, "grad_norm": 0.8156965243801084, "learning_rate": 1.6447323475870554e-07, "loss": 0.0215, "step": 9715 }, { "epoch": 4.420382165605096, "grad_norm": 0.3713707949864442, "learning_rate": 1.6421839974802733e-07, "loss": 0.0051, "step": 9716 }, { "epoch": 4.420837124658781, "grad_norm": 1.2190504151877812, "learning_rate": 1.639637556068735e-07, "loss": 0.0145, "step": 9717 }, { "epoch": 4.421292083712466, "grad_norm": 0.5620341984624162, "learning_rate": 1.6370930235605182e-07, "loss": 0.0072, "step": 9718 }, { "epoch": 4.421747042766151, "grad_norm": 0.7115244766917679, "learning_rate": 1.6345504001635564e-07, "loss": 0.0161, "step": 9719 }, { "epoch": 4.422202001819836, "grad_norm": 1.1670752112549103, "learning_rate": 1.6320096860856143e-07, "loss": 0.0109, "step": 9720 }, { "epoch": 4.422656960873521, "grad_norm": 0.6456899079360489, "learning_rate": 1.6294708815343174e-07, "loss": 0.0092, "step": 9721 }, { "epoch": 4.423111919927207, "grad_norm": 0.7349195743748974, "learning_rate": 1.6269339867171163e-07, "loss": 0.0102, "step": 9722 }, { "epoch": 4.423566878980892, "grad_norm": 0.8445726182245428, "learning_rate": 1.6243990018413146e-07, "loss": 0.0067, "step": 9723 }, { "epoch": 4.424021838034577, "grad_norm": 0.37615559223711487, "learning_rate": 1.621865927114069e-07, "loss": 0.0036, "step": 9724 }, { "epoch": 4.424476797088262, "grad_norm": 0.5196253509493691, "learning_rate": 1.619334762742361e-07, "loss": 0.0026, "step": 9725 }, { "epoch": 4.424931756141947, "grad_norm": 0.6056445525002015, "learning_rate": 1.6168055089330338e-07, "loss": 0.0091, "step": 9726 }, { "epoch": 4.425386715195632, "grad_norm": 0.297055592866198, "learning_rate": 1.6142781658927603e-07, "loss": 0.0009, "step": 9727 }, { "epoch": 4.425841674249318, "grad_norm": 0.34585269002971947, "learning_rate": 1.6117527338280674e-07, "loss": 0.0021, "step": 9728 }, { "epoch": 4.426296633303003, "grad_norm": 0.1597621626733815, "learning_rate": 1.609229212945318e-07, "loss": 0.0007, "step": 9729 }, { "epoch": 4.426751592356688, "grad_norm": 0.3890629620451909, "learning_rate": 1.6067076034507246e-07, "loss": 0.0062, "step": 9730 }, { "epoch": 4.427206551410373, "grad_norm": 0.07021509825117325, "learning_rate": 1.6041879055503473e-07, "loss": 0.0002, "step": 9731 }, { "epoch": 4.427661510464058, "grad_norm": 0.6787322996452306, "learning_rate": 1.60167011945008e-07, "loss": 0.0018, "step": 9732 }, { "epoch": 4.428116469517743, "grad_norm": 0.12042969094898866, "learning_rate": 1.5991542453556635e-07, "loss": 0.0005, "step": 9733 }, { "epoch": 4.428571428571429, "grad_norm": 0.09312673775025103, "learning_rate": 1.5966402834726862e-07, "loss": 0.0004, "step": 9734 }, { "epoch": 4.429026387625114, "grad_norm": 0.7991513390937816, "learning_rate": 1.59412823400657e-07, "loss": 0.0081, "step": 9735 }, { "epoch": 4.429481346678799, "grad_norm": 0.5897034484675205, "learning_rate": 1.5916180971626006e-07, "loss": 0.0054, "step": 9736 }, { "epoch": 4.429936305732484, "grad_norm": 0.39974977269685635, "learning_rate": 1.5891098731458832e-07, "loss": 0.005, "step": 9737 }, { "epoch": 4.430391264786169, "grad_norm": 0.880665929165812, "learning_rate": 1.58660356216139e-07, "loss": 0.0134, "step": 9738 }, { "epoch": 4.430846223839854, "grad_norm": 0.46221428211375176, "learning_rate": 1.5840991644139187e-07, "loss": 0.0026, "step": 9739 }, { "epoch": 4.43130118289354, "grad_norm": 1.5383885164760147, "learning_rate": 1.5815966801081163e-07, "loss": 0.0067, "step": 9740 }, { "epoch": 4.431756141947225, "grad_norm": 0.3148060789790762, "learning_rate": 1.5790961094484802e-07, "loss": 0.0032, "step": 9741 }, { "epoch": 4.4322111010009095, "grad_norm": 1.1636169094723658, "learning_rate": 1.576597452639339e-07, "loss": 0.0265, "step": 9742 }, { "epoch": 4.432666060054595, "grad_norm": 0.22728651119392043, "learning_rate": 1.5741007098848792e-07, "loss": 0.0009, "step": 9743 }, { "epoch": 4.43312101910828, "grad_norm": 0.6081573820687975, "learning_rate": 1.571605881389113e-07, "loss": 0.0071, "step": 9744 }, { "epoch": 4.433575978161965, "grad_norm": 0.7443638060862887, "learning_rate": 1.5691129673559098e-07, "loss": 0.017, "step": 9745 }, { "epoch": 4.434030937215651, "grad_norm": 0.8446804910836628, "learning_rate": 1.5666219679889906e-07, "loss": 0.0118, "step": 9746 }, { "epoch": 4.434485896269336, "grad_norm": 0.5861983311076432, "learning_rate": 1.5641328834918978e-07, "loss": 0.0035, "step": 9747 }, { "epoch": 4.4349408553230205, "grad_norm": 0.7397433518724431, "learning_rate": 1.5616457140680303e-07, "loss": 0.0046, "step": 9748 }, { "epoch": 4.435395814376706, "grad_norm": 0.8707363439976065, "learning_rate": 1.5591604599206223e-07, "loss": 0.0095, "step": 9749 }, { "epoch": 4.435850773430391, "grad_norm": 0.6775458750210451, "learning_rate": 1.5566771212527697e-07, "loss": 0.025, "step": 9750 }, { "epoch": 4.436305732484076, "grad_norm": 0.3692402312336025, "learning_rate": 1.5541956982673912e-07, "loss": 0.0025, "step": 9751 }, { "epoch": 4.436760691537762, "grad_norm": 0.8857172020489119, "learning_rate": 1.5517161911672628e-07, "loss": 0.0015, "step": 9752 }, { "epoch": 4.437215650591447, "grad_norm": 0.5773801924159077, "learning_rate": 1.5492386001549952e-07, "loss": 0.0053, "step": 9753 }, { "epoch": 4.4376706096451315, "grad_norm": 0.7091001174287956, "learning_rate": 1.54676292543304e-07, "loss": 0.0063, "step": 9754 }, { "epoch": 4.438125568698817, "grad_norm": 0.2894132240383063, "learning_rate": 1.5442891672037135e-07, "loss": 0.0021, "step": 9755 }, { "epoch": 4.438580527752502, "grad_norm": 0.3981633655054172, "learning_rate": 1.5418173256691481e-07, "loss": 0.0038, "step": 9756 }, { "epoch": 4.439035486806187, "grad_norm": 0.16104175349251398, "learning_rate": 1.5393474010313353e-07, "loss": 0.0006, "step": 9757 }, { "epoch": 4.439490445859873, "grad_norm": 0.25112422425514086, "learning_rate": 1.5368793934921023e-07, "loss": 0.0007, "step": 9758 }, { "epoch": 4.439945404913558, "grad_norm": 0.8274501167215043, "learning_rate": 1.5344133032531267e-07, "loss": 0.0089, "step": 9759 }, { "epoch": 4.440400363967243, "grad_norm": 0.7747186693805379, "learning_rate": 1.531949130515928e-07, "loss": 0.0064, "step": 9760 }, { "epoch": 4.440855323020928, "grad_norm": 1.1918980545134585, "learning_rate": 1.529486875481867e-07, "loss": 0.0023, "step": 9761 }, { "epoch": 4.441310282074613, "grad_norm": 0.7217142026429995, "learning_rate": 1.5270265383521472e-07, "loss": 0.0033, "step": 9762 }, { "epoch": 4.441765241128299, "grad_norm": 0.21957328735050882, "learning_rate": 1.5245681193278127e-07, "loss": 0.001, "step": 9763 }, { "epoch": 4.442220200181984, "grad_norm": 0.3001731835349383, "learning_rate": 1.522111618609759e-07, "loss": 0.0012, "step": 9764 }, { "epoch": 4.442675159235669, "grad_norm": 0.7073061071630043, "learning_rate": 1.5196570363987167e-07, "loss": 0.0042, "step": 9765 }, { "epoch": 4.443130118289354, "grad_norm": 0.32743281362562254, "learning_rate": 1.5172043728952672e-07, "loss": 0.0062, "step": 9766 }, { "epoch": 4.443585077343039, "grad_norm": 0.809150649643931, "learning_rate": 1.514753628299831e-07, "loss": 0.0049, "step": 9767 }, { "epoch": 4.444040036396724, "grad_norm": 0.6745060610482161, "learning_rate": 1.512304802812664e-07, "loss": 0.0033, "step": 9768 }, { "epoch": 4.44449499545041, "grad_norm": 0.49384629801622526, "learning_rate": 1.5098578966338845e-07, "loss": 0.0014, "step": 9769 }, { "epoch": 4.444949954504095, "grad_norm": 0.5347208195255794, "learning_rate": 1.507412909963435e-07, "loss": 0.0052, "step": 9770 }, { "epoch": 4.44540491355778, "grad_norm": 0.4480250934376436, "learning_rate": 1.504969843001114e-07, "loss": 0.0027, "step": 9771 }, { "epoch": 4.445859872611465, "grad_norm": 0.7668971451765911, "learning_rate": 1.5025286959465479e-07, "loss": 0.0069, "step": 9772 }, { "epoch": 4.44631483166515, "grad_norm": 0.3390131439021309, "learning_rate": 1.5000894689992274e-07, "loss": 0.0022, "step": 9773 }, { "epoch": 4.446769790718835, "grad_norm": 0.967394173933728, "learning_rate": 1.4976521623584678e-07, "loss": 0.0042, "step": 9774 }, { "epoch": 4.447224749772521, "grad_norm": 1.3249539282424374, "learning_rate": 1.4952167762234433e-07, "loss": 0.0165, "step": 9775 }, { "epoch": 4.447679708826206, "grad_norm": 0.44996545136459143, "learning_rate": 1.4927833107931556e-07, "loss": 0.0077, "step": 9776 }, { "epoch": 4.4481346678798905, "grad_norm": 0.9356888010534882, "learning_rate": 1.4903517662664568e-07, "loss": 0.0151, "step": 9777 }, { "epoch": 4.448589626933576, "grad_norm": 0.39740933209962176, "learning_rate": 1.487922142842041e-07, "loss": 0.0051, "step": 9778 }, { "epoch": 4.449044585987261, "grad_norm": 0.5162943487971867, "learning_rate": 1.4854944407184463e-07, "loss": 0.007, "step": 9779 }, { "epoch": 4.449499545040946, "grad_norm": 0.11969622775245517, "learning_rate": 1.4830686600940614e-07, "loss": 0.0004, "step": 9780 }, { "epoch": 4.449954504094632, "grad_norm": 1.46222004317012, "learning_rate": 1.4806448011671025e-07, "loss": 0.003, "step": 9781 }, { "epoch": 4.450409463148317, "grad_norm": 0.2249866842253085, "learning_rate": 1.4782228641356393e-07, "loss": 0.0009, "step": 9782 }, { "epoch": 4.4508644222020015, "grad_norm": 0.100107698340382, "learning_rate": 1.4758028491975745e-07, "loss": 0.0005, "step": 9783 }, { "epoch": 4.451319381255687, "grad_norm": 0.09998695715838525, "learning_rate": 1.473384756550672e-07, "loss": 0.0003, "step": 9784 }, { "epoch": 4.451774340309372, "grad_norm": 0.6900439300627942, "learning_rate": 1.470968586392521e-07, "loss": 0.0026, "step": 9785 }, { "epoch": 4.452229299363057, "grad_norm": 1.0679905947732706, "learning_rate": 1.468554338920558e-07, "loss": 0.0042, "step": 9786 }, { "epoch": 4.452684258416743, "grad_norm": 0.412863867776042, "learning_rate": 1.4661420143320725e-07, "loss": 0.0037, "step": 9787 }, { "epoch": 4.453139217470428, "grad_norm": 0.4374765303623458, "learning_rate": 1.4637316128241763e-07, "loss": 0.0031, "step": 9788 }, { "epoch": 4.4535941765241125, "grad_norm": 1.0095950194786134, "learning_rate": 1.4613231345938506e-07, "loss": 0.0033, "step": 9789 }, { "epoch": 4.454049135577798, "grad_norm": 0.5102877630068731, "learning_rate": 1.458916579837896e-07, "loss": 0.0011, "step": 9790 }, { "epoch": 4.454504094631483, "grad_norm": 0.6100666744032089, "learning_rate": 1.45651194875297e-07, "loss": 0.0013, "step": 9791 }, { "epoch": 4.454959053685168, "grad_norm": 0.09592515175657507, "learning_rate": 1.454109241535562e-07, "loss": 0.0003, "step": 9792 }, { "epoch": 4.455414012738854, "grad_norm": 0.4378641680707167, "learning_rate": 1.4517084583820145e-07, "loss": 0.0038, "step": 9793 }, { "epoch": 4.455868971792539, "grad_norm": 0.30413164664266795, "learning_rate": 1.449309599488513e-07, "loss": 0.0022, "step": 9794 }, { "epoch": 4.4563239308462235, "grad_norm": 0.09489047948898581, "learning_rate": 1.4469126650510755e-07, "loss": 0.0003, "step": 9795 }, { "epoch": 4.456778889899909, "grad_norm": 0.7346376667448924, "learning_rate": 1.4445176552655705e-07, "loss": 0.0067, "step": 9796 }, { "epoch": 4.457233848953594, "grad_norm": 0.461015891249621, "learning_rate": 1.4421245703277047e-07, "loss": 0.0041, "step": 9797 }, { "epoch": 4.457688808007279, "grad_norm": 1.4742366135892424, "learning_rate": 1.4397334104330335e-07, "loss": 0.0059, "step": 9798 }, { "epoch": 4.458143767060965, "grad_norm": 0.21684728493365357, "learning_rate": 1.437344175776953e-07, "loss": 0.0011, "step": 9799 }, { "epoch": 4.45859872611465, "grad_norm": 0.5957964736307716, "learning_rate": 1.434956866554693e-07, "loss": 0.0031, "step": 9800 }, { "epoch": 4.4590536851683344, "grad_norm": 0.9034624743013534, "learning_rate": 1.4325714829613453e-07, "loss": 0.0032, "step": 9801 }, { "epoch": 4.45950864422202, "grad_norm": 0.23483665664557965, "learning_rate": 1.4301880251918227e-07, "loss": 0.0013, "step": 9802 }, { "epoch": 4.459963603275705, "grad_norm": 0.8105675483172239, "learning_rate": 1.4278064934408946e-07, "loss": 0.0119, "step": 9803 }, { "epoch": 4.460418562329391, "grad_norm": 1.7455422809750731, "learning_rate": 1.4254268879031725e-07, "loss": 0.0253, "step": 9804 }, { "epoch": 4.460873521383076, "grad_norm": 1.0313200787813424, "learning_rate": 1.423049208773103e-07, "loss": 0.0112, "step": 9805 }, { "epoch": 4.461328480436761, "grad_norm": 0.800992780799919, "learning_rate": 1.420673456244978e-07, "loss": 0.0035, "step": 9806 }, { "epoch": 4.461783439490446, "grad_norm": 0.8285347086363363, "learning_rate": 1.418299630512926e-07, "loss": 0.0027, "step": 9807 }, { "epoch": 4.462238398544131, "grad_norm": 0.17833363124500773, "learning_rate": 1.415927731770944e-07, "loss": 0.0008, "step": 9808 }, { "epoch": 4.462693357597816, "grad_norm": 0.11034331385976327, "learning_rate": 1.4135577602128413e-07, "loss": 0.0003, "step": 9809 }, { "epoch": 4.463148316651502, "grad_norm": 0.15655105350431997, "learning_rate": 1.4111897160322852e-07, "loss": 0.0008, "step": 9810 }, { "epoch": 4.463603275705187, "grad_norm": 0.5655302682443396, "learning_rate": 1.408823599422779e-07, "loss": 0.0035, "step": 9811 }, { "epoch": 4.4640582347588715, "grad_norm": 0.537451287485392, "learning_rate": 1.406459410577668e-07, "loss": 0.0011, "step": 9812 }, { "epoch": 4.464513193812557, "grad_norm": 0.6275326183987182, "learning_rate": 1.404097149690148e-07, "loss": 0.0092, "step": 9813 }, { "epoch": 4.464968152866242, "grad_norm": 0.2916512359285108, "learning_rate": 1.4017368169532474e-07, "loss": 0.0019, "step": 9814 }, { "epoch": 4.465423111919927, "grad_norm": 0.12089825575221036, "learning_rate": 1.3993784125598513e-07, "loss": 0.0005, "step": 9815 }, { "epoch": 4.465878070973613, "grad_norm": 0.28306036981548205, "learning_rate": 1.3970219367026694e-07, "loss": 0.0022, "step": 9816 }, { "epoch": 4.466333030027298, "grad_norm": 0.7600478838513217, "learning_rate": 1.394667389574264e-07, "loss": 0.0048, "step": 9817 }, { "epoch": 4.4667879890809825, "grad_norm": 0.9052926678390206, "learning_rate": 1.39231477136704e-07, "loss": 0.0067, "step": 9818 }, { "epoch": 4.467242948134668, "grad_norm": 0.7174492554810711, "learning_rate": 1.38996408227324e-07, "loss": 0.004, "step": 9819 }, { "epoch": 4.467697907188353, "grad_norm": 1.2512238498129111, "learning_rate": 1.387615322484953e-07, "loss": 0.0054, "step": 9820 }, { "epoch": 4.468152866242038, "grad_norm": 0.6704890948125176, "learning_rate": 1.3852684921941112e-07, "loss": 0.0057, "step": 9821 }, { "epoch": 4.468607825295724, "grad_norm": 0.9088865760204339, "learning_rate": 1.3829235915924832e-07, "loss": 0.005, "step": 9822 }, { "epoch": 4.469062784349409, "grad_norm": 0.47575626177480373, "learning_rate": 1.3805806208716855e-07, "loss": 0.0069, "step": 9823 }, { "epoch": 4.4695177434030935, "grad_norm": 0.3075174019336655, "learning_rate": 1.3782395802231785e-07, "loss": 0.0023, "step": 9824 }, { "epoch": 4.469972702456779, "grad_norm": 0.44703552755616943, "learning_rate": 1.3759004698382566e-07, "loss": 0.0034, "step": 9825 }, { "epoch": 4.470427661510464, "grad_norm": 0.8775336321034306, "learning_rate": 1.3735632899080586e-07, "loss": 0.0136, "step": 9826 }, { "epoch": 4.470882620564149, "grad_norm": 0.37708926329412346, "learning_rate": 1.3712280406235733e-07, "loss": 0.0015, "step": 9827 }, { "epoch": 4.471337579617835, "grad_norm": 0.24669100906412217, "learning_rate": 1.3688947221756316e-07, "loss": 0.0015, "step": 9828 }, { "epoch": 4.47179253867152, "grad_norm": 1.157838269126152, "learning_rate": 1.3665633347548946e-07, "loss": 0.0055, "step": 9829 }, { "epoch": 4.4722474977252045, "grad_norm": 0.5658063917926225, "learning_rate": 1.364233878551874e-07, "loss": 0.0057, "step": 9830 }, { "epoch": 4.47270245677889, "grad_norm": 0.5284180345701617, "learning_rate": 1.3619063537569173e-07, "loss": 0.0039, "step": 9831 }, { "epoch": 4.473157415832575, "grad_norm": 0.4811636563918908, "learning_rate": 1.3595807605602307e-07, "loss": 0.0042, "step": 9832 }, { "epoch": 4.47361237488626, "grad_norm": 0.2221626991500221, "learning_rate": 1.357257099151843e-07, "loss": 0.0018, "step": 9833 }, { "epoch": 4.474067333939946, "grad_norm": 0.8856034232042718, "learning_rate": 1.3549353697216326e-07, "loss": 0.0413, "step": 9834 }, { "epoch": 4.474522292993631, "grad_norm": 0.5231299378131835, "learning_rate": 1.3526155724593288e-07, "loss": 0.008, "step": 9835 }, { "epoch": 4.4749772520473154, "grad_norm": 0.34867778420494877, "learning_rate": 1.350297707554485e-07, "loss": 0.0042, "step": 9836 }, { "epoch": 4.475432211101001, "grad_norm": 0.0959783032925497, "learning_rate": 1.3479817751965164e-07, "loss": 0.0002, "step": 9837 }, { "epoch": 4.475887170154686, "grad_norm": 0.7140017884167985, "learning_rate": 1.3456677755746634e-07, "loss": 0.0054, "step": 9838 }, { "epoch": 4.476342129208371, "grad_norm": 0.6409500887821322, "learning_rate": 1.343355708878019e-07, "loss": 0.0052, "step": 9839 }, { "epoch": 4.476797088262057, "grad_norm": 0.5743999394933489, "learning_rate": 1.3410455752955132e-07, "loss": 0.0155, "step": 9840 }, { "epoch": 4.477252047315742, "grad_norm": 0.7922451792786711, "learning_rate": 1.338737375015911e-07, "loss": 0.0093, "step": 9841 }, { "epoch": 4.477707006369426, "grad_norm": 0.30209006085992524, "learning_rate": 1.336431108227848e-07, "loss": 0.0014, "step": 9842 }, { "epoch": 4.478161965423112, "grad_norm": 0.7123345265804434, "learning_rate": 1.3341267751197678e-07, "loss": 0.0133, "step": 9843 }, { "epoch": 4.478616924476797, "grad_norm": 0.4547109713719715, "learning_rate": 1.3318243758799754e-07, "loss": 0.0222, "step": 9844 }, { "epoch": 4.479071883530482, "grad_norm": 0.7162503817653217, "learning_rate": 1.3295239106966118e-07, "loss": 0.0027, "step": 9845 }, { "epoch": 4.479526842584168, "grad_norm": 0.6860554135162332, "learning_rate": 1.3272253797576518e-07, "loss": 0.0042, "step": 9846 }, { "epoch": 4.4799818016378525, "grad_norm": 0.1927045710166255, "learning_rate": 1.3249287832509366e-07, "loss": 0.0007, "step": 9847 }, { "epoch": 4.480436760691537, "grad_norm": 0.6638395354304734, "learning_rate": 1.3226341213641191e-07, "loss": 0.0086, "step": 9848 }, { "epoch": 4.480891719745223, "grad_norm": 0.9581013081437911, "learning_rate": 1.3203413942847189e-07, "loss": 0.0052, "step": 9849 }, { "epoch": 4.481346678798908, "grad_norm": 0.747852510727218, "learning_rate": 1.3180506022000827e-07, "loss": 0.0055, "step": 9850 }, { "epoch": 4.481801637852593, "grad_norm": 0.34207572431038774, "learning_rate": 1.3157617452974032e-07, "loss": 0.0024, "step": 9851 }, { "epoch": 4.482256596906279, "grad_norm": 0.8030589394553277, "learning_rate": 1.313474823763719e-07, "loss": 0.0097, "step": 9852 }, { "epoch": 4.4827115559599635, "grad_norm": 0.19120478530290613, "learning_rate": 1.311189837785906e-07, "loss": 0.0012, "step": 9853 }, { "epoch": 4.483166515013648, "grad_norm": 0.08230350176446104, "learning_rate": 1.3089067875506788e-07, "loss": 0.0004, "step": 9854 }, { "epoch": 4.483621474067334, "grad_norm": 0.15146805610808528, "learning_rate": 1.306625673244602e-07, "loss": 0.0006, "step": 9855 }, { "epoch": 4.484076433121019, "grad_norm": 0.6274947170510479, "learning_rate": 1.3043464950540734e-07, "loss": 0.0113, "step": 9856 }, { "epoch": 4.484531392174704, "grad_norm": 0.718448497289765, "learning_rate": 1.3020692531653445e-07, "loss": 0.007, "step": 9857 }, { "epoch": 4.48498635122839, "grad_norm": 0.5952399524591315, "learning_rate": 1.2997939477644967e-07, "loss": 0.0073, "step": 9858 }, { "epoch": 4.4854413102820745, "grad_norm": 0.32930833808616783, "learning_rate": 1.2975205790374617e-07, "loss": 0.0005, "step": 9859 }, { "epoch": 4.485896269335759, "grad_norm": 0.44258053271880915, "learning_rate": 1.2952491471699997e-07, "loss": 0.0054, "step": 9860 }, { "epoch": 4.486351228389445, "grad_norm": 0.4461567582809166, "learning_rate": 1.292979652347731e-07, "loss": 0.0052, "step": 9861 }, { "epoch": 4.48680618744313, "grad_norm": 0.7843567791273511, "learning_rate": 1.2907120947561024e-07, "loss": 0.0086, "step": 9862 }, { "epoch": 4.487261146496815, "grad_norm": 0.6868834880290128, "learning_rate": 1.2884464745804125e-07, "loss": 0.0155, "step": 9863 }, { "epoch": 4.487716105550501, "grad_norm": 0.2402261317208703, "learning_rate": 1.2861827920057994e-07, "loss": 0.0011, "step": 9864 }, { "epoch": 4.4881710646041855, "grad_norm": 0.8961280214339132, "learning_rate": 1.283921047217232e-07, "loss": 0.0022, "step": 9865 }, { "epoch": 4.48862602365787, "grad_norm": 0.799963807200324, "learning_rate": 1.28166124039954e-07, "loss": 0.003, "step": 9866 }, { "epoch": 4.489080982711556, "grad_norm": 0.25803322544704677, "learning_rate": 1.279403371737381e-07, "loss": 0.0027, "step": 9867 }, { "epoch": 4.489535941765241, "grad_norm": 0.3342246727925926, "learning_rate": 1.2771474414152552e-07, "loss": 0.0009, "step": 9868 }, { "epoch": 4.489990900818927, "grad_norm": 0.04350928928204437, "learning_rate": 1.2748934496175092e-07, "loss": 0.0001, "step": 9869 }, { "epoch": 4.490445859872612, "grad_norm": 0.46453684274472057, "learning_rate": 1.2726413965283264e-07, "loss": 0.0069, "step": 9870 }, { "epoch": 4.4909008189262964, "grad_norm": 0.5029296326539349, "learning_rate": 1.27039128233174e-07, "loss": 0.0059, "step": 9871 }, { "epoch": 4.491355777979982, "grad_norm": 1.8277159886483765, "learning_rate": 1.2681431072116168e-07, "loss": 0.0177, "step": 9872 }, { "epoch": 4.491810737033667, "grad_norm": 1.1411930568672808, "learning_rate": 1.2658968713516655e-07, "loss": 0.004, "step": 9873 }, { "epoch": 4.492265696087352, "grad_norm": 0.5153681262857569, "learning_rate": 1.2636525749354396e-07, "loss": 0.0023, "step": 9874 }, { "epoch": 4.492720655141038, "grad_norm": 1.1126118998157013, "learning_rate": 1.2614102181463334e-07, "loss": 0.019, "step": 9875 }, { "epoch": 4.4931756141947226, "grad_norm": 0.1632849689306653, "learning_rate": 1.2591698011675784e-07, "loss": 0.0005, "step": 9876 }, { "epoch": 4.493630573248407, "grad_norm": 0.2787688012577561, "learning_rate": 1.256931324182259e-07, "loss": 0.0011, "step": 9877 }, { "epoch": 4.494085532302093, "grad_norm": 0.6807201605688501, "learning_rate": 1.2546947873732896e-07, "loss": 0.0043, "step": 9878 }, { "epoch": 4.494540491355778, "grad_norm": 0.5101951595090856, "learning_rate": 1.2524601909234268e-07, "loss": 0.0034, "step": 9879 }, { "epoch": 4.494995450409463, "grad_norm": 0.32466310347644156, "learning_rate": 1.250227535015272e-07, "loss": 0.0037, "step": 9880 }, { "epoch": 4.495450409463149, "grad_norm": 0.6838681166821015, "learning_rate": 1.2479968198312736e-07, "loss": 0.0062, "step": 9881 }, { "epoch": 4.4959053685168335, "grad_norm": 0.1891430551297332, "learning_rate": 1.2457680455537136e-07, "loss": 0.0007, "step": 9882 }, { "epoch": 4.496360327570518, "grad_norm": 0.3585832879719572, "learning_rate": 1.24354121236471e-07, "loss": 0.0018, "step": 9883 }, { "epoch": 4.496815286624204, "grad_norm": 0.0766952339132367, "learning_rate": 1.2413163204462398e-07, "loss": 0.0003, "step": 9884 }, { "epoch": 4.497270245677889, "grad_norm": 0.7511979049021951, "learning_rate": 1.2390933699801017e-07, "loss": 0.0115, "step": 9885 }, { "epoch": 4.497725204731574, "grad_norm": 1.8077704403995334, "learning_rate": 1.236872361147956e-07, "loss": 0.0129, "step": 9886 }, { "epoch": 4.49818016378526, "grad_norm": 1.3541180027051432, "learning_rate": 1.2346532941312854e-07, "loss": 0.0159, "step": 9887 }, { "epoch": 4.4986351228389445, "grad_norm": 0.6143319781956107, "learning_rate": 1.2324361691114257e-07, "loss": 0.0023, "step": 9888 }, { "epoch": 4.499090081892629, "grad_norm": 0.340011230753036, "learning_rate": 1.2302209862695453e-07, "loss": 0.0013, "step": 9889 }, { "epoch": 4.499545040946315, "grad_norm": 0.2059175763216069, "learning_rate": 1.2280077457866635e-07, "loss": 0.0006, "step": 9890 }, { "epoch": 4.5, "grad_norm": 0.411994601453013, "learning_rate": 1.2257964478436358e-07, "loss": 0.0025, "step": 9891 }, { "epoch": 4.500454959053685, "grad_norm": 0.5771620282369587, "learning_rate": 1.223587092621162e-07, "loss": 0.0035, "step": 9892 }, { "epoch": 4.500909918107371, "grad_norm": 0.46077427768813395, "learning_rate": 1.2213796802997752e-07, "loss": 0.0054, "step": 9893 }, { "epoch": 4.5013648771610555, "grad_norm": 0.430965678973847, "learning_rate": 1.2191742110598564e-07, "loss": 0.0084, "step": 9894 }, { "epoch": 4.50181983621474, "grad_norm": 0.8274405100766725, "learning_rate": 1.2169706850816309e-07, "loss": 0.0047, "step": 9895 }, { "epoch": 4.502274795268426, "grad_norm": 0.31537494040467584, "learning_rate": 1.2147691025451573e-07, "loss": 0.0011, "step": 9896 }, { "epoch": 4.502729754322111, "grad_norm": 0.5327286535001189, "learning_rate": 1.2125694636303337e-07, "loss": 0.0118, "step": 9897 }, { "epoch": 4.503184713375796, "grad_norm": 0.6161672150216891, "learning_rate": 1.2103717685169187e-07, "loss": 0.002, "step": 9898 }, { "epoch": 4.503639672429482, "grad_norm": 0.5636007289784855, "learning_rate": 1.2081760173844825e-07, "loss": 0.0035, "step": 9899 }, { "epoch": 4.5040946314831665, "grad_norm": 0.16903214690878202, "learning_rate": 1.2059822104124626e-07, "loss": 0.0009, "step": 9900 }, { "epoch": 4.504549590536851, "grad_norm": 0.4726503981160163, "learning_rate": 1.203790347780126e-07, "loss": 0.0074, "step": 9901 }, { "epoch": 4.505004549590537, "grad_norm": 0.515436254429222, "learning_rate": 1.2016004296665772e-07, "loss": 0.003, "step": 9902 }, { "epoch": 4.505459508644222, "grad_norm": 2.2341465040935033, "learning_rate": 1.1994124562507674e-07, "loss": 0.0108, "step": 9903 }, { "epoch": 4.505914467697907, "grad_norm": 0.6133820826941745, "learning_rate": 1.1972264277114898e-07, "loss": 0.0066, "step": 9904 }, { "epoch": 4.506369426751593, "grad_norm": 1.243481097536698, "learning_rate": 1.195042344227376e-07, "loss": 0.0016, "step": 9905 }, { "epoch": 4.5068243858052774, "grad_norm": 0.5025672230593139, "learning_rate": 1.1928602059769008e-07, "loss": 0.0046, "step": 9906 }, { "epoch": 4.507279344858962, "grad_norm": 0.5555982337605584, "learning_rate": 1.1906800131383789e-07, "loss": 0.0227, "step": 9907 }, { "epoch": 4.507734303912648, "grad_norm": 0.855855824783825, "learning_rate": 1.18850176588996e-07, "loss": 0.002, "step": 9908 }, { "epoch": 4.508189262966333, "grad_norm": 0.9135384769987488, "learning_rate": 1.1863254644096488e-07, "loss": 0.0125, "step": 9909 }, { "epoch": 4.508644222020019, "grad_norm": 0.41005986771583236, "learning_rate": 1.1841511088752783e-07, "loss": 0.0046, "step": 9910 }, { "epoch": 4.5090991810737036, "grad_norm": 1.624908597652368, "learning_rate": 1.1819786994645255e-07, "loss": 0.0053, "step": 9911 }, { "epoch": 4.509554140127388, "grad_norm": 0.3249416348762103, "learning_rate": 1.1798082363549152e-07, "loss": 0.0019, "step": 9912 }, { "epoch": 4.510009099181074, "grad_norm": 0.2510440132156049, "learning_rate": 1.1776397197238027e-07, "loss": 0.0008, "step": 9913 }, { "epoch": 4.510464058234759, "grad_norm": 0.8142071867738923, "learning_rate": 1.1754731497483934e-07, "loss": 0.0052, "step": 9914 }, { "epoch": 4.510919017288444, "grad_norm": 0.6892488155549055, "learning_rate": 1.1733085266057265e-07, "loss": 0.0036, "step": 9915 }, { "epoch": 4.51137397634213, "grad_norm": 1.035794905830122, "learning_rate": 1.171145850472688e-07, "loss": 0.0184, "step": 9916 }, { "epoch": 4.5118289353958145, "grad_norm": 0.6381827189528861, "learning_rate": 1.1689851215260006e-07, "loss": 0.0083, "step": 9917 }, { "epoch": 4.512283894449499, "grad_norm": 0.7912005615752203, "learning_rate": 1.1668263399422258e-07, "loss": 0.0097, "step": 9918 }, { "epoch": 4.512738853503185, "grad_norm": 0.4823904902321086, "learning_rate": 1.1646695058977697e-07, "loss": 0.0043, "step": 9919 }, { "epoch": 4.51319381255687, "grad_norm": 0.17765773912844962, "learning_rate": 1.1625146195688885e-07, "loss": 0.001, "step": 9920 }, { "epoch": 4.513648771610555, "grad_norm": 0.416259111975624, "learning_rate": 1.1603616811316638e-07, "loss": 0.0021, "step": 9921 }, { "epoch": 4.514103730664241, "grad_norm": 0.6706651363933, "learning_rate": 1.1582106907620238e-07, "loss": 0.0041, "step": 9922 }, { "epoch": 4.5145586897179255, "grad_norm": 0.3711640790223938, "learning_rate": 1.156061648635734e-07, "loss": 0.0019, "step": 9923 }, { "epoch": 4.51501364877161, "grad_norm": 1.046851460467525, "learning_rate": 1.1539145549284092e-07, "loss": 0.0027, "step": 9924 }, { "epoch": 4.515468607825296, "grad_norm": 0.8322510699469805, "learning_rate": 1.151769409815498e-07, "loss": 0.0095, "step": 9925 }, { "epoch": 4.515923566878981, "grad_norm": 0.7483688460972775, "learning_rate": 1.1496262134722935e-07, "loss": 0.0046, "step": 9926 }, { "epoch": 4.516378525932666, "grad_norm": 0.7355157829892907, "learning_rate": 1.1474849660739306e-07, "loss": 0.018, "step": 9927 }, { "epoch": 4.516833484986352, "grad_norm": 0.36210314955991446, "learning_rate": 1.145345667795375e-07, "loss": 0.0033, "step": 9928 }, { "epoch": 4.5172884440400365, "grad_norm": 0.42964098411028107, "learning_rate": 1.143208318811448e-07, "loss": 0.0141, "step": 9929 }, { "epoch": 4.517743403093721, "grad_norm": 0.5929051330743892, "learning_rate": 1.1410729192968012e-07, "loss": 0.0071, "step": 9930 }, { "epoch": 4.518198362147407, "grad_norm": 0.4871737264062261, "learning_rate": 1.1389394694259287e-07, "loss": 0.0021, "step": 9931 }, { "epoch": 4.518653321201092, "grad_norm": 0.7937285059254866, "learning_rate": 1.1368079693731632e-07, "loss": 0.0024, "step": 9932 }, { "epoch": 4.519108280254777, "grad_norm": 0.26712450885773925, "learning_rate": 1.1346784193126875e-07, "loss": 0.0018, "step": 9933 }, { "epoch": 4.519563239308463, "grad_norm": 0.5825679110938948, "learning_rate": 1.1325508194185181e-07, "loss": 0.0021, "step": 9934 }, { "epoch": 4.5200181983621475, "grad_norm": 0.486795872057124, "learning_rate": 1.1304251698645102e-07, "loss": 0.0109, "step": 9935 }, { "epoch": 4.520473157415832, "grad_norm": 0.47572342121973826, "learning_rate": 1.1283014708243666e-07, "loss": 0.0044, "step": 9936 }, { "epoch": 4.520928116469518, "grad_norm": 1.4139546115473745, "learning_rate": 1.126179722471618e-07, "loss": 0.0072, "step": 9937 }, { "epoch": 4.521383075523203, "grad_norm": 0.30733159943413274, "learning_rate": 1.1240599249796535e-07, "loss": 0.0024, "step": 9938 }, { "epoch": 4.521838034576888, "grad_norm": 0.33486519611998705, "learning_rate": 1.1219420785216844e-07, "loss": 0.0027, "step": 9939 }, { "epoch": 4.522292993630574, "grad_norm": 1.247226606189212, "learning_rate": 1.1198261832707808e-07, "loss": 0.0089, "step": 9940 }, { "epoch": 4.522747952684258, "grad_norm": 0.28513161548936, "learning_rate": 1.1177122393998374e-07, "loss": 0.001, "step": 9941 }, { "epoch": 4.523202911737943, "grad_norm": 0.1077003967082178, "learning_rate": 1.1156002470815968e-07, "loss": 0.0005, "step": 9942 }, { "epoch": 4.523657870791629, "grad_norm": 0.9472135392800752, "learning_rate": 1.113490206488646e-07, "loss": 0.0064, "step": 9943 }, { "epoch": 4.524112829845314, "grad_norm": 0.3474133549598887, "learning_rate": 1.1113821177934053e-07, "loss": 0.0017, "step": 9944 }, { "epoch": 4.524567788898999, "grad_norm": 1.4938781135498371, "learning_rate": 1.109275981168137e-07, "loss": 0.0232, "step": 9945 }, { "epoch": 4.5250227479526846, "grad_norm": 1.655157242544746, "learning_rate": 1.1071717967849449e-07, "loss": 0.0086, "step": 9946 }, { "epoch": 4.525477707006369, "grad_norm": 0.7094904490714827, "learning_rate": 1.1050695648157722e-07, "loss": 0.0065, "step": 9947 }, { "epoch": 4.525932666060054, "grad_norm": 0.11284707763666389, "learning_rate": 1.1029692854324092e-07, "loss": 0.0005, "step": 9948 }, { "epoch": 4.52638762511374, "grad_norm": 0.7037041075453447, "learning_rate": 1.1008709588064798e-07, "loss": 0.0056, "step": 9949 }, { "epoch": 4.526842584167425, "grad_norm": 0.8351289806931412, "learning_rate": 1.0987745851094494e-07, "loss": 0.0032, "step": 9950 }, { "epoch": 4.52729754322111, "grad_norm": 0.39709905740938667, "learning_rate": 1.0966801645126202e-07, "loss": 0.0061, "step": 9951 }, { "epoch": 4.5277525022747955, "grad_norm": 0.2669431743031923, "learning_rate": 1.0945876971871383e-07, "loss": 0.0013, "step": 9952 }, { "epoch": 4.52820746132848, "grad_norm": 0.4078491035543665, "learning_rate": 1.0924971833039949e-07, "loss": 0.0016, "step": 9953 }, { "epoch": 4.528662420382165, "grad_norm": 0.9802397541646081, "learning_rate": 1.0904086230340199e-07, "loss": 0.0077, "step": 9954 }, { "epoch": 4.529117379435851, "grad_norm": 0.15114725126383774, "learning_rate": 1.088322016547877e-07, "loss": 0.0005, "step": 9955 }, { "epoch": 4.529572338489536, "grad_norm": 0.48014424485415946, "learning_rate": 1.086237364016074e-07, "loss": 0.0041, "step": 9956 }, { "epoch": 4.530027297543221, "grad_norm": 0.7565141096461859, "learning_rate": 1.084154665608958e-07, "loss": 0.0087, "step": 9957 }, { "epoch": 4.5304822565969065, "grad_norm": 0.6920383005924577, "learning_rate": 1.0820739214967208e-07, "loss": 0.0068, "step": 9958 }, { "epoch": 4.530937215650591, "grad_norm": 0.19535216897711577, "learning_rate": 1.0799951318493929e-07, "loss": 0.0015, "step": 9959 }, { "epoch": 4.531392174704276, "grad_norm": 1.3553217600887593, "learning_rate": 1.0779182968368357e-07, "loss": 0.0128, "step": 9960 }, { "epoch": 4.531847133757962, "grad_norm": 0.3533396996964176, "learning_rate": 1.075843416628769e-07, "loss": 0.0022, "step": 9961 }, { "epoch": 4.532302092811647, "grad_norm": 0.16734841772513753, "learning_rate": 1.073770491394735e-07, "loss": 0.0006, "step": 9962 }, { "epoch": 4.532757051865332, "grad_norm": 0.7026615187973705, "learning_rate": 1.0716995213041287e-07, "loss": 0.0156, "step": 9963 }, { "epoch": 4.5332120109190175, "grad_norm": 0.9463401696631527, "learning_rate": 1.0696305065261787e-07, "loss": 0.0039, "step": 9964 }, { "epoch": 4.533666969972702, "grad_norm": 0.44035231499355887, "learning_rate": 1.067563447229955e-07, "loss": 0.0027, "step": 9965 }, { "epoch": 4.534121929026387, "grad_norm": 0.7465553796142235, "learning_rate": 1.0654983435843646e-07, "loss": 0.0196, "step": 9966 }, { "epoch": 4.534576888080073, "grad_norm": 0.9878029492192568, "learning_rate": 1.0634351957581613e-07, "loss": 0.0122, "step": 9967 }, { "epoch": 4.535031847133758, "grad_norm": 0.6974795420598723, "learning_rate": 1.0613740039199433e-07, "loss": 0.0018, "step": 9968 }, { "epoch": 4.535486806187443, "grad_norm": 0.6115213936443965, "learning_rate": 1.0593147682381344e-07, "loss": 0.0031, "step": 9969 }, { "epoch": 4.5359417652411285, "grad_norm": 0.5927750902560632, "learning_rate": 1.0572574888810055e-07, "loss": 0.0108, "step": 9970 }, { "epoch": 4.536396724294813, "grad_norm": 0.5724008332602967, "learning_rate": 1.0552021660166694e-07, "loss": 0.0062, "step": 9971 }, { "epoch": 4.536851683348498, "grad_norm": 0.355809911322456, "learning_rate": 1.0531487998130808e-07, "loss": 0.0018, "step": 9972 }, { "epoch": 4.537306642402184, "grad_norm": 0.9302065108700407, "learning_rate": 1.0510973904380301e-07, "loss": 0.0087, "step": 9973 }, { "epoch": 4.537761601455869, "grad_norm": 0.9760395917508141, "learning_rate": 1.0490479380591445e-07, "loss": 0.0169, "step": 9974 }, { "epoch": 4.538216560509554, "grad_norm": 2.4398848794036367, "learning_rate": 1.047000442843904e-07, "loss": 0.0118, "step": 9975 }, { "epoch": 4.538671519563239, "grad_norm": 1.2138675519491022, "learning_rate": 1.0449549049596136e-07, "loss": 0.0177, "step": 9976 }, { "epoch": 4.539126478616924, "grad_norm": 0.05949294746256485, "learning_rate": 1.0429113245734285e-07, "loss": 0.0002, "step": 9977 }, { "epoch": 4.539581437670609, "grad_norm": 0.42191680362905937, "learning_rate": 1.0408697018523428e-07, "loss": 0.0018, "step": 9978 }, { "epoch": 4.540036396724295, "grad_norm": 0.3830173349194551, "learning_rate": 1.0388300369631871e-07, "loss": 0.0028, "step": 9979 }, { "epoch": 4.54049135577798, "grad_norm": 0.56716382969998, "learning_rate": 1.0367923300726307e-07, "loss": 0.0093, "step": 9980 }, { "epoch": 4.540946314831665, "grad_norm": 0.45324872924759846, "learning_rate": 1.034756581347185e-07, "loss": 0.0058, "step": 9981 }, { "epoch": 4.54140127388535, "grad_norm": 0.2046138566629578, "learning_rate": 1.0327227909532111e-07, "loss": 0.0003, "step": 9982 }, { "epoch": 4.541856232939035, "grad_norm": 0.7516184421375919, "learning_rate": 1.0306909590568959e-07, "loss": 0.008, "step": 9983 }, { "epoch": 4.542311191992721, "grad_norm": 0.5385334496591783, "learning_rate": 1.0286610858242701e-07, "loss": 0.0083, "step": 9984 }, { "epoch": 4.542766151046406, "grad_norm": 0.32545702757388284, "learning_rate": 1.0266331714212069e-07, "loss": 0.0016, "step": 9985 }, { "epoch": 4.543221110100091, "grad_norm": 0.4000718883025517, "learning_rate": 1.0246072160134123e-07, "loss": 0.0033, "step": 9986 }, { "epoch": 4.5436760691537765, "grad_norm": 0.43341016442845287, "learning_rate": 1.0225832197664515e-07, "loss": 0.0011, "step": 9987 }, { "epoch": 4.544131028207461, "grad_norm": 0.2591037864722553, "learning_rate": 1.0205611828457029e-07, "loss": 0.0019, "step": 9988 }, { "epoch": 4.544585987261146, "grad_norm": 0.7872535457280343, "learning_rate": 1.0185411054164096e-07, "loss": 0.012, "step": 9989 }, { "epoch": 4.545040946314832, "grad_norm": 0.7319799712335445, "learning_rate": 1.0165229876436367e-07, "loss": 0.0152, "step": 9990 }, { "epoch": 4.545495905368517, "grad_norm": 0.19719650560131602, "learning_rate": 1.0145068296922911e-07, "loss": 0.0011, "step": 9991 }, { "epoch": 4.545950864422202, "grad_norm": 0.3315916189493323, "learning_rate": 1.0124926317271355e-07, "loss": 0.002, "step": 9992 }, { "epoch": 4.5464058234758875, "grad_norm": 0.5264330903955505, "learning_rate": 1.0104803939127578e-07, "loss": 0.0067, "step": 9993 }, { "epoch": 4.546860782529572, "grad_norm": 0.5935233985999003, "learning_rate": 1.0084701164135818e-07, "loss": 0.0183, "step": 9994 }, { "epoch": 4.547315741583257, "grad_norm": 0.6630183057395955, "learning_rate": 1.0064617993938847e-07, "loss": 0.0085, "step": 9995 }, { "epoch": 4.547770700636943, "grad_norm": 0.35340107226219314, "learning_rate": 1.0044554430177711e-07, "loss": 0.0015, "step": 9996 }, { "epoch": 4.548225659690628, "grad_norm": 0.901103259765951, "learning_rate": 1.0024510474492016e-07, "loss": 0.0115, "step": 9997 }, { "epoch": 4.548680618744313, "grad_norm": 0.728334558929005, "learning_rate": 1.0004486128519592e-07, "loss": 0.0053, "step": 9998 }, { "epoch": 4.5491355777979985, "grad_norm": 0.4083642078127323, "learning_rate": 9.984481393896767e-08, "loss": 0.0022, "step": 9999 }, { "epoch": 4.549590536851683, "grad_norm": 0.2427476391128149, "learning_rate": 9.964496272258178e-08, "loss": 0.001, "step": 10000 }, { "epoch": 4.550045495905368, "grad_norm": 0.582725435981475, "learning_rate": 9.944530765236993e-08, "loss": 0.0097, "step": 10001 }, { "epoch": 4.550500454959054, "grad_norm": 1.5902069522467264, "learning_rate": 9.924584874464655e-08, "loss": 0.002, "step": 10002 }, { "epoch": 4.550955414012739, "grad_norm": 0.6513133931824141, "learning_rate": 9.904658601571083e-08, "loss": 0.008, "step": 10003 }, { "epoch": 4.551410373066424, "grad_norm": 1.0535030074990723, "learning_rate": 9.884751948184584e-08, "loss": 0.0223, "step": 10004 }, { "epoch": 4.5518653321201095, "grad_norm": 0.928526439962241, "learning_rate": 9.864864915931748e-08, "loss": 0.0166, "step": 10005 }, { "epoch": 4.552320291173794, "grad_norm": 0.11901174303569895, "learning_rate": 9.84499750643772e-08, "loss": 0.0003, "step": 10006 }, { "epoch": 4.552775250227479, "grad_norm": 0.5522469863994841, "learning_rate": 9.825149721326005e-08, "loss": 0.0009, "step": 10007 }, { "epoch": 4.553230209281165, "grad_norm": 0.23060965589130622, "learning_rate": 9.805321562218417e-08, "loss": 0.0014, "step": 10008 }, { "epoch": 4.55368516833485, "grad_norm": 0.6816660733465801, "learning_rate": 9.785513030735216e-08, "loss": 0.0045, "step": 10009 }, { "epoch": 4.554140127388535, "grad_norm": 0.24311196362936519, "learning_rate": 9.76572412849508e-08, "loss": 0.0011, "step": 10010 }, { "epoch": 4.55459508644222, "grad_norm": 0.43371582547510973, "learning_rate": 9.745954857115104e-08, "loss": 0.0035, "step": 10011 }, { "epoch": 4.555050045495905, "grad_norm": 0.6005694823785229, "learning_rate": 9.726205218210743e-08, "loss": 0.0028, "step": 10012 }, { "epoch": 4.55550500454959, "grad_norm": 0.27852786815236846, "learning_rate": 9.706475213395822e-08, "loss": 0.0013, "step": 10013 }, { "epoch": 4.555959963603276, "grad_norm": 0.6939029116412462, "learning_rate": 9.686764844282547e-08, "loss": 0.0035, "step": 10014 }, { "epoch": 4.556414922656961, "grad_norm": 0.46816611213393305, "learning_rate": 9.667074112481633e-08, "loss": 0.0032, "step": 10015 }, { "epoch": 4.556869881710646, "grad_norm": 0.4963619894046811, "learning_rate": 9.647403019602069e-08, "loss": 0.0021, "step": 10016 }, { "epoch": 4.557324840764331, "grad_norm": 0.30761791143490264, "learning_rate": 9.62775156725132e-08, "loss": 0.0015, "step": 10017 }, { "epoch": 4.557779799818016, "grad_norm": 0.2623277125735554, "learning_rate": 9.608119757035211e-08, "loss": 0.0015, "step": 10018 }, { "epoch": 4.558234758871702, "grad_norm": 0.7364253674548289, "learning_rate": 9.588507590557933e-08, "loss": 0.0161, "step": 10019 }, { "epoch": 4.558689717925387, "grad_norm": 0.3596667021265034, "learning_rate": 9.568915069422147e-08, "loss": 0.0011, "step": 10020 }, { "epoch": 4.559144676979072, "grad_norm": 0.7196558560255785, "learning_rate": 9.54934219522885e-08, "loss": 0.0223, "step": 10021 }, { "epoch": 4.5595996360327575, "grad_norm": 0.24432320699050317, "learning_rate": 9.529788969577459e-08, "loss": 0.0012, "step": 10022 }, { "epoch": 4.560054595086442, "grad_norm": 0.06986450568778929, "learning_rate": 9.510255394065692e-08, "loss": 0.0003, "step": 10023 }, { "epoch": 4.560509554140127, "grad_norm": 0.34989197382123693, "learning_rate": 9.49074147028986e-08, "loss": 0.0015, "step": 10024 }, { "epoch": 4.560964513193813, "grad_norm": 0.4402200956306291, "learning_rate": 9.471247199844491e-08, "loss": 0.003, "step": 10025 }, { "epoch": 4.561419472247498, "grad_norm": 0.5934744990122068, "learning_rate": 9.451772584322589e-08, "loss": 0.0032, "step": 10026 }, { "epoch": 4.561874431301183, "grad_norm": 0.5298716037081519, "learning_rate": 9.432317625315546e-08, "loss": 0.0077, "step": 10027 }, { "epoch": 4.5623293903548685, "grad_norm": 1.2217448783170772, "learning_rate": 9.412882324413091e-08, "loss": 0.0144, "step": 10028 }, { "epoch": 4.562784349408553, "grad_norm": 0.6613288252045205, "learning_rate": 9.3934666832034e-08, "loss": 0.0066, "step": 10029 }, { "epoch": 4.563239308462238, "grad_norm": 0.4875772151631161, "learning_rate": 9.374070703273036e-08, "loss": 0.0031, "step": 10030 }, { "epoch": 4.563694267515924, "grad_norm": 0.3220691989788428, "learning_rate": 9.354694386206981e-08, "loss": 0.0009, "step": 10031 }, { "epoch": 4.564149226569609, "grad_norm": 0.6488218141583818, "learning_rate": 9.335337733588551e-08, "loss": 0.0107, "step": 10032 }, { "epoch": 4.564604185623294, "grad_norm": 1.536091778033614, "learning_rate": 9.31600074699951e-08, "loss": 0.0089, "step": 10033 }, { "epoch": 4.5650591446769795, "grad_norm": 0.22988951472077176, "learning_rate": 9.296683428019926e-08, "loss": 0.0009, "step": 10034 }, { "epoch": 4.565514103730664, "grad_norm": 1.0281700794324997, "learning_rate": 9.277385778228398e-08, "loss": 0.0013, "step": 10035 }, { "epoch": 4.565969062784349, "grad_norm": 0.41726258986939513, "learning_rate": 9.258107799201804e-08, "loss": 0.0033, "step": 10036 }, { "epoch": 4.566424021838035, "grad_norm": 0.36539779241617953, "learning_rate": 9.23884949251544e-08, "loss": 0.0021, "step": 10037 }, { "epoch": 4.56687898089172, "grad_norm": 0.45812992303522176, "learning_rate": 9.219610859743044e-08, "loss": 0.0018, "step": 10038 }, { "epoch": 4.567333939945405, "grad_norm": 0.2488595084154663, "learning_rate": 9.200391902456667e-08, "loss": 0.0009, "step": 10039 }, { "epoch": 4.5677888989990905, "grad_norm": 0.849516683892568, "learning_rate": 9.181192622226859e-08, "loss": 0.0165, "step": 10040 }, { "epoch": 4.568243858052775, "grad_norm": 0.4945881095182725, "learning_rate": 9.162013020622473e-08, "loss": 0.0051, "step": 10041 }, { "epoch": 4.56869881710646, "grad_norm": 0.6223758328460317, "learning_rate": 9.142853099210758e-08, "loss": 0.0107, "step": 10042 }, { "epoch": 4.569153776160146, "grad_norm": 0.4135911079346784, "learning_rate": 9.123712859557348e-08, "loss": 0.0027, "step": 10043 }, { "epoch": 4.569608735213831, "grad_norm": 0.33773834756541404, "learning_rate": 9.104592303226356e-08, "loss": 0.0023, "step": 10044 }, { "epoch": 4.570063694267516, "grad_norm": 0.5133661623774162, "learning_rate": 9.085491431780224e-08, "loss": 0.0074, "step": 10045 }, { "epoch": 4.570518653321201, "grad_norm": 0.34496067959180055, "learning_rate": 9.06641024677976e-08, "loss": 0.0021, "step": 10046 }, { "epoch": 4.570973612374886, "grad_norm": 0.7177517215511733, "learning_rate": 9.047348749784218e-08, "loss": 0.0121, "step": 10047 }, { "epoch": 4.571428571428571, "grad_norm": 0.5933695617181176, "learning_rate": 9.028306942351156e-08, "loss": 0.0048, "step": 10048 }, { "epoch": 4.571883530482257, "grad_norm": 0.32069287558538245, "learning_rate": 9.00928482603669e-08, "loss": 0.0029, "step": 10049 }, { "epoch": 4.572338489535942, "grad_norm": 0.45065649451070633, "learning_rate": 8.990282402395134e-08, "loss": 0.0071, "step": 10050 }, { "epoch": 4.572793448589627, "grad_norm": 0.14147163077224967, "learning_rate": 8.9712996729793e-08, "loss": 0.0006, "step": 10051 }, { "epoch": 4.573248407643312, "grad_norm": 0.5896868329563055, "learning_rate": 8.952336639340419e-08, "loss": 0.0059, "step": 10052 }, { "epoch": 4.573703366696997, "grad_norm": 0.6262962966521725, "learning_rate": 8.933393303027977e-08, "loss": 0.0041, "step": 10053 }, { "epoch": 4.574158325750682, "grad_norm": 0.35455530118054857, "learning_rate": 8.914469665590036e-08, "loss": 0.0025, "step": 10054 }, { "epoch": 4.574613284804368, "grad_norm": 2.0539686816303924, "learning_rate": 8.895565728572864e-08, "loss": 0.022, "step": 10055 }, { "epoch": 4.575068243858053, "grad_norm": 0.5669940387720821, "learning_rate": 8.876681493521277e-08, "loss": 0.0078, "step": 10056 }, { "epoch": 4.575523202911738, "grad_norm": 0.8027427809270409, "learning_rate": 8.857816961978377e-08, "loss": 0.0025, "step": 10057 }, { "epoch": 4.575978161965423, "grad_norm": 1.0189860502561183, "learning_rate": 8.838972135485596e-08, "loss": 0.015, "step": 10058 }, { "epoch": 4.576433121019108, "grad_norm": 0.3192866585406991, "learning_rate": 8.820147015583037e-08, "loss": 0.0014, "step": 10059 }, { "epoch": 4.576888080072793, "grad_norm": 0.19879463741220652, "learning_rate": 8.801341603808883e-08, "loss": 0.001, "step": 10060 }, { "epoch": 4.577343039126479, "grad_norm": 0.8210699670143461, "learning_rate": 8.782555901699852e-08, "loss": 0.0104, "step": 10061 }, { "epoch": 4.577797998180164, "grad_norm": 0.3787622371231527, "learning_rate": 8.763789910791021e-08, "loss": 0.0018, "step": 10062 }, { "epoch": 4.578252957233849, "grad_norm": 0.6252732775472204, "learning_rate": 8.74504363261583e-08, "loss": 0.0009, "step": 10063 }, { "epoch": 4.578707916287534, "grad_norm": 0.4456554714485441, "learning_rate": 8.72631706870622e-08, "loss": 0.003, "step": 10064 }, { "epoch": 4.579162875341219, "grad_norm": 0.37374989754750687, "learning_rate": 8.707610220592355e-08, "loss": 0.0019, "step": 10065 }, { "epoch": 4.579617834394904, "grad_norm": 0.15408420772742734, "learning_rate": 8.688923089802959e-08, "loss": 0.0006, "step": 10066 }, { "epoch": 4.58007279344859, "grad_norm": 0.7906021588283282, "learning_rate": 8.670255677865003e-08, "loss": 0.0028, "step": 10067 }, { "epoch": 4.580527752502275, "grad_norm": 0.7319178249133774, "learning_rate": 8.651607986303906e-08, "loss": 0.0084, "step": 10068 }, { "epoch": 4.58098271155596, "grad_norm": 0.3478745052697997, "learning_rate": 8.632980016643505e-08, "loss": 0.0023, "step": 10069 }, { "epoch": 4.581437670609645, "grad_norm": 1.5037994170806963, "learning_rate": 8.614371770405971e-08, "loss": 0.0143, "step": 10070 }, { "epoch": 4.58189262966333, "grad_norm": 0.47807582196822695, "learning_rate": 8.595783249111895e-08, "loss": 0.0028, "step": 10071 }, { "epoch": 4.582347588717015, "grad_norm": 0.814407683396984, "learning_rate": 8.577214454280197e-08, "loss": 0.0029, "step": 10072 }, { "epoch": 4.582802547770701, "grad_norm": 0.7837147958588294, "learning_rate": 8.558665387428277e-08, "loss": 0.0074, "step": 10073 }, { "epoch": 4.583257506824386, "grad_norm": 0.6953927794305833, "learning_rate": 8.540136050071923e-08, "loss": 0.0038, "step": 10074 }, { "epoch": 4.583712465878071, "grad_norm": 0.5184701779790698, "learning_rate": 8.521626443725228e-08, "loss": 0.0049, "step": 10075 }, { "epoch": 4.584167424931756, "grad_norm": 0.4885350144381359, "learning_rate": 8.503136569900705e-08, "loss": 0.0018, "step": 10076 }, { "epoch": 4.584622383985441, "grad_norm": 0.15194815084142402, "learning_rate": 8.484666430109257e-08, "loss": 0.0005, "step": 10077 }, { "epoch": 4.585077343039126, "grad_norm": 0.987655526577473, "learning_rate": 8.466216025860202e-08, "loss": 0.0035, "step": 10078 }, { "epoch": 4.585532302092812, "grad_norm": 0.40951578460586524, "learning_rate": 8.447785358661199e-08, "loss": 0.0018, "step": 10079 }, { "epoch": 4.585987261146497, "grad_norm": 0.12161157706371165, "learning_rate": 8.429374430018372e-08, "loss": 0.0004, "step": 10080 }, { "epoch": 4.5864422202001816, "grad_norm": 0.6333103702833256, "learning_rate": 8.410983241436132e-08, "loss": 0.0036, "step": 10081 }, { "epoch": 4.586897179253867, "grad_norm": 0.5618412749173163, "learning_rate": 8.392611794417305e-08, "loss": 0.0073, "step": 10082 }, { "epoch": 4.587352138307552, "grad_norm": 0.4027147954017274, "learning_rate": 8.374260090463188e-08, "loss": 0.0061, "step": 10083 }, { "epoch": 4.587807097361237, "grad_norm": 0.35470544228087375, "learning_rate": 8.35592813107336e-08, "loss": 0.0028, "step": 10084 }, { "epoch": 4.588262056414923, "grad_norm": 0.514922258771944, "learning_rate": 8.337615917745844e-08, "loss": 0.0044, "step": 10085 }, { "epoch": 4.588717015468608, "grad_norm": 1.6825754657245773, "learning_rate": 8.319323451976974e-08, "loss": 0.0179, "step": 10086 }, { "epoch": 4.5891719745222925, "grad_norm": 0.4724728625868127, "learning_rate": 8.301050735261579e-08, "loss": 0.0005, "step": 10087 }, { "epoch": 4.589626933575978, "grad_norm": 0.24073031503548808, "learning_rate": 8.282797769092854e-08, "loss": 0.0011, "step": 10088 }, { "epoch": 4.590081892629663, "grad_norm": 0.1333531596123758, "learning_rate": 8.264564554962273e-08, "loss": 0.0003, "step": 10089 }, { "epoch": 4.590536851683348, "grad_norm": 0.08072414303418123, "learning_rate": 8.24635109435984e-08, "loss": 0.0003, "step": 10090 }, { "epoch": 4.590991810737034, "grad_norm": 2.2157131125624656, "learning_rate": 8.228157388773805e-08, "loss": 0.0175, "step": 10091 }, { "epoch": 4.591446769790719, "grad_norm": 0.6525339208096167, "learning_rate": 8.209983439690955e-08, "loss": 0.0055, "step": 10092 }, { "epoch": 4.5919017288444035, "grad_norm": 0.2621543864113915, "learning_rate": 8.191829248596323e-08, "loss": 0.0019, "step": 10093 }, { "epoch": 4.592356687898089, "grad_norm": 0.344118301576787, "learning_rate": 8.173694816973415e-08, "loss": 0.0024, "step": 10094 }, { "epoch": 4.592811646951774, "grad_norm": 0.8178085203247772, "learning_rate": 8.155580146304104e-08, "loss": 0.0085, "step": 10095 }, { "epoch": 4.59326660600546, "grad_norm": 0.38885642261921066, "learning_rate": 8.13748523806862e-08, "loss": 0.0083, "step": 10096 }, { "epoch": 4.593721565059145, "grad_norm": 0.5233252847272661, "learning_rate": 8.11941009374556e-08, "loss": 0.0046, "step": 10097 }, { "epoch": 4.59417652411283, "grad_norm": 0.8170965464724753, "learning_rate": 8.101354714812021e-08, "loss": 0.0117, "step": 10098 }, { "epoch": 4.594631483166515, "grad_norm": 0.6439584595560373, "learning_rate": 8.083319102743375e-08, "loss": 0.0092, "step": 10099 }, { "epoch": 4.5950864422202, "grad_norm": 1.080721274042395, "learning_rate": 8.065303259013362e-08, "loss": 0.0132, "step": 10100 }, { "epoch": 4.595541401273885, "grad_norm": 0.11327372811463411, "learning_rate": 8.04730718509425e-08, "loss": 0.0003, "step": 10101 }, { "epoch": 4.595996360327571, "grad_norm": 0.5744073728334154, "learning_rate": 8.029330882456498e-08, "loss": 0.0076, "step": 10102 }, { "epoch": 4.596451319381256, "grad_norm": 0.48544791589179687, "learning_rate": 8.011374352569156e-08, "loss": 0.0097, "step": 10103 }, { "epoch": 4.596906278434941, "grad_norm": 0.8325091674885705, "learning_rate": 7.993437596899467e-08, "loss": 0.01, "step": 10104 }, { "epoch": 4.597361237488626, "grad_norm": 0.7056725897513985, "learning_rate": 7.975520616913174e-08, "loss": 0.0143, "step": 10105 }, { "epoch": 4.597816196542311, "grad_norm": 0.6471352933856741, "learning_rate": 7.957623414074328e-08, "loss": 0.0138, "step": 10106 }, { "epoch": 4.598271155595996, "grad_norm": 0.2947816984723841, "learning_rate": 7.939745989845427e-08, "loss": 0.0011, "step": 10107 }, { "epoch": 4.598726114649682, "grad_norm": 0.17580341226490057, "learning_rate": 7.921888345687412e-08, "loss": 0.0007, "step": 10108 }, { "epoch": 4.599181073703367, "grad_norm": 0.24721265737387227, "learning_rate": 7.904050483059422e-08, "loss": 0.001, "step": 10109 }, { "epoch": 4.599636032757052, "grad_norm": 0.43090314120110557, "learning_rate": 7.88623240341918e-08, "loss": 0.003, "step": 10110 }, { "epoch": 4.600090991810737, "grad_norm": 0.1093682784859318, "learning_rate": 7.868434108222577e-08, "loss": 0.0006, "step": 10111 }, { "epoch": 4.600545950864422, "grad_norm": 0.616157187896059, "learning_rate": 7.850655598924144e-08, "loss": 0.0048, "step": 10112 }, { "epoch": 4.601000909918107, "grad_norm": 0.8643947651114378, "learning_rate": 7.832896876976581e-08, "loss": 0.0266, "step": 10113 }, { "epoch": 4.601455868971793, "grad_norm": 0.9349430658560639, "learning_rate": 7.815157943831058e-08, "loss": 0.0051, "step": 10114 }, { "epoch": 4.601910828025478, "grad_norm": 1.0901226079083637, "learning_rate": 7.79743880093714e-08, "loss": 0.0028, "step": 10115 }, { "epoch": 4.6023657870791626, "grad_norm": 1.0948707824144905, "learning_rate": 7.779739449742724e-08, "loss": 0.0024, "step": 10116 }, { "epoch": 4.602820746132848, "grad_norm": 0.2767411529143188, "learning_rate": 7.76205989169418e-08, "loss": 0.0013, "step": 10117 }, { "epoch": 4.603275705186533, "grad_norm": 0.6641187312454001, "learning_rate": 7.744400128236158e-08, "loss": 0.0172, "step": 10118 }, { "epoch": 4.603730664240218, "grad_norm": 0.2982966582067206, "learning_rate": 7.726760160811726e-08, "loss": 0.0026, "step": 10119 }, { "epoch": 4.604185623293904, "grad_norm": 0.6145493830418631, "learning_rate": 7.709139990862342e-08, "loss": 0.0032, "step": 10120 }, { "epoch": 4.604640582347589, "grad_norm": 0.23390240171514096, "learning_rate": 7.691539619827881e-08, "loss": 0.0006, "step": 10121 }, { "epoch": 4.6050955414012735, "grad_norm": 0.33743568183768347, "learning_rate": 7.673959049146557e-08, "loss": 0.0026, "step": 10122 }, { "epoch": 4.605550500454959, "grad_norm": 0.25895734146553384, "learning_rate": 7.656398280254967e-08, "loss": 0.0019, "step": 10123 }, { "epoch": 4.606005459508644, "grad_norm": 0.08357134663296041, "learning_rate": 7.638857314588077e-08, "loss": 0.0003, "step": 10124 }, { "epoch": 4.606460418562329, "grad_norm": 0.9067013109817974, "learning_rate": 7.621336153579267e-08, "loss": 0.0058, "step": 10125 }, { "epoch": 4.606915377616015, "grad_norm": 0.4542956180146837, "learning_rate": 7.603834798660309e-08, "loss": 0.001, "step": 10126 }, { "epoch": 4.6073703366697, "grad_norm": 1.0693360829402174, "learning_rate": 7.586353251261336e-08, "loss": 0.0055, "step": 10127 }, { "epoch": 4.607825295723385, "grad_norm": 0.5596962570826692, "learning_rate": 7.568891512810817e-08, "loss": 0.0055, "step": 10128 }, { "epoch": 4.60828025477707, "grad_norm": 0.19097893790091805, "learning_rate": 7.551449584735693e-08, "loss": 0.0008, "step": 10129 }, { "epoch": 4.608735213830755, "grad_norm": 0.3397369648940844, "learning_rate": 7.534027468461213e-08, "loss": 0.002, "step": 10130 }, { "epoch": 4.609190172884441, "grad_norm": 0.6391580012645519, "learning_rate": 7.516625165411018e-08, "loss": 0.0065, "step": 10131 }, { "epoch": 4.609645131938126, "grad_norm": 1.0812426173793857, "learning_rate": 7.499242677007218e-08, "loss": 0.0078, "step": 10132 }, { "epoch": 4.610100090991811, "grad_norm": 0.30454544122655636, "learning_rate": 7.48188000467015e-08, "loss": 0.0029, "step": 10133 }, { "epoch": 4.610555050045496, "grad_norm": 1.0144380221541762, "learning_rate": 7.46453714981868e-08, "loss": 0.0465, "step": 10134 }, { "epoch": 4.611010009099181, "grad_norm": 0.5730873453635519, "learning_rate": 7.447214113869893e-08, "loss": 0.0016, "step": 10135 }, { "epoch": 4.611464968152866, "grad_norm": 0.2420804910583944, "learning_rate": 7.42991089823944e-08, "loss": 0.001, "step": 10136 }, { "epoch": 4.611919927206552, "grad_norm": 0.09499988055902077, "learning_rate": 7.412627504341241e-08, "loss": 0.0003, "step": 10137 }, { "epoch": 4.612374886260237, "grad_norm": 0.6667988101699831, "learning_rate": 7.395363933587612e-08, "loss": 0.0149, "step": 10138 }, { "epoch": 4.612829845313922, "grad_norm": 1.0260439200377016, "learning_rate": 7.378120187389231e-08, "loss": 0.0046, "step": 10139 }, { "epoch": 4.613284804367607, "grad_norm": 0.5477802579681678, "learning_rate": 7.360896267155193e-08, "loss": 0.0022, "step": 10140 }, { "epoch": 4.613739763421292, "grad_norm": 0.1879816462376185, "learning_rate": 7.343692174292982e-08, "loss": 0.0008, "step": 10141 }, { "epoch": 4.614194722474977, "grad_norm": 0.5449708541824304, "learning_rate": 7.32650791020842e-08, "loss": 0.0029, "step": 10142 }, { "epoch": 4.614649681528663, "grad_norm": 0.8020993752013785, "learning_rate": 7.309343476305714e-08, "loss": 0.0169, "step": 10143 }, { "epoch": 4.615104640582348, "grad_norm": 0.5827617834270544, "learning_rate": 7.292198873987493e-08, "loss": 0.003, "step": 10144 }, { "epoch": 4.615559599636033, "grad_norm": 0.9185462081526751, "learning_rate": 7.275074104654695e-08, "loss": 0.0056, "step": 10145 }, { "epoch": 4.616014558689718, "grad_norm": 0.4525887332314491, "learning_rate": 7.257969169706752e-08, "loss": 0.015, "step": 10146 }, { "epoch": 4.616469517743403, "grad_norm": 0.7392413769252447, "learning_rate": 7.240884070541326e-08, "loss": 0.0024, "step": 10147 }, { "epoch": 4.616924476797088, "grad_norm": 0.4361963792371806, "learning_rate": 7.223818808554578e-08, "loss": 0.0014, "step": 10148 }, { "epoch": 4.617379435850774, "grad_norm": 0.35196851811556307, "learning_rate": 7.206773385140947e-08, "loss": 0.0035, "step": 10149 }, { "epoch": 4.617834394904459, "grad_norm": 0.6543016038610908, "learning_rate": 7.189747801693375e-08, "loss": 0.0021, "step": 10150 }, { "epoch": 4.6182893539581436, "grad_norm": 0.36552203615592893, "learning_rate": 7.172742059603111e-08, "loss": 0.0011, "step": 10151 }, { "epoch": 4.618744313011829, "grad_norm": 0.36620790830547756, "learning_rate": 7.155756160259764e-08, "loss": 0.0019, "step": 10152 }, { "epoch": 4.619199272065514, "grad_norm": 0.5611107545119333, "learning_rate": 7.138790105051335e-08, "loss": 0.0036, "step": 10153 }, { "epoch": 4.619654231119199, "grad_norm": 0.6971921250416832, "learning_rate": 7.121843895364217e-08, "loss": 0.0018, "step": 10154 }, { "epoch": 4.620109190172885, "grad_norm": 0.5161944871873542, "learning_rate": 7.104917532583216e-08, "loss": 0.0031, "step": 10155 }, { "epoch": 4.62056414922657, "grad_norm": 0.2176315604067687, "learning_rate": 7.088011018091395e-08, "loss": 0.0011, "step": 10156 }, { "epoch": 4.6210191082802545, "grad_norm": 1.6506355560598143, "learning_rate": 7.071124353270398e-08, "loss": 0.0018, "step": 10157 }, { "epoch": 4.62147406733394, "grad_norm": 0.662495378977037, "learning_rate": 7.054257539500037e-08, "loss": 0.0169, "step": 10158 }, { "epoch": 4.621929026387625, "grad_norm": 0.7356785896796137, "learning_rate": 7.037410578158598e-08, "loss": 0.0128, "step": 10159 }, { "epoch": 4.62238398544131, "grad_norm": 0.7537836510617872, "learning_rate": 7.020583470622789e-08, "loss": 0.0184, "step": 10160 }, { "epoch": 4.622838944494996, "grad_norm": 0.458134483498468, "learning_rate": 7.003776218267588e-08, "loss": 0.0017, "step": 10161 }, { "epoch": 4.623293903548681, "grad_norm": 0.04169590838859223, "learning_rate": 6.986988822466456e-08, "loss": 0.0001, "step": 10162 }, { "epoch": 4.6237488626023655, "grad_norm": 0.9334101479631084, "learning_rate": 6.970221284591128e-08, "loss": 0.0126, "step": 10163 }, { "epoch": 4.624203821656051, "grad_norm": 0.41266784869337797, "learning_rate": 6.953473606011813e-08, "loss": 0.0078, "step": 10164 }, { "epoch": 4.624658780709736, "grad_norm": 0.6675195064928228, "learning_rate": 6.936745788097082e-08, "loss": 0.009, "step": 10165 }, { "epoch": 4.625113739763421, "grad_norm": 0.4368757384269319, "learning_rate": 6.920037832213789e-08, "loss": 0.0168, "step": 10166 }, { "epoch": 4.625568698817107, "grad_norm": 0.5662496012989569, "learning_rate": 6.903349739727284e-08, "loss": 0.0018, "step": 10167 }, { "epoch": 4.626023657870792, "grad_norm": 1.7398294077537757, "learning_rate": 6.886681512001225e-08, "loss": 0.0139, "step": 10168 }, { "epoch": 4.6264786169244765, "grad_norm": 0.588102872820143, "learning_rate": 6.870033150397637e-08, "loss": 0.008, "step": 10169 }, { "epoch": 4.626933575978162, "grad_norm": 0.13259604268956948, "learning_rate": 6.853404656276957e-08, "loss": 0.0007, "step": 10170 }, { "epoch": 4.627388535031847, "grad_norm": 0.572902871700175, "learning_rate": 6.836796030998044e-08, "loss": 0.0028, "step": 10171 }, { "epoch": 4.627843494085532, "grad_norm": 0.23320681857347522, "learning_rate": 6.820207275918061e-08, "loss": 0.001, "step": 10172 }, { "epoch": 4.628298453139218, "grad_norm": 0.9498746184281998, "learning_rate": 6.803638392392537e-08, "loss": 0.01, "step": 10173 }, { "epoch": 4.628753412192903, "grad_norm": 0.8794897844735243, "learning_rate": 6.787089381775386e-08, "loss": 0.0062, "step": 10174 }, { "epoch": 4.6292083712465875, "grad_norm": 0.7719265852562136, "learning_rate": 6.770560245418972e-08, "loss": 0.0134, "step": 10175 }, { "epoch": 4.629663330300273, "grad_norm": 0.528338403347276, "learning_rate": 6.754050984673993e-08, "loss": 0.0054, "step": 10176 }, { "epoch": 4.630118289353958, "grad_norm": 0.8175109176627982, "learning_rate": 6.737561600889425e-08, "loss": 0.0143, "step": 10177 }, { "epoch": 4.630573248407643, "grad_norm": 0.8263620099541746, "learning_rate": 6.721092095412774e-08, "loss": 0.0255, "step": 10178 }, { "epoch": 4.631028207461329, "grad_norm": 0.35483716500509305, "learning_rate": 6.70464246958985e-08, "loss": 0.0013, "step": 10179 }, { "epoch": 4.631483166515014, "grad_norm": 0.04785793201009651, "learning_rate": 6.688212724764831e-08, "loss": 0.0002, "step": 10180 }, { "epoch": 4.631938125568698, "grad_norm": 0.7333828620193423, "learning_rate": 6.67180286228028e-08, "loss": 0.0176, "step": 10181 }, { "epoch": 4.632393084622384, "grad_norm": 0.5724860427821112, "learning_rate": 6.655412883477153e-08, "loss": 0.0072, "step": 10182 }, { "epoch": 4.632848043676069, "grad_norm": 0.6704392283805026, "learning_rate": 6.63904278969471e-08, "loss": 0.0041, "step": 10183 }, { "epoch": 4.633303002729754, "grad_norm": 0.46903604864321474, "learning_rate": 6.62269258227069e-08, "loss": 0.0135, "step": 10184 }, { "epoch": 4.63375796178344, "grad_norm": 0.4019554540849858, "learning_rate": 6.606362262541188e-08, "loss": 0.0041, "step": 10185 }, { "epoch": 4.6342129208371245, "grad_norm": 0.25538022810507316, "learning_rate": 6.590051831840583e-08, "loss": 0.0012, "step": 10186 }, { "epoch": 4.634667879890809, "grad_norm": 0.09073996892331193, "learning_rate": 6.573761291501724e-08, "loss": 0.0003, "step": 10187 }, { "epoch": 4.635122838944495, "grad_norm": 0.41515152178279574, "learning_rate": 6.557490642855769e-08, "loss": 0.0018, "step": 10188 }, { "epoch": 4.63557779799818, "grad_norm": 0.3299903999138031, "learning_rate": 6.54123988723232e-08, "loss": 0.0019, "step": 10189 }, { "epoch": 4.636032757051865, "grad_norm": 0.28072804881578506, "learning_rate": 6.525009025959289e-08, "loss": 0.0017, "step": 10190 }, { "epoch": 4.636487716105551, "grad_norm": 0.8408119850709069, "learning_rate": 6.508798060362976e-08, "loss": 0.004, "step": 10191 }, { "epoch": 4.6369426751592355, "grad_norm": 0.20405536529742901, "learning_rate": 6.492606991768125e-08, "loss": 0.0009, "step": 10192 }, { "epoch": 4.63739763421292, "grad_norm": 0.8716957689473536, "learning_rate": 6.476435821497734e-08, "loss": 0.0033, "step": 10193 }, { "epoch": 4.637852593266606, "grad_norm": 0.7482499229352273, "learning_rate": 6.460284550873275e-08, "loss": 0.0189, "step": 10194 }, { "epoch": 4.638307552320291, "grad_norm": 0.2453889814603065, "learning_rate": 6.44415318121458e-08, "loss": 0.0049, "step": 10195 }, { "epoch": 4.638762511373976, "grad_norm": 0.35871352165171044, "learning_rate": 6.428041713839761e-08, "loss": 0.0047, "step": 10196 }, { "epoch": 4.639217470427662, "grad_norm": 0.3418328272474742, "learning_rate": 6.411950150065404e-08, "loss": 0.0025, "step": 10197 }, { "epoch": 4.6396724294813465, "grad_norm": 0.6736748781811195, "learning_rate": 6.395878491206458e-08, "loss": 0.0236, "step": 10198 }, { "epoch": 4.640127388535031, "grad_norm": 0.6371637046559355, "learning_rate": 6.379826738576206e-08, "loss": 0.0087, "step": 10199 }, { "epoch": 4.640582347588717, "grad_norm": 0.5333456512718006, "learning_rate": 6.363794893486375e-08, "loss": 0.0027, "step": 10200 }, { "epoch": 4.641037306642402, "grad_norm": 0.41712076800296316, "learning_rate": 6.347782957246945e-08, "loss": 0.0008, "step": 10201 }, { "epoch": 4.641492265696087, "grad_norm": 1.3282834009553866, "learning_rate": 6.331790931166371e-08, "loss": 0.011, "step": 10202 }, { "epoch": 4.641947224749773, "grad_norm": 0.7210087372772567, "learning_rate": 6.315818816551439e-08, "loss": 0.0299, "step": 10203 }, { "epoch": 4.6424021838034575, "grad_norm": 0.5081327726126422, "learning_rate": 6.299866614707328e-08, "loss": 0.0042, "step": 10204 }, { "epoch": 4.642857142857143, "grad_norm": 0.8168956019057707, "learning_rate": 6.28393432693758e-08, "loss": 0.0025, "step": 10205 }, { "epoch": 4.643312101910828, "grad_norm": 0.8433781189473114, "learning_rate": 6.268021954544095e-08, "loss": 0.0025, "step": 10206 }, { "epoch": 4.643767060964513, "grad_norm": 0.44409485668396287, "learning_rate": 6.252129498827197e-08, "loss": 0.0014, "step": 10207 }, { "epoch": 4.644222020018199, "grad_norm": 1.6974382877358214, "learning_rate": 6.236256961085486e-08, "loss": 0.007, "step": 10208 }, { "epoch": 4.644676979071884, "grad_norm": 0.8312256074226476, "learning_rate": 6.22040434261606e-08, "loss": 0.0303, "step": 10209 }, { "epoch": 4.6451319381255685, "grad_norm": 0.7529131307311955, "learning_rate": 6.204571644714303e-08, "loss": 0.0104, "step": 10210 }, { "epoch": 4.645586897179254, "grad_norm": 0.35865729500495647, "learning_rate": 6.188758868673955e-08, "loss": 0.0011, "step": 10211 }, { "epoch": 4.646041856232939, "grad_norm": 0.1484244262166595, "learning_rate": 6.17296601578718e-08, "loss": 0.0003, "step": 10212 }, { "epoch": 4.646496815286624, "grad_norm": 1.1026893150670731, "learning_rate": 6.157193087344526e-08, "loss": 0.0059, "step": 10213 }, { "epoch": 4.64695177434031, "grad_norm": 0.6461617658195181, "learning_rate": 6.141440084634854e-08, "loss": 0.0052, "step": 10214 }, { "epoch": 4.647406733393995, "grad_norm": 0.9499418852825211, "learning_rate": 6.125707008945464e-08, "loss": 0.0123, "step": 10215 }, { "epoch": 4.647861692447679, "grad_norm": 0.04374904836264823, "learning_rate": 6.109993861561969e-08, "loss": 0.0001, "step": 10216 }, { "epoch": 4.648316651501365, "grad_norm": 0.5283526697201992, "learning_rate": 6.09430064376837e-08, "loss": 0.0044, "step": 10217 }, { "epoch": 4.64877161055505, "grad_norm": 0.3461400993177478, "learning_rate": 6.078627356847056e-08, "loss": 0.0035, "step": 10218 }, { "epoch": 4.649226569608735, "grad_norm": 0.5922395122111976, "learning_rate": 6.062974002078753e-08, "loss": 0.0094, "step": 10219 }, { "epoch": 4.649681528662421, "grad_norm": 0.34069691946653763, "learning_rate": 6.047340580742634e-08, "loss": 0.001, "step": 10220 }, { "epoch": 4.6501364877161055, "grad_norm": 0.6771701368229937, "learning_rate": 6.031727094116174e-08, "loss": 0.0093, "step": 10221 }, { "epoch": 4.65059144676979, "grad_norm": 0.6163157679750256, "learning_rate": 6.016133543475189e-08, "loss": 0.0075, "step": 10222 }, { "epoch": 4.651046405823476, "grad_norm": 2.541755549592879, "learning_rate": 6.000559930093964e-08, "loss": 0.0107, "step": 10223 }, { "epoch": 4.651501364877161, "grad_norm": 0.9885006145630703, "learning_rate": 5.98500625524509e-08, "loss": 0.0201, "step": 10224 }, { "epoch": 4.651956323930846, "grad_norm": 0.580876838435522, "learning_rate": 5.969472520199553e-08, "loss": 0.0085, "step": 10225 }, { "epoch": 4.652411282984532, "grad_norm": 0.24678329948743563, "learning_rate": 5.953958726226672e-08, "loss": 0.0013, "step": 10226 }, { "epoch": 4.6528662420382165, "grad_norm": 1.5904717751571311, "learning_rate": 5.93846487459418e-08, "loss": 0.0194, "step": 10227 }, { "epoch": 4.653321201091901, "grad_norm": 0.2705615444223741, "learning_rate": 5.922990966568176e-08, "loss": 0.0018, "step": 10228 }, { "epoch": 4.653776160145587, "grad_norm": 0.19646566253498218, "learning_rate": 5.9075370034131216e-08, "loss": 0.0008, "step": 10229 }, { "epoch": 4.654231119199272, "grad_norm": 0.654025947033978, "learning_rate": 5.89210298639184e-08, "loss": 0.0049, "step": 10230 }, { "epoch": 4.654686078252957, "grad_norm": 0.8620731179382145, "learning_rate": 5.876688916765461e-08, "loss": 0.0215, "step": 10231 }, { "epoch": 4.655141037306643, "grad_norm": 0.2733347521735169, "learning_rate": 5.861294795793671e-08, "loss": 0.0013, "step": 10232 }, { "epoch": 4.6555959963603275, "grad_norm": 0.4553937070688931, "learning_rate": 5.845920624734325e-08, "loss": 0.0087, "step": 10233 }, { "epoch": 4.656050955414012, "grad_norm": 0.5041184379973797, "learning_rate": 5.830566404843752e-08, "loss": 0.0113, "step": 10234 }, { "epoch": 4.656505914467698, "grad_norm": 0.5726055011582639, "learning_rate": 5.815232137376642e-08, "loss": 0.0012, "step": 10235 }, { "epoch": 4.656960873521383, "grad_norm": 0.5822405774869539, "learning_rate": 5.799917823586021e-08, "loss": 0.0059, "step": 10236 }, { "epoch": 4.657415832575068, "grad_norm": 0.5935815243722296, "learning_rate": 5.784623464723332e-08, "loss": 0.0076, "step": 10237 }, { "epoch": 4.657870791628754, "grad_norm": 0.32784924401373394, "learning_rate": 5.7693490620383544e-08, "loss": 0.0035, "step": 10238 }, { "epoch": 4.6583257506824385, "grad_norm": 0.752301727235939, "learning_rate": 5.7540946167792265e-08, "loss": 0.007, "step": 10239 }, { "epoch": 4.658780709736124, "grad_norm": 0.6286240322903953, "learning_rate": 5.738860130192481e-08, "loss": 0.0028, "step": 10240 }, { "epoch": 4.659235668789809, "grad_norm": 0.5031826446450929, "learning_rate": 5.7236456035230096e-08, "loss": 0.0074, "step": 10241 }, { "epoch": 4.659690627843494, "grad_norm": 0.9997421431467953, "learning_rate": 5.708451038014068e-08, "loss": 0.008, "step": 10242 }, { "epoch": 4.66014558689718, "grad_norm": 0.6184794049542449, "learning_rate": 5.693276434907302e-08, "loss": 0.008, "step": 10243 }, { "epoch": 4.660600545950865, "grad_norm": 0.4514590038280776, "learning_rate": 5.6781217954427206e-08, "loss": 0.0105, "step": 10244 }, { "epoch": 4.6610555050045495, "grad_norm": 0.2137912223395863, "learning_rate": 5.6629871208586926e-08, "loss": 0.0006, "step": 10245 }, { "epoch": 4.661510464058235, "grad_norm": 0.5180863628139974, "learning_rate": 5.647872412391897e-08, "loss": 0.0068, "step": 10246 }, { "epoch": 4.66196542311192, "grad_norm": 0.9232123207416976, "learning_rate": 5.632777671277484e-08, "loss": 0.0319, "step": 10247 }, { "epoch": 4.662420382165605, "grad_norm": 0.5600709734044063, "learning_rate": 5.617702898748967e-08, "loss": 0.011, "step": 10248 }, { "epoch": 4.662875341219291, "grad_norm": 0.45957716077111693, "learning_rate": 5.6026480960381377e-08, "loss": 0.0031, "step": 10249 }, { "epoch": 4.663330300272976, "grad_norm": 0.3209828150194338, "learning_rate": 5.587613264375208e-08, "loss": 0.001, "step": 10250 }, { "epoch": 4.66378525932666, "grad_norm": 2.912820422956027, "learning_rate": 5.5725984049887495e-08, "loss": 0.0014, "step": 10251 }, { "epoch": 4.664240218380346, "grad_norm": 0.823093798287925, "learning_rate": 5.5576035191057534e-08, "loss": 0.011, "step": 10252 }, { "epoch": 4.664695177434031, "grad_norm": 0.42411396014090985, "learning_rate": 5.542628607951489e-08, "loss": 0.0032, "step": 10253 }, { "epoch": 4.665150136487716, "grad_norm": 0.5772538832304198, "learning_rate": 5.527673672749645e-08, "loss": 0.0013, "step": 10254 }, { "epoch": 4.665605095541402, "grad_norm": 0.5331606782272126, "learning_rate": 5.512738714722299e-08, "loss": 0.0112, "step": 10255 }, { "epoch": 4.6660600545950865, "grad_norm": 0.8631072617077079, "learning_rate": 5.497823735089836e-08, "loss": 0.0069, "step": 10256 }, { "epoch": 4.666515013648771, "grad_norm": 0.1591212824215432, "learning_rate": 5.482928735071086e-08, "loss": 0.0008, "step": 10257 }, { "epoch": 4.666969972702457, "grad_norm": 0.25846833116734413, "learning_rate": 5.4680537158831595e-08, "loss": 0.0013, "step": 10258 }, { "epoch": 4.667424931756142, "grad_norm": 0.6469513881378692, "learning_rate": 5.4531986787415834e-08, "loss": 0.0053, "step": 10259 }, { "epoch": 4.667879890809827, "grad_norm": 0.7328686374172201, "learning_rate": 5.4383636248602213e-08, "loss": 0.0078, "step": 10260 }, { "epoch": 4.668334849863513, "grad_norm": 0.28152556971138565, "learning_rate": 5.423548555451352e-08, "loss": 0.0014, "step": 10261 }, { "epoch": 4.6687898089171975, "grad_norm": 0.24862446044989378, "learning_rate": 5.4087534717256195e-08, "loss": 0.0024, "step": 10262 }, { "epoch": 4.669244767970882, "grad_norm": 0.784454986209739, "learning_rate": 5.393978374892001e-08, "loss": 0.0193, "step": 10263 }, { "epoch": 4.669699727024568, "grad_norm": 0.5235526784699579, "learning_rate": 5.379223266157835e-08, "loss": 0.0067, "step": 10264 }, { "epoch": 4.670154686078253, "grad_norm": 1.0072595695613125, "learning_rate": 5.3644881467288245e-08, "loss": 0.0142, "step": 10265 }, { "epoch": 4.670609645131938, "grad_norm": 0.5909123565453743, "learning_rate": 5.34977301780909e-08, "loss": 0.0026, "step": 10266 }, { "epoch": 4.671064604185624, "grad_norm": 0.6187020943288865, "learning_rate": 5.335077880601086e-08, "loss": 0.0072, "step": 10267 }, { "epoch": 4.6715195632393085, "grad_norm": 0.13810089225863045, "learning_rate": 5.320402736305602e-08, "loss": 0.0013, "step": 10268 }, { "epoch": 4.671974522292993, "grad_norm": 0.2713977713029142, "learning_rate": 5.305747586121845e-08, "loss": 0.0021, "step": 10269 }, { "epoch": 4.672429481346679, "grad_norm": 0.916438418742185, "learning_rate": 5.291112431247358e-08, "loss": 0.0046, "step": 10270 }, { "epoch": 4.672884440400364, "grad_norm": 0.30612665769645797, "learning_rate": 5.276497272878101e-08, "loss": 0.0025, "step": 10271 }, { "epoch": 4.673339399454049, "grad_norm": 0.19705597336706132, "learning_rate": 5.2619021122083116e-08, "loss": 0.0006, "step": 10272 }, { "epoch": 4.673794358507735, "grad_norm": 1.0069751147486095, "learning_rate": 5.247326950430648e-08, "loss": 0.006, "step": 10273 }, { "epoch": 4.6742493175614195, "grad_norm": 0.20765764728042252, "learning_rate": 5.232771788736157e-08, "loss": 0.0008, "step": 10274 }, { "epoch": 4.674704276615104, "grad_norm": 1.6907747831841196, "learning_rate": 5.2182366283141384e-08, "loss": 0.0185, "step": 10275 }, { "epoch": 4.67515923566879, "grad_norm": 0.05682763206030863, "learning_rate": 5.2037214703524185e-08, "loss": 0.0001, "step": 10276 }, { "epoch": 4.675614194722475, "grad_norm": 2.125537063131834, "learning_rate": 5.189226316037105e-08, "loss": 0.01, "step": 10277 }, { "epoch": 4.67606915377616, "grad_norm": 1.2258332092338151, "learning_rate": 5.1747511665526665e-08, "loss": 0.0095, "step": 10278 }, { "epoch": 4.676524112829846, "grad_norm": 0.6775293912549879, "learning_rate": 5.1602960230819624e-08, "loss": 0.0015, "step": 10279 }, { "epoch": 4.6769790718835305, "grad_norm": 0.29918673667874934, "learning_rate": 5.145860886806131e-08, "loss": 0.0019, "step": 10280 }, { "epoch": 4.677434030937215, "grad_norm": 0.8698429207056215, "learning_rate": 5.131445758904813e-08, "loss": 0.0093, "step": 10281 }, { "epoch": 4.677888989990901, "grad_norm": 0.6014853993448903, "learning_rate": 5.117050640555926e-08, "loss": 0.0139, "step": 10282 }, { "epoch": 4.678343949044586, "grad_norm": 0.3397651035376281, "learning_rate": 5.1026755329358077e-08, "loss": 0.0019, "step": 10283 }, { "epoch": 4.678798908098271, "grad_norm": 0.9318869636120587, "learning_rate": 5.088320437219074e-08, "loss": 0.0061, "step": 10284 }, { "epoch": 4.679253867151957, "grad_norm": 0.5555016839164546, "learning_rate": 5.073985354578786e-08, "loss": 0.0217, "step": 10285 }, { "epoch": 4.679708826205641, "grad_norm": 0.15683488454524885, "learning_rate": 5.059670286186341e-08, "loss": 0.0007, "step": 10286 }, { "epoch": 4.680163785259326, "grad_norm": 0.2934857744945566, "learning_rate": 5.045375233211497e-08, "loss": 0.0051, "step": 10287 }, { "epoch": 4.680618744313012, "grad_norm": 0.2434302523434641, "learning_rate": 5.031100196822403e-08, "loss": 0.001, "step": 10288 }, { "epoch": 4.681073703366697, "grad_norm": 0.4089406710438022, "learning_rate": 5.0168451781854865e-08, "loss": 0.0034, "step": 10289 }, { "epoch": 4.681528662420382, "grad_norm": 0.5473472567050284, "learning_rate": 5.0026101784656776e-08, "loss": 0.002, "step": 10290 }, { "epoch": 4.6819836214740675, "grad_norm": 0.8564391723669051, "learning_rate": 4.988395198826157e-08, "loss": 0.0241, "step": 10291 }, { "epoch": 4.682438580527752, "grad_norm": 0.7673469522143083, "learning_rate": 4.974200240428551e-08, "loss": 0.0132, "step": 10292 }, { "epoch": 4.682893539581437, "grad_norm": 1.0685173126762002, "learning_rate": 4.9600253044327364e-08, "loss": 0.0065, "step": 10293 }, { "epoch": 4.683348498635123, "grad_norm": 0.2608585002037625, "learning_rate": 4.945870391997065e-08, "loss": 0.002, "step": 10294 }, { "epoch": 4.683803457688808, "grad_norm": 0.07722237048587341, "learning_rate": 4.931735504278223e-08, "loss": 0.0003, "step": 10295 }, { "epoch": 4.684258416742493, "grad_norm": 0.38581856735705106, "learning_rate": 4.917620642431231e-08, "loss": 0.0014, "step": 10296 }, { "epoch": 4.6847133757961785, "grad_norm": 0.9581590863879081, "learning_rate": 4.9035258076094996e-08, "loss": 0.0161, "step": 10297 }, { "epoch": 4.685168334849863, "grad_norm": 0.5317490561915522, "learning_rate": 4.889451000964801e-08, "loss": 0.0042, "step": 10298 }, { "epoch": 4.685623293903548, "grad_norm": 0.7467988247448609, "learning_rate": 4.8753962236472443e-08, "loss": 0.022, "step": 10299 }, { "epoch": 4.686078252957234, "grad_norm": 0.22357650741723548, "learning_rate": 4.861361476805354e-08, "loss": 0.0012, "step": 10300 }, { "epoch": 4.686533212010919, "grad_norm": 0.6517603792909479, "learning_rate": 4.8473467615859637e-08, "loss": 0.0026, "step": 10301 }, { "epoch": 4.686988171064604, "grad_norm": 0.9666871809685256, "learning_rate": 4.833352079134296e-08, "loss": 0.0288, "step": 10302 }, { "epoch": 4.6874431301182895, "grad_norm": 0.6830576757680491, "learning_rate": 4.819377430593908e-08, "loss": 0.0014, "step": 10303 }, { "epoch": 4.687898089171974, "grad_norm": 0.45733502723530284, "learning_rate": 4.8054228171067755e-08, "loss": 0.0014, "step": 10304 }, { "epoch": 4.688353048225659, "grad_norm": 0.16420938618837974, "learning_rate": 4.7914882398132357e-08, "loss": 0.0006, "step": 10305 }, { "epoch": 4.688808007279345, "grad_norm": 0.6769005742554, "learning_rate": 4.7775736998519065e-08, "loss": 0.0041, "step": 10306 }, { "epoch": 4.68926296633303, "grad_norm": 0.30427742586702095, "learning_rate": 4.7636791983598496e-08, "loss": 0.0022, "step": 10307 }, { "epoch": 4.689717925386715, "grad_norm": 0.8288052904138916, "learning_rate": 4.749804736472435e-08, "loss": 0.0075, "step": 10308 }, { "epoch": 4.6901728844404005, "grad_norm": 0.08954452855898573, "learning_rate": 4.7359503153234235e-08, "loss": 0.0003, "step": 10309 }, { "epoch": 4.690627843494085, "grad_norm": 0.5991562439430569, "learning_rate": 4.7221159360449634e-08, "loss": 0.0072, "step": 10310 }, { "epoch": 4.69108280254777, "grad_norm": 0.6412907317624414, "learning_rate": 4.7083015997675395e-08, "loss": 0.0045, "step": 10311 }, { "epoch": 4.691537761601456, "grad_norm": 0.8110347633484929, "learning_rate": 4.694507307619972e-08, "loss": 0.0158, "step": 10312 }, { "epoch": 4.691992720655141, "grad_norm": 0.6126750066594936, "learning_rate": 4.68073306072947e-08, "loss": 0.0091, "step": 10313 }, { "epoch": 4.692447679708827, "grad_norm": 0.251130739033825, "learning_rate": 4.6669788602216046e-08, "loss": 0.0012, "step": 10314 }, { "epoch": 4.6929026387625115, "grad_norm": 0.7895764603027564, "learning_rate": 4.653244707220339e-08, "loss": 0.0058, "step": 10315 }, { "epoch": 4.693357597816196, "grad_norm": 0.24288235662755162, "learning_rate": 4.639530602847914e-08, "loss": 0.0013, "step": 10316 }, { "epoch": 4.693812556869882, "grad_norm": 0.6902325226101521, "learning_rate": 4.625836548225016e-08, "loss": 0.0109, "step": 10317 }, { "epoch": 4.694267515923567, "grad_norm": 0.7420821118281028, "learning_rate": 4.612162544470666e-08, "loss": 0.027, "step": 10318 }, { "epoch": 4.694722474977252, "grad_norm": 0.9406507907231262, "learning_rate": 4.598508592702222e-08, "loss": 0.0049, "step": 10319 }, { "epoch": 4.695177434030938, "grad_norm": 0.8482910213474063, "learning_rate": 4.5848746940354294e-08, "loss": 0.0078, "step": 10320 }, { "epoch": 4.695632393084622, "grad_norm": 0.4398459471933356, "learning_rate": 4.571260849584397e-08, "loss": 0.0022, "step": 10321 }, { "epoch": 4.696087352138307, "grad_norm": 0.10632410909327886, "learning_rate": 4.5576670604615955e-08, "loss": 0.0005, "step": 10322 }, { "epoch": 4.696542311191993, "grad_norm": 0.7764134384911916, "learning_rate": 4.544093327777804e-08, "loss": 0.0117, "step": 10323 }, { "epoch": 4.696997270245678, "grad_norm": 0.48366746219716183, "learning_rate": 4.530539652642246e-08, "loss": 0.0046, "step": 10324 }, { "epoch": 4.697452229299363, "grad_norm": 0.6234679405669904, "learning_rate": 4.51700603616248e-08, "loss": 0.0109, "step": 10325 }, { "epoch": 4.6979071883530485, "grad_norm": 0.9632294064205773, "learning_rate": 4.503492479444371e-08, "loss": 0.0156, "step": 10326 }, { "epoch": 4.698362147406733, "grad_norm": 0.4717000445059305, "learning_rate": 4.489998983592231e-08, "loss": 0.0066, "step": 10327 }, { "epoch": 4.698817106460418, "grad_norm": 0.899864439040203, "learning_rate": 4.476525549708621e-08, "loss": 0.0039, "step": 10328 }, { "epoch": 4.699272065514104, "grad_norm": 1.1431744820238512, "learning_rate": 4.463072178894579e-08, "loss": 0.0043, "step": 10329 }, { "epoch": 4.699727024567789, "grad_norm": 0.11844490272775667, "learning_rate": 4.4496388722494455e-08, "loss": 0.0003, "step": 10330 }, { "epoch": 4.700181983621474, "grad_norm": 0.3141459612098197, "learning_rate": 4.436225630870927e-08, "loss": 0.0065, "step": 10331 }, { "epoch": 4.7006369426751595, "grad_norm": 0.6366228700634975, "learning_rate": 4.4228324558551195e-08, "loss": 0.0233, "step": 10332 }, { "epoch": 4.701091901728844, "grad_norm": 0.5897764428823397, "learning_rate": 4.4094593482963686e-08, "loss": 0.0013, "step": 10333 }, { "epoch": 4.701546860782529, "grad_norm": 0.6588918180780293, "learning_rate": 4.396106309287579e-08, "loss": 0.0115, "step": 10334 }, { "epoch": 4.702001819836215, "grad_norm": 0.44085276106885324, "learning_rate": 4.3827733399198215e-08, "loss": 0.0066, "step": 10335 }, { "epoch": 4.7024567788899, "grad_norm": 0.19493477389904604, "learning_rate": 4.3694604412826416e-08, "loss": 0.0007, "step": 10336 }, { "epoch": 4.702911737943585, "grad_norm": 0.34002791451597736, "learning_rate": 4.356167614463891e-08, "loss": 0.0013, "step": 10337 }, { "epoch": 4.7033666969972705, "grad_norm": 0.23180916081745068, "learning_rate": 4.3428948605497844e-08, "loss": 0.002, "step": 10338 }, { "epoch": 4.703821656050955, "grad_norm": 0.25682156831391617, "learning_rate": 4.3296421806249546e-08, "loss": 0.0028, "step": 10339 }, { "epoch": 4.70427661510464, "grad_norm": 0.4748738631287524, "learning_rate": 4.3164095757723404e-08, "loss": 0.0029, "step": 10340 }, { "epoch": 4.704731574158326, "grad_norm": 0.3988987958757744, "learning_rate": 4.3031970470732156e-08, "loss": 0.0027, "step": 10341 }, { "epoch": 4.705186533212011, "grad_norm": 0.9237068405967368, "learning_rate": 4.290004595607272e-08, "loss": 0.0047, "step": 10342 }, { "epoch": 4.705641492265696, "grad_norm": 1.293798796284997, "learning_rate": 4.276832222452537e-08, "loss": 0.0056, "step": 10343 }, { "epoch": 4.7060964513193815, "grad_norm": 0.29508327645051097, "learning_rate": 4.263679928685399e-08, "loss": 0.0034, "step": 10344 }, { "epoch": 4.706551410373066, "grad_norm": 0.8843875393677617, "learning_rate": 4.2505477153806094e-08, "loss": 0.0143, "step": 10345 }, { "epoch": 4.707006369426751, "grad_norm": 0.5528013207026439, "learning_rate": 4.2374355836112545e-08, "loss": 0.0095, "step": 10346 }, { "epoch": 4.707461328480437, "grad_norm": 0.7437088149363985, "learning_rate": 4.224343534448838e-08, "loss": 0.0029, "step": 10347 }, { "epoch": 4.707916287534122, "grad_norm": 0.029702889883963107, "learning_rate": 4.211271568963116e-08, "loss": 0.0001, "step": 10348 }, { "epoch": 4.708371246587808, "grad_norm": 0.2471982376355346, "learning_rate": 4.198219688222316e-08, "loss": 0.0013, "step": 10349 }, { "epoch": 4.7088262056414925, "grad_norm": 0.8128611187144885, "learning_rate": 4.1851878932930026e-08, "loss": 0.0013, "step": 10350 }, { "epoch": 4.709281164695177, "grad_norm": 0.7348230897502447, "learning_rate": 4.172176185240018e-08, "loss": 0.0141, "step": 10351 }, { "epoch": 4.709736123748863, "grad_norm": 0.5035336329274653, "learning_rate": 4.159184565126651e-08, "loss": 0.005, "step": 10352 }, { "epoch": 4.710191082802548, "grad_norm": 0.3132780324878811, "learning_rate": 4.146213034014496e-08, "loss": 0.0013, "step": 10353 }, { "epoch": 4.710646041856233, "grad_norm": 0.8800594110680099, "learning_rate": 4.133261592963567e-08, "loss": 0.0117, "step": 10354 }, { "epoch": 4.711101000909919, "grad_norm": 0.8698496265602705, "learning_rate": 4.120330243032183e-08, "loss": 0.0343, "step": 10355 }, { "epoch": 4.711555959963603, "grad_norm": 0.4494653406897672, "learning_rate": 4.1074189852770284e-08, "loss": 0.0049, "step": 10356 }, { "epoch": 4.712010919017288, "grad_norm": 1.0001072547020986, "learning_rate": 4.0945278207531466e-08, "loss": 0.0023, "step": 10357 }, { "epoch": 4.712465878070974, "grad_norm": 0.4926629297505045, "learning_rate": 4.081656750513946e-08, "loss": 0.0092, "step": 10358 }, { "epoch": 4.712920837124659, "grad_norm": 0.48060572075935676, "learning_rate": 4.0688057756111956e-08, "loss": 0.0039, "step": 10359 }, { "epoch": 4.713375796178344, "grad_norm": 0.4530074147885882, "learning_rate": 4.0559748970950274e-08, "loss": 0.0015, "step": 10360 }, { "epoch": 4.7138307552320295, "grad_norm": 1.1647254667513804, "learning_rate": 4.043164116013937e-08, "loss": 0.0115, "step": 10361 }, { "epoch": 4.714285714285714, "grad_norm": 0.8054800631425497, "learning_rate": 4.030373433414697e-08, "loss": 0.0062, "step": 10362 }, { "epoch": 4.714740673339399, "grad_norm": 1.0081057876217947, "learning_rate": 4.017602850342584e-08, "loss": 0.0134, "step": 10363 }, { "epoch": 4.715195632393085, "grad_norm": 0.5476077568601391, "learning_rate": 4.004852367841122e-08, "loss": 0.0059, "step": 10364 }, { "epoch": 4.71565059144677, "grad_norm": 0.46376871090921845, "learning_rate": 3.99212198695223e-08, "loss": 0.0044, "step": 10365 }, { "epoch": 4.716105550500455, "grad_norm": 0.6472660649339824, "learning_rate": 3.97941170871613e-08, "loss": 0.0148, "step": 10366 }, { "epoch": 4.7165605095541405, "grad_norm": 0.871592491942921, "learning_rate": 3.9667215341714915e-08, "loss": 0.0113, "step": 10367 }, { "epoch": 4.717015468607825, "grad_norm": 0.8538443849059508, "learning_rate": 3.954051464355319e-08, "loss": 0.0106, "step": 10368 }, { "epoch": 4.71747042766151, "grad_norm": 0.5696060182845988, "learning_rate": 3.9414015003029214e-08, "loss": 0.0051, "step": 10369 }, { "epoch": 4.717925386715196, "grad_norm": 0.34040615033616994, "learning_rate": 3.9287716430480014e-08, "loss": 0.0041, "step": 10370 }, { "epoch": 4.718380345768881, "grad_norm": 0.153007258569408, "learning_rate": 3.916161893622594e-08, "loss": 0.0013, "step": 10371 }, { "epoch": 4.718835304822566, "grad_norm": 0.4475016674701215, "learning_rate": 3.903572253057153e-08, "loss": 0.0083, "step": 10372 }, { "epoch": 4.7192902638762515, "grad_norm": 0.863813400913638, "learning_rate": 3.8910027223804105e-08, "loss": 0.0018, "step": 10373 }, { "epoch": 4.719745222929936, "grad_norm": 0.8568802959211431, "learning_rate": 3.8784533026195446e-08, "loss": 0.0096, "step": 10374 }, { "epoch": 4.720200181983621, "grad_norm": 0.9186189849070475, "learning_rate": 3.865923994799958e-08, "loss": 0.0146, "step": 10375 }, { "epoch": 4.720655141037307, "grad_norm": 0.7791879346509872, "learning_rate": 3.853414799945554e-08, "loss": 0.0164, "step": 10376 }, { "epoch": 4.721110100090992, "grad_norm": 0.6075853098965045, "learning_rate": 3.840925719078486e-08, "loss": 0.0075, "step": 10377 }, { "epoch": 4.721565059144677, "grad_norm": 0.5063910093584798, "learning_rate": 3.828456753219356e-08, "loss": 0.004, "step": 10378 }, { "epoch": 4.7220200181983625, "grad_norm": 1.4000064878295277, "learning_rate": 3.816007903387015e-08, "loss": 0.014, "step": 10379 }, { "epoch": 4.722474977252047, "grad_norm": 0.44886150888328225, "learning_rate": 3.803579170598731e-08, "loss": 0.005, "step": 10380 }, { "epoch": 4.722929936305732, "grad_norm": 0.10359693440944401, "learning_rate": 3.791170555870166e-08, "loss": 0.0003, "step": 10381 }, { "epoch": 4.723384895359418, "grad_norm": 0.3677093895127463, "learning_rate": 3.778782060215286e-08, "loss": 0.0016, "step": 10382 }, { "epoch": 4.723839854413103, "grad_norm": 0.21765411778398633, "learning_rate": 3.7664136846463916e-08, "loss": 0.0008, "step": 10383 }, { "epoch": 4.724294813466788, "grad_norm": 0.3246658620335353, "learning_rate": 3.754065430174203e-08, "loss": 0.0011, "step": 10384 }, { "epoch": 4.7247497725204735, "grad_norm": 0.6567569839873031, "learning_rate": 3.741737297807746e-08, "loss": 0.0112, "step": 10385 }, { "epoch": 4.725204731574158, "grad_norm": 0.23217262369071182, "learning_rate": 3.729429288554409e-08, "loss": 0.0005, "step": 10386 }, { "epoch": 4.725659690627843, "grad_norm": 0.5738273954031861, "learning_rate": 3.717141403419972e-08, "loss": 0.0094, "step": 10387 }, { "epoch": 4.726114649681529, "grad_norm": 0.16275422767519424, "learning_rate": 3.7048736434085465e-08, "loss": 0.0008, "step": 10388 }, { "epoch": 4.726569608735214, "grad_norm": 0.7505803373967885, "learning_rate": 3.692626009522582e-08, "loss": 0.025, "step": 10389 }, { "epoch": 4.727024567788899, "grad_norm": 0.39725680970253835, "learning_rate": 3.6803985027629164e-08, "loss": 0.0004, "step": 10390 }, { "epoch": 4.727479526842584, "grad_norm": 0.6434718008546139, "learning_rate": 3.668191124128695e-08, "loss": 0.0115, "step": 10391 }, { "epoch": 4.727934485896269, "grad_norm": 0.2975741396172763, "learning_rate": 3.6560038746174805e-08, "loss": 0.0019, "step": 10392 }, { "epoch": 4.728389444949954, "grad_norm": 0.2350951051594801, "learning_rate": 3.643836755225172e-08, "loss": 0.0012, "step": 10393 }, { "epoch": 4.72884440400364, "grad_norm": 0.09564729772877942, "learning_rate": 3.631689766945945e-08, "loss": 0.0005, "step": 10394 }, { "epoch": 4.729299363057325, "grad_norm": 0.6400750046355298, "learning_rate": 3.619562910772478e-08, "loss": 0.0084, "step": 10395 }, { "epoch": 4.72975432211101, "grad_norm": 2.7399653020198316, "learning_rate": 3.607456187695646e-08, "loss": 0.0011, "step": 10396 }, { "epoch": 4.730209281164695, "grad_norm": 0.3734311005527613, "learning_rate": 3.595369598704823e-08, "loss": 0.001, "step": 10397 }, { "epoch": 4.73066424021838, "grad_norm": 0.3012791722691489, "learning_rate": 3.583303144787637e-08, "loss": 0.0048, "step": 10398 }, { "epoch": 4.731119199272065, "grad_norm": 0.751252906180249, "learning_rate": 3.5712568269301306e-08, "loss": 0.0041, "step": 10399 }, { "epoch": 4.731574158325751, "grad_norm": 0.5485741726960345, "learning_rate": 3.559230646116629e-08, "loss": 0.0061, "step": 10400 }, { "epoch": 4.732029117379436, "grad_norm": 0.7696707074310493, "learning_rate": 3.547224603329874e-08, "loss": 0.0242, "step": 10401 }, { "epoch": 4.732484076433121, "grad_norm": 0.14864944489614376, "learning_rate": 3.535238699550969e-08, "loss": 0.0007, "step": 10402 }, { "epoch": 4.732939035486806, "grad_norm": 0.4028231032560098, "learning_rate": 3.5232729357593254e-08, "loss": 0.0016, "step": 10403 }, { "epoch": 4.733393994540491, "grad_norm": 0.26353522166147686, "learning_rate": 3.511327312932772e-08, "loss": 0.0016, "step": 10404 }, { "epoch": 4.733848953594176, "grad_norm": 0.4810034928407515, "learning_rate": 3.499401832047361e-08, "loss": 0.0012, "step": 10405 }, { "epoch": 4.734303912647862, "grad_norm": 0.4369445991090852, "learning_rate": 3.487496494077702e-08, "loss": 0.0017, "step": 10406 }, { "epoch": 4.734758871701547, "grad_norm": 0.24395011451152057, "learning_rate": 3.4756112999965454e-08, "loss": 0.0012, "step": 10407 }, { "epoch": 4.735213830755232, "grad_norm": 0.1030916862016564, "learning_rate": 3.463746250775141e-08, "loss": 0.0004, "step": 10408 }, { "epoch": 4.735668789808917, "grad_norm": 0.8306809987106989, "learning_rate": 3.451901347383074e-08, "loss": 0.0038, "step": 10409 }, { "epoch": 4.736123748862602, "grad_norm": 1.6760701327427419, "learning_rate": 3.4400765907882106e-08, "loss": 0.0099, "step": 10410 }, { "epoch": 4.736578707916287, "grad_norm": 0.5532867835061953, "learning_rate": 3.4282719819568324e-08, "loss": 0.0078, "step": 10411 }, { "epoch": 4.737033666969973, "grad_norm": 0.595444528968717, "learning_rate": 3.416487521853584e-08, "loss": 0.0104, "step": 10412 }, { "epoch": 4.737488626023658, "grad_norm": 0.3616118119236637, "learning_rate": 3.404723211441391e-08, "loss": 0.0012, "step": 10413 }, { "epoch": 4.737943585077343, "grad_norm": 0.3492479066942913, "learning_rate": 3.392979051681622e-08, "loss": 0.0053, "step": 10414 }, { "epoch": 4.738398544131028, "grad_norm": 0.5263333303085034, "learning_rate": 3.3812550435338706e-08, "loss": 0.0073, "step": 10415 }, { "epoch": 4.738853503184713, "grad_norm": 0.28138525928517644, "learning_rate": 3.369551187956288e-08, "loss": 0.0009, "step": 10416 }, { "epoch": 4.739308462238398, "grad_norm": 0.300574615220743, "learning_rate": 3.3578674859052194e-08, "loss": 0.0028, "step": 10417 }, { "epoch": 4.739763421292084, "grad_norm": 0.5114161456520538, "learning_rate": 3.346203938335402e-08, "loss": 0.0024, "step": 10418 }, { "epoch": 4.740218380345769, "grad_norm": 1.1777975867354198, "learning_rate": 3.3345605461999056e-08, "loss": 0.0048, "step": 10419 }, { "epoch": 4.740673339399454, "grad_norm": 1.1335267561347981, "learning_rate": 3.322937310450164e-08, "loss": 0.0097, "step": 10420 }, { "epoch": 4.741128298453139, "grad_norm": 0.16673860008003358, "learning_rate": 3.3113342320360285e-08, "loss": 0.0009, "step": 10421 }, { "epoch": 4.741583257506824, "grad_norm": 0.3031098635489789, "learning_rate": 3.299751311905602e-08, "loss": 0.0012, "step": 10422 }, { "epoch": 4.742038216560509, "grad_norm": 0.4666335914991316, "learning_rate": 3.288188551005433e-08, "loss": 0.006, "step": 10423 }, { "epoch": 4.742493175614195, "grad_norm": 0.7799966017781799, "learning_rate": 3.27664595028035e-08, "loss": 0.0031, "step": 10424 }, { "epoch": 4.74294813466788, "grad_norm": 0.4938470487926683, "learning_rate": 3.26512351067354e-08, "loss": 0.0028, "step": 10425 }, { "epoch": 4.743403093721565, "grad_norm": 0.21752693342526233, "learning_rate": 3.2536212331266416e-08, "loss": 0.0011, "step": 10426 }, { "epoch": 4.74385805277525, "grad_norm": 0.5554579930736916, "learning_rate": 3.242139118579485e-08, "loss": 0.0024, "step": 10427 }, { "epoch": 4.744313011828935, "grad_norm": 0.4012083432475951, "learning_rate": 3.230677167970403e-08, "loss": 0.0005, "step": 10428 }, { "epoch": 4.744767970882621, "grad_norm": 0.5856888905241288, "learning_rate": 3.2192353822359246e-08, "loss": 0.0055, "step": 10429 }, { "epoch": 4.745222929936306, "grad_norm": 0.5637827369784544, "learning_rate": 3.207813762311107e-08, "loss": 0.009, "step": 10430 }, { "epoch": 4.745677888989991, "grad_norm": 0.48498939271004776, "learning_rate": 3.19641230912926e-08, "loss": 0.0015, "step": 10431 }, { "epoch": 4.746132848043676, "grad_norm": 0.42154815642284976, "learning_rate": 3.185031023622026e-08, "loss": 0.0036, "step": 10432 }, { "epoch": 4.746587807097361, "grad_norm": 1.0246227713343692, "learning_rate": 3.1736699067194675e-08, "loss": 0.0075, "step": 10433 }, { "epoch": 4.747042766151046, "grad_norm": 0.26278804522821164, "learning_rate": 3.162328959349925e-08, "loss": 0.0015, "step": 10434 }, { "epoch": 4.747497725204732, "grad_norm": 0.44309401306565105, "learning_rate": 3.151008182440185e-08, "loss": 0.0045, "step": 10435 }, { "epoch": 4.747952684258417, "grad_norm": 0.5397422199552426, "learning_rate": 3.1397075769152576e-08, "loss": 0.0114, "step": 10436 }, { "epoch": 4.748407643312102, "grad_norm": 0.6525242627433353, "learning_rate": 3.128427143698626e-08, "loss": 0.0144, "step": 10437 }, { "epoch": 4.748862602365787, "grad_norm": 0.34662973880871534, "learning_rate": 3.1171668837120805e-08, "loss": 0.0031, "step": 10438 }, { "epoch": 4.749317561419472, "grad_norm": 0.09351662358691552, "learning_rate": 3.1059267978757466e-08, "loss": 0.0005, "step": 10439 }, { "epoch": 4.749772520473157, "grad_norm": 0.5790447731372799, "learning_rate": 3.094706887108084e-08, "loss": 0.005, "step": 10440 }, { "epoch": 4.750227479526843, "grad_norm": 0.8547345528810848, "learning_rate": 3.083507152325999e-08, "loss": 0.0145, "step": 10441 }, { "epoch": 4.750682438580528, "grad_norm": 0.43370496337296743, "learning_rate": 3.0723275944446185e-08, "loss": 0.0027, "step": 10442 }, { "epoch": 4.751137397634213, "grad_norm": 0.6037887797126837, "learning_rate": 3.061168214377519e-08, "loss": 0.0179, "step": 10443 }, { "epoch": 4.751592356687898, "grad_norm": 0.9377564680862849, "learning_rate": 3.050029013036554e-08, "loss": 0.0231, "step": 10444 }, { "epoch": 4.752047315741583, "grad_norm": 0.5766727279520155, "learning_rate": 3.0389099913320505e-08, "loss": 0.0144, "step": 10445 }, { "epoch": 4.752502274795268, "grad_norm": 0.5715522549313082, "learning_rate": 3.027811150172533e-08, "loss": 0.0066, "step": 10446 }, { "epoch": 4.752957233848954, "grad_norm": 0.539053219864098, "learning_rate": 3.016732490464997e-08, "loss": 0.0074, "step": 10447 }, { "epoch": 4.753412192902639, "grad_norm": 0.16048048893695657, "learning_rate": 3.005674013114662e-08, "loss": 0.0006, "step": 10448 }, { "epoch": 4.753867151956324, "grad_norm": 0.8613096704144463, "learning_rate": 2.994635719025279e-08, "loss": 0.0026, "step": 10449 }, { "epoch": 4.754322111010009, "grad_norm": 0.28309301921557256, "learning_rate": 2.983617609098766e-08, "loss": 0.0015, "step": 10450 }, { "epoch": 4.754777070063694, "grad_norm": 0.9025992571071293, "learning_rate": 2.9726196842355394e-08, "loss": 0.0063, "step": 10451 }, { "epoch": 4.755232029117379, "grad_norm": 1.0483825254819616, "learning_rate": 2.9616419453342426e-08, "loss": 0.0203, "step": 10452 }, { "epoch": 4.755686988171065, "grad_norm": 0.28437242243372723, "learning_rate": 2.9506843932919637e-08, "loss": 0.0023, "step": 10453 }, { "epoch": 4.75614194722475, "grad_norm": 0.8559389551604291, "learning_rate": 2.9397470290040697e-08, "loss": 0.012, "step": 10454 }, { "epoch": 4.756596906278435, "grad_norm": 0.35386546624258997, "learning_rate": 2.9288298533643455e-08, "loss": 0.0009, "step": 10455 }, { "epoch": 4.75705186533212, "grad_norm": 0.39266581419881663, "learning_rate": 2.917932867264911e-08, "loss": 0.0027, "step": 10456 }, { "epoch": 4.757506824385805, "grad_norm": 1.356808051533497, "learning_rate": 2.907056071596137e-08, "loss": 0.0041, "step": 10457 }, { "epoch": 4.757961783439491, "grad_norm": 0.457358916307574, "learning_rate": 2.896199467246924e-08, "loss": 0.007, "step": 10458 }, { "epoch": 4.758416742493176, "grad_norm": 0.263297752207414, "learning_rate": 2.8853630551043398e-08, "loss": 0.0014, "step": 10459 }, { "epoch": 4.758871701546861, "grad_norm": 0.37588841133134027, "learning_rate": 2.874546836053954e-08, "loss": 0.0015, "step": 10460 }, { "epoch": 4.759326660600546, "grad_norm": 1.276892821396039, "learning_rate": 2.8637508109795875e-08, "loss": 0.0036, "step": 10461 }, { "epoch": 4.759781619654231, "grad_norm": 0.20994833160016835, "learning_rate": 2.852974980763451e-08, "loss": 0.0013, "step": 10462 }, { "epoch": 4.760236578707916, "grad_norm": 0.4637825488808254, "learning_rate": 2.8422193462860903e-08, "loss": 0.0023, "step": 10463 }, { "epoch": 4.760691537761602, "grad_norm": 0.8526747496770892, "learning_rate": 2.8314839084263857e-08, "loss": 0.0032, "step": 10464 }, { "epoch": 4.761146496815287, "grad_norm": 0.21869299127422792, "learning_rate": 2.8207686680616354e-08, "loss": 0.0009, "step": 10465 }, { "epoch": 4.761601455868972, "grad_norm": 0.8089902134827464, "learning_rate": 2.8100736260674442e-08, "loss": 0.0046, "step": 10466 }, { "epoch": 4.762056414922657, "grad_norm": 0.6245923581127596, "learning_rate": 2.7993987833176972e-08, "loss": 0.0102, "step": 10467 }, { "epoch": 4.762511373976342, "grad_norm": 0.11301169657256528, "learning_rate": 2.7887441406847516e-08, "loss": 0.0006, "step": 10468 }, { "epoch": 4.762966333030027, "grad_norm": 0.3881921240658275, "learning_rate": 2.7781096990392443e-08, "loss": 0.0013, "step": 10469 }, { "epoch": 4.763421292083713, "grad_norm": 0.05270244404807204, "learning_rate": 2.767495459250147e-08, "loss": 0.0002, "step": 10470 }, { "epoch": 4.763876251137398, "grad_norm": 0.2975346139764498, "learning_rate": 2.756901422184821e-08, "loss": 0.001, "step": 10471 }, { "epoch": 4.764331210191083, "grad_norm": 0.9725592554041269, "learning_rate": 2.746327588709019e-08, "loss": 0.0047, "step": 10472 }, { "epoch": 4.764786169244768, "grad_norm": 0.5384227631075906, "learning_rate": 2.7357739596866884e-08, "loss": 0.002, "step": 10473 }, { "epoch": 4.765241128298453, "grad_norm": 0.21528735107172015, "learning_rate": 2.7252405359803057e-08, "loss": 0.0009, "step": 10474 }, { "epoch": 4.765696087352138, "grad_norm": 1.0690256289394977, "learning_rate": 2.714727318450572e-08, "loss": 0.0026, "step": 10475 }, { "epoch": 4.766151046405824, "grad_norm": 0.5887782400421867, "learning_rate": 2.7042343079566048e-08, "loss": 0.0075, "step": 10476 }, { "epoch": 4.766606005459509, "grad_norm": 8.056231602716611, "learning_rate": 2.6937615053558018e-08, "loss": 0.0022, "step": 10477 }, { "epoch": 4.767060964513194, "grad_norm": 0.5802509132414354, "learning_rate": 2.683308911503979e-08, "loss": 0.0025, "step": 10478 }, { "epoch": 4.767515923566879, "grad_norm": 0.6598315940630182, "learning_rate": 2.6728765272553135e-08, "loss": 0.0023, "step": 10479 }, { "epoch": 4.767970882620564, "grad_norm": 0.6479733186800662, "learning_rate": 2.662464353462263e-08, "loss": 0.0104, "step": 10480 }, { "epoch": 4.768425841674249, "grad_norm": 2.0423426443178947, "learning_rate": 2.6520723909756462e-08, "loss": 0.0466, "step": 10481 }, { "epoch": 4.768880800727935, "grad_norm": 0.7146206425198058, "learning_rate": 2.6417006406446456e-08, "loss": 0.0081, "step": 10482 }, { "epoch": 4.76933575978162, "grad_norm": 0.2055031225199938, "learning_rate": 2.631349103316805e-08, "loss": 0.0014, "step": 10483 }, { "epoch": 4.769790718835305, "grad_norm": 0.16789361804309333, "learning_rate": 2.621017779838031e-08, "loss": 0.0009, "step": 10484 }, { "epoch": 4.77024567788899, "grad_norm": 0.8704471104722631, "learning_rate": 2.61070667105251e-08, "loss": 0.0078, "step": 10485 }, { "epoch": 4.770700636942675, "grad_norm": 0.6484105083079831, "learning_rate": 2.6004157778028726e-08, "loss": 0.0081, "step": 10486 }, { "epoch": 4.77115559599636, "grad_norm": 0.6780629105435755, "learning_rate": 2.590145100929975e-08, "loss": 0.0103, "step": 10487 }, { "epoch": 4.771610555050046, "grad_norm": 0.9201375038055206, "learning_rate": 2.5798946412731452e-08, "loss": 0.0092, "step": 10488 }, { "epoch": 4.772065514103731, "grad_norm": 1.933801618762254, "learning_rate": 2.569664399669991e-08, "loss": 0.0133, "step": 10489 }, { "epoch": 4.772520473157416, "grad_norm": 0.36472961728759395, "learning_rate": 2.5594543769564828e-08, "loss": 0.004, "step": 10490 }, { "epoch": 4.772975432211101, "grad_norm": 0.6847867519382651, "learning_rate": 2.5492645739669253e-08, "loss": 0.0075, "step": 10491 }, { "epoch": 4.773430391264786, "grad_norm": 0.5191594275418843, "learning_rate": 2.539094991533958e-08, "loss": 0.009, "step": 10492 }, { "epoch": 4.773885350318471, "grad_norm": 0.9212776694673404, "learning_rate": 2.5289456304886385e-08, "loss": 0.0096, "step": 10493 }, { "epoch": 4.774340309372157, "grad_norm": 0.5890375008605112, "learning_rate": 2.518816491660331e-08, "loss": 0.0022, "step": 10494 }, { "epoch": 4.774795268425842, "grad_norm": 0.7632836229886721, "learning_rate": 2.5087075758767064e-08, "loss": 0.0019, "step": 10495 }, { "epoch": 4.7752502274795265, "grad_norm": 0.23519490667675125, "learning_rate": 2.4986188839638548e-08, "loss": 0.0006, "step": 10496 }, { "epoch": 4.775705186533212, "grad_norm": 0.6648427883943479, "learning_rate": 2.4885504167461437e-08, "loss": 0.0059, "step": 10497 }, { "epoch": 4.776160145586897, "grad_norm": 0.4199366163813325, "learning_rate": 2.47850217504636e-08, "loss": 0.0033, "step": 10498 }, { "epoch": 4.776615104640582, "grad_norm": 0.31277286486220984, "learning_rate": 2.4684741596855687e-08, "loss": 0.0007, "step": 10499 }, { "epoch": 4.777070063694268, "grad_norm": 0.49296595294298506, "learning_rate": 2.4584663714832257e-08, "loss": 0.0081, "step": 10500 }, { "epoch": 4.777525022747953, "grad_norm": 1.845861381768707, "learning_rate": 2.448478811257149e-08, "loss": 0.0061, "step": 10501 }, { "epoch": 4.7779799818016375, "grad_norm": 0.36754829263879046, "learning_rate": 2.438511479823408e-08, "loss": 0.0035, "step": 10502 }, { "epoch": 4.778434940855323, "grad_norm": 0.19179382725112515, "learning_rate": 2.428564377996545e-08, "loss": 0.0006, "step": 10503 }, { "epoch": 4.778889899909008, "grad_norm": 0.6822105617383928, "learning_rate": 2.4186375065894107e-08, "loss": 0.0119, "step": 10504 }, { "epoch": 4.779344858962693, "grad_norm": 0.28495534263017985, "learning_rate": 2.4087308664131338e-08, "loss": 0.0011, "step": 10505 }, { "epoch": 4.779799818016379, "grad_norm": 0.14893331158541206, "learning_rate": 2.398844458277233e-08, "loss": 0.0005, "step": 10506 }, { "epoch": 4.780254777070064, "grad_norm": 0.5187357415755414, "learning_rate": 2.388978282989618e-08, "loss": 0.0029, "step": 10507 }, { "epoch": 4.7807097361237485, "grad_norm": 0.49742738078880844, "learning_rate": 2.379132341356505e-08, "loss": 0.0061, "step": 10508 }, { "epoch": 4.781164695177434, "grad_norm": 0.48024425245406616, "learning_rate": 2.3693066341824444e-08, "loss": 0.0105, "step": 10509 }, { "epoch": 4.781619654231119, "grad_norm": 0.36409237327439203, "learning_rate": 2.359501162270378e-08, "loss": 0.0024, "step": 10510 }, { "epoch": 4.782074613284804, "grad_norm": 0.6424737021363716, "learning_rate": 2.3497159264214974e-08, "loss": 0.0177, "step": 10511 }, { "epoch": 4.78252957233849, "grad_norm": 0.17645394577034706, "learning_rate": 2.339950927435497e-08, "loss": 0.0011, "step": 10512 }, { "epoch": 4.782984531392175, "grad_norm": 0.39474535398369776, "learning_rate": 2.330206166110238e-08, "loss": 0.0154, "step": 10513 }, { "epoch": 4.7834394904458595, "grad_norm": 0.47405310214867113, "learning_rate": 2.320481643242112e-08, "loss": 0.0172, "step": 10514 }, { "epoch": 4.783894449499545, "grad_norm": 0.5135546946537195, "learning_rate": 2.3107773596257042e-08, "loss": 0.009, "step": 10515 }, { "epoch": 4.78434940855323, "grad_norm": 0.26430763321868583, "learning_rate": 2.3010933160539927e-08, "loss": 0.0007, "step": 10516 }, { "epoch": 4.784804367606915, "grad_norm": 0.9602628213976433, "learning_rate": 2.291429513318344e-08, "loss": 0.0253, "step": 10517 }, { "epoch": 4.785259326660601, "grad_norm": 0.5654834474144939, "learning_rate": 2.2817859522084597e-08, "loss": 0.01, "step": 10518 }, { "epoch": 4.785714285714286, "grad_norm": 0.2540607730655405, "learning_rate": 2.2721626335123202e-08, "loss": 0.0009, "step": 10519 }, { "epoch": 4.7861692447679705, "grad_norm": 0.24412702672123715, "learning_rate": 2.262559558016325e-08, "loss": 0.001, "step": 10520 }, { "epoch": 4.786624203821656, "grad_norm": 0.08452599614059753, "learning_rate": 2.2529767265051795e-08, "loss": 0.0002, "step": 10521 }, { "epoch": 4.787079162875341, "grad_norm": 0.2797188708454739, "learning_rate": 2.2434141397619513e-08, "loss": 0.0007, "step": 10522 }, { "epoch": 4.787534121929026, "grad_norm": 0.4953768806754102, "learning_rate": 2.2338717985680993e-08, "loss": 0.0151, "step": 10523 }, { "epoch": 4.787989080982712, "grad_norm": 0.1284425917557068, "learning_rate": 2.2243497037033325e-08, "loss": 0.0003, "step": 10524 }, { "epoch": 4.788444040036397, "grad_norm": 0.5011268963323009, "learning_rate": 2.214847855945751e-08, "loss": 0.0085, "step": 10525 }, { "epoch": 4.788898999090081, "grad_norm": 0.7784021687157576, "learning_rate": 2.205366256071817e-08, "loss": 0.0163, "step": 10526 }, { "epoch": 4.789353958143767, "grad_norm": 1.0142340286768408, "learning_rate": 2.1959049048562997e-08, "loss": 0.0086, "step": 10527 }, { "epoch": 4.789808917197452, "grad_norm": 2.765203145413155, "learning_rate": 2.186463803072386e-08, "loss": 0.0042, "step": 10528 }, { "epoch": 4.790263876251137, "grad_norm": 0.2979070497166688, "learning_rate": 2.1770429514915425e-08, "loss": 0.0032, "step": 10529 }, { "epoch": 4.790718835304823, "grad_norm": 0.5437526306316539, "learning_rate": 2.1676423508835698e-08, "loss": 0.0011, "step": 10530 }, { "epoch": 4.7911737943585075, "grad_norm": 0.22335994421534133, "learning_rate": 2.158262002016659e-08, "loss": 0.0011, "step": 10531 }, { "epoch": 4.791628753412192, "grad_norm": 1.0405326219746052, "learning_rate": 2.1489019056573636e-08, "loss": 0.018, "step": 10532 }, { "epoch": 4.792083712465878, "grad_norm": 0.393882236222705, "learning_rate": 2.1395620625704882e-08, "loss": 0.0024, "step": 10533 }, { "epoch": 4.792538671519563, "grad_norm": 0.679941890938747, "learning_rate": 2.130242473519284e-08, "loss": 0.0096, "step": 10534 }, { "epoch": 4.792993630573249, "grad_norm": 0.5734903161799437, "learning_rate": 2.1209431392653078e-08, "loss": 0.0023, "step": 10535 }, { "epoch": 4.793448589626934, "grad_norm": 0.19912492845545463, "learning_rate": 2.1116640605684247e-08, "loss": 0.0009, "step": 10536 }, { "epoch": 4.7939035486806185, "grad_norm": 0.23508177287855414, "learning_rate": 2.1024052381869164e-08, "loss": 0.0006, "step": 10537 }, { "epoch": 4.794358507734304, "grad_norm": 0.7568993187361771, "learning_rate": 2.0931666728773448e-08, "loss": 0.0061, "step": 10538 }, { "epoch": 4.794813466787989, "grad_norm": 1.0784953002977076, "learning_rate": 2.0839483653946613e-08, "loss": 0.0068, "step": 10539 }, { "epoch": 4.795268425841674, "grad_norm": 0.46725385902748723, "learning_rate": 2.0747503164921522e-08, "loss": 0.0188, "step": 10540 }, { "epoch": 4.79572338489536, "grad_norm": 0.4079065066807701, "learning_rate": 2.0655725269213833e-08, "loss": 0.0043, "step": 10541 }, { "epoch": 4.796178343949045, "grad_norm": 1.0958023993482915, "learning_rate": 2.056414997432421e-08, "loss": 0.0056, "step": 10542 }, { "epoch": 4.7966333030027295, "grad_norm": 0.8080124763209007, "learning_rate": 2.0472777287735e-08, "loss": 0.0074, "step": 10543 }, { "epoch": 4.797088262056415, "grad_norm": 0.21649670422995923, "learning_rate": 2.0381607216913012e-08, "loss": 0.0009, "step": 10544 }, { "epoch": 4.7975432211101, "grad_norm": 0.34317088585734373, "learning_rate": 2.029063976930784e-08, "loss": 0.002, "step": 10545 }, { "epoch": 4.797998180163785, "grad_norm": 0.2673056729391347, "learning_rate": 2.0199874952353816e-08, "loss": 0.0013, "step": 10546 }, { "epoch": 4.798453139217471, "grad_norm": 0.5479354295173678, "learning_rate": 2.0109312773467228e-08, "loss": 0.0036, "step": 10547 }, { "epoch": 4.798908098271156, "grad_norm": 0.7563068895768117, "learning_rate": 2.0018953240048267e-08, "loss": 0.0105, "step": 10548 }, { "epoch": 4.7993630573248405, "grad_norm": 0.39672470638434587, "learning_rate": 1.9928796359481306e-08, "loss": 0.0048, "step": 10549 }, { "epoch": 4.799818016378526, "grad_norm": 0.6352984898535043, "learning_rate": 1.9838842139132953e-08, "loss": 0.0077, "step": 10550 }, { "epoch": 4.800272975432211, "grad_norm": 0.3431829337006467, "learning_rate": 1.974909058635399e-08, "loss": 0.002, "step": 10551 }, { "epoch": 4.800727934485896, "grad_norm": 0.6128334140194811, "learning_rate": 1.9659541708478836e-08, "loss": 0.0096, "step": 10552 }, { "epoch": 4.801182893539582, "grad_norm": 0.2612432358927972, "learning_rate": 1.9570195512824963e-08, "loss": 0.0013, "step": 10553 }, { "epoch": 4.801637852593267, "grad_norm": 0.3856035821555687, "learning_rate": 1.9481052006692924e-08, "loss": 0.0027, "step": 10554 }, { "epoch": 4.8020928116469515, "grad_norm": 0.45252285968035233, "learning_rate": 1.9392111197367446e-08, "loss": 0.0026, "step": 10555 }, { "epoch": 4.802547770700637, "grad_norm": 0.3538636456141296, "learning_rate": 1.930337309211633e-08, "loss": 0.0017, "step": 10556 }, { "epoch": 4.803002729754322, "grad_norm": 0.8154013405738001, "learning_rate": 1.9214837698190992e-08, "loss": 0.0182, "step": 10557 }, { "epoch": 4.803457688808007, "grad_norm": 0.3988331709924424, "learning_rate": 1.9126505022825924e-08, "loss": 0.0036, "step": 10558 }, { "epoch": 4.803912647861693, "grad_norm": 0.4942946129142108, "learning_rate": 1.9038375073239245e-08, "loss": 0.0036, "step": 10559 }, { "epoch": 4.804367606915378, "grad_norm": 0.16597244673924078, "learning_rate": 1.8950447856632694e-08, "loss": 0.0005, "step": 10560 }, { "epoch": 4.804822565969062, "grad_norm": 0.5518301061240428, "learning_rate": 1.8862723380191072e-08, "loss": 0.0039, "step": 10561 }, { "epoch": 4.805277525022748, "grad_norm": 0.16690656753460054, "learning_rate": 1.8775201651083097e-08, "loss": 0.0006, "step": 10562 }, { "epoch": 4.805732484076433, "grad_norm": 0.548028645518643, "learning_rate": 1.8687882676460546e-08, "loss": 0.0284, "step": 10563 }, { "epoch": 4.806187443130118, "grad_norm": 0.08459109507495932, "learning_rate": 1.860076646345882e-08, "loss": 0.0002, "step": 10564 }, { "epoch": 4.806642402183804, "grad_norm": 0.8486250652584314, "learning_rate": 1.8513853019196393e-08, "loss": 0.0066, "step": 10565 }, { "epoch": 4.8070973612374885, "grad_norm": 0.1306285023318396, "learning_rate": 1.842714235077564e-08, "loss": 0.0003, "step": 10566 }, { "epoch": 4.807552320291173, "grad_norm": 2.359340808687295, "learning_rate": 1.834063446528228e-08, "loss": 0.0142, "step": 10567 }, { "epoch": 4.808007279344859, "grad_norm": 0.6894796648874297, "learning_rate": 1.8254329369785106e-08, "loss": 0.0035, "step": 10568 }, { "epoch": 4.808462238398544, "grad_norm": 1.2086769511858377, "learning_rate": 1.816822707133653e-08, "loss": 0.003, "step": 10569 }, { "epoch": 4.80891719745223, "grad_norm": 0.7113364542744995, "learning_rate": 1.808232757697259e-08, "loss": 0.0118, "step": 10570 }, { "epoch": 4.809372156505915, "grad_norm": 0.2575459743782993, "learning_rate": 1.7996630893712675e-08, "loss": 0.0015, "step": 10571 }, { "epoch": 4.8098271155595995, "grad_norm": 0.6351462173043833, "learning_rate": 1.791113702855951e-08, "loss": 0.0077, "step": 10572 }, { "epoch": 4.810282074613285, "grad_norm": 0.5266906476491618, "learning_rate": 1.782584598849918e-08, "loss": 0.0073, "step": 10573 }, { "epoch": 4.81073703366697, "grad_norm": 0.6625259846645816, "learning_rate": 1.7740757780501383e-08, "loss": 0.0107, "step": 10574 }, { "epoch": 4.811191992720655, "grad_norm": 0.4365084347512612, "learning_rate": 1.7655872411518892e-08, "loss": 0.0025, "step": 10575 }, { "epoch": 4.811646951774341, "grad_norm": 0.20399753640535856, "learning_rate": 1.7571189888488384e-08, "loss": 0.001, "step": 10576 }, { "epoch": 4.812101910828026, "grad_norm": 0.47060417141021804, "learning_rate": 1.7486710218329872e-08, "loss": 0.0019, "step": 10577 }, { "epoch": 4.8125568698817105, "grad_norm": 0.46014814855582303, "learning_rate": 1.740243340794645e-08, "loss": 0.0047, "step": 10578 }, { "epoch": 4.813011828935396, "grad_norm": 0.4141073761194633, "learning_rate": 1.7318359464224555e-08, "loss": 0.0022, "step": 10579 }, { "epoch": 4.813466787989081, "grad_norm": 0.5639018254079473, "learning_rate": 1.7234488394034798e-08, "loss": 0.0058, "step": 10580 }, { "epoch": 4.813921747042766, "grad_norm": 0.5231775288714801, "learning_rate": 1.7150820204230868e-08, "loss": 0.0093, "step": 10581 }, { "epoch": 4.814376706096452, "grad_norm": 0.5033031544621533, "learning_rate": 1.7067354901649235e-08, "loss": 0.0049, "step": 10582 }, { "epoch": 4.814831665150137, "grad_norm": 0.47288589643144424, "learning_rate": 1.6984092493110283e-08, "loss": 0.0038, "step": 10583 }, { "epoch": 4.8152866242038215, "grad_norm": 0.6548178579776038, "learning_rate": 1.6901032985418286e-08, "loss": 0.0096, "step": 10584 }, { "epoch": 4.815741583257507, "grad_norm": 0.3088024422056283, "learning_rate": 1.6818176385360318e-08, "loss": 0.0013, "step": 10585 }, { "epoch": 4.816196542311192, "grad_norm": 0.6001093769515757, "learning_rate": 1.6735522699707076e-08, "loss": 0.0029, "step": 10586 }, { "epoch": 4.816651501364877, "grad_norm": 0.39297783438581435, "learning_rate": 1.6653071935212872e-08, "loss": 0.0053, "step": 10587 }, { "epoch": 4.817106460418563, "grad_norm": 0.34988442993313323, "learning_rate": 1.6570824098614547e-08, "loss": 0.0059, "step": 10588 }, { "epoch": 4.817561419472248, "grad_norm": 0.18999497342833255, "learning_rate": 1.6488779196633387e-08, "loss": 0.0009, "step": 10589 }, { "epoch": 4.8180163785259325, "grad_norm": 1.1560776642085986, "learning_rate": 1.6406937235973753e-08, "loss": 0.0074, "step": 10590 }, { "epoch": 4.818471337579618, "grad_norm": 0.6899428989631277, "learning_rate": 1.6325298223323626e-08, "loss": 0.0021, "step": 10591 }, { "epoch": 4.818926296633303, "grad_norm": 0.6228080506325112, "learning_rate": 1.6243862165353784e-08, "loss": 0.0056, "step": 10592 }, { "epoch": 4.819381255686988, "grad_norm": 0.7499651206204238, "learning_rate": 1.6162629068718904e-08, "loss": 0.0161, "step": 10593 }, { "epoch": 4.819836214740674, "grad_norm": 0.48411638304366417, "learning_rate": 1.6081598940057287e-08, "loss": 0.0167, "step": 10594 }, { "epoch": 4.820291173794359, "grad_norm": 0.8078334043739172, "learning_rate": 1.600077178599002e-08, "loss": 0.0259, "step": 10595 }, { "epoch": 4.820746132848043, "grad_norm": 0.1537209065496083, "learning_rate": 1.5920147613122106e-08, "loss": 0.0005, "step": 10596 }, { "epoch": 4.821201091901729, "grad_norm": 0.5675095022494631, "learning_rate": 1.5839726428041602e-08, "loss": 0.003, "step": 10597 }, { "epoch": 4.821656050955414, "grad_norm": 0.4974680640979575, "learning_rate": 1.5759508237320476e-08, "loss": 0.0029, "step": 10598 }, { "epoch": 4.822111010009099, "grad_norm": 0.711715471159512, "learning_rate": 1.5679493047513482e-08, "loss": 0.0055, "step": 10599 }, { "epoch": 4.822565969062785, "grad_norm": 0.46003457359117056, "learning_rate": 1.5599680865159285e-08, "loss": 0.0043, "step": 10600 }, { "epoch": 4.8230209281164695, "grad_norm": 0.25140926182729595, "learning_rate": 1.5520071696779605e-08, "loss": 0.0014, "step": 10601 }, { "epoch": 4.823475887170154, "grad_norm": 0.1933769198742454, "learning_rate": 1.5440665548879796e-08, "loss": 0.0008, "step": 10602 }, { "epoch": 4.82393084622384, "grad_norm": 1.110219033894772, "learning_rate": 1.5361462427948838e-08, "loss": 0.0191, "step": 10603 }, { "epoch": 4.824385805277525, "grad_norm": 0.3397536186898075, "learning_rate": 1.5282462340458493e-08, "loss": 0.0016, "step": 10604 }, { "epoch": 4.82484076433121, "grad_norm": 0.9853100191158017, "learning_rate": 1.5203665292864435e-08, "loss": 0.0223, "step": 10605 }, { "epoch": 4.825295723384896, "grad_norm": 0.7966244756530954, "learning_rate": 1.5125071291605675e-08, "loss": 0.0105, "step": 10606 }, { "epoch": 4.8257506824385805, "grad_norm": 0.37315211562100464, "learning_rate": 1.504668034310458e-08, "loss": 0.0012, "step": 10607 }, { "epoch": 4.826205641492265, "grad_norm": 0.8902712576061755, "learning_rate": 1.496849245376658e-08, "loss": 0.0108, "step": 10608 }, { "epoch": 4.826660600545951, "grad_norm": 0.9058909284980615, "learning_rate": 1.4890507629981288e-08, "loss": 0.0085, "step": 10609 }, { "epoch": 4.827115559599636, "grad_norm": 0.9065986912534721, "learning_rate": 1.4812725878120827e-08, "loss": 0.0021, "step": 10610 }, { "epoch": 4.827570518653321, "grad_norm": 0.5764898012232238, "learning_rate": 1.47351472045415e-08, "loss": 0.0081, "step": 10611 }, { "epoch": 4.828025477707007, "grad_norm": 0.12442909921781581, "learning_rate": 1.4657771615582683e-08, "loss": 0.0007, "step": 10612 }, { "epoch": 4.8284804367606915, "grad_norm": 0.6251702159850926, "learning_rate": 1.4580599117567096e-08, "loss": 0.007, "step": 10613 }, { "epoch": 4.828935395814376, "grad_norm": 0.33019751575839784, "learning_rate": 1.4503629716800804e-08, "loss": 0.0018, "step": 10614 }, { "epoch": 4.829390354868062, "grad_norm": 0.9675157271659875, "learning_rate": 1.44268634195735e-08, "loss": 0.0009, "step": 10615 }, { "epoch": 4.829845313921747, "grad_norm": 0.5195331314362539, "learning_rate": 1.435030023215822e-08, "loss": 0.0068, "step": 10616 }, { "epoch": 4.830300272975432, "grad_norm": 0.10406016139763573, "learning_rate": 1.4273940160811073e-08, "loss": 0.0004, "step": 10617 }, { "epoch": 4.830755232029118, "grad_norm": 0.41941469411690524, "learning_rate": 1.4197783211772343e-08, "loss": 0.0014, "step": 10618 }, { "epoch": 4.8312101910828025, "grad_norm": 0.6655231237190279, "learning_rate": 1.412182939126483e-08, "loss": 0.0079, "step": 10619 }, { "epoch": 4.831665150136487, "grad_norm": 0.5221243288673941, "learning_rate": 1.4046078705495514e-08, "loss": 0.0039, "step": 10620 }, { "epoch": 4.832120109190173, "grad_norm": 0.4357341424240143, "learning_rate": 1.3970531160654166e-08, "loss": 0.0039, "step": 10621 }, { "epoch": 4.832575068243858, "grad_norm": 0.8683428967464235, "learning_rate": 1.3895186762913903e-08, "loss": 0.0134, "step": 10622 }, { "epoch": 4.833030027297543, "grad_norm": 0.7453178169093733, "learning_rate": 1.3820045518432025e-08, "loss": 0.0245, "step": 10623 }, { "epoch": 4.833484986351229, "grad_norm": 0.44846870017193213, "learning_rate": 1.3745107433348615e-08, "loss": 0.0038, "step": 10624 }, { "epoch": 4.8339399454049135, "grad_norm": 0.551512052852457, "learning_rate": 1.3670372513787111e-08, "loss": 0.0086, "step": 10625 }, { "epoch": 4.834394904458598, "grad_norm": 0.675728393264126, "learning_rate": 1.3595840765854574e-08, "loss": 0.0082, "step": 10626 }, { "epoch": 4.834849863512284, "grad_norm": 0.14717181554395695, "learning_rate": 1.3521512195641407e-08, "loss": 0.0009, "step": 10627 }, { "epoch": 4.835304822565969, "grad_norm": 0.5926801425879694, "learning_rate": 1.3447386809221364e-08, "loss": 0.0041, "step": 10628 }, { "epoch": 4.835759781619654, "grad_norm": 0.3395662711306018, "learning_rate": 1.3373464612651821e-08, "loss": 0.0051, "step": 10629 }, { "epoch": 4.83621474067334, "grad_norm": 0.04710220619590608, "learning_rate": 1.3299745611973224e-08, "loss": 0.0002, "step": 10630 }, { "epoch": 4.836669699727024, "grad_norm": 0.33654569285709385, "learning_rate": 1.3226229813209645e-08, "loss": 0.0005, "step": 10631 }, { "epoch": 4.837124658780709, "grad_norm": 0.6827733185154327, "learning_rate": 1.3152917222368222e-08, "loss": 0.0035, "step": 10632 }, { "epoch": 4.837579617834395, "grad_norm": 0.6969960143369023, "learning_rate": 1.3079807845439996e-08, "loss": 0.0146, "step": 10633 }, { "epoch": 4.83803457688808, "grad_norm": 0.24726451024271073, "learning_rate": 1.3006901688399077e-08, "loss": 0.0023, "step": 10634 }, { "epoch": 4.838489535941765, "grad_norm": 0.04112584954208512, "learning_rate": 1.293419875720292e-08, "loss": 0.0001, "step": 10635 }, { "epoch": 4.8389444949954505, "grad_norm": 0.28413527302295905, "learning_rate": 1.2861699057792887e-08, "loss": 0.0024, "step": 10636 }, { "epoch": 4.839399454049135, "grad_norm": 0.29142027008262966, "learning_rate": 1.278940259609257e-08, "loss": 0.002, "step": 10637 }, { "epoch": 4.83985441310282, "grad_norm": 0.8340138665984904, "learning_rate": 1.2717309378010024e-08, "loss": 0.0063, "step": 10638 }, { "epoch": 4.840309372156506, "grad_norm": 0.660054687444536, "learning_rate": 1.2645419409436921e-08, "loss": 0.0055, "step": 10639 }, { "epoch": 4.840764331210191, "grad_norm": 0.31901210870488333, "learning_rate": 1.2573732696247176e-08, "loss": 0.0013, "step": 10640 }, { "epoch": 4.841219290263876, "grad_norm": 0.42114819530480246, "learning_rate": 1.250224924429888e-08, "loss": 0.0058, "step": 10641 }, { "epoch": 4.8416742493175615, "grad_norm": 0.3099384473950171, "learning_rate": 1.2430969059433196e-08, "loss": 0.0024, "step": 10642 }, { "epoch": 4.842129208371246, "grad_norm": 0.19839566696973113, "learning_rate": 1.2359892147474906e-08, "loss": 0.0006, "step": 10643 }, { "epoch": 4.842584167424932, "grad_norm": 0.788598875650103, "learning_rate": 1.2289018514232421e-08, "loss": 0.0068, "step": 10644 }, { "epoch": 4.843039126478617, "grad_norm": 0.3353911846106786, "learning_rate": 1.2218348165496663e-08, "loss": 0.001, "step": 10645 }, { "epoch": 4.843494085532302, "grad_norm": 0.4813999872483867, "learning_rate": 1.2147881107043014e-08, "loss": 0.0032, "step": 10646 }, { "epoch": 4.843949044585988, "grad_norm": 0.4793806039788295, "learning_rate": 1.2077617344629366e-08, "loss": 0.0044, "step": 10647 }, { "epoch": 4.8444040036396725, "grad_norm": 0.10879628550742493, "learning_rate": 1.2007556883997518e-08, "loss": 0.0004, "step": 10648 }, { "epoch": 4.844858962693357, "grad_norm": 0.11080087520042547, "learning_rate": 1.193769973087261e-08, "loss": 0.0005, "step": 10649 }, { "epoch": 4.845313921747043, "grad_norm": 0.6148083168615605, "learning_rate": 1.1868045890962576e-08, "loss": 0.0057, "step": 10650 }, { "epoch": 4.845768880800728, "grad_norm": 1.5651068429463417, "learning_rate": 1.1798595369959532e-08, "loss": 0.004, "step": 10651 }, { "epoch": 4.846223839854413, "grad_norm": 0.32267062376177946, "learning_rate": 1.1729348173538934e-08, "loss": 0.0022, "step": 10652 }, { "epoch": 4.846678798908099, "grad_norm": 0.24183165697959344, "learning_rate": 1.166030430735876e-08, "loss": 0.001, "step": 10653 }, { "epoch": 4.8471337579617835, "grad_norm": 0.6356158160966946, "learning_rate": 1.159146377706144e-08, "loss": 0.0116, "step": 10654 }, { "epoch": 4.847588717015468, "grad_norm": 0.4462762706907423, "learning_rate": 1.1522826588272196e-08, "loss": 0.0036, "step": 10655 }, { "epoch": 4.848043676069154, "grad_norm": 2.5957999787941186, "learning_rate": 1.1454392746599596e-08, "loss": 0.0062, "step": 10656 }, { "epoch": 4.848498635122839, "grad_norm": 0.14644365344098673, "learning_rate": 1.1386162257636113e-08, "loss": 0.0006, "step": 10657 }, { "epoch": 4.848953594176524, "grad_norm": 0.6314430133165478, "learning_rate": 1.131813512695673e-08, "loss": 0.0052, "step": 10658 }, { "epoch": 4.84940855323021, "grad_norm": 0.7355086912775471, "learning_rate": 1.1250311360120335e-08, "loss": 0.0066, "step": 10659 }, { "epoch": 4.8498635122838945, "grad_norm": 0.21067843823371618, "learning_rate": 1.1182690962669719e-08, "loss": 0.0009, "step": 10660 }, { "epoch": 4.850318471337579, "grad_norm": 0.6081409644542831, "learning_rate": 1.1115273940130178e-08, "loss": 0.0034, "step": 10661 }, { "epoch": 4.850773430391265, "grad_norm": 0.22501664661452614, "learning_rate": 1.1048060298010644e-08, "loss": 0.0011, "step": 10662 }, { "epoch": 4.85122838944495, "grad_norm": 0.5281331916469054, "learning_rate": 1.0981050041803665e-08, "loss": 0.0049, "step": 10663 }, { "epoch": 4.851683348498635, "grad_norm": 0.20756475542824068, "learning_rate": 1.091424317698514e-08, "loss": 0.0009, "step": 10664 }, { "epoch": 4.852138307552321, "grad_norm": 0.9168955627449218, "learning_rate": 1.0847639709013757e-08, "loss": 0.0051, "step": 10665 }, { "epoch": 4.852593266606005, "grad_norm": 0.13838712759793087, "learning_rate": 1.0781239643332387e-08, "loss": 0.0007, "step": 10666 }, { "epoch": 4.85304822565969, "grad_norm": 0.7883909120861291, "learning_rate": 1.0715042985366964e-08, "loss": 0.0126, "step": 10667 }, { "epoch": 4.853503184713376, "grad_norm": 1.6477759295430365, "learning_rate": 1.0649049740526774e-08, "loss": 0.0321, "step": 10668 }, { "epoch": 4.853958143767061, "grad_norm": 0.31227452797233973, "learning_rate": 1.0583259914204446e-08, "loss": 0.0096, "step": 10669 }, { "epoch": 4.854413102820746, "grad_norm": 1.121263346947775, "learning_rate": 1.051767351177596e-08, "loss": 0.0159, "step": 10670 }, { "epoch": 4.8548680618744315, "grad_norm": 3.9299778463844697, "learning_rate": 1.045229053860064e-08, "loss": 0.0203, "step": 10671 }, { "epoch": 4.855323020928116, "grad_norm": 0.3336554282183522, "learning_rate": 1.038711100002171e-08, "loss": 0.0009, "step": 10672 }, { "epoch": 4.855777979981801, "grad_norm": 0.2676885806508192, "learning_rate": 1.032213490136491e-08, "loss": 0.0012, "step": 10673 }, { "epoch": 4.856232939035487, "grad_norm": 0.24631889805156035, "learning_rate": 1.0257362247939884e-08, "loss": 0.0017, "step": 10674 }, { "epoch": 4.856687898089172, "grad_norm": 0.3831308027069852, "learning_rate": 1.0192793045039894e-08, "loss": 0.0021, "step": 10675 }, { "epoch": 4.857142857142857, "grad_norm": 0.37177308491996175, "learning_rate": 1.0128427297940724e-08, "loss": 0.0022, "step": 10676 }, { "epoch": 4.8575978161965425, "grad_norm": 0.16619395068177556, "learning_rate": 1.006426501190233e-08, "loss": 0.0008, "step": 10677 }, { "epoch": 4.858052775250227, "grad_norm": 0.3490992914086953, "learning_rate": 1.000030619216802e-08, "loss": 0.0011, "step": 10678 }, { "epoch": 4.858507734303913, "grad_norm": 0.40277981534009094, "learning_rate": 9.936550843963888e-09, "loss": 0.003, "step": 10679 }, { "epoch": 4.858962693357598, "grad_norm": 0.2605551720124959, "learning_rate": 9.872998972499381e-09, "loss": 0.002, "step": 10680 }, { "epoch": 4.859417652411283, "grad_norm": 0.0583034065932554, "learning_rate": 9.8096505829684e-09, "loss": 0.0001, "step": 10681 }, { "epoch": 4.859872611464969, "grad_norm": 0.24472811294205996, "learning_rate": 9.746505680547358e-09, "loss": 0.0006, "step": 10682 }, { "epoch": 4.8603275705186535, "grad_norm": 1.0060058453502354, "learning_rate": 9.68356427039574e-09, "loss": 0.0103, "step": 10683 }, { "epoch": 4.860782529572338, "grad_norm": 0.40755522237893127, "learning_rate": 9.62082635765721e-09, "loss": 0.0065, "step": 10684 }, { "epoch": 4.861237488626024, "grad_norm": 0.9124709668151496, "learning_rate": 9.558291947457943e-09, "loss": 0.015, "step": 10685 }, { "epoch": 4.861692447679709, "grad_norm": 0.5233321546055363, "learning_rate": 9.495961044908852e-09, "loss": 0.0078, "step": 10686 }, { "epoch": 4.862147406733394, "grad_norm": 0.626883320251926, "learning_rate": 9.433833655102253e-09, "loss": 0.013, "step": 10687 }, { "epoch": 4.86260236578708, "grad_norm": 0.5414580743500721, "learning_rate": 9.371909783116028e-09, "loss": 0.0069, "step": 10688 }, { "epoch": 4.8630573248407645, "grad_norm": 0.30136819908029716, "learning_rate": 9.310189434009464e-09, "loss": 0.0024, "step": 10689 }, { "epoch": 4.863512283894449, "grad_norm": 0.16349571840774443, "learning_rate": 9.248672612826304e-09, "loss": 0.0006, "step": 10690 }, { "epoch": 4.863967242948135, "grad_norm": 0.7311442017457582, "learning_rate": 9.187359324593637e-09, "loss": 0.0072, "step": 10691 }, { "epoch": 4.86442220200182, "grad_norm": 0.3099271484412273, "learning_rate": 9.126249574321344e-09, "loss": 0.0024, "step": 10692 }, { "epoch": 4.864877161055505, "grad_norm": 1.0087404124650612, "learning_rate": 9.065343367003488e-09, "loss": 0.007, "step": 10693 }, { "epoch": 4.865332120109191, "grad_norm": 0.5115679211189336, "learning_rate": 9.004640707616641e-09, "loss": 0.0091, "step": 10694 }, { "epoch": 4.8657870791628755, "grad_norm": 0.8061687792254901, "learning_rate": 8.944141601121559e-09, "loss": 0.0135, "step": 10695 }, { "epoch": 4.86624203821656, "grad_norm": 0.5114994603468391, "learning_rate": 8.88384605246151e-09, "loss": 0.0113, "step": 10696 }, { "epoch": 4.866696997270246, "grad_norm": 1.8558305386253875, "learning_rate": 8.823754066563662e-09, "loss": 0.0104, "step": 10697 }, { "epoch": 4.867151956323931, "grad_norm": 0.4437775952384882, "learning_rate": 8.763865648338809e-09, "loss": 0.0086, "step": 10698 }, { "epoch": 4.867606915377616, "grad_norm": 0.546166977146709, "learning_rate": 8.70418080268054e-09, "loss": 0.0015, "step": 10699 }, { "epoch": 4.868061874431302, "grad_norm": 0.33477375575276463, "learning_rate": 8.644699534466061e-09, "loss": 0.0007, "step": 10700 }, { "epoch": 4.868516833484986, "grad_norm": 0.3557366488799452, "learning_rate": 8.585421848555653e-09, "loss": 0.0019, "step": 10701 }, { "epoch": 4.868971792538671, "grad_norm": 0.4243738081108588, "learning_rate": 8.526347749793495e-09, "loss": 0.0048, "step": 10702 }, { "epoch": 4.869426751592357, "grad_norm": 0.5329699140706184, "learning_rate": 8.467477243006838e-09, "loss": 0.0106, "step": 10703 }, { "epoch": 4.869881710646042, "grad_norm": 0.6060778183653093, "learning_rate": 8.408810333006278e-09, "loss": 0.015, "step": 10704 }, { "epoch": 4.870336669699727, "grad_norm": 0.27115281800376445, "learning_rate": 8.350347024586036e-09, "loss": 0.0019, "step": 10705 }, { "epoch": 4.8707916287534125, "grad_norm": 0.6366323408740789, "learning_rate": 8.292087322522846e-09, "loss": 0.005, "step": 10706 }, { "epoch": 4.871246587807097, "grad_norm": 0.12242878331506885, "learning_rate": 8.234031231578177e-09, "loss": 0.0003, "step": 10707 }, { "epoch": 4.871701546860782, "grad_norm": 0.6382051794200383, "learning_rate": 8.176178756495457e-09, "loss": 0.0238, "step": 10708 }, { "epoch": 4.872156505914468, "grad_norm": 1.6344968488673806, "learning_rate": 8.11852990200257e-09, "loss": 0.0102, "step": 10709 }, { "epoch": 4.872611464968153, "grad_norm": 0.9983085343403226, "learning_rate": 8.061084672810193e-09, "loss": 0.0155, "step": 10710 }, { "epoch": 4.873066424021838, "grad_norm": 0.5708994435485224, "learning_rate": 8.003843073612627e-09, "loss": 0.0111, "step": 10711 }, { "epoch": 4.8735213830755235, "grad_norm": 0.3800132466650493, "learning_rate": 7.946805109086964e-09, "loss": 0.0021, "step": 10712 }, { "epoch": 4.873976342129208, "grad_norm": 0.4328633473900317, "learning_rate": 7.889970783894751e-09, "loss": 0.0031, "step": 10713 }, { "epoch": 4.874431301182893, "grad_norm": 0.11336505784572609, "learning_rate": 7.833340102679498e-09, "loss": 0.0004, "step": 10714 }, { "epoch": 4.874886260236579, "grad_norm": 0.45684786260008825, "learning_rate": 7.77691307006917e-09, "loss": 0.0019, "step": 10715 }, { "epoch": 4.875341219290264, "grad_norm": 0.6386060378427625, "learning_rate": 7.720689690674798e-09, "loss": 0.0076, "step": 10716 }, { "epoch": 4.875796178343949, "grad_norm": 1.117693634858015, "learning_rate": 7.664669969090765e-09, "loss": 0.0172, "step": 10717 }, { "epoch": 4.8762511373976345, "grad_norm": 0.4683331906858081, "learning_rate": 7.60885390989452e-09, "loss": 0.0013, "step": 10718 }, { "epoch": 4.876706096451319, "grad_norm": 0.0947692977894827, "learning_rate": 7.553241517647136e-09, "loss": 0.0005, "step": 10719 }, { "epoch": 4.877161055505004, "grad_norm": 0.9135080346221078, "learning_rate": 7.497832796893311e-09, "loss": 0.0165, "step": 10720 }, { "epoch": 4.87761601455869, "grad_norm": 0.7749763290373621, "learning_rate": 7.442627752160259e-09, "loss": 0.0075, "step": 10721 }, { "epoch": 4.878070973612375, "grad_norm": 0.4740735028713511, "learning_rate": 7.387626387959368e-09, "loss": 0.0032, "step": 10722 }, { "epoch": 4.87852593266606, "grad_norm": 0.8918585121329317, "learning_rate": 7.332828708785378e-09, "loss": 0.0096, "step": 10723 }, { "epoch": 4.8789808917197455, "grad_norm": 0.7832356303203454, "learning_rate": 7.27823471911554e-09, "loss": 0.0149, "step": 10724 }, { "epoch": 4.87943585077343, "grad_norm": 0.7134419460075775, "learning_rate": 7.223844423411564e-09, "loss": 0.0045, "step": 10725 }, { "epoch": 4.879890809827115, "grad_norm": 0.36711331866114455, "learning_rate": 7.169657826117671e-09, "loss": 0.0023, "step": 10726 }, { "epoch": 4.880345768880801, "grad_norm": 0.5060673603933115, "learning_rate": 7.115674931661987e-09, "loss": 0.0043, "step": 10727 }, { "epoch": 4.880800727934486, "grad_norm": 0.34292293022407444, "learning_rate": 7.061895744455149e-09, "loss": 0.0012, "step": 10728 }, { "epoch": 4.881255686988171, "grad_norm": 0.7225099110908161, "learning_rate": 7.008320268892532e-09, "loss": 0.0253, "step": 10729 }, { "epoch": 4.8817106460418564, "grad_norm": 0.6818516484685787, "learning_rate": 6.9549485093514665e-09, "loss": 0.0179, "step": 10730 }, { "epoch": 4.882165605095541, "grad_norm": 0.33425227235351274, "learning_rate": 6.901780470193742e-09, "loss": 0.0019, "step": 10731 }, { "epoch": 4.882620564149226, "grad_norm": 0.7333923277297666, "learning_rate": 6.8488161557639376e-09, "loss": 0.011, "step": 10732 }, { "epoch": 4.883075523202912, "grad_norm": 0.8598480493649336, "learning_rate": 6.796055570389426e-09, "loss": 0.009, "step": 10733 }, { "epoch": 4.883530482256597, "grad_norm": 1.23859036246682, "learning_rate": 6.743498718382591e-09, "loss": 0.0128, "step": 10734 }, { "epoch": 4.883985441310282, "grad_norm": 0.7882723451144211, "learning_rate": 6.691145604037219e-09, "loss": 0.0014, "step": 10735 }, { "epoch": 4.884440400363967, "grad_norm": 0.329260675302962, "learning_rate": 6.638996231631834e-09, "loss": 0.0016, "step": 10736 }, { "epoch": 4.884895359417652, "grad_norm": 0.667200224018284, "learning_rate": 6.5870506054277475e-09, "loss": 0.014, "step": 10737 }, { "epoch": 4.885350318471337, "grad_norm": 0.48609458196493777, "learning_rate": 6.5353087296696205e-09, "loss": 0.0047, "step": 10738 }, { "epoch": 4.885805277525023, "grad_norm": 0.22667909779404544, "learning_rate": 6.483770608586016e-09, "loss": 0.0012, "step": 10739 }, { "epoch": 4.886260236578708, "grad_norm": 0.6037636201797201, "learning_rate": 6.43243624638773e-09, "loss": 0.0114, "step": 10740 }, { "epoch": 4.886715195632393, "grad_norm": 0.6492288856434071, "learning_rate": 6.3813056472700194e-09, "loss": 0.0027, "step": 10741 }, { "epoch": 4.887170154686078, "grad_norm": 0.26948246919208824, "learning_rate": 6.330378815410932e-09, "loss": 0.0016, "step": 10742 }, { "epoch": 4.887625113739763, "grad_norm": 0.26942526106347825, "learning_rate": 6.2796557549718585e-09, "loss": 0.0013, "step": 10743 }, { "epoch": 4.888080072793448, "grad_norm": 0.3404047580323894, "learning_rate": 6.229136470098096e-09, "loss": 0.0018, "step": 10744 }, { "epoch": 4.888535031847134, "grad_norm": 0.650981163195041, "learning_rate": 6.178820964917176e-09, "loss": 0.0105, "step": 10745 }, { "epoch": 4.888989990900819, "grad_norm": 0.2731148062188111, "learning_rate": 6.1287092435413645e-09, "loss": 0.0012, "step": 10746 }, { "epoch": 4.889444949954504, "grad_norm": 0.4938529397655602, "learning_rate": 6.078801310064886e-09, "loss": 0.0042, "step": 10747 }, { "epoch": 4.889899909008189, "grad_norm": 0.068705978137054, "learning_rate": 6.029097168566422e-09, "loss": 0.0003, "step": 10748 }, { "epoch": 4.890354868061874, "grad_norm": 0.6202723997036864, "learning_rate": 5.979596823107448e-09, "loss": 0.0063, "step": 10749 }, { "epoch": 4.890809827115559, "grad_norm": 0.7832121637241725, "learning_rate": 5.930300277732781e-09, "loss": 0.0067, "step": 10750 }, { "epoch": 4.891264786169245, "grad_norm": 0.6647273976331272, "learning_rate": 5.881207536471145e-09, "loss": 0.0089, "step": 10751 }, { "epoch": 4.89171974522293, "grad_norm": 0.5798823185613392, "learning_rate": 5.832318603333776e-09, "loss": 0.0093, "step": 10752 }, { "epoch": 4.892174704276615, "grad_norm": 0.8912975167564241, "learning_rate": 5.783633482315809e-09, "loss": 0.0016, "step": 10753 }, { "epoch": 4.8926296633303, "grad_norm": 0.5362279885554133, "learning_rate": 5.735152177395453e-09, "loss": 0.0173, "step": 10754 }, { "epoch": 4.893084622383985, "grad_norm": 1.0194393937406658, "learning_rate": 5.686874692534538e-09, "loss": 0.0061, "step": 10755 }, { "epoch": 4.893539581437671, "grad_norm": 0.8860798820954366, "learning_rate": 5.6388010316779655e-09, "loss": 0.007, "step": 10756 }, { "epoch": 4.893994540491356, "grad_norm": 0.4079759739898361, "learning_rate": 5.59093119875398e-09, "loss": 0.0031, "step": 10757 }, { "epoch": 4.894449499545041, "grad_norm": 0.013397964411732691, "learning_rate": 5.54326519767473e-09, "loss": 0.0, "step": 10758 }, { "epoch": 4.8949044585987265, "grad_norm": 1.0710567073530133, "learning_rate": 5.495803032334879e-09, "loss": 0.0135, "step": 10759 }, { "epoch": 4.895359417652411, "grad_norm": 0.4457536383532997, "learning_rate": 5.448544706612713e-09, "loss": 0.0027, "step": 10760 }, { "epoch": 4.895814376706096, "grad_norm": 0.5793878952494157, "learning_rate": 5.401490224370421e-09, "loss": 0.0051, "step": 10761 }, { "epoch": 4.896269335759782, "grad_norm": 0.17278659669409133, "learning_rate": 5.3546395894527035e-09, "loss": 0.0006, "step": 10762 }, { "epoch": 4.896724294813467, "grad_norm": 0.39526354822212617, "learning_rate": 5.307992805688445e-09, "loss": 0.0073, "step": 10763 }, { "epoch": 4.897179253867152, "grad_norm": 0.5113460880867208, "learning_rate": 5.2615498768887605e-09, "loss": 0.0038, "step": 10764 }, { "epoch": 4.8976342129208374, "grad_norm": 0.6970024733251623, "learning_rate": 5.21531080684895e-09, "loss": 0.0078, "step": 10765 }, { "epoch": 4.898089171974522, "grad_norm": 0.48877638691035313, "learning_rate": 5.1692755993479335e-09, "loss": 0.0016, "step": 10766 }, { "epoch": 4.898544131028207, "grad_norm": 0.7108949556689609, "learning_rate": 5.12344425814687e-09, "loss": 0.0268, "step": 10767 }, { "epoch": 4.898999090081893, "grad_norm": 0.5992837027566308, "learning_rate": 5.077816786991374e-09, "loss": 0.0089, "step": 10768 }, { "epoch": 4.899454049135578, "grad_norm": 0.34013695045886405, "learning_rate": 5.032393189609852e-09, "loss": 0.0014, "step": 10769 }, { "epoch": 4.899909008189263, "grad_norm": 0.4831002186180567, "learning_rate": 4.98717346971378e-09, "loss": 0.0024, "step": 10770 }, { "epoch": 4.900363967242948, "grad_norm": 0.6681990099077835, "learning_rate": 4.942157630998534e-09, "loss": 0.0133, "step": 10771 }, { "epoch": 4.900818926296633, "grad_norm": 0.5435889987453912, "learning_rate": 4.897345677142562e-09, "loss": 0.0056, "step": 10772 }, { "epoch": 4.901273885350318, "grad_norm": 0.3049122112185648, "learning_rate": 4.852737611807656e-09, "loss": 0.0017, "step": 10773 }, { "epoch": 4.901728844404004, "grad_norm": 0.17851760003676748, "learning_rate": 4.808333438639235e-09, "loss": 0.0007, "step": 10774 }, { "epoch": 4.902183803457689, "grad_norm": 0.09733745943677966, "learning_rate": 4.764133161265505e-09, "loss": 0.0004, "step": 10775 }, { "epoch": 4.902638762511374, "grad_norm": 0.610297406150545, "learning_rate": 4.720136783298579e-09, "loss": 0.0079, "step": 10776 }, { "epoch": 4.903093721565059, "grad_norm": 0.09525402403936717, "learning_rate": 4.676344308333081e-09, "loss": 0.0002, "step": 10777 }, { "epoch": 4.903548680618744, "grad_norm": 0.07433179822906599, "learning_rate": 4.632755739948369e-09, "loss": 0.0003, "step": 10778 }, { "epoch": 4.904003639672429, "grad_norm": 0.17260520531791423, "learning_rate": 4.589371081705762e-09, "loss": 0.0008, "step": 10779 }, { "epoch": 4.904458598726115, "grad_norm": 1.2104762523563912, "learning_rate": 4.54619033715048e-09, "loss": 0.0146, "step": 10780 }, { "epoch": 4.9049135577798, "grad_norm": 0.8893237512657519, "learning_rate": 4.5032135098110884e-09, "loss": 0.0134, "step": 10781 }, { "epoch": 4.905368516833485, "grad_norm": 0.5958712729455083, "learning_rate": 4.460440603199778e-09, "loss": 0.0015, "step": 10782 }, { "epoch": 4.90582347588717, "grad_norm": 0.6562641941397628, "learning_rate": 4.417871620811254e-09, "loss": 0.0056, "step": 10783 }, { "epoch": 4.906278434940855, "grad_norm": 0.46646770065353477, "learning_rate": 4.375506566124676e-09, "loss": 0.0046, "step": 10784 }, { "epoch": 4.90673339399454, "grad_norm": 0.18274400655679318, "learning_rate": 4.333345442601167e-09, "loss": 0.0009, "step": 10785 }, { "epoch": 4.907188353048226, "grad_norm": 1.036670107504628, "learning_rate": 4.291388253686579e-09, "loss": 0.0102, "step": 10786 }, { "epoch": 4.907643312101911, "grad_norm": 0.6668346947642565, "learning_rate": 4.249635002809005e-09, "loss": 0.0057, "step": 10787 }, { "epoch": 4.9080982711555965, "grad_norm": 0.5995621406721393, "learning_rate": 4.208085693380715e-09, "loss": 0.0069, "step": 10788 }, { "epoch": 4.908553230209281, "grad_norm": 1.0650742365975197, "learning_rate": 4.16674032879677e-09, "loss": 0.0215, "step": 10789 }, { "epoch": 4.909008189262966, "grad_norm": 0.7091031077319836, "learning_rate": 4.12559891243558e-09, "loss": 0.0092, "step": 10790 }, { "epoch": 4.909463148316652, "grad_norm": 0.5564882575168428, "learning_rate": 4.084661447659178e-09, "loss": 0.0038, "step": 10791 }, { "epoch": 4.909918107370337, "grad_norm": 1.1904421038938038, "learning_rate": 4.043927937812941e-09, "loss": 0.027, "step": 10792 }, { "epoch": 4.910373066424022, "grad_norm": 0.5256343239771207, "learning_rate": 4.003398386225321e-09, "loss": 0.0114, "step": 10793 }, { "epoch": 4.9108280254777075, "grad_norm": 0.639280290296693, "learning_rate": 3.963072796208112e-09, "loss": 0.004, "step": 10794 }, { "epoch": 4.911282984531392, "grad_norm": 1.0067958880065144, "learning_rate": 3.922951171056455e-09, "loss": 0.0045, "step": 10795 }, { "epoch": 4.911737943585077, "grad_norm": 0.4547350148135931, "learning_rate": 3.8830335140491174e-09, "loss": 0.0028, "step": 10796 }, { "epoch": 4.912192902638763, "grad_norm": 0.5554181968103328, "learning_rate": 3.8433198284479335e-09, "loss": 0.0038, "step": 10797 }, { "epoch": 4.912647861692448, "grad_norm": 0.5984467825673397, "learning_rate": 3.8038101174980856e-09, "loss": 0.0075, "step": 10798 }, { "epoch": 4.913102820746133, "grad_norm": 0.8396023793481242, "learning_rate": 3.764504384428103e-09, "loss": 0.0201, "step": 10799 }, { "epoch": 4.9135577797998184, "grad_norm": 0.26604630057160134, "learning_rate": 3.725402632450137e-09, "loss": 0.0016, "step": 10800 }, { "epoch": 4.914012738853503, "grad_norm": 0.4583354683714948, "learning_rate": 3.6865048647588554e-09, "loss": 0.0032, "step": 10801 }, { "epoch": 4.914467697907188, "grad_norm": 0.5141820135985183, "learning_rate": 3.6478110845333814e-09, "loss": 0.0027, "step": 10802 }, { "epoch": 4.914922656960874, "grad_norm": 0.9162346236583728, "learning_rate": 3.6093212949353527e-09, "loss": 0.0024, "step": 10803 }, { "epoch": 4.915377616014559, "grad_norm": 1.0060123431588028, "learning_rate": 3.5710354991100317e-09, "loss": 0.0052, "step": 10804 }, { "epoch": 4.915832575068244, "grad_norm": 0.9340767598775508, "learning_rate": 3.532953700185748e-09, "loss": 0.0121, "step": 10805 }, { "epoch": 4.916287534121929, "grad_norm": 0.3470276404366122, "learning_rate": 3.495075901274736e-09, "loss": 0.0012, "step": 10806 }, { "epoch": 4.916742493175614, "grad_norm": 1.6664375102075482, "learning_rate": 3.457402105471741e-09, "loss": 0.0043, "step": 10807 }, { "epoch": 4.917197452229299, "grad_norm": 0.7361594494041906, "learning_rate": 3.4199323158556897e-09, "loss": 0.0169, "step": 10808 }, { "epoch": 4.917652411282985, "grad_norm": 0.2025933971395758, "learning_rate": 3.3826665354882994e-09, "loss": 0.001, "step": 10809 }, { "epoch": 4.91810737033667, "grad_norm": 0.20349819642242628, "learning_rate": 3.3456047674149118e-09, "loss": 0.0008, "step": 10810 }, { "epoch": 4.918562329390355, "grad_norm": 0.16325092991100365, "learning_rate": 3.308747014663938e-09, "loss": 0.0005, "step": 10811 }, { "epoch": 4.91901728844404, "grad_norm": 0.91064136741133, "learning_rate": 3.2720932802468573e-09, "loss": 0.0303, "step": 10812 }, { "epoch": 4.919472247497725, "grad_norm": 0.41921111373588116, "learning_rate": 3.2356435671596076e-09, "loss": 0.0011, "step": 10813 }, { "epoch": 4.91992720655141, "grad_norm": 0.30459615448725247, "learning_rate": 3.199397878380084e-09, "loss": 0.0027, "step": 10814 }, { "epoch": 4.920382165605096, "grad_norm": 0.9808400860007765, "learning_rate": 3.1633562168700836e-09, "loss": 0.0074, "step": 10815 }, { "epoch": 4.920837124658781, "grad_norm": 0.6732873941993525, "learning_rate": 3.1275185855753064e-09, "loss": 0.0073, "step": 10816 }, { "epoch": 4.921292083712466, "grad_norm": 0.6653625562739386, "learning_rate": 3.091884987423965e-09, "loss": 0.01, "step": 10817 }, { "epoch": 4.921747042766151, "grad_norm": 0.37263564714923336, "learning_rate": 3.0564554253276204e-09, "loss": 0.0014, "step": 10818 }, { "epoch": 4.922202001819836, "grad_norm": 0.4787926815189841, "learning_rate": 3.0212299021817326e-09, "loss": 0.0026, "step": 10819 }, { "epoch": 4.922656960873521, "grad_norm": 0.7265648814493539, "learning_rate": 2.9862084208648336e-09, "loss": 0.0078, "step": 10820 }, { "epoch": 4.923111919927207, "grad_norm": 0.5056947629062873, "learning_rate": 2.951390984238245e-09, "loss": 0.0027, "step": 10821 }, { "epoch": 4.923566878980892, "grad_norm": 0.846235704532716, "learning_rate": 2.916777595147746e-09, "loss": 0.0092, "step": 10822 }, { "epoch": 4.924021838034577, "grad_norm": 0.5511423638654714, "learning_rate": 2.8823682564210752e-09, "loss": 0.0017, "step": 10823 }, { "epoch": 4.924476797088262, "grad_norm": 0.3006763970513799, "learning_rate": 2.848162970870705e-09, "loss": 0.0016, "step": 10824 }, { "epoch": 4.924931756141947, "grad_norm": 0.26960600414551505, "learning_rate": 2.8141617412913435e-09, "loss": 0.001, "step": 10825 }, { "epoch": 4.925386715195632, "grad_norm": 0.6759558119214527, "learning_rate": 2.7803645704616023e-09, "loss": 0.003, "step": 10826 }, { "epoch": 4.925841674249318, "grad_norm": 1.0413763248244894, "learning_rate": 2.746771461142883e-09, "loss": 0.0295, "step": 10827 }, { "epoch": 4.926296633303003, "grad_norm": 0.6594509951048885, "learning_rate": 2.71338241608049e-09, "loss": 0.0192, "step": 10828 }, { "epoch": 4.926751592356688, "grad_norm": 0.8893227258862624, "learning_rate": 2.6801974380030736e-09, "loss": 0.0075, "step": 10829 }, { "epoch": 4.927206551410373, "grad_norm": 0.4534227698373838, "learning_rate": 2.6472165296220764e-09, "loss": 0.0081, "step": 10830 }, { "epoch": 4.927661510464058, "grad_norm": 1.0125537395803224, "learning_rate": 2.6144396936325645e-09, "loss": 0.0178, "step": 10831 }, { "epoch": 4.928116469517743, "grad_norm": 1.1736876219206434, "learning_rate": 2.5818669327129507e-09, "loss": 0.0122, "step": 10832 }, { "epoch": 4.928571428571429, "grad_norm": 0.5577161708609156, "learning_rate": 2.5494982495249955e-09, "loss": 0.0113, "step": 10833 }, { "epoch": 4.929026387625114, "grad_norm": 0.30745161368924245, "learning_rate": 2.5173336467135266e-09, "loss": 0.0049, "step": 10834 }, { "epoch": 4.929481346678799, "grad_norm": 0.710562590600738, "learning_rate": 2.485373126906998e-09, "loss": 0.0042, "step": 10835 }, { "epoch": 4.929936305732484, "grad_norm": 0.9977142948190286, "learning_rate": 2.453616692717209e-09, "loss": 0.0108, "step": 10836 }, { "epoch": 4.930391264786169, "grad_norm": 0.6162607271620215, "learning_rate": 2.4220643467387506e-09, "loss": 0.0047, "step": 10837 }, { "epoch": 4.930846223839854, "grad_norm": 0.5726805342211836, "learning_rate": 2.390716091550671e-09, "loss": 0.0063, "step": 10838 }, { "epoch": 4.93130118289354, "grad_norm": 1.1264076968700125, "learning_rate": 2.3595719297139776e-09, "loss": 0.0081, "step": 10839 }, { "epoch": 4.931756141947225, "grad_norm": 0.36115213211778585, "learning_rate": 2.3286318637738557e-09, "loss": 0.0011, "step": 10840 }, { "epoch": 4.9322111010009095, "grad_norm": 0.3173008065962513, "learning_rate": 2.297895896258284e-09, "loss": 0.0014, "step": 10841 }, { "epoch": 4.932666060054595, "grad_norm": 0.25610511242615047, "learning_rate": 2.26736402967942e-09, "loss": 0.0014, "step": 10842 }, { "epoch": 4.93312101910828, "grad_norm": 0.5233348317037907, "learning_rate": 2.2370362665319333e-09, "loss": 0.0054, "step": 10843 }, { "epoch": 4.933575978161965, "grad_norm": 0.7632751610351663, "learning_rate": 2.206912609293843e-09, "loss": 0.0181, "step": 10844 }, { "epoch": 4.934030937215651, "grad_norm": 0.14882170408652345, "learning_rate": 2.1769930604270683e-09, "loss": 0.0005, "step": 10845 }, { "epoch": 4.934485896269336, "grad_norm": 0.7438268101793324, "learning_rate": 2.14727762237632e-09, "loss": 0.0206, "step": 10846 }, { "epoch": 4.9349408553230205, "grad_norm": 0.43427152356275944, "learning_rate": 2.1177662975699343e-09, "loss": 0.0046, "step": 10847 }, { "epoch": 4.935395814376706, "grad_norm": 1.2332274680628301, "learning_rate": 2.0884590884193144e-09, "loss": 0.0199, "step": 10848 }, { "epoch": 4.935850773430391, "grad_norm": 0.3944831010880099, "learning_rate": 2.0593559973192125e-09, "loss": 0.0024, "step": 10849 }, { "epoch": 4.936305732484076, "grad_norm": 0.4981032867751457, "learning_rate": 2.030457026648003e-09, "loss": 0.0079, "step": 10850 }, { "epoch": 4.936760691537762, "grad_norm": 0.24087543498830033, "learning_rate": 2.0017621787671304e-09, "loss": 0.0014, "step": 10851 }, { "epoch": 4.937215650591447, "grad_norm": 0.3289542451071778, "learning_rate": 1.973271456021386e-09, "loss": 0.0021, "step": 10852 }, { "epoch": 4.9376706096451315, "grad_norm": 0.39451546133017007, "learning_rate": 1.9449848607391853e-09, "loss": 0.0012, "step": 10853 }, { "epoch": 4.938125568698817, "grad_norm": 0.10228908891451763, "learning_rate": 1.9169023952311795e-09, "loss": 0.0005, "step": 10854 }, { "epoch": 4.938580527752502, "grad_norm": 0.22245661788719262, "learning_rate": 1.8890240617930323e-09, "loss": 0.0008, "step": 10855 }, { "epoch": 4.939035486806187, "grad_norm": 0.5667318500981702, "learning_rate": 1.8613498627023664e-09, "loss": 0.0049, "step": 10856 }, { "epoch": 4.939490445859873, "grad_norm": 0.9554866551357742, "learning_rate": 1.8338798002207059e-09, "loss": 0.0081, "step": 10857 }, { "epoch": 4.939945404913558, "grad_norm": 0.8353844153725459, "learning_rate": 1.8066138765926445e-09, "loss": 0.0031, "step": 10858 }, { "epoch": 4.9404003639672425, "grad_norm": 1.2295590097220483, "learning_rate": 1.779552094046677e-09, "loss": 0.0138, "step": 10859 }, { "epoch": 4.940855323020928, "grad_norm": 0.36071146585224084, "learning_rate": 1.7526944547935355e-09, "loss": 0.0028, "step": 10860 }, { "epoch": 4.941310282074613, "grad_norm": 0.5128292649416883, "learning_rate": 1.726040961028408e-09, "loss": 0.0067, "step": 10861 }, { "epoch": 4.941765241128298, "grad_norm": 0.22298814387661037, "learning_rate": 1.699591614928997e-09, "loss": 0.0015, "step": 10862 }, { "epoch": 4.942220200181984, "grad_norm": 1.023906961394931, "learning_rate": 1.6733464186566295e-09, "loss": 0.0116, "step": 10863 }, { "epoch": 4.942675159235669, "grad_norm": 1.2445851930086804, "learning_rate": 1.6473053743562561e-09, "loss": 0.013, "step": 10864 }, { "epoch": 4.943130118289354, "grad_norm": 0.40368332729411616, "learning_rate": 1.6214684841556194e-09, "loss": 0.0045, "step": 10865 }, { "epoch": 4.943585077343039, "grad_norm": 0.7509782659689058, "learning_rate": 1.5958357501658084e-09, "loss": 0.0047, "step": 10866 }, { "epoch": 4.944040036396724, "grad_norm": 0.5729825675571333, "learning_rate": 1.5704071744818138e-09, "loss": 0.002, "step": 10867 }, { "epoch": 4.94449499545041, "grad_norm": 1.2529474027443164, "learning_rate": 1.5451827591811407e-09, "loss": 0.0065, "step": 10868 }, { "epoch": 4.944949954504095, "grad_norm": 0.9910567816519172, "learning_rate": 1.5201625063251956e-09, "loss": 0.0093, "step": 10869 }, { "epoch": 4.94540491355778, "grad_norm": 1.0567900241481365, "learning_rate": 1.4953464179587319e-09, "loss": 0.0051, "step": 10870 }, { "epoch": 4.945859872611465, "grad_norm": 0.4000824485574719, "learning_rate": 1.4707344961092939e-09, "loss": 0.0033, "step": 10871 }, { "epoch": 4.94631483166515, "grad_norm": 0.646345400583451, "learning_rate": 1.4463267427883287e-09, "loss": 0.0084, "step": 10872 }, { "epoch": 4.946769790718835, "grad_norm": 0.8216307493115282, "learning_rate": 1.4221231599900743e-09, "loss": 0.0072, "step": 10873 }, { "epoch": 4.947224749772521, "grad_norm": 0.4005146641128074, "learning_rate": 1.3981237496923927e-09, "loss": 0.0013, "step": 10874 }, { "epoch": 4.947679708826206, "grad_norm": 0.6068059848238905, "learning_rate": 1.3743285138564932e-09, "loss": 0.007, "step": 10875 }, { "epoch": 4.9481346678798905, "grad_norm": 0.27470994401621246, "learning_rate": 1.3507374544266538e-09, "loss": 0.0007, "step": 10876 }, { "epoch": 4.948589626933576, "grad_norm": 0.6670705043947764, "learning_rate": 1.3273505733310543e-09, "loss": 0.0023, "step": 10877 }, { "epoch": 4.949044585987261, "grad_norm": 0.6575996576313538, "learning_rate": 1.304167872480111e-09, "loss": 0.0114, "step": 10878 }, { "epoch": 4.949499545040946, "grad_norm": 0.6218728916118911, "learning_rate": 1.2811893537686971e-09, "loss": 0.0069, "step": 10879 }, { "epoch": 4.949954504094632, "grad_norm": 0.9064285523338046, "learning_rate": 1.2584150190744772e-09, "loss": 0.0127, "step": 10880 }, { "epoch": 4.950409463148317, "grad_norm": 1.0097963825058527, "learning_rate": 1.235844870258185e-09, "loss": 0.0081, "step": 10881 }, { "epoch": 4.9508644222020015, "grad_norm": 0.5171007913234315, "learning_rate": 1.2134789091644561e-09, "loss": 0.0028, "step": 10882 }, { "epoch": 4.951319381255687, "grad_norm": 1.641343411690411, "learning_rate": 1.1913171376207178e-09, "loss": 0.0122, "step": 10883 }, { "epoch": 4.951774340309372, "grad_norm": 1.1133215615328491, "learning_rate": 1.1693595574382989e-09, "loss": 0.0084, "step": 10884 }, { "epoch": 4.952229299363057, "grad_norm": 0.7741394899588707, "learning_rate": 1.1476061704107645e-09, "loss": 0.0047, "step": 10885 }, { "epoch": 4.952684258416743, "grad_norm": 1.0885240992323821, "learning_rate": 1.1260569783164144e-09, "loss": 0.0144, "step": 10886 }, { "epoch": 4.953139217470428, "grad_norm": 0.3094997814568148, "learning_rate": 1.104711982915785e-09, "loss": 0.0017, "step": 10887 }, { "epoch": 4.9535941765241125, "grad_norm": 0.6468145435534469, "learning_rate": 1.0835711859533139e-09, "loss": 0.0045, "step": 10888 }, { "epoch": 4.954049135577798, "grad_norm": 0.04535408196467257, "learning_rate": 1.0626345891562305e-09, "loss": 0.0002, "step": 10889 }, { "epoch": 4.954504094631483, "grad_norm": 0.7061066663068544, "learning_rate": 1.0419021942356666e-09, "loss": 0.0133, "step": 10890 }, { "epoch": 4.954959053685168, "grad_norm": 0.6885729194192685, "learning_rate": 1.0213740028855445e-09, "loss": 0.0055, "step": 10891 }, { "epoch": 4.955414012738854, "grad_norm": 0.8473994668781124, "learning_rate": 1.001050016783689e-09, "loss": 0.0041, "step": 10892 }, { "epoch": 4.955868971792539, "grad_norm": 0.6001921611130359, "learning_rate": 9.809302375904385e-10, "loss": 0.0149, "step": 10893 }, { "epoch": 4.9563239308462235, "grad_norm": 0.5156136783206365, "learning_rate": 9.610146669500332e-10, "loss": 0.0091, "step": 10894 }, { "epoch": 4.956778889899909, "grad_norm": 0.5840220571765611, "learning_rate": 9.4130330649006e-10, "loss": 0.0041, "step": 10895 }, { "epoch": 4.957233848953594, "grad_norm": 0.6353507669535167, "learning_rate": 9.217961578211754e-10, "loss": 0.0062, "step": 10896 }, { "epoch": 4.957688808007279, "grad_norm": 0.4487430220811916, "learning_rate": 9.024932225371041e-10, "loss": 0.0048, "step": 10897 }, { "epoch": 4.958143767060965, "grad_norm": 0.56387449479828, "learning_rate": 8.833945022157509e-10, "loss": 0.0102, "step": 10898 }, { "epoch": 4.95859872611465, "grad_norm": 0.330319384085004, "learning_rate": 8.64499998417534e-10, "loss": 0.0012, "step": 10899 }, { "epoch": 4.959053685168335, "grad_norm": 0.42688148711462104, "learning_rate": 8.458097126862186e-10, "loss": 0.005, "step": 10900 }, { "epoch": 4.95950864422202, "grad_norm": 0.6820650544319906, "learning_rate": 8.273236465491941e-10, "loss": 0.0058, "step": 10901 }, { "epoch": 4.959963603275705, "grad_norm": 0.32969887628570865, "learning_rate": 8.090418015171964e-10, "loss": 0.0011, "step": 10902 }, { "epoch": 4.960418562329391, "grad_norm": 0.3949135865316868, "learning_rate": 7.909641790840306e-10, "loss": 0.0045, "step": 10903 }, { "epoch": 4.960873521383076, "grad_norm": 0.4699318109724844, "learning_rate": 7.730907807271259e-10, "loss": 0.001, "step": 10904 }, { "epoch": 4.961328480436761, "grad_norm": 0.057438776016600177, "learning_rate": 7.554216079067033e-10, "loss": 0.0002, "step": 10905 }, { "epoch": 4.961783439490446, "grad_norm": 0.750696382954958, "learning_rate": 7.379566620666079e-10, "loss": 0.0245, "step": 10906 }, { "epoch": 4.962238398544131, "grad_norm": 0.37241630544413884, "learning_rate": 7.206959446343087e-10, "loss": 0.0014, "step": 10907 }, { "epoch": 4.962693357597816, "grad_norm": 0.4231154224434961, "learning_rate": 7.036394570200667e-10, "loss": 0.002, "step": 10908 }, { "epoch": 4.963148316651502, "grad_norm": 0.6391565188232259, "learning_rate": 6.867872006174892e-10, "loss": 0.0026, "step": 10909 }, { "epoch": 4.963603275705187, "grad_norm": 0.3031056155983299, "learning_rate": 6.701391768040854e-10, "loss": 0.0027, "step": 10910 }, { "epoch": 4.9640582347588715, "grad_norm": 0.324805418622375, "learning_rate": 6.536953869398788e-10, "loss": 0.0026, "step": 10911 }, { "epoch": 4.964513193812557, "grad_norm": 1.242733260954874, "learning_rate": 6.37455832368794e-10, "loss": 0.0097, "step": 10912 }, { "epoch": 4.964968152866242, "grad_norm": 0.5530545520643836, "learning_rate": 6.214205144178254e-10, "loss": 0.0079, "step": 10913 }, { "epoch": 4.965423111919927, "grad_norm": 0.4806287564927471, "learning_rate": 6.055894343973135e-10, "loss": 0.0053, "step": 10914 }, { "epoch": 4.965878070973613, "grad_norm": 0.6573041423905549, "learning_rate": 5.899625936009457e-10, "loss": 0.0168, "step": 10915 }, { "epoch": 4.966333030027298, "grad_norm": 0.6053287937018109, "learning_rate": 5.745399933054785e-10, "loss": 0.0211, "step": 10916 }, { "epoch": 4.9667879890809825, "grad_norm": 0.41879382166666884, "learning_rate": 5.593216347712927e-10, "loss": 0.0028, "step": 10917 }, { "epoch": 4.967242948134668, "grad_norm": 1.50491341494476, "learning_rate": 5.443075192418379e-10, "loss": 0.006, "step": 10918 }, { "epoch": 4.967697907188353, "grad_norm": 0.8068508590576491, "learning_rate": 5.294976479441882e-10, "loss": 0.0023, "step": 10919 }, { "epoch": 4.968152866242038, "grad_norm": 0.15567252182001337, "learning_rate": 5.148920220887643e-10, "loss": 0.0005, "step": 10920 }, { "epoch": 4.968607825295724, "grad_norm": 1.1112047813181327, "learning_rate": 5.004906428685008e-10, "loss": 0.0066, "step": 10921 }, { "epoch": 4.969062784349409, "grad_norm": 0.5311027485996995, "learning_rate": 4.862935114605117e-10, "loss": 0.0091, "step": 10922 }, { "epoch": 4.9695177434030935, "grad_norm": 0.9414710635811209, "learning_rate": 4.723006290249799e-10, "loss": 0.0178, "step": 10923 }, { "epoch": 4.969972702456779, "grad_norm": 0.294986358136637, "learning_rate": 4.5851199670543523e-10, "loss": 0.0005, "step": 10924 }, { "epoch": 4.970427661510464, "grad_norm": 0.07037091478517939, "learning_rate": 4.4492761562819896e-10, "loss": 0.0003, "step": 10925 }, { "epoch": 4.970882620564149, "grad_norm": 0.851100441944809, "learning_rate": 4.315474869037717e-10, "loss": 0.0144, "step": 10926 }, { "epoch": 4.971337579617835, "grad_norm": 0.7743611299380844, "learning_rate": 4.183716116251679e-10, "loss": 0.0097, "step": 10927 }, { "epoch": 4.97179253867152, "grad_norm": 0.8178598132232988, "learning_rate": 4.0539999086930403e-10, "loss": 0.0127, "step": 10928 }, { "epoch": 4.9722474977252045, "grad_norm": 0.2523410904495866, "learning_rate": 3.9263262569616547e-10, "loss": 0.0013, "step": 10929 }, { "epoch": 4.97270245677889, "grad_norm": 0.842976845515936, "learning_rate": 3.800695171488067e-10, "loss": 0.0062, "step": 10930 }, { "epoch": 4.973157415832575, "grad_norm": 0.4122364115299324, "learning_rate": 3.6771066625418405e-10, "loss": 0.0029, "step": 10931 }, { "epoch": 4.97361237488626, "grad_norm": 0.15592419433229593, "learning_rate": 3.5555607402176783e-10, "loss": 0.0006, "step": 10932 }, { "epoch": 4.974067333939946, "grad_norm": 0.9870978679811618, "learning_rate": 3.4360574144520764e-10, "loss": 0.021, "step": 10933 }, { "epoch": 4.974522292993631, "grad_norm": 0.6934218936857525, "learning_rate": 3.3185966950066705e-10, "loss": 0.0016, "step": 10934 }, { "epoch": 4.9749772520473154, "grad_norm": 0.3744344457443098, "learning_rate": 3.20317859148489e-10, "loss": 0.0016, "step": 10935 }, { "epoch": 4.975432211101001, "grad_norm": 0.7541989584847004, "learning_rate": 3.0898031133125283e-10, "loss": 0.0042, "step": 10936 }, { "epoch": 4.975887170154686, "grad_norm": 0.975543195641849, "learning_rate": 2.9784702697543964e-10, "loss": 0.0067, "step": 10937 }, { "epoch": 4.976342129208371, "grad_norm": 1.048186892677499, "learning_rate": 2.8691800699115477e-10, "loss": 0.0098, "step": 10938 }, { "epoch": 4.976797088262057, "grad_norm": 0.5606753042537482, "learning_rate": 2.761932522715727e-10, "loss": 0.01, "step": 10939 }, { "epoch": 4.977252047315742, "grad_norm": 0.5847027465742545, "learning_rate": 2.656727636926593e-10, "loss": 0.0104, "step": 10940 }, { "epoch": 4.977707006369426, "grad_norm": 0.7844745684815503, "learning_rate": 2.5535654211400474e-10, "loss": 0.0216, "step": 10941 }, { "epoch": 4.978161965423112, "grad_norm": 0.8041771323820236, "learning_rate": 2.45244588379101e-10, "loss": 0.0087, "step": 10942 }, { "epoch": 4.978616924476797, "grad_norm": 0.9632676255155042, "learning_rate": 2.3533690331423166e-10, "loss": 0.0065, "step": 10943 }, { "epoch": 4.979071883530482, "grad_norm": 0.43055210480863626, "learning_rate": 2.256334877284716e-10, "loss": 0.0032, "step": 10944 }, { "epoch": 4.979526842584168, "grad_norm": 0.3577834764627258, "learning_rate": 2.1613434241507524e-10, "loss": 0.0019, "step": 10945 }, { "epoch": 4.9799818016378525, "grad_norm": 0.612729114327619, "learning_rate": 2.0683946815036604e-10, "loss": 0.0053, "step": 10946 }, { "epoch": 4.980436760691537, "grad_norm": 0.2901435041163739, "learning_rate": 1.9774886569373653e-10, "loss": 0.0006, "step": 10947 }, { "epoch": 4.980891719745223, "grad_norm": 0.9362536219884585, "learning_rate": 1.8886253578820345e-10, "loss": 0.0087, "step": 10948 }, { "epoch": 4.981346678798908, "grad_norm": 0.055790856142270966, "learning_rate": 1.8018047915957515e-10, "loss": 0.0002, "step": 10949 }, { "epoch": 4.981801637852593, "grad_norm": 0.2549824122252257, "learning_rate": 1.7170269651756165e-10, "loss": 0.0018, "step": 10950 }, { "epoch": 4.982256596906279, "grad_norm": 0.5325842428978382, "learning_rate": 1.6342918855494216e-10, "loss": 0.0101, "step": 10951 }, { "epoch": 4.9827115559599635, "grad_norm": 0.8815446135994479, "learning_rate": 1.553599559475649e-10, "loss": 0.0086, "step": 10952 }, { "epoch": 4.983166515013648, "grad_norm": 0.7042886677633085, "learning_rate": 1.4749499935517998e-10, "loss": 0.0033, "step": 10953 }, { "epoch": 4.983621474067334, "grad_norm": 0.48905781466098225, "learning_rate": 1.3983431942005133e-10, "loss": 0.0062, "step": 10954 }, { "epoch": 4.984076433121019, "grad_norm": 0.7856659763865894, "learning_rate": 1.3237791676862232e-10, "loss": 0.0055, "step": 10955 }, { "epoch": 4.984531392174704, "grad_norm": 0.767075280639585, "learning_rate": 1.251257920098503e-10, "loss": 0.019, "step": 10956 }, { "epoch": 4.98498635122839, "grad_norm": 0.05041468187952228, "learning_rate": 1.1807794573659437e-10, "loss": 0.0002, "step": 10957 }, { "epoch": 4.9854413102820745, "grad_norm": 0.512130986350284, "learning_rate": 1.1123437852450514e-10, "loss": 0.0061, "step": 10958 }, { "epoch": 4.985896269335759, "grad_norm": 1.3300996008011219, "learning_rate": 1.0459509093285747e-10, "loss": 0.0059, "step": 10959 }, { "epoch": 4.986351228389445, "grad_norm": 0.48775340946878615, "learning_rate": 9.816008350455042e-11, "loss": 0.0028, "step": 10960 }, { "epoch": 4.98680618744313, "grad_norm": 0.6276054840118677, "learning_rate": 9.192935676499704e-11, "loss": 0.0158, "step": 10961 }, { "epoch": 4.987261146496815, "grad_norm": 0.3812094857143223, "learning_rate": 8.590291122323458e-11, "loss": 0.0031, "step": 10962 }, { "epoch": 4.987716105550501, "grad_norm": 0.5344430060372136, "learning_rate": 8.008074737220206e-11, "loss": 0.0074, "step": 10963 }, { "epoch": 4.9881710646041855, "grad_norm": 0.4404643700090742, "learning_rate": 7.446286568763006e-11, "loss": 0.0018, "step": 10964 }, { "epoch": 4.98862602365787, "grad_norm": 0.2704586175620654, "learning_rate": 6.904926662804068e-11, "loss": 0.0011, "step": 10965 }, { "epoch": 4.989080982711556, "grad_norm": 0.24227224279160395, "learning_rate": 6.38399506364129e-11, "loss": 0.0012, "step": 10966 }, { "epoch": 4.989535941765241, "grad_norm": 0.11763042793140213, "learning_rate": 5.883491813796216e-11, "loss": 0.0006, "step": 10967 }, { "epoch": 4.989990900818926, "grad_norm": 0.6220072380835971, "learning_rate": 5.403416954208318e-11, "loss": 0.0082, "step": 10968 }, { "epoch": 4.990445859872612, "grad_norm": 0.5860919345621729, "learning_rate": 4.94377052406847e-11, "loss": 0.0091, "step": 10969 }, { "epoch": 4.9909008189262964, "grad_norm": 0.5894902227019098, "learning_rate": 4.5045525609854756e-11, "loss": 0.0022, "step": 10970 }, { "epoch": 4.991355777979981, "grad_norm": 0.3493913766644291, "learning_rate": 4.085763100791784e-11, "loss": 0.0017, "step": 10971 }, { "epoch": 4.991810737033667, "grad_norm": 0.3304024957228081, "learning_rate": 3.6874021777377754e-11, "loss": 0.0067, "step": 10972 }, { "epoch": 4.992265696087352, "grad_norm": 0.7224591457422294, "learning_rate": 3.3094698244084956e-11, "loss": 0.0084, "step": 10973 }, { "epoch": 4.992720655141038, "grad_norm": 0.4793909266943991, "learning_rate": 2.951966071612633e-11, "loss": 0.0044, "step": 10974 }, { "epoch": 4.9931756141947226, "grad_norm": 0.3295981435593923, "learning_rate": 2.6148909486323204e-11, "loss": 0.0023, "step": 10975 }, { "epoch": 4.993630573248407, "grad_norm": 0.7095121039455974, "learning_rate": 2.298244482973333e-11, "loss": 0.0128, "step": 10976 }, { "epoch": 4.994085532302093, "grad_norm": 1.1081315087740333, "learning_rate": 2.002026700531623e-11, "loss": 0.0076, "step": 10977 }, { "epoch": 4.994540491355778, "grad_norm": 0.5318384883437017, "learning_rate": 1.7262376254822964e-11, "loss": 0.005, "step": 10978 }, { "epoch": 4.994995450409463, "grad_norm": 0.2565385219699516, "learning_rate": 1.4708772804183924e-11, "loss": 0.0019, "step": 10979 }, { "epoch": 4.995450409463149, "grad_norm": 0.42715274349375876, "learning_rate": 1.2359456861565922e-11, "loss": 0.0069, "step": 10980 }, { "epoch": 4.9959053685168335, "grad_norm": 0.5733869998045944, "learning_rate": 1.0214428618759986e-11, "loss": 0.0084, "step": 10981 }, { "epoch": 4.996360327570518, "grad_norm": 0.5221253739254562, "learning_rate": 8.273688251736468e-12, "loss": 0.0026, "step": 10982 }, { "epoch": 4.996815286624204, "grad_norm": 0.4594202891089341, "learning_rate": 6.537235918702145e-12, "loss": 0.0025, "step": 10983 }, { "epoch": 4.997270245677889, "grad_norm": 0.08033699828630346, "learning_rate": 5.005071761488012e-12, "loss": 0.0002, "step": 10984 }, { "epoch": 4.997725204731574, "grad_norm": 0.13687913233051696, "learning_rate": 3.677195905271713e-12, "loss": 0.0008, "step": 10985 }, { "epoch": 4.99818016378526, "grad_norm": 0.36275651780811574, "learning_rate": 2.5536084588551058e-12, "loss": 0.0024, "step": 10986 }, { "epoch": 4.9986351228389445, "grad_norm": 0.5977926465248656, "learning_rate": 1.634309513831589e-12, "loss": 0.0024, "step": 10987 }, { "epoch": 4.999090081892629, "grad_norm": 0.5196464279643532, "learning_rate": 9.192991454187727e-13, "loss": 0.0017, "step": 10988 }, { "epoch": 4.999545040946315, "grad_norm": 0.36660122801219475, "learning_rate": 4.085774119033659e-13, "loss": 0.0013, "step": 10989 }, { "epoch": 5.0, "grad_norm": 0.23747516297604374, "learning_rate": 1.0214435491873176e-13, "loss": 0.0014, "step": 10990 }, { "epoch": 5.0, "step": 10990, "total_flos": 97535837306880.0, "train_loss": 0.06672536440892078, "train_runtime": 16269.4783, "train_samples_per_second": 2.701, "train_steps_per_second": 0.675 } ], "logging_steps": 1, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 555, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 97535837306880.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }