{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999922761433239, "eval_steps": 500, "global_step": 32367, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003089542670446207, "grad_norm": 1.4342206015296557, "learning_rate": 1.852995676343422e-07, "loss": 0.459, "step": 10 }, { "epoch": 0.0006179085340892414, "grad_norm": 1.2541838529642675, "learning_rate": 3.705991352686844e-07, "loss": 0.4524, "step": 20 }, { "epoch": 0.0009268628011338621, "grad_norm": 0.9828360076982188, "learning_rate": 5.558987029030266e-07, "loss": 0.4481, "step": 30 }, { "epoch": 0.0012358170681784828, "grad_norm": 0.6974302325411567, "learning_rate": 7.411982705373688e-07, "loss": 0.4281, "step": 40 }, { "epoch": 0.0015447713352231035, "grad_norm": 0.547411288843025, "learning_rate": 9.26497838171711e-07, "loss": 0.3942, "step": 50 }, { "epoch": 0.0018537256022677243, "grad_norm": 0.6249425214319002, "learning_rate": 1.1117974058060532e-06, "loss": 0.3598, "step": 60 }, { "epoch": 0.002162679869312345, "grad_norm": 0.4046247574364963, "learning_rate": 1.2970969734403955e-06, "loss": 0.3279, "step": 70 }, { "epoch": 0.0024716341363569655, "grad_norm": 0.3667300122619454, "learning_rate": 1.4823965410747376e-06, "loss": 0.2969, "step": 80 }, { "epoch": 0.0027805884034015865, "grad_norm": 0.3371062673025034, "learning_rate": 1.6676961087090797e-06, "loss": 0.2878, "step": 90 }, { "epoch": 0.003089542670446207, "grad_norm": 0.3260702974723146, "learning_rate": 1.852995676343422e-06, "loss": 0.2784, "step": 100 }, { "epoch": 0.003398496937490828, "grad_norm": 0.34444353567760644, "learning_rate": 2.0382952439777643e-06, "loss": 0.2615, "step": 110 }, { "epoch": 0.0037074512045354485, "grad_norm": 0.3850956468437575, "learning_rate": 2.2235948116121064e-06, "loss": 0.2676, "step": 120 }, { "epoch": 0.004016405471580069, "grad_norm": 0.3649231621770138, "learning_rate": 2.4088943792464484e-06, "loss": 0.2661, "step": 130 }, { "epoch": 0.00432535973862469, "grad_norm": 0.33008261225178304, "learning_rate": 2.594193946880791e-06, "loss": 0.2578, "step": 140 }, { "epoch": 0.004634314005669311, "grad_norm": 0.37275477440355054, "learning_rate": 2.779493514515133e-06, "loss": 0.258, "step": 150 }, { "epoch": 0.004943268272713931, "grad_norm": 0.33558939418979133, "learning_rate": 2.964793082149475e-06, "loss": 0.2509, "step": 160 }, { "epoch": 0.005252222539758552, "grad_norm": 0.33590394685537367, "learning_rate": 3.1500926497838172e-06, "loss": 0.2441, "step": 170 }, { "epoch": 0.005561176806803173, "grad_norm": 0.39338675651403776, "learning_rate": 3.3353922174181593e-06, "loss": 0.2455, "step": 180 }, { "epoch": 0.005870131073847794, "grad_norm": 0.34625762130866167, "learning_rate": 3.5206917850525014e-06, "loss": 0.2394, "step": 190 }, { "epoch": 0.006179085340892414, "grad_norm": 0.3556251662900565, "learning_rate": 3.705991352686844e-06, "loss": 0.2385, "step": 200 }, { "epoch": 0.006488039607937035, "grad_norm": 0.3260564080707061, "learning_rate": 3.891290920321186e-06, "loss": 0.2322, "step": 210 }, { "epoch": 0.006796993874981656, "grad_norm": 0.3676995014349691, "learning_rate": 4.0765904879555285e-06, "loss": 0.2331, "step": 220 }, { "epoch": 0.007105948142026277, "grad_norm": 0.35446900070732784, "learning_rate": 4.26189005558987e-06, "loss": 0.2339, "step": 230 }, { "epoch": 0.007414902409070897, "grad_norm": 0.3819527072560428, "learning_rate": 4.447189623224213e-06, "loss": 0.2357, "step": 240 }, { "epoch": 0.007723856676115518, "grad_norm": 0.3480466322290397, "learning_rate": 4.632489190858555e-06, "loss": 0.2254, "step": 250 }, { "epoch": 0.008032810943160138, "grad_norm": 0.3194903733187964, "learning_rate": 4.817788758492897e-06, "loss": 0.2377, "step": 260 }, { "epoch": 0.008341765210204759, "grad_norm": 0.29604030153949573, "learning_rate": 5.003088326127239e-06, "loss": 0.2225, "step": 270 }, { "epoch": 0.00865071947724938, "grad_norm": 0.39212973620577857, "learning_rate": 5.188387893761582e-06, "loss": 0.2459, "step": 280 }, { "epoch": 0.008959673744294001, "grad_norm": 0.30359592816187564, "learning_rate": 5.3736874613959236e-06, "loss": 0.2261, "step": 290 }, { "epoch": 0.009268628011338622, "grad_norm": 0.3349637043598593, "learning_rate": 5.558987029030266e-06, "loss": 0.2211, "step": 300 }, { "epoch": 0.009577582278383243, "grad_norm": 0.3942177663321336, "learning_rate": 5.744286596664609e-06, "loss": 0.2318, "step": 310 }, { "epoch": 0.009886536545427862, "grad_norm": 0.349281010677901, "learning_rate": 5.92958616429895e-06, "loss": 0.2257, "step": 320 }, { "epoch": 0.010195490812472483, "grad_norm": 0.36174185858414665, "learning_rate": 6.114885731933293e-06, "loss": 0.2258, "step": 330 }, { "epoch": 0.010504445079517104, "grad_norm": 0.33791988292092384, "learning_rate": 6.3001852995676344e-06, "loss": 0.2244, "step": 340 }, { "epoch": 0.010813399346561725, "grad_norm": 0.36238831779965897, "learning_rate": 6.485484867201976e-06, "loss": 0.217, "step": 350 }, { "epoch": 0.011122353613606346, "grad_norm": 0.4074399279408642, "learning_rate": 6.670784434836319e-06, "loss": 0.2179, "step": 360 }, { "epoch": 0.011431307880650967, "grad_norm": 0.4061444760562626, "learning_rate": 6.856084002470661e-06, "loss": 0.2207, "step": 370 }, { "epoch": 0.011740262147695588, "grad_norm": 0.2998368254360446, "learning_rate": 7.041383570105003e-06, "loss": 0.2242, "step": 380 }, { "epoch": 0.012049216414740209, "grad_norm": 0.3792762067269328, "learning_rate": 7.226683137739345e-06, "loss": 0.221, "step": 390 }, { "epoch": 0.012358170681784828, "grad_norm": 0.38562436397551597, "learning_rate": 7.411982705373688e-06, "loss": 0.2233, "step": 400 }, { "epoch": 0.012667124948829449, "grad_norm": 0.37949063216436724, "learning_rate": 7.59728227300803e-06, "loss": 0.2272, "step": 410 }, { "epoch": 0.01297607921587407, "grad_norm": 0.4070170847933438, "learning_rate": 7.782581840642372e-06, "loss": 0.2201, "step": 420 }, { "epoch": 0.013285033482918691, "grad_norm": 0.3017443172169592, "learning_rate": 7.967881408276715e-06, "loss": 0.2205, "step": 430 }, { "epoch": 0.013593987749963312, "grad_norm": 0.3201901762844856, "learning_rate": 8.153180975911057e-06, "loss": 0.2173, "step": 440 }, { "epoch": 0.013902942017007933, "grad_norm": 0.3253472072116366, "learning_rate": 8.338480543545398e-06, "loss": 0.221, "step": 450 }, { "epoch": 0.014211896284052554, "grad_norm": 0.33183918202637014, "learning_rate": 8.52378011117974e-06, "loss": 0.2151, "step": 460 }, { "epoch": 0.014520850551097173, "grad_norm": 0.3243332278148931, "learning_rate": 8.709079678814083e-06, "loss": 0.2174, "step": 470 }, { "epoch": 0.014829804818141794, "grad_norm": 0.36798410428072925, "learning_rate": 8.894379246448425e-06, "loss": 0.2172, "step": 480 }, { "epoch": 0.015138759085186415, "grad_norm": 0.3261138949645182, "learning_rate": 9.079678814082766e-06, "loss": 0.21, "step": 490 }, { "epoch": 0.015447713352231036, "grad_norm": 0.2632848279909067, "learning_rate": 9.26497838171711e-06, "loss": 0.2133, "step": 500 }, { "epoch": 0.015756667619275657, "grad_norm": 0.3446013399586666, "learning_rate": 9.450277949351451e-06, "loss": 0.2152, "step": 510 }, { "epoch": 0.016065621886320276, "grad_norm": 0.29629460796616464, "learning_rate": 9.635577516985794e-06, "loss": 0.2184, "step": 520 }, { "epoch": 0.0163745761533649, "grad_norm": 0.3014262990944555, "learning_rate": 9.820877084620136e-06, "loss": 0.2171, "step": 530 }, { "epoch": 0.016683530420409518, "grad_norm": 0.32096296401164126, "learning_rate": 1.0006176652254479e-05, "loss": 0.2099, "step": 540 }, { "epoch": 0.01699248468745414, "grad_norm": 0.320451613139758, "learning_rate": 1.019147621988882e-05, "loss": 0.2139, "step": 550 }, { "epoch": 0.01730143895449876, "grad_norm": 0.35135365320921647, "learning_rate": 1.0376775787523164e-05, "loss": 0.2134, "step": 560 }, { "epoch": 0.01761039322154338, "grad_norm": 0.33802984085653676, "learning_rate": 1.0562075355157505e-05, "loss": 0.2099, "step": 570 }, { "epoch": 0.017919347488588002, "grad_norm": 0.30465329323172424, "learning_rate": 1.0747374922791847e-05, "loss": 0.207, "step": 580 }, { "epoch": 0.01822830175563262, "grad_norm": 0.39913719353068217, "learning_rate": 1.093267449042619e-05, "loss": 0.2162, "step": 590 }, { "epoch": 0.018537256022677244, "grad_norm": 0.39279649552480556, "learning_rate": 1.1117974058060532e-05, "loss": 0.2048, "step": 600 }, { "epoch": 0.018846210289721863, "grad_norm": 0.3392053906234814, "learning_rate": 1.1303273625694873e-05, "loss": 0.2108, "step": 610 }, { "epoch": 0.019155164556766486, "grad_norm": 0.2888812954242707, "learning_rate": 1.1488573193329217e-05, "loss": 0.2097, "step": 620 }, { "epoch": 0.019464118823811105, "grad_norm": 0.3372761067600913, "learning_rate": 1.1673872760963558e-05, "loss": 0.2119, "step": 630 }, { "epoch": 0.019773073090855724, "grad_norm": 0.3712557625995704, "learning_rate": 1.18591723285979e-05, "loss": 0.2226, "step": 640 }, { "epoch": 0.020082027357900347, "grad_norm": 0.3197200919411092, "learning_rate": 1.2044471896232243e-05, "loss": 0.2099, "step": 650 }, { "epoch": 0.020390981624944966, "grad_norm": 0.28518041257422966, "learning_rate": 1.2229771463866586e-05, "loss": 0.2092, "step": 660 }, { "epoch": 0.02069993589198959, "grad_norm": 0.28640971911900237, "learning_rate": 1.2415071031500926e-05, "loss": 0.2093, "step": 670 }, { "epoch": 0.021008890159034208, "grad_norm": 0.31681783495811866, "learning_rate": 1.2600370599135269e-05, "loss": 0.2175, "step": 680 }, { "epoch": 0.02131784442607883, "grad_norm": 0.3086403635983181, "learning_rate": 1.2785670166769611e-05, "loss": 0.2145, "step": 690 }, { "epoch": 0.02162679869312345, "grad_norm": 0.279274615260603, "learning_rate": 1.2970969734403952e-05, "loss": 0.2097, "step": 700 }, { "epoch": 0.021935752960168073, "grad_norm": 0.28404939336317203, "learning_rate": 1.3156269302038296e-05, "loss": 0.215, "step": 710 }, { "epoch": 0.022244707227212692, "grad_norm": 0.36193233481706244, "learning_rate": 1.3341568869672637e-05, "loss": 0.2147, "step": 720 }, { "epoch": 0.02255366149425731, "grad_norm": 0.2906216615512196, "learning_rate": 1.352686843730698e-05, "loss": 0.2063, "step": 730 }, { "epoch": 0.022862615761301934, "grad_norm": 0.3710603016488123, "learning_rate": 1.3712168004941322e-05, "loss": 0.2077, "step": 740 }, { "epoch": 0.023171570028346553, "grad_norm": 0.2886159866780826, "learning_rate": 1.3897467572575665e-05, "loss": 0.2093, "step": 750 }, { "epoch": 0.023480524295391176, "grad_norm": 0.3112205868629234, "learning_rate": 1.4082767140210006e-05, "loss": 0.2073, "step": 760 }, { "epoch": 0.023789478562435795, "grad_norm": 0.26528993566514497, "learning_rate": 1.426806670784435e-05, "loss": 0.2107, "step": 770 }, { "epoch": 0.024098432829480418, "grad_norm": 0.29394783645223177, "learning_rate": 1.445336627547869e-05, "loss": 0.2185, "step": 780 }, { "epoch": 0.024407387096525037, "grad_norm": 0.26589042931900475, "learning_rate": 1.4638665843113033e-05, "loss": 0.2132, "step": 790 }, { "epoch": 0.024716341363569656, "grad_norm": 0.266047120102079, "learning_rate": 1.4823965410747376e-05, "loss": 0.2164, "step": 800 }, { "epoch": 0.02502529563061428, "grad_norm": 0.28935549812532074, "learning_rate": 1.5009264978381716e-05, "loss": 0.2111, "step": 810 }, { "epoch": 0.025334249897658898, "grad_norm": 0.3268318837541725, "learning_rate": 1.519456454601606e-05, "loss": 0.2076, "step": 820 }, { "epoch": 0.02564320416470352, "grad_norm": 0.26553688783929824, "learning_rate": 1.53798641136504e-05, "loss": 0.2096, "step": 830 }, { "epoch": 0.02595215843174814, "grad_norm": 0.3230918273181566, "learning_rate": 1.5565163681284744e-05, "loss": 0.2058, "step": 840 }, { "epoch": 0.026261112698792763, "grad_norm": 0.3158040319486062, "learning_rate": 1.5750463248919087e-05, "loss": 0.2144, "step": 850 }, { "epoch": 0.026570066965837382, "grad_norm": 0.42005876152094646, "learning_rate": 1.593576281655343e-05, "loss": 0.2094, "step": 860 }, { "epoch": 0.026879021232882, "grad_norm": 0.29990865839111325, "learning_rate": 1.6121062384187768e-05, "loss": 0.2111, "step": 870 }, { "epoch": 0.027187975499926624, "grad_norm": 0.30161102177218907, "learning_rate": 1.6306361951822114e-05, "loss": 0.2068, "step": 880 }, { "epoch": 0.027496929766971243, "grad_norm": 0.37056174507591677, "learning_rate": 1.6491661519456457e-05, "loss": 0.2107, "step": 890 }, { "epoch": 0.027805884034015866, "grad_norm": 0.3655205446506857, "learning_rate": 1.6676961087090796e-05, "loss": 0.209, "step": 900 }, { "epoch": 0.028114838301060485, "grad_norm": 0.33860684112890227, "learning_rate": 1.6862260654725138e-05, "loss": 0.2075, "step": 910 }, { "epoch": 0.028423792568105108, "grad_norm": 0.25129594692224955, "learning_rate": 1.704756022235948e-05, "loss": 0.2067, "step": 920 }, { "epoch": 0.028732746835149727, "grad_norm": 0.3251631455751991, "learning_rate": 1.7232859789993823e-05, "loss": 0.2052, "step": 930 }, { "epoch": 0.029041701102194346, "grad_norm": 0.363155931575057, "learning_rate": 1.7418159357628166e-05, "loss": 0.2032, "step": 940 }, { "epoch": 0.02935065536923897, "grad_norm": 0.30100529535994425, "learning_rate": 1.7603458925262508e-05, "loss": 0.2031, "step": 950 }, { "epoch": 0.02965960963628359, "grad_norm": 0.3141907193101939, "learning_rate": 1.778875849289685e-05, "loss": 0.2032, "step": 960 }, { "epoch": 0.02996856390332821, "grad_norm": 0.298253799467931, "learning_rate": 1.7974058060531193e-05, "loss": 0.2062, "step": 970 }, { "epoch": 0.03027751817037283, "grad_norm": 0.2913001451283101, "learning_rate": 1.8159357628165532e-05, "loss": 0.2042, "step": 980 }, { "epoch": 0.030586472437417453, "grad_norm": 0.29004478573458653, "learning_rate": 1.8344657195799875e-05, "loss": 0.2115, "step": 990 }, { "epoch": 0.030895426704462072, "grad_norm": 0.3039673938016494, "learning_rate": 1.852995676343422e-05, "loss": 0.2078, "step": 1000 }, { "epoch": 0.03120438097150669, "grad_norm": 0.26992191709125957, "learning_rate": 1.8715256331068563e-05, "loss": 0.2144, "step": 1010 }, { "epoch": 0.031513335238551314, "grad_norm": 0.29224055505601537, "learning_rate": 1.8900555898702902e-05, "loss": 0.2051, "step": 1020 }, { "epoch": 0.03182228950559594, "grad_norm": 0.2813736747873565, "learning_rate": 1.9085855466337245e-05, "loss": 0.2046, "step": 1030 }, { "epoch": 0.03213124377264055, "grad_norm": 0.3617377978103828, "learning_rate": 1.9271155033971588e-05, "loss": 0.2077, "step": 1040 }, { "epoch": 0.032440198039685175, "grad_norm": 0.426903690417414, "learning_rate": 1.945645460160593e-05, "loss": 0.2057, "step": 1050 }, { "epoch": 0.0327491523067298, "grad_norm": 0.26884197701522644, "learning_rate": 1.9641754169240273e-05, "loss": 0.2021, "step": 1060 }, { "epoch": 0.033058106573774414, "grad_norm": 0.3138092113639732, "learning_rate": 1.9827053736874615e-05, "loss": 0.2041, "step": 1070 }, { "epoch": 0.033367060840819036, "grad_norm": 0.26273787982665225, "learning_rate": 2.0012353304508958e-05, "loss": 0.2046, "step": 1080 }, { "epoch": 0.03367601510786366, "grad_norm": 0.3305991889210736, "learning_rate": 2.01976528721433e-05, "loss": 0.2108, "step": 1090 }, { "epoch": 0.03398496937490828, "grad_norm": 0.28132184754515793, "learning_rate": 2.038295243977764e-05, "loss": 0.2024, "step": 1100 }, { "epoch": 0.0342939236419529, "grad_norm": 0.3310218166064453, "learning_rate": 2.0568252007411982e-05, "loss": 0.2077, "step": 1110 }, { "epoch": 0.03460287790899752, "grad_norm": 0.24084016233984354, "learning_rate": 2.0753551575046328e-05, "loss": 0.2304, "step": 1120 }, { "epoch": 0.03491183217604214, "grad_norm": 0.24689565123316048, "learning_rate": 2.0938851142680667e-05, "loss": 0.2084, "step": 1130 }, { "epoch": 0.03522078644308676, "grad_norm": 0.3202170546704081, "learning_rate": 2.112415071031501e-05, "loss": 0.2085, "step": 1140 }, { "epoch": 0.03552974071013138, "grad_norm": 0.220583117467106, "learning_rate": 2.1309450277949352e-05, "loss": 0.2069, "step": 1150 }, { "epoch": 0.035838694977176004, "grad_norm": 0.2080170682936219, "learning_rate": 2.1494749845583694e-05, "loss": 0.2069, "step": 1160 }, { "epoch": 0.03614764924422063, "grad_norm": 0.2627342483578785, "learning_rate": 2.1680049413218033e-05, "loss": 0.2123, "step": 1170 }, { "epoch": 0.03645660351126524, "grad_norm": 0.3358338758591602, "learning_rate": 2.186534898085238e-05, "loss": 0.2178, "step": 1180 }, { "epoch": 0.036765557778309865, "grad_norm": 0.38073592070845597, "learning_rate": 2.2050648548486722e-05, "loss": 0.2054, "step": 1190 }, { "epoch": 0.03707451204535449, "grad_norm": 0.28383460905604635, "learning_rate": 2.2235948116121064e-05, "loss": 0.215, "step": 1200 }, { "epoch": 0.037383466312399104, "grad_norm": 0.32201223136969137, "learning_rate": 2.2421247683755403e-05, "loss": 0.2049, "step": 1210 }, { "epoch": 0.037692420579443726, "grad_norm": 0.2330892975043787, "learning_rate": 2.2606547251389746e-05, "loss": 0.2085, "step": 1220 }, { "epoch": 0.03800137484648835, "grad_norm": 0.26739199739680014, "learning_rate": 2.279184681902409e-05, "loss": 0.207, "step": 1230 }, { "epoch": 0.03831032911353297, "grad_norm": 0.29859132653166187, "learning_rate": 2.2977146386658434e-05, "loss": 0.2101, "step": 1240 }, { "epoch": 0.03861928338057759, "grad_norm": 0.30552113069212283, "learning_rate": 2.3162445954292774e-05, "loss": 0.2095, "step": 1250 }, { "epoch": 0.03892823764762221, "grad_norm": 0.30270640732504084, "learning_rate": 2.3347745521927116e-05, "loss": 0.2125, "step": 1260 }, { "epoch": 0.03923719191466683, "grad_norm": 0.26397966225828484, "learning_rate": 2.353304508956146e-05, "loss": 0.2104, "step": 1270 }, { "epoch": 0.03954614618171145, "grad_norm": 0.28745917243179153, "learning_rate": 2.37183446571958e-05, "loss": 0.208, "step": 1280 }, { "epoch": 0.03985510044875607, "grad_norm": 0.27010837808728094, "learning_rate": 2.390364422483014e-05, "loss": 0.2044, "step": 1290 }, { "epoch": 0.040164054715800694, "grad_norm": 0.2792843380104636, "learning_rate": 2.4088943792464486e-05, "loss": 0.2005, "step": 1300 }, { "epoch": 0.04047300898284532, "grad_norm": 0.22371555206758015, "learning_rate": 2.427424336009883e-05, "loss": 0.2147, "step": 1310 }, { "epoch": 0.04078196324988993, "grad_norm": 0.29757581055889437, "learning_rate": 2.445954292773317e-05, "loss": 0.2028, "step": 1320 }, { "epoch": 0.041090917516934555, "grad_norm": 0.25619509674648794, "learning_rate": 2.464484249536751e-05, "loss": 0.2058, "step": 1330 }, { "epoch": 0.04139987178397918, "grad_norm": 0.2828499187719199, "learning_rate": 2.4830142063001853e-05, "loss": 0.2054, "step": 1340 }, { "epoch": 0.041708826051023794, "grad_norm": 0.23957860940910639, "learning_rate": 2.5015441630636195e-05, "loss": 0.2059, "step": 1350 }, { "epoch": 0.042017780318068416, "grad_norm": 0.2347743056596026, "learning_rate": 2.5200741198270538e-05, "loss": 0.2071, "step": 1360 }, { "epoch": 0.04232673458511304, "grad_norm": 0.2354164254244114, "learning_rate": 2.538604076590488e-05, "loss": 0.2036, "step": 1370 }, { "epoch": 0.04263568885215766, "grad_norm": 0.28253508725613863, "learning_rate": 2.5571340333539223e-05, "loss": 0.2067, "step": 1380 }, { "epoch": 0.04294464311920228, "grad_norm": 0.3359662573400643, "learning_rate": 2.5756639901173565e-05, "loss": 0.2068, "step": 1390 }, { "epoch": 0.0432535973862469, "grad_norm": 0.23342104156491575, "learning_rate": 2.5941939468807904e-05, "loss": 0.2048, "step": 1400 }, { "epoch": 0.04356255165329152, "grad_norm": 0.3288703789832579, "learning_rate": 2.6127239036442247e-05, "loss": 0.2106, "step": 1410 }, { "epoch": 0.043871505920336146, "grad_norm": 0.2311175530684186, "learning_rate": 2.6312538604076593e-05, "loss": 0.2035, "step": 1420 }, { "epoch": 0.04418046018738076, "grad_norm": 0.24902940547184585, "learning_rate": 2.6497838171710935e-05, "loss": 0.2101, "step": 1430 }, { "epoch": 0.044489414454425384, "grad_norm": 0.30318744715773366, "learning_rate": 2.6683137739345275e-05, "loss": 0.2121, "step": 1440 }, { "epoch": 0.04479836872147001, "grad_norm": 0.22206577360841506, "learning_rate": 2.6868437306979617e-05, "loss": 0.2103, "step": 1450 }, { "epoch": 0.04510732298851462, "grad_norm": 0.20158462025933055, "learning_rate": 2.705373687461396e-05, "loss": 0.1993, "step": 1460 }, { "epoch": 0.045416277255559245, "grad_norm": 0.2878798477872172, "learning_rate": 2.7239036442248302e-05, "loss": 0.1977, "step": 1470 }, { "epoch": 0.04572523152260387, "grad_norm": 0.2066911603227598, "learning_rate": 2.7424336009882645e-05, "loss": 0.2153, "step": 1480 }, { "epoch": 0.04603418578964849, "grad_norm": 0.18272356845350932, "learning_rate": 2.7609635577516987e-05, "loss": 0.2029, "step": 1490 }, { "epoch": 0.046343140056693106, "grad_norm": 0.28595402585297963, "learning_rate": 2.779493514515133e-05, "loss": 0.2136, "step": 1500 }, { "epoch": 0.04665209432373773, "grad_norm": 0.20644642694772639, "learning_rate": 2.7980234712785672e-05, "loss": 0.2042, "step": 1510 }, { "epoch": 0.04696104859078235, "grad_norm": 0.277579026973121, "learning_rate": 2.816553428042001e-05, "loss": 0.2034, "step": 1520 }, { "epoch": 0.04727000285782697, "grad_norm": 0.26497694489417567, "learning_rate": 2.8350833848054354e-05, "loss": 0.2031, "step": 1530 }, { "epoch": 0.04757895712487159, "grad_norm": 0.22929988466716691, "learning_rate": 2.85361334156887e-05, "loss": 0.2074, "step": 1540 }, { "epoch": 0.04788791139191621, "grad_norm": 0.26775708641135415, "learning_rate": 2.872143298332304e-05, "loss": 0.2031, "step": 1550 }, { "epoch": 0.048196865658960836, "grad_norm": 0.2239949588049736, "learning_rate": 2.890673255095738e-05, "loss": 0.2064, "step": 1560 }, { "epoch": 0.04850581992600545, "grad_norm": 0.20883167310182313, "learning_rate": 2.9092032118591724e-05, "loss": 0.206, "step": 1570 }, { "epoch": 0.048814774193050074, "grad_norm": 0.19285251737246453, "learning_rate": 2.9277331686226066e-05, "loss": 0.1983, "step": 1580 }, { "epoch": 0.0491237284600947, "grad_norm": 0.2956628718116073, "learning_rate": 2.9462631253860405e-05, "loss": 0.1988, "step": 1590 }, { "epoch": 0.04943268272713931, "grad_norm": 0.250995206935813, "learning_rate": 2.964793082149475e-05, "loss": 0.2004, "step": 1600 }, { "epoch": 0.049741636994183935, "grad_norm": 0.243205872744263, "learning_rate": 2.9833230389129094e-05, "loss": 0.1982, "step": 1610 }, { "epoch": 0.05005059126122856, "grad_norm": 0.25558818024233726, "learning_rate": 2.999999992170622e-05, "loss": 0.2036, "step": 1620 }, { "epoch": 0.05035954552827318, "grad_norm": 0.23149680065562725, "learning_rate": 2.9999990526453555e-05, "loss": 0.2027, "step": 1630 }, { "epoch": 0.050668499795317797, "grad_norm": 0.24011462992841165, "learning_rate": 2.9999965472456043e-05, "loss": 0.203, "step": 1640 }, { "epoch": 0.05097745406236242, "grad_norm": 0.24085468590450437, "learning_rate": 2.9999924759739835e-05, "loss": 0.2064, "step": 1650 }, { "epoch": 0.05128640832940704, "grad_norm": 0.24906142359876013, "learning_rate": 2.999986838834743e-05, "loss": 0.2047, "step": 1660 }, { "epoch": 0.05159536259645166, "grad_norm": 0.42395661256821077, "learning_rate": 2.999979635833768e-05, "loss": 0.2096, "step": 1670 }, { "epoch": 0.05190431686349628, "grad_norm": 0.20504456024115916, "learning_rate": 2.999970866978578e-05, "loss": 0.2003, "step": 1680 }, { "epoch": 0.0522132711305409, "grad_norm": 0.25305112383752665, "learning_rate": 2.999960532278326e-05, "loss": 0.2114, "step": 1690 }, { "epoch": 0.052522225397585526, "grad_norm": 0.2470280003459118, "learning_rate": 2.9999486317438012e-05, "loss": 0.2054, "step": 1700 }, { "epoch": 0.05283117966463014, "grad_norm": 0.17619984944090902, "learning_rate": 2.999935165387427e-05, "loss": 0.1978, "step": 1710 }, { "epoch": 0.053140133931674764, "grad_norm": 0.25629485039850225, "learning_rate": 2.9999201332232602e-05, "loss": 0.2056, "step": 1720 }, { "epoch": 0.05344908819871939, "grad_norm": 0.22095393558240067, "learning_rate": 2.999903535266994e-05, "loss": 0.2036, "step": 1730 }, { "epoch": 0.053758042465764, "grad_norm": 0.3143405292517971, "learning_rate": 2.9998853715359554e-05, "loss": 0.2093, "step": 1740 }, { "epoch": 0.054066996732808625, "grad_norm": 0.2309885475456474, "learning_rate": 2.9998656420491052e-05, "loss": 0.217, "step": 1750 }, { "epoch": 0.05437595099985325, "grad_norm": 0.20760194513380262, "learning_rate": 2.99984434682704e-05, "loss": 0.2166, "step": 1760 }, { "epoch": 0.05468490526689787, "grad_norm": 0.24300116908428407, "learning_rate": 2.9998214858919898e-05, "loss": 0.2033, "step": 1770 }, { "epoch": 0.05499385953394249, "grad_norm": 0.2219233975267644, "learning_rate": 2.9997970592678194e-05, "loss": 0.2021, "step": 1780 }, { "epoch": 0.05530281380098711, "grad_norm": 0.30710149760971917, "learning_rate": 2.999771066980028e-05, "loss": 0.2177, "step": 1790 }, { "epoch": 0.05561176806803173, "grad_norm": 0.16031344634197983, "learning_rate": 2.9997435090557503e-05, "loss": 0.2015, "step": 1800 }, { "epoch": 0.05592072233507635, "grad_norm": 0.32697590864254217, "learning_rate": 2.999714385523754e-05, "loss": 0.2177, "step": 1810 }, { "epoch": 0.05622967660212097, "grad_norm": 0.17668176381250358, "learning_rate": 2.9996836964144416e-05, "loss": 0.1981, "step": 1820 }, { "epoch": 0.05653863086916559, "grad_norm": 0.17168676181872455, "learning_rate": 2.9996514417598497e-05, "loss": 0.2106, "step": 1830 }, { "epoch": 0.056847585136210216, "grad_norm": 0.18135718361194356, "learning_rate": 2.9996176215936498e-05, "loss": 0.1999, "step": 1840 }, { "epoch": 0.05715653940325483, "grad_norm": 0.3034185998566844, "learning_rate": 2.9995822359511474e-05, "loss": 0.2145, "step": 1850 }, { "epoch": 0.057465493670299454, "grad_norm": 0.22175858229853335, "learning_rate": 2.999545284869282e-05, "loss": 0.2067, "step": 1860 }, { "epoch": 0.05777444793734408, "grad_norm": 0.23366649948214785, "learning_rate": 2.9995067683866277e-05, "loss": 0.2005, "step": 1870 }, { "epoch": 0.05808340220438869, "grad_norm": 0.18657800025148788, "learning_rate": 2.999466686543392e-05, "loss": 0.2143, "step": 1880 }, { "epoch": 0.058392356471433315, "grad_norm": 0.19587658784940068, "learning_rate": 2.9994250393814177e-05, "loss": 0.2015, "step": 1890 }, { "epoch": 0.05870131073847794, "grad_norm": 0.1988143078568588, "learning_rate": 2.9993818269441807e-05, "loss": 0.1985, "step": 1900 }, { "epoch": 0.05901026500552256, "grad_norm": 0.31463751837253334, "learning_rate": 2.999337049276791e-05, "loss": 0.2002, "step": 1910 }, { "epoch": 0.05931921927256718, "grad_norm": 0.29579355828477577, "learning_rate": 2.9992907064259932e-05, "loss": 0.2023, "step": 1920 }, { "epoch": 0.0596281735396118, "grad_norm": 0.2299056861396528, "learning_rate": 2.9992427984401645e-05, "loss": 0.2122, "step": 1930 }, { "epoch": 0.05993712780665642, "grad_norm": 0.18186606292999008, "learning_rate": 2.999193325369318e-05, "loss": 0.2, "step": 1940 }, { "epoch": 0.06024608207370104, "grad_norm": 0.21384750204974323, "learning_rate": 2.9991422872650983e-05, "loss": 0.2116, "step": 1950 }, { "epoch": 0.06055503634074566, "grad_norm": 1.6716992559379211, "learning_rate": 2.9990896841807862e-05, "loss": 0.39, "step": 1960 }, { "epoch": 0.06086399060779028, "grad_norm": 38.03714494100377, "learning_rate": 2.9990355161712942e-05, "loss": 0.3536, "step": 1970 }, { "epoch": 0.061172944874834906, "grad_norm": 3.9014531815641784, "learning_rate": 2.9989797832931692e-05, "loss": 0.2543, "step": 1980 }, { "epoch": 0.06148189914187952, "grad_norm": 7.019711079073617, "learning_rate": 2.998922485604592e-05, "loss": 0.302, "step": 1990 }, { "epoch": 0.061790853408924144, "grad_norm": 0.28537204731383636, "learning_rate": 2.9988636231653765e-05, "loss": 0.2307, "step": 2000 }, { "epoch": 0.06209980767596877, "grad_norm": 0.19569138798894245, "learning_rate": 2.99880319603697e-05, "loss": 0.2078, "step": 2010 }, { "epoch": 0.06240876194301338, "grad_norm": 0.20010523604026445, "learning_rate": 2.9987412042824535e-05, "loss": 0.2045, "step": 2020 }, { "epoch": 0.062717716210058, "grad_norm": 0.18477737555488893, "learning_rate": 2.9986776479665414e-05, "loss": 0.2062, "step": 2030 }, { "epoch": 0.06302667047710263, "grad_norm": 0.18012953650966962, "learning_rate": 2.9986125271555814e-05, "loss": 0.2035, "step": 2040 }, { "epoch": 0.06333562474414725, "grad_norm": 0.20312463120985744, "learning_rate": 2.998545841917554e-05, "loss": 0.2039, "step": 2050 }, { "epoch": 0.06364457901119187, "grad_norm": 0.2067516871530281, "learning_rate": 2.9984775923220725e-05, "loss": 0.2038, "step": 2060 }, { "epoch": 0.06395353327823648, "grad_norm": 0.23804247346031, "learning_rate": 2.9984077784403846e-05, "loss": 0.2002, "step": 2070 }, { "epoch": 0.0642624875452811, "grad_norm": 0.17368067775150903, "learning_rate": 2.9983364003453702e-05, "loss": 0.2027, "step": 2080 }, { "epoch": 0.06457144181232573, "grad_norm": 0.16626358367903837, "learning_rate": 2.998263458111541e-05, "loss": 0.2015, "step": 2090 }, { "epoch": 0.06488039607937035, "grad_norm": 0.27621519975999975, "learning_rate": 2.9981889518150445e-05, "loss": 0.2067, "step": 2100 }, { "epoch": 0.06518935034641497, "grad_norm": 0.21024444701085984, "learning_rate": 2.9981128815336574e-05, "loss": 0.1996, "step": 2110 }, { "epoch": 0.0654983046134596, "grad_norm": 0.24426130276969282, "learning_rate": 2.9980352473467915e-05, "loss": 0.2037, "step": 2120 }, { "epoch": 0.06580725888050422, "grad_norm": 0.1477033633417111, "learning_rate": 2.9979560493354907e-05, "loss": 0.2004, "step": 2130 }, { "epoch": 0.06611621314754883, "grad_norm": 0.24107343107321108, "learning_rate": 2.9978752875824302e-05, "loss": 0.2036, "step": 2140 }, { "epoch": 0.06642516741459345, "grad_norm": 0.2187941774851456, "learning_rate": 2.9977929621719197e-05, "loss": 0.2047, "step": 2150 }, { "epoch": 0.06673412168163807, "grad_norm": 0.1992578320035458, "learning_rate": 2.9977090731898995e-05, "loss": 0.1994, "step": 2160 }, { "epoch": 0.0670430759486827, "grad_norm": 0.19806105265419735, "learning_rate": 2.9976236207239426e-05, "loss": 0.1976, "step": 2170 }, { "epoch": 0.06735203021572732, "grad_norm": 0.17567955418535217, "learning_rate": 2.9975366048632544e-05, "loss": 0.2, "step": 2180 }, { "epoch": 0.06766098448277194, "grad_norm": 0.19530939639808778, "learning_rate": 2.9974480256986724e-05, "loss": 0.2001, "step": 2190 }, { "epoch": 0.06796993874981656, "grad_norm": 0.2044638958382385, "learning_rate": 2.997357883322666e-05, "loss": 0.2106, "step": 2200 }, { "epoch": 0.06827889301686117, "grad_norm": 0.18894945695103943, "learning_rate": 2.9972661778293362e-05, "loss": 0.2018, "step": 2210 }, { "epoch": 0.0685878472839058, "grad_norm": 0.1728822949445438, "learning_rate": 2.9971729093144157e-05, "loss": 0.1968, "step": 2220 }, { "epoch": 0.06889680155095042, "grad_norm": 0.1580606167173133, "learning_rate": 2.997078077875269e-05, "loss": 0.2067, "step": 2230 }, { "epoch": 0.06920575581799504, "grad_norm": 0.5254308545963697, "learning_rate": 2.996981683610893e-05, "loss": 0.195, "step": 2240 }, { "epoch": 0.06951471008503966, "grad_norm": 0.17131874970846256, "learning_rate": 2.9968837266219146e-05, "loss": 0.2162, "step": 2250 }, { "epoch": 0.06982366435208429, "grad_norm": 0.16078626988371186, "learning_rate": 2.9967842070105928e-05, "loss": 0.1984, "step": 2260 }, { "epoch": 0.07013261861912891, "grad_norm": 0.1736114792541168, "learning_rate": 2.9966831248808184e-05, "loss": 0.2044, "step": 2270 }, { "epoch": 0.07044157288617352, "grad_norm": 0.18838468255956162, "learning_rate": 2.9965804803381123e-05, "loss": 0.1997, "step": 2280 }, { "epoch": 0.07075052715321814, "grad_norm": 0.1707422151814035, "learning_rate": 2.996476273489627e-05, "loss": 0.1988, "step": 2290 }, { "epoch": 0.07105948142026276, "grad_norm": 0.17611613049009683, "learning_rate": 2.9963705044441453e-05, "loss": 0.2102, "step": 2300 }, { "epoch": 0.07136843568730739, "grad_norm": 0.1620119857718042, "learning_rate": 2.996263173312082e-05, "loss": 0.2023, "step": 2310 }, { "epoch": 0.07167738995435201, "grad_norm": 0.14850403653592215, "learning_rate": 2.9961542802054816e-05, "loss": 0.1987, "step": 2320 }, { "epoch": 0.07198634422139663, "grad_norm": 0.1567391362200369, "learning_rate": 2.9960438252380198e-05, "loss": 0.1988, "step": 2330 }, { "epoch": 0.07229529848844125, "grad_norm": 0.28124291829683645, "learning_rate": 2.995931808525002e-05, "loss": 0.2044, "step": 2340 }, { "epoch": 0.07260425275548586, "grad_norm": 0.19068209274883757, "learning_rate": 2.995818230183364e-05, "loss": 0.198, "step": 2350 }, { "epoch": 0.07291320702253049, "grad_norm": 0.17692328802750804, "learning_rate": 2.995703090331673e-05, "loss": 0.1976, "step": 2360 }, { "epoch": 0.07322216128957511, "grad_norm": 0.21242379957382648, "learning_rate": 2.9955863890901252e-05, "loss": 0.1988, "step": 2370 }, { "epoch": 0.07353111555661973, "grad_norm": 0.173614714476889, "learning_rate": 2.9954681265805465e-05, "loss": 0.1986, "step": 2380 }, { "epoch": 0.07384006982366435, "grad_norm": 0.1861446361803506, "learning_rate": 2.995348302926394e-05, "loss": 0.1985, "step": 2390 }, { "epoch": 0.07414902409070898, "grad_norm": 0.176519042598455, "learning_rate": 2.9952269182527526e-05, "loss": 0.2078, "step": 2400 }, { "epoch": 0.0744579783577536, "grad_norm": 0.2405258805951056, "learning_rate": 2.9951039726863385e-05, "loss": 0.2055, "step": 2410 }, { "epoch": 0.07476693262479821, "grad_norm": 0.17213732654915595, "learning_rate": 2.9949794663554968e-05, "loss": 0.1953, "step": 2420 }, { "epoch": 0.07507588689184283, "grad_norm": 0.21174539141948398, "learning_rate": 2.9948533993902016e-05, "loss": 0.2091, "step": 2430 }, { "epoch": 0.07538484115888745, "grad_norm": 0.17914977361908388, "learning_rate": 2.994725771922056e-05, "loss": 0.1972, "step": 2440 }, { "epoch": 0.07569379542593208, "grad_norm": 0.18681317713452364, "learning_rate": 2.994596584084293e-05, "loss": 0.198, "step": 2450 }, { "epoch": 0.0760027496929767, "grad_norm": 0.1702246734917539, "learning_rate": 2.9944658360117735e-05, "loss": 0.1982, "step": 2460 }, { "epoch": 0.07631170396002132, "grad_norm": 0.20079090661429203, "learning_rate": 2.994333527840988e-05, "loss": 0.2015, "step": 2470 }, { "epoch": 0.07662065822706594, "grad_norm": 0.16681783867686925, "learning_rate": 2.9941996597100553e-05, "loss": 0.2036, "step": 2480 }, { "epoch": 0.07692961249411055, "grad_norm": 0.23888069273818058, "learning_rate": 2.9940642317587222e-05, "loss": 0.2121, "step": 2490 }, { "epoch": 0.07723856676115518, "grad_norm": 0.15612939807020818, "learning_rate": 2.9939272441283643e-05, "loss": 0.1957, "step": 2500 }, { "epoch": 0.0775475210281998, "grad_norm": 0.15896537672170197, "learning_rate": 2.993788696961986e-05, "loss": 0.1983, "step": 2510 }, { "epoch": 0.07785647529524442, "grad_norm": 0.19879597737685326, "learning_rate": 2.993648590404218e-05, "loss": 0.2001, "step": 2520 }, { "epoch": 0.07816542956228904, "grad_norm": 0.213895580920291, "learning_rate": 2.9935069246013204e-05, "loss": 0.1956, "step": 2530 }, { "epoch": 0.07847438382933367, "grad_norm": 0.18008643894889034, "learning_rate": 2.9933636997011812e-05, "loss": 0.1942, "step": 2540 }, { "epoch": 0.07878333809637829, "grad_norm": 0.17288127839945946, "learning_rate": 2.993218915853314e-05, "loss": 0.1944, "step": 2550 }, { "epoch": 0.0790922923634229, "grad_norm": 0.19874886444946327, "learning_rate": 2.993072573208862e-05, "loss": 0.1977, "step": 2560 }, { "epoch": 0.07940124663046752, "grad_norm": 0.20028131905271443, "learning_rate": 2.992924671920595e-05, "loss": 0.1969, "step": 2570 }, { "epoch": 0.07971020089751214, "grad_norm": 0.22741761017119985, "learning_rate": 2.9927752121429085e-05, "loss": 0.1986, "step": 2580 }, { "epoch": 0.08001915516455677, "grad_norm": 0.14892790240624842, "learning_rate": 2.9926241940318275e-05, "loss": 0.1989, "step": 2590 }, { "epoch": 0.08032810943160139, "grad_norm": 0.1812090530263701, "learning_rate": 2.9924716177450013e-05, "loss": 0.2007, "step": 2600 }, { "epoch": 0.08063706369864601, "grad_norm": 0.21981022506889003, "learning_rate": 2.9923174834417073e-05, "loss": 0.1992, "step": 2610 }, { "epoch": 0.08094601796569063, "grad_norm": 0.1806725892016268, "learning_rate": 2.992161791282849e-05, "loss": 0.1985, "step": 2620 }, { "epoch": 0.08125497223273524, "grad_norm": 0.152449578467758, "learning_rate": 2.9920045414309562e-05, "loss": 0.1997, "step": 2630 }, { "epoch": 0.08156392649977987, "grad_norm": 0.16967064167562734, "learning_rate": 2.991845734050184e-05, "loss": 0.2003, "step": 2640 }, { "epoch": 0.08187288076682449, "grad_norm": 0.15273183917648353, "learning_rate": 2.991685369306315e-05, "loss": 0.199, "step": 2650 }, { "epoch": 0.08218183503386911, "grad_norm": 0.1785418164488158, "learning_rate": 2.991523447366756e-05, "loss": 0.2065, "step": 2660 }, { "epoch": 0.08249078930091373, "grad_norm": 0.19119076782838293, "learning_rate": 2.991359968400541e-05, "loss": 0.2043, "step": 2670 }, { "epoch": 0.08279974356795836, "grad_norm": 0.12963083951582324, "learning_rate": 2.991194932578327e-05, "loss": 0.193, "step": 2680 }, { "epoch": 0.08310869783500298, "grad_norm": 0.23260798475158634, "learning_rate": 2.9910283400723987e-05, "loss": 0.1975, "step": 2690 }, { "epoch": 0.08341765210204759, "grad_norm": 0.1453149378730413, "learning_rate": 2.9908601910566645e-05, "loss": 0.196, "step": 2700 }, { "epoch": 0.08372660636909221, "grad_norm": 0.19381474156061762, "learning_rate": 2.9906904857066585e-05, "loss": 0.2085, "step": 2710 }, { "epoch": 0.08403556063613683, "grad_norm": 0.15613430326411726, "learning_rate": 2.9905192241995386e-05, "loss": 0.192, "step": 2720 }, { "epoch": 0.08434451490318146, "grad_norm": 0.15517037231213784, "learning_rate": 2.990346406714088e-05, "loss": 0.1962, "step": 2730 }, { "epoch": 0.08465346917022608, "grad_norm": 0.19071627886237663, "learning_rate": 2.9901720334307126e-05, "loss": 0.1993, "step": 2740 }, { "epoch": 0.0849624234372707, "grad_norm": 0.18480484319556262, "learning_rate": 2.9899961045314455e-05, "loss": 0.2028, "step": 2750 }, { "epoch": 0.08527137770431532, "grad_norm": 0.18673748288484698, "learning_rate": 2.9898186201999404e-05, "loss": 0.194, "step": 2760 }, { "epoch": 0.08558033197135993, "grad_norm": 0.19721048721852688, "learning_rate": 2.9896395806214765e-05, "loss": 0.1937, "step": 2770 }, { "epoch": 0.08588928623840456, "grad_norm": 0.16917472375765735, "learning_rate": 2.9894589859829565e-05, "loss": 0.2035, "step": 2780 }, { "epoch": 0.08619824050544918, "grad_norm": 0.15929564098033613, "learning_rate": 2.9892768364729062e-05, "loss": 0.1959, "step": 2790 }, { "epoch": 0.0865071947724938, "grad_norm": 0.1564281325302, "learning_rate": 2.9890931322814746e-05, "loss": 0.1985, "step": 2800 }, { "epoch": 0.08681614903953842, "grad_norm": 0.18083462652656238, "learning_rate": 2.9889078736004337e-05, "loss": 0.2054, "step": 2810 }, { "epoch": 0.08712510330658305, "grad_norm": 0.15347293774038326, "learning_rate": 2.9887210606231775e-05, "loss": 0.1976, "step": 2820 }, { "epoch": 0.08743405757362767, "grad_norm": 0.14369548444314065, "learning_rate": 2.988532693544724e-05, "loss": 0.1965, "step": 2830 }, { "epoch": 0.08774301184067229, "grad_norm": 0.16755897503768843, "learning_rate": 2.9883427725617127e-05, "loss": 0.1923, "step": 2840 }, { "epoch": 0.0880519661077169, "grad_norm": 0.1766933017480405, "learning_rate": 2.9881512978724054e-05, "loss": 0.1999, "step": 2850 }, { "epoch": 0.08836092037476152, "grad_norm": 0.16335478491515748, "learning_rate": 2.987958269676685e-05, "loss": 0.196, "step": 2860 }, { "epoch": 0.08866987464180615, "grad_norm": 0.1412388792891497, "learning_rate": 2.9877636881760578e-05, "loss": 0.1968, "step": 2870 }, { "epoch": 0.08897882890885077, "grad_norm": 0.28855040940828436, "learning_rate": 2.9875675535736505e-05, "loss": 0.2192, "step": 2880 }, { "epoch": 0.08928778317589539, "grad_norm": 0.19016910033969708, "learning_rate": 2.9873698660742107e-05, "loss": 0.1973, "step": 2890 }, { "epoch": 0.08959673744294001, "grad_norm": 0.19178278954952716, "learning_rate": 2.987170625884109e-05, "loss": 0.2064, "step": 2900 }, { "epoch": 0.08990569170998464, "grad_norm": 0.15224503527765604, "learning_rate": 2.9869698332113346e-05, "loss": 0.1946, "step": 2910 }, { "epoch": 0.09021464597702925, "grad_norm": 0.13514359465615594, "learning_rate": 2.9867674882654983e-05, "loss": 0.193, "step": 2920 }, { "epoch": 0.09052360024407387, "grad_norm": 0.15927437454216875, "learning_rate": 2.9865635912578325e-05, "loss": 0.1944, "step": 2930 }, { "epoch": 0.09083255451111849, "grad_norm": 0.1412858450776157, "learning_rate": 2.9863581424011874e-05, "loss": 0.1954, "step": 2940 }, { "epoch": 0.09114150877816311, "grad_norm": 0.1692424504947583, "learning_rate": 2.9861511419100358e-05, "loss": 0.197, "step": 2950 }, { "epoch": 0.09145046304520774, "grad_norm": 0.15945249012427518, "learning_rate": 2.985942590000468e-05, "loss": 0.1987, "step": 2960 }, { "epoch": 0.09175941731225236, "grad_norm": 0.1509003969152127, "learning_rate": 2.9857324868901958e-05, "loss": 0.1961, "step": 2970 }, { "epoch": 0.09206837157929698, "grad_norm": 0.16215765059346146, "learning_rate": 2.9855208327985486e-05, "loss": 0.1997, "step": 2980 }, { "epoch": 0.09237732584634159, "grad_norm": 0.21205919686914562, "learning_rate": 2.985307627946476e-05, "loss": 0.2031, "step": 2990 }, { "epoch": 0.09268628011338621, "grad_norm": 0.18692914385469056, "learning_rate": 2.9850928725565464e-05, "loss": 0.1932, "step": 3000 }, { "epoch": 0.09299523438043084, "grad_norm": 0.1383747544451328, "learning_rate": 2.9848765668529463e-05, "loss": 0.1947, "step": 3010 }, { "epoch": 0.09330418864747546, "grad_norm": 0.17240403540350424, "learning_rate": 2.984658711061481e-05, "loss": 0.1981, "step": 3020 }, { "epoch": 0.09361314291452008, "grad_norm": 0.13524872029986645, "learning_rate": 2.9844393054095738e-05, "loss": 0.195, "step": 3030 }, { "epoch": 0.0939220971815647, "grad_norm": 0.16721476843562097, "learning_rate": 2.9842183501262664e-05, "loss": 0.1993, "step": 3040 }, { "epoch": 0.09423105144860933, "grad_norm": 0.1781358051524646, "learning_rate": 2.9839958454422165e-05, "loss": 0.1933, "step": 3050 }, { "epoch": 0.09454000571565394, "grad_norm": 0.15964728077303253, "learning_rate": 2.983771791589702e-05, "loss": 0.1956, "step": 3060 }, { "epoch": 0.09484895998269856, "grad_norm": 0.14892102028856208, "learning_rate": 2.9835461888026163e-05, "loss": 0.1952, "step": 3070 }, { "epoch": 0.09515791424974318, "grad_norm": 0.16357065961104184, "learning_rate": 2.9833190373164692e-05, "loss": 0.1942, "step": 3080 }, { "epoch": 0.0954668685167878, "grad_norm": 0.22398475313678176, "learning_rate": 2.9830903373683883e-05, "loss": 0.2065, "step": 3090 }, { "epoch": 0.09577582278383243, "grad_norm": 0.14766190514738772, "learning_rate": 2.982860089197118e-05, "loss": 0.2053, "step": 3100 }, { "epoch": 0.09608477705087705, "grad_norm": 0.3349719693911538, "learning_rate": 2.9826282930430175e-05, "loss": 0.2007, "step": 3110 }, { "epoch": 0.09639373131792167, "grad_norm": 0.1644366645964322, "learning_rate": 2.982394949148063e-05, "loss": 0.1938, "step": 3120 }, { "epoch": 0.09670268558496628, "grad_norm": 0.14493671521828133, "learning_rate": 2.9821600577558457e-05, "loss": 0.2013, "step": 3130 }, { "epoch": 0.0970116398520109, "grad_norm": 0.11524631469635575, "learning_rate": 2.9819236191115738e-05, "loss": 0.1967, "step": 3140 }, { "epoch": 0.09732059411905553, "grad_norm": 0.16223379075585082, "learning_rate": 2.9816856334620686e-05, "loss": 0.2079, "step": 3150 }, { "epoch": 0.09762954838610015, "grad_norm": 0.15651559888502994, "learning_rate": 2.9814461010557683e-05, "loss": 0.2084, "step": 3160 }, { "epoch": 0.09793850265314477, "grad_norm": 0.16731616566868066, "learning_rate": 2.981205022142724e-05, "loss": 0.2032, "step": 3170 }, { "epoch": 0.0982474569201894, "grad_norm": 0.161127910827826, "learning_rate": 2.9809623969746022e-05, "loss": 0.1942, "step": 3180 }, { "epoch": 0.09855641118723402, "grad_norm": 0.14089901940734195, "learning_rate": 2.9807182258046838e-05, "loss": 0.1959, "step": 3190 }, { "epoch": 0.09886536545427863, "grad_norm": 0.13991812367182896, "learning_rate": 2.980472508887863e-05, "loss": 0.1938, "step": 3200 }, { "epoch": 0.09917431972132325, "grad_norm": 0.16151724527912023, "learning_rate": 2.9802252464806477e-05, "loss": 0.2024, "step": 3210 }, { "epoch": 0.09948327398836787, "grad_norm": 0.17232581708425088, "learning_rate": 2.9799764388411597e-05, "loss": 0.2027, "step": 3220 }, { "epoch": 0.0997922282554125, "grad_norm": 0.15221857389640467, "learning_rate": 2.9797260862291332e-05, "loss": 0.2027, "step": 3230 }, { "epoch": 0.10010118252245712, "grad_norm": 0.17076789497687467, "learning_rate": 2.9794741889059154e-05, "loss": 0.1972, "step": 3240 }, { "epoch": 0.10041013678950174, "grad_norm": 0.14404182419615677, "learning_rate": 2.9792207471344675e-05, "loss": 0.1919, "step": 3250 }, { "epoch": 0.10071909105654636, "grad_norm": 0.16945885121142765, "learning_rate": 2.9789657611793598e-05, "loss": 0.2098, "step": 3260 }, { "epoch": 0.10102804532359097, "grad_norm": 0.16013235477830412, "learning_rate": 2.9787092313067775e-05, "loss": 0.196, "step": 3270 }, { "epoch": 0.10133699959063559, "grad_norm": 0.14813863429455534, "learning_rate": 2.9784511577845164e-05, "loss": 0.2013, "step": 3280 }, { "epoch": 0.10164595385768022, "grad_norm": 0.1590236946020729, "learning_rate": 2.9781915408819833e-05, "loss": 0.1977, "step": 3290 }, { "epoch": 0.10195490812472484, "grad_norm": 0.16008756310142963, "learning_rate": 2.9779303808701975e-05, "loss": 0.202, "step": 3300 }, { "epoch": 0.10226386239176946, "grad_norm": 0.14373767651911806, "learning_rate": 2.977667678021788e-05, "loss": 0.1959, "step": 3310 }, { "epoch": 0.10257281665881408, "grad_norm": 0.1410824571705092, "learning_rate": 2.9774034326109948e-05, "loss": 0.1916, "step": 3320 }, { "epoch": 0.1028817709258587, "grad_norm": 0.136187184514505, "learning_rate": 2.977137644913668e-05, "loss": 0.2026, "step": 3330 }, { "epoch": 0.10319072519290332, "grad_norm": 0.1800698776121755, "learning_rate": 2.9768703152072684e-05, "loss": 0.1949, "step": 3340 }, { "epoch": 0.10349967945994794, "grad_norm": 0.1538791403546913, "learning_rate": 2.9766014437708654e-05, "loss": 0.2025, "step": 3350 }, { "epoch": 0.10380863372699256, "grad_norm": 0.1568382831737684, "learning_rate": 2.9763310308851388e-05, "loss": 0.1953, "step": 3360 }, { "epoch": 0.10411758799403718, "grad_norm": 0.13122346416217745, "learning_rate": 2.9760590768323766e-05, "loss": 0.2014, "step": 3370 }, { "epoch": 0.1044265422610818, "grad_norm": 0.1806485020747406, "learning_rate": 2.9757855818964775e-05, "loss": 0.1971, "step": 3380 }, { "epoch": 0.10473549652812643, "grad_norm": 0.13537590653822115, "learning_rate": 2.9755105463629463e-05, "loss": 0.2065, "step": 3390 }, { "epoch": 0.10504445079517105, "grad_norm": 0.13595042959879605, "learning_rate": 2.975233970518898e-05, "loss": 0.1992, "step": 3400 }, { "epoch": 0.10535340506221566, "grad_norm": 0.1509154648140987, "learning_rate": 2.9749558546530542e-05, "loss": 0.1979, "step": 3410 }, { "epoch": 0.10566235932926028, "grad_norm": 0.15428441505969218, "learning_rate": 2.9746761990557453e-05, "loss": 0.2083, "step": 3420 }, { "epoch": 0.1059713135963049, "grad_norm": 0.1397814921074192, "learning_rate": 2.9743950040189086e-05, "loss": 0.2036, "step": 3430 }, { "epoch": 0.10628026786334953, "grad_norm": 0.14912878243829816, "learning_rate": 2.974112269836088e-05, "loss": 0.1956, "step": 3440 }, { "epoch": 0.10658922213039415, "grad_norm": 0.16469373458469788, "learning_rate": 2.9738279968024345e-05, "loss": 0.1947, "step": 3450 }, { "epoch": 0.10689817639743877, "grad_norm": 0.14096722380444265, "learning_rate": 2.9735421852147062e-05, "loss": 0.1955, "step": 3460 }, { "epoch": 0.1072071306644834, "grad_norm": 0.1712139882570602, "learning_rate": 2.9732548353712653e-05, "loss": 0.1977, "step": 3470 }, { "epoch": 0.107516084931528, "grad_norm": 0.182036434354717, "learning_rate": 2.972965947572083e-05, "loss": 0.1984, "step": 3480 }, { "epoch": 0.10782503919857263, "grad_norm": 0.12685205273511865, "learning_rate": 2.9726755221187325e-05, "loss": 0.194, "step": 3490 }, { "epoch": 0.10813399346561725, "grad_norm": 0.19036499223183592, "learning_rate": 2.972383559314395e-05, "loss": 0.1929, "step": 3500 }, { "epoch": 0.10844294773266187, "grad_norm": 0.18080925676285772, "learning_rate": 2.972090059463855e-05, "loss": 0.1979, "step": 3510 }, { "epoch": 0.1087519019997065, "grad_norm": 0.1607497216955787, "learning_rate": 2.971795022873502e-05, "loss": 0.2002, "step": 3520 }, { "epoch": 0.10906085626675112, "grad_norm": 0.14846714634409408, "learning_rate": 2.9714984498513298e-05, "loss": 0.2, "step": 3530 }, { "epoch": 0.10936981053379574, "grad_norm": 0.22370896372238852, "learning_rate": 2.9712003407069355e-05, "loss": 0.2133, "step": 3540 }, { "epoch": 0.10967876480084035, "grad_norm": 0.14800105867230223, "learning_rate": 2.970900695751521e-05, "loss": 0.2006, "step": 3550 }, { "epoch": 0.10998771906788497, "grad_norm": 0.15815792317396923, "learning_rate": 2.9705995152978907e-05, "loss": 0.1983, "step": 3560 }, { "epoch": 0.1102966733349296, "grad_norm": 0.18564579888879634, "learning_rate": 2.9702967996604512e-05, "loss": 0.1941, "step": 3570 }, { "epoch": 0.11060562760197422, "grad_norm": 0.9104975551943203, "learning_rate": 2.9699925491552137e-05, "loss": 0.2026, "step": 3580 }, { "epoch": 0.11091458186901884, "grad_norm": 0.17333001775135767, "learning_rate": 2.9696867640997895e-05, "loss": 0.4843, "step": 3590 }, { "epoch": 0.11122353613606346, "grad_norm": 0.15860053694459797, "learning_rate": 2.9693794448133928e-05, "loss": 0.1914, "step": 3600 }, { "epoch": 0.11153249040310809, "grad_norm": 0.1649188831618067, "learning_rate": 2.96907059161684e-05, "loss": 0.1986, "step": 3610 }, { "epoch": 0.1118414446701527, "grad_norm": 0.17291101908747689, "learning_rate": 2.9687602048325485e-05, "loss": 0.1991, "step": 3620 }, { "epoch": 0.11215039893719732, "grad_norm": 0.14504924285922557, "learning_rate": 2.9684482847845354e-05, "loss": 0.1965, "step": 3630 }, { "epoch": 0.11245935320424194, "grad_norm": 0.15988131838749894, "learning_rate": 2.96813483179842e-05, "loss": 0.198, "step": 3640 }, { "epoch": 0.11276830747128656, "grad_norm": 0.24942612461926558, "learning_rate": 2.9678198462014207e-05, "loss": 0.2018, "step": 3650 }, { "epoch": 0.11307726173833119, "grad_norm": 0.15620629317438656, "learning_rate": 2.9675033283223565e-05, "loss": 0.2005, "step": 3660 }, { "epoch": 0.11338621600537581, "grad_norm": 0.17244272644487615, "learning_rate": 2.9671852784916455e-05, "loss": 0.1994, "step": 3670 }, { "epoch": 0.11369517027242043, "grad_norm": 0.156712293447336, "learning_rate": 2.9668656970413063e-05, "loss": 0.1956, "step": 3680 }, { "epoch": 0.11400412453946504, "grad_norm": 0.13623076388298133, "learning_rate": 2.9665445843049543e-05, "loss": 0.1938, "step": 3690 }, { "epoch": 0.11431307880650966, "grad_norm": 0.1521454012963752, "learning_rate": 2.9662219406178055e-05, "loss": 0.1957, "step": 3700 }, { "epoch": 0.11462203307355429, "grad_norm": 0.17226312492776039, "learning_rate": 2.965897766316672e-05, "loss": 0.195, "step": 3710 }, { "epoch": 0.11493098734059891, "grad_norm": 0.17103571729681852, "learning_rate": 2.965572061739966e-05, "loss": 0.1917, "step": 3720 }, { "epoch": 0.11523994160764353, "grad_norm": 0.14787997443801582, "learning_rate": 2.965244827227695e-05, "loss": 0.1988, "step": 3730 }, { "epoch": 0.11554889587468815, "grad_norm": 0.16499617818184958, "learning_rate": 2.9649160631214656e-05, "loss": 0.1934, "step": 3740 }, { "epoch": 0.11585785014173278, "grad_norm": 0.13859267641190945, "learning_rate": 2.96458576976448e-05, "loss": 0.1992, "step": 3750 }, { "epoch": 0.11616680440877739, "grad_norm": 0.13925810169222835, "learning_rate": 2.9642539475015368e-05, "loss": 0.1974, "step": 3760 }, { "epoch": 0.11647575867582201, "grad_norm": 0.14590521946593527, "learning_rate": 2.9639205966790304e-05, "loss": 0.1952, "step": 3770 }, { "epoch": 0.11678471294286663, "grad_norm": 0.15328983346122252, "learning_rate": 2.963585717644952e-05, "loss": 0.1937, "step": 3780 }, { "epoch": 0.11709366720991125, "grad_norm": 0.1704191032635659, "learning_rate": 2.963249310748888e-05, "loss": 0.207, "step": 3790 }, { "epoch": 0.11740262147695588, "grad_norm": 0.16383535566426977, "learning_rate": 2.9629113763420187e-05, "loss": 0.1938, "step": 3800 }, { "epoch": 0.1177115757440005, "grad_norm": 0.14359121960553015, "learning_rate": 2.9625719147771188e-05, "loss": 0.201, "step": 3810 }, { "epoch": 0.11802053001104512, "grad_norm": 0.14131405219953524, "learning_rate": 2.9622309264085595e-05, "loss": 0.1927, "step": 3820 }, { "epoch": 0.11832948427808973, "grad_norm": 0.16335041544288204, "learning_rate": 2.9618884115923033e-05, "loss": 0.1952, "step": 3830 }, { "epoch": 0.11863843854513435, "grad_norm": 0.2502348189666573, "learning_rate": 2.961544370685908e-05, "loss": 0.2034, "step": 3840 }, { "epoch": 0.11894739281217898, "grad_norm": 0.17031687373001325, "learning_rate": 2.961198804048523e-05, "loss": 0.2008, "step": 3850 }, { "epoch": 0.1192563470792236, "grad_norm": 0.14658450456524985, "learning_rate": 2.9608517120408918e-05, "loss": 0.2015, "step": 3860 }, { "epoch": 0.11956530134626822, "grad_norm": 0.14591308196539304, "learning_rate": 2.960503095025349e-05, "loss": 0.1904, "step": 3870 }, { "epoch": 0.11987425561331284, "grad_norm": 0.13427534668219537, "learning_rate": 2.960152953365822e-05, "loss": 0.2046, "step": 3880 }, { "epoch": 0.12018320988035747, "grad_norm": 0.15168502466304842, "learning_rate": 2.95980128742783e-05, "loss": 0.1979, "step": 3890 }, { "epoch": 0.12049216414740208, "grad_norm": 0.14442317575764707, "learning_rate": 2.9594480975784833e-05, "loss": 0.2129, "step": 3900 }, { "epoch": 0.1208011184144467, "grad_norm": 0.1777783043137642, "learning_rate": 2.9590933841864815e-05, "loss": 0.2052, "step": 3910 }, { "epoch": 0.12111007268149132, "grad_norm": 0.14281818732894197, "learning_rate": 2.958737147622117e-05, "loss": 0.2001, "step": 3920 }, { "epoch": 0.12141902694853594, "grad_norm": 0.1354700247916489, "learning_rate": 2.9583793882572715e-05, "loss": 0.1987, "step": 3930 }, { "epoch": 0.12172798121558057, "grad_norm": 0.17757910288830733, "learning_rate": 2.9580201064654148e-05, "loss": 0.1993, "step": 3940 }, { "epoch": 0.12203693548262519, "grad_norm": 0.14055138458099284, "learning_rate": 2.957659302621608e-05, "loss": 0.1961, "step": 3950 }, { "epoch": 0.12234588974966981, "grad_norm": 0.1419351708319278, "learning_rate": 2.957296977102501e-05, "loss": 0.1925, "step": 3960 }, { "epoch": 0.12265484401671442, "grad_norm": 0.1296616328956848, "learning_rate": 2.9569331302863305e-05, "loss": 0.1991, "step": 3970 }, { "epoch": 0.12296379828375904, "grad_norm": 0.14738144874164277, "learning_rate": 2.9565677625529225e-05, "loss": 0.1906, "step": 3980 }, { "epoch": 0.12327275255080367, "grad_norm": 0.13274251994537717, "learning_rate": 2.956200874283691e-05, "loss": 0.2065, "step": 3990 }, { "epoch": 0.12358170681784829, "grad_norm": 0.18168042419561298, "learning_rate": 2.955832465861637e-05, "loss": 0.1918, "step": 4000 }, { "epoch": 0.12389066108489291, "grad_norm": 0.2078814909170653, "learning_rate": 2.9554625376713477e-05, "loss": 0.203, "step": 4010 }, { "epoch": 0.12419961535193753, "grad_norm": 0.15436707123570834, "learning_rate": 2.9550910900989982e-05, "loss": 0.1918, "step": 4020 }, { "epoch": 0.12450856961898216, "grad_norm": 0.15422646051605707, "learning_rate": 2.9547181235323485e-05, "loss": 0.194, "step": 4030 }, { "epoch": 0.12481752388602677, "grad_norm": 0.13549992108306505, "learning_rate": 2.9543436383607447e-05, "loss": 0.1919, "step": 4040 }, { "epoch": 0.1251264781530714, "grad_norm": 0.14308198767398161, "learning_rate": 2.953967634975118e-05, "loss": 0.1994, "step": 4050 }, { "epoch": 0.125435432420116, "grad_norm": 0.20628750095306617, "learning_rate": 2.9535901137679848e-05, "loss": 0.2066, "step": 4060 }, { "epoch": 0.12574438668716062, "grad_norm": 0.16292002289397242, "learning_rate": 2.953211075133447e-05, "loss": 0.2044, "step": 4070 }, { "epoch": 0.12605334095420526, "grad_norm": 0.16507262758091465, "learning_rate": 2.9528305194671878e-05, "loss": 0.1915, "step": 4080 }, { "epoch": 0.12636229522124987, "grad_norm": 0.13034214436998967, "learning_rate": 2.9524484471664765e-05, "loss": 0.2021, "step": 4090 }, { "epoch": 0.1266712494882945, "grad_norm": 0.14593293004873178, "learning_rate": 2.9520648586301652e-05, "loss": 0.2032, "step": 4100 }, { "epoch": 0.1269802037553391, "grad_norm": 0.12500323412729464, "learning_rate": 2.951679754258688e-05, "loss": 0.1936, "step": 4110 }, { "epoch": 0.12728915802238375, "grad_norm": 0.1400278042626436, "learning_rate": 2.9512931344540617e-05, "loss": 0.1976, "step": 4120 }, { "epoch": 0.12759811228942836, "grad_norm": 0.16049356004715562, "learning_rate": 2.9509049996198863e-05, "loss": 0.1941, "step": 4130 }, { "epoch": 0.12790706655647296, "grad_norm": 0.1484062143138434, "learning_rate": 2.9505153501613407e-05, "loss": 0.1953, "step": 4140 }, { "epoch": 0.1282160208235176, "grad_norm": 0.14015163193632005, "learning_rate": 2.950124186485188e-05, "loss": 0.193, "step": 4150 }, { "epoch": 0.1285249750905622, "grad_norm": 0.1231425362581919, "learning_rate": 2.9497315089997702e-05, "loss": 0.2006, "step": 4160 }, { "epoch": 0.12883392935760685, "grad_norm": 0.12528784539325993, "learning_rate": 2.9493373181150106e-05, "loss": 0.1948, "step": 4170 }, { "epoch": 0.12914288362465146, "grad_norm": 0.11966360889407414, "learning_rate": 2.9489416142424104e-05, "loss": 0.2001, "step": 4180 }, { "epoch": 0.1294518378916961, "grad_norm": 0.14438982807807982, "learning_rate": 2.948544397795052e-05, "loss": 0.1942, "step": 4190 }, { "epoch": 0.1297607921587407, "grad_norm": 0.13507913856323525, "learning_rate": 2.948145669187597e-05, "loss": 0.1988, "step": 4200 }, { "epoch": 0.1300697464257853, "grad_norm": 0.13072933322754932, "learning_rate": 2.9477454288362847e-05, "loss": 0.2029, "step": 4210 }, { "epoch": 0.13037870069282995, "grad_norm": 0.1346665945344133, "learning_rate": 2.947343677158933e-05, "loss": 0.1977, "step": 4220 }, { "epoch": 0.13068765495987456, "grad_norm": 0.14419031028708462, "learning_rate": 2.9469404145749366e-05, "loss": 0.195, "step": 4230 }, { "epoch": 0.1309966092269192, "grad_norm": 0.15289759763473756, "learning_rate": 2.9465356415052687e-05, "loss": 0.1933, "step": 4240 }, { "epoch": 0.1313055634939638, "grad_norm": 0.14879122180002222, "learning_rate": 2.946129358372479e-05, "loss": 0.1928, "step": 4250 }, { "epoch": 0.13161451776100844, "grad_norm": 0.1233182967420781, "learning_rate": 2.9457215656006927e-05, "loss": 0.1913, "step": 4260 }, { "epoch": 0.13192347202805305, "grad_norm": 0.13905927812327482, "learning_rate": 2.945312263615612e-05, "loss": 0.2008, "step": 4270 }, { "epoch": 0.13223242629509765, "grad_norm": 0.1480347048625918, "learning_rate": 2.944901452844514e-05, "loss": 0.1922, "step": 4280 }, { "epoch": 0.1325413805621423, "grad_norm": 0.1759649098608902, "learning_rate": 2.9444891337162517e-05, "loss": 0.1962, "step": 4290 }, { "epoch": 0.1328503348291869, "grad_norm": 0.15444515511292223, "learning_rate": 2.944075306661251e-05, "loss": 0.202, "step": 4300 }, { "epoch": 0.13315928909623154, "grad_norm": 0.17237129521975675, "learning_rate": 2.9436599721115143e-05, "loss": 0.1935, "step": 4310 }, { "epoch": 0.13346824336327615, "grad_norm": 0.12984196981989796, "learning_rate": 2.9432431305006147e-05, "loss": 0.194, "step": 4320 }, { "epoch": 0.13377719763032078, "grad_norm": 0.13269341815635174, "learning_rate": 2.9428247822637015e-05, "loss": 0.203, "step": 4330 }, { "epoch": 0.1340861518973654, "grad_norm": 0.15149059382961613, "learning_rate": 2.942404927837495e-05, "loss": 0.191, "step": 4340 }, { "epoch": 0.13439510616441, "grad_norm": 0.1148468815620595, "learning_rate": 2.9419835676602893e-05, "loss": 0.1937, "step": 4350 }, { "epoch": 0.13470406043145464, "grad_norm": 0.13918237186519886, "learning_rate": 2.9415607021719482e-05, "loss": 0.2023, "step": 4360 }, { "epoch": 0.13501301469849925, "grad_norm": 0.1335764168362304, "learning_rate": 2.9411363318139083e-05, "loss": 0.2014, "step": 4370 }, { "epoch": 0.13532196896554388, "grad_norm": 0.1228777211146217, "learning_rate": 2.940710457029178e-05, "loss": 0.2, "step": 4380 }, { "epoch": 0.1356309232325885, "grad_norm": 0.13309593839884942, "learning_rate": 2.940283078262334e-05, "loss": 0.1916, "step": 4390 }, { "epoch": 0.13593987749963313, "grad_norm": 0.12294257730806753, "learning_rate": 2.939854195959525e-05, "loss": 0.199, "step": 4400 }, { "epoch": 0.13624883176667774, "grad_norm": 0.14335373300838533, "learning_rate": 2.939423810568468e-05, "loss": 0.1966, "step": 4410 }, { "epoch": 0.13655778603372234, "grad_norm": 0.12813785744442807, "learning_rate": 2.938991922538451e-05, "loss": 0.1926, "step": 4420 }, { "epoch": 0.13686674030076698, "grad_norm": 0.13376844823268128, "learning_rate": 2.9385585323203274e-05, "loss": 0.1934, "step": 4430 }, { "epoch": 0.1371756945678116, "grad_norm": 0.13510587816156738, "learning_rate": 2.938123640366522e-05, "loss": 0.1995, "step": 4440 }, { "epoch": 0.13748464883485623, "grad_norm": 0.14932950286836474, "learning_rate": 2.9376872471310247e-05, "loss": 0.1961, "step": 4450 }, { "epoch": 0.13779360310190084, "grad_norm": 0.12340399413892836, "learning_rate": 2.937249353069395e-05, "loss": 0.1902, "step": 4460 }, { "epoch": 0.13810255736894547, "grad_norm": 0.12742383096621718, "learning_rate": 2.936809958638757e-05, "loss": 0.1936, "step": 4470 }, { "epoch": 0.13841151163599008, "grad_norm": 0.13843377169803955, "learning_rate": 2.9363690642978025e-05, "loss": 0.1919, "step": 4480 }, { "epoch": 0.1387204659030347, "grad_norm": 0.12772195973728676, "learning_rate": 2.935926670506789e-05, "loss": 0.1933, "step": 4490 }, { "epoch": 0.13902942017007933, "grad_norm": 0.14589049855962488, "learning_rate": 2.9354827777275375e-05, "loss": 0.1933, "step": 4500 }, { "epoch": 0.13933837443712394, "grad_norm": 0.11227430428451665, "learning_rate": 2.9350373864234366e-05, "loss": 0.1909, "step": 4510 }, { "epoch": 0.13964732870416857, "grad_norm": 0.13853166961303737, "learning_rate": 2.9345904970594374e-05, "loss": 0.2041, "step": 4520 }, { "epoch": 0.13995628297121318, "grad_norm": 0.12350643169399671, "learning_rate": 2.9341421101020552e-05, "loss": 0.1996, "step": 4530 }, { "epoch": 0.14026523723825782, "grad_norm": 0.14290175255995127, "learning_rate": 2.933692226019369e-05, "loss": 0.1924, "step": 4540 }, { "epoch": 0.14057419150530243, "grad_norm": 0.12829786899010034, "learning_rate": 2.9332408452810196e-05, "loss": 0.1995, "step": 4550 }, { "epoch": 0.14088314577234703, "grad_norm": 0.14530621971898633, "learning_rate": 2.932787968358212e-05, "loss": 0.193, "step": 4560 }, { "epoch": 0.14119210003939167, "grad_norm": 0.12771847498386088, "learning_rate": 2.9323335957237118e-05, "loss": 0.202, "step": 4570 }, { "epoch": 0.14150105430643628, "grad_norm": 0.11736054576554526, "learning_rate": 2.931877727851846e-05, "loss": 0.1918, "step": 4580 }, { "epoch": 0.14181000857348092, "grad_norm": 0.13362073917143064, "learning_rate": 2.931420365218503e-05, "loss": 0.1989, "step": 4590 }, { "epoch": 0.14211896284052553, "grad_norm": 0.16854876515549833, "learning_rate": 2.9309615083011322e-05, "loss": 0.2035, "step": 4600 }, { "epoch": 0.14242791710757016, "grad_norm": 0.12606958463728377, "learning_rate": 2.9305011575787412e-05, "loss": 0.1947, "step": 4610 }, { "epoch": 0.14273687137461477, "grad_norm": 0.11958865479873768, "learning_rate": 2.9300393135318977e-05, "loss": 0.1971, "step": 4620 }, { "epoch": 0.14304582564165938, "grad_norm": 0.129237378103935, "learning_rate": 2.9295759766427296e-05, "loss": 0.1941, "step": 4630 }, { "epoch": 0.14335477990870402, "grad_norm": 0.13016163572306816, "learning_rate": 2.929111147394921e-05, "loss": 0.1982, "step": 4640 }, { "epoch": 0.14366373417574863, "grad_norm": 0.15129061732266857, "learning_rate": 2.928644826273716e-05, "loss": 0.2065, "step": 4650 }, { "epoch": 0.14397268844279326, "grad_norm": 0.1680228209888944, "learning_rate": 2.9281770137659147e-05, "loss": 0.1965, "step": 4660 }, { "epoch": 0.14428164270983787, "grad_norm": 0.1347795092156423, "learning_rate": 2.9277077103598745e-05, "loss": 0.2065, "step": 4670 }, { "epoch": 0.1445905969768825, "grad_norm": 0.13761617881117794, "learning_rate": 2.9272369165455092e-05, "loss": 0.1952, "step": 4680 }, { "epoch": 0.14489955124392712, "grad_norm": 0.11483309292959085, "learning_rate": 2.9267646328142888e-05, "loss": 0.2062, "step": 4690 }, { "epoch": 0.14520850551097172, "grad_norm": 0.14810852616214162, "learning_rate": 2.926290859659238e-05, "loss": 0.1935, "step": 4700 }, { "epoch": 0.14551745977801636, "grad_norm": 0.12469037615780773, "learning_rate": 2.9258155975749362e-05, "loss": 0.1918, "step": 4710 }, { "epoch": 0.14582641404506097, "grad_norm": 0.1408670467987656, "learning_rate": 2.9253388470575185e-05, "loss": 0.1927, "step": 4720 }, { "epoch": 0.1461353683121056, "grad_norm": 0.1363354513340469, "learning_rate": 2.9248606086046722e-05, "loss": 0.1995, "step": 4730 }, { "epoch": 0.14644432257915022, "grad_norm": 0.130509753951675, "learning_rate": 2.9243808827156385e-05, "loss": 0.1864, "step": 4740 }, { "epoch": 0.14675327684619485, "grad_norm": 0.11391094065019194, "learning_rate": 2.923899669891212e-05, "loss": 0.1981, "step": 4750 }, { "epoch": 0.14706223111323946, "grad_norm": 0.11044888884253695, "learning_rate": 2.9234169706337384e-05, "loss": 0.1908, "step": 4760 }, { "epoch": 0.14737118538028407, "grad_norm": 0.12922382038899408, "learning_rate": 2.922932785447116e-05, "loss": 0.194, "step": 4770 }, { "epoch": 0.1476801396473287, "grad_norm": 0.127886291516086, "learning_rate": 2.9224471148367936e-05, "loss": 0.1961, "step": 4780 }, { "epoch": 0.14798909391437332, "grad_norm": 0.12355832270568426, "learning_rate": 2.9219599593097712e-05, "loss": 0.1924, "step": 4790 }, { "epoch": 0.14829804818141795, "grad_norm": 0.1326979789376945, "learning_rate": 2.921471319374599e-05, "loss": 0.1983, "step": 4800 }, { "epoch": 0.14860700244846256, "grad_norm": 0.11511939455576024, "learning_rate": 2.920981195541376e-05, "loss": 0.1974, "step": 4810 }, { "epoch": 0.1489159567155072, "grad_norm": 0.1342465951058469, "learning_rate": 2.9204895883217505e-05, "loss": 0.2072, "step": 4820 }, { "epoch": 0.1492249109825518, "grad_norm": 0.12069929909097779, "learning_rate": 2.9199964982289204e-05, "loss": 0.1938, "step": 4830 }, { "epoch": 0.14953386524959641, "grad_norm": 0.16688076554041342, "learning_rate": 2.919501925777631e-05, "loss": 0.192, "step": 4840 }, { "epoch": 0.14984281951664105, "grad_norm": 0.11696584922616346, "learning_rate": 2.9190058714841742e-05, "loss": 0.1915, "step": 4850 }, { "epoch": 0.15015177378368566, "grad_norm": 0.1552950204583272, "learning_rate": 2.9185083358663897e-05, "loss": 0.1963, "step": 4860 }, { "epoch": 0.1504607280507303, "grad_norm": 0.12285118837217292, "learning_rate": 2.9180093194436633e-05, "loss": 0.2041, "step": 4870 }, { "epoch": 0.1507696823177749, "grad_norm": 0.1354730699847804, "learning_rate": 2.9175088227369273e-05, "loss": 0.1959, "step": 4880 }, { "epoch": 0.15107863658481954, "grad_norm": 0.1066152050049423, "learning_rate": 2.917006846268658e-05, "loss": 0.1941, "step": 4890 }, { "epoch": 0.15138759085186415, "grad_norm": 0.1465495271752134, "learning_rate": 2.9165033905628772e-05, "loss": 0.1939, "step": 4900 }, { "epoch": 0.15169654511890876, "grad_norm": 0.15692246190599393, "learning_rate": 2.9159984561451515e-05, "loss": 0.1893, "step": 4910 }, { "epoch": 0.1520054993859534, "grad_norm": 0.11807608098513886, "learning_rate": 2.9154920435425895e-05, "loss": 0.1918, "step": 4920 }, { "epoch": 0.152314453652998, "grad_norm": 0.13409966056661382, "learning_rate": 2.9149841532838448e-05, "loss": 0.1912, "step": 4930 }, { "epoch": 0.15262340792004264, "grad_norm": 0.18083372810012097, "learning_rate": 2.9144747858991122e-05, "loss": 0.1954, "step": 4940 }, { "epoch": 0.15293236218708725, "grad_norm": 0.11964200044359867, "learning_rate": 2.9139639419201296e-05, "loss": 0.1945, "step": 4950 }, { "epoch": 0.1532413164541319, "grad_norm": 0.13386397016898569, "learning_rate": 2.9134516218801747e-05, "loss": 0.1958, "step": 4960 }, { "epoch": 0.1535502707211765, "grad_norm": 0.15176520812335212, "learning_rate": 2.912937826314068e-05, "loss": 0.1939, "step": 4970 }, { "epoch": 0.1538592249882211, "grad_norm": 0.13802838419756916, "learning_rate": 2.912422555758169e-05, "loss": 0.1964, "step": 4980 }, { "epoch": 0.15416817925526574, "grad_norm": 0.22709981781596195, "learning_rate": 2.9119058107503774e-05, "loss": 0.2017, "step": 4990 }, { "epoch": 0.15447713352231035, "grad_norm": 0.30226292741230987, "learning_rate": 2.911387591830132e-05, "loss": 0.1966, "step": 5000 }, { "epoch": 0.154786087789355, "grad_norm": 0.1627564205425172, "learning_rate": 2.910867899538411e-05, "loss": 0.1989, "step": 5010 }, { "epoch": 0.1550950420563996, "grad_norm": 0.1317254809615595, "learning_rate": 2.910346734417729e-05, "loss": 0.1999, "step": 5020 }, { "epoch": 0.15540399632344423, "grad_norm": 0.13719263854058972, "learning_rate": 2.9098240970121395e-05, "loss": 0.1899, "step": 5030 }, { "epoch": 0.15571295059048884, "grad_norm": 0.13267611025762924, "learning_rate": 2.9092999878672326e-05, "loss": 0.1971, "step": 5040 }, { "epoch": 0.15602190485753345, "grad_norm": 0.14416703153906263, "learning_rate": 2.9087744075301347e-05, "loss": 0.1927, "step": 5050 }, { "epoch": 0.1563308591245781, "grad_norm": 0.12210235810758614, "learning_rate": 2.9082473565495087e-05, "loss": 0.1923, "step": 5060 }, { "epoch": 0.1566398133916227, "grad_norm": 0.11791810636207076, "learning_rate": 2.907718835475551e-05, "loss": 0.195, "step": 5070 }, { "epoch": 0.15694876765866733, "grad_norm": 0.12095560677835107, "learning_rate": 2.9071888448599947e-05, "loss": 0.1941, "step": 5080 }, { "epoch": 0.15725772192571194, "grad_norm": 0.12912300332818374, "learning_rate": 2.9066573852561047e-05, "loss": 0.1915, "step": 5090 }, { "epoch": 0.15756667619275658, "grad_norm": 0.13215524078407134, "learning_rate": 2.906124457218682e-05, "loss": 0.1907, "step": 5100 }, { "epoch": 0.1578756304598012, "grad_norm": 0.17097794726193138, "learning_rate": 2.9055900613040588e-05, "loss": 0.1978, "step": 5110 }, { "epoch": 0.1581845847268458, "grad_norm": 0.12947782531420673, "learning_rate": 2.9050541980701002e-05, "loss": 0.2019, "step": 5120 }, { "epoch": 0.15849353899389043, "grad_norm": 0.13748652286867397, "learning_rate": 2.9045168680762023e-05, "loss": 0.1963, "step": 5130 }, { "epoch": 0.15880249326093504, "grad_norm": 0.13779167566436834, "learning_rate": 2.903978071883294e-05, "loss": 0.1994, "step": 5140 }, { "epoch": 0.15911144752797968, "grad_norm": 0.13634464708954122, "learning_rate": 2.903437810053833e-05, "loss": 0.1934, "step": 5150 }, { "epoch": 0.15942040179502429, "grad_norm": 0.13717458947995578, "learning_rate": 2.9028960831518075e-05, "loss": 0.1944, "step": 5160 }, { "epoch": 0.15972935606206892, "grad_norm": 0.12646264871030233, "learning_rate": 2.902352891742737e-05, "loss": 0.1911, "step": 5170 }, { "epoch": 0.16003831032911353, "grad_norm": 0.13801332842129482, "learning_rate": 2.9018082363936667e-05, "loss": 0.1917, "step": 5180 }, { "epoch": 0.16034726459615814, "grad_norm": 0.1094078051116294, "learning_rate": 2.901262117673172e-05, "loss": 0.1911, "step": 5190 }, { "epoch": 0.16065621886320278, "grad_norm": 0.13287386784850339, "learning_rate": 2.9007145361513555e-05, "loss": 0.1952, "step": 5200 }, { "epoch": 0.16096517313024739, "grad_norm": 0.1053642665870786, "learning_rate": 2.9001654923998472e-05, "loss": 0.1949, "step": 5210 }, { "epoch": 0.16127412739729202, "grad_norm": 0.13283196358546498, "learning_rate": 2.899614986991803e-05, "loss": 0.1928, "step": 5220 }, { "epoch": 0.16158308166433663, "grad_norm": 0.10594792098557188, "learning_rate": 2.899063020501904e-05, "loss": 0.196, "step": 5230 }, { "epoch": 0.16189203593138127, "grad_norm": 0.12056290103985914, "learning_rate": 2.8985095935063592e-05, "loss": 0.1863, "step": 5240 }, { "epoch": 0.16220099019842588, "grad_norm": 0.13276276324563877, "learning_rate": 2.8979547065828986e-05, "loss": 0.1939, "step": 5250 }, { "epoch": 0.16250994446547048, "grad_norm": 0.11880304982202072, "learning_rate": 2.8973983603107795e-05, "loss": 0.1882, "step": 5260 }, { "epoch": 0.16281889873251512, "grad_norm": 0.13647860297794587, "learning_rate": 2.8968405552707805e-05, "loss": 0.1899, "step": 5270 }, { "epoch": 0.16312785299955973, "grad_norm": 0.13737173617789297, "learning_rate": 2.8962812920452035e-05, "loss": 0.1976, "step": 5280 }, { "epoch": 0.16343680726660437, "grad_norm": 0.14560561251250137, "learning_rate": 2.8957205712178734e-05, "loss": 0.1922, "step": 5290 }, { "epoch": 0.16374576153364898, "grad_norm": 0.12816948689448399, "learning_rate": 2.8951583933741365e-05, "loss": 0.1952, "step": 5300 }, { "epoch": 0.1640547158006936, "grad_norm": 0.13875960915862645, "learning_rate": 2.8945947591008584e-05, "loss": 0.1914, "step": 5310 }, { "epoch": 0.16436367006773822, "grad_norm": 0.12147448891632738, "learning_rate": 2.894029668986428e-05, "loss": 0.2001, "step": 5320 }, { "epoch": 0.16467262433478283, "grad_norm": 0.14172848510381716, "learning_rate": 2.8934631236207518e-05, "loss": 0.2005, "step": 5330 }, { "epoch": 0.16498157860182747, "grad_norm": 0.11965777792428922, "learning_rate": 2.8928951235952567e-05, "loss": 0.202, "step": 5340 }, { "epoch": 0.16529053286887208, "grad_norm": 0.12556181014437173, "learning_rate": 2.892325669502886e-05, "loss": 0.1943, "step": 5350 }, { "epoch": 0.1655994871359167, "grad_norm": 0.12988258572700673, "learning_rate": 2.8917547619381046e-05, "loss": 0.1929, "step": 5360 }, { "epoch": 0.16590844140296132, "grad_norm": 0.1591002578274483, "learning_rate": 2.8911824014968914e-05, "loss": 0.1923, "step": 5370 }, { "epoch": 0.16621739567000596, "grad_norm": 0.12842753220093361, "learning_rate": 2.890608588776743e-05, "loss": 0.1912, "step": 5380 }, { "epoch": 0.16652634993705057, "grad_norm": 0.16237973177755397, "learning_rate": 2.890033324376673e-05, "loss": 0.1982, "step": 5390 }, { "epoch": 0.16683530420409518, "grad_norm": 0.13252696464562266, "learning_rate": 2.8894566088972086e-05, "loss": 0.1941, "step": 5400 }, { "epoch": 0.1671442584711398, "grad_norm": 0.1562459648242363, "learning_rate": 2.8888784429403942e-05, "loss": 0.1951, "step": 5410 }, { "epoch": 0.16745321273818442, "grad_norm": 0.19214897406108242, "learning_rate": 2.8882988271097867e-05, "loss": 0.1944, "step": 5420 }, { "epoch": 0.16776216700522906, "grad_norm": 0.11645411992655547, "learning_rate": 2.8877177620104563e-05, "loss": 0.1976, "step": 5430 }, { "epoch": 0.16807112127227367, "grad_norm": 0.14830738696497464, "learning_rate": 2.8871352482489867e-05, "loss": 0.1943, "step": 5440 }, { "epoch": 0.1683800755393183, "grad_norm": 0.1476613773833638, "learning_rate": 2.886551286433475e-05, "loss": 0.1988, "step": 5450 }, { "epoch": 0.1686890298063629, "grad_norm": 0.363023628817013, "learning_rate": 2.8859658771735275e-05, "loss": 0.2041, "step": 5460 }, { "epoch": 0.16899798407340752, "grad_norm": 0.12877823250892356, "learning_rate": 2.885379021080264e-05, "loss": 0.1883, "step": 5470 }, { "epoch": 0.16930693834045216, "grad_norm": 0.16072479476005033, "learning_rate": 2.884790718766313e-05, "loss": 0.1957, "step": 5480 }, { "epoch": 0.16961589260749677, "grad_norm": 0.10660898088117149, "learning_rate": 2.8842009708458134e-05, "loss": 0.1879, "step": 5490 }, { "epoch": 0.1699248468745414, "grad_norm": 0.11287940246473271, "learning_rate": 2.8836097779344124e-05, "loss": 0.1967, "step": 5500 }, { "epoch": 0.170233801141586, "grad_norm": 0.1392226076869493, "learning_rate": 2.8830171406492675e-05, "loss": 0.1961, "step": 5510 }, { "epoch": 0.17054275540863065, "grad_norm": 0.14186426823834117, "learning_rate": 2.882423059609042e-05, "loss": 0.1909, "step": 5520 }, { "epoch": 0.17085170967567526, "grad_norm": 0.11802002523931364, "learning_rate": 2.8818275354339076e-05, "loss": 0.189, "step": 5530 }, { "epoch": 0.17116066394271987, "grad_norm": 0.1544264170886084, "learning_rate": 2.881230568745542e-05, "loss": 0.201, "step": 5540 }, { "epoch": 0.1714696182097645, "grad_norm": 0.1390375900467053, "learning_rate": 2.8806321601671283e-05, "loss": 0.2041, "step": 5550 }, { "epoch": 0.1717785724768091, "grad_norm": 0.13144810014781408, "learning_rate": 2.8800323103233562e-05, "loss": 0.1902, "step": 5560 }, { "epoch": 0.17208752674385375, "grad_norm": 0.4660580390716817, "learning_rate": 2.8794310198404187e-05, "loss": 0.1906, "step": 5570 }, { "epoch": 0.17239648101089836, "grad_norm": 0.12437154731352344, "learning_rate": 2.8788282893460132e-05, "loss": 0.2117, "step": 5580 }, { "epoch": 0.172705435277943, "grad_norm": 0.12213570462157555, "learning_rate": 2.8782241194693405e-05, "loss": 0.1932, "step": 5590 }, { "epoch": 0.1730143895449876, "grad_norm": 0.11938574410310657, "learning_rate": 2.877618510841104e-05, "loss": 0.1844, "step": 5600 }, { "epoch": 0.1733233438120322, "grad_norm": 0.18758761384669917, "learning_rate": 2.8770114640935082e-05, "loss": 0.1992, "step": 5610 }, { "epoch": 0.17363229807907685, "grad_norm": 0.1323353027887345, "learning_rate": 2.8764029798602595e-05, "loss": 0.1934, "step": 5620 }, { "epoch": 0.17394125234612146, "grad_norm": 0.13036359195874733, "learning_rate": 2.8757930587765656e-05, "loss": 0.1968, "step": 5630 }, { "epoch": 0.1742502066131661, "grad_norm": 0.1437468757002543, "learning_rate": 2.8751817014791328e-05, "loss": 0.1937, "step": 5640 }, { "epoch": 0.1745591608802107, "grad_norm": 0.11855881753153365, "learning_rate": 2.8745689086061676e-05, "loss": 0.1906, "step": 5650 }, { "epoch": 0.17486811514725534, "grad_norm": 0.14122479545822594, "learning_rate": 2.8739546807973755e-05, "loss": 0.1881, "step": 5660 }, { "epoch": 0.17517706941429995, "grad_norm": 0.15249348073207336, "learning_rate": 2.8733390186939583e-05, "loss": 0.1969, "step": 5670 }, { "epoch": 0.17548602368134458, "grad_norm": 0.1281228020957653, "learning_rate": 2.8727219229386165e-05, "loss": 0.1902, "step": 5680 }, { "epoch": 0.1757949779483892, "grad_norm": 0.13912156186757996, "learning_rate": 2.8721033941755475e-05, "loss": 0.1917, "step": 5690 }, { "epoch": 0.1761039322154338, "grad_norm": 0.13363292138326746, "learning_rate": 2.871483433050443e-05, "loss": 0.2003, "step": 5700 }, { "epoch": 0.17641288648247844, "grad_norm": 0.12385133385686929, "learning_rate": 2.8708620402104917e-05, "loss": 0.1877, "step": 5710 }, { "epoch": 0.17672184074952305, "grad_norm": 0.15347590707493236, "learning_rate": 2.8702392163043755e-05, "loss": 0.202, "step": 5720 }, { "epoch": 0.17703079501656768, "grad_norm": 0.14209395882176734, "learning_rate": 2.8696149619822714e-05, "loss": 0.1899, "step": 5730 }, { "epoch": 0.1773397492836123, "grad_norm": 0.11663542532440394, "learning_rate": 2.8689892778958487e-05, "loss": 0.1931, "step": 5740 }, { "epoch": 0.17764870355065693, "grad_norm": 0.1103800020632343, "learning_rate": 2.8683621646982696e-05, "loss": 0.1907, "step": 5750 }, { "epoch": 0.17795765781770154, "grad_norm": 0.12819159634700894, "learning_rate": 2.867733623044189e-05, "loss": 0.2143, "step": 5760 }, { "epoch": 0.17826661208474615, "grad_norm": 0.8223795049786881, "learning_rate": 2.867103653589751e-05, "loss": 0.2072, "step": 5770 }, { "epoch": 0.17857556635179078, "grad_norm": 4.599271520472337, "learning_rate": 2.8664722569925926e-05, "loss": 0.2243, "step": 5780 }, { "epoch": 0.1788845206188354, "grad_norm": 0.331101029184519, "learning_rate": 2.8658394339118385e-05, "loss": 0.2326, "step": 5790 }, { "epoch": 0.17919347488588003, "grad_norm": 0.16303299024889983, "learning_rate": 2.8652051850081038e-05, "loss": 0.1965, "step": 5800 }, { "epoch": 0.17950242915292464, "grad_norm": 0.1212517752245839, "learning_rate": 2.864569510943492e-05, "loss": 0.1912, "step": 5810 }, { "epoch": 0.17981138341996927, "grad_norm": 0.15802784822663518, "learning_rate": 2.8639324123815937e-05, "loss": 0.2154, "step": 5820 }, { "epoch": 0.18012033768701388, "grad_norm": 0.14573693043932212, "learning_rate": 2.8632938899874873e-05, "loss": 0.1971, "step": 5830 }, { "epoch": 0.1804292919540585, "grad_norm": 0.1144883581455652, "learning_rate": 2.862653944427736e-05, "loss": 0.1925, "step": 5840 }, { "epoch": 0.18073824622110313, "grad_norm": 0.12229559324577813, "learning_rate": 2.8620125763703915e-05, "loss": 0.192, "step": 5850 }, { "epoch": 0.18104720048814774, "grad_norm": 0.12451915938153357, "learning_rate": 2.8613697864849874e-05, "loss": 0.1998, "step": 5860 }, { "epoch": 0.18135615475519237, "grad_norm": 0.12089904409190706, "learning_rate": 2.860725575442544e-05, "loss": 0.2047, "step": 5870 }, { "epoch": 0.18166510902223698, "grad_norm": 0.13070519068349834, "learning_rate": 2.8600799439155636e-05, "loss": 0.2187, "step": 5880 }, { "epoch": 0.18197406328928162, "grad_norm": 0.140904960792321, "learning_rate": 2.859432892578032e-05, "loss": 0.1955, "step": 5890 }, { "epoch": 0.18228301755632623, "grad_norm": 0.11511321564596642, "learning_rate": 2.8587844221054174e-05, "loss": 0.2023, "step": 5900 }, { "epoch": 0.18259197182337084, "grad_norm": 0.14044793424844815, "learning_rate": 2.8581345331746685e-05, "loss": 0.1936, "step": 5910 }, { "epoch": 0.18290092609041547, "grad_norm": 0.12197453877103906, "learning_rate": 2.857483226464216e-05, "loss": 0.2058, "step": 5920 }, { "epoch": 0.18320988035746008, "grad_norm": 0.10550839853911653, "learning_rate": 2.85683050265397e-05, "loss": 0.1888, "step": 5930 }, { "epoch": 0.18351883462450472, "grad_norm": 0.3544110519251212, "learning_rate": 2.8561763624253193e-05, "loss": 0.2015, "step": 5940 }, { "epoch": 0.18382778889154933, "grad_norm": 0.12390373192178894, "learning_rate": 2.8555208064611327e-05, "loss": 0.1935, "step": 5950 }, { "epoch": 0.18413674315859396, "grad_norm": 0.10157626065289536, "learning_rate": 2.854863835445756e-05, "loss": 0.2014, "step": 5960 }, { "epoch": 0.18444569742563857, "grad_norm": 0.1213651122339615, "learning_rate": 2.854205450065012e-05, "loss": 0.1951, "step": 5970 }, { "epoch": 0.18475465169268318, "grad_norm": 0.110018669063968, "learning_rate": 2.8535456510062012e-05, "loss": 0.1937, "step": 5980 }, { "epoch": 0.18506360595972782, "grad_norm": 0.12020488555211208, "learning_rate": 2.8528844389580987e-05, "loss": 0.1933, "step": 5990 }, { "epoch": 0.18537256022677243, "grad_norm": 0.10212491511734467, "learning_rate": 2.852221814610954e-05, "loss": 0.1934, "step": 6000 }, { "epoch": 0.18568151449381706, "grad_norm": 0.11461418479211462, "learning_rate": 2.8515577786564938e-05, "loss": 0.1962, "step": 6010 }, { "epoch": 0.18599046876086167, "grad_norm": 0.12701914196924702, "learning_rate": 2.8508923317879146e-05, "loss": 0.1933, "step": 6020 }, { "epoch": 0.1862994230279063, "grad_norm": 0.10620878119551466, "learning_rate": 2.8502254746998892e-05, "loss": 0.1913, "step": 6030 }, { "epoch": 0.18660837729495092, "grad_norm": 0.12540398396874758, "learning_rate": 2.8495572080885604e-05, "loss": 0.2027, "step": 6040 }, { "epoch": 0.18691733156199553, "grad_norm": 0.11185891031649897, "learning_rate": 2.848887532651543e-05, "loss": 0.1914, "step": 6050 }, { "epoch": 0.18722628582904016, "grad_norm": 0.10471835154321292, "learning_rate": 2.8482164490879227e-05, "loss": 0.1889, "step": 6060 }, { "epoch": 0.18753524009608477, "grad_norm": 0.11272716282275089, "learning_rate": 2.8475439580982543e-05, "loss": 0.1933, "step": 6070 }, { "epoch": 0.1878441943631294, "grad_norm": 0.11032134196256359, "learning_rate": 2.8468700603845642e-05, "loss": 0.1956, "step": 6080 }, { "epoch": 0.18815314863017402, "grad_norm": 0.13815453925637802, "learning_rate": 2.846194756650344e-05, "loss": 0.1919, "step": 6090 }, { "epoch": 0.18846210289721865, "grad_norm": 0.11573234574605717, "learning_rate": 2.8455180476005557e-05, "loss": 0.2012, "step": 6100 }, { "epoch": 0.18877105716426326, "grad_norm": 0.11196139186060282, "learning_rate": 2.8448399339416274e-05, "loss": 0.1929, "step": 6110 }, { "epoch": 0.18908001143130787, "grad_norm": 0.10739449957825083, "learning_rate": 2.8441604163814525e-05, "loss": 0.1916, "step": 6120 }, { "epoch": 0.1893889656983525, "grad_norm": 0.1161372376814427, "learning_rate": 2.843479495629392e-05, "loss": 0.1863, "step": 6130 }, { "epoch": 0.18969791996539712, "grad_norm": 0.14113036096515547, "learning_rate": 2.8427971723962702e-05, "loss": 0.2056, "step": 6140 }, { "epoch": 0.19000687423244175, "grad_norm": 0.10329701247144614, "learning_rate": 2.842113447394376e-05, "loss": 0.1958, "step": 6150 }, { "epoch": 0.19031582849948636, "grad_norm": 0.11459855113810817, "learning_rate": 2.8414283213374613e-05, "loss": 0.1955, "step": 6160 }, { "epoch": 0.190624782766531, "grad_norm": 0.11189483533384283, "learning_rate": 2.8407417949407414e-05, "loss": 0.1933, "step": 6170 }, { "epoch": 0.1909337370335756, "grad_norm": 0.10358808296525418, "learning_rate": 2.8400538689208924e-05, "loss": 0.1917, "step": 6180 }, { "epoch": 0.19124269130062022, "grad_norm": 0.12269036980168484, "learning_rate": 2.8393645439960516e-05, "loss": 0.1963, "step": 6190 }, { "epoch": 0.19155164556766485, "grad_norm": 0.10795805628499067, "learning_rate": 2.8386738208858177e-05, "loss": 0.1907, "step": 6200 }, { "epoch": 0.19186059983470946, "grad_norm": 0.10460423251159999, "learning_rate": 2.8379817003112482e-05, "loss": 0.1903, "step": 6210 }, { "epoch": 0.1921695541017541, "grad_norm": 0.11496650985336623, "learning_rate": 2.837288182994859e-05, "loss": 0.1919, "step": 6220 }, { "epoch": 0.1924785083687987, "grad_norm": 0.14483110291964724, "learning_rate": 2.836593269660625e-05, "loss": 0.1936, "step": 6230 }, { "epoch": 0.19278746263584334, "grad_norm": 0.09787707275628683, "learning_rate": 2.835896961033978e-05, "loss": 0.189, "step": 6240 }, { "epoch": 0.19309641690288795, "grad_norm": 0.1399375909350434, "learning_rate": 2.8351992578418068e-05, "loss": 0.1922, "step": 6250 }, { "epoch": 0.19340537116993256, "grad_norm": 0.12969252303227272, "learning_rate": 2.8345001608124543e-05, "loss": 0.1915, "step": 6260 }, { "epoch": 0.1937143254369772, "grad_norm": 0.1145848580359697, "learning_rate": 2.833799670675721e-05, "loss": 0.2005, "step": 6270 }, { "epoch": 0.1940232797040218, "grad_norm": 0.12357801812493495, "learning_rate": 2.8330977881628603e-05, "loss": 0.1898, "step": 6280 }, { "epoch": 0.19433223397106644, "grad_norm": 0.10802118534444101, "learning_rate": 2.832394514006579e-05, "loss": 0.1903, "step": 6290 }, { "epoch": 0.19464118823811105, "grad_norm": 0.7038205985344811, "learning_rate": 2.8316898489410373e-05, "loss": 0.2084, "step": 6300 }, { "epoch": 0.1949501425051557, "grad_norm": 0.852860941813526, "learning_rate": 2.8309837937018464e-05, "loss": 0.2115, "step": 6310 }, { "epoch": 0.1952590967722003, "grad_norm": 0.11807169443069974, "learning_rate": 2.8302763490260696e-05, "loss": 0.2002, "step": 6320 }, { "epoch": 0.1955680510392449, "grad_norm": 0.12318553183844727, "learning_rate": 2.8295675156522204e-05, "loss": 0.1987, "step": 6330 }, { "epoch": 0.19587700530628954, "grad_norm": 0.11085622907333874, "learning_rate": 2.8288572943202624e-05, "loss": 0.1925, "step": 6340 }, { "epoch": 0.19618595957333415, "grad_norm": 0.16905478398175539, "learning_rate": 2.8281456857716076e-05, "loss": 0.2044, "step": 6350 }, { "epoch": 0.1964949138403788, "grad_norm": 0.10671870520994445, "learning_rate": 2.8274326907491156e-05, "loss": 0.1911, "step": 6360 }, { "epoch": 0.1968038681074234, "grad_norm": 0.10623440103784029, "learning_rate": 2.8267183099970945e-05, "loss": 0.19, "step": 6370 }, { "epoch": 0.19711282237446803, "grad_norm": 0.15928224004029165, "learning_rate": 2.8260025442612983e-05, "loss": 0.1949, "step": 6380 }, { "epoch": 0.19742177664151264, "grad_norm": 0.11034131216285688, "learning_rate": 2.8252853942889274e-05, "loss": 0.1918, "step": 6390 }, { "epoch": 0.19773073090855725, "grad_norm": 0.1140506297139604, "learning_rate": 2.824566860828627e-05, "loss": 0.1928, "step": 6400 }, { "epoch": 0.1980396851756019, "grad_norm": 0.11926635456622503, "learning_rate": 2.8238469446304854e-05, "loss": 0.1912, "step": 6410 }, { "epoch": 0.1983486394426465, "grad_norm": 0.12481884844878596, "learning_rate": 2.823125646446036e-05, "loss": 0.1973, "step": 6420 }, { "epoch": 0.19865759370969113, "grad_norm": 0.11023358164463454, "learning_rate": 2.8224029670282544e-05, "loss": 0.1912, "step": 6430 }, { "epoch": 0.19896654797673574, "grad_norm": 0.157612940617181, "learning_rate": 2.8216789071315577e-05, "loss": 0.1904, "step": 6440 }, { "epoch": 0.19927550224378038, "grad_norm": 0.17797632091205678, "learning_rate": 2.8209534675118037e-05, "loss": 0.198, "step": 6450 }, { "epoch": 0.199584456510825, "grad_norm": 0.12752648901399943, "learning_rate": 2.8202266489262927e-05, "loss": 0.1992, "step": 6460 }, { "epoch": 0.1998934107778696, "grad_norm": 0.10197005313284616, "learning_rate": 2.8194984521337615e-05, "loss": 0.1991, "step": 6470 }, { "epoch": 0.20020236504491423, "grad_norm": 0.11157616962657688, "learning_rate": 2.8187688778943873e-05, "loss": 0.1955, "step": 6480 }, { "epoch": 0.20051131931195884, "grad_norm": 0.12325180760299378, "learning_rate": 2.818037926969786e-05, "loss": 0.1936, "step": 6490 }, { "epoch": 0.20082027357900348, "grad_norm": 0.12801150415023763, "learning_rate": 2.8173056001230084e-05, "loss": 0.1881, "step": 6500 }, { "epoch": 0.2011292278460481, "grad_norm": 0.10416101082645696, "learning_rate": 2.816571898118544e-05, "loss": 0.1922, "step": 6510 }, { "epoch": 0.20143818211309272, "grad_norm": 0.10950422981342788, "learning_rate": 2.815836821722316e-05, "loss": 0.1882, "step": 6520 }, { "epoch": 0.20174713638013733, "grad_norm": 0.09763003963050149, "learning_rate": 2.815100371701684e-05, "loss": 0.19, "step": 6530 }, { "epoch": 0.20205609064718194, "grad_norm": 0.10277512232078516, "learning_rate": 2.8143625488254397e-05, "loss": 0.1899, "step": 6540 }, { "epoch": 0.20236504491422658, "grad_norm": 0.11553245013740743, "learning_rate": 2.8136233538638094e-05, "loss": 0.1947, "step": 6550 }, { "epoch": 0.20267399918127119, "grad_norm": 0.11943120745573048, "learning_rate": 2.8128827875884514e-05, "loss": 0.191, "step": 6560 }, { "epoch": 0.20298295344831582, "grad_norm": 0.12929683588195923, "learning_rate": 2.812140850772455e-05, "loss": 0.1917, "step": 6570 }, { "epoch": 0.20329190771536043, "grad_norm": 0.11462916010589808, "learning_rate": 2.8113975441903408e-05, "loss": 0.1892, "step": 6580 }, { "epoch": 0.20360086198240507, "grad_norm": 0.10741753547213757, "learning_rate": 2.8106528686180597e-05, "loss": 0.1909, "step": 6590 }, { "epoch": 0.20390981624944968, "grad_norm": 0.12371768694612524, "learning_rate": 2.8099068248329903e-05, "loss": 0.1918, "step": 6600 }, { "epoch": 0.20421877051649429, "grad_norm": 0.17782467422845236, "learning_rate": 2.8091594136139415e-05, "loss": 0.2014, "step": 6610 }, { "epoch": 0.20452772478353892, "grad_norm": 0.13069920151152928, "learning_rate": 2.8084106357411475e-05, "loss": 0.1873, "step": 6620 }, { "epoch": 0.20483667905058353, "grad_norm": 0.1055617718054131, "learning_rate": 2.8076604919962707e-05, "loss": 0.1904, "step": 6630 }, { "epoch": 0.20514563331762817, "grad_norm": 0.1159829518595496, "learning_rate": 2.8069089831623987e-05, "loss": 0.1963, "step": 6640 }, { "epoch": 0.20545458758467278, "grad_norm": 0.10335240155650839, "learning_rate": 2.8061561100240446e-05, "loss": 0.194, "step": 6650 }, { "epoch": 0.2057635418517174, "grad_norm": 0.10692461701538676, "learning_rate": 2.8054018733671454e-05, "loss": 0.1879, "step": 6660 }, { "epoch": 0.20607249611876202, "grad_norm": 0.11119981470419012, "learning_rate": 2.804646273979061e-05, "loss": 0.1985, "step": 6670 }, { "epoch": 0.20638145038580663, "grad_norm": 0.1350643179785673, "learning_rate": 2.8038893126485754e-05, "loss": 0.1936, "step": 6680 }, { "epoch": 0.20669040465285127, "grad_norm": 0.17007718558751797, "learning_rate": 2.8031309901658927e-05, "loss": 0.1916, "step": 6690 }, { "epoch": 0.20699935891989588, "grad_norm": 0.10746164982745886, "learning_rate": 2.802371307322639e-05, "loss": 0.1887, "step": 6700 }, { "epoch": 0.2073083131869405, "grad_norm": 0.10296855946725689, "learning_rate": 2.8016102649118596e-05, "loss": 0.1968, "step": 6710 }, { "epoch": 0.20761726745398512, "grad_norm": 0.11822232947799508, "learning_rate": 2.80084786372802e-05, "loss": 0.1948, "step": 6720 }, { "epoch": 0.20792622172102976, "grad_norm": 0.1049014841435582, "learning_rate": 2.8000841045670038e-05, "loss": 0.186, "step": 6730 }, { "epoch": 0.20823517598807437, "grad_norm": 0.12518456734681194, "learning_rate": 2.7993189882261124e-05, "loss": 0.1891, "step": 6740 }, { "epoch": 0.20854413025511898, "grad_norm": 0.11229137823017848, "learning_rate": 2.7985525155040626e-05, "loss": 0.1932, "step": 6750 }, { "epoch": 0.2088530845221636, "grad_norm": 0.10920721402430587, "learning_rate": 2.7977846872009897e-05, "loss": 0.1948, "step": 6760 }, { "epoch": 0.20916203878920822, "grad_norm": 0.14095829376780517, "learning_rate": 2.7970155041184414e-05, "loss": 0.1952, "step": 6770 }, { "epoch": 0.20947099305625286, "grad_norm": 0.13626041179383216, "learning_rate": 2.7962449670593822e-05, "loss": 0.2017, "step": 6780 }, { "epoch": 0.20977994732329747, "grad_norm": 0.10590137512169466, "learning_rate": 2.795473076828188e-05, "loss": 0.1933, "step": 6790 }, { "epoch": 0.2100889015903421, "grad_norm": 0.12858921928245667, "learning_rate": 2.7946998342306485e-05, "loss": 0.2017, "step": 6800 }, { "epoch": 0.2103978558573867, "grad_norm": 0.12263130724996488, "learning_rate": 2.793925240073965e-05, "loss": 0.1887, "step": 6810 }, { "epoch": 0.21070681012443132, "grad_norm": 0.11411065395288106, "learning_rate": 2.7931492951667486e-05, "loss": 0.2068, "step": 6820 }, { "epoch": 0.21101576439147596, "grad_norm": 0.14052334966829683, "learning_rate": 2.792372000319022e-05, "loss": 0.1903, "step": 6830 }, { "epoch": 0.21132471865852057, "grad_norm": 0.10184578448643603, "learning_rate": 2.7915933563422168e-05, "loss": 0.1957, "step": 6840 }, { "epoch": 0.2116336729255652, "grad_norm": 0.15600508192428814, "learning_rate": 2.7908133640491724e-05, "loss": 0.193, "step": 6850 }, { "epoch": 0.2119426271926098, "grad_norm": 0.1155550515031426, "learning_rate": 2.7900320242541355e-05, "loss": 0.1891, "step": 6860 }, { "epoch": 0.21225158145965445, "grad_norm": 0.13793849430052257, "learning_rate": 2.7892493377727602e-05, "loss": 0.1894, "step": 6870 }, { "epoch": 0.21256053572669906, "grad_norm": 0.10042125672168209, "learning_rate": 2.7884653054221068e-05, "loss": 0.1921, "step": 6880 }, { "epoch": 0.21286948999374367, "grad_norm": 0.13407576631381074, "learning_rate": 2.7876799280206394e-05, "loss": 0.201, "step": 6890 }, { "epoch": 0.2131784442607883, "grad_norm": 0.10873064021666524, "learning_rate": 2.786893206388227e-05, "loss": 0.1896, "step": 6900 }, { "epoch": 0.2134873985278329, "grad_norm": 0.11818610369536223, "learning_rate": 2.786105141346142e-05, "loss": 0.1966, "step": 6910 }, { "epoch": 0.21379635279487755, "grad_norm": 0.12481232088656549, "learning_rate": 2.785315733717058e-05, "loss": 0.1971, "step": 6920 }, { "epoch": 0.21410530706192216, "grad_norm": 0.14966870833411877, "learning_rate": 2.7845249843250518e-05, "loss": 0.2037, "step": 6930 }, { "epoch": 0.2144142613289668, "grad_norm": 0.1251945341593738, "learning_rate": 2.7837328939955997e-05, "loss": 0.1916, "step": 6940 }, { "epoch": 0.2147232155960114, "grad_norm": 0.12207705977247393, "learning_rate": 2.782939463555579e-05, "loss": 0.1974, "step": 6950 }, { "epoch": 0.215032169863056, "grad_norm": 0.12038734457901437, "learning_rate": 2.7821446938332637e-05, "loss": 0.197, "step": 6960 }, { "epoch": 0.21534112413010065, "grad_norm": 0.09818291730818668, "learning_rate": 2.781348585658329e-05, "loss": 0.1907, "step": 6970 }, { "epoch": 0.21565007839714526, "grad_norm": 0.09848192293933164, "learning_rate": 2.780551139861845e-05, "loss": 0.1878, "step": 6980 }, { "epoch": 0.2159590326641899, "grad_norm": 0.16289460630025418, "learning_rate": 2.779752357276279e-05, "loss": 0.1873, "step": 6990 }, { "epoch": 0.2162679869312345, "grad_norm": 0.10641772126448294, "learning_rate": 2.778952238735494e-05, "loss": 0.1844, "step": 7000 }, { "epoch": 0.21657694119827914, "grad_norm": 0.12320916475177035, "learning_rate": 2.7781507850747465e-05, "loss": 0.1905, "step": 7010 }, { "epoch": 0.21688589546532375, "grad_norm": 0.1206318889827832, "learning_rate": 2.777347997130688e-05, "loss": 0.193, "step": 7020 }, { "epoch": 0.21719484973236836, "grad_norm": 0.12415769443532793, "learning_rate": 2.776543875741363e-05, "loss": 0.1973, "step": 7030 }, { "epoch": 0.217503803999413, "grad_norm": 0.10105900635222193, "learning_rate": 2.7757384217462074e-05, "loss": 0.1855, "step": 7040 }, { "epoch": 0.2178127582664576, "grad_norm": 0.12183775230754369, "learning_rate": 2.7749316359860473e-05, "loss": 0.1974, "step": 7050 }, { "epoch": 0.21812171253350224, "grad_norm": 0.11627326751129516, "learning_rate": 2.7741235193031013e-05, "loss": 0.1901, "step": 7060 }, { "epoch": 0.21843066680054685, "grad_norm": 0.11888956708480215, "learning_rate": 2.7733140725409753e-05, "loss": 0.1919, "step": 7070 }, { "epoch": 0.21873962106759148, "grad_norm": 0.10836367743252319, "learning_rate": 2.772503296544665e-05, "loss": 0.1974, "step": 7080 }, { "epoch": 0.2190485753346361, "grad_norm": 0.09845270251147463, "learning_rate": 2.7716911921605527e-05, "loss": 0.192, "step": 7090 }, { "epoch": 0.2193575296016807, "grad_norm": 0.09759744848301183, "learning_rate": 2.7708777602364084e-05, "loss": 0.189, "step": 7100 }, { "epoch": 0.21966648386872534, "grad_norm": 0.16511348874597986, "learning_rate": 2.7700630016213873e-05, "loss": 0.1994, "step": 7110 }, { "epoch": 0.21997543813576995, "grad_norm": 0.10122062268642094, "learning_rate": 2.7692469171660298e-05, "loss": 0.1897, "step": 7120 }, { "epoch": 0.22028439240281458, "grad_norm": 0.1690160311789051, "learning_rate": 2.7684295077222606e-05, "loss": 0.1946, "step": 7130 }, { "epoch": 0.2205933466698592, "grad_norm": 0.1037572984921032, "learning_rate": 2.7676107741433868e-05, "loss": 0.196, "step": 7140 }, { "epoch": 0.22090230093690383, "grad_norm": 0.10057158452622221, "learning_rate": 2.7667907172840985e-05, "loss": 0.1923, "step": 7150 }, { "epoch": 0.22121125520394844, "grad_norm": 0.11703235990845123, "learning_rate": 2.7659693380004672e-05, "loss": 0.1901, "step": 7160 }, { "epoch": 0.22152020947099305, "grad_norm": 0.12712275759726363, "learning_rate": 2.7651466371499447e-05, "loss": 0.1934, "step": 7170 }, { "epoch": 0.22182916373803768, "grad_norm": 0.10892070832647131, "learning_rate": 2.764322615591362e-05, "loss": 0.1894, "step": 7180 }, { "epoch": 0.2221381180050823, "grad_norm": 0.12384774155062112, "learning_rate": 2.7634972741849293e-05, "loss": 0.2006, "step": 7190 }, { "epoch": 0.22244707227212693, "grad_norm": 0.1443436358378938, "learning_rate": 2.762670613792235e-05, "loss": 0.2042, "step": 7200 }, { "epoch": 0.22275602653917154, "grad_norm": 0.0902185286773544, "learning_rate": 2.7618426352762427e-05, "loss": 0.1925, "step": 7210 }, { "epoch": 0.22306498080621617, "grad_norm": 0.1058137828159615, "learning_rate": 2.7610133395012946e-05, "loss": 0.1901, "step": 7220 }, { "epoch": 0.22337393507326078, "grad_norm": 0.11373454837980301, "learning_rate": 2.7601827273331065e-05, "loss": 0.1941, "step": 7230 }, { "epoch": 0.2236828893403054, "grad_norm": 0.09076918742921525, "learning_rate": 2.7593507996387673e-05, "loss": 0.1964, "step": 7240 }, { "epoch": 0.22399184360735003, "grad_norm": 0.1950118432774784, "learning_rate": 2.758517557286742e-05, "loss": 0.1972, "step": 7250 }, { "epoch": 0.22430079787439464, "grad_norm": 0.10617294180572731, "learning_rate": 2.7576830011468652e-05, "loss": 0.195, "step": 7260 }, { "epoch": 0.22460975214143927, "grad_norm": 0.14309770436023087, "learning_rate": 2.7568471320903453e-05, "loss": 0.1935, "step": 7270 }, { "epoch": 0.22491870640848388, "grad_norm": 0.09947273293633722, "learning_rate": 2.7560099509897593e-05, "loss": 0.2082, "step": 7280 }, { "epoch": 0.22522766067552852, "grad_norm": 0.10477746443337488, "learning_rate": 2.755171458719055e-05, "loss": 0.1893, "step": 7290 }, { "epoch": 0.22553661494257313, "grad_norm": 0.11140754503234265, "learning_rate": 2.7543316561535493e-05, "loss": 0.1911, "step": 7300 }, { "epoch": 0.22584556920961774, "grad_norm": 0.10230070448574866, "learning_rate": 2.7534905441699255e-05, "loss": 0.1921, "step": 7310 }, { "epoch": 0.22615452347666237, "grad_norm": 0.10504554261460508, "learning_rate": 2.7526481236462355e-05, "loss": 0.1832, "step": 7320 }, { "epoch": 0.22646347774370698, "grad_norm": 0.09942788623431183, "learning_rate": 2.751804395461896e-05, "loss": 0.1901, "step": 7330 }, { "epoch": 0.22677243201075162, "grad_norm": 0.12870863618383507, "learning_rate": 2.7509593604976887e-05, "loss": 0.1894, "step": 7340 }, { "epoch": 0.22708138627779623, "grad_norm": 0.0906429526198156, "learning_rate": 2.750113019635761e-05, "loss": 0.1928, "step": 7350 }, { "epoch": 0.22739034054484086, "grad_norm": 0.11427297980006616, "learning_rate": 2.7492653737596216e-05, "loss": 0.2092, "step": 7360 }, { "epoch": 0.22769929481188547, "grad_norm": 0.1112130919921356, "learning_rate": 2.748416423754143e-05, "loss": 0.2022, "step": 7370 }, { "epoch": 0.22800824907893008, "grad_norm": 0.14104233187512377, "learning_rate": 2.747566170505558e-05, "loss": 0.2024, "step": 7380 }, { "epoch": 0.22831720334597472, "grad_norm": 0.12499517138025472, "learning_rate": 2.746714614901461e-05, "loss": 0.1911, "step": 7390 }, { "epoch": 0.22862615761301933, "grad_norm": 0.11490983239759463, "learning_rate": 2.7458617578308046e-05, "loss": 0.1887, "step": 7400 }, { "epoch": 0.22893511188006396, "grad_norm": 0.10861871017164962, "learning_rate": 2.7450076001839018e-05, "loss": 0.1975, "step": 7410 }, { "epoch": 0.22924406614710857, "grad_norm": 0.10381821105467194, "learning_rate": 2.7441521428524216e-05, "loss": 0.1942, "step": 7420 }, { "epoch": 0.2295530204141532, "grad_norm": 0.15815281263510628, "learning_rate": 2.74329538672939e-05, "loss": 0.1975, "step": 7430 }, { "epoch": 0.22986197468119782, "grad_norm": 0.13564816041505678, "learning_rate": 2.7424373327091897e-05, "loss": 0.1886, "step": 7440 }, { "epoch": 0.23017092894824243, "grad_norm": 0.13467168641030136, "learning_rate": 2.7415779816875585e-05, "loss": 0.198, "step": 7450 }, { "epoch": 0.23047988321528706, "grad_norm": 0.12859509587750653, "learning_rate": 2.7407173345615867e-05, "loss": 0.2001, "step": 7460 }, { "epoch": 0.23078883748233167, "grad_norm": 0.10624362646867024, "learning_rate": 2.7398553922297192e-05, "loss": 0.1908, "step": 7470 }, { "epoch": 0.2310977917493763, "grad_norm": 0.11668205761717289, "learning_rate": 2.7389921555917513e-05, "loss": 0.1864, "step": 7480 }, { "epoch": 0.23140674601642092, "grad_norm": 0.10783332997141962, "learning_rate": 2.738127625548831e-05, "loss": 0.1904, "step": 7490 }, { "epoch": 0.23171570028346555, "grad_norm": 0.11748499343151125, "learning_rate": 2.737261803003456e-05, "loss": 0.1864, "step": 7500 }, { "epoch": 0.23202465455051016, "grad_norm": 0.13975556960830696, "learning_rate": 2.7363946888594723e-05, "loss": 0.1934, "step": 7510 }, { "epoch": 0.23233360881755477, "grad_norm": 0.1114144356338949, "learning_rate": 2.7355262840220768e-05, "loss": 0.1898, "step": 7520 }, { "epoch": 0.2326425630845994, "grad_norm": 0.19218149019117217, "learning_rate": 2.73465658939781e-05, "loss": 0.1896, "step": 7530 }, { "epoch": 0.23295151735164402, "grad_norm": 0.11827272441099144, "learning_rate": 2.733785605894562e-05, "loss": 0.1896, "step": 7540 }, { "epoch": 0.23326047161868865, "grad_norm": 0.10231517550773084, "learning_rate": 2.732913334421567e-05, "loss": 0.1902, "step": 7550 }, { "epoch": 0.23356942588573326, "grad_norm": 0.13695325334624153, "learning_rate": 2.7320397758894045e-05, "loss": 0.1986, "step": 7560 }, { "epoch": 0.2338783801527779, "grad_norm": 0.09913754561076096, "learning_rate": 2.7311649312099964e-05, "loss": 0.1898, "step": 7570 }, { "epoch": 0.2341873344198225, "grad_norm": 0.10233966911193337, "learning_rate": 2.7302888012966085e-05, "loss": 0.1958, "step": 7580 }, { "epoch": 0.23449628868686712, "grad_norm": 0.11685434890734718, "learning_rate": 2.7294113870638474e-05, "loss": 0.1902, "step": 7590 }, { "epoch": 0.23480524295391175, "grad_norm": 0.10073743758366671, "learning_rate": 2.728532689427661e-05, "loss": 0.1922, "step": 7600 }, { "epoch": 0.23511419722095636, "grad_norm": 0.10684939012097934, "learning_rate": 2.727652709305336e-05, "loss": 0.1922, "step": 7610 }, { "epoch": 0.235423151488001, "grad_norm": 0.09699939241548962, "learning_rate": 2.726771447615499e-05, "loss": 0.1871, "step": 7620 }, { "epoch": 0.2357321057550456, "grad_norm": 0.15601884781782124, "learning_rate": 2.7258889052781138e-05, "loss": 0.1916, "step": 7630 }, { "epoch": 0.23604106002209024, "grad_norm": 0.11755291522610203, "learning_rate": 2.725005083214482e-05, "loss": 0.1926, "step": 7640 }, { "epoch": 0.23635001428913485, "grad_norm": 0.0957888696538469, "learning_rate": 2.72411998234724e-05, "loss": 0.1918, "step": 7650 }, { "epoch": 0.23665896855617946, "grad_norm": 0.10384235779595037, "learning_rate": 2.7232336036003595e-05, "loss": 0.2016, "step": 7660 }, { "epoch": 0.2369679228232241, "grad_norm": 0.10016558170903206, "learning_rate": 2.722345947899147e-05, "loss": 0.1927, "step": 7670 }, { "epoch": 0.2372768770902687, "grad_norm": 0.09514114947045538, "learning_rate": 2.7214570161702405e-05, "loss": 0.1871, "step": 7680 }, { "epoch": 0.23758583135731334, "grad_norm": 0.10949082524512631, "learning_rate": 2.720566809341611e-05, "loss": 0.1914, "step": 7690 }, { "epoch": 0.23789478562435795, "grad_norm": 0.10890151055771378, "learning_rate": 2.719675328342561e-05, "loss": 0.1908, "step": 7700 }, { "epoch": 0.2382037398914026, "grad_norm": 0.10465628872644676, "learning_rate": 2.7187825741037232e-05, "loss": 0.1882, "step": 7710 }, { "epoch": 0.2385126941584472, "grad_norm": 0.10402768513972506, "learning_rate": 2.7178885475570576e-05, "loss": 0.1926, "step": 7720 }, { "epoch": 0.2388216484254918, "grad_norm": 0.10955252072745258, "learning_rate": 2.7169932496358542e-05, "loss": 0.1911, "step": 7730 }, { "epoch": 0.23913060269253644, "grad_norm": 0.08596550377900035, "learning_rate": 2.71609668127473e-05, "loss": 0.1994, "step": 7740 }, { "epoch": 0.23943955695958105, "grad_norm": 0.0987463813479826, "learning_rate": 2.7151988434096275e-05, "loss": 0.1883, "step": 7750 }, { "epoch": 0.2397485112266257, "grad_norm": 0.1064028381898475, "learning_rate": 2.7142997369778155e-05, "loss": 0.2003, "step": 7760 }, { "epoch": 0.2400574654936703, "grad_norm": 0.11893651330649783, "learning_rate": 2.713399362917886e-05, "loss": 0.1929, "step": 7770 }, { "epoch": 0.24036641976071493, "grad_norm": 0.10998904702589943, "learning_rate": 2.7124977221697547e-05, "loss": 0.1872, "step": 7780 }, { "epoch": 0.24067537402775954, "grad_norm": 0.11354255934291793, "learning_rate": 2.7115948156746595e-05, "loss": 0.1883, "step": 7790 }, { "epoch": 0.24098432829480415, "grad_norm": 0.11292108116796575, "learning_rate": 2.710690644375161e-05, "loss": 0.2198, "step": 7800 }, { "epoch": 0.2412932825618488, "grad_norm": 0.10942923077248073, "learning_rate": 2.7097852092151376e-05, "loss": 0.1867, "step": 7810 }, { "epoch": 0.2416022368288934, "grad_norm": 0.11768641645499137, "learning_rate": 2.708878511139789e-05, "loss": 0.192, "step": 7820 }, { "epoch": 0.24191119109593803, "grad_norm": 0.11109350143861754, "learning_rate": 2.707970551095633e-05, "loss": 0.1913, "step": 7830 }, { "epoch": 0.24222014536298264, "grad_norm": 0.11858098291737695, "learning_rate": 2.707061330030504e-05, "loss": 0.1937, "step": 7840 }, { "epoch": 0.24252909963002728, "grad_norm": 0.14887299272500534, "learning_rate": 2.7061508488935535e-05, "loss": 0.1971, "step": 7850 }, { "epoch": 0.2428380538970719, "grad_norm": 0.1106518196749893, "learning_rate": 2.7052391086352486e-05, "loss": 0.191, "step": 7860 }, { "epoch": 0.2431470081641165, "grad_norm": 0.11219662644614184, "learning_rate": 2.7043261102073697e-05, "loss": 0.1871, "step": 7870 }, { "epoch": 0.24345596243116113, "grad_norm": 0.12963570048249912, "learning_rate": 2.703411854563012e-05, "loss": 0.191, "step": 7880 }, { "epoch": 0.24376491669820574, "grad_norm": 0.11378349978891705, "learning_rate": 2.702496342656582e-05, "loss": 0.1898, "step": 7890 }, { "epoch": 0.24407387096525038, "grad_norm": 0.11259272531566963, "learning_rate": 2.7015795754437985e-05, "loss": 0.1963, "step": 7900 }, { "epoch": 0.244382825232295, "grad_norm": 0.10641185316248417, "learning_rate": 2.7006615538816903e-05, "loss": 0.1863, "step": 7910 }, { "epoch": 0.24469177949933962, "grad_norm": 0.13819996798691975, "learning_rate": 2.6997422789285958e-05, "loss": 0.1986, "step": 7920 }, { "epoch": 0.24500073376638423, "grad_norm": 0.15848509465337973, "learning_rate": 2.6988217515441615e-05, "loss": 0.199, "step": 7930 }, { "epoch": 0.24530968803342884, "grad_norm": 0.1117568604112502, "learning_rate": 2.6978999726893417e-05, "loss": 0.1949, "step": 7940 }, { "epoch": 0.24561864230047348, "grad_norm": 0.1292327900706852, "learning_rate": 2.6969769433263965e-05, "loss": 0.1968, "step": 7950 }, { "epoch": 0.2459275965675181, "grad_norm": 0.10906225810666081, "learning_rate": 2.6960526644188927e-05, "loss": 0.1948, "step": 7960 }, { "epoch": 0.24623655083456272, "grad_norm": 0.09872685121132838, "learning_rate": 2.695127136931701e-05, "loss": 0.1878, "step": 7970 }, { "epoch": 0.24654550510160733, "grad_norm": 0.09211463702369019, "learning_rate": 2.6942003618309937e-05, "loss": 0.1911, "step": 7980 }, { "epoch": 0.24685445936865197, "grad_norm": 0.08386993835617362, "learning_rate": 2.6932723400842486e-05, "loss": 0.1883, "step": 7990 }, { "epoch": 0.24716341363569658, "grad_norm": 0.10076472491363259, "learning_rate": 2.6923430726602425e-05, "loss": 0.191, "step": 8000 }, { "epoch": 0.24747236790274119, "grad_norm": 0.10977878502585547, "learning_rate": 2.6914125605290542e-05, "loss": 0.1906, "step": 8010 }, { "epoch": 0.24778132216978582, "grad_norm": 0.10862240634766637, "learning_rate": 2.6904808046620604e-05, "loss": 0.1856, "step": 8020 }, { "epoch": 0.24809027643683043, "grad_norm": 0.11527181158680835, "learning_rate": 2.689547806031937e-05, "loss": 0.1887, "step": 8030 }, { "epoch": 0.24839923070387507, "grad_norm": 0.09557266955615018, "learning_rate": 2.6886135656126577e-05, "loss": 0.1867, "step": 8040 }, { "epoch": 0.24870818497091968, "grad_norm": 0.1021792289141178, "learning_rate": 2.6876780843794908e-05, "loss": 0.1915, "step": 8050 }, { "epoch": 0.2490171392379643, "grad_norm": 0.09753339540571344, "learning_rate": 2.6867413633090025e-05, "loss": 0.1989, "step": 8060 }, { "epoch": 0.24932609350500892, "grad_norm": 0.12230575386492942, "learning_rate": 2.6858034033790505e-05, "loss": 0.1902, "step": 8070 }, { "epoch": 0.24963504777205353, "grad_norm": 0.10760106835552179, "learning_rate": 2.684864205568788e-05, "loss": 0.1901, "step": 8080 }, { "epoch": 0.24994400203909817, "grad_norm": 0.12777365353219655, "learning_rate": 2.683923770858659e-05, "loss": 0.1861, "step": 8090 }, { "epoch": 0.2502529563061428, "grad_norm": 0.11362177095837529, "learning_rate": 2.6829821002303997e-05, "loss": 0.1914, "step": 8100 }, { "epoch": 0.2505619105731874, "grad_norm": 0.16071911627078958, "learning_rate": 2.682039194667036e-05, "loss": 0.2075, "step": 8110 }, { "epoch": 0.250870864840232, "grad_norm": 0.0994911056032934, "learning_rate": 2.681095055152883e-05, "loss": 0.185, "step": 8120 }, { "epoch": 0.25117981910727666, "grad_norm": 0.10651352049455586, "learning_rate": 2.6801496826735432e-05, "loss": 0.1934, "step": 8130 }, { "epoch": 0.25148877337432124, "grad_norm": 0.11171894213037324, "learning_rate": 2.6792030782159082e-05, "loss": 0.1872, "step": 8140 }, { "epoch": 0.2517977276413659, "grad_norm": 0.12181299260141097, "learning_rate": 2.6782552427681533e-05, "loss": 0.1909, "step": 8150 }, { "epoch": 0.2521066819084105, "grad_norm": 0.11060480769569164, "learning_rate": 2.6773061773197413e-05, "loss": 0.1866, "step": 8160 }, { "epoch": 0.25241563617545515, "grad_norm": 0.10113486964251968, "learning_rate": 2.6763558828614166e-05, "loss": 0.1889, "step": 8170 }, { "epoch": 0.25272459044249973, "grad_norm": 0.11633152832284499, "learning_rate": 2.6754043603852088e-05, "loss": 0.1892, "step": 8180 }, { "epoch": 0.25303354470954437, "grad_norm": 0.1129318134656178, "learning_rate": 2.6744516108844274e-05, "loss": 0.2029, "step": 8190 }, { "epoch": 0.253342498976589, "grad_norm": 0.08594486928553212, "learning_rate": 2.673497635353664e-05, "loss": 0.1853, "step": 8200 }, { "epoch": 0.2536514532436336, "grad_norm": 0.10438205925856926, "learning_rate": 2.6725424347887903e-05, "loss": 0.1888, "step": 8210 }, { "epoch": 0.2539604075106782, "grad_norm": 0.0830780927378656, "learning_rate": 2.671586010186956e-05, "loss": 0.1933, "step": 8220 }, { "epoch": 0.25426936177772286, "grad_norm": 0.1654442951133926, "learning_rate": 2.670628362546589e-05, "loss": 0.2002, "step": 8230 }, { "epoch": 0.2545783160447675, "grad_norm": 0.1531186496346408, "learning_rate": 2.669669492867394e-05, "loss": 0.1959, "step": 8240 }, { "epoch": 0.2548872703118121, "grad_norm": 0.10758917016702108, "learning_rate": 2.6687094021503515e-05, "loss": 0.1877, "step": 8250 }, { "epoch": 0.2551962245788567, "grad_norm": 0.0996373769963642, "learning_rate": 2.667748091397716e-05, "loss": 0.1872, "step": 8260 }, { "epoch": 0.25550517884590135, "grad_norm": 0.10931966816599097, "learning_rate": 2.666785561613017e-05, "loss": 0.1907, "step": 8270 }, { "epoch": 0.25581413311294593, "grad_norm": 0.14853693797824316, "learning_rate": 2.6658218138010552e-05, "loss": 0.1882, "step": 8280 }, { "epoch": 0.25612308737999057, "grad_norm": 0.1121180081710152, "learning_rate": 2.6648568489679027e-05, "loss": 0.1954, "step": 8290 }, { "epoch": 0.2564320416470352, "grad_norm": 0.10488948376769945, "learning_rate": 2.6638906681209036e-05, "loss": 0.1879, "step": 8300 }, { "epoch": 0.25674099591407984, "grad_norm": 1.0799296386711583, "learning_rate": 2.66292327226867e-05, "loss": 0.1971, "step": 8310 }, { "epoch": 0.2570499501811244, "grad_norm": 0.08770130701395235, "learning_rate": 2.661954662421083e-05, "loss": 0.1869, "step": 8320 }, { "epoch": 0.25735890444816906, "grad_norm": 0.10937918136719363, "learning_rate": 2.6609848395892912e-05, "loss": 0.19, "step": 8330 }, { "epoch": 0.2576678587152137, "grad_norm": 0.11924512956831727, "learning_rate": 2.6600138047857087e-05, "loss": 0.1902, "step": 8340 }, { "epoch": 0.2579768129822583, "grad_norm": 0.1510606292166939, "learning_rate": 2.659041559024015e-05, "loss": 0.1999, "step": 8350 }, { "epoch": 0.2582857672493029, "grad_norm": 0.14802474236102292, "learning_rate": 2.6580681033191548e-05, "loss": 0.1944, "step": 8360 }, { "epoch": 0.25859472151634755, "grad_norm": 0.09347449504038548, "learning_rate": 2.6570934386873343e-05, "loss": 0.1885, "step": 8370 }, { "epoch": 0.2589036757833922, "grad_norm": 0.09351642486398651, "learning_rate": 2.6561175661460226e-05, "loss": 0.1861, "step": 8380 }, { "epoch": 0.25921263005043677, "grad_norm": 0.10833451271862764, "learning_rate": 2.6551404867139503e-05, "loss": 0.1863, "step": 8390 }, { "epoch": 0.2595215843174814, "grad_norm": 0.20552700992879117, "learning_rate": 2.6541622014111058e-05, "loss": 0.1948, "step": 8400 }, { "epoch": 0.25983053858452604, "grad_norm": 0.11559804825891065, "learning_rate": 2.653182711258739e-05, "loss": 0.1888, "step": 8410 }, { "epoch": 0.2601394928515706, "grad_norm": 0.09789129860045476, "learning_rate": 2.6522020172793555e-05, "loss": 0.1891, "step": 8420 }, { "epoch": 0.26044844711861526, "grad_norm": 0.1238643847309866, "learning_rate": 2.6512201204967194e-05, "loss": 0.2003, "step": 8430 }, { "epoch": 0.2607574013856599, "grad_norm": 0.12050982567076174, "learning_rate": 2.6502370219358484e-05, "loss": 0.1865, "step": 8440 }, { "epoch": 0.26106635565270453, "grad_norm": 0.11492470628163776, "learning_rate": 2.6492527226230162e-05, "loss": 0.1905, "step": 8450 }, { "epoch": 0.2613753099197491, "grad_norm": 0.09696514200967198, "learning_rate": 2.6482672235857496e-05, "loss": 0.1997, "step": 8460 }, { "epoch": 0.26168426418679375, "grad_norm": 0.10480432447934097, "learning_rate": 2.6472805258528274e-05, "loss": 0.1917, "step": 8470 }, { "epoch": 0.2619932184538384, "grad_norm": 0.16726727929445964, "learning_rate": 2.6462926304542807e-05, "loss": 0.1946, "step": 8480 }, { "epoch": 0.26230217272088296, "grad_norm": 0.1228501564007655, "learning_rate": 2.6453035384213902e-05, "loss": 0.1888, "step": 8490 }, { "epoch": 0.2626111269879276, "grad_norm": 0.10757566489326317, "learning_rate": 2.644313250786686e-05, "loss": 0.1879, "step": 8500 }, { "epoch": 0.26292008125497224, "grad_norm": 0.10570638616014276, "learning_rate": 2.643321768583945e-05, "loss": 0.19, "step": 8510 }, { "epoch": 0.2632290355220169, "grad_norm": 0.13579051567335834, "learning_rate": 2.6423290928481936e-05, "loss": 0.1917, "step": 8520 }, { "epoch": 0.26353798978906146, "grad_norm": 0.11825966431712678, "learning_rate": 2.6413352246157026e-05, "loss": 0.1887, "step": 8530 }, { "epoch": 0.2638469440561061, "grad_norm": 0.09504572464949657, "learning_rate": 2.6403401649239876e-05, "loss": 0.1903, "step": 8540 }, { "epoch": 0.26415589832315073, "grad_norm": 0.10563766950029646, "learning_rate": 2.6393439148118087e-05, "loss": 0.1839, "step": 8550 }, { "epoch": 0.2644648525901953, "grad_norm": 0.1189064346208586, "learning_rate": 2.6383464753191682e-05, "loss": 0.1976, "step": 8560 }, { "epoch": 0.26477380685723995, "grad_norm": 0.10491874329682777, "learning_rate": 2.6373478474873095e-05, "loss": 0.1869, "step": 8570 }, { "epoch": 0.2650827611242846, "grad_norm": 0.13054816684170087, "learning_rate": 2.6363480323587184e-05, "loss": 0.2002, "step": 8580 }, { "epoch": 0.2653917153913292, "grad_norm": 0.10884577615051691, "learning_rate": 2.6353470309771176e-05, "loss": 0.1871, "step": 8590 }, { "epoch": 0.2657006696583738, "grad_norm": 0.33459869289623057, "learning_rate": 2.63434484438747e-05, "loss": 0.2186, "step": 8600 }, { "epoch": 0.26600962392541844, "grad_norm": 0.11021312725841806, "learning_rate": 2.6333414736359757e-05, "loss": 0.1889, "step": 8610 }, { "epoch": 0.2663185781924631, "grad_norm": 0.6727696788407207, "learning_rate": 2.6323369197700696e-05, "loss": 0.1947, "step": 8620 }, { "epoch": 0.26662753245950765, "grad_norm": 0.1090738335756151, "learning_rate": 2.6313311838384227e-05, "loss": 0.1951, "step": 8630 }, { "epoch": 0.2669364867265523, "grad_norm": 0.10213255707094637, "learning_rate": 2.6303242668909404e-05, "loss": 0.1979, "step": 8640 }, { "epoch": 0.2672454409935969, "grad_norm": 0.12960536832691863, "learning_rate": 2.6293161699787604e-05, "loss": 0.1884, "step": 8650 }, { "epoch": 0.26755439526064156, "grad_norm": 0.19761273648447258, "learning_rate": 2.6283068941542513e-05, "loss": 0.1952, "step": 8660 }, { "epoch": 0.26786334952768615, "grad_norm": 0.10896656958532387, "learning_rate": 2.6272964404710135e-05, "loss": 0.188, "step": 8670 }, { "epoch": 0.2681723037947308, "grad_norm": 0.13125066011172226, "learning_rate": 2.6262848099838777e-05, "loss": 0.1992, "step": 8680 }, { "epoch": 0.2684812580617754, "grad_norm": 0.13997887897511796, "learning_rate": 2.6252720037489012e-05, "loss": 0.2042, "step": 8690 }, { "epoch": 0.26879021232882, "grad_norm": 0.09697033217541794, "learning_rate": 2.6242580228233702e-05, "loss": 0.1879, "step": 8700 }, { "epoch": 0.26909916659586464, "grad_norm": 0.11741103151081521, "learning_rate": 2.6232428682657964e-05, "loss": 0.1874, "step": 8710 }, { "epoch": 0.2694081208629093, "grad_norm": 0.08964007187317638, "learning_rate": 2.6222265411359168e-05, "loss": 0.1966, "step": 8720 }, { "epoch": 0.2697170751299539, "grad_norm": 0.08888805133889255, "learning_rate": 2.621209042494692e-05, "loss": 0.188, "step": 8730 }, { "epoch": 0.2700260293969985, "grad_norm": 0.09658041854013423, "learning_rate": 2.6201903734043074e-05, "loss": 0.1888, "step": 8740 }, { "epoch": 0.2703349836640431, "grad_norm": 0.11319723780103118, "learning_rate": 2.6191705349281675e-05, "loss": 0.1868, "step": 8750 }, { "epoch": 0.27064393793108776, "grad_norm": 0.10743229763229148, "learning_rate": 2.6181495281309003e-05, "loss": 0.2103, "step": 8760 }, { "epoch": 0.27095289219813234, "grad_norm": 0.14559623069747357, "learning_rate": 2.6171273540783514e-05, "loss": 0.198, "step": 8770 }, { "epoch": 0.271261846465177, "grad_norm": 0.23249937801724405, "learning_rate": 2.616104013837586e-05, "loss": 0.1895, "step": 8780 }, { "epoch": 0.2715708007322216, "grad_norm": 0.09492811965368149, "learning_rate": 2.6150795084768862e-05, "loss": 0.1892, "step": 8790 }, { "epoch": 0.27187975499926625, "grad_norm": 0.09734065538760589, "learning_rate": 2.6140538390657506e-05, "loss": 0.1999, "step": 8800 }, { "epoch": 0.27218870926631084, "grad_norm": 0.0965002759362586, "learning_rate": 2.6130270066748924e-05, "loss": 0.187, "step": 8810 }, { "epoch": 0.27249766353335547, "grad_norm": 0.11472850759947459, "learning_rate": 2.6119990123762406e-05, "loss": 0.1872, "step": 8820 }, { "epoch": 0.2728066178004001, "grad_norm": 0.11213872064955528, "learning_rate": 2.6109698572429344e-05, "loss": 0.1939, "step": 8830 }, { "epoch": 0.2731155720674447, "grad_norm": 0.10841550853694587, "learning_rate": 2.6099395423493278e-05, "loss": 0.1853, "step": 8840 }, { "epoch": 0.2734245263344893, "grad_norm": 0.09602089207539032, "learning_rate": 2.6089080687709827e-05, "loss": 0.1925, "step": 8850 }, { "epoch": 0.27373348060153396, "grad_norm": 0.0872433254648909, "learning_rate": 2.6078754375846726e-05, "loss": 0.1997, "step": 8860 }, { "epoch": 0.2740424348685786, "grad_norm": 0.11114739795765963, "learning_rate": 2.6068416498683786e-05, "loss": 0.187, "step": 8870 }, { "epoch": 0.2743513891356232, "grad_norm": 0.09779538164019706, "learning_rate": 2.6058067067012893e-05, "loss": 0.1887, "step": 8880 }, { "epoch": 0.2746603434026678, "grad_norm": 0.10694174384574226, "learning_rate": 2.6047706091637995e-05, "loss": 0.1864, "step": 8890 }, { "epoch": 0.27496929766971245, "grad_norm": 0.11458632793553461, "learning_rate": 2.6037333583375087e-05, "loss": 0.1903, "step": 8900 }, { "epoch": 0.27527825193675703, "grad_norm": 0.13835046571924234, "learning_rate": 2.602694955305221e-05, "loss": 0.1941, "step": 8910 }, { "epoch": 0.27558720620380167, "grad_norm": 0.09842039219454346, "learning_rate": 2.6016554011509428e-05, "loss": 0.1849, "step": 8920 }, { "epoch": 0.2758961604708463, "grad_norm": 0.11019182951426115, "learning_rate": 2.6006146969598822e-05, "loss": 0.1879, "step": 8930 }, { "epoch": 0.27620511473789094, "grad_norm": 0.10991414892083742, "learning_rate": 2.5995728438184485e-05, "loss": 0.1904, "step": 8940 }, { "epoch": 0.2765140690049355, "grad_norm": 0.0901963469107233, "learning_rate": 2.598529842814249e-05, "loss": 0.1844, "step": 8950 }, { "epoch": 0.27682302327198016, "grad_norm": 0.09683160975761855, "learning_rate": 2.597485695036091e-05, "loss": 0.1895, "step": 8960 }, { "epoch": 0.2771319775390248, "grad_norm": 0.10166995909330437, "learning_rate": 2.596440401573978e-05, "loss": 0.1857, "step": 8970 }, { "epoch": 0.2774409318060694, "grad_norm": 0.10826540384259199, "learning_rate": 2.5953939635191087e-05, "loss": 0.189, "step": 8980 }, { "epoch": 0.277749886073114, "grad_norm": 0.082213056193883, "learning_rate": 2.5943463819638792e-05, "loss": 0.1893, "step": 8990 }, { "epoch": 0.27805884034015865, "grad_norm": 0.10518238040177312, "learning_rate": 2.593297658001876e-05, "loss": 0.1896, "step": 9000 }, { "epoch": 0.2783677946072033, "grad_norm": 0.09821841783566572, "learning_rate": 2.592247792727881e-05, "loss": 0.1882, "step": 9010 }, { "epoch": 0.27867674887424787, "grad_norm": 0.10344017418225036, "learning_rate": 2.5911967872378655e-05, "loss": 0.1911, "step": 9020 }, { "epoch": 0.2789857031412925, "grad_norm": 0.10640760406134377, "learning_rate": 2.590144642628993e-05, "loss": 0.187, "step": 9030 }, { "epoch": 0.27929465740833714, "grad_norm": 0.11580836430543065, "learning_rate": 2.5890913599996143e-05, "loss": 0.1891, "step": 9040 }, { "epoch": 0.2796036116753817, "grad_norm": 0.09905157455059976, "learning_rate": 2.588036940449269e-05, "loss": 0.1852, "step": 9050 }, { "epoch": 0.27991256594242636, "grad_norm": 0.11576126917189689, "learning_rate": 2.586981385078684e-05, "loss": 0.1933, "step": 9060 }, { "epoch": 0.280221520209471, "grad_norm": 0.11856880648441576, "learning_rate": 2.5859246949897713e-05, "loss": 0.1974, "step": 9070 }, { "epoch": 0.28053047447651563, "grad_norm": 0.11182800280519953, "learning_rate": 2.5848668712856282e-05, "loss": 0.1899, "step": 9080 }, { "epoch": 0.2808394287435602, "grad_norm": 0.11527572973477145, "learning_rate": 2.5838079150705337e-05, "loss": 0.1984, "step": 9090 }, { "epoch": 0.28114838301060485, "grad_norm": 0.10445552575840167, "learning_rate": 2.582747827449951e-05, "loss": 0.1891, "step": 9100 }, { "epoch": 0.2814573372776495, "grad_norm": 0.14182003203057386, "learning_rate": 2.5816866095305237e-05, "loss": 0.1893, "step": 9110 }, { "epoch": 0.28176629154469407, "grad_norm": 0.11571260491824123, "learning_rate": 2.580624262420074e-05, "loss": 0.1916, "step": 9120 }, { "epoch": 0.2820752458117387, "grad_norm": 0.10869138832778663, "learning_rate": 2.5795607872276057e-05, "loss": 0.1869, "step": 9130 }, { "epoch": 0.28238420007878334, "grad_norm": 0.10892045289021521, "learning_rate": 2.5784961850632977e-05, "loss": 0.1864, "step": 9140 }, { "epoch": 0.282693154345828, "grad_norm": 0.10479307702793711, "learning_rate": 2.577430457038506e-05, "loss": 0.2088, "step": 9150 }, { "epoch": 0.28300210861287256, "grad_norm": 0.09978126119728896, "learning_rate": 2.5763636042657633e-05, "loss": 0.1929, "step": 9160 }, { "epoch": 0.2833110628799172, "grad_norm": 0.10169798878249496, "learning_rate": 2.5752956278587746e-05, "loss": 0.1876, "step": 9170 }, { "epoch": 0.28362001714696183, "grad_norm": 0.10708803279437269, "learning_rate": 2.5742265289324178e-05, "loss": 0.1902, "step": 9180 }, { "epoch": 0.2839289714140064, "grad_norm": 0.11891559449824145, "learning_rate": 2.573156308602745e-05, "loss": 0.1947, "step": 9190 }, { "epoch": 0.28423792568105105, "grad_norm": 0.1405744328768296, "learning_rate": 2.5720849679869765e-05, "loss": 0.1908, "step": 9200 }, { "epoch": 0.2845468799480957, "grad_norm": 0.10049760138123942, "learning_rate": 2.5710125082035034e-05, "loss": 0.1881, "step": 9210 }, { "epoch": 0.2848558342151403, "grad_norm": 0.09016115351030712, "learning_rate": 2.569938930371884e-05, "loss": 0.2003, "step": 9220 }, { "epoch": 0.2851647884821849, "grad_norm": 0.09967127985671986, "learning_rate": 2.568864235612845e-05, "loss": 0.1929, "step": 9230 }, { "epoch": 0.28547374274922954, "grad_norm": 0.11403371889988403, "learning_rate": 2.5677884250482784e-05, "loss": 0.1889, "step": 9240 }, { "epoch": 0.2857826970162742, "grad_norm": 0.09433735452091101, "learning_rate": 2.566711499801241e-05, "loss": 0.1842, "step": 9250 }, { "epoch": 0.28609165128331876, "grad_norm": 0.10939804567546585, "learning_rate": 2.565633460995953e-05, "loss": 0.19, "step": 9260 }, { "epoch": 0.2864006055503634, "grad_norm": 0.10746949395383892, "learning_rate": 2.5645543097577987e-05, "loss": 0.2016, "step": 9270 }, { "epoch": 0.28670955981740803, "grad_norm": 0.10198863160123904, "learning_rate": 2.563474047213321e-05, "loss": 0.1863, "step": 9280 }, { "epoch": 0.28701851408445267, "grad_norm": 0.10077440810770365, "learning_rate": 2.562392674490225e-05, "loss": 0.1905, "step": 9290 }, { "epoch": 0.28732746835149725, "grad_norm": 0.10855984916578419, "learning_rate": 2.561310192717374e-05, "loss": 0.1887, "step": 9300 }, { "epoch": 0.2876364226185419, "grad_norm": 0.0845171337600054, "learning_rate": 2.5602266030247886e-05, "loss": 0.1943, "step": 9310 }, { "epoch": 0.2879453768855865, "grad_norm": 0.10092603226413406, "learning_rate": 2.5591419065436472e-05, "loss": 0.187, "step": 9320 }, { "epoch": 0.2882543311526311, "grad_norm": 0.1435878134945247, "learning_rate": 2.5580561044062827e-05, "loss": 0.1936, "step": 9330 }, { "epoch": 0.28856328541967574, "grad_norm": 0.09253494052059409, "learning_rate": 2.556969197746182e-05, "loss": 0.1859, "step": 9340 }, { "epoch": 0.2888722396867204, "grad_norm": 8.814361473086858, "learning_rate": 2.5558811876979855e-05, "loss": 0.2474, "step": 9350 }, { "epoch": 0.289181193953765, "grad_norm": 0.15148451030285348, "learning_rate": 2.5547920753974863e-05, "loss": 0.2022, "step": 9360 }, { "epoch": 0.2894901482208096, "grad_norm": 0.09255025901365488, "learning_rate": 2.553701861981626e-05, "loss": 0.1864, "step": 9370 }, { "epoch": 0.28979910248785423, "grad_norm": 0.104459976300163, "learning_rate": 2.5526105485884973e-05, "loss": 0.191, "step": 9380 }, { "epoch": 0.29010805675489887, "grad_norm": 0.10441613403727636, "learning_rate": 2.5515181363573406e-05, "loss": 0.188, "step": 9390 }, { "epoch": 0.29041701102194345, "grad_norm": 0.11765425543594707, "learning_rate": 2.5504246264285445e-05, "loss": 0.1977, "step": 9400 }, { "epoch": 0.2907259652889881, "grad_norm": 0.1306047238430537, "learning_rate": 2.5493300199436413e-05, "loss": 0.1888, "step": 9410 }, { "epoch": 0.2910349195560327, "grad_norm": 0.09658760288385525, "learning_rate": 2.5482343180453104e-05, "loss": 0.2029, "step": 9420 }, { "epoch": 0.29134387382307736, "grad_norm": 0.11503788646311412, "learning_rate": 2.5471375218773733e-05, "loss": 0.1906, "step": 9430 }, { "epoch": 0.29165282809012194, "grad_norm": 0.10380154574814283, "learning_rate": 2.5460396325847934e-05, "loss": 0.1815, "step": 9440 }, { "epoch": 0.2919617823571666, "grad_norm": 0.12772727064183623, "learning_rate": 2.5449406513136768e-05, "loss": 0.1937, "step": 9450 }, { "epoch": 0.2922707366242112, "grad_norm": 0.12065907520857644, "learning_rate": 2.543840579211269e-05, "loss": 0.1933, "step": 9460 }, { "epoch": 0.2925796908912558, "grad_norm": 0.11855525364711275, "learning_rate": 2.5427394174259523e-05, "loss": 0.1875, "step": 9470 }, { "epoch": 0.29288864515830043, "grad_norm": 0.08503805054916591, "learning_rate": 2.5416371671072505e-05, "loss": 0.1947, "step": 9480 }, { "epoch": 0.29319759942534507, "grad_norm": 0.11370146401105172, "learning_rate": 2.5405338294058194e-05, "loss": 0.2035, "step": 9490 }, { "epoch": 0.2935065536923897, "grad_norm": 0.10353773038207591, "learning_rate": 2.539429405473453e-05, "loss": 0.1928, "step": 9500 }, { "epoch": 0.2938155079594343, "grad_norm": 0.09346658558384335, "learning_rate": 2.5383238964630782e-05, "loss": 0.1857, "step": 9510 }, { "epoch": 0.2941244622264789, "grad_norm": 0.09188718153400204, "learning_rate": 2.5372173035287542e-05, "loss": 0.2027, "step": 9520 }, { "epoch": 0.29443341649352356, "grad_norm": 0.09914190766514895, "learning_rate": 2.5361096278256726e-05, "loss": 0.1871, "step": 9530 }, { "epoch": 0.29474237076056814, "grad_norm": 0.07930791004599667, "learning_rate": 2.5350008705101547e-05, "loss": 0.1863, "step": 9540 }, { "epoch": 0.2950513250276128, "grad_norm": 0.11050563294669064, "learning_rate": 2.5338910327396513e-05, "loss": 0.1868, "step": 9550 }, { "epoch": 0.2953602792946574, "grad_norm": 0.09858219263880891, "learning_rate": 2.5327801156727402e-05, "loss": 0.1878, "step": 9560 }, { "epoch": 0.29566923356170205, "grad_norm": 0.18479412983473878, "learning_rate": 2.531668120469127e-05, "loss": 0.1973, "step": 9570 }, { "epoch": 0.29597818782874663, "grad_norm": 0.08603486027474677, "learning_rate": 2.5305550482896428e-05, "loss": 0.1924, "step": 9580 }, { "epoch": 0.29628714209579127, "grad_norm": 0.09563990873301001, "learning_rate": 2.529440900296242e-05, "loss": 0.1834, "step": 9590 }, { "epoch": 0.2965960963628359, "grad_norm": 0.09781713993143305, "learning_rate": 2.5283256776520028e-05, "loss": 0.1889, "step": 9600 }, { "epoch": 0.2969050506298805, "grad_norm": 0.11485623459415951, "learning_rate": 2.527209381521125e-05, "loss": 0.1954, "step": 9610 }, { "epoch": 0.2972140048969251, "grad_norm": 0.09526265527710998, "learning_rate": 2.526092013068929e-05, "loss": 0.1872, "step": 9620 }, { "epoch": 0.29752295916396976, "grad_norm": 0.09604807066414829, "learning_rate": 2.5249735734618552e-05, "loss": 0.1915, "step": 9630 }, { "epoch": 0.2978319134310144, "grad_norm": 0.10545061928010044, "learning_rate": 2.5238540638674614e-05, "loss": 0.1893, "step": 9640 }, { "epoch": 0.298140867698059, "grad_norm": 0.09842714903768751, "learning_rate": 2.5227334854544227e-05, "loss": 0.1878, "step": 9650 }, { "epoch": 0.2984498219651036, "grad_norm": 0.09554273140667502, "learning_rate": 2.5216118393925296e-05, "loss": 0.1947, "step": 9660 }, { "epoch": 0.29875877623214825, "grad_norm": 0.12366791756895854, "learning_rate": 2.5204891268526878e-05, "loss": 0.1907, "step": 9670 }, { "epoch": 0.29906773049919283, "grad_norm": 0.10651055652542657, "learning_rate": 2.5193653490069154e-05, "loss": 0.1889, "step": 9680 }, { "epoch": 0.29937668476623747, "grad_norm": 0.09103700810970648, "learning_rate": 2.518240507028344e-05, "loss": 0.1884, "step": 9690 }, { "epoch": 0.2996856390332821, "grad_norm": 0.11344449962662458, "learning_rate": 2.5171146020912155e-05, "loss": 0.188, "step": 9700 }, { "epoch": 0.29999459330032674, "grad_norm": 0.1018815606614934, "learning_rate": 2.5159876353708803e-05, "loss": 0.2021, "step": 9710 }, { "epoch": 0.3003035475673713, "grad_norm": 0.11178044146446087, "learning_rate": 2.5148596080437976e-05, "loss": 0.1959, "step": 9720 }, { "epoch": 0.30061250183441596, "grad_norm": 0.11026769938443516, "learning_rate": 2.5137305212875363e-05, "loss": 0.1903, "step": 9730 }, { "epoch": 0.3009214561014606, "grad_norm": 0.10047849455876935, "learning_rate": 2.5126003762807675e-05, "loss": 0.1973, "step": 9740 }, { "epoch": 0.3012304103685052, "grad_norm": 0.10931511029328492, "learning_rate": 2.5114691742032694e-05, "loss": 0.1879, "step": 9750 }, { "epoch": 0.3015393646355498, "grad_norm": 0.11494970064041439, "learning_rate": 2.5103369162359227e-05, "loss": 0.1988, "step": 9760 }, { "epoch": 0.30184831890259445, "grad_norm": 0.09125287216445102, "learning_rate": 2.5092036035607112e-05, "loss": 0.1867, "step": 9770 }, { "epoch": 0.3021572731696391, "grad_norm": 0.09819220491579592, "learning_rate": 2.5080692373607192e-05, "loss": 0.1855, "step": 9780 }, { "epoch": 0.30246622743668367, "grad_norm": 0.0976611633634228, "learning_rate": 2.5069338188201307e-05, "loss": 0.2054, "step": 9790 }, { "epoch": 0.3027751817037283, "grad_norm": 0.1160725370149606, "learning_rate": 2.505797349124228e-05, "loss": 0.1914, "step": 9800 }, { "epoch": 0.30308413597077294, "grad_norm": 0.13094878962368744, "learning_rate": 2.5046598294593915e-05, "loss": 0.1975, "step": 9810 }, { "epoch": 0.3033930902378175, "grad_norm": 0.11198671576007047, "learning_rate": 2.5035212610130974e-05, "loss": 0.1871, "step": 9820 }, { "epoch": 0.30370204450486216, "grad_norm": 0.10673072784365434, "learning_rate": 2.502381644973917e-05, "loss": 0.1835, "step": 9830 }, { "epoch": 0.3040109987719068, "grad_norm": 0.09530537911686453, "learning_rate": 2.5012409825315137e-05, "loss": 0.1864, "step": 9840 }, { "epoch": 0.30431995303895143, "grad_norm": 0.11900423902554536, "learning_rate": 2.5000992748766454e-05, "loss": 0.1866, "step": 9850 }, { "epoch": 0.304628907305996, "grad_norm": 0.09929015203337174, "learning_rate": 2.4989565232011594e-05, "loss": 0.1849, "step": 9860 }, { "epoch": 0.30493786157304065, "grad_norm": 0.11285518932683963, "learning_rate": 2.4978127286979944e-05, "loss": 0.1816, "step": 9870 }, { "epoch": 0.3052468158400853, "grad_norm": 0.09456894976012721, "learning_rate": 2.4966678925611766e-05, "loss": 0.186, "step": 9880 }, { "epoch": 0.30555577010712986, "grad_norm": 0.10323981633268041, "learning_rate": 2.4955220159858193e-05, "loss": 0.1875, "step": 9890 }, { "epoch": 0.3058647243741745, "grad_norm": 0.09901307303137544, "learning_rate": 2.4943751001681234e-05, "loss": 0.1879, "step": 9900 }, { "epoch": 0.30617367864121914, "grad_norm": 0.10046839593159085, "learning_rate": 2.4932271463053728e-05, "loss": 0.1916, "step": 9910 }, { "epoch": 0.3064826329082638, "grad_norm": 0.13907399457393424, "learning_rate": 2.492078155595937e-05, "loss": 0.1915, "step": 9920 }, { "epoch": 0.30679158717530836, "grad_norm": 0.0964486904973029, "learning_rate": 2.490928129239266e-05, "loss": 0.2045, "step": 9930 }, { "epoch": 0.307100541442353, "grad_norm": 0.1718161375236824, "learning_rate": 2.4897770684358927e-05, "loss": 0.2027, "step": 9940 }, { "epoch": 0.30740949570939763, "grad_norm": 0.11382680564698643, "learning_rate": 2.488624974387429e-05, "loss": 0.1887, "step": 9950 }, { "epoch": 0.3077184499764422, "grad_norm": 0.09241856142612667, "learning_rate": 2.4874718482965647e-05, "loss": 0.1868, "step": 9960 }, { "epoch": 0.30802740424348685, "grad_norm": 0.10912190096338768, "learning_rate": 2.486317691367068e-05, "loss": 0.1895, "step": 9970 }, { "epoch": 0.3083363585105315, "grad_norm": 0.08454020533402198, "learning_rate": 2.4851625048037833e-05, "loss": 0.1918, "step": 9980 }, { "epoch": 0.3086453127775761, "grad_norm": 0.09952497416826764, "learning_rate": 2.484006289812629e-05, "loss": 0.1856, "step": 9990 }, { "epoch": 0.3089542670446207, "grad_norm": 0.11209923667424684, "learning_rate": 2.482849047600598e-05, "loss": 0.1901, "step": 10000 }, { "epoch": 0.30926322131166534, "grad_norm": 0.09340912080530649, "learning_rate": 2.481690779375755e-05, "loss": 0.1936, "step": 10010 }, { "epoch": 0.30957217557871, "grad_norm": 0.09136718288585276, "learning_rate": 2.4805314863472355e-05, "loss": 0.1934, "step": 10020 }, { "epoch": 0.30988112984575455, "grad_norm": 0.09725966361145463, "learning_rate": 2.4793711697252453e-05, "loss": 0.1977, "step": 10030 }, { "epoch": 0.3101900841127992, "grad_norm": 0.14851332633478187, "learning_rate": 2.478209830721059e-05, "loss": 0.1948, "step": 10040 }, { "epoch": 0.31049903837984383, "grad_norm": 0.09331412732773223, "learning_rate": 2.4770474705470176e-05, "loss": 0.1876, "step": 10050 }, { "epoch": 0.31080799264688846, "grad_norm": 0.0985111396819373, "learning_rate": 2.4758840904165293e-05, "loss": 0.1889, "step": 10060 }, { "epoch": 0.31111694691393305, "grad_norm": 0.1074244405660999, "learning_rate": 2.4747196915440654e-05, "loss": 0.1878, "step": 10070 }, { "epoch": 0.3114259011809777, "grad_norm": 0.11270406523031921, "learning_rate": 2.4735542751451624e-05, "loss": 0.1903, "step": 10080 }, { "epoch": 0.3117348554480223, "grad_norm": 0.14444786045852906, "learning_rate": 2.4723878424364183e-05, "loss": 0.1926, "step": 10090 }, { "epoch": 0.3120438097150669, "grad_norm": 0.13199514765736367, "learning_rate": 2.4712203946354918e-05, "loss": 0.1916, "step": 10100 }, { "epoch": 0.31235276398211154, "grad_norm": 0.1102216519163516, "learning_rate": 2.4700519329611013e-05, "loss": 0.1875, "step": 10110 }, { "epoch": 0.3126617182491562, "grad_norm": 0.08172713783176928, "learning_rate": 2.4688824586330242e-05, "loss": 0.2014, "step": 10120 }, { "epoch": 0.3129706725162008, "grad_norm": 0.08766853575869235, "learning_rate": 2.4677119728720944e-05, "loss": 0.1948, "step": 10130 }, { "epoch": 0.3132796267832454, "grad_norm": 0.0864321078467092, "learning_rate": 2.466540476900202e-05, "loss": 0.1911, "step": 10140 }, { "epoch": 0.31358858105029, "grad_norm": 0.13369147149520363, "learning_rate": 2.4653679719402907e-05, "loss": 0.1879, "step": 10150 }, { "epoch": 0.31389753531733466, "grad_norm": 0.10138950969987723, "learning_rate": 2.4641944592163595e-05, "loss": 0.1926, "step": 10160 }, { "epoch": 0.31420648958437924, "grad_norm": 0.1131289750323316, "learning_rate": 2.4630199399534576e-05, "loss": 0.189, "step": 10170 }, { "epoch": 0.3145154438514239, "grad_norm": 0.08786637618867353, "learning_rate": 2.461844415377686e-05, "loss": 0.1958, "step": 10180 }, { "epoch": 0.3148243981184685, "grad_norm": 0.09390997282261213, "learning_rate": 2.4606678867161946e-05, "loss": 0.1945, "step": 10190 }, { "epoch": 0.31513335238551315, "grad_norm": 0.1067984321824277, "learning_rate": 2.4594903551971814e-05, "loss": 0.1861, "step": 10200 }, { "epoch": 0.31544230665255774, "grad_norm": 0.10431041990990578, "learning_rate": 2.458311822049892e-05, "loss": 0.1897, "step": 10210 }, { "epoch": 0.3157512609196024, "grad_norm": 0.08883021936643981, "learning_rate": 2.4571322885046166e-05, "loss": 0.1864, "step": 10220 }, { "epoch": 0.316060215186647, "grad_norm": 0.08876563960489203, "learning_rate": 2.4559517557926908e-05, "loss": 0.2012, "step": 10230 }, { "epoch": 0.3163691694536916, "grad_norm": 0.13279127653095946, "learning_rate": 2.4547702251464926e-05, "loss": 0.1951, "step": 10240 }, { "epoch": 0.3166781237207362, "grad_norm": 0.08223666907358407, "learning_rate": 2.453587697799442e-05, "loss": 0.1928, "step": 10250 }, { "epoch": 0.31698707798778086, "grad_norm": 0.1462141674424809, "learning_rate": 2.4524041749859992e-05, "loss": 0.2007, "step": 10260 }, { "epoch": 0.3172960322548255, "grad_norm": 0.08504556283374634, "learning_rate": 2.4512196579416636e-05, "loss": 0.1899, "step": 10270 }, { "epoch": 0.3176049865218701, "grad_norm": 0.08145333178233713, "learning_rate": 2.4500341479029734e-05, "loss": 0.1936, "step": 10280 }, { "epoch": 0.3179139407889147, "grad_norm": 0.11105143484390026, "learning_rate": 2.4488476461075014e-05, "loss": 0.187, "step": 10290 }, { "epoch": 0.31822289505595935, "grad_norm": 0.10120758320452351, "learning_rate": 2.4476601537938585e-05, "loss": 0.1976, "step": 10300 }, { "epoch": 0.31853184932300393, "grad_norm": 0.09868681939796119, "learning_rate": 2.4464716722016868e-05, "loss": 0.187, "step": 10310 }, { "epoch": 0.31884080359004857, "grad_norm": 0.11260503884674848, "learning_rate": 2.4452822025716633e-05, "loss": 0.1894, "step": 10320 }, { "epoch": 0.3191497578570932, "grad_norm": 0.11147937208027198, "learning_rate": 2.444091746145495e-05, "loss": 0.187, "step": 10330 }, { "epoch": 0.31945871212413784, "grad_norm": 0.10292959866660813, "learning_rate": 2.44290030416592e-05, "loss": 0.1834, "step": 10340 }, { "epoch": 0.3197676663911824, "grad_norm": 0.10594382495485939, "learning_rate": 2.4417078778767042e-05, "loss": 0.1876, "step": 10350 }, { "epoch": 0.32007662065822706, "grad_norm": 0.10032254310574651, "learning_rate": 2.4405144685226423e-05, "loss": 0.1916, "step": 10360 }, { "epoch": 0.3203855749252717, "grad_norm": 0.10627717057008569, "learning_rate": 2.439320077349554e-05, "loss": 0.1861, "step": 10370 }, { "epoch": 0.3206945291923163, "grad_norm": 0.13099542633637604, "learning_rate": 2.438124705604285e-05, "loss": 0.1884, "step": 10380 }, { "epoch": 0.3210034834593609, "grad_norm": 0.10802554360054083, "learning_rate": 2.436928354534704e-05, "loss": 0.2024, "step": 10390 }, { "epoch": 0.32131243772640555, "grad_norm": 0.08637551242366232, "learning_rate": 2.4357310253897024e-05, "loss": 0.1915, "step": 10400 }, { "epoch": 0.3216213919934502, "grad_norm": 0.0898888594607148, "learning_rate": 2.434532719419192e-05, "loss": 0.1884, "step": 10410 }, { "epoch": 0.32193034626049477, "grad_norm": 0.08570850184237087, "learning_rate": 2.4333334378741054e-05, "loss": 0.1911, "step": 10420 }, { "epoch": 0.3222393005275394, "grad_norm": 0.10249856361831708, "learning_rate": 2.4321331820063918e-05, "loss": 0.1982, "step": 10430 }, { "epoch": 0.32254825479458404, "grad_norm": 0.10257845052867674, "learning_rate": 2.43093195306902e-05, "loss": 0.1951, "step": 10440 }, { "epoch": 0.3228572090616286, "grad_norm": 0.10229604671643684, "learning_rate": 2.429729752315973e-05, "loss": 0.1887, "step": 10450 }, { "epoch": 0.32316616332867326, "grad_norm": 0.10314819818016309, "learning_rate": 2.428526581002248e-05, "loss": 0.1854, "step": 10460 }, { "epoch": 0.3234751175957179, "grad_norm": 0.11144197702516244, "learning_rate": 2.4273224403838562e-05, "loss": 0.1837, "step": 10470 }, { "epoch": 0.32378407186276253, "grad_norm": 0.08803908428000846, "learning_rate": 2.4261173317178206e-05, "loss": 0.1862, "step": 10480 }, { "epoch": 0.3240930261298071, "grad_norm": 0.09657899131923031, "learning_rate": 2.4249112562621744e-05, "loss": 0.1846, "step": 10490 }, { "epoch": 0.32440198039685175, "grad_norm": 0.08073177587693156, "learning_rate": 2.4237042152759604e-05, "loss": 0.1835, "step": 10500 }, { "epoch": 0.3247109346638964, "grad_norm": 0.09804505805837463, "learning_rate": 2.4224962100192297e-05, "loss": 0.1988, "step": 10510 }, { "epoch": 0.32501988893094097, "grad_norm": 0.09316989255139811, "learning_rate": 2.4212872417530387e-05, "loss": 0.1875, "step": 10520 }, { "epoch": 0.3253288431979856, "grad_norm": 0.10100730513827896, "learning_rate": 2.42007731173945e-05, "loss": 0.1871, "step": 10530 }, { "epoch": 0.32563779746503024, "grad_norm": 0.11426894363762669, "learning_rate": 2.4188664212415303e-05, "loss": 0.1867, "step": 10540 }, { "epoch": 0.3259467517320749, "grad_norm": 0.08657873556350776, "learning_rate": 2.4176545715233487e-05, "loss": 0.1949, "step": 10550 }, { "epoch": 0.32625570599911946, "grad_norm": 0.11819434882118317, "learning_rate": 2.416441763849976e-05, "loss": 0.1944, "step": 10560 }, { "epoch": 0.3265646602661641, "grad_norm": 0.1075874849306456, "learning_rate": 2.4152279994874825e-05, "loss": 0.1959, "step": 10570 }, { "epoch": 0.32687361453320873, "grad_norm": 0.09047038439088448, "learning_rate": 2.4140132797029374e-05, "loss": 0.2071, "step": 10580 }, { "epoch": 0.3271825688002533, "grad_norm": 0.12922438663777203, "learning_rate": 2.4127976057644066e-05, "loss": 0.1866, "step": 10590 }, { "epoch": 0.32749152306729795, "grad_norm": 0.09071465290826776, "learning_rate": 2.4115809789409538e-05, "loss": 0.1968, "step": 10600 }, { "epoch": 0.3278004773343426, "grad_norm": 0.09615055209291347, "learning_rate": 2.4103634005026362e-05, "loss": 0.1942, "step": 10610 }, { "epoch": 0.3281094316013872, "grad_norm": 0.11297404179659212, "learning_rate": 2.4091448717205045e-05, "loss": 0.1982, "step": 10620 }, { "epoch": 0.3284183858684318, "grad_norm": 0.08103250825755323, "learning_rate": 2.4079253938666015e-05, "loss": 0.1883, "step": 10630 }, { "epoch": 0.32872734013547644, "grad_norm": 0.10027791187893295, "learning_rate": 2.406704968213961e-05, "loss": 0.1934, "step": 10640 }, { "epoch": 0.3290362944025211, "grad_norm": 0.10728333469606902, "learning_rate": 2.4054835960366055e-05, "loss": 0.1956, "step": 10650 }, { "epoch": 0.32934524866956566, "grad_norm": 0.10145867895118323, "learning_rate": 2.404261278609547e-05, "loss": 0.1842, "step": 10660 }, { "epoch": 0.3296542029366103, "grad_norm": 0.08577620774947799, "learning_rate": 2.403038017208783e-05, "loss": 0.19, "step": 10670 }, { "epoch": 0.32996315720365493, "grad_norm": 0.11295170789310942, "learning_rate": 2.4018138131112972e-05, "loss": 0.198, "step": 10680 }, { "epoch": 0.33027211147069957, "grad_norm": 0.09213130233065495, "learning_rate": 2.4005886675950562e-05, "loss": 0.1901, "step": 10690 }, { "epoch": 0.33058106573774415, "grad_norm": 0.09202019556291367, "learning_rate": 2.3993625819390114e-05, "loss": 0.1883, "step": 10700 }, { "epoch": 0.3308900200047888, "grad_norm": 0.0873260575047738, "learning_rate": 2.398135557423093e-05, "loss": 0.1859, "step": 10710 }, { "epoch": 0.3311989742718334, "grad_norm": 0.1953741414952931, "learning_rate": 2.3969075953282143e-05, "loss": 0.1905, "step": 10720 }, { "epoch": 0.331507928538878, "grad_norm": 0.08764013609672623, "learning_rate": 2.3956786969362647e-05, "loss": 0.1857, "step": 10730 }, { "epoch": 0.33181688280592264, "grad_norm": 0.08642787068756606, "learning_rate": 2.3944488635301122e-05, "loss": 0.1875, "step": 10740 }, { "epoch": 0.3321258370729673, "grad_norm": 0.0940671293624528, "learning_rate": 2.3932180963936013e-05, "loss": 0.1833, "step": 10750 }, { "epoch": 0.3324347913400119, "grad_norm": 0.1489525719605523, "learning_rate": 2.39198639681155e-05, "loss": 0.1946, "step": 10760 }, { "epoch": 0.3327437456070565, "grad_norm": 0.08370747837677327, "learning_rate": 2.3907537660697512e-05, "loss": 0.1844, "step": 10770 }, { "epoch": 0.33305269987410113, "grad_norm": 0.08082000819454536, "learning_rate": 2.3895202054549684e-05, "loss": 0.1946, "step": 10780 }, { "epoch": 0.33336165414114577, "grad_norm": 0.08554354991880381, "learning_rate": 2.3882857162549374e-05, "loss": 0.1899, "step": 10790 }, { "epoch": 0.33367060840819035, "grad_norm": 0.10646003301479967, "learning_rate": 2.3870502997583614e-05, "loss": 0.1873, "step": 10800 }, { "epoch": 0.333979562675235, "grad_norm": 0.11072901291957145, "learning_rate": 2.385813957254913e-05, "loss": 0.2096, "step": 10810 }, { "epoch": 0.3342885169422796, "grad_norm": 0.10355008643861074, "learning_rate": 2.384576690035231e-05, "loss": 0.1874, "step": 10820 }, { "epoch": 0.33459747120932426, "grad_norm": 0.08504136163217271, "learning_rate": 2.38333849939092e-05, "loss": 0.1892, "step": 10830 }, { "epoch": 0.33490642547636884, "grad_norm": 0.11696581059408875, "learning_rate": 2.3820993866145485e-05, "loss": 0.1831, "step": 10840 }, { "epoch": 0.3352153797434135, "grad_norm": 0.12789603723509432, "learning_rate": 2.3808593529996462e-05, "loss": 0.187, "step": 10850 }, { "epoch": 0.3355243340104581, "grad_norm": 0.09406362406435059, "learning_rate": 2.3796183998407067e-05, "loss": 0.1956, "step": 10860 }, { "epoch": 0.3358332882775027, "grad_norm": 0.0963012641045247, "learning_rate": 2.378376528433181e-05, "loss": 0.1893, "step": 10870 }, { "epoch": 0.33614224254454733, "grad_norm": 0.09137989792286952, "learning_rate": 2.3771337400734803e-05, "loss": 0.1945, "step": 10880 }, { "epoch": 0.33645119681159197, "grad_norm": 0.10276076002740937, "learning_rate": 2.375890036058972e-05, "loss": 0.1935, "step": 10890 }, { "epoch": 0.3367601510786366, "grad_norm": 0.10542029603064709, "learning_rate": 2.374645417687981e-05, "loss": 0.183, "step": 10900 }, { "epoch": 0.3370691053456812, "grad_norm": 0.15502207119275505, "learning_rate": 2.373399886259784e-05, "loss": 0.1916, "step": 10910 }, { "epoch": 0.3373780596127258, "grad_norm": 0.0902706320845401, "learning_rate": 2.372153443074613e-05, "loss": 0.1868, "step": 10920 }, { "epoch": 0.33768701387977046, "grad_norm": 0.10491192336946742, "learning_rate": 2.3709060894336516e-05, "loss": 0.1839, "step": 10930 }, { "epoch": 0.33799596814681504, "grad_norm": 0.09407652570377974, "learning_rate": 2.3696578266390333e-05, "loss": 0.1951, "step": 10940 }, { "epoch": 0.3383049224138597, "grad_norm": 0.10131817320848324, "learning_rate": 2.368408655993841e-05, "loss": 0.185, "step": 10950 }, { "epoch": 0.3386138766809043, "grad_norm": 0.09312775430656653, "learning_rate": 2.367158578802104e-05, "loss": 0.1804, "step": 10960 }, { "epoch": 0.33892283094794895, "grad_norm": 0.08868384788858018, "learning_rate": 2.3659075963688005e-05, "loss": 0.1879, "step": 10970 }, { "epoch": 0.33923178521499353, "grad_norm": 0.12383857112167239, "learning_rate": 2.364655709999852e-05, "loss": 0.1915, "step": 10980 }, { "epoch": 0.33954073948203817, "grad_norm": 0.09606980612192445, "learning_rate": 2.3634029210021234e-05, "loss": 0.1831, "step": 10990 }, { "epoch": 0.3398496937490828, "grad_norm": 0.09531477336107455, "learning_rate": 2.3621492306834228e-05, "loss": 0.1931, "step": 11000 }, { "epoch": 0.3401586480161274, "grad_norm": 0.09670879312647006, "learning_rate": 2.3608946403524988e-05, "loss": 0.1943, "step": 11010 }, { "epoch": 0.340467602283172, "grad_norm": 0.11263160779466505, "learning_rate": 2.3596391513190393e-05, "loss": 0.1977, "step": 11020 }, { "epoch": 0.34077655655021666, "grad_norm": 0.09297761654801473, "learning_rate": 2.3583827648936715e-05, "loss": 0.1889, "step": 11030 }, { "epoch": 0.3410855108172613, "grad_norm": 0.13797057326064396, "learning_rate": 2.3571254823879573e-05, "loss": 0.1962, "step": 11040 }, { "epoch": 0.3413944650843059, "grad_norm": 0.08912817358442929, "learning_rate": 2.355867305114396e-05, "loss": 0.193, "step": 11050 }, { "epoch": 0.3417034193513505, "grad_norm": 0.09595013583838197, "learning_rate": 2.3546082343864198e-05, "loss": 0.1848, "step": 11060 }, { "epoch": 0.34201237361839515, "grad_norm": 0.10821307526466348, "learning_rate": 2.3533482715183946e-05, "loss": 0.1895, "step": 11070 }, { "epoch": 0.34232132788543973, "grad_norm": 0.11873312543475707, "learning_rate": 2.3520874178256165e-05, "loss": 0.1867, "step": 11080 }, { "epoch": 0.34263028215248437, "grad_norm": 0.11502975717030715, "learning_rate": 2.350825674624312e-05, "loss": 0.1893, "step": 11090 }, { "epoch": 0.342939236419529, "grad_norm": 0.09755080794306602, "learning_rate": 2.3495630432316368e-05, "loss": 0.1897, "step": 11100 }, { "epoch": 0.34324819068657364, "grad_norm": 0.08872588077759884, "learning_rate": 2.3482995249656725e-05, "loss": 0.1986, "step": 11110 }, { "epoch": 0.3435571449536182, "grad_norm": 0.09838896910229696, "learning_rate": 2.3470351211454285e-05, "loss": 0.1908, "step": 11120 }, { "epoch": 0.34386609922066286, "grad_norm": 0.0923233551081439, "learning_rate": 2.3457698330908364e-05, "loss": 0.2002, "step": 11130 }, { "epoch": 0.3441750534877075, "grad_norm": 0.08737552496078425, "learning_rate": 2.344503662122752e-05, "loss": 0.1895, "step": 11140 }, { "epoch": 0.3444840077547521, "grad_norm": 0.10668889181697425, "learning_rate": 2.3432366095629528e-05, "loss": 0.1888, "step": 11150 }, { "epoch": 0.3447929620217967, "grad_norm": 0.08291984012290848, "learning_rate": 2.3419686767341367e-05, "loss": 0.1987, "step": 11160 }, { "epoch": 0.34510191628884135, "grad_norm": 0.10764632441930712, "learning_rate": 2.3406998649599202e-05, "loss": 0.1832, "step": 11170 }, { "epoch": 0.345410870555886, "grad_norm": 0.09017312726891885, "learning_rate": 2.3394301755648375e-05, "loss": 0.192, "step": 11180 }, { "epoch": 0.34571982482293057, "grad_norm": 0.1324785929205862, "learning_rate": 2.3381596098743387e-05, "loss": 0.192, "step": 11190 }, { "epoch": 0.3460287790899752, "grad_norm": 0.10070251954726174, "learning_rate": 2.336888169214789e-05, "loss": 0.1869, "step": 11200 }, { "epoch": 0.34633773335701984, "grad_norm": 0.08935782758742528, "learning_rate": 2.335615854913467e-05, "loss": 0.1928, "step": 11210 }, { "epoch": 0.3466466876240644, "grad_norm": 0.0855348666768417, "learning_rate": 2.3343426682985625e-05, "loss": 0.1872, "step": 11220 }, { "epoch": 0.34695564189110906, "grad_norm": 0.09737930135217468, "learning_rate": 2.333068610699178e-05, "loss": 0.1869, "step": 11230 }, { "epoch": 0.3472645961581537, "grad_norm": 0.11069764429002865, "learning_rate": 2.3317936834453228e-05, "loss": 0.1829, "step": 11240 }, { "epoch": 0.34757355042519833, "grad_norm": 0.0924798319053801, "learning_rate": 2.3305178878679153e-05, "loss": 0.1981, "step": 11250 }, { "epoch": 0.3478825046922429, "grad_norm": 0.10323547298579855, "learning_rate": 2.3292412252987806e-05, "loss": 0.1915, "step": 11260 }, { "epoch": 0.34819145895928755, "grad_norm": 0.08749789524208305, "learning_rate": 2.327963697070648e-05, "loss": 0.1927, "step": 11270 }, { "epoch": 0.3485004132263322, "grad_norm": 0.09480047005422201, "learning_rate": 2.326685304517151e-05, "loss": 0.1872, "step": 11280 }, { "epoch": 0.34880936749337677, "grad_norm": 0.10359229819355556, "learning_rate": 2.325406048972826e-05, "loss": 0.1959, "step": 11290 }, { "epoch": 0.3491183217604214, "grad_norm": 0.1041559744501966, "learning_rate": 2.324125931773108e-05, "loss": 0.1916, "step": 11300 }, { "epoch": 0.34942727602746604, "grad_norm": 0.08958972823331576, "learning_rate": 2.3228449542543343e-05, "loss": 0.1824, "step": 11310 }, { "epoch": 0.3497362302945107, "grad_norm": 0.11017769684432532, "learning_rate": 2.321563117753739e-05, "loss": 0.1901, "step": 11320 }, { "epoch": 0.35004518456155526, "grad_norm": 0.09160695776194668, "learning_rate": 2.3202804236094525e-05, "loss": 0.1944, "step": 11330 }, { "epoch": 0.3503541388285999, "grad_norm": 0.08626306454489814, "learning_rate": 2.3189968731605015e-05, "loss": 0.1876, "step": 11340 }, { "epoch": 0.35066309309564453, "grad_norm": 0.0886446415178499, "learning_rate": 2.317712467746806e-05, "loss": 0.1877, "step": 11350 }, { "epoch": 0.35097204736268917, "grad_norm": 0.10801126161955527, "learning_rate": 2.3164272087091783e-05, "loss": 0.1937, "step": 11360 }, { "epoch": 0.35128100162973375, "grad_norm": 0.09010158779533652, "learning_rate": 2.3151410973893225e-05, "loss": 0.1921, "step": 11370 }, { "epoch": 0.3515899558967784, "grad_norm": 0.10195229727148444, "learning_rate": 2.313854135129832e-05, "loss": 0.1874, "step": 11380 }, { "epoch": 0.351898910163823, "grad_norm": 0.10378462738337658, "learning_rate": 2.3125663232741882e-05, "loss": 0.19, "step": 11390 }, { "epoch": 0.3522078644308676, "grad_norm": 0.09872375304401278, "learning_rate": 2.31127766316676e-05, "loss": 0.1974, "step": 11400 }, { "epoch": 0.35251681869791224, "grad_norm": 0.09793706394169703, "learning_rate": 2.3099881561528018e-05, "loss": 0.1858, "step": 11410 }, { "epoch": 0.3528257729649569, "grad_norm": 0.148466539275305, "learning_rate": 2.3086978035784517e-05, "loss": 0.1995, "step": 11420 }, { "epoch": 0.3531347272320015, "grad_norm": 0.0811008808934049, "learning_rate": 2.307406606790731e-05, "loss": 0.1937, "step": 11430 }, { "epoch": 0.3534436814990461, "grad_norm": 0.12512398588345597, "learning_rate": 2.3061145671375416e-05, "loss": 0.1855, "step": 11440 }, { "epoch": 0.35375263576609073, "grad_norm": 0.17576087917517788, "learning_rate": 2.304821685967665e-05, "loss": 0.1924, "step": 11450 }, { "epoch": 0.35406159003313536, "grad_norm": 0.10825937957450911, "learning_rate": 2.3035279646307625e-05, "loss": 0.1908, "step": 11460 }, { "epoch": 0.35437054430017995, "grad_norm": 0.11619912660972567, "learning_rate": 2.302233404477372e-05, "loss": 0.1864, "step": 11470 }, { "epoch": 0.3546794985672246, "grad_norm": 0.10569052440962179, "learning_rate": 2.3009380068589064e-05, "loss": 0.1871, "step": 11480 }, { "epoch": 0.3549884528342692, "grad_norm": 0.14648434410348218, "learning_rate": 2.299641773127654e-05, "loss": 0.1994, "step": 11490 }, { "epoch": 0.35529740710131386, "grad_norm": 0.123038590761249, "learning_rate": 2.2983447046367742e-05, "loss": 0.1985, "step": 11500 }, { "epoch": 0.35560636136835844, "grad_norm": 0.12576949179128505, "learning_rate": 2.297046802740299e-05, "loss": 0.1877, "step": 11510 }, { "epoch": 0.3559153156354031, "grad_norm": 0.11812104987432641, "learning_rate": 2.295748068793131e-05, "loss": 0.1883, "step": 11520 }, { "epoch": 0.3562242699024477, "grad_norm": 0.09611722879304281, "learning_rate": 2.2944485041510394e-05, "loss": 0.187, "step": 11530 }, { "epoch": 0.3565332241694923, "grad_norm": 0.09523547540301808, "learning_rate": 2.2931481101706634e-05, "loss": 0.1989, "step": 11540 }, { "epoch": 0.3568421784365369, "grad_norm": 0.1107326367612505, "learning_rate": 2.2918468882095046e-05, "loss": 0.1875, "step": 11550 }, { "epoch": 0.35715113270358156, "grad_norm": 0.07632770384439395, "learning_rate": 2.2905448396259327e-05, "loss": 0.1883, "step": 11560 }, { "epoch": 0.3574600869706262, "grad_norm": 0.10984237859706728, "learning_rate": 2.289241965779176e-05, "loss": 0.1848, "step": 11570 }, { "epoch": 0.3577690412376708, "grad_norm": 0.102672162754776, "learning_rate": 2.2879382680293286e-05, "loss": 0.1875, "step": 11580 }, { "epoch": 0.3580779955047154, "grad_norm": 0.16716939251737217, "learning_rate": 2.2866337477373413e-05, "loss": 0.1925, "step": 11590 }, { "epoch": 0.35838694977176005, "grad_norm": 0.10693715887367708, "learning_rate": 2.2853284062650263e-05, "loss": 0.1893, "step": 11600 }, { "epoch": 0.35869590403880464, "grad_norm": 0.0929253979596654, "learning_rate": 2.284022244975051e-05, "loss": 0.1974, "step": 11610 }, { "epoch": 0.3590048583058493, "grad_norm": 0.08777840184290545, "learning_rate": 2.282715265230939e-05, "loss": 0.1876, "step": 11620 }, { "epoch": 0.3593138125728939, "grad_norm": 0.0762413574768266, "learning_rate": 2.2814074683970693e-05, "loss": 0.1917, "step": 11630 }, { "epoch": 0.35962276683993855, "grad_norm": 0.09454809705985567, "learning_rate": 2.280098855838673e-05, "loss": 0.1835, "step": 11640 }, { "epoch": 0.3599317211069831, "grad_norm": 0.09051571813041435, "learning_rate": 2.2787894289218332e-05, "loss": 0.1891, "step": 11650 }, { "epoch": 0.36024067537402776, "grad_norm": 0.0894254880628476, "learning_rate": 2.2774791890134825e-05, "loss": 0.1891, "step": 11660 }, { "epoch": 0.3605496296410724, "grad_norm": 0.09466970727294428, "learning_rate": 2.2761681374814028e-05, "loss": 0.204, "step": 11670 }, { "epoch": 0.360858583908117, "grad_norm": 0.08531259730374659, "learning_rate": 2.2748562756942232e-05, "loss": 0.1861, "step": 11680 }, { "epoch": 0.3611675381751616, "grad_norm": 0.12284005507850895, "learning_rate": 2.2735436050214178e-05, "loss": 0.1951, "step": 11690 }, { "epoch": 0.36147649244220625, "grad_norm": 0.11919850377776255, "learning_rate": 2.272230126833307e-05, "loss": 0.1887, "step": 11700 }, { "epoch": 0.3617854467092509, "grad_norm": 0.10155349442380728, "learning_rate": 2.270915842501052e-05, "loss": 0.1881, "step": 11710 }, { "epoch": 0.36209440097629547, "grad_norm": 0.0873831466599122, "learning_rate": 2.2696007533966568e-05, "loss": 0.1873, "step": 11720 }, { "epoch": 0.3624033552433401, "grad_norm": 0.0793548762125726, "learning_rate": 2.2682848608929655e-05, "loss": 0.1851, "step": 11730 }, { "epoch": 0.36271230951038474, "grad_norm": 0.11524622716827497, "learning_rate": 2.2669681663636598e-05, "loss": 0.1862, "step": 11740 }, { "epoch": 0.3630212637774293, "grad_norm": 0.10495555651573284, "learning_rate": 2.265650671183261e-05, "loss": 0.187, "step": 11750 }, { "epoch": 0.36333021804447396, "grad_norm": 0.10260436277766406, "learning_rate": 2.2643323767271233e-05, "loss": 0.184, "step": 11760 }, { "epoch": 0.3636391723115186, "grad_norm": 0.0932148539544743, "learning_rate": 2.2630132843714373e-05, "loss": 0.1807, "step": 11770 }, { "epoch": 0.36394812657856324, "grad_norm": 0.12898993106004603, "learning_rate": 2.261693395493226e-05, "loss": 0.1897, "step": 11780 }, { "epoch": 0.3642570808456078, "grad_norm": 0.09356212327035456, "learning_rate": 2.2603727114703432e-05, "loss": 0.1942, "step": 11790 }, { "epoch": 0.36456603511265245, "grad_norm": 0.11527288431873035, "learning_rate": 2.259051233681474e-05, "loss": 0.1886, "step": 11800 }, { "epoch": 0.3648749893796971, "grad_norm": 0.10182329082139296, "learning_rate": 2.2577289635061314e-05, "loss": 0.1952, "step": 11810 }, { "epoch": 0.36518394364674167, "grad_norm": 0.10126866107020173, "learning_rate": 2.2564059023246552e-05, "loss": 0.2, "step": 11820 }, { "epoch": 0.3654928979137863, "grad_norm": 0.11290011810118633, "learning_rate": 2.255082051518212e-05, "loss": 0.1907, "step": 11830 }, { "epoch": 0.36580185218083094, "grad_norm": 0.10399887523638827, "learning_rate": 2.253757412468792e-05, "loss": 0.1846, "step": 11840 }, { "epoch": 0.3661108064478756, "grad_norm": 0.08962367793400801, "learning_rate": 2.2524319865592085e-05, "loss": 0.1896, "step": 11850 }, { "epoch": 0.36641976071492016, "grad_norm": 0.0971662300096469, "learning_rate": 2.2511057751730957e-05, "loss": 0.1848, "step": 11860 }, { "epoch": 0.3667287149819648, "grad_norm": 0.08926981262442568, "learning_rate": 2.2497787796949086e-05, "loss": 0.1891, "step": 11870 }, { "epoch": 0.36703766924900944, "grad_norm": 0.0997820021140413, "learning_rate": 2.24845100150992e-05, "loss": 0.1883, "step": 11880 }, { "epoch": 0.367346623516054, "grad_norm": 0.11019032194902358, "learning_rate": 2.247122442004221e-05, "loss": 0.214, "step": 11890 }, { "epoch": 0.36765557778309865, "grad_norm": 0.08843270186981546, "learning_rate": 2.2457931025647164e-05, "loss": 0.1948, "step": 11900 }, { "epoch": 0.3679645320501433, "grad_norm": 0.08081469587518253, "learning_rate": 2.2444629845791263e-05, "loss": 0.1864, "step": 11910 }, { "epoch": 0.3682734863171879, "grad_norm": 0.08643601308806263, "learning_rate": 2.2431320894359837e-05, "loss": 0.189, "step": 11920 }, { "epoch": 0.3685824405842325, "grad_norm": 0.1043042633827644, "learning_rate": 2.2418004185246325e-05, "loss": 0.1863, "step": 11930 }, { "epoch": 0.36889139485127714, "grad_norm": 0.10909555625908413, "learning_rate": 2.240467973235227e-05, "loss": 0.1853, "step": 11940 }, { "epoch": 0.3692003491183218, "grad_norm": 0.10081295667943212, "learning_rate": 2.2391347549587288e-05, "loss": 0.1835, "step": 11950 }, { "epoch": 0.36950930338536636, "grad_norm": 0.10853639856568237, "learning_rate": 2.237800765086908e-05, "loss": 0.1869, "step": 11960 }, { "epoch": 0.369818257652411, "grad_norm": 0.10906644104912402, "learning_rate": 2.236466005012338e-05, "loss": 0.1841, "step": 11970 }, { "epoch": 0.37012721191945563, "grad_norm": 0.09492831998510312, "learning_rate": 2.2351304761283985e-05, "loss": 0.1873, "step": 11980 }, { "epoch": 0.37043616618650027, "grad_norm": 0.08404413315269131, "learning_rate": 2.233794179829271e-05, "loss": 0.1834, "step": 11990 }, { "epoch": 0.37074512045354485, "grad_norm": 0.11624722382524719, "learning_rate": 2.232457117509937e-05, "loss": 0.1999, "step": 12000 }, { "epoch": 0.3710540747205895, "grad_norm": 0.10604518786522354, "learning_rate": 2.2311192905661795e-05, "loss": 0.1876, "step": 12010 }, { "epoch": 0.3713630289876341, "grad_norm": 0.12277243084657756, "learning_rate": 2.229780700394578e-05, "loss": 0.1858, "step": 12020 }, { "epoch": 0.3716719832546787, "grad_norm": 0.09428607361303694, "learning_rate": 2.2284413483925097e-05, "loss": 0.1858, "step": 12030 }, { "epoch": 0.37198093752172334, "grad_norm": 0.10303962243287938, "learning_rate": 2.2271012359581468e-05, "loss": 0.1854, "step": 12040 }, { "epoch": 0.372289891788768, "grad_norm": 0.08662878274471632, "learning_rate": 2.2257603644904557e-05, "loss": 0.1833, "step": 12050 }, { "epoch": 0.3725988460558126, "grad_norm": 0.08877536584269159, "learning_rate": 2.224418735389195e-05, "loss": 0.1961, "step": 12060 }, { "epoch": 0.3729078003228572, "grad_norm": 0.09638428066654024, "learning_rate": 2.2230763500549136e-05, "loss": 0.19, "step": 12070 }, { "epoch": 0.37321675458990183, "grad_norm": 0.11273893075014302, "learning_rate": 2.2217332098889507e-05, "loss": 0.2024, "step": 12080 }, { "epoch": 0.37352570885694647, "grad_norm": 0.08659890915115652, "learning_rate": 2.2203893162934333e-05, "loss": 0.1927, "step": 12090 }, { "epoch": 0.37383466312399105, "grad_norm": 0.09580197460355058, "learning_rate": 2.2190446706712743e-05, "loss": 0.1927, "step": 12100 }, { "epoch": 0.3741436173910357, "grad_norm": 0.0861993079505064, "learning_rate": 2.2176992744261728e-05, "loss": 0.1848, "step": 12110 }, { "epoch": 0.3744525716580803, "grad_norm": 0.09478748886063199, "learning_rate": 2.21635312896261e-05, "loss": 0.1868, "step": 12120 }, { "epoch": 0.37476152592512496, "grad_norm": 0.10030432908753674, "learning_rate": 2.2150062356858507e-05, "loss": 0.1878, "step": 12130 }, { "epoch": 0.37507048019216954, "grad_norm": 0.1017851961915696, "learning_rate": 2.2136585960019387e-05, "loss": 0.1993, "step": 12140 }, { "epoch": 0.3753794344592142, "grad_norm": 0.10067172635604223, "learning_rate": 2.212310211317699e-05, "loss": 0.1934, "step": 12150 }, { "epoch": 0.3756883887262588, "grad_norm": 0.08778870365569208, "learning_rate": 2.2109610830407327e-05, "loss": 0.1862, "step": 12160 }, { "epoch": 0.3759973429933034, "grad_norm": 0.09508651264190547, "learning_rate": 2.2096112125794174e-05, "loss": 0.1972, "step": 12170 }, { "epoch": 0.37630629726034803, "grad_norm": 0.08730341557296374, "learning_rate": 2.208260601342907e-05, "loss": 0.1919, "step": 12180 }, { "epoch": 0.37661525152739267, "grad_norm": 0.10788799527763362, "learning_rate": 2.2069092507411265e-05, "loss": 0.195, "step": 12190 }, { "epoch": 0.3769242057944373, "grad_norm": 0.11811812837511006, "learning_rate": 2.2055571621847737e-05, "loss": 0.1864, "step": 12200 }, { "epoch": 0.3772331600614819, "grad_norm": 0.0878050403220875, "learning_rate": 2.2042043370853174e-05, "loss": 0.1855, "step": 12210 }, { "epoch": 0.3775421143285265, "grad_norm": 0.08853991233275472, "learning_rate": 2.2028507768549944e-05, "loss": 0.1885, "step": 12220 }, { "epoch": 0.37785106859557116, "grad_norm": 0.08452375786593688, "learning_rate": 2.2014964829068087e-05, "loss": 0.1925, "step": 12230 }, { "epoch": 0.37816002286261574, "grad_norm": 0.09903481689275309, "learning_rate": 2.2001414566545323e-05, "loss": 0.1859, "step": 12240 }, { "epoch": 0.3784689771296604, "grad_norm": 0.0902044230703504, "learning_rate": 2.198785699512698e-05, "loss": 0.1898, "step": 12250 }, { "epoch": 0.378777931396705, "grad_norm": 0.09789504524374835, "learning_rate": 2.1974292128966054e-05, "loss": 0.193, "step": 12260 }, { "epoch": 0.37908688566374965, "grad_norm": 0.0761749796310836, "learning_rate": 2.196071998222313e-05, "loss": 0.186, "step": 12270 }, { "epoch": 0.37939583993079423, "grad_norm": 0.09601634226279154, "learning_rate": 2.1947140569066405e-05, "loss": 0.1813, "step": 12280 }, { "epoch": 0.37970479419783887, "grad_norm": 0.10033439403921206, "learning_rate": 2.193355390367166e-05, "loss": 0.1877, "step": 12290 }, { "epoch": 0.3800137484648835, "grad_norm": 0.09969839405797135, "learning_rate": 2.1919960000222245e-05, "loss": 0.1894, "step": 12300 }, { "epoch": 0.3803227027319281, "grad_norm": 0.1060276229550653, "learning_rate": 2.1906358872909068e-05, "loss": 0.1877, "step": 12310 }, { "epoch": 0.3806316569989727, "grad_norm": 0.10895069946809241, "learning_rate": 2.1892750535930575e-05, "loss": 0.1882, "step": 12320 }, { "epoch": 0.38094061126601736, "grad_norm": 0.09820742351990522, "learning_rate": 2.187913500349274e-05, "loss": 0.1963, "step": 12330 }, { "epoch": 0.381249565533062, "grad_norm": 0.09472323522818168, "learning_rate": 2.1865512289809052e-05, "loss": 0.1857, "step": 12340 }, { "epoch": 0.3815585198001066, "grad_norm": 0.09988799948578274, "learning_rate": 2.185188240910049e-05, "loss": 0.1868, "step": 12350 }, { "epoch": 0.3818674740671512, "grad_norm": 0.12750044657653234, "learning_rate": 2.1838245375595517e-05, "loss": 0.1921, "step": 12360 }, { "epoch": 0.38217642833419585, "grad_norm": 0.09912066288832452, "learning_rate": 2.182460120353006e-05, "loss": 0.1874, "step": 12370 }, { "epoch": 0.38248538260124043, "grad_norm": 0.08189537759998616, "learning_rate": 2.1810949907147518e-05, "loss": 0.1783, "step": 12380 }, { "epoch": 0.38279433686828507, "grad_norm": 0.09036618563864209, "learning_rate": 2.179729150069869e-05, "loss": 0.1866, "step": 12390 }, { "epoch": 0.3831032911353297, "grad_norm": 0.09923347261178064, "learning_rate": 2.1783625998441832e-05, "loss": 0.1861, "step": 12400 }, { "epoch": 0.38341224540237434, "grad_norm": 0.0970615990744683, "learning_rate": 2.1769953414642595e-05, "loss": 0.187, "step": 12410 }, { "epoch": 0.3837211996694189, "grad_norm": 0.0971170760607018, "learning_rate": 2.1756273763574015e-05, "loss": 0.1914, "step": 12420 }, { "epoch": 0.38403015393646356, "grad_norm": 0.14313686063009398, "learning_rate": 2.1742587059516515e-05, "loss": 0.195, "step": 12430 }, { "epoch": 0.3843391082035082, "grad_norm": 0.10942143582420122, "learning_rate": 2.1728893316757873e-05, "loss": 0.1865, "step": 12440 }, { "epoch": 0.3846480624705528, "grad_norm": 0.09674273343079418, "learning_rate": 2.1715192549593225e-05, "loss": 0.1846, "step": 12450 }, { "epoch": 0.3849570167375974, "grad_norm": 0.11089546161791718, "learning_rate": 2.1701484772325033e-05, "loss": 0.1863, "step": 12460 }, { "epoch": 0.38526597100464205, "grad_norm": 0.08988728264341979, "learning_rate": 2.1687769999263082e-05, "loss": 0.1838, "step": 12470 }, { "epoch": 0.3855749252716867, "grad_norm": 0.10942846423576381, "learning_rate": 2.1674048244724448e-05, "loss": 0.1918, "step": 12480 }, { "epoch": 0.38588387953873127, "grad_norm": 0.0960690443747848, "learning_rate": 2.1660319523033513e-05, "loss": 0.1858, "step": 12490 }, { "epoch": 0.3861928338057759, "grad_norm": 0.088145278690529, "learning_rate": 2.164658384852191e-05, "loss": 0.1898, "step": 12500 }, { "epoch": 0.38650178807282054, "grad_norm": 0.09096558863983556, "learning_rate": 2.163284123552856e-05, "loss": 0.1896, "step": 12510 }, { "epoch": 0.3868107423398651, "grad_norm": 0.1047677160787241, "learning_rate": 2.1619091698399597e-05, "loss": 0.1884, "step": 12520 }, { "epoch": 0.38711969660690976, "grad_norm": 0.10141960460779591, "learning_rate": 2.1605335251488402e-05, "loss": 0.1857, "step": 12530 }, { "epoch": 0.3874286508739544, "grad_norm": 0.07862972805395212, "learning_rate": 2.1591571909155562e-05, "loss": 0.1885, "step": 12540 }, { "epoch": 0.38773760514099903, "grad_norm": 0.08583249061967742, "learning_rate": 2.1577801685768863e-05, "loss": 0.1848, "step": 12550 }, { "epoch": 0.3880465594080436, "grad_norm": 0.09202201423974528, "learning_rate": 2.1564024595703277e-05, "loss": 0.19, "step": 12560 }, { "epoch": 0.38835551367508825, "grad_norm": 0.1406654518530953, "learning_rate": 2.1550240653340943e-05, "loss": 0.1903, "step": 12570 }, { "epoch": 0.3886644679421329, "grad_norm": 0.09258293574371818, "learning_rate": 2.153644987307115e-05, "loss": 0.1884, "step": 12580 }, { "epoch": 0.38897342220917747, "grad_norm": 0.11507710882309509, "learning_rate": 2.152265226929033e-05, "loss": 0.194, "step": 12590 }, { "epoch": 0.3892823764762221, "grad_norm": 0.07649515273569724, "learning_rate": 2.150884785640203e-05, "loss": 0.1961, "step": 12600 }, { "epoch": 0.38959133074326674, "grad_norm": 0.08426992115426334, "learning_rate": 2.1495036648816916e-05, "loss": 0.1829, "step": 12610 }, { "epoch": 0.3899002850103114, "grad_norm": 0.09726246348335965, "learning_rate": 2.1481218660952744e-05, "loss": 0.1933, "step": 12620 }, { "epoch": 0.39020923927735596, "grad_norm": 0.5326000685879965, "learning_rate": 2.146739390723434e-05, "loss": 0.189, "step": 12630 }, { "epoch": 0.3905181935444006, "grad_norm": 0.08204811848597375, "learning_rate": 2.145356240209361e-05, "loss": 0.1931, "step": 12640 }, { "epoch": 0.39082714781144523, "grad_norm": 0.07572955421670016, "learning_rate": 2.1439724159969487e-05, "loss": 0.1913, "step": 12650 }, { "epoch": 0.3911361020784898, "grad_norm": 0.09723300209292356, "learning_rate": 2.142587919530795e-05, "loss": 0.1848, "step": 12660 }, { "epoch": 0.39144505634553445, "grad_norm": 0.087439750920017, "learning_rate": 2.1412027522561993e-05, "loss": 0.1869, "step": 12670 }, { "epoch": 0.3917540106125791, "grad_norm": 0.09070355508572686, "learning_rate": 2.1398169156191608e-05, "loss": 0.1994, "step": 12680 }, { "epoch": 0.3920629648796237, "grad_norm": 0.08435644984154654, "learning_rate": 2.1384304110663793e-05, "loss": 0.1885, "step": 12690 }, { "epoch": 0.3923719191466683, "grad_norm": 0.1021550545994983, "learning_rate": 2.137043240045249e-05, "loss": 0.1891, "step": 12700 }, { "epoch": 0.39268087341371294, "grad_norm": 0.08711715789476, "learning_rate": 2.1356554040038613e-05, "loss": 0.1836, "step": 12710 }, { "epoch": 0.3929898276807576, "grad_norm": 0.0981042539086943, "learning_rate": 2.134266904391003e-05, "loss": 0.1853, "step": 12720 }, { "epoch": 0.39329878194780216, "grad_norm": 0.11903424237869814, "learning_rate": 2.1328777426561508e-05, "loss": 0.1924, "step": 12730 }, { "epoch": 0.3936077362148468, "grad_norm": 0.09828569735706146, "learning_rate": 2.131487920249476e-05, "loss": 0.1878, "step": 12740 }, { "epoch": 0.39391669048189143, "grad_norm": 0.10736634595032453, "learning_rate": 2.1300974386218364e-05, "loss": 0.1865, "step": 12750 }, { "epoch": 0.39422564474893607, "grad_norm": 0.1078794855524927, "learning_rate": 2.12870629922478e-05, "loss": 0.1907, "step": 12760 }, { "epoch": 0.39453459901598065, "grad_norm": 0.09504763593375629, "learning_rate": 2.1273145035105407e-05, "loss": 0.1859, "step": 12770 }, { "epoch": 0.3948435532830253, "grad_norm": 0.10680741045769311, "learning_rate": 2.125922052932038e-05, "loss": 0.1941, "step": 12780 }, { "epoch": 0.3951525075500699, "grad_norm": 0.10606809238189722, "learning_rate": 2.1245289489428747e-05, "loss": 0.1853, "step": 12790 }, { "epoch": 0.3954614618171145, "grad_norm": 0.08659377105273015, "learning_rate": 2.1231351929973354e-05, "loss": 0.1922, "step": 12800 }, { "epoch": 0.39577041608415914, "grad_norm": 0.11632369655608324, "learning_rate": 2.1217407865503853e-05, "loss": 0.1865, "step": 12810 }, { "epoch": 0.3960793703512038, "grad_norm": 0.11702368925564903, "learning_rate": 2.1203457310576692e-05, "loss": 0.1836, "step": 12820 }, { "epoch": 0.3963883246182484, "grad_norm": 0.09962331373161037, "learning_rate": 2.11895002797551e-05, "loss": 0.1895, "step": 12830 }, { "epoch": 0.396697278885293, "grad_norm": 0.07817913257160077, "learning_rate": 2.117553678760905e-05, "loss": 0.1849, "step": 12840 }, { "epoch": 0.39700623315233763, "grad_norm": 0.07912360578462507, "learning_rate": 2.1161566848715264e-05, "loss": 0.1875, "step": 12850 }, { "epoch": 0.39731518741938227, "grad_norm": 0.09695989534077361, "learning_rate": 2.1147590477657214e-05, "loss": 0.1921, "step": 12860 }, { "epoch": 0.39762414168642685, "grad_norm": 0.10486524951927254, "learning_rate": 2.113360768902506e-05, "loss": 0.1955, "step": 12870 }, { "epoch": 0.3979330959534715, "grad_norm": 0.09637892490561646, "learning_rate": 2.1119618497415673e-05, "loss": 0.182, "step": 12880 }, { "epoch": 0.3982420502205161, "grad_norm": 0.08949872499108744, "learning_rate": 2.110562291743261e-05, "loss": 0.1828, "step": 12890 }, { "epoch": 0.39855100448756076, "grad_norm": 0.09086233140354587, "learning_rate": 2.1091620963686092e-05, "loss": 0.1908, "step": 12900 }, { "epoch": 0.39885995875460534, "grad_norm": 0.10980143053783954, "learning_rate": 2.1077612650793004e-05, "loss": 0.188, "step": 12910 }, { "epoch": 0.39916891302165, "grad_norm": 0.13226727670311497, "learning_rate": 2.1063597993376857e-05, "loss": 0.1938, "step": 12920 }, { "epoch": 0.3994778672886946, "grad_norm": 0.08670786506261809, "learning_rate": 2.104957700606778e-05, "loss": 0.1866, "step": 12930 }, { "epoch": 0.3997868215557392, "grad_norm": 0.10261197420659128, "learning_rate": 2.1035549703502533e-05, "loss": 0.1918, "step": 12940 }, { "epoch": 0.40009577582278383, "grad_norm": 0.09071894437343855, "learning_rate": 2.1021516100324455e-05, "loss": 0.192, "step": 12950 }, { "epoch": 0.40040473008982846, "grad_norm": 0.08839448769964206, "learning_rate": 2.1007476211183453e-05, "loss": 0.1894, "step": 12960 }, { "epoch": 0.4007136843568731, "grad_norm": 0.22092952503263738, "learning_rate": 2.0993430050736017e-05, "loss": 0.1942, "step": 12970 }, { "epoch": 0.4010226386239177, "grad_norm": 0.14461578183039348, "learning_rate": 2.0979377633645163e-05, "loss": 0.1883, "step": 12980 }, { "epoch": 0.4013315928909623, "grad_norm": 0.09938517723185097, "learning_rate": 2.0965318974580452e-05, "loss": 0.1838, "step": 12990 }, { "epoch": 0.40164054715800696, "grad_norm": 0.1013939273930599, "learning_rate": 2.0951254088217958e-05, "loss": 0.1901, "step": 13000 }, { "epoch": 0.40194950142505154, "grad_norm": 0.08510276310095609, "learning_rate": 2.093718298924025e-05, "loss": 0.1872, "step": 13010 }, { "epoch": 0.4022584556920962, "grad_norm": 0.09845304927890826, "learning_rate": 2.0923105692336396e-05, "loss": 0.1892, "step": 13020 }, { "epoch": 0.4025674099591408, "grad_norm": 0.08600848269514792, "learning_rate": 2.090902221220192e-05, "loss": 0.1861, "step": 13030 }, { "epoch": 0.40287636422618545, "grad_norm": 0.08941561354144911, "learning_rate": 2.0894932563538802e-05, "loss": 0.1853, "step": 13040 }, { "epoch": 0.40318531849323, "grad_norm": 0.09258580376709308, "learning_rate": 2.088083676105547e-05, "loss": 0.1924, "step": 13050 }, { "epoch": 0.40349427276027466, "grad_norm": 0.09935786665947385, "learning_rate": 2.0866734819466776e-05, "loss": 0.1883, "step": 13060 }, { "epoch": 0.4038032270273193, "grad_norm": 0.10612466525117271, "learning_rate": 2.0852626753493966e-05, "loss": 0.1869, "step": 13070 }, { "epoch": 0.4041121812943639, "grad_norm": 0.08229392848397417, "learning_rate": 2.0838512577864698e-05, "loss": 0.1882, "step": 13080 }, { "epoch": 0.4044211355614085, "grad_norm": 0.08769203289903779, "learning_rate": 2.0824392307312998e-05, "loss": 0.1837, "step": 13090 }, { "epoch": 0.40473008982845315, "grad_norm": 0.08902737795021919, "learning_rate": 2.0810265956579255e-05, "loss": 0.1851, "step": 13100 }, { "epoch": 0.4050390440954978, "grad_norm": 0.10275579899241351, "learning_rate": 2.0796133540410204e-05, "loss": 0.1852, "step": 13110 }, { "epoch": 0.40534799836254237, "grad_norm": 0.10587801739043626, "learning_rate": 2.0781995073558918e-05, "loss": 0.1874, "step": 13120 }, { "epoch": 0.405656952629587, "grad_norm": 0.1246824199375735, "learning_rate": 2.0767850570784784e-05, "loss": 0.1931, "step": 13130 }, { "epoch": 0.40596590689663165, "grad_norm": 0.09930681277470046, "learning_rate": 2.0753700046853483e-05, "loss": 0.1913, "step": 13140 }, { "epoch": 0.4062748611636762, "grad_norm": 0.08695439519398754, "learning_rate": 2.0739543516536993e-05, "loss": 0.1883, "step": 13150 }, { "epoch": 0.40658381543072086, "grad_norm": 0.09422358361375846, "learning_rate": 2.0725380994613546e-05, "loss": 0.1869, "step": 13160 }, { "epoch": 0.4068927696977655, "grad_norm": 0.0968254311671267, "learning_rate": 2.071121249586765e-05, "loss": 0.1929, "step": 13170 }, { "epoch": 0.40720172396481014, "grad_norm": 0.10575057340327401, "learning_rate": 2.0697038035090043e-05, "loss": 0.2018, "step": 13180 }, { "epoch": 0.4075106782318547, "grad_norm": 0.09948013686687163, "learning_rate": 2.068285762707768e-05, "loss": 0.1863, "step": 13190 }, { "epoch": 0.40781963249889935, "grad_norm": 0.09195181849704964, "learning_rate": 2.0668671286633725e-05, "loss": 0.1841, "step": 13200 }, { "epoch": 0.408128586765944, "grad_norm": 0.09366078386563499, "learning_rate": 2.065447902856755e-05, "loss": 0.1937, "step": 13210 }, { "epoch": 0.40843754103298857, "grad_norm": 0.08190487731690498, "learning_rate": 2.0640280867694688e-05, "loss": 0.1865, "step": 13220 }, { "epoch": 0.4087464953000332, "grad_norm": 0.09882543862828375, "learning_rate": 2.062607681883684e-05, "loss": 0.1911, "step": 13230 }, { "epoch": 0.40905544956707784, "grad_norm": 0.08367448640349641, "learning_rate": 2.061186689682186e-05, "loss": 0.1948, "step": 13240 }, { "epoch": 0.4093644038341225, "grad_norm": 0.11294440587485939, "learning_rate": 2.0597651116483717e-05, "loss": 0.1942, "step": 13250 }, { "epoch": 0.40967335810116706, "grad_norm": 0.08153232269152608, "learning_rate": 2.0583429492662514e-05, "loss": 0.1906, "step": 13260 }, { "epoch": 0.4099823123682117, "grad_norm": 0.09853181618840731, "learning_rate": 2.056920204020444e-05, "loss": 0.1927, "step": 13270 }, { "epoch": 0.41029126663525634, "grad_norm": 0.0790572919252489, "learning_rate": 2.0554968773961778e-05, "loss": 0.1914, "step": 13280 }, { "epoch": 0.4106002209023009, "grad_norm": 0.09194298049947046, "learning_rate": 2.0540729708792875e-05, "loss": 0.1946, "step": 13290 }, { "epoch": 0.41090917516934555, "grad_norm": 0.09819303095831762, "learning_rate": 2.052648485956213e-05, "loss": 0.1871, "step": 13300 }, { "epoch": 0.4112181294363902, "grad_norm": 0.09155683839623317, "learning_rate": 2.051223424113999e-05, "loss": 0.1889, "step": 13310 }, { "epoch": 0.4115270837034348, "grad_norm": 0.08773435869517471, "learning_rate": 2.0497977868402914e-05, "loss": 0.1853, "step": 13320 }, { "epoch": 0.4118360379704794, "grad_norm": 0.08760743257171492, "learning_rate": 2.0483715756233373e-05, "loss": 0.1883, "step": 13330 }, { "epoch": 0.41214499223752404, "grad_norm": 0.10691711717069, "learning_rate": 2.046944791951982e-05, "loss": 0.186, "step": 13340 }, { "epoch": 0.4124539465045687, "grad_norm": 0.08634815961598084, "learning_rate": 2.04551743731567e-05, "loss": 0.1947, "step": 13350 }, { "epoch": 0.41276290077161326, "grad_norm": 0.1468646772202215, "learning_rate": 2.0440895132044412e-05, "loss": 0.1953, "step": 13360 }, { "epoch": 0.4130718550386579, "grad_norm": 0.0829618595581519, "learning_rate": 2.0426610211089293e-05, "loss": 0.1873, "step": 13370 }, { "epoch": 0.41338080930570253, "grad_norm": 0.09593809763101668, "learning_rate": 2.0412319625203617e-05, "loss": 0.1868, "step": 13380 }, { "epoch": 0.41368976357274717, "grad_norm": 0.10604014121023575, "learning_rate": 2.039802338930557e-05, "loss": 0.1937, "step": 13390 }, { "epoch": 0.41399871783979175, "grad_norm": 0.0835303284590433, "learning_rate": 2.038372151831923e-05, "loss": 0.1888, "step": 13400 }, { "epoch": 0.4143076721068364, "grad_norm": 0.09142589495290121, "learning_rate": 2.0369414027174572e-05, "loss": 0.1882, "step": 13410 }, { "epoch": 0.414616626373881, "grad_norm": 0.11497398429958629, "learning_rate": 2.035510093080742e-05, "loss": 0.1886, "step": 13420 }, { "epoch": 0.4149255806409256, "grad_norm": 0.09442063761529558, "learning_rate": 2.034078224415947e-05, "loss": 0.1801, "step": 13430 }, { "epoch": 0.41523453490797024, "grad_norm": 0.13278925914404902, "learning_rate": 2.0326457982178228e-05, "loss": 0.2021, "step": 13440 }, { "epoch": 0.4155434891750149, "grad_norm": 0.09970126251068312, "learning_rate": 2.0312128159817043e-05, "loss": 0.1863, "step": 13450 }, { "epoch": 0.4158524434420595, "grad_norm": 0.10704701412369369, "learning_rate": 2.029779279203506e-05, "loss": 0.1844, "step": 13460 }, { "epoch": 0.4161613977091041, "grad_norm": 0.09866383422506134, "learning_rate": 2.0283451893797212e-05, "loss": 0.1952, "step": 13470 }, { "epoch": 0.41647035197614873, "grad_norm": 0.08402645934569061, "learning_rate": 2.0269105480074204e-05, "loss": 0.1877, "step": 13480 }, { "epoch": 0.41677930624319337, "grad_norm": 0.08989225505028141, "learning_rate": 2.02547535658425e-05, "loss": 0.1884, "step": 13490 }, { "epoch": 0.41708826051023795, "grad_norm": 0.10578288777862112, "learning_rate": 2.0240396166084315e-05, "loss": 0.1877, "step": 13500 }, { "epoch": 0.4173972147772826, "grad_norm": 0.07430768141184434, "learning_rate": 2.022603329578757e-05, "loss": 0.1876, "step": 13510 }, { "epoch": 0.4177061690443272, "grad_norm": 0.08544179582130298, "learning_rate": 2.021166496994592e-05, "loss": 0.1855, "step": 13520 }, { "epoch": 0.41801512331137186, "grad_norm": 0.08505474087708828, "learning_rate": 2.0197291203558696e-05, "loss": 0.1916, "step": 13530 }, { "epoch": 0.41832407757841644, "grad_norm": 0.07972136122383491, "learning_rate": 2.0182912011630923e-05, "loss": 0.1912, "step": 13540 }, { "epoch": 0.4186330318454611, "grad_norm": 0.08981097110612854, "learning_rate": 2.0168527409173284e-05, "loss": 0.1827, "step": 13550 }, { "epoch": 0.4189419861125057, "grad_norm": 0.08533861833995386, "learning_rate": 2.015413741120211e-05, "loss": 0.185, "step": 13560 }, { "epoch": 0.4192509403795503, "grad_norm": 0.09544531938911857, "learning_rate": 2.0139742032739356e-05, "loss": 0.1898, "step": 13570 }, { "epoch": 0.41955989464659493, "grad_norm": 0.07626397128140812, "learning_rate": 2.012534128881261e-05, "loss": 0.1903, "step": 13580 }, { "epoch": 0.41986884891363957, "grad_norm": 0.10049543968215914, "learning_rate": 2.0110935194455054e-05, "loss": 0.1839, "step": 13590 }, { "epoch": 0.4201778031806842, "grad_norm": 0.09195127165080279, "learning_rate": 2.0096523764705447e-05, "loss": 0.1845, "step": 13600 }, { "epoch": 0.4204867574477288, "grad_norm": 0.08843111699709341, "learning_rate": 2.0082107014608128e-05, "loss": 0.1839, "step": 13610 }, { "epoch": 0.4207957117147734, "grad_norm": 0.09131074992090338, "learning_rate": 2.0067684959212995e-05, "loss": 0.1866, "step": 13620 }, { "epoch": 0.42110466598181806, "grad_norm": 0.082216641269758, "learning_rate": 2.0053257613575467e-05, "loss": 0.183, "step": 13630 }, { "epoch": 0.42141362024886264, "grad_norm": 0.08831693579553958, "learning_rate": 2.0038824992756496e-05, "loss": 0.1841, "step": 13640 }, { "epoch": 0.4217225745159073, "grad_norm": 0.09112703304305547, "learning_rate": 2.0024387111822547e-05, "loss": 0.182, "step": 13650 }, { "epoch": 0.4220315287829519, "grad_norm": 0.094754745798283, "learning_rate": 2.0009943985845564e-05, "loss": 0.1869, "step": 13660 }, { "epoch": 0.42234048304999655, "grad_norm": 0.09299186425263704, "learning_rate": 1.999549562990297e-05, "loss": 0.1947, "step": 13670 }, { "epoch": 0.42264943731704113, "grad_norm": 0.09819702932619447, "learning_rate": 1.9981042059077657e-05, "loss": 0.1854, "step": 13680 }, { "epoch": 0.42295839158408577, "grad_norm": 0.09652302540077827, "learning_rate": 1.9966583288457944e-05, "loss": 0.1836, "step": 13690 }, { "epoch": 0.4232673458511304, "grad_norm": 0.09629148997305158, "learning_rate": 1.9952119333137594e-05, "loss": 0.1881, "step": 13700 }, { "epoch": 0.423576300118175, "grad_norm": 0.08819754976061543, "learning_rate": 1.9937650208215766e-05, "loss": 0.1902, "step": 13710 }, { "epoch": 0.4238852543852196, "grad_norm": 0.09763247427356099, "learning_rate": 1.9923175928797035e-05, "loss": 0.185, "step": 13720 }, { "epoch": 0.42419420865226426, "grad_norm": 0.09401799831882597, "learning_rate": 1.990869650999134e-05, "loss": 0.1854, "step": 13730 }, { "epoch": 0.4245031629193089, "grad_norm": 0.08850753614982744, "learning_rate": 1.9894211966913998e-05, "loss": 0.1911, "step": 13740 }, { "epoch": 0.4248121171863535, "grad_norm": 0.0963336941553544, "learning_rate": 1.987972231468567e-05, "loss": 0.1847, "step": 13750 }, { "epoch": 0.4251210714533981, "grad_norm": 0.10177981975001972, "learning_rate": 1.9865227568432343e-05, "loss": 0.1844, "step": 13760 }, { "epoch": 0.42543002572044275, "grad_norm": 0.09698420187479102, "learning_rate": 1.9850727743285336e-05, "loss": 0.1869, "step": 13770 }, { "epoch": 0.42573897998748733, "grad_norm": 0.1035133011371867, "learning_rate": 1.9836222854381258e-05, "loss": 0.1883, "step": 13780 }, { "epoch": 0.42604793425453197, "grad_norm": 0.10466490582533701, "learning_rate": 1.9821712916862015e-05, "loss": 0.187, "step": 13790 }, { "epoch": 0.4263568885215766, "grad_norm": 0.07449034686477328, "learning_rate": 1.9807197945874768e-05, "loss": 0.1925, "step": 13800 }, { "epoch": 0.42666584278862124, "grad_norm": 0.0773908754113331, "learning_rate": 1.979267795657195e-05, "loss": 0.1995, "step": 13810 }, { "epoch": 0.4269747970556658, "grad_norm": 0.08664857002889029, "learning_rate": 1.9778152964111222e-05, "loss": 0.1939, "step": 13820 }, { "epoch": 0.42728375132271046, "grad_norm": 0.08680359746443131, "learning_rate": 1.9763622983655464e-05, "loss": 0.1925, "step": 13830 }, { "epoch": 0.4275927055897551, "grad_norm": 0.08025786602482227, "learning_rate": 1.974908803037278e-05, "loss": 0.1852, "step": 13840 }, { "epoch": 0.4279016598567997, "grad_norm": 0.08289192691947796, "learning_rate": 1.9734548119436443e-05, "loss": 0.1995, "step": 13850 }, { "epoch": 0.4282106141238443, "grad_norm": 0.08901264043929087, "learning_rate": 1.9720003266024923e-05, "loss": 0.1826, "step": 13860 }, { "epoch": 0.42851956839088895, "grad_norm": 0.09935947498977617, "learning_rate": 1.9705453485321836e-05, "loss": 0.1857, "step": 13870 }, { "epoch": 0.4288285226579336, "grad_norm": 0.09254167682803593, "learning_rate": 1.9690898792515944e-05, "loss": 0.1882, "step": 13880 }, { "epoch": 0.42913747692497817, "grad_norm": 0.07926805991861635, "learning_rate": 1.9676339202801138e-05, "loss": 0.1851, "step": 13890 }, { "epoch": 0.4294464311920228, "grad_norm": 0.10148392340442433, "learning_rate": 1.9661774731376423e-05, "loss": 0.1828, "step": 13900 }, { "epoch": 0.42975538545906744, "grad_norm": 0.08943261878553953, "learning_rate": 1.9647205393445893e-05, "loss": 0.1865, "step": 13910 }, { "epoch": 0.430064339726112, "grad_norm": 0.08878554964023881, "learning_rate": 1.963263120421874e-05, "loss": 0.1875, "step": 13920 }, { "epoch": 0.43037329399315666, "grad_norm": 0.12975106655918497, "learning_rate": 1.961805217890919e-05, "loss": 0.1893, "step": 13930 }, { "epoch": 0.4306822482602013, "grad_norm": 0.08926495385588933, "learning_rate": 1.960346833273655e-05, "loss": 0.1891, "step": 13940 }, { "epoch": 0.43099120252724593, "grad_norm": 0.08942323160507486, "learning_rate": 1.9588879680925135e-05, "loss": 0.1891, "step": 13950 }, { "epoch": 0.4313001567942905, "grad_norm": 0.09477219895998903, "learning_rate": 1.9574286238704288e-05, "loss": 0.1868, "step": 13960 }, { "epoch": 0.43160911106133515, "grad_norm": 0.1056608039747648, "learning_rate": 1.9559688021308356e-05, "loss": 0.1848, "step": 13970 }, { "epoch": 0.4319180653283798, "grad_norm": 0.09683030360269723, "learning_rate": 1.9545085043976662e-05, "loss": 0.1884, "step": 13980 }, { "epoch": 0.43222701959542437, "grad_norm": 0.07194826476901177, "learning_rate": 1.9530477321953506e-05, "loss": 0.1984, "step": 13990 }, { "epoch": 0.432535973862469, "grad_norm": 0.08807504764938884, "learning_rate": 1.9515864870488132e-05, "loss": 0.1956, "step": 14000 }, { "epoch": 0.43284492812951364, "grad_norm": 0.09157827705197799, "learning_rate": 1.950124770483473e-05, "loss": 0.1853, "step": 14010 }, { "epoch": 0.4331538823965583, "grad_norm": 0.09126112475800932, "learning_rate": 1.9486625840252407e-05, "loss": 0.1875, "step": 14020 }, { "epoch": 0.43346283666360286, "grad_norm": 0.08139981920700338, "learning_rate": 1.9471999292005172e-05, "loss": 0.1856, "step": 14030 }, { "epoch": 0.4337717909306475, "grad_norm": 0.09353478022404632, "learning_rate": 1.945736807536193e-05, "loss": 0.1858, "step": 14040 }, { "epoch": 0.43408074519769213, "grad_norm": 0.08798355421745707, "learning_rate": 1.9442732205596462e-05, "loss": 0.1901, "step": 14050 }, { "epoch": 0.4343896994647367, "grad_norm": 0.09421803651890609, "learning_rate": 1.942809169798739e-05, "loss": 0.1886, "step": 14060 }, { "epoch": 0.43469865373178135, "grad_norm": 0.10686880282959817, "learning_rate": 1.941344656781819e-05, "loss": 0.1865, "step": 14070 }, { "epoch": 0.435007607998826, "grad_norm": 0.10075315487419693, "learning_rate": 1.9398796830377173e-05, "loss": 0.1824, "step": 14080 }, { "epoch": 0.4353165622658706, "grad_norm": 0.08157116161631962, "learning_rate": 1.9384142500957438e-05, "loss": 0.1899, "step": 14090 }, { "epoch": 0.4356255165329152, "grad_norm": 0.10215857572524423, "learning_rate": 1.9369483594856895e-05, "loss": 0.1936, "step": 14100 }, { "epoch": 0.43593447079995984, "grad_norm": 0.09463285224034536, "learning_rate": 1.935482012737822e-05, "loss": 0.1883, "step": 14110 }, { "epoch": 0.4362434250670045, "grad_norm": 0.1133192243836287, "learning_rate": 1.934015211382886e-05, "loss": 0.193, "step": 14120 }, { "epoch": 0.43655237933404906, "grad_norm": 0.085173513733126, "learning_rate": 1.9325479569521e-05, "loss": 0.1893, "step": 14130 }, { "epoch": 0.4368613336010937, "grad_norm": 0.08015246872997993, "learning_rate": 1.931080250977156e-05, "loss": 0.1976, "step": 14140 }, { "epoch": 0.43717028786813833, "grad_norm": 0.10004318093720328, "learning_rate": 1.929612094990217e-05, "loss": 0.1968, "step": 14150 }, { "epoch": 0.43747924213518297, "grad_norm": 0.10879740445860499, "learning_rate": 1.9281434905239162e-05, "loss": 0.1903, "step": 14160 }, { "epoch": 0.43778819640222755, "grad_norm": 0.08899220468383998, "learning_rate": 1.9266744391113547e-05, "loss": 0.1851, "step": 14170 }, { "epoch": 0.4380971506692722, "grad_norm": 0.07604691874249775, "learning_rate": 1.9252049422860998e-05, "loss": 0.1868, "step": 14180 }, { "epoch": 0.4384061049363168, "grad_norm": 0.09320862750343213, "learning_rate": 1.9237350015821847e-05, "loss": 0.1894, "step": 14190 }, { "epoch": 0.4387150592033614, "grad_norm": 0.09481159082107075, "learning_rate": 1.922264618534106e-05, "loss": 0.1869, "step": 14200 }, { "epoch": 0.43902401347040604, "grad_norm": 0.11383050740508886, "learning_rate": 1.9207937946768198e-05, "loss": 0.188, "step": 14210 }, { "epoch": 0.4393329677374507, "grad_norm": 0.08376420694724454, "learning_rate": 1.9193225315457458e-05, "loss": 0.1822, "step": 14220 }, { "epoch": 0.4396419220044953, "grad_norm": 0.09349902209276498, "learning_rate": 1.9178508306767596e-05, "loss": 0.1886, "step": 14230 }, { "epoch": 0.4399508762715399, "grad_norm": 0.10217687349968801, "learning_rate": 1.9163786936061954e-05, "loss": 0.189, "step": 14240 }, { "epoch": 0.44025983053858453, "grad_norm": 0.07900799082968471, "learning_rate": 1.914906121870842e-05, "loss": 0.1993, "step": 14250 }, { "epoch": 0.44056878480562917, "grad_norm": 0.07887609581075224, "learning_rate": 1.913433117007941e-05, "loss": 0.1884, "step": 14260 }, { "epoch": 0.44087773907267375, "grad_norm": 0.10239182083915561, "learning_rate": 1.9119596805551892e-05, "loss": 0.1876, "step": 14270 }, { "epoch": 0.4411866933397184, "grad_norm": 0.09126597517046119, "learning_rate": 1.9104858140507295e-05, "loss": 0.1921, "step": 14280 }, { "epoch": 0.441495647606763, "grad_norm": 0.09596889892113322, "learning_rate": 1.909011519033158e-05, "loss": 0.1849, "step": 14290 }, { "epoch": 0.44180460187380766, "grad_norm": 0.08987253107795462, "learning_rate": 1.9075367970415155e-05, "loss": 0.1844, "step": 14300 }, { "epoch": 0.44211355614085224, "grad_norm": 0.10479803884031098, "learning_rate": 1.9060616496152895e-05, "loss": 0.1873, "step": 14310 }, { "epoch": 0.4424225104078969, "grad_norm": 0.08682188064868668, "learning_rate": 1.9045860782944114e-05, "loss": 0.1873, "step": 14320 }, { "epoch": 0.4427314646749415, "grad_norm": 0.08269565963791928, "learning_rate": 1.9031100846192553e-05, "loss": 0.1937, "step": 14330 }, { "epoch": 0.4430404189419861, "grad_norm": 0.09119937691065266, "learning_rate": 1.901633670130636e-05, "loss": 0.1969, "step": 14340 }, { "epoch": 0.44334937320903073, "grad_norm": 0.10316256903724103, "learning_rate": 1.900156836369808e-05, "loss": 0.1862, "step": 14350 }, { "epoch": 0.44365832747607536, "grad_norm": 0.08498107350359378, "learning_rate": 1.8986795848784624e-05, "loss": 0.1955, "step": 14360 }, { "epoch": 0.44396728174312, "grad_norm": 0.09596006383160492, "learning_rate": 1.897201917198728e-05, "loss": 0.1854, "step": 14370 }, { "epoch": 0.4442762360101646, "grad_norm": 0.10427921561793586, "learning_rate": 1.8957238348731663e-05, "loss": 0.1895, "step": 14380 }, { "epoch": 0.4445851902772092, "grad_norm": 0.10313493002397604, "learning_rate": 1.894245339444773e-05, "loss": 0.1856, "step": 14390 }, { "epoch": 0.44489414454425386, "grad_norm": 0.10719812868284746, "learning_rate": 1.8927664324569747e-05, "loss": 0.1878, "step": 14400 }, { "epoch": 0.44520309881129844, "grad_norm": 0.07234710279240211, "learning_rate": 1.891287115453627e-05, "loss": 0.1882, "step": 14410 }, { "epoch": 0.4455120530783431, "grad_norm": 0.08478092043839999, "learning_rate": 1.8898073899790148e-05, "loss": 0.1949, "step": 14420 }, { "epoch": 0.4458210073453877, "grad_norm": 0.07844482376710984, "learning_rate": 1.8883272575778482e-05, "loss": 0.1909, "step": 14430 }, { "epoch": 0.44612996161243235, "grad_norm": 0.12773304235287128, "learning_rate": 1.8868467197952627e-05, "loss": 0.1904, "step": 14440 }, { "epoch": 0.4464389158794769, "grad_norm": 0.09154512340021774, "learning_rate": 1.8853657781768162e-05, "loss": 0.1868, "step": 14450 }, { "epoch": 0.44674787014652156, "grad_norm": 0.08640986151865691, "learning_rate": 1.8838844342684896e-05, "loss": 0.1878, "step": 14460 }, { "epoch": 0.4470568244135662, "grad_norm": 0.08791857572167769, "learning_rate": 1.8824026896166834e-05, "loss": 0.187, "step": 14470 }, { "epoch": 0.4473657786806108, "grad_norm": 0.09113953242817693, "learning_rate": 1.8809205457682144e-05, "loss": 0.191, "step": 14480 }, { "epoch": 0.4476747329476554, "grad_norm": 0.0872648088127917, "learning_rate": 1.8794380042703193e-05, "loss": 0.1865, "step": 14490 }, { "epoch": 0.44798368721470005, "grad_norm": 0.09349549807896199, "learning_rate": 1.8779550666706476e-05, "loss": 0.1847, "step": 14500 }, { "epoch": 0.4482926414817447, "grad_norm": 0.09931289139986942, "learning_rate": 1.8764717345172628e-05, "loss": 0.1947, "step": 14510 }, { "epoch": 0.4486015957487893, "grad_norm": 0.1094366859375454, "learning_rate": 1.874988009358641e-05, "loss": 0.1923, "step": 14520 }, { "epoch": 0.4489105500158339, "grad_norm": 0.10188243848239363, "learning_rate": 1.873503892743668e-05, "loss": 0.1855, "step": 14530 }, { "epoch": 0.44921950428287855, "grad_norm": 0.09185140886105803, "learning_rate": 1.872019386221638e-05, "loss": 0.1889, "step": 14540 }, { "epoch": 0.4495284585499231, "grad_norm": 0.1104889080374331, "learning_rate": 1.870534491342253e-05, "loss": 0.188, "step": 14550 }, { "epoch": 0.44983741281696776, "grad_norm": 0.09094614020062022, "learning_rate": 1.86904920965562e-05, "loss": 0.1853, "step": 14560 }, { "epoch": 0.4501463670840124, "grad_norm": 0.07721404526513222, "learning_rate": 1.8675635427122496e-05, "loss": 0.1923, "step": 14570 }, { "epoch": 0.45045532135105704, "grad_norm": 0.0905134023141106, "learning_rate": 1.8660774920630547e-05, "loss": 0.1824, "step": 14580 }, { "epoch": 0.4507642756181016, "grad_norm": 0.10448425942826392, "learning_rate": 1.864591059259349e-05, "loss": 0.1902, "step": 14590 }, { "epoch": 0.45107322988514625, "grad_norm": 0.07024958158034736, "learning_rate": 1.863104245852845e-05, "loss": 0.1837, "step": 14600 }, { "epoch": 0.4513821841521909, "grad_norm": 0.07809229034165306, "learning_rate": 1.861617053395652e-05, "loss": 0.1899, "step": 14610 }, { "epoch": 0.45169113841923547, "grad_norm": 0.08308319492796883, "learning_rate": 1.860129483440276e-05, "loss": 0.1865, "step": 14620 }, { "epoch": 0.4520000926862801, "grad_norm": 0.09811794640335583, "learning_rate": 1.8586415375396162e-05, "loss": 0.1852, "step": 14630 }, { "epoch": 0.45230904695332474, "grad_norm": 0.10580092058921427, "learning_rate": 1.8571532172469644e-05, "loss": 0.1809, "step": 14640 }, { "epoch": 0.4526180012203694, "grad_norm": 0.07895169754725709, "learning_rate": 1.8556645241160042e-05, "loss": 0.188, "step": 14650 }, { "epoch": 0.45292695548741396, "grad_norm": 0.09332059811801918, "learning_rate": 1.8541754597008066e-05, "loss": 0.1871, "step": 14660 }, { "epoch": 0.4532359097544586, "grad_norm": 0.10443773639876203, "learning_rate": 1.852686025555832e-05, "loss": 0.1889, "step": 14670 }, { "epoch": 0.45354486402150324, "grad_norm": 0.10221171996264637, "learning_rate": 1.851196223235926e-05, "loss": 0.1873, "step": 14680 }, { "epoch": 0.4538538182885478, "grad_norm": 0.08900271888338614, "learning_rate": 1.8497060542963183e-05, "loss": 0.1861, "step": 14690 }, { "epoch": 0.45416277255559245, "grad_norm": 0.10763607054704716, "learning_rate": 1.848215520292622e-05, "loss": 0.187, "step": 14700 }, { "epoch": 0.4544717268226371, "grad_norm": 0.09251662789119114, "learning_rate": 1.84672462278083e-05, "loss": 0.1867, "step": 14710 }, { "epoch": 0.4547806810896817, "grad_norm": 0.11126069465565208, "learning_rate": 1.845233363317316e-05, "loss": 0.1858, "step": 14720 }, { "epoch": 0.4550896353567263, "grad_norm": 0.10369781754615437, "learning_rate": 1.843741743458831e-05, "loss": 0.1934, "step": 14730 }, { "epoch": 0.45539858962377094, "grad_norm": 0.09510775173864588, "learning_rate": 1.8422497647625024e-05, "loss": 0.19, "step": 14740 }, { "epoch": 0.4557075438908156, "grad_norm": 0.0749622110991937, "learning_rate": 1.840757428785832e-05, "loss": 0.1876, "step": 14750 }, { "epoch": 0.45601649815786016, "grad_norm": 0.09607320796450156, "learning_rate": 1.839264737086695e-05, "loss": 0.1852, "step": 14760 }, { "epoch": 0.4563254524249048, "grad_norm": 0.08572065356867109, "learning_rate": 1.8377716912233372e-05, "loss": 0.1859, "step": 14770 }, { "epoch": 0.45663440669194943, "grad_norm": 0.09045929059827176, "learning_rate": 1.8362782927543748e-05, "loss": 0.1852, "step": 14780 }, { "epoch": 0.45694336095899407, "grad_norm": 0.08558816792747759, "learning_rate": 1.8347845432387914e-05, "loss": 0.1869, "step": 14790 }, { "epoch": 0.45725231522603865, "grad_norm": 0.09781406993689364, "learning_rate": 1.8332904442359382e-05, "loss": 0.1879, "step": 14800 }, { "epoch": 0.4575612694930833, "grad_norm": 0.09270766692079555, "learning_rate": 1.83179599730553e-05, "loss": 0.1848, "step": 14810 }, { "epoch": 0.4578702237601279, "grad_norm": 0.0921124967651755, "learning_rate": 1.8303012040076456e-05, "loss": 0.1949, "step": 14820 }, { "epoch": 0.4581791780271725, "grad_norm": 0.09234506599274829, "learning_rate": 1.8288060659027245e-05, "loss": 0.1847, "step": 14830 }, { "epoch": 0.45848813229421714, "grad_norm": 0.09590755864665845, "learning_rate": 1.8273105845515677e-05, "loss": 0.184, "step": 14840 }, { "epoch": 0.4587970865612618, "grad_norm": 0.09562393573248085, "learning_rate": 1.825814761515333e-05, "loss": 0.1914, "step": 14850 }, { "epoch": 0.4591060408283064, "grad_norm": 0.11029514353162492, "learning_rate": 1.8243185983555354e-05, "loss": 0.1882, "step": 14860 }, { "epoch": 0.459414995095351, "grad_norm": 0.0966739344651011, "learning_rate": 1.822822096634045e-05, "loss": 0.187, "step": 14870 }, { "epoch": 0.45972394936239563, "grad_norm": 0.10608787716057655, "learning_rate": 1.821325257913086e-05, "loss": 0.1881, "step": 14880 }, { "epoch": 0.46003290362944027, "grad_norm": 0.09042449727808383, "learning_rate": 1.819828083755233e-05, "loss": 0.1864, "step": 14890 }, { "epoch": 0.46034185789648485, "grad_norm": 0.0901560389505401, "learning_rate": 1.8183305757234122e-05, "loss": 0.1819, "step": 14900 }, { "epoch": 0.4606508121635295, "grad_norm": 0.26809753696792066, "learning_rate": 1.8168327353808966e-05, "loss": 0.1891, "step": 14910 }, { "epoch": 0.4609597664305741, "grad_norm": 0.14454486069777536, "learning_rate": 1.8153345642913092e-05, "loss": 0.1838, "step": 14920 }, { "epoch": 0.46126872069761876, "grad_norm": 0.09401613139941153, "learning_rate": 1.813836064018614e-05, "loss": 0.1902, "step": 14930 }, { "epoch": 0.46157767496466334, "grad_norm": 0.09662306497757542, "learning_rate": 1.812337236127122e-05, "loss": 0.1886, "step": 14940 }, { "epoch": 0.461886629231708, "grad_norm": 0.08011186199733386, "learning_rate": 1.810838082181485e-05, "loss": 0.1836, "step": 14950 }, { "epoch": 0.4621955834987526, "grad_norm": 0.0959391996675847, "learning_rate": 1.8093386037466948e-05, "loss": 0.1934, "step": 14960 }, { "epoch": 0.4625045377657972, "grad_norm": 0.0984128991567507, "learning_rate": 1.807838802388083e-05, "loss": 0.1841, "step": 14970 }, { "epoch": 0.46281349203284183, "grad_norm": 0.09960467420393028, "learning_rate": 1.8063386796713175e-05, "loss": 0.1868, "step": 14980 }, { "epoch": 0.46312244629988647, "grad_norm": 0.08738018195659383, "learning_rate": 1.8048382371624022e-05, "loss": 0.1841, "step": 14990 }, { "epoch": 0.4634314005669311, "grad_norm": 0.0858575462808963, "learning_rate": 1.803337476427674e-05, "loss": 0.1846, "step": 15000 }, { "epoch": 0.4637403548339757, "grad_norm": 0.11008539341333988, "learning_rate": 1.8018363990338022e-05, "loss": 0.1876, "step": 15010 }, { "epoch": 0.4640493091010203, "grad_norm": 0.08855632244478165, "learning_rate": 1.8003350065477877e-05, "loss": 0.1905, "step": 15020 }, { "epoch": 0.46435826336806496, "grad_norm": 0.09543370847758748, "learning_rate": 1.7988333005369595e-05, "loss": 0.1856, "step": 15030 }, { "epoch": 0.46466721763510954, "grad_norm": 0.11468696699357876, "learning_rate": 1.797331282568974e-05, "loss": 0.185, "step": 15040 }, { "epoch": 0.4649761719021542, "grad_norm": 0.0875863282134661, "learning_rate": 1.7958289542118132e-05, "loss": 0.1825, "step": 15050 }, { "epoch": 0.4652851261691988, "grad_norm": 0.09530661495635731, "learning_rate": 1.794326317033783e-05, "loss": 0.185, "step": 15060 }, { "epoch": 0.46559408043624345, "grad_norm": 0.10196903942408467, "learning_rate": 1.792823372603512e-05, "loss": 0.1836, "step": 15070 }, { "epoch": 0.46590303470328803, "grad_norm": 0.0807715602315761, "learning_rate": 1.7913201224899495e-05, "loss": 0.183, "step": 15080 }, { "epoch": 0.46621198897033267, "grad_norm": 0.09615660065111356, "learning_rate": 1.7898165682623637e-05, "loss": 0.1919, "step": 15090 }, { "epoch": 0.4665209432373773, "grad_norm": 0.10904360246970066, "learning_rate": 1.7883127114903406e-05, "loss": 0.1852, "step": 15100 }, { "epoch": 0.4668298975044219, "grad_norm": 0.12369054276626484, "learning_rate": 1.7868085537437812e-05, "loss": 0.1893, "step": 15110 }, { "epoch": 0.4671388517714665, "grad_norm": 0.08634182660069728, "learning_rate": 1.7853040965929023e-05, "loss": 0.1933, "step": 15120 }, { "epoch": 0.46744780603851116, "grad_norm": 0.08031336460288044, "learning_rate": 1.7837993416082318e-05, "loss": 0.1867, "step": 15130 }, { "epoch": 0.4677567603055558, "grad_norm": 0.09689839372096767, "learning_rate": 1.7822942903606088e-05, "loss": 0.1866, "step": 15140 }, { "epoch": 0.4680657145726004, "grad_norm": 0.10264211622153026, "learning_rate": 1.780788944421182e-05, "loss": 0.191, "step": 15150 }, { "epoch": 0.468374668839645, "grad_norm": 0.09872002146016506, "learning_rate": 1.7792833053614076e-05, "loss": 0.1876, "step": 15160 }, { "epoch": 0.46868362310668965, "grad_norm": 0.09667286155920068, "learning_rate": 1.777777374753048e-05, "loss": 0.1964, "step": 15170 }, { "epoch": 0.46899257737373423, "grad_norm": 0.09691054345384882, "learning_rate": 1.7762711541681695e-05, "loss": 0.1932, "step": 15180 }, { "epoch": 0.46930153164077887, "grad_norm": 0.09086575672944316, "learning_rate": 1.7747646451791408e-05, "loss": 0.1977, "step": 15190 }, { "epoch": 0.4696104859078235, "grad_norm": 0.09227703776636187, "learning_rate": 1.773257849358633e-05, "loss": 0.1828, "step": 15200 }, { "epoch": 0.46991944017486814, "grad_norm": 0.09505838352684795, "learning_rate": 1.7717507682796156e-05, "loss": 0.1864, "step": 15210 }, { "epoch": 0.4702283944419127, "grad_norm": 0.14433938921800793, "learning_rate": 1.7702434035153563e-05, "loss": 0.1914, "step": 15220 }, { "epoch": 0.47053734870895736, "grad_norm": 0.1092071177720221, "learning_rate": 1.7687357566394186e-05, "loss": 0.1868, "step": 15230 }, { "epoch": 0.470846302976002, "grad_norm": 0.08638758400992423, "learning_rate": 1.767227829225661e-05, "loss": 0.1851, "step": 15240 }, { "epoch": 0.4711552572430466, "grad_norm": 0.09367265333682509, "learning_rate": 1.7657196228482337e-05, "loss": 0.1883, "step": 15250 }, { "epoch": 0.4714642115100912, "grad_norm": 0.07692000952585477, "learning_rate": 1.7642111390815796e-05, "loss": 0.187, "step": 15260 }, { "epoch": 0.47177316577713585, "grad_norm": 0.08526520780711176, "learning_rate": 1.762702379500431e-05, "loss": 0.1856, "step": 15270 }, { "epoch": 0.4720821200441805, "grad_norm": 0.08646246182945996, "learning_rate": 1.761193345679807e-05, "loss": 0.1812, "step": 15280 }, { "epoch": 0.47239107431122507, "grad_norm": 0.07694228733864118, "learning_rate": 1.759684039195013e-05, "loss": 0.1904, "step": 15290 }, { "epoch": 0.4727000285782697, "grad_norm": 0.11305783364625918, "learning_rate": 1.7581744616216407e-05, "loss": 0.1921, "step": 15300 }, { "epoch": 0.47300898284531434, "grad_norm": 0.09757586204286521, "learning_rate": 1.7566646145355636e-05, "loss": 0.1841, "step": 15310 }, { "epoch": 0.4733179371123589, "grad_norm": 0.09147307713691727, "learning_rate": 1.755154499512936e-05, "loss": 0.1846, "step": 15320 }, { "epoch": 0.47362689137940356, "grad_norm": 0.09257491179768478, "learning_rate": 1.753644118130194e-05, "loss": 0.1828, "step": 15330 }, { "epoch": 0.4739358456464482, "grad_norm": 0.08393761776522794, "learning_rate": 1.7521334719640487e-05, "loss": 0.1861, "step": 15340 }, { "epoch": 0.47424479991349283, "grad_norm": 0.08964750078537113, "learning_rate": 1.7506225625914905e-05, "loss": 0.1816, "step": 15350 }, { "epoch": 0.4745537541805374, "grad_norm": 0.0848320103314238, "learning_rate": 1.7491113915897832e-05, "loss": 0.1827, "step": 15360 }, { "epoch": 0.47486270844758205, "grad_norm": 0.08596297431686581, "learning_rate": 1.7475999605364633e-05, "loss": 0.1821, "step": 15370 }, { "epoch": 0.4751716627146267, "grad_norm": 0.09322592009027474, "learning_rate": 1.7460882710093403e-05, "loss": 0.1918, "step": 15380 }, { "epoch": 0.47548061698167127, "grad_norm": 0.1071344134291526, "learning_rate": 1.7445763245864917e-05, "loss": 0.1842, "step": 15390 }, { "epoch": 0.4757895712487159, "grad_norm": 0.0939611714636112, "learning_rate": 1.7430641228462642e-05, "loss": 0.1832, "step": 15400 }, { "epoch": 0.47609852551576054, "grad_norm": 0.10232536322383694, "learning_rate": 1.7415516673672713e-05, "loss": 0.1848, "step": 15410 }, { "epoch": 0.4764074797828052, "grad_norm": 0.0782286356173196, "learning_rate": 1.7400389597283904e-05, "loss": 0.1805, "step": 15420 }, { "epoch": 0.47671643404984976, "grad_norm": 0.09208469805587671, "learning_rate": 1.7385260015087627e-05, "loss": 0.1832, "step": 15430 }, { "epoch": 0.4770253883168944, "grad_norm": 0.10857360895510182, "learning_rate": 1.7370127942877916e-05, "loss": 0.1925, "step": 15440 }, { "epoch": 0.47733434258393903, "grad_norm": 0.09873037066071629, "learning_rate": 1.7354993396451392e-05, "loss": 0.1814, "step": 15450 }, { "epoch": 0.4776432968509836, "grad_norm": 0.08573551565865423, "learning_rate": 1.7339856391607268e-05, "loss": 0.1865, "step": 15460 }, { "epoch": 0.47795225111802825, "grad_norm": 0.08016237916706054, "learning_rate": 1.7324716944147322e-05, "loss": 0.1862, "step": 15470 }, { "epoch": 0.4782612053850729, "grad_norm": 0.11483734956349438, "learning_rate": 1.7309575069875874e-05, "loss": 0.1893, "step": 15480 }, { "epoch": 0.4785701596521175, "grad_norm": 0.08225215622883346, "learning_rate": 1.729443078459979e-05, "loss": 0.1967, "step": 15490 }, { "epoch": 0.4788791139191621, "grad_norm": 0.09516950564167469, "learning_rate": 1.727928410412845e-05, "loss": 0.1968, "step": 15500 }, { "epoch": 0.47918806818620674, "grad_norm": 0.13169828452168447, "learning_rate": 1.726413504427372e-05, "loss": 0.1944, "step": 15510 }, { "epoch": 0.4794970224532514, "grad_norm": 0.0841868815675506, "learning_rate": 1.7248983620849967e-05, "loss": 0.1878, "step": 15520 }, { "epoch": 0.47980597672029596, "grad_norm": 0.06872492837280653, "learning_rate": 1.723382984967402e-05, "loss": 0.1856, "step": 15530 }, { "epoch": 0.4801149309873406, "grad_norm": 0.0724457164126773, "learning_rate": 1.7218673746565154e-05, "loss": 0.186, "step": 15540 }, { "epoch": 0.48042388525438523, "grad_norm": 0.09733579992189363, "learning_rate": 1.7203515327345085e-05, "loss": 0.1905, "step": 15550 }, { "epoch": 0.48073283952142987, "grad_norm": 0.08527111330048416, "learning_rate": 1.7188354607837945e-05, "loss": 0.1909, "step": 15560 }, { "epoch": 0.48104179378847445, "grad_norm": 0.12445708330679682, "learning_rate": 1.7173191603870268e-05, "loss": 0.1934, "step": 15570 }, { "epoch": 0.4813507480555191, "grad_norm": 0.08252945797593983, "learning_rate": 1.7158026331270966e-05, "loss": 0.1924, "step": 15580 }, { "epoch": 0.4816597023225637, "grad_norm": 0.09498089069156578, "learning_rate": 1.714285880587133e-05, "loss": 0.1914, "step": 15590 }, { "epoch": 0.4819686565896083, "grad_norm": 0.08935502259651591, "learning_rate": 1.7127689043505e-05, "loss": 0.1948, "step": 15600 }, { "epoch": 0.48227761085665294, "grad_norm": 0.08517155396689621, "learning_rate": 1.7112517060007936e-05, "loss": 0.1821, "step": 15610 }, { "epoch": 0.4825865651236976, "grad_norm": 0.08451627463555345, "learning_rate": 1.7097342871218443e-05, "loss": 0.1893, "step": 15620 }, { "epoch": 0.4828955193907422, "grad_norm": 0.08643284441209946, "learning_rate": 1.7082166492977106e-05, "loss": 0.181, "step": 15630 }, { "epoch": 0.4832044736577868, "grad_norm": 0.17393290533310926, "learning_rate": 1.7066987941126807e-05, "loss": 0.1901, "step": 15640 }, { "epoch": 0.48351342792483143, "grad_norm": 0.0795347019991599, "learning_rate": 1.7051807231512693e-05, "loss": 0.1951, "step": 15650 }, { "epoch": 0.48382238219187607, "grad_norm": 0.09238073290615917, "learning_rate": 1.703662437998216e-05, "loss": 0.1826, "step": 15660 }, { "epoch": 0.48413133645892065, "grad_norm": 0.08937813081599442, "learning_rate": 1.7021439402384854e-05, "loss": 0.1878, "step": 15670 }, { "epoch": 0.4844402907259653, "grad_norm": 0.0991162805047522, "learning_rate": 1.7006252314572626e-05, "loss": 0.1871, "step": 15680 }, { "epoch": 0.4847492449930099, "grad_norm": 0.08471012686275417, "learning_rate": 1.6991063132399536e-05, "loss": 0.1893, "step": 15690 }, { "epoch": 0.48505819926005456, "grad_norm": 0.10987187352916403, "learning_rate": 1.697587187172183e-05, "loss": 0.1839, "step": 15700 }, { "epoch": 0.48536715352709914, "grad_norm": 0.09193744545349228, "learning_rate": 1.6960678548397917e-05, "loss": 0.1808, "step": 15710 }, { "epoch": 0.4856761077941438, "grad_norm": 0.09837930663245117, "learning_rate": 1.6945483178288373e-05, "loss": 0.1871, "step": 15720 }, { "epoch": 0.4859850620611884, "grad_norm": 0.09894497268456612, "learning_rate": 1.6930285777255903e-05, "loss": 0.189, "step": 15730 }, { "epoch": 0.486294016328233, "grad_norm": 0.07747379122039307, "learning_rate": 1.6915086361165326e-05, "loss": 0.1828, "step": 15740 }, { "epoch": 0.48660297059527763, "grad_norm": 0.10507113461789214, "learning_rate": 1.689988494588357e-05, "loss": 0.183, "step": 15750 }, { "epoch": 0.48691192486232227, "grad_norm": 0.09465091321443915, "learning_rate": 1.688468154727966e-05, "loss": 0.1837, "step": 15760 }, { "epoch": 0.4872208791293669, "grad_norm": 0.10640222169454058, "learning_rate": 1.6869476181224674e-05, "loss": 0.1894, "step": 15770 }, { "epoch": 0.4875298333964115, "grad_norm": 0.0918403510265723, "learning_rate": 1.6854268863591748e-05, "loss": 0.1831, "step": 15780 }, { "epoch": 0.4878387876634561, "grad_norm": 0.09166627082137223, "learning_rate": 1.6839059610256072e-05, "loss": 0.1837, "step": 15790 }, { "epoch": 0.48814774193050076, "grad_norm": 0.10530109258748743, "learning_rate": 1.6823848437094833e-05, "loss": 0.201, "step": 15800 }, { "epoch": 0.48845669619754534, "grad_norm": 0.08801482707933495, "learning_rate": 1.680863535998724e-05, "loss": 0.1891, "step": 15810 }, { "epoch": 0.48876565046459, "grad_norm": 0.07688953322096566, "learning_rate": 1.6793420394814476e-05, "loss": 0.1843, "step": 15820 }, { "epoch": 0.4890746047316346, "grad_norm": 0.0995223853600139, "learning_rate": 1.677820355745971e-05, "loss": 0.1896, "step": 15830 }, { "epoch": 0.48938355899867925, "grad_norm": 0.07868309536869768, "learning_rate": 1.676298486380805e-05, "loss": 0.1868, "step": 15840 }, { "epoch": 0.48969251326572383, "grad_norm": 0.11595632274024195, "learning_rate": 1.674776432974655e-05, "loss": 0.1834, "step": 15850 }, { "epoch": 0.49000146753276846, "grad_norm": 0.08790260634307571, "learning_rate": 1.6732541971164184e-05, "loss": 0.1883, "step": 15860 }, { "epoch": 0.4903104217998131, "grad_norm": 0.10067202460327114, "learning_rate": 1.6717317803951828e-05, "loss": 0.1866, "step": 15870 }, { "epoch": 0.4906193760668577, "grad_norm": 0.0897278223733937, "learning_rate": 1.6702091844002256e-05, "loss": 0.1882, "step": 15880 }, { "epoch": 0.4909283303339023, "grad_norm": 0.10306827915240233, "learning_rate": 1.66868641072101e-05, "loss": 0.1826, "step": 15890 }, { "epoch": 0.49123728460094696, "grad_norm": 0.08544730376805611, "learning_rate": 1.667163460947186e-05, "loss": 0.192, "step": 15900 }, { "epoch": 0.4915462388679916, "grad_norm": 0.09608200618848285, "learning_rate": 1.665640336668586e-05, "loss": 0.1845, "step": 15910 }, { "epoch": 0.4918551931350362, "grad_norm": 0.09511371865163808, "learning_rate": 1.664117039475225e-05, "loss": 0.1952, "step": 15920 }, { "epoch": 0.4921641474020808, "grad_norm": 0.10923160317234618, "learning_rate": 1.6625935709572997e-05, "loss": 0.1878, "step": 15930 }, { "epoch": 0.49247310166912545, "grad_norm": 0.07187305100740843, "learning_rate": 1.6610699327051842e-05, "loss": 0.1842, "step": 15940 }, { "epoch": 0.49278205593617, "grad_norm": 0.08941512923731464, "learning_rate": 1.6595461263094316e-05, "loss": 0.1849, "step": 15950 }, { "epoch": 0.49309101020321466, "grad_norm": 0.09825501997247763, "learning_rate": 1.6580221533607672e-05, "loss": 0.1831, "step": 15960 }, { "epoch": 0.4933999644702593, "grad_norm": 0.08596116657977662, "learning_rate": 1.6564980154500937e-05, "loss": 0.1839, "step": 15970 }, { "epoch": 0.49370891873730394, "grad_norm": 0.09269597813122786, "learning_rate": 1.6549737141684837e-05, "loss": 0.1885, "step": 15980 }, { "epoch": 0.4940178730043485, "grad_norm": 0.0799256163556364, "learning_rate": 1.653449251107182e-05, "loss": 0.1846, "step": 15990 }, { "epoch": 0.49432682727139315, "grad_norm": 0.09250933308177511, "learning_rate": 1.651924627857601e-05, "loss": 0.1843, "step": 16000 }, { "epoch": 0.4946357815384378, "grad_norm": 0.10959023050865643, "learning_rate": 1.650399846011321e-05, "loss": 0.1834, "step": 16010 }, { "epoch": 0.49494473580548237, "grad_norm": 0.07759338064387165, "learning_rate": 1.6488749071600875e-05, "loss": 0.1796, "step": 16020 }, { "epoch": 0.495253690072527, "grad_norm": 0.11275098432585824, "learning_rate": 1.64734981289581e-05, "loss": 0.1835, "step": 16030 }, { "epoch": 0.49556264433957165, "grad_norm": 0.09234924044507742, "learning_rate": 1.6458245648105604e-05, "loss": 0.184, "step": 16040 }, { "epoch": 0.4958715986066163, "grad_norm": 0.0933133747622666, "learning_rate": 1.6442991644965716e-05, "loss": 0.1839, "step": 16050 }, { "epoch": 0.49618055287366086, "grad_norm": 0.09976661795148338, "learning_rate": 1.642773613546234e-05, "loss": 0.1838, "step": 16060 }, { "epoch": 0.4964895071407055, "grad_norm": 0.08267966670246835, "learning_rate": 1.6412479135520967e-05, "loss": 0.1842, "step": 16070 }, { "epoch": 0.49679846140775014, "grad_norm": 0.10400517024831424, "learning_rate": 1.6397220661068637e-05, "loss": 0.1878, "step": 16080 }, { "epoch": 0.4971074156747947, "grad_norm": 0.09110354071661138, "learning_rate": 1.638196072803393e-05, "loss": 0.1845, "step": 16090 }, { "epoch": 0.49741636994183935, "grad_norm": 0.09954264527195751, "learning_rate": 1.636669935234695e-05, "loss": 0.1869, "step": 16100 }, { "epoch": 0.497725324208884, "grad_norm": 0.09282185253342333, "learning_rate": 1.6351436549939305e-05, "loss": 0.1844, "step": 16110 }, { "epoch": 0.4980342784759286, "grad_norm": 0.08516110638246024, "learning_rate": 1.6336172336744096e-05, "loss": 0.1892, "step": 16120 }, { "epoch": 0.4983432327429732, "grad_norm": 0.07979420834822085, "learning_rate": 1.6320906728695886e-05, "loss": 0.1896, "step": 16130 }, { "epoch": 0.49865218701001784, "grad_norm": 0.0960932375296174, "learning_rate": 1.6305639741730718e-05, "loss": 0.1989, "step": 16140 }, { "epoch": 0.4989611412770625, "grad_norm": 0.09548736710256746, "learning_rate": 1.6290371391786047e-05, "loss": 0.1916, "step": 16150 }, { "epoch": 0.49927009554410706, "grad_norm": 0.07591727105674324, "learning_rate": 1.6275101694800766e-05, "loss": 0.1875, "step": 16160 }, { "epoch": 0.4995790498111517, "grad_norm": 0.08142558024410718, "learning_rate": 1.6259830666715173e-05, "loss": 0.1829, "step": 16170 }, { "epoch": 0.49988800407819634, "grad_norm": 0.08927629097026177, "learning_rate": 1.624455832347095e-05, "loss": 0.1904, "step": 16180 }, { "epoch": 0.5001969583452409, "grad_norm": 0.09574685711476999, "learning_rate": 1.6229284681011163e-05, "loss": 0.1888, "step": 16190 }, { "epoch": 0.5005059126122856, "grad_norm": 0.14845410169283285, "learning_rate": 1.6214009755280216e-05, "loss": 0.1892, "step": 16200 }, { "epoch": 0.5008148668793302, "grad_norm": 0.11082846390690437, "learning_rate": 1.6198733562223867e-05, "loss": 0.1937, "step": 16210 }, { "epoch": 0.5011238211463748, "grad_norm": 0.08003900253173751, "learning_rate": 1.61834561177892e-05, "loss": 0.1853, "step": 16220 }, { "epoch": 0.5014327754134195, "grad_norm": 0.08739416279208018, "learning_rate": 1.6168177437924588e-05, "loss": 0.1981, "step": 16230 }, { "epoch": 0.501741729680464, "grad_norm": 0.11047366649303564, "learning_rate": 1.615289753857971e-05, "loss": 0.1879, "step": 16240 }, { "epoch": 0.5020506839475086, "grad_norm": 0.1056427332737057, "learning_rate": 1.6137616435705515e-05, "loss": 0.1889, "step": 16250 }, { "epoch": 0.5023596382145533, "grad_norm": 0.10615334695738451, "learning_rate": 1.6122334145254203e-05, "loss": 0.1863, "step": 16260 }, { "epoch": 0.5026685924815979, "grad_norm": 0.08721635048425189, "learning_rate": 1.6107050683179215e-05, "loss": 0.1846, "step": 16270 }, { "epoch": 0.5029775467486425, "grad_norm": 0.1018767876951892, "learning_rate": 1.609176606543522e-05, "loss": 0.1891, "step": 16280 }, { "epoch": 0.5032865010156872, "grad_norm": 0.10092129533153009, "learning_rate": 1.607648030797809e-05, "loss": 0.1901, "step": 16290 }, { "epoch": 0.5035954552827318, "grad_norm": 0.1040256373196848, "learning_rate": 1.6061193426764878e-05, "loss": 0.1855, "step": 16300 }, { "epoch": 0.5039044095497764, "grad_norm": 0.09425086408387737, "learning_rate": 1.6045905437753836e-05, "loss": 0.1861, "step": 16310 }, { "epoch": 0.504213363816821, "grad_norm": 0.09241402729188, "learning_rate": 1.6030616356904338e-05, "loss": 0.1817, "step": 16320 }, { "epoch": 0.5045223180838656, "grad_norm": 0.10000972176400486, "learning_rate": 1.6015326200176925e-05, "loss": 0.1836, "step": 16330 }, { "epoch": 0.5048312723509103, "grad_norm": 0.0902206852113459, "learning_rate": 1.6000034983533248e-05, "loss": 0.1828, "step": 16340 }, { "epoch": 0.5051402266179549, "grad_norm": 0.0829357861710911, "learning_rate": 1.598474272293607e-05, "loss": 0.1915, "step": 16350 }, { "epoch": 0.5054491808849995, "grad_norm": 0.08269650438296887, "learning_rate": 1.5969449434349236e-05, "loss": 0.1924, "step": 16360 }, { "epoch": 0.5057581351520442, "grad_norm": 0.07297811207458096, "learning_rate": 1.5954155133737674e-05, "loss": 0.1986, "step": 16370 }, { "epoch": 0.5060670894190887, "grad_norm": 0.0785437829061446, "learning_rate": 1.5938859837067366e-05, "loss": 0.1887, "step": 16380 }, { "epoch": 0.5063760436861333, "grad_norm": 0.07713942412917614, "learning_rate": 1.5923563560305332e-05, "loss": 0.1866, "step": 16390 }, { "epoch": 0.506684997953178, "grad_norm": 0.13864850216880983, "learning_rate": 1.5908266319419613e-05, "loss": 0.1839, "step": 16400 }, { "epoch": 0.5069939522202226, "grad_norm": 0.09274446071187946, "learning_rate": 1.5892968130379256e-05, "loss": 0.1917, "step": 16410 }, { "epoch": 0.5073029064872672, "grad_norm": 0.08491881663539312, "learning_rate": 1.5877669009154302e-05, "loss": 0.1896, "step": 16420 }, { "epoch": 0.5076118607543119, "grad_norm": 0.0888411644518085, "learning_rate": 1.5862368971715765e-05, "loss": 0.1846, "step": 16430 }, { "epoch": 0.5079208150213564, "grad_norm": 0.08537009899854889, "learning_rate": 1.5847068034035612e-05, "loss": 0.193, "step": 16440 }, { "epoch": 0.5082297692884011, "grad_norm": 0.08129500642041267, "learning_rate": 1.5831766212086754e-05, "loss": 0.1918, "step": 16450 }, { "epoch": 0.5085387235554457, "grad_norm": 0.0933932025729624, "learning_rate": 1.581646352184302e-05, "loss": 0.1947, "step": 16460 }, { "epoch": 0.5088476778224903, "grad_norm": 0.09391013949854107, "learning_rate": 1.5801159979279153e-05, "loss": 0.1912, "step": 16470 }, { "epoch": 0.509156632089535, "grad_norm": 0.07950096960636836, "learning_rate": 1.5785855600370777e-05, "loss": 0.1905, "step": 16480 }, { "epoch": 0.5094655863565796, "grad_norm": 0.09767106159810406, "learning_rate": 1.5770550401094394e-05, "loss": 0.1831, "step": 16490 }, { "epoch": 0.5097745406236242, "grad_norm": 0.09136753456551168, "learning_rate": 1.5755244397427362e-05, "loss": 0.1931, "step": 16500 }, { "epoch": 0.5100834948906688, "grad_norm": 0.07932070523632545, "learning_rate": 1.5739937605347877e-05, "loss": 0.1879, "step": 16510 }, { "epoch": 0.5103924491577134, "grad_norm": 0.08653253033055035, "learning_rate": 1.5724630040834958e-05, "loss": 0.1905, "step": 16520 }, { "epoch": 0.510701403424758, "grad_norm": 0.09940768000206279, "learning_rate": 1.5709321719868436e-05, "loss": 0.1954, "step": 16530 }, { "epoch": 0.5110103576918027, "grad_norm": 0.07755635560688848, "learning_rate": 1.5694012658428924e-05, "loss": 0.187, "step": 16540 }, { "epoch": 0.5113193119588473, "grad_norm": 0.10369840113350164, "learning_rate": 1.567870287249781e-05, "loss": 0.199, "step": 16550 }, { "epoch": 0.5116282662258919, "grad_norm": 0.0858597212972308, "learning_rate": 1.566339237805724e-05, "loss": 0.1968, "step": 16560 }, { "epoch": 0.5119372204929366, "grad_norm": 0.08688322729501115, "learning_rate": 1.5648081191090103e-05, "loss": 0.1933, "step": 16570 }, { "epoch": 0.5122461747599811, "grad_norm": 0.08073024960628804, "learning_rate": 1.5632769327580003e-05, "loss": 0.1821, "step": 16580 }, { "epoch": 0.5125551290270258, "grad_norm": 0.08278731832124943, "learning_rate": 1.5617456803511256e-05, "loss": 0.1807, "step": 16590 }, { "epoch": 0.5128640832940704, "grad_norm": 0.07728647706716552, "learning_rate": 1.5602143634868865e-05, "loss": 0.1837, "step": 16600 }, { "epoch": 0.513173037561115, "grad_norm": 0.07327476351709011, "learning_rate": 1.5586829837638512e-05, "loss": 0.1889, "step": 16610 }, { "epoch": 0.5134819918281597, "grad_norm": 0.08987695047246985, "learning_rate": 1.557151542780652e-05, "loss": 0.1863, "step": 16620 }, { "epoch": 0.5137909460952043, "grad_norm": 0.08638883049527403, "learning_rate": 1.5556200421359875e-05, "loss": 0.1883, "step": 16630 }, { "epoch": 0.5140999003622488, "grad_norm": 0.08866244801088069, "learning_rate": 1.5540884834286157e-05, "loss": 0.1858, "step": 16640 }, { "epoch": 0.5144088546292935, "grad_norm": 0.09021040523108524, "learning_rate": 1.5525568682573574e-05, "loss": 0.1883, "step": 16650 }, { "epoch": 0.5147178088963381, "grad_norm": 0.09170773703557734, "learning_rate": 1.551025198221092e-05, "loss": 0.1897, "step": 16660 }, { "epoch": 0.5150267631633827, "grad_norm": 0.08009894945135204, "learning_rate": 1.549493474918756e-05, "loss": 0.1866, "step": 16670 }, { "epoch": 0.5153357174304274, "grad_norm": 0.07829703404777481, "learning_rate": 1.5479616999493403e-05, "loss": 0.1868, "step": 16680 }, { "epoch": 0.515644671697472, "grad_norm": 0.0892501958293745, "learning_rate": 1.546429874911892e-05, "loss": 0.1859, "step": 16690 }, { "epoch": 0.5159536259645165, "grad_norm": 0.07684214678739372, "learning_rate": 1.5448980014055088e-05, "loss": 0.1902, "step": 16700 }, { "epoch": 0.5162625802315612, "grad_norm": 0.11422270533105577, "learning_rate": 1.5433660810293393e-05, "loss": 0.1839, "step": 16710 }, { "epoch": 0.5165715344986058, "grad_norm": 0.09663701450094417, "learning_rate": 1.5418341153825815e-05, "loss": 0.1903, "step": 16720 }, { "epoch": 0.5168804887656505, "grad_norm": 0.1188369578916549, "learning_rate": 1.54030210606448e-05, "loss": 0.1826, "step": 16730 }, { "epoch": 0.5171894430326951, "grad_norm": 0.1048352198299216, "learning_rate": 1.538770054674326e-05, "loss": 0.1835, "step": 16740 }, { "epoch": 0.5174983972997397, "grad_norm": 0.10024792665403055, "learning_rate": 1.537237962811453e-05, "loss": 0.181, "step": 16750 }, { "epoch": 0.5178073515667844, "grad_norm": 0.08165890938174093, "learning_rate": 1.535705832075238e-05, "loss": 0.2021, "step": 16760 }, { "epoch": 0.518116305833829, "grad_norm": 0.08070950685872727, "learning_rate": 1.5341736640650996e-05, "loss": 0.187, "step": 16770 }, { "epoch": 0.5184252601008735, "grad_norm": 0.08983897419723173, "learning_rate": 1.532641460380492e-05, "loss": 0.1854, "step": 16780 }, { "epoch": 0.5187342143679182, "grad_norm": 0.08956672505699013, "learning_rate": 1.5311092226209095e-05, "loss": 0.1915, "step": 16790 }, { "epoch": 0.5190431686349628, "grad_norm": 0.10233131738545687, "learning_rate": 1.5295769523858814e-05, "loss": 0.1926, "step": 16800 }, { "epoch": 0.5193521229020074, "grad_norm": 0.09231501611292307, "learning_rate": 1.5280446512749707e-05, "loss": 0.1875, "step": 16810 }, { "epoch": 0.5196610771690521, "grad_norm": 0.09091437555359963, "learning_rate": 1.5265123208877722e-05, "loss": 0.1834, "step": 16820 }, { "epoch": 0.5199700314360967, "grad_norm": 0.08961319032273445, "learning_rate": 1.5249799628239112e-05, "loss": 0.1886, "step": 16830 }, { "epoch": 0.5202789857031412, "grad_norm": 0.08716056858195882, "learning_rate": 1.5234475786830432e-05, "loss": 0.1854, "step": 16840 }, { "epoch": 0.5205879399701859, "grad_norm": 0.0775112634053927, "learning_rate": 1.5219151700648499e-05, "loss": 0.1908, "step": 16850 }, { "epoch": 0.5208968942372305, "grad_norm": 0.07994281723800888, "learning_rate": 1.5203827385690378e-05, "loss": 0.1842, "step": 16860 }, { "epoch": 0.5212058485042752, "grad_norm": 0.11038659043417026, "learning_rate": 1.518850285795339e-05, "loss": 0.1848, "step": 16870 }, { "epoch": 0.5215148027713198, "grad_norm": 0.09098567725841361, "learning_rate": 1.5173178133435067e-05, "loss": 0.183, "step": 16880 }, { "epoch": 0.5218237570383644, "grad_norm": 0.08853172387115174, "learning_rate": 1.515785322813315e-05, "loss": 0.1869, "step": 16890 }, { "epoch": 0.5221327113054091, "grad_norm": 0.0919017222927224, "learning_rate": 1.5142528158045563e-05, "loss": 0.1951, "step": 16900 }, { "epoch": 0.5224416655724536, "grad_norm": 0.09828075180749284, "learning_rate": 1.5127202939170418e-05, "loss": 0.1826, "step": 16910 }, { "epoch": 0.5227506198394982, "grad_norm": 0.1090552992323409, "learning_rate": 1.511187758750596e-05, "loss": 0.1856, "step": 16920 }, { "epoch": 0.5230595741065429, "grad_norm": 0.08778878304175707, "learning_rate": 1.5096552119050588e-05, "loss": 0.1877, "step": 16930 }, { "epoch": 0.5233685283735875, "grad_norm": 0.08228476322473942, "learning_rate": 1.5081226549802823e-05, "loss": 0.185, "step": 16940 }, { "epoch": 0.5236774826406321, "grad_norm": 0.08286146340483935, "learning_rate": 1.5065900895761281e-05, "loss": 0.1867, "step": 16950 }, { "epoch": 0.5239864369076768, "grad_norm": 0.08397935696767478, "learning_rate": 1.5050575172924676e-05, "loss": 0.1885, "step": 16960 }, { "epoch": 0.5242953911747213, "grad_norm": 0.11852410287882821, "learning_rate": 1.5035249397291792e-05, "loss": 0.192, "step": 16970 }, { "epoch": 0.5246043454417659, "grad_norm": 0.09165341600222472, "learning_rate": 1.5019923584861459e-05, "loss": 0.1811, "step": 16980 }, { "epoch": 0.5249132997088106, "grad_norm": 0.09532246851600214, "learning_rate": 1.5004597751632555e-05, "loss": 0.1913, "step": 16990 }, { "epoch": 0.5252222539758552, "grad_norm": 0.10521407904617962, "learning_rate": 1.4989271913603985e-05, "loss": 0.1855, "step": 17000 }, { "epoch": 0.5255312082428999, "grad_norm": 0.08637722581336506, "learning_rate": 1.4973946086774648e-05, "loss": 0.1805, "step": 17010 }, { "epoch": 0.5258401625099445, "grad_norm": 0.0928785275306785, "learning_rate": 1.4958620287143426e-05, "loss": 0.1895, "step": 17020 }, { "epoch": 0.5261491167769891, "grad_norm": 0.09415359707936143, "learning_rate": 1.4943294530709196e-05, "loss": 0.1818, "step": 17030 }, { "epoch": 0.5264580710440337, "grad_norm": 0.09163024023045135, "learning_rate": 1.4927968833470763e-05, "loss": 0.1866, "step": 17040 }, { "epoch": 0.5267670253110783, "grad_norm": 0.10236905094453971, "learning_rate": 1.4912643211426894e-05, "loss": 0.1843, "step": 17050 }, { "epoch": 0.5270759795781229, "grad_norm": 0.10445863766997315, "learning_rate": 1.4897317680576255e-05, "loss": 0.191, "step": 17060 }, { "epoch": 0.5273849338451676, "grad_norm": 0.0897013205674567, "learning_rate": 1.4881992256917435e-05, "loss": 0.179, "step": 17070 }, { "epoch": 0.5276938881122122, "grad_norm": 0.08051453323056623, "learning_rate": 1.4866666956448902e-05, "loss": 0.1845, "step": 17080 }, { "epoch": 0.5280028423792568, "grad_norm": 0.0960572970135507, "learning_rate": 1.4851341795169001e-05, "loss": 0.1828, "step": 17090 }, { "epoch": 0.5283117966463015, "grad_norm": 0.09099435178945896, "learning_rate": 1.4836016789075918e-05, "loss": 0.1929, "step": 17100 }, { "epoch": 0.528620750913346, "grad_norm": 0.08925916899274329, "learning_rate": 1.48206919541677e-05, "loss": 0.1881, "step": 17110 }, { "epoch": 0.5289297051803906, "grad_norm": 0.09593747889871479, "learning_rate": 1.4805367306442196e-05, "loss": 0.1888, "step": 17120 }, { "epoch": 0.5292386594474353, "grad_norm": 0.07405438597813077, "learning_rate": 1.4790042861897058e-05, "loss": 0.192, "step": 17130 }, { "epoch": 0.5295476137144799, "grad_norm": 0.09421400762818595, "learning_rate": 1.4774718636529751e-05, "loss": 0.1884, "step": 17140 }, { "epoch": 0.5298565679815246, "grad_norm": 0.09931827531418859, "learning_rate": 1.4759394646337477e-05, "loss": 0.187, "step": 17150 }, { "epoch": 0.5301655222485692, "grad_norm": 0.09243735280179094, "learning_rate": 1.4744070907317227e-05, "loss": 0.1852, "step": 17160 }, { "epoch": 0.5304744765156137, "grad_norm": 0.08297586570125884, "learning_rate": 1.4728747435465693e-05, "loss": 0.1862, "step": 17170 }, { "epoch": 0.5307834307826584, "grad_norm": 0.09711190720355078, "learning_rate": 1.4713424246779327e-05, "loss": 0.1838, "step": 17180 }, { "epoch": 0.531092385049703, "grad_norm": 0.09893471219976875, "learning_rate": 1.4698101357254254e-05, "loss": 0.1889, "step": 17190 }, { "epoch": 0.5314013393167476, "grad_norm": 0.1009962099184244, "learning_rate": 1.4682778782886301e-05, "loss": 0.1835, "step": 17200 }, { "epoch": 0.5317102935837923, "grad_norm": 0.08314577919257489, "learning_rate": 1.466745653967096e-05, "loss": 0.1815, "step": 17210 }, { "epoch": 0.5320192478508369, "grad_norm": 0.07981247358516776, "learning_rate": 1.465213464360339e-05, "loss": 0.1866, "step": 17220 }, { "epoch": 0.5323282021178815, "grad_norm": 0.0948002124448693, "learning_rate": 1.4636813110678367e-05, "loss": 0.182, "step": 17230 }, { "epoch": 0.5326371563849261, "grad_norm": 0.0823123813200111, "learning_rate": 1.4621491956890313e-05, "loss": 0.187, "step": 17240 }, { "epoch": 0.5329461106519707, "grad_norm": 0.10001350183120634, "learning_rate": 1.4606171198233225e-05, "loss": 0.1824, "step": 17250 }, { "epoch": 0.5332550649190153, "grad_norm": 0.0799570894990248, "learning_rate": 1.459085085070072e-05, "loss": 0.1854, "step": 17260 }, { "epoch": 0.53356401918606, "grad_norm": 0.08894190948983392, "learning_rate": 1.4575530930285952e-05, "loss": 0.1944, "step": 17270 }, { "epoch": 0.5338729734531046, "grad_norm": 0.07546407103998903, "learning_rate": 1.4560211452981662e-05, "loss": 0.2028, "step": 17280 }, { "epoch": 0.5341819277201493, "grad_norm": 0.10185925884645297, "learning_rate": 1.4544892434780101e-05, "loss": 0.1829, "step": 17290 }, { "epoch": 0.5344908819871939, "grad_norm": 0.10005081945027251, "learning_rate": 1.4529573891673057e-05, "loss": 0.1793, "step": 17300 }, { "epoch": 0.5347998362542384, "grad_norm": 0.08457714031441274, "learning_rate": 1.4514255839651815e-05, "loss": 0.1797, "step": 17310 }, { "epoch": 0.5351087905212831, "grad_norm": 0.08018724714628987, "learning_rate": 1.4498938294707157e-05, "loss": 0.1938, "step": 17320 }, { "epoch": 0.5354177447883277, "grad_norm": 0.07996280965923389, "learning_rate": 1.4483621272829314e-05, "loss": 0.1912, "step": 17330 }, { "epoch": 0.5357266990553723, "grad_norm": 0.09015109495118928, "learning_rate": 1.4468304790008002e-05, "loss": 0.1853, "step": 17340 }, { "epoch": 0.536035653322417, "grad_norm": 0.0802749764196976, "learning_rate": 1.4452988862232348e-05, "loss": 0.1818, "step": 17350 }, { "epoch": 0.5363446075894616, "grad_norm": 0.08152069249656302, "learning_rate": 1.4437673505490904e-05, "loss": 0.1805, "step": 17360 }, { "epoch": 0.5366535618565061, "grad_norm": 0.08491495530511571, "learning_rate": 1.4422358735771644e-05, "loss": 0.1848, "step": 17370 }, { "epoch": 0.5369625161235508, "grad_norm": 0.09352929532539385, "learning_rate": 1.4407044569061904e-05, "loss": 0.1807, "step": 17380 }, { "epoch": 0.5372714703905954, "grad_norm": 0.0712002076927719, "learning_rate": 1.4391731021348411e-05, "loss": 0.1836, "step": 17390 }, { "epoch": 0.53758042465764, "grad_norm": 0.07654844660693336, "learning_rate": 1.4376418108617233e-05, "loss": 0.1922, "step": 17400 }, { "epoch": 0.5378893789246847, "grad_norm": 0.10044903255580094, "learning_rate": 1.4361105846853785e-05, "loss": 0.188, "step": 17410 }, { "epoch": 0.5381983331917293, "grad_norm": 0.09616695490663842, "learning_rate": 1.4345794252042793e-05, "loss": 0.1837, "step": 17420 }, { "epoch": 0.538507287458774, "grad_norm": 0.09121957303038127, "learning_rate": 1.4330483340168294e-05, "loss": 0.1818, "step": 17430 }, { "epoch": 0.5388162417258185, "grad_norm": 0.08569007567192438, "learning_rate": 1.4315173127213597e-05, "loss": 0.1849, "step": 17440 }, { "epoch": 0.5391251959928631, "grad_norm": 0.12991475621721707, "learning_rate": 1.4299863629161312e-05, "loss": 0.1864, "step": 17450 }, { "epoch": 0.5394341502599078, "grad_norm": 0.09185756381582284, "learning_rate": 1.4284554861993266e-05, "loss": 0.1836, "step": 17460 }, { "epoch": 0.5397431045269524, "grad_norm": 0.08448691611438457, "learning_rate": 1.426924684169056e-05, "loss": 0.1825, "step": 17470 }, { "epoch": 0.540052058793997, "grad_norm": 0.08242456970627907, "learning_rate": 1.4253939584233475e-05, "loss": 0.1817, "step": 17480 }, { "epoch": 0.5403610130610417, "grad_norm": 0.0988976302001369, "learning_rate": 1.4238633105601537e-05, "loss": 0.1852, "step": 17490 }, { "epoch": 0.5406699673280863, "grad_norm": 0.07820452586212755, "learning_rate": 1.4223327421773422e-05, "loss": 0.1888, "step": 17500 }, { "epoch": 0.5409789215951308, "grad_norm": 0.1046724761210211, "learning_rate": 1.420802254872701e-05, "loss": 0.1833, "step": 17510 }, { "epoch": 0.5412878758621755, "grad_norm": 0.10141302621041458, "learning_rate": 1.4192718502439305e-05, "loss": 0.1858, "step": 17520 }, { "epoch": 0.5415968301292201, "grad_norm": 0.0887367435383199, "learning_rate": 1.4177415298886468e-05, "loss": 0.1831, "step": 17530 }, { "epoch": 0.5419057843962647, "grad_norm": 0.09632885655951946, "learning_rate": 1.4162112954043763e-05, "loss": 0.1844, "step": 17540 }, { "epoch": 0.5422147386633094, "grad_norm": 0.07623364132872182, "learning_rate": 1.4146811483885582e-05, "loss": 0.1924, "step": 17550 }, { "epoch": 0.542523692930354, "grad_norm": 0.09477616654210215, "learning_rate": 1.4131510904385376e-05, "loss": 0.1926, "step": 17560 }, { "epoch": 0.5428326471973987, "grad_norm": 0.08307270244024169, "learning_rate": 1.4116211231515693e-05, "loss": 0.191, "step": 17570 }, { "epoch": 0.5431416014644432, "grad_norm": 0.0879987139114687, "learning_rate": 1.4100912481248114e-05, "loss": 0.1846, "step": 17580 }, { "epoch": 0.5434505557314878, "grad_norm": 0.32627772183764964, "learning_rate": 1.4085614669553257e-05, "loss": 0.1856, "step": 17590 }, { "epoch": 0.5437595099985325, "grad_norm": 0.08654146767886002, "learning_rate": 1.4070317812400783e-05, "loss": 0.1847, "step": 17600 }, { "epoch": 0.5440684642655771, "grad_norm": 0.09291518300851469, "learning_rate": 1.4055021925759326e-05, "loss": 0.1897, "step": 17610 }, { "epoch": 0.5443774185326217, "grad_norm": 0.0830066940923024, "learning_rate": 1.4039727025596537e-05, "loss": 0.1841, "step": 17620 }, { "epoch": 0.5446863727996664, "grad_norm": 0.37472320467808995, "learning_rate": 1.402443312787901e-05, "loss": 0.1833, "step": 17630 }, { "epoch": 0.5449953270667109, "grad_norm": 0.17567192167695006, "learning_rate": 1.4009140248572307e-05, "loss": 0.1909, "step": 17640 }, { "epoch": 0.5453042813337555, "grad_norm": 0.0792282290952821, "learning_rate": 1.3993848403640926e-05, "loss": 0.196, "step": 17650 }, { "epoch": 0.5456132356008002, "grad_norm": 0.08045439612250929, "learning_rate": 1.3978557609048288e-05, "loss": 0.1924, "step": 17660 }, { "epoch": 0.5459221898678448, "grad_norm": 0.08474439759336545, "learning_rate": 1.3963267880756702e-05, "loss": 0.1899, "step": 17670 }, { "epoch": 0.5462311441348894, "grad_norm": 0.08076183615781608, "learning_rate": 1.3947979234727389e-05, "loss": 0.1857, "step": 17680 }, { "epoch": 0.5465400984019341, "grad_norm": 0.10006441860898663, "learning_rate": 1.393269168692041e-05, "loss": 0.1825, "step": 17690 }, { "epoch": 0.5468490526689787, "grad_norm": 0.09497652449343258, "learning_rate": 1.3917405253294712e-05, "loss": 0.186, "step": 17700 }, { "epoch": 0.5471580069360233, "grad_norm": 0.07872838283830029, "learning_rate": 1.390211994980805e-05, "loss": 0.1817, "step": 17710 }, { "epoch": 0.5474669612030679, "grad_norm": 0.0854526101511236, "learning_rate": 1.3886835792417021e-05, "loss": 0.1871, "step": 17720 }, { "epoch": 0.5477759154701125, "grad_norm": 0.08515358675952887, "learning_rate": 1.387155279707701e-05, "loss": 0.1804, "step": 17730 }, { "epoch": 0.5480848697371572, "grad_norm": 0.0848473404926865, "learning_rate": 1.3856270979742203e-05, "loss": 0.1902, "step": 17740 }, { "epoch": 0.5483938240042018, "grad_norm": 0.08507319549854123, "learning_rate": 1.384099035636554e-05, "loss": 0.1905, "step": 17750 }, { "epoch": 0.5487027782712464, "grad_norm": 0.09140757552102367, "learning_rate": 1.382571094289873e-05, "loss": 0.1845, "step": 17760 }, { "epoch": 0.549011732538291, "grad_norm": 0.09854616195628617, "learning_rate": 1.3810432755292205e-05, "loss": 0.1864, "step": 17770 }, { "epoch": 0.5493206868053356, "grad_norm": 0.07821144682589172, "learning_rate": 1.3795155809495132e-05, "loss": 0.1796, "step": 17780 }, { "epoch": 0.5496296410723802, "grad_norm": 0.09636875779609737, "learning_rate": 1.3779880121455367e-05, "loss": 0.1874, "step": 17790 }, { "epoch": 0.5499385953394249, "grad_norm": 0.10398916307274209, "learning_rate": 1.376460570711947e-05, "loss": 0.1904, "step": 17800 }, { "epoch": 0.5502475496064695, "grad_norm": 0.0775535615915769, "learning_rate": 1.3749332582432653e-05, "loss": 0.1904, "step": 17810 }, { "epoch": 0.5505565038735141, "grad_norm": 0.09082598461858564, "learning_rate": 1.3734060763338789e-05, "loss": 0.195, "step": 17820 }, { "epoch": 0.5508654581405588, "grad_norm": 0.07848013156671774, "learning_rate": 1.3718790265780398e-05, "loss": 0.1854, "step": 17830 }, { "epoch": 0.5511744124076033, "grad_norm": 0.08192677840117994, "learning_rate": 1.3703521105698601e-05, "loss": 0.1837, "step": 17840 }, { "epoch": 0.551483366674648, "grad_norm": 0.08848765115082168, "learning_rate": 1.3688253299033146e-05, "loss": 0.1805, "step": 17850 }, { "epoch": 0.5517923209416926, "grad_norm": 0.09920079662348166, "learning_rate": 1.3672986861722348e-05, "loss": 0.1811, "step": 17860 }, { "epoch": 0.5521012752087372, "grad_norm": 0.10624153800518651, "learning_rate": 1.3657721809703098e-05, "loss": 0.1893, "step": 17870 }, { "epoch": 0.5524102294757819, "grad_norm": 0.08227130144677, "learning_rate": 1.3642458158910847e-05, "loss": 0.1835, "step": 17880 }, { "epoch": 0.5527191837428265, "grad_norm": 0.08570427994284543, "learning_rate": 1.3627195925279581e-05, "loss": 0.1847, "step": 17890 }, { "epoch": 0.553028138009871, "grad_norm": 0.10391660517592205, "learning_rate": 1.3611935124741798e-05, "loss": 0.1865, "step": 17900 }, { "epoch": 0.5533370922769157, "grad_norm": 0.09700018352924356, "learning_rate": 1.3596675773228515e-05, "loss": 0.1831, "step": 17910 }, { "epoch": 0.5536460465439603, "grad_norm": 0.09473579301432845, "learning_rate": 1.358141788666922e-05, "loss": 0.1841, "step": 17920 }, { "epoch": 0.5539550008110049, "grad_norm": 0.08051417888890819, "learning_rate": 1.3566161480991892e-05, "loss": 0.1867, "step": 17930 }, { "epoch": 0.5542639550780496, "grad_norm": 0.08367151773433916, "learning_rate": 1.3550906572122935e-05, "loss": 0.1817, "step": 17940 }, { "epoch": 0.5545729093450942, "grad_norm": 0.09016611338068892, "learning_rate": 1.3535653175987224e-05, "loss": 0.1899, "step": 17950 }, { "epoch": 0.5548818636121388, "grad_norm": 0.07994393203403673, "learning_rate": 1.3520401308508029e-05, "loss": 0.1848, "step": 17960 }, { "epoch": 0.5551908178791835, "grad_norm": 0.09898322055199392, "learning_rate": 1.3505150985607035e-05, "loss": 0.1882, "step": 17970 }, { "epoch": 0.555499772146228, "grad_norm": 0.0832540759484844, "learning_rate": 1.3489902223204313e-05, "loss": 0.1824, "step": 17980 }, { "epoch": 0.5558087264132727, "grad_norm": 0.07501050463966243, "learning_rate": 1.347465503721831e-05, "loss": 0.1873, "step": 17990 }, { "epoch": 0.5561176806803173, "grad_norm": 0.084183164690891, "learning_rate": 1.345940944356581e-05, "loss": 0.1882, "step": 18000 }, { "epoch": 0.5564266349473619, "grad_norm": 0.09927738611382306, "learning_rate": 1.3444165458161961e-05, "loss": 0.1875, "step": 18010 }, { "epoch": 0.5567355892144066, "grad_norm": 0.10254120092547032, "learning_rate": 1.3428923096920207e-05, "loss": 0.1865, "step": 18020 }, { "epoch": 0.5570445434814512, "grad_norm": 0.08292525689385516, "learning_rate": 1.341368237575232e-05, "loss": 0.1881, "step": 18030 }, { "epoch": 0.5573534977484957, "grad_norm": 0.08145256947523416, "learning_rate": 1.339844331056834e-05, "loss": 0.1809, "step": 18040 }, { "epoch": 0.5576624520155404, "grad_norm": 0.12036097904889985, "learning_rate": 1.3383205917276581e-05, "loss": 0.1853, "step": 18050 }, { "epoch": 0.557971406282585, "grad_norm": 0.0974876488142113, "learning_rate": 1.3367970211783629e-05, "loss": 0.196, "step": 18060 }, { "epoch": 0.5582803605496296, "grad_norm": 0.08900797031577047, "learning_rate": 1.335273620999428e-05, "loss": 0.1859, "step": 18070 }, { "epoch": 0.5585893148166743, "grad_norm": 0.08376806481567103, "learning_rate": 1.3337503927811586e-05, "loss": 0.1876, "step": 18080 }, { "epoch": 0.5588982690837189, "grad_norm": 0.09983652945209968, "learning_rate": 1.332227338113677e-05, "loss": 0.1997, "step": 18090 }, { "epoch": 0.5592072233507634, "grad_norm": 0.08272073156927462, "learning_rate": 1.3307044585869264e-05, "loss": 0.1836, "step": 18100 }, { "epoch": 0.5595161776178081, "grad_norm": 0.08019848440170073, "learning_rate": 1.3291817557906665e-05, "loss": 0.1902, "step": 18110 }, { "epoch": 0.5598251318848527, "grad_norm": 0.07660768377815788, "learning_rate": 1.3276592313144732e-05, "loss": 0.1864, "step": 18120 }, { "epoch": 0.5601340861518974, "grad_norm": 0.08427816260084622, "learning_rate": 1.3261368867477343e-05, "loss": 0.192, "step": 18130 }, { "epoch": 0.560443040418942, "grad_norm": 0.09165925543699757, "learning_rate": 1.3246147236796528e-05, "loss": 0.187, "step": 18140 }, { "epoch": 0.5607519946859866, "grad_norm": 0.08533068321333423, "learning_rate": 1.323092743699239e-05, "loss": 0.1832, "step": 18150 }, { "epoch": 0.5610609489530313, "grad_norm": 0.08163197125245189, "learning_rate": 1.321570948395315e-05, "loss": 0.1824, "step": 18160 }, { "epoch": 0.5613699032200758, "grad_norm": 0.08632504950505282, "learning_rate": 1.3200493393565074e-05, "loss": 0.1879, "step": 18170 }, { "epoch": 0.5616788574871204, "grad_norm": 0.07476858547105282, "learning_rate": 1.3185279181712513e-05, "loss": 0.1834, "step": 18180 }, { "epoch": 0.5619878117541651, "grad_norm": 0.07541841695829907, "learning_rate": 1.317006686427783e-05, "loss": 0.181, "step": 18190 }, { "epoch": 0.5622967660212097, "grad_norm": 0.1077618205576265, "learning_rate": 1.3154856457141423e-05, "loss": 0.1922, "step": 18200 }, { "epoch": 0.5626057202882543, "grad_norm": 0.0818609664964366, "learning_rate": 1.3139647976181696e-05, "loss": 0.1807, "step": 18210 }, { "epoch": 0.562914674555299, "grad_norm": 0.0908286567884653, "learning_rate": 1.3124441437275045e-05, "loss": 0.1835, "step": 18220 }, { "epoch": 0.5632236288223436, "grad_norm": 0.09575946100216455, "learning_rate": 1.310923685629582e-05, "loss": 0.1871, "step": 18230 }, { "epoch": 0.5635325830893881, "grad_norm": 0.08080778197175675, "learning_rate": 1.309403424911636e-05, "loss": 0.1866, "step": 18240 }, { "epoch": 0.5638415373564328, "grad_norm": 0.07256286544884057, "learning_rate": 1.3078833631606908e-05, "loss": 0.1837, "step": 18250 }, { "epoch": 0.5641504916234774, "grad_norm": 0.08937246492506239, "learning_rate": 1.3063635019635662e-05, "loss": 0.1841, "step": 18260 }, { "epoch": 0.5644594458905221, "grad_norm": 0.09723740333864475, "learning_rate": 1.3048438429068703e-05, "loss": 0.1826, "step": 18270 }, { "epoch": 0.5647684001575667, "grad_norm": 0.12124340141480298, "learning_rate": 1.3033243875770002e-05, "loss": 0.1963, "step": 18280 }, { "epoch": 0.5650773544246113, "grad_norm": 0.08216877458298519, "learning_rate": 1.3018051375601428e-05, "loss": 0.1852, "step": 18290 }, { "epoch": 0.565386308691656, "grad_norm": 0.08488433886810821, "learning_rate": 1.3002860944422676e-05, "loss": 0.179, "step": 18300 }, { "epoch": 0.5656952629587005, "grad_norm": 0.09152647645936096, "learning_rate": 1.2987672598091307e-05, "loss": 0.1844, "step": 18310 }, { "epoch": 0.5660042172257451, "grad_norm": 0.0902168511301893, "learning_rate": 1.2972486352462681e-05, "loss": 0.1856, "step": 18320 }, { "epoch": 0.5663131714927898, "grad_norm": 0.10166543212631995, "learning_rate": 1.295730222338999e-05, "loss": 0.1888, "step": 18330 }, { "epoch": 0.5666221257598344, "grad_norm": 0.08335037816166078, "learning_rate": 1.2942120226724193e-05, "loss": 0.1923, "step": 18340 }, { "epoch": 0.566931080026879, "grad_norm": 0.07675305074037349, "learning_rate": 1.2926940378314045e-05, "loss": 0.1846, "step": 18350 }, { "epoch": 0.5672400342939237, "grad_norm": 0.08900068087168743, "learning_rate": 1.2911762694006035e-05, "loss": 0.1833, "step": 18360 }, { "epoch": 0.5675489885609682, "grad_norm": 0.13945464616694303, "learning_rate": 1.289658718964442e-05, "loss": 0.1873, "step": 18370 }, { "epoch": 0.5678579428280128, "grad_norm": 0.08486324017069379, "learning_rate": 1.2881413881071151e-05, "loss": 0.1851, "step": 18380 }, { "epoch": 0.5681668970950575, "grad_norm": 0.09162914427517899, "learning_rate": 1.2866242784125916e-05, "loss": 0.1882, "step": 18390 }, { "epoch": 0.5684758513621021, "grad_norm": 0.07513189668788745, "learning_rate": 1.2851073914646072e-05, "loss": 0.1877, "step": 18400 }, { "epoch": 0.5687848056291468, "grad_norm": 0.10635056481968717, "learning_rate": 1.2835907288466668e-05, "loss": 0.1859, "step": 18410 }, { "epoch": 0.5690937598961914, "grad_norm": 0.1084024767487765, "learning_rate": 1.2820742921420394e-05, "loss": 0.1869, "step": 18420 }, { "epoch": 0.569402714163236, "grad_norm": 0.09261344954353902, "learning_rate": 1.2805580829337596e-05, "loss": 0.1911, "step": 18430 }, { "epoch": 0.5697116684302806, "grad_norm": 0.10332341591055433, "learning_rate": 1.2790421028046234e-05, "loss": 0.1928, "step": 18440 }, { "epoch": 0.5700206226973252, "grad_norm": 0.09147366175987018, "learning_rate": 1.277526353337189e-05, "loss": 0.1815, "step": 18450 }, { "epoch": 0.5703295769643698, "grad_norm": 0.12406447856940347, "learning_rate": 1.276010836113772e-05, "loss": 0.2033, "step": 18460 }, { "epoch": 0.5706385312314145, "grad_norm": 0.08312351723744336, "learning_rate": 1.2744955527164474e-05, "loss": 0.1846, "step": 18470 }, { "epoch": 0.5709474854984591, "grad_norm": 0.09805344865224558, "learning_rate": 1.2729805047270447e-05, "loss": 0.1856, "step": 18480 }, { "epoch": 0.5712564397655037, "grad_norm": 0.09322922472078514, "learning_rate": 1.271465693727149e-05, "loss": 0.1864, "step": 18490 }, { "epoch": 0.5715653940325484, "grad_norm": 0.07423211075291146, "learning_rate": 1.2699511212980967e-05, "loss": 0.184, "step": 18500 }, { "epoch": 0.5718743482995929, "grad_norm": 0.073608149716638, "learning_rate": 1.2684367890209752e-05, "loss": 0.1891, "step": 18510 }, { "epoch": 0.5721833025666375, "grad_norm": 0.10314568671849617, "learning_rate": 1.2669226984766227e-05, "loss": 0.1849, "step": 18520 }, { "epoch": 0.5724922568336822, "grad_norm": 0.08692076340230744, "learning_rate": 1.2654088512456228e-05, "loss": 0.1836, "step": 18530 }, { "epoch": 0.5728012111007268, "grad_norm": 0.07738189014599374, "learning_rate": 1.2638952489083077e-05, "loss": 0.1829, "step": 18540 }, { "epoch": 0.5731101653677715, "grad_norm": 0.09949104059203269, "learning_rate": 1.2623818930447517e-05, "loss": 0.1824, "step": 18550 }, { "epoch": 0.5734191196348161, "grad_norm": 0.09055018055054191, "learning_rate": 1.2608687852347733e-05, "loss": 0.1852, "step": 18560 }, { "epoch": 0.5737280739018606, "grad_norm": 0.11175457125740802, "learning_rate": 1.2593559270579303e-05, "loss": 0.1844, "step": 18570 }, { "epoch": 0.5740370281689053, "grad_norm": 0.13301535459360475, "learning_rate": 1.2578433200935226e-05, "loss": 0.1881, "step": 18580 }, { "epoch": 0.5743459824359499, "grad_norm": 0.0899955346323926, "learning_rate": 1.2563309659205848e-05, "loss": 0.1831, "step": 18590 }, { "epoch": 0.5746549367029945, "grad_norm": 0.08306036513954007, "learning_rate": 1.2548188661178903e-05, "loss": 0.1781, "step": 18600 }, { "epoch": 0.5749638909700392, "grad_norm": 0.08011029389984016, "learning_rate": 1.2533070222639449e-05, "loss": 0.1897, "step": 18610 }, { "epoch": 0.5752728452370838, "grad_norm": 0.0846103975665748, "learning_rate": 1.2517954359369888e-05, "loss": 0.1932, "step": 18620 }, { "epoch": 0.5755817995041284, "grad_norm": 0.09252490726584991, "learning_rate": 1.250284108714992e-05, "loss": 0.1863, "step": 18630 }, { "epoch": 0.575890753771173, "grad_norm": 0.0829174294001005, "learning_rate": 1.2487730421756552e-05, "loss": 0.1902, "step": 18640 }, { "epoch": 0.5761997080382176, "grad_norm": 0.08778215776406699, "learning_rate": 1.2472622378964062e-05, "loss": 0.179, "step": 18650 }, { "epoch": 0.5765086623052622, "grad_norm": 0.09976640750723119, "learning_rate": 1.2457516974543992e-05, "loss": 0.1817, "step": 18660 }, { "epoch": 0.5768176165723069, "grad_norm": 0.38975443046706637, "learning_rate": 1.2442414224265125e-05, "loss": 0.1812, "step": 18670 }, { "epoch": 0.5771265708393515, "grad_norm": 0.08529668233447649, "learning_rate": 1.2427314143893492e-05, "loss": 0.1832, "step": 18680 }, { "epoch": 0.5774355251063962, "grad_norm": 0.19581723497176823, "learning_rate": 1.2412216749192307e-05, "loss": 0.1877, "step": 18690 }, { "epoch": 0.5777444793734408, "grad_norm": 0.09003964574893565, "learning_rate": 1.239712205592201e-05, "loss": 0.1873, "step": 18700 }, { "epoch": 0.5780534336404853, "grad_norm": 0.0758620885951063, "learning_rate": 1.2382030079840197e-05, "loss": 0.1846, "step": 18710 }, { "epoch": 0.57836238790753, "grad_norm": 0.08928353947856521, "learning_rate": 1.2366940836701652e-05, "loss": 0.1873, "step": 18720 }, { "epoch": 0.5786713421745746, "grad_norm": 0.08620986156199108, "learning_rate": 1.235185434225828e-05, "loss": 0.1905, "step": 18730 }, { "epoch": 0.5789802964416192, "grad_norm": 0.08774801161377836, "learning_rate": 1.233677061225913e-05, "loss": 0.1926, "step": 18740 }, { "epoch": 0.5792892507086639, "grad_norm": 0.0845975530435648, "learning_rate": 1.2321689662450371e-05, "loss": 0.1927, "step": 18750 }, { "epoch": 0.5795982049757085, "grad_norm": 0.09635276684974903, "learning_rate": 1.2306611508575256e-05, "loss": 0.1837, "step": 18760 }, { "epoch": 0.579907159242753, "grad_norm": 0.08282334929380135, "learning_rate": 1.2291536166374126e-05, "loss": 0.1836, "step": 18770 }, { "epoch": 0.5802161135097977, "grad_norm": 0.08588169105054738, "learning_rate": 1.227646365158439e-05, "loss": 0.184, "step": 18780 }, { "epoch": 0.5805250677768423, "grad_norm": 0.08938532218341083, "learning_rate": 1.2261393979940504e-05, "loss": 0.19, "step": 18790 }, { "epoch": 0.5808340220438869, "grad_norm": 0.08034961012220436, "learning_rate": 1.2246327167173943e-05, "loss": 0.1842, "step": 18800 }, { "epoch": 0.5811429763109316, "grad_norm": 0.09202410296299979, "learning_rate": 1.2231263229013226e-05, "loss": 0.1819, "step": 18810 }, { "epoch": 0.5814519305779762, "grad_norm": 0.09497765274832858, "learning_rate": 1.2216202181183836e-05, "loss": 0.1857, "step": 18820 }, { "epoch": 0.5817608848450209, "grad_norm": 0.08245146407877549, "learning_rate": 1.2201144039408273e-05, "loss": 0.1812, "step": 18830 }, { "epoch": 0.5820698391120654, "grad_norm": 0.09185748333333948, "learning_rate": 1.218608881940597e-05, "loss": 0.1958, "step": 18840 }, { "epoch": 0.58237879337911, "grad_norm": 0.08503410922279848, "learning_rate": 1.2171036536893344e-05, "loss": 0.1827, "step": 18850 }, { "epoch": 0.5826877476461547, "grad_norm": 0.08937965076157045, "learning_rate": 1.2155987207583718e-05, "loss": 0.1808, "step": 18860 }, { "epoch": 0.5829967019131993, "grad_norm": 0.07385702718310716, "learning_rate": 1.2140940847187345e-05, "loss": 0.1936, "step": 18870 }, { "epoch": 0.5833056561802439, "grad_norm": 0.08983367724418617, "learning_rate": 1.2125897471411374e-05, "loss": 0.1878, "step": 18880 }, { "epoch": 0.5836146104472886, "grad_norm": 0.12091610173052615, "learning_rate": 1.2110857095959848e-05, "loss": 0.1843, "step": 18890 }, { "epoch": 0.5839235647143332, "grad_norm": 0.08723914659503822, "learning_rate": 1.2095819736533656e-05, "loss": 0.1826, "step": 18900 }, { "epoch": 0.5842325189813777, "grad_norm": 0.09361335930489353, "learning_rate": 1.2080785408830568e-05, "loss": 0.1888, "step": 18910 }, { "epoch": 0.5845414732484224, "grad_norm": 0.1117420757778778, "learning_rate": 1.2065754128545162e-05, "loss": 0.1848, "step": 18920 }, { "epoch": 0.584850427515467, "grad_norm": 0.10537147119926492, "learning_rate": 1.205072591136886e-05, "loss": 0.1862, "step": 18930 }, { "epoch": 0.5851593817825116, "grad_norm": 0.0899076765838517, "learning_rate": 1.2035700772989858e-05, "loss": 0.1842, "step": 18940 }, { "epoch": 0.5854683360495563, "grad_norm": 0.13167115985599936, "learning_rate": 1.2020678729093165e-05, "loss": 0.1875, "step": 18950 }, { "epoch": 0.5857772903166009, "grad_norm": 0.0940759886393431, "learning_rate": 1.2005659795360548e-05, "loss": 0.1952, "step": 18960 }, { "epoch": 0.5860862445836456, "grad_norm": 0.08439561408458067, "learning_rate": 1.1990643987470513e-05, "loss": 0.1814, "step": 18970 }, { "epoch": 0.5863951988506901, "grad_norm": 0.09277608571555924, "learning_rate": 1.1975631321098331e-05, "loss": 0.1844, "step": 18980 }, { "epoch": 0.5867041531177347, "grad_norm": 0.09229614294737931, "learning_rate": 1.1960621811915972e-05, "loss": 0.1842, "step": 18990 }, { "epoch": 0.5870131073847794, "grad_norm": 0.08444775309449502, "learning_rate": 1.1945615475592123e-05, "loss": 0.1846, "step": 19000 }, { "epoch": 0.587322061651824, "grad_norm": 0.07699398247589585, "learning_rate": 1.1930612327792149e-05, "loss": 0.1848, "step": 19010 }, { "epoch": 0.5876310159188686, "grad_norm": 0.0811198704798943, "learning_rate": 1.1915612384178095e-05, "loss": 0.1891, "step": 19020 }, { "epoch": 0.5879399701859133, "grad_norm": 0.08420393526672411, "learning_rate": 1.1900615660408646e-05, "loss": 0.1889, "step": 19030 }, { "epoch": 0.5882489244529578, "grad_norm": 0.09157326851832695, "learning_rate": 1.1885622172139154e-05, "loss": 0.1869, "step": 19040 }, { "epoch": 0.5885578787200024, "grad_norm": 0.07432839476221717, "learning_rate": 1.1870631935021558e-05, "loss": 0.1914, "step": 19050 }, { "epoch": 0.5888668329870471, "grad_norm": 0.10557693446278404, "learning_rate": 1.1855644964704437e-05, "loss": 0.1827, "step": 19060 }, { "epoch": 0.5891757872540917, "grad_norm": 0.09831711453984064, "learning_rate": 1.1840661276832932e-05, "loss": 0.1869, "step": 19070 }, { "epoch": 0.5894847415211363, "grad_norm": 0.09770759844682415, "learning_rate": 1.1825680887048779e-05, "loss": 0.1822, "step": 19080 }, { "epoch": 0.589793695788181, "grad_norm": 0.10264452372425766, "learning_rate": 1.1810703810990254e-05, "loss": 0.1959, "step": 19090 }, { "epoch": 0.5901026500552256, "grad_norm": 0.07565463215403447, "learning_rate": 1.179573006429218e-05, "loss": 0.1871, "step": 19100 }, { "epoch": 0.5904116043222702, "grad_norm": 0.07221104152275891, "learning_rate": 1.178075966258591e-05, "loss": 0.1891, "step": 19110 }, { "epoch": 0.5907205585893148, "grad_norm": 0.1329860173858145, "learning_rate": 1.1765792621499305e-05, "loss": 0.1904, "step": 19120 }, { "epoch": 0.5910295128563594, "grad_norm": 0.09125889728006076, "learning_rate": 1.17508289566567e-05, "loss": 0.1833, "step": 19130 }, { "epoch": 0.5913384671234041, "grad_norm": 0.07184380342756125, "learning_rate": 1.1735868683678933e-05, "loss": 0.181, "step": 19140 }, { "epoch": 0.5916474213904487, "grad_norm": 0.07817020505022121, "learning_rate": 1.1720911818183273e-05, "loss": 0.182, "step": 19150 }, { "epoch": 0.5919563756574933, "grad_norm": 0.08513551887533943, "learning_rate": 1.1705958375783461e-05, "loss": 0.1927, "step": 19160 }, { "epoch": 0.592265329924538, "grad_norm": 0.09529922445323878, "learning_rate": 1.1691008372089635e-05, "loss": 0.1805, "step": 19170 }, { "epoch": 0.5925742841915825, "grad_norm": 0.08981535353884065, "learning_rate": 1.1676061822708372e-05, "loss": 0.1885, "step": 19180 }, { "epoch": 0.5928832384586271, "grad_norm": 0.08430387130843447, "learning_rate": 1.166111874324262e-05, "loss": 0.1844, "step": 19190 }, { "epoch": 0.5931921927256718, "grad_norm": 0.08695512811368436, "learning_rate": 1.164617914929171e-05, "loss": 0.1835, "step": 19200 }, { "epoch": 0.5935011469927164, "grad_norm": 0.07833563244574043, "learning_rate": 1.1631243056451347e-05, "loss": 0.1916, "step": 19210 }, { "epoch": 0.593810101259761, "grad_norm": 0.09216158564512843, "learning_rate": 1.1616310480313565e-05, "loss": 0.187, "step": 19220 }, { "epoch": 0.5941190555268057, "grad_norm": 0.10356040039665805, "learning_rate": 1.1601381436466738e-05, "loss": 0.1799, "step": 19230 }, { "epoch": 0.5944280097938502, "grad_norm": 0.0785531722095939, "learning_rate": 1.1586455940495547e-05, "loss": 0.1853, "step": 19240 }, { "epoch": 0.5947369640608949, "grad_norm": 0.11098212443906112, "learning_rate": 1.1571534007980972e-05, "loss": 0.1844, "step": 19250 }, { "epoch": 0.5950459183279395, "grad_norm": 0.09027492882496103, "learning_rate": 1.1556615654500264e-05, "loss": 0.1883, "step": 19260 }, { "epoch": 0.5953548725949841, "grad_norm": 0.10120630638431274, "learning_rate": 1.1541700895626958e-05, "loss": 0.191, "step": 19270 }, { "epoch": 0.5956638268620288, "grad_norm": 0.12492870297484539, "learning_rate": 1.1526789746930815e-05, "loss": 0.1943, "step": 19280 }, { "epoch": 0.5959727811290734, "grad_norm": 0.09996879456988371, "learning_rate": 1.1511882223977844e-05, "loss": 0.1901, "step": 19290 }, { "epoch": 0.596281735396118, "grad_norm": 0.08282819545882, "learning_rate": 1.149697834233025e-05, "loss": 0.184, "step": 19300 }, { "epoch": 0.5965906896631626, "grad_norm": 0.0844753368413714, "learning_rate": 1.148207811754646e-05, "loss": 0.1977, "step": 19310 }, { "epoch": 0.5968996439302072, "grad_norm": 0.08188342055077452, "learning_rate": 1.1467181565181063e-05, "loss": 0.1823, "step": 19320 }, { "epoch": 0.5972085981972518, "grad_norm": 0.07948457256262524, "learning_rate": 1.1452288700784827e-05, "loss": 0.1837, "step": 19330 }, { "epoch": 0.5975175524642965, "grad_norm": 0.0982865574621929, "learning_rate": 1.1437399539904668e-05, "loss": 0.1874, "step": 19340 }, { "epoch": 0.5978265067313411, "grad_norm": 0.08748222062334479, "learning_rate": 1.1422514098083631e-05, "loss": 0.1844, "step": 19350 }, { "epoch": 0.5981354609983857, "grad_norm": 0.0849517310534258, "learning_rate": 1.140763239086088e-05, "loss": 0.1843, "step": 19360 }, { "epoch": 0.5984444152654304, "grad_norm": 0.07401851701885574, "learning_rate": 1.1392754433771687e-05, "loss": 0.1831, "step": 19370 }, { "epoch": 0.5987533695324749, "grad_norm": 0.09235152080487286, "learning_rate": 1.1377880242347397e-05, "loss": 0.1868, "step": 19380 }, { "epoch": 0.5990623237995196, "grad_norm": 0.0788790577912453, "learning_rate": 1.1363009832115439e-05, "loss": 0.1958, "step": 19390 }, { "epoch": 0.5993712780665642, "grad_norm": 0.08257094376240925, "learning_rate": 1.1348143218599278e-05, "loss": 0.179, "step": 19400 }, { "epoch": 0.5996802323336088, "grad_norm": 0.08331281503892017, "learning_rate": 1.1333280417318434e-05, "loss": 0.1779, "step": 19410 }, { "epoch": 0.5999891866006535, "grad_norm": 0.07932051293633373, "learning_rate": 1.1318421443788436e-05, "loss": 0.1859, "step": 19420 }, { "epoch": 0.6002981408676981, "grad_norm": 0.08526455481169266, "learning_rate": 1.1303566313520806e-05, "loss": 0.1842, "step": 19430 }, { "epoch": 0.6006070951347426, "grad_norm": 0.09674375852523574, "learning_rate": 1.1288715042023085e-05, "loss": 0.1881, "step": 19440 }, { "epoch": 0.6009160494017873, "grad_norm": 0.09569181761747295, "learning_rate": 1.1273867644798754e-05, "loss": 0.1857, "step": 19450 }, { "epoch": 0.6012250036688319, "grad_norm": 0.07962622292524987, "learning_rate": 1.1259024137347272e-05, "loss": 0.186, "step": 19460 }, { "epoch": 0.6015339579358765, "grad_norm": 0.08719966337395878, "learning_rate": 1.1244184535164018e-05, "loss": 0.1856, "step": 19470 }, { "epoch": 0.6018429122029212, "grad_norm": 0.08897484019878672, "learning_rate": 1.1229348853740317e-05, "loss": 0.1859, "step": 19480 }, { "epoch": 0.6021518664699658, "grad_norm": 0.07763418090346706, "learning_rate": 1.1214517108563377e-05, "loss": 0.1821, "step": 19490 }, { "epoch": 0.6024608207370103, "grad_norm": 0.08294157293505787, "learning_rate": 1.1199689315116323e-05, "loss": 0.1857, "step": 19500 }, { "epoch": 0.602769775004055, "grad_norm": 0.09126750085376728, "learning_rate": 1.1184865488878123e-05, "loss": 0.1841, "step": 19510 }, { "epoch": 0.6030787292710996, "grad_norm": 0.08780804555609552, "learning_rate": 1.1170045645323638e-05, "loss": 0.1819, "step": 19520 }, { "epoch": 0.6033876835381443, "grad_norm": 0.07561012286175459, "learning_rate": 1.115522979992354e-05, "loss": 0.1845, "step": 19530 }, { "epoch": 0.6036966378051889, "grad_norm": 0.08305953026102175, "learning_rate": 1.1140417968144355e-05, "loss": 0.1824, "step": 19540 }, { "epoch": 0.6040055920722335, "grad_norm": 0.09252131059633029, "learning_rate": 1.1125610165448394e-05, "loss": 0.1895, "step": 19550 }, { "epoch": 0.6043145463392782, "grad_norm": 0.08466140339181151, "learning_rate": 1.1110806407293784e-05, "loss": 0.1851, "step": 19560 }, { "epoch": 0.6046235006063228, "grad_norm": 0.0849766690814433, "learning_rate": 1.1096006709134404e-05, "loss": 0.1843, "step": 19570 }, { "epoch": 0.6049324548733673, "grad_norm": 0.08097195703181048, "learning_rate": 1.1081211086419929e-05, "loss": 0.1829, "step": 19580 }, { "epoch": 0.605241409140412, "grad_norm": 0.08667928640904893, "learning_rate": 1.1066419554595745e-05, "loss": 0.1914, "step": 19590 }, { "epoch": 0.6055503634074566, "grad_norm": 0.15393379599495152, "learning_rate": 1.1051632129102997e-05, "loss": 0.1945, "step": 19600 }, { "epoch": 0.6058593176745012, "grad_norm": 0.09093211482736684, "learning_rate": 1.103684882537852e-05, "loss": 0.1897, "step": 19610 }, { "epoch": 0.6061682719415459, "grad_norm": 0.08624215793715367, "learning_rate": 1.102206965885486e-05, "loss": 0.1865, "step": 19620 }, { "epoch": 0.6064772262085905, "grad_norm": 0.08456190559451564, "learning_rate": 1.1007294644960235e-05, "loss": 0.1859, "step": 19630 }, { "epoch": 0.606786180475635, "grad_norm": 0.08966066955579385, "learning_rate": 1.0992523799118543e-05, "loss": 0.1859, "step": 19640 }, { "epoch": 0.6070951347426797, "grad_norm": 0.08657352283109619, "learning_rate": 1.0977757136749313e-05, "loss": 0.1902, "step": 19650 }, { "epoch": 0.6074040890097243, "grad_norm": 0.07975700433016425, "learning_rate": 1.0962994673267711e-05, "loss": 0.1835, "step": 19660 }, { "epoch": 0.607713043276769, "grad_norm": 0.08009979147722592, "learning_rate": 1.0948236424084534e-05, "loss": 0.1803, "step": 19670 }, { "epoch": 0.6080219975438136, "grad_norm": 0.08506789440825369, "learning_rate": 1.0933482404606164e-05, "loss": 0.1797, "step": 19680 }, { "epoch": 0.6083309518108582, "grad_norm": 0.10157857136421491, "learning_rate": 1.091873263023457e-05, "loss": 0.191, "step": 19690 }, { "epoch": 0.6086399060779029, "grad_norm": 0.07557224513227645, "learning_rate": 1.0903987116367289e-05, "loss": 0.1857, "step": 19700 }, { "epoch": 0.6089488603449474, "grad_norm": 0.11128795404350794, "learning_rate": 1.0889245878397426e-05, "loss": 0.1831, "step": 19710 }, { "epoch": 0.609257814611992, "grad_norm": 0.10292068874463854, "learning_rate": 1.0874508931713593e-05, "loss": 0.1852, "step": 19720 }, { "epoch": 0.6095667688790367, "grad_norm": 0.08079840064866879, "learning_rate": 1.0859776291699956e-05, "loss": 0.178, "step": 19730 }, { "epoch": 0.6098757231460813, "grad_norm": 0.08623791196191595, "learning_rate": 1.0845047973736153e-05, "loss": 0.1924, "step": 19740 }, { "epoch": 0.6101846774131259, "grad_norm": 0.12145228572606707, "learning_rate": 1.0830323993197339e-05, "loss": 0.1915, "step": 19750 }, { "epoch": 0.6104936316801706, "grad_norm": 0.08729285672960486, "learning_rate": 1.0815604365454113e-05, "loss": 0.184, "step": 19760 }, { "epoch": 0.6108025859472151, "grad_norm": 0.08460342384525342, "learning_rate": 1.0800889105872559e-05, "loss": 0.1912, "step": 19770 }, { "epoch": 0.6111115402142597, "grad_norm": 0.0871823576044211, "learning_rate": 1.0786178229814176e-05, "loss": 0.1871, "step": 19780 }, { "epoch": 0.6114204944813044, "grad_norm": 0.07870026373917863, "learning_rate": 1.0771471752635903e-05, "loss": 0.1864, "step": 19790 }, { "epoch": 0.611729448748349, "grad_norm": 0.08073327956470966, "learning_rate": 1.0756769689690074e-05, "loss": 0.183, "step": 19800 }, { "epoch": 0.6120384030153937, "grad_norm": 0.08087990409818094, "learning_rate": 1.0742072056324433e-05, "loss": 0.1842, "step": 19810 }, { "epoch": 0.6123473572824383, "grad_norm": 0.07840518720141709, "learning_rate": 1.0727378867882075e-05, "loss": 0.1844, "step": 19820 }, { "epoch": 0.6126563115494829, "grad_norm": 0.0857951081060033, "learning_rate": 1.0712690139701487e-05, "loss": 0.1807, "step": 19830 }, { "epoch": 0.6129652658165275, "grad_norm": 0.1230016763922673, "learning_rate": 1.0698005887116463e-05, "loss": 0.1949, "step": 19840 }, { "epoch": 0.6132742200835721, "grad_norm": 0.08977077281458193, "learning_rate": 1.0683326125456157e-05, "loss": 0.1828, "step": 19850 }, { "epoch": 0.6135831743506167, "grad_norm": 0.07661706950343256, "learning_rate": 1.0668650870045013e-05, "loss": 0.1805, "step": 19860 }, { "epoch": 0.6138921286176614, "grad_norm": 0.08009017554123325, "learning_rate": 1.0653980136202788e-05, "loss": 0.1868, "step": 19870 }, { "epoch": 0.614201082884706, "grad_norm": 0.08070096082327056, "learning_rate": 1.0639313939244509e-05, "loss": 0.1804, "step": 19880 }, { "epoch": 0.6145100371517506, "grad_norm": 0.07529212229842093, "learning_rate": 1.062465229448046e-05, "loss": 0.184, "step": 19890 }, { "epoch": 0.6148189914187953, "grad_norm": 0.0785703936539135, "learning_rate": 1.0609995217216183e-05, "loss": 0.1833, "step": 19900 }, { "epoch": 0.6151279456858398, "grad_norm": 0.08097943876546655, "learning_rate": 1.0595342722752455e-05, "loss": 0.1978, "step": 19910 }, { "epoch": 0.6154368999528844, "grad_norm": 0.08474437129651521, "learning_rate": 1.0580694826385264e-05, "loss": 0.1836, "step": 19920 }, { "epoch": 0.6157458542199291, "grad_norm": 0.08672105010327077, "learning_rate": 1.056605154340579e-05, "loss": 0.1876, "step": 19930 }, { "epoch": 0.6160548084869737, "grad_norm": 0.07380386784963713, "learning_rate": 1.0551412889100413e-05, "loss": 0.1846, "step": 19940 }, { "epoch": 0.6163637627540184, "grad_norm": 0.09422829500226426, "learning_rate": 1.0536778878750665e-05, "loss": 0.1825, "step": 19950 }, { "epoch": 0.616672717021063, "grad_norm": 0.08064736933954149, "learning_rate": 1.052214952763325e-05, "loss": 0.1851, "step": 19960 }, { "epoch": 0.6169816712881075, "grad_norm": 0.08299793921296503, "learning_rate": 1.050752485101998e-05, "loss": 0.189, "step": 19970 }, { "epoch": 0.6172906255551522, "grad_norm": 0.20022126832501388, "learning_rate": 1.049290486417782e-05, "loss": 0.189, "step": 19980 }, { "epoch": 0.6175995798221968, "grad_norm": 0.08780516760895216, "learning_rate": 1.047828958236881e-05, "loss": 0.1945, "step": 19990 }, { "epoch": 0.6179085340892414, "grad_norm": 0.08728220336156364, "learning_rate": 1.0463679020850098e-05, "loss": 0.1849, "step": 20000 }, { "epoch": 0.6182174883562861, "grad_norm": 0.07646045519503682, "learning_rate": 1.0449073194873893e-05, "loss": 0.1827, "step": 20010 }, { "epoch": 0.6185264426233307, "grad_norm": 0.0848586359033337, "learning_rate": 1.0434472119687468e-05, "loss": 0.1984, "step": 20020 }, { "epoch": 0.6188353968903753, "grad_norm": 0.07910424422713831, "learning_rate": 1.0419875810533129e-05, "loss": 0.1889, "step": 20030 }, { "epoch": 0.61914435115742, "grad_norm": 0.07327911861754191, "learning_rate": 1.0405284282648216e-05, "loss": 0.1898, "step": 20040 }, { "epoch": 0.6194533054244645, "grad_norm": 0.09398610431686306, "learning_rate": 1.0390697551265068e-05, "loss": 0.1869, "step": 20050 }, { "epoch": 0.6197622596915091, "grad_norm": 0.07253967693690853, "learning_rate": 1.0376115631611029e-05, "loss": 0.181, "step": 20060 }, { "epoch": 0.6200712139585538, "grad_norm": 0.09232348245256568, "learning_rate": 1.03615385389084e-05, "loss": 0.1879, "step": 20070 }, { "epoch": 0.6203801682255984, "grad_norm": 0.0923122510204038, "learning_rate": 1.0346966288374472e-05, "loss": 0.1829, "step": 20080 }, { "epoch": 0.6206891224926431, "grad_norm": 0.07936874657939968, "learning_rate": 1.0332398895221449e-05, "loss": 0.1818, "step": 20090 }, { "epoch": 0.6209980767596877, "grad_norm": 0.08894380415302283, "learning_rate": 1.0317836374656494e-05, "loss": 0.1847, "step": 20100 }, { "epoch": 0.6213070310267322, "grad_norm": 0.07863944507287474, "learning_rate": 1.030327874188166e-05, "loss": 0.1845, "step": 20110 }, { "epoch": 0.6216159852937769, "grad_norm": 0.08140127687302948, "learning_rate": 1.0288726012093909e-05, "loss": 0.1804, "step": 20120 }, { "epoch": 0.6219249395608215, "grad_norm": 0.08163237136633517, "learning_rate": 1.0274178200485086e-05, "loss": 0.1872, "step": 20130 }, { "epoch": 0.6222338938278661, "grad_norm": 0.16992938890110298, "learning_rate": 1.0259635322241894e-05, "loss": 0.193, "step": 20140 }, { "epoch": 0.6225428480949108, "grad_norm": 0.12292714328551844, "learning_rate": 1.0245097392545896e-05, "loss": 0.1919, "step": 20150 }, { "epoch": 0.6228518023619554, "grad_norm": 0.0719191327526604, "learning_rate": 1.0230564426573476e-05, "loss": 0.1879, "step": 20160 }, { "epoch": 0.623160756629, "grad_norm": 0.09503697141926552, "learning_rate": 1.0216036439495852e-05, "loss": 0.1797, "step": 20170 }, { "epoch": 0.6234697108960446, "grad_norm": 0.08358688600952284, "learning_rate": 1.0201513446479028e-05, "loss": 0.1838, "step": 20180 }, { "epoch": 0.6237786651630892, "grad_norm": 0.09438394393783162, "learning_rate": 1.0186995462683815e-05, "loss": 0.1828, "step": 20190 }, { "epoch": 0.6240876194301338, "grad_norm": 0.11716791528411023, "learning_rate": 1.0172482503265768e-05, "loss": 0.1935, "step": 20200 }, { "epoch": 0.6243965736971785, "grad_norm": 0.09361063759492731, "learning_rate": 1.0157974583375225e-05, "loss": 0.1818, "step": 20210 }, { "epoch": 0.6247055279642231, "grad_norm": 0.08105502479633177, "learning_rate": 1.0143471718157245e-05, "loss": 0.1887, "step": 20220 }, { "epoch": 0.6250144822312678, "grad_norm": 0.09295395575622786, "learning_rate": 1.0128973922751613e-05, "loss": 0.1898, "step": 20230 }, { "epoch": 0.6253234364983123, "grad_norm": 0.08766105008046463, "learning_rate": 1.0114481212292826e-05, "loss": 0.1809, "step": 20240 }, { "epoch": 0.6256323907653569, "grad_norm": 0.08985001838450198, "learning_rate": 1.0099993601910073e-05, "loss": 0.1775, "step": 20250 }, { "epoch": 0.6259413450324016, "grad_norm": 0.08344503889344561, "learning_rate": 1.0085511106727207e-05, "loss": 0.1794, "step": 20260 }, { "epoch": 0.6262502992994462, "grad_norm": 0.09504270442450875, "learning_rate": 1.0071033741862764e-05, "loss": 0.1835, "step": 20270 }, { "epoch": 0.6265592535664908, "grad_norm": 0.0857553897863654, "learning_rate": 1.00565615224299e-05, "loss": 0.1861, "step": 20280 }, { "epoch": 0.6268682078335355, "grad_norm": 0.08574806320199437, "learning_rate": 1.0042094463536419e-05, "loss": 0.1892, "step": 20290 }, { "epoch": 0.62717716210058, "grad_norm": 0.07834983542251327, "learning_rate": 1.0027632580284723e-05, "loss": 0.1866, "step": 20300 }, { "epoch": 0.6274861163676246, "grad_norm": 0.08745193791786973, "learning_rate": 1.0013175887771824e-05, "loss": 0.1887, "step": 20310 }, { "epoch": 0.6277950706346693, "grad_norm": 0.0788487355313793, "learning_rate": 9.998724401089301e-06, "loss": 0.1854, "step": 20320 }, { "epoch": 0.6281040249017139, "grad_norm": 0.08009857430142646, "learning_rate": 9.984278135323312e-06, "loss": 0.181, "step": 20330 }, { "epoch": 0.6284129791687585, "grad_norm": 0.0926032013008688, "learning_rate": 9.96983710555456e-06, "loss": 0.1895, "step": 20340 }, { "epoch": 0.6287219334358032, "grad_norm": 0.09862161472165971, "learning_rate": 9.955401326858275e-06, "loss": 0.1796, "step": 20350 }, { "epoch": 0.6290308877028478, "grad_norm": 0.07656621564990976, "learning_rate": 9.940970814304217e-06, "loss": 0.1814, "step": 20360 }, { "epoch": 0.6293398419698925, "grad_norm": 0.09098582721427066, "learning_rate": 9.92654558295664e-06, "loss": 0.1829, "step": 20370 }, { "epoch": 0.629648796236937, "grad_norm": 0.09097843459638964, "learning_rate": 9.912125647874293e-06, "loss": 0.1788, "step": 20380 }, { "epoch": 0.6299577505039816, "grad_norm": 0.10792002657150579, "learning_rate": 9.897711024110381e-06, "loss": 0.1826, "step": 20390 }, { "epoch": 0.6302667047710263, "grad_norm": 0.07447835947155786, "learning_rate": 9.88330172671259e-06, "loss": 0.1821, "step": 20400 }, { "epoch": 0.6305756590380709, "grad_norm": 0.09406839755830929, "learning_rate": 9.868897770723012e-06, "loss": 0.1908, "step": 20410 }, { "epoch": 0.6308846133051155, "grad_norm": 0.1285771773616567, "learning_rate": 9.854499171178197e-06, "loss": 0.1885, "step": 20420 }, { "epoch": 0.6311935675721602, "grad_norm": 0.09292480698582771, "learning_rate": 9.840105943109077e-06, "loss": 0.1889, "step": 20430 }, { "epoch": 0.6315025218392047, "grad_norm": 0.07294220435329125, "learning_rate": 9.825718101540994e-06, "loss": 0.1817, "step": 20440 }, { "epoch": 0.6318114761062493, "grad_norm": 0.0895501299879043, "learning_rate": 9.81133566149366e-06, "loss": 0.1797, "step": 20450 }, { "epoch": 0.632120430373294, "grad_norm": 0.08606985323050444, "learning_rate": 9.79695863798114e-06, "loss": 0.1895, "step": 20460 }, { "epoch": 0.6324293846403386, "grad_norm": 0.07606324384658682, "learning_rate": 9.782587046011863e-06, "loss": 0.1823, "step": 20470 }, { "epoch": 0.6327383389073832, "grad_norm": 0.08687194167340641, "learning_rate": 9.768220900588574e-06, "loss": 0.1899, "step": 20480 }, { "epoch": 0.6330472931744279, "grad_norm": 0.0936679680166201, "learning_rate": 9.753860216708333e-06, "loss": 0.1848, "step": 20490 }, { "epoch": 0.6333562474414725, "grad_norm": 0.07860288394024362, "learning_rate": 9.73950500936251e-06, "loss": 0.1835, "step": 20500 }, { "epoch": 0.6336652017085171, "grad_norm": 0.09208404294384408, "learning_rate": 9.725155293536739e-06, "loss": 0.1926, "step": 20510 }, { "epoch": 0.6339741559755617, "grad_norm": 0.08090329111864401, "learning_rate": 9.71081108421094e-06, "loss": 0.1795, "step": 20520 }, { "epoch": 0.6342831102426063, "grad_norm": 0.09430035972466351, "learning_rate": 9.696472396359274e-06, "loss": 0.1807, "step": 20530 }, { "epoch": 0.634592064509651, "grad_norm": 0.09330412143446754, "learning_rate": 9.682139244950143e-06, "loss": 0.1836, "step": 20540 }, { "epoch": 0.6349010187766956, "grad_norm": 0.07667001788217906, "learning_rate": 9.667811644946167e-06, "loss": 0.1806, "step": 20550 }, { "epoch": 0.6352099730437402, "grad_norm": 0.09372099288123097, "learning_rate": 9.653489611304164e-06, "loss": 0.1816, "step": 20560 }, { "epoch": 0.6355189273107849, "grad_norm": 0.09008663985017154, "learning_rate": 9.63917315897516e-06, "loss": 0.1809, "step": 20570 }, { "epoch": 0.6358278815778294, "grad_norm": 0.12952734707392144, "learning_rate": 9.624862302904331e-06, "loss": 0.1905, "step": 20580 }, { "epoch": 0.636136835844874, "grad_norm": 0.09219556248390978, "learning_rate": 9.610557058031032e-06, "loss": 0.1867, "step": 20590 }, { "epoch": 0.6364457901119187, "grad_norm": 0.09330127555515816, "learning_rate": 9.59625743928874e-06, "loss": 0.1828, "step": 20600 }, { "epoch": 0.6367547443789633, "grad_norm": 0.08658418890523488, "learning_rate": 9.581963461605083e-06, "loss": 0.1784, "step": 20610 }, { "epoch": 0.6370636986460079, "grad_norm": 0.08868316701971833, "learning_rate": 9.567675139901774e-06, "loss": 0.1907, "step": 20620 }, { "epoch": 0.6373726529130526, "grad_norm": 0.0856353186910355, "learning_rate": 9.55339248909465e-06, "loss": 0.1948, "step": 20630 }, { "epoch": 0.6376816071800971, "grad_norm": 0.10250504390189848, "learning_rate": 9.539115524093597e-06, "loss": 0.1891, "step": 20640 }, { "epoch": 0.6379905614471418, "grad_norm": 0.07868317571192376, "learning_rate": 9.524844259802594e-06, "loss": 0.1914, "step": 20650 }, { "epoch": 0.6382995157141864, "grad_norm": 0.09384218652468425, "learning_rate": 9.510578711119645e-06, "loss": 0.1874, "step": 20660 }, { "epoch": 0.638608469981231, "grad_norm": 0.11899893173518487, "learning_rate": 9.496318892936809e-06, "loss": 0.1829, "step": 20670 }, { "epoch": 0.6389174242482757, "grad_norm": 0.0816515123513843, "learning_rate": 9.482064820140146e-06, "loss": 0.1808, "step": 20680 }, { "epoch": 0.6392263785153203, "grad_norm": 0.0773146102183417, "learning_rate": 9.467816507609728e-06, "loss": 0.189, "step": 20690 }, { "epoch": 0.6395353327823649, "grad_norm": 0.08757599292315726, "learning_rate": 9.453573970219603e-06, "loss": 0.1884, "step": 20700 }, { "epoch": 0.6398442870494095, "grad_norm": 0.10298427025583606, "learning_rate": 9.439337222837808e-06, "loss": 0.1826, "step": 20710 }, { "epoch": 0.6401532413164541, "grad_norm": 0.09188953546962604, "learning_rate": 9.425106280326318e-06, "loss": 0.1906, "step": 20720 }, { "epoch": 0.6404621955834987, "grad_norm": 0.08883895755213242, "learning_rate": 9.410881157541062e-06, "loss": 0.1816, "step": 20730 }, { "epoch": 0.6407711498505434, "grad_norm": 0.09682712583338604, "learning_rate": 9.396661869331877e-06, "loss": 0.1884, "step": 20740 }, { "epoch": 0.641080104117588, "grad_norm": 0.09008449684419392, "learning_rate": 9.382448430542532e-06, "loss": 0.1849, "step": 20750 }, { "epoch": 0.6413890583846326, "grad_norm": 0.09547832620192426, "learning_rate": 9.368240856010667e-06, "loss": 0.181, "step": 20760 }, { "epoch": 0.6416980126516773, "grad_norm": 0.08531693574310624, "learning_rate": 9.354039160567818e-06, "loss": 0.1811, "step": 20770 }, { "epoch": 0.6420069669187218, "grad_norm": 0.0941283910320591, "learning_rate": 9.339843359039373e-06, "loss": 0.1873, "step": 20780 }, { "epoch": 0.6423159211857665, "grad_norm": 0.08622755761927506, "learning_rate": 9.325653466244565e-06, "loss": 0.1817, "step": 20790 }, { "epoch": 0.6426248754528111, "grad_norm": 0.10957996664407295, "learning_rate": 9.311469496996472e-06, "loss": 0.1851, "step": 20800 }, { "epoch": 0.6429338297198557, "grad_norm": 0.09196273218754084, "learning_rate": 9.297291466101975e-06, "loss": 0.1839, "step": 20810 }, { "epoch": 0.6432427839869004, "grad_norm": 0.09089175446246295, "learning_rate": 9.283119388361768e-06, "loss": 0.1838, "step": 20820 }, { "epoch": 0.643551738253945, "grad_norm": 0.11048996293256481, "learning_rate": 9.268953278570314e-06, "loss": 0.1834, "step": 20830 }, { "epoch": 0.6438606925209895, "grad_norm": 0.08942417078026148, "learning_rate": 9.254793151515864e-06, "loss": 0.1878, "step": 20840 }, { "epoch": 0.6441696467880342, "grad_norm": 0.09466700710009077, "learning_rate": 9.240639021980406e-06, "loss": 0.1834, "step": 20850 }, { "epoch": 0.6444786010550788, "grad_norm": 0.09873195487288632, "learning_rate": 9.226490904739689e-06, "loss": 0.188, "step": 20860 }, { "epoch": 0.6447875553221234, "grad_norm": 0.09173169657963821, "learning_rate": 9.212348814563158e-06, "loss": 0.1816, "step": 20870 }, { "epoch": 0.6450965095891681, "grad_norm": 0.09500502487193062, "learning_rate": 9.198212766213997e-06, "loss": 0.1839, "step": 20880 }, { "epoch": 0.6454054638562127, "grad_norm": 0.09436950076336052, "learning_rate": 9.18408277444905e-06, "loss": 0.1808, "step": 20890 }, { "epoch": 0.6457144181232572, "grad_norm": 0.10117431682472817, "learning_rate": 9.169958854018874e-06, "loss": 0.1812, "step": 20900 }, { "epoch": 0.6460233723903019, "grad_norm": 0.09242615454166188, "learning_rate": 9.155841019667655e-06, "loss": 0.1811, "step": 20910 }, { "epoch": 0.6463323266573465, "grad_norm": 0.08406771936945807, "learning_rate": 9.141729286133246e-06, "loss": 0.1823, "step": 20920 }, { "epoch": 0.6466412809243912, "grad_norm": 0.09330851113984145, "learning_rate": 9.12762366814712e-06, "loss": 0.1774, "step": 20930 }, { "epoch": 0.6469502351914358, "grad_norm": 0.09296950290615348, "learning_rate": 9.11352418043438e-06, "loss": 0.187, "step": 20940 }, { "epoch": 0.6472591894584804, "grad_norm": 0.08599265520687546, "learning_rate": 9.09943083771371e-06, "loss": 0.1803, "step": 20950 }, { "epoch": 0.6475681437255251, "grad_norm": 0.09119409994774319, "learning_rate": 9.085343654697401e-06, "loss": 0.1928, "step": 20960 }, { "epoch": 0.6478770979925697, "grad_norm": 0.08082505933686907, "learning_rate": 9.07126264609129e-06, "loss": 0.1888, "step": 20970 }, { "epoch": 0.6481860522596142, "grad_norm": 0.09407633132491798, "learning_rate": 9.057187826594796e-06, "loss": 0.1836, "step": 20980 }, { "epoch": 0.6484950065266589, "grad_norm": 0.09247638767842638, "learning_rate": 9.04311921090085e-06, "loss": 0.1821, "step": 20990 }, { "epoch": 0.6488039607937035, "grad_norm": 0.09549021328910906, "learning_rate": 9.029056813695927e-06, "loss": 0.1904, "step": 21000 }, { "epoch": 0.6491129150607481, "grad_norm": 0.14298300777568937, "learning_rate": 9.015000649659997e-06, "loss": 0.1823, "step": 21010 }, { "epoch": 0.6494218693277928, "grad_norm": 0.10420840897491478, "learning_rate": 9.00095073346653e-06, "loss": 0.1856, "step": 21020 }, { "epoch": 0.6497308235948374, "grad_norm": 0.08341315756376709, "learning_rate": 8.98690707978247e-06, "loss": 0.1779, "step": 21030 }, { "epoch": 0.6500397778618819, "grad_norm": 0.12059619479918031, "learning_rate": 8.972869703268226e-06, "loss": 0.1803, "step": 21040 }, { "epoch": 0.6503487321289266, "grad_norm": 0.09904239652535235, "learning_rate": 8.958838618577658e-06, "loss": 0.1848, "step": 21050 }, { "epoch": 0.6506576863959712, "grad_norm": 0.09769625000327982, "learning_rate": 8.944813840358046e-06, "loss": 0.184, "step": 21060 }, { "epoch": 0.6509666406630159, "grad_norm": 0.0915300797061274, "learning_rate": 8.930795383250105e-06, "loss": 0.1828, "step": 21070 }, { "epoch": 0.6512755949300605, "grad_norm": 0.12815218888121502, "learning_rate": 8.91678326188793e-06, "loss": 0.1857, "step": 21080 }, { "epoch": 0.6515845491971051, "grad_norm": 0.10658342707078763, "learning_rate": 8.902777490899021e-06, "loss": 0.1851, "step": 21090 }, { "epoch": 0.6518935034641498, "grad_norm": 0.09753244504022564, "learning_rate": 8.88877808490423e-06, "loss": 0.1871, "step": 21100 }, { "epoch": 0.6522024577311943, "grad_norm": 0.08982957125297955, "learning_rate": 8.874785058517785e-06, "loss": 0.1998, "step": 21110 }, { "epoch": 0.6525114119982389, "grad_norm": 0.09968162051710036, "learning_rate": 8.860798426347239e-06, "loss": 0.1823, "step": 21120 }, { "epoch": 0.6528203662652836, "grad_norm": 0.09544534951209893, "learning_rate": 8.846818202993474e-06, "loss": 0.1848, "step": 21130 }, { "epoch": 0.6531293205323282, "grad_norm": 0.08866250879940554, "learning_rate": 8.832844403050685e-06, "loss": 0.1845, "step": 21140 }, { "epoch": 0.6534382747993728, "grad_norm": 0.09520192185065951, "learning_rate": 8.818877041106362e-06, "loss": 0.1828, "step": 21150 }, { "epoch": 0.6537472290664175, "grad_norm": 0.10104510627259106, "learning_rate": 8.804916131741258e-06, "loss": 0.1856, "step": 21160 }, { "epoch": 0.654056183333462, "grad_norm": 0.08513641643427416, "learning_rate": 8.790961689529421e-06, "loss": 0.1889, "step": 21170 }, { "epoch": 0.6543651376005066, "grad_norm": 0.09525142227450835, "learning_rate": 8.777013729038117e-06, "loss": 0.1893, "step": 21180 }, { "epoch": 0.6546740918675513, "grad_norm": 0.0934711455148004, "learning_rate": 8.763072264827866e-06, "loss": 0.1844, "step": 21190 }, { "epoch": 0.6549830461345959, "grad_norm": 0.11279023065894526, "learning_rate": 8.749137311452397e-06, "loss": 0.1857, "step": 21200 }, { "epoch": 0.6552920004016406, "grad_norm": 0.09846335887645649, "learning_rate": 8.735208883458648e-06, "loss": 0.1903, "step": 21210 }, { "epoch": 0.6556009546686852, "grad_norm": 0.09183429171380303, "learning_rate": 8.721286995386732e-06, "loss": 0.1829, "step": 21220 }, { "epoch": 0.6559099089357298, "grad_norm": 0.0903583720516295, "learning_rate": 8.707371661769953e-06, "loss": 0.1886, "step": 21230 }, { "epoch": 0.6562188632027744, "grad_norm": 0.10429231801413788, "learning_rate": 8.693462897134765e-06, "loss": 0.1859, "step": 21240 }, { "epoch": 0.656527817469819, "grad_norm": 0.08954233179434426, "learning_rate": 8.679560716000753e-06, "loss": 0.1826, "step": 21250 }, { "epoch": 0.6568367717368636, "grad_norm": 0.09577332953752825, "learning_rate": 8.665665132880652e-06, "loss": 0.1813, "step": 21260 }, { "epoch": 0.6571457260039083, "grad_norm": 0.08073918206578158, "learning_rate": 8.651776162280288e-06, "loss": 0.1864, "step": 21270 }, { "epoch": 0.6574546802709529, "grad_norm": 0.11017800688296119, "learning_rate": 8.637893818698602e-06, "loss": 0.185, "step": 21280 }, { "epoch": 0.6577636345379975, "grad_norm": 0.08123014275827171, "learning_rate": 8.624018116627597e-06, "loss": 0.1879, "step": 21290 }, { "epoch": 0.6580725888050422, "grad_norm": 0.12589748507751666, "learning_rate": 8.61014907055237e-06, "loss": 0.1825, "step": 21300 }, { "epoch": 0.6583815430720867, "grad_norm": 0.10644301918790272, "learning_rate": 8.596286694951037e-06, "loss": 0.186, "step": 21310 }, { "epoch": 0.6586904973391313, "grad_norm": 0.10167555722509153, "learning_rate": 8.582431004294781e-06, "loss": 0.1896, "step": 21320 }, { "epoch": 0.658999451606176, "grad_norm": 0.09356937316529804, "learning_rate": 8.56858201304778e-06, "loss": 0.1831, "step": 21330 }, { "epoch": 0.6593084058732206, "grad_norm": 0.10472028241752443, "learning_rate": 8.554739735667245e-06, "loss": 0.1885, "step": 21340 }, { "epoch": 0.6596173601402653, "grad_norm": 0.10025067675591194, "learning_rate": 8.540904186603351e-06, "loss": 0.1911, "step": 21350 }, { "epoch": 0.6599263144073099, "grad_norm": 0.11666191500386813, "learning_rate": 8.527075380299272e-06, "loss": 0.1843, "step": 21360 }, { "epoch": 0.6602352686743544, "grad_norm": 0.0813420857349242, "learning_rate": 8.51325333119113e-06, "loss": 0.1885, "step": 21370 }, { "epoch": 0.6605442229413991, "grad_norm": 0.1038112741059007, "learning_rate": 8.499438053707999e-06, "loss": 0.1859, "step": 21380 }, { "epoch": 0.6608531772084437, "grad_norm": 0.08261320957489784, "learning_rate": 8.485629562271876e-06, "loss": 0.1895, "step": 21390 }, { "epoch": 0.6611621314754883, "grad_norm": 0.11290315441437623, "learning_rate": 8.471827871297688e-06, "loss": 0.1842, "step": 21400 }, { "epoch": 0.661471085742533, "grad_norm": 0.09672724467882982, "learning_rate": 8.458032995193252e-06, "loss": 0.1814, "step": 21410 }, { "epoch": 0.6617800400095776, "grad_norm": 0.11404081140399208, "learning_rate": 8.444244948359272e-06, "loss": 0.1818, "step": 21420 }, { "epoch": 0.6620889942766222, "grad_norm": 0.09391262554098156, "learning_rate": 8.430463745189321e-06, "loss": 0.1843, "step": 21430 }, { "epoch": 0.6623979485436668, "grad_norm": 0.11187205517468783, "learning_rate": 8.416689400069836e-06, "loss": 0.1832, "step": 21440 }, { "epoch": 0.6627069028107114, "grad_norm": 0.09598670858020575, "learning_rate": 8.402921927380083e-06, "loss": 0.1821, "step": 21450 }, { "epoch": 0.663015857077756, "grad_norm": 0.09915378313178032, "learning_rate": 8.389161341492172e-06, "loss": 0.1827, "step": 21460 }, { "epoch": 0.6633248113448007, "grad_norm": 0.08341507724445604, "learning_rate": 8.375407656771008e-06, "loss": 0.1853, "step": 21470 }, { "epoch": 0.6636337656118453, "grad_norm": 0.09587779095111101, "learning_rate": 8.361660887574286e-06, "loss": 0.1841, "step": 21480 }, { "epoch": 0.66394271987889, "grad_norm": 0.12766745149581377, "learning_rate": 8.347921048252503e-06, "loss": 0.1828, "step": 21490 }, { "epoch": 0.6642516741459346, "grad_norm": 0.1031721423190585, "learning_rate": 8.334188153148904e-06, "loss": 0.1818, "step": 21500 }, { "epoch": 0.6645606284129791, "grad_norm": 0.09195452025706184, "learning_rate": 8.320462216599501e-06, "loss": 0.1983, "step": 21510 }, { "epoch": 0.6648695826800238, "grad_norm": 0.09793006585130341, "learning_rate": 8.30674325293302e-06, "loss": 0.1797, "step": 21520 }, { "epoch": 0.6651785369470684, "grad_norm": 0.10820003579466461, "learning_rate": 8.293031276470932e-06, "loss": 0.1862, "step": 21530 }, { "epoch": 0.665487491214113, "grad_norm": 0.08924766978867837, "learning_rate": 8.27932630152739e-06, "loss": 0.1867, "step": 21540 }, { "epoch": 0.6657964454811577, "grad_norm": 0.0864161019197335, "learning_rate": 8.265628342409263e-06, "loss": 0.1955, "step": 21550 }, { "epoch": 0.6661053997482023, "grad_norm": 0.0928457064161821, "learning_rate": 8.251937413416071e-06, "loss": 0.1829, "step": 21560 }, { "epoch": 0.6664143540152468, "grad_norm": 0.12068934710591628, "learning_rate": 8.238253528840018e-06, "loss": 0.1775, "step": 21570 }, { "epoch": 0.6667233082822915, "grad_norm": 0.08515900181313076, "learning_rate": 8.224576702965931e-06, "loss": 0.1827, "step": 21580 }, { "epoch": 0.6670322625493361, "grad_norm": 0.0972574825682533, "learning_rate": 8.210906950071296e-06, "loss": 0.1816, "step": 21590 }, { "epoch": 0.6673412168163807, "grad_norm": 0.11425152744846681, "learning_rate": 8.197244284426185e-06, "loss": 0.1801, "step": 21600 }, { "epoch": 0.6676501710834254, "grad_norm": 0.10850147262928118, "learning_rate": 8.183588720293301e-06, "loss": 0.1831, "step": 21610 }, { "epoch": 0.66795912535047, "grad_norm": 0.09469469601554154, "learning_rate": 8.169940271927906e-06, "loss": 0.1895, "step": 21620 }, { "epoch": 0.6682680796175147, "grad_norm": 0.096275588799572, "learning_rate": 8.156298953577859e-06, "loss": 0.182, "step": 21630 }, { "epoch": 0.6685770338845592, "grad_norm": 0.09430258563893375, "learning_rate": 8.142664779483558e-06, "loss": 0.1865, "step": 21640 }, { "epoch": 0.6688859881516038, "grad_norm": 0.09105192280482428, "learning_rate": 8.129037763877948e-06, "loss": 0.1829, "step": 21650 }, { "epoch": 0.6691949424186485, "grad_norm": 0.09771768495441543, "learning_rate": 8.1154179209865e-06, "loss": 0.1837, "step": 21660 }, { "epoch": 0.6695038966856931, "grad_norm": 0.11779791734450315, "learning_rate": 8.101805265027207e-06, "loss": 0.1856, "step": 21670 }, { "epoch": 0.6698128509527377, "grad_norm": 0.0927670959552032, "learning_rate": 8.088199810210542e-06, "loss": 0.1849, "step": 21680 }, { "epoch": 0.6701218052197824, "grad_norm": 0.10318928461551366, "learning_rate": 8.07460157073948e-06, "loss": 0.1844, "step": 21690 }, { "epoch": 0.670430759486827, "grad_norm": 0.10207803569915344, "learning_rate": 8.061010560809452e-06, "loss": 0.1814, "step": 21700 }, { "epoch": 0.6707397137538715, "grad_norm": 0.10859725615607474, "learning_rate": 8.047426794608333e-06, "loss": 0.1827, "step": 21710 }, { "epoch": 0.6710486680209162, "grad_norm": 0.11216570380804705, "learning_rate": 8.033850286316465e-06, "loss": 0.1838, "step": 21720 }, { "epoch": 0.6713576222879608, "grad_norm": 0.08581601704543383, "learning_rate": 8.020281050106578e-06, "loss": 0.1848, "step": 21730 }, { "epoch": 0.6716665765550054, "grad_norm": 0.1132973714821087, "learning_rate": 8.006719100143842e-06, "loss": 0.1885, "step": 21740 }, { "epoch": 0.6719755308220501, "grad_norm": 0.09055021607729273, "learning_rate": 7.993164450585797e-06, "loss": 0.1817, "step": 21750 }, { "epoch": 0.6722844850890947, "grad_norm": 0.11510073927318624, "learning_rate": 7.979617115582381e-06, "loss": 0.1852, "step": 21760 }, { "epoch": 0.6725934393561394, "grad_norm": 0.09656673805707162, "learning_rate": 7.966077109275877e-06, "loss": 0.1824, "step": 21770 }, { "epoch": 0.6729023936231839, "grad_norm": 0.09612752042169377, "learning_rate": 7.95254444580094e-06, "loss": 0.1829, "step": 21780 }, { "epoch": 0.6732113478902285, "grad_norm": 0.09804574768999724, "learning_rate": 7.939019139284533e-06, "loss": 0.1903, "step": 21790 }, { "epoch": 0.6735203021572732, "grad_norm": 0.1015705438595519, "learning_rate": 7.925501203845967e-06, "loss": 0.1852, "step": 21800 }, { "epoch": 0.6738292564243178, "grad_norm": 0.10521792775158305, "learning_rate": 7.91199065359683e-06, "loss": 0.1854, "step": 21810 }, { "epoch": 0.6741382106913624, "grad_norm": 0.09240116148105681, "learning_rate": 7.898487502641032e-06, "loss": 0.184, "step": 21820 }, { "epoch": 0.6744471649584071, "grad_norm": 0.09456845391420433, "learning_rate": 7.884991765074722e-06, "loss": 0.1818, "step": 21830 }, { "epoch": 0.6747561192254516, "grad_norm": 0.08608038992547533, "learning_rate": 7.87150345498635e-06, "loss": 0.1832, "step": 21840 }, { "epoch": 0.6750650734924962, "grad_norm": 0.09980873652774314, "learning_rate": 7.85802258645658e-06, "loss": 0.1819, "step": 21850 }, { "epoch": 0.6753740277595409, "grad_norm": 0.10895283523915741, "learning_rate": 7.844549173558315e-06, "loss": 0.1871, "step": 21860 }, { "epoch": 0.6756829820265855, "grad_norm": 0.09950732048600308, "learning_rate": 7.83108323035669e-06, "loss": 0.1804, "step": 21870 }, { "epoch": 0.6759919362936301, "grad_norm": 0.1192050036517003, "learning_rate": 7.81762477090903e-06, "loss": 0.1811, "step": 21880 }, { "epoch": 0.6763008905606748, "grad_norm": 0.10769897254606019, "learning_rate": 7.804173809264841e-06, "loss": 0.1817, "step": 21890 }, { "epoch": 0.6766098448277194, "grad_norm": 0.10617910560379765, "learning_rate": 7.790730359465823e-06, "loss": 0.1803, "step": 21900 }, { "epoch": 0.676918799094764, "grad_norm": 0.08724439637965004, "learning_rate": 7.777294435545808e-06, "loss": 0.1789, "step": 21910 }, { "epoch": 0.6772277533618086, "grad_norm": 0.11793380282645627, "learning_rate": 7.763866051530798e-06, "loss": 0.184, "step": 21920 }, { "epoch": 0.6775367076288532, "grad_norm": 0.09792740654299632, "learning_rate": 7.750445221438906e-06, "loss": 0.1849, "step": 21930 }, { "epoch": 0.6778456618958979, "grad_norm": 0.09669760023692602, "learning_rate": 7.737031959280356e-06, "loss": 0.1815, "step": 21940 }, { "epoch": 0.6781546161629425, "grad_norm": 0.09117546342880395, "learning_rate": 7.723626279057496e-06, "loss": 0.1802, "step": 21950 }, { "epoch": 0.6784635704299871, "grad_norm": 0.11057930039309655, "learning_rate": 7.710228194764727e-06, "loss": 0.1938, "step": 21960 }, { "epoch": 0.6787725246970318, "grad_norm": 0.103987368803885, "learning_rate": 7.696837720388554e-06, "loss": 0.1796, "step": 21970 }, { "epoch": 0.6790814789640763, "grad_norm": 0.08877131667329387, "learning_rate": 7.6834548699075e-06, "loss": 0.184, "step": 21980 }, { "epoch": 0.6793904332311209, "grad_norm": 0.08602755979863845, "learning_rate": 7.67007965729217e-06, "loss": 0.192, "step": 21990 }, { "epoch": 0.6796993874981656, "grad_norm": 0.08188259413918099, "learning_rate": 7.656712096505164e-06, "loss": 0.1907, "step": 22000 }, { "epoch": 0.6800083417652102, "grad_norm": 0.08245297029447374, "learning_rate": 7.64335220150111e-06, "loss": 0.1842, "step": 22010 }, { "epoch": 0.6803172960322548, "grad_norm": 0.08354299187463678, "learning_rate": 7.629999986226626e-06, "loss": 0.1811, "step": 22020 }, { "epoch": 0.6806262502992995, "grad_norm": 0.14769271108737972, "learning_rate": 7.616655464620321e-06, "loss": 0.1895, "step": 22030 }, { "epoch": 0.680935204566344, "grad_norm": 0.09456168217757971, "learning_rate": 7.60331865061276e-06, "loss": 0.1796, "step": 22040 }, { "epoch": 0.6812441588333887, "grad_norm": 0.0878522299972071, "learning_rate": 7.589989558126481e-06, "loss": 0.1834, "step": 22050 }, { "epoch": 0.6815531131004333, "grad_norm": 0.10993209787891756, "learning_rate": 7.576668201075936e-06, "loss": 0.1896, "step": 22060 }, { "epoch": 0.6818620673674779, "grad_norm": 0.09166496995708409, "learning_rate": 7.56335459336753e-06, "loss": 0.181, "step": 22070 }, { "epoch": 0.6821710216345226, "grad_norm": 0.09855574804314984, "learning_rate": 7.550048748899558e-06, "loss": 0.1783, "step": 22080 }, { "epoch": 0.6824799759015672, "grad_norm": 0.09649936095243887, "learning_rate": 7.536750681562211e-06, "loss": 0.1858, "step": 22090 }, { "epoch": 0.6827889301686118, "grad_norm": 0.09165491513749559, "learning_rate": 7.523460405237579e-06, "loss": 0.1875, "step": 22100 }, { "epoch": 0.6830978844356564, "grad_norm": 0.07787733586066546, "learning_rate": 7.5101779337996e-06, "loss": 0.1814, "step": 22110 }, { "epoch": 0.683406838702701, "grad_norm": 0.16490718623260733, "learning_rate": 7.496903281114067e-06, "loss": 0.2018, "step": 22120 }, { "epoch": 0.6837157929697456, "grad_norm": 0.09403627623821387, "learning_rate": 7.483636461038628e-06, "loss": 0.1811, "step": 22130 }, { "epoch": 0.6840247472367903, "grad_norm": 0.09960587837256916, "learning_rate": 7.470377487422727e-06, "loss": 0.1811, "step": 22140 }, { "epoch": 0.6843337015038349, "grad_norm": 0.10772946164623394, "learning_rate": 7.457126374107646e-06, "loss": 0.1824, "step": 22150 }, { "epoch": 0.6846426557708795, "grad_norm": 0.09580612300921638, "learning_rate": 7.4438831349264415e-06, "loss": 0.1792, "step": 22160 }, { "epoch": 0.6849516100379242, "grad_norm": 0.09591036425310501, "learning_rate": 7.430647783703949e-06, "loss": 0.185, "step": 22170 }, { "epoch": 0.6852605643049687, "grad_norm": 0.08932953414207497, "learning_rate": 7.417420334256787e-06, "loss": 0.1834, "step": 22180 }, { "epoch": 0.6855695185720134, "grad_norm": 0.0992893464161745, "learning_rate": 7.404200800393305e-06, "loss": 0.1832, "step": 22190 }, { "epoch": 0.685878472839058, "grad_norm": 0.10052745564077986, "learning_rate": 7.39098919591361e-06, "loss": 0.1877, "step": 22200 }, { "epoch": 0.6861874271061026, "grad_norm": 0.1131068532052622, "learning_rate": 7.377785534609509e-06, "loss": 0.181, "step": 22210 }, { "epoch": 0.6864963813731473, "grad_norm": 0.10094189745848517, "learning_rate": 7.364589830264537e-06, "loss": 0.1816, "step": 22220 }, { "epoch": 0.6868053356401919, "grad_norm": 0.09653480119282912, "learning_rate": 7.3514020966539065e-06, "loss": 0.1815, "step": 22230 }, { "epoch": 0.6871142899072364, "grad_norm": 0.0993360220318092, "learning_rate": 7.338222347544526e-06, "loss": 0.1823, "step": 22240 }, { "epoch": 0.6874232441742811, "grad_norm": 0.08422552040154874, "learning_rate": 7.325050596694947e-06, "loss": 0.1866, "step": 22250 }, { "epoch": 0.6877321984413257, "grad_norm": 0.10108479744433081, "learning_rate": 7.311886857855398e-06, "loss": 0.1833, "step": 22260 }, { "epoch": 0.6880411527083703, "grad_norm": 0.1026968384760547, "learning_rate": 7.2987311447677155e-06, "loss": 0.1849, "step": 22270 }, { "epoch": 0.688350106975415, "grad_norm": 0.10249702981191872, "learning_rate": 7.285583471165385e-06, "loss": 0.187, "step": 22280 }, { "epoch": 0.6886590612424596, "grad_norm": 0.11137966018067741, "learning_rate": 7.272443850773474e-06, "loss": 0.1865, "step": 22290 }, { "epoch": 0.6889680155095042, "grad_norm": 0.10304373007231557, "learning_rate": 7.259312297308665e-06, "loss": 0.1763, "step": 22300 }, { "epoch": 0.6892769697765488, "grad_norm": 0.09274915547043855, "learning_rate": 7.2461888244792075e-06, "loss": 0.1797, "step": 22310 }, { "epoch": 0.6895859240435934, "grad_norm": 0.10089718809079859, "learning_rate": 7.233073445984917e-06, "loss": 0.1882, "step": 22320 }, { "epoch": 0.6898948783106381, "grad_norm": 0.10824528144194742, "learning_rate": 7.219966175517151e-06, "loss": 0.1842, "step": 22330 }, { "epoch": 0.6902038325776827, "grad_norm": 0.10557538951085883, "learning_rate": 7.206867026758828e-06, "loss": 0.1861, "step": 22340 }, { "epoch": 0.6905127868447273, "grad_norm": 0.12675439247659584, "learning_rate": 7.193776013384355e-06, "loss": 0.1815, "step": 22350 }, { "epoch": 0.690821741111772, "grad_norm": 0.093012673616093, "learning_rate": 7.180693149059679e-06, "loss": 0.1811, "step": 22360 }, { "epoch": 0.6911306953788166, "grad_norm": 0.11022055746496426, "learning_rate": 7.167618447442212e-06, "loss": 0.1836, "step": 22370 }, { "epoch": 0.6914396496458611, "grad_norm": 0.10088552228914249, "learning_rate": 7.154551922180864e-06, "loss": 0.1831, "step": 22380 }, { "epoch": 0.6917486039129058, "grad_norm": 0.07829621733751456, "learning_rate": 7.141493586916004e-06, "loss": 0.1826, "step": 22390 }, { "epoch": 0.6920575581799504, "grad_norm": 0.0889419944283404, "learning_rate": 7.12844345527944e-06, "loss": 0.1835, "step": 22400 }, { "epoch": 0.692366512446995, "grad_norm": 0.10599624941114881, "learning_rate": 7.115401540894437e-06, "loss": 0.1912, "step": 22410 }, { "epoch": 0.6926754667140397, "grad_norm": 0.10729721552355075, "learning_rate": 7.102367857375663e-06, "loss": 0.208, "step": 22420 }, { "epoch": 0.6929844209810843, "grad_norm": 0.1002851903157931, "learning_rate": 7.08934241832921e-06, "loss": 0.1844, "step": 22430 }, { "epoch": 0.6932933752481288, "grad_norm": 0.10067248745974343, "learning_rate": 7.076325237352543e-06, "loss": 0.1783, "step": 22440 }, { "epoch": 0.6936023295151735, "grad_norm": 0.09227533535580679, "learning_rate": 7.063316328034533e-06, "loss": 0.1823, "step": 22450 }, { "epoch": 0.6939112837822181, "grad_norm": 0.13208989858189216, "learning_rate": 7.050315703955386e-06, "loss": 0.1803, "step": 22460 }, { "epoch": 0.6942202380492628, "grad_norm": 0.11043560756916097, "learning_rate": 7.03732337868669e-06, "loss": 0.1814, "step": 22470 }, { "epoch": 0.6945291923163074, "grad_norm": 0.11219602108245105, "learning_rate": 7.024339365791338e-06, "loss": 0.1871, "step": 22480 }, { "epoch": 0.694838146583352, "grad_norm": 0.0909153194407292, "learning_rate": 7.011363678823577e-06, "loss": 0.1898, "step": 22490 }, { "epoch": 0.6951471008503967, "grad_norm": 0.0930940860533655, "learning_rate": 6.998396331328933e-06, "loss": 0.1844, "step": 22500 }, { "epoch": 0.6954560551174412, "grad_norm": 0.09377328331032149, "learning_rate": 6.985437336844253e-06, "loss": 0.1813, "step": 22510 }, { "epoch": 0.6957650093844858, "grad_norm": 0.10876888713001147, "learning_rate": 6.972486708897638e-06, "loss": 0.1797, "step": 22520 }, { "epoch": 0.6960739636515305, "grad_norm": 0.09387171624159234, "learning_rate": 6.959544461008483e-06, "loss": 0.1814, "step": 22530 }, { "epoch": 0.6963829179185751, "grad_norm": 0.13612121256321463, "learning_rate": 6.9466106066874115e-06, "loss": 0.2073, "step": 22540 }, { "epoch": 0.6966918721856197, "grad_norm": 0.09290309901674741, "learning_rate": 6.933685159436296e-06, "loss": 0.183, "step": 22550 }, { "epoch": 0.6970008264526644, "grad_norm": 0.0951727569438628, "learning_rate": 6.920768132748224e-06, "loss": 0.1825, "step": 22560 }, { "epoch": 0.697309780719709, "grad_norm": 0.09333447460953703, "learning_rate": 6.90785954010751e-06, "loss": 0.1785, "step": 22570 }, { "epoch": 0.6976187349867535, "grad_norm": 0.09795160090038436, "learning_rate": 6.894959394989643e-06, "loss": 0.1797, "step": 22580 }, { "epoch": 0.6979276892537982, "grad_norm": 0.1005629358636702, "learning_rate": 6.882067710861313e-06, "loss": 0.1782, "step": 22590 }, { "epoch": 0.6982366435208428, "grad_norm": 0.10495606872393157, "learning_rate": 6.8691845011803585e-06, "loss": 0.181, "step": 22600 }, { "epoch": 0.6985455977878875, "grad_norm": 0.09891331134426463, "learning_rate": 6.85630977939579e-06, "loss": 0.1837, "step": 22610 }, { "epoch": 0.6988545520549321, "grad_norm": 0.09608712038311525, "learning_rate": 6.843443558947744e-06, "loss": 0.1826, "step": 22620 }, { "epoch": 0.6991635063219767, "grad_norm": 0.10823888714310015, "learning_rate": 6.8305858532674825e-06, "loss": 0.1812, "step": 22630 }, { "epoch": 0.6994724605890213, "grad_norm": 0.11483176949334299, "learning_rate": 6.81773667577739e-06, "loss": 0.1863, "step": 22640 }, { "epoch": 0.6997814148560659, "grad_norm": 0.12046064900719501, "learning_rate": 6.804896039890933e-06, "loss": 0.1841, "step": 22650 }, { "epoch": 0.7000903691231105, "grad_norm": 0.10013959191086173, "learning_rate": 6.792063959012681e-06, "loss": 0.1832, "step": 22660 }, { "epoch": 0.7003993233901552, "grad_norm": 0.09003294217810345, "learning_rate": 6.779240446538248e-06, "loss": 0.1863, "step": 22670 }, { "epoch": 0.7007082776571998, "grad_norm": 0.12499293314608835, "learning_rate": 6.766425515854328e-06, "loss": 0.1812, "step": 22680 }, { "epoch": 0.7010172319242444, "grad_norm": 0.14008647305253966, "learning_rate": 6.753619180338634e-06, "loss": 0.1767, "step": 22690 }, { "epoch": 0.7013261861912891, "grad_norm": 0.10941872997400424, "learning_rate": 6.740821453359928e-06, "loss": 0.1826, "step": 22700 }, { "epoch": 0.7016351404583336, "grad_norm": 0.10831878593425659, "learning_rate": 6.728032348277963e-06, "loss": 0.1889, "step": 22710 }, { "epoch": 0.7019440947253783, "grad_norm": 0.0946684603744815, "learning_rate": 6.7152518784435175e-06, "loss": 0.1849, "step": 22720 }, { "epoch": 0.7022530489924229, "grad_norm": 0.10259548829572136, "learning_rate": 6.702480057198325e-06, "loss": 0.1898, "step": 22730 }, { "epoch": 0.7025620032594675, "grad_norm": 0.10224576886689879, "learning_rate": 6.68971689787512e-06, "loss": 0.1845, "step": 22740 }, { "epoch": 0.7028709575265122, "grad_norm": 0.09881312799382255, "learning_rate": 6.676962413797575e-06, "loss": 0.1811, "step": 22750 }, { "epoch": 0.7031799117935568, "grad_norm": 0.0934974541588881, "learning_rate": 6.664216618280309e-06, "loss": 0.1873, "step": 22760 }, { "epoch": 0.7034888660606013, "grad_norm": 0.09846852501265885, "learning_rate": 6.6514795246288834e-06, "loss": 0.1858, "step": 22770 }, { "epoch": 0.703797820327646, "grad_norm": 0.0810910375668225, "learning_rate": 6.638751146139762e-06, "loss": 0.1865, "step": 22780 }, { "epoch": 0.7041067745946906, "grad_norm": 0.10243833744847768, "learning_rate": 6.626031496100308e-06, "loss": 0.1903, "step": 22790 }, { "epoch": 0.7044157288617352, "grad_norm": 0.10522763872225695, "learning_rate": 6.613320587788794e-06, "loss": 0.1799, "step": 22800 }, { "epoch": 0.7047246831287799, "grad_norm": 0.08690977005698929, "learning_rate": 6.600618434474339e-06, "loss": 0.1789, "step": 22810 }, { "epoch": 0.7050336373958245, "grad_norm": 0.11267023576935571, "learning_rate": 6.587925049416951e-06, "loss": 0.1788, "step": 22820 }, { "epoch": 0.7053425916628691, "grad_norm": 0.1401753366454194, "learning_rate": 6.575240445867455e-06, "loss": 0.194, "step": 22830 }, { "epoch": 0.7056515459299137, "grad_norm": 0.11646166361342897, "learning_rate": 6.56256463706754e-06, "loss": 0.1848, "step": 22840 }, { "epoch": 0.7059605001969583, "grad_norm": 0.08855345720613042, "learning_rate": 6.5498976362496884e-06, "loss": 0.1844, "step": 22850 }, { "epoch": 0.706269454464003, "grad_norm": 0.09725128694436247, "learning_rate": 6.537239456637193e-06, "loss": 0.1878, "step": 22860 }, { "epoch": 0.7065784087310476, "grad_norm": 0.08664039348579067, "learning_rate": 6.524590111444156e-06, "loss": 0.1843, "step": 22870 }, { "epoch": 0.7068873629980922, "grad_norm": 0.11022183963189743, "learning_rate": 6.5119496138754295e-06, "loss": 0.1834, "step": 22880 }, { "epoch": 0.7071963172651369, "grad_norm": 0.0818328358931418, "learning_rate": 6.499317977126656e-06, "loss": 0.1792, "step": 22890 }, { "epoch": 0.7075052715321815, "grad_norm": 0.09874177970560724, "learning_rate": 6.486695214384205e-06, "loss": 0.1843, "step": 22900 }, { "epoch": 0.707814225799226, "grad_norm": 0.11297624589105548, "learning_rate": 6.474081338825202e-06, "loss": 0.1844, "step": 22910 }, { "epoch": 0.7081231800662707, "grad_norm": 0.10243376858465415, "learning_rate": 6.461476363617477e-06, "loss": 0.1882, "step": 22920 }, { "epoch": 0.7084321343333153, "grad_norm": 0.0840623172311264, "learning_rate": 6.448880301919589e-06, "loss": 0.1814, "step": 22930 }, { "epoch": 0.7087410886003599, "grad_norm": 0.10230810556633248, "learning_rate": 6.436293166880767e-06, "loss": 0.1894, "step": 22940 }, { "epoch": 0.7090500428674046, "grad_norm": 0.09772364321195781, "learning_rate": 6.4237149716409475e-06, "loss": 0.1839, "step": 22950 }, { "epoch": 0.7093589971344492, "grad_norm": 0.11072981532651247, "learning_rate": 6.411145729330711e-06, "loss": 0.1836, "step": 22960 }, { "epoch": 0.7096679514014937, "grad_norm": 0.09011194193047975, "learning_rate": 6.398585453071316e-06, "loss": 0.179, "step": 22970 }, { "epoch": 0.7099769056685384, "grad_norm": 0.10363318374429248, "learning_rate": 6.38603415597464e-06, "loss": 0.1818, "step": 22980 }, { "epoch": 0.710285859935583, "grad_norm": 0.08146492866877648, "learning_rate": 6.37349185114319e-06, "loss": 0.1767, "step": 22990 }, { "epoch": 0.7105948142026277, "grad_norm": 0.09431933489269974, "learning_rate": 6.360958551670104e-06, "loss": 0.1907, "step": 23000 }, { "epoch": 0.7109037684696723, "grad_norm": 0.11048345189042445, "learning_rate": 6.348434270639098e-06, "loss": 0.1845, "step": 23010 }, { "epoch": 0.7112127227367169, "grad_norm": 0.13106656928455201, "learning_rate": 6.335919021124478e-06, "loss": 0.1916, "step": 23020 }, { "epoch": 0.7115216770037616, "grad_norm": 0.0980751235847233, "learning_rate": 6.3234128161911374e-06, "loss": 0.1833, "step": 23030 }, { "epoch": 0.7118306312708061, "grad_norm": 0.10255968816851609, "learning_rate": 6.310915668894501e-06, "loss": 0.1824, "step": 23040 }, { "epoch": 0.7121395855378507, "grad_norm": 0.08728828922083484, "learning_rate": 6.298427592280569e-06, "loss": 0.1814, "step": 23050 }, { "epoch": 0.7124485398048954, "grad_norm": 0.10542018314338772, "learning_rate": 6.285948599385842e-06, "loss": 0.1835, "step": 23060 }, { "epoch": 0.71275749407194, "grad_norm": 0.08436415370702952, "learning_rate": 6.273478703237365e-06, "loss": 0.1858, "step": 23070 }, { "epoch": 0.7130664483389846, "grad_norm": 0.1022259595350936, "learning_rate": 6.261017916852668e-06, "loss": 0.1762, "step": 23080 }, { "epoch": 0.7133754026060293, "grad_norm": 0.11046728828954396, "learning_rate": 6.248566253239774e-06, "loss": 0.1834, "step": 23090 }, { "epoch": 0.7136843568730739, "grad_norm": 0.08697901231904889, "learning_rate": 6.236123725397196e-06, "loss": 0.1835, "step": 23100 }, { "epoch": 0.7139933111401184, "grad_norm": 0.09245674162671554, "learning_rate": 6.223690346313888e-06, "loss": 0.1809, "step": 23110 }, { "epoch": 0.7143022654071631, "grad_norm": 0.1312160942879207, "learning_rate": 6.211266128969278e-06, "loss": 0.1826, "step": 23120 }, { "epoch": 0.7146112196742077, "grad_norm": 0.0926064562348625, "learning_rate": 6.198851086333206e-06, "loss": 0.1834, "step": 23130 }, { "epoch": 0.7149201739412524, "grad_norm": 0.10756064976094376, "learning_rate": 6.186445231365957e-06, "loss": 0.1824, "step": 23140 }, { "epoch": 0.715229128208297, "grad_norm": 0.11918399401915028, "learning_rate": 6.174048577018203e-06, "loss": 0.1857, "step": 23150 }, { "epoch": 0.7155380824753416, "grad_norm": 0.10214871420915306, "learning_rate": 6.161661136231031e-06, "loss": 0.1808, "step": 23160 }, { "epoch": 0.7158470367423863, "grad_norm": 0.09120118719203635, "learning_rate": 6.149282921935893e-06, "loss": 0.1791, "step": 23170 }, { "epoch": 0.7161559910094308, "grad_norm": 0.10105557237813377, "learning_rate": 6.136913947054626e-06, "loss": 0.1818, "step": 23180 }, { "epoch": 0.7164649452764754, "grad_norm": 0.08502730000820673, "learning_rate": 6.124554224499403e-06, "loss": 0.1851, "step": 23190 }, { "epoch": 0.7167738995435201, "grad_norm": 0.10244948066698438, "learning_rate": 6.112203767172758e-06, "loss": 0.1851, "step": 23200 }, { "epoch": 0.7170828538105647, "grad_norm": 0.09565504810996643, "learning_rate": 6.099862587967538e-06, "loss": 0.1877, "step": 23210 }, { "epoch": 0.7173918080776093, "grad_norm": 0.09854345398680876, "learning_rate": 6.087530699766906e-06, "loss": 0.1788, "step": 23220 }, { "epoch": 0.717700762344654, "grad_norm": 0.11502196801484005, "learning_rate": 6.075208115444337e-06, "loss": 0.1885, "step": 23230 }, { "epoch": 0.7180097166116985, "grad_norm": 0.09624064877420897, "learning_rate": 6.062894847863584e-06, "loss": 0.1927, "step": 23240 }, { "epoch": 0.7183186708787431, "grad_norm": 0.11049030334103088, "learning_rate": 6.050590909878668e-06, "loss": 0.1872, "step": 23250 }, { "epoch": 0.7186276251457878, "grad_norm": 0.08633503283061199, "learning_rate": 6.038296314333891e-06, "loss": 0.1787, "step": 23260 }, { "epoch": 0.7189365794128324, "grad_norm": 0.09875871867168223, "learning_rate": 6.0260110740637804e-06, "loss": 0.185, "step": 23270 }, { "epoch": 0.7192455336798771, "grad_norm": 0.08476965760872901, "learning_rate": 6.013735201893116e-06, "loss": 0.1803, "step": 23280 }, { "epoch": 0.7195544879469217, "grad_norm": 0.10772440899999361, "learning_rate": 6.001468710636878e-06, "loss": 0.1864, "step": 23290 }, { "epoch": 0.7198634422139663, "grad_norm": 0.09776798378035491, "learning_rate": 5.989211613100279e-06, "loss": 0.1822, "step": 23300 }, { "epoch": 0.720172396481011, "grad_norm": 0.09050535034739404, "learning_rate": 5.976963922078703e-06, "loss": 0.1835, "step": 23310 }, { "epoch": 0.7204813507480555, "grad_norm": 0.10821419772111825, "learning_rate": 5.964725650357717e-06, "loss": 0.1899, "step": 23320 }, { "epoch": 0.7207903050151001, "grad_norm": 0.0870279225799357, "learning_rate": 5.952496810713073e-06, "loss": 0.1801, "step": 23330 }, { "epoch": 0.7210992592821448, "grad_norm": 0.09399342395698655, "learning_rate": 5.940277415910654e-06, "loss": 0.1838, "step": 23340 }, { "epoch": 0.7214082135491894, "grad_norm": 0.09643234650439193, "learning_rate": 5.928067478706503e-06, "loss": 0.1809, "step": 23350 }, { "epoch": 0.721717167816234, "grad_norm": 0.09629637524672571, "learning_rate": 5.91586701184677e-06, "loss": 0.18, "step": 23360 }, { "epoch": 0.7220261220832787, "grad_norm": 0.09717067559422377, "learning_rate": 5.9036760280677395e-06, "loss": 0.1828, "step": 23370 }, { "epoch": 0.7223350763503232, "grad_norm": 0.09057261004876756, "learning_rate": 5.89149454009578e-06, "loss": 0.1916, "step": 23380 }, { "epoch": 0.7226440306173678, "grad_norm": 0.1301195912918232, "learning_rate": 5.879322560647361e-06, "loss": 0.1836, "step": 23390 }, { "epoch": 0.7229529848844125, "grad_norm": 0.09215293888878717, "learning_rate": 5.867160102429008e-06, "loss": 0.1855, "step": 23400 }, { "epoch": 0.7232619391514571, "grad_norm": 0.11605752935290935, "learning_rate": 5.85500717813733e-06, "loss": 0.1834, "step": 23410 }, { "epoch": 0.7235708934185018, "grad_norm": 0.09248674661172235, "learning_rate": 5.8428638004589576e-06, "loss": 0.1767, "step": 23420 }, { "epoch": 0.7238798476855464, "grad_norm": 0.1236332915964266, "learning_rate": 5.830729982070582e-06, "loss": 0.1817, "step": 23430 }, { "epoch": 0.7241888019525909, "grad_norm": 0.08963978029874643, "learning_rate": 5.818605735638895e-06, "loss": 0.1822, "step": 23440 }, { "epoch": 0.7244977562196356, "grad_norm": 0.08748669932058856, "learning_rate": 5.806491073820604e-06, "loss": 0.1778, "step": 23450 }, { "epoch": 0.7248067104866802, "grad_norm": 0.10141517797103793, "learning_rate": 5.794386009262408e-06, "loss": 0.1901, "step": 23460 }, { "epoch": 0.7251156647537248, "grad_norm": 0.09028198862404607, "learning_rate": 5.782290554600997e-06, "loss": 0.1842, "step": 23470 }, { "epoch": 0.7254246190207695, "grad_norm": 0.107960057694832, "learning_rate": 5.770204722463011e-06, "loss": 0.1853, "step": 23480 }, { "epoch": 0.7257335732878141, "grad_norm": 0.10440752222594185, "learning_rate": 5.758128525465066e-06, "loss": 0.1805, "step": 23490 }, { "epoch": 0.7260425275548587, "grad_norm": 0.08911869025863042, "learning_rate": 5.7460619762137e-06, "loss": 0.1835, "step": 23500 }, { "epoch": 0.7263514818219033, "grad_norm": 0.10135969852159145, "learning_rate": 5.734005087305397e-06, "loss": 0.1817, "step": 23510 }, { "epoch": 0.7266604360889479, "grad_norm": 0.10504601399610826, "learning_rate": 5.7219578713265394e-06, "loss": 0.1852, "step": 23520 }, { "epoch": 0.7269693903559925, "grad_norm": 0.10418614054488268, "learning_rate": 5.70992034085343e-06, "loss": 0.1818, "step": 23530 }, { "epoch": 0.7272783446230372, "grad_norm": 0.10266555162815832, "learning_rate": 5.6978925084522474e-06, "loss": 0.182, "step": 23540 }, { "epoch": 0.7275872988900818, "grad_norm": 0.11505602967127468, "learning_rate": 5.6858743866790436e-06, "loss": 0.1808, "step": 23550 }, { "epoch": 0.7278962531571265, "grad_norm": 0.09420727352399028, "learning_rate": 5.673865988079752e-06, "loss": 0.1819, "step": 23560 }, { "epoch": 0.728205207424171, "grad_norm": 0.12686431057703784, "learning_rate": 5.661867325190131e-06, "loss": 0.1822, "step": 23570 }, { "epoch": 0.7285141616912156, "grad_norm": 0.10894260390245611, "learning_rate": 5.6498784105358e-06, "loss": 0.1993, "step": 23580 }, { "epoch": 0.7288231159582603, "grad_norm": 0.10968589067160978, "learning_rate": 5.637899256632181e-06, "loss": 0.1889, "step": 23590 }, { "epoch": 0.7291320702253049, "grad_norm": 0.08247539447465888, "learning_rate": 5.625929875984526e-06, "loss": 0.1799, "step": 23600 }, { "epoch": 0.7294410244923495, "grad_norm": 0.1053181577897392, "learning_rate": 5.613970281087865e-06, "loss": 0.1809, "step": 23610 }, { "epoch": 0.7297499787593942, "grad_norm": 0.1057329141337531, "learning_rate": 5.602020484427031e-06, "loss": 0.186, "step": 23620 }, { "epoch": 0.7300589330264388, "grad_norm": 0.10549607851658224, "learning_rate": 5.5900804984766105e-06, "loss": 0.1825, "step": 23630 }, { "epoch": 0.7303678872934833, "grad_norm": 0.1304769295554578, "learning_rate": 5.578150335700968e-06, "loss": 0.1824, "step": 23640 }, { "epoch": 0.730676841560528, "grad_norm": 0.09441833755987829, "learning_rate": 5.566230008554193e-06, "loss": 0.179, "step": 23650 }, { "epoch": 0.7309857958275726, "grad_norm": 0.0873047333241518, "learning_rate": 5.5543195294801266e-06, "loss": 0.1834, "step": 23660 }, { "epoch": 0.7312947500946172, "grad_norm": 0.0978571906611976, "learning_rate": 5.5424189109123145e-06, "loss": 0.1837, "step": 23670 }, { "epoch": 0.7316037043616619, "grad_norm": 0.09212224170428467, "learning_rate": 5.530528165274017e-06, "loss": 0.1853, "step": 23680 }, { "epoch": 0.7319126586287065, "grad_norm": 0.09675995488973269, "learning_rate": 5.518647304978179e-06, "loss": 0.1793, "step": 23690 }, { "epoch": 0.7322216128957512, "grad_norm": 0.09594312676716647, "learning_rate": 5.506776342427443e-06, "loss": 0.1905, "step": 23700 }, { "epoch": 0.7325305671627957, "grad_norm": 0.08795201298524304, "learning_rate": 5.494915290014095e-06, "loss": 0.1767, "step": 23710 }, { "epoch": 0.7328395214298403, "grad_norm": 0.08975525669259507, "learning_rate": 5.483064160120104e-06, "loss": 0.1785, "step": 23720 }, { "epoch": 0.733148475696885, "grad_norm": 0.09798393327238356, "learning_rate": 5.471222965117053e-06, "loss": 0.1844, "step": 23730 }, { "epoch": 0.7334574299639296, "grad_norm": 0.08887985967333485, "learning_rate": 5.459391717366179e-06, "loss": 0.1848, "step": 23740 }, { "epoch": 0.7337663842309742, "grad_norm": 0.09556267467208845, "learning_rate": 5.447570429218309e-06, "loss": 0.1892, "step": 23750 }, { "epoch": 0.7340753384980189, "grad_norm": 0.09954155983837434, "learning_rate": 5.435759113013898e-06, "loss": 0.1829, "step": 23760 }, { "epoch": 0.7343842927650635, "grad_norm": 0.09170270036599845, "learning_rate": 5.423957781082975e-06, "loss": 0.1817, "step": 23770 }, { "epoch": 0.734693247032108, "grad_norm": 0.10152918256092645, "learning_rate": 5.412166445745146e-06, "loss": 0.1784, "step": 23780 }, { "epoch": 0.7350022012991527, "grad_norm": 0.10255613442004073, "learning_rate": 5.400385119309595e-06, "loss": 0.1861, "step": 23790 }, { "epoch": 0.7353111555661973, "grad_norm": 0.08830756639633765, "learning_rate": 5.388613814075038e-06, "loss": 0.1815, "step": 23800 }, { "epoch": 0.7356201098332419, "grad_norm": 0.0935694713904321, "learning_rate": 5.3768525423297535e-06, "loss": 0.193, "step": 23810 }, { "epoch": 0.7359290641002866, "grad_norm": 0.1028806529008077, "learning_rate": 5.36510131635152e-06, "loss": 0.1816, "step": 23820 }, { "epoch": 0.7362380183673312, "grad_norm": 0.0877809893886209, "learning_rate": 5.353360148407655e-06, "loss": 0.1856, "step": 23830 }, { "epoch": 0.7365469726343759, "grad_norm": 0.11192160145740775, "learning_rate": 5.34162905075495e-06, "loss": 0.1843, "step": 23840 }, { "epoch": 0.7368559269014204, "grad_norm": 0.1148928991099739, "learning_rate": 5.329908035639707e-06, "loss": 0.1818, "step": 23850 }, { "epoch": 0.737164881168465, "grad_norm": 0.10522764460026213, "learning_rate": 5.318197115297686e-06, "loss": 0.1818, "step": 23860 }, { "epoch": 0.7374738354355097, "grad_norm": 0.12476438434685744, "learning_rate": 5.3064963019541224e-06, "loss": 0.1851, "step": 23870 }, { "epoch": 0.7377827897025543, "grad_norm": 0.10183551431313007, "learning_rate": 5.294805607823691e-06, "loss": 0.1775, "step": 23880 }, { "epoch": 0.7380917439695989, "grad_norm": 0.10808010473544906, "learning_rate": 5.283125045110499e-06, "loss": 0.1833, "step": 23890 }, { "epoch": 0.7384006982366436, "grad_norm": 0.09474897606017593, "learning_rate": 5.271454626008097e-06, "loss": 0.1799, "step": 23900 }, { "epoch": 0.7387096525036881, "grad_norm": 0.08593317067663166, "learning_rate": 5.2597943626994244e-06, "loss": 0.1816, "step": 23910 }, { "epoch": 0.7390186067707327, "grad_norm": 0.09640191337609255, "learning_rate": 5.248144267356827e-06, "loss": 0.1804, "step": 23920 }, { "epoch": 0.7393275610377774, "grad_norm": 0.08851606579777704, "learning_rate": 5.236504352142046e-06, "loss": 0.182, "step": 23930 }, { "epoch": 0.739636515304822, "grad_norm": 0.09498005045178236, "learning_rate": 5.224874629206176e-06, "loss": 0.185, "step": 23940 }, { "epoch": 0.7399454695718666, "grad_norm": 0.09617212108109258, "learning_rate": 5.213255110689694e-06, "loss": 0.1837, "step": 23950 }, { "epoch": 0.7402544238389113, "grad_norm": 0.09896703733829941, "learning_rate": 5.201645808722399e-06, "loss": 0.1826, "step": 23960 }, { "epoch": 0.7405633781059558, "grad_norm": 0.0959834731669881, "learning_rate": 5.190046735423453e-06, "loss": 0.1821, "step": 23970 }, { "epoch": 0.7408723323730005, "grad_norm": 0.09230008342047381, "learning_rate": 5.178457902901312e-06, "loss": 0.1801, "step": 23980 }, { "epoch": 0.7411812866400451, "grad_norm": 0.10128022750500347, "learning_rate": 5.166879323253766e-06, "loss": 0.1843, "step": 23990 }, { "epoch": 0.7414902409070897, "grad_norm": 0.0995053570730854, "learning_rate": 5.155311008567888e-06, "loss": 0.1793, "step": 24000 }, { "epoch": 0.7417991951741344, "grad_norm": 0.11384768264514729, "learning_rate": 5.143752970920031e-06, "loss": 0.1848, "step": 24010 }, { "epoch": 0.742108149441179, "grad_norm": 0.09487273464577489, "learning_rate": 5.132205222375837e-06, "loss": 0.1822, "step": 24020 }, { "epoch": 0.7424171037082236, "grad_norm": 0.09016893581053735, "learning_rate": 5.1206677749901884e-06, "loss": 0.1819, "step": 24030 }, { "epoch": 0.7427260579752683, "grad_norm": 0.11529766024246506, "learning_rate": 5.109140640807231e-06, "loss": 0.1867, "step": 24040 }, { "epoch": 0.7430350122423128, "grad_norm": 0.07877091685317325, "learning_rate": 5.097623831860326e-06, "loss": 0.1813, "step": 24050 }, { "epoch": 0.7433439665093574, "grad_norm": 0.09752304307877216, "learning_rate": 5.086117360172075e-06, "loss": 0.1811, "step": 24060 }, { "epoch": 0.7436529207764021, "grad_norm": 0.0952612245690614, "learning_rate": 5.074621237754271e-06, "loss": 0.1835, "step": 24070 }, { "epoch": 0.7439618750434467, "grad_norm": 0.08884004176471252, "learning_rate": 5.063135476607921e-06, "loss": 0.181, "step": 24080 }, { "epoch": 0.7442708293104913, "grad_norm": 0.11584434045708043, "learning_rate": 5.051660088723196e-06, "loss": 0.1889, "step": 24090 }, { "epoch": 0.744579783577536, "grad_norm": 0.10743692729312139, "learning_rate": 5.040195086079457e-06, "loss": 0.1877, "step": 24100 }, { "epoch": 0.7448887378445805, "grad_norm": 0.1090829668196744, "learning_rate": 5.028740480645211e-06, "loss": 0.1843, "step": 24110 }, { "epoch": 0.7451976921116252, "grad_norm": 0.09412843117885072, "learning_rate": 5.017296284378112e-06, "loss": 0.1844, "step": 24120 }, { "epoch": 0.7455066463786698, "grad_norm": 0.098847194500982, "learning_rate": 5.0058625092249585e-06, "loss": 0.1817, "step": 24130 }, { "epoch": 0.7458156006457144, "grad_norm": 0.09326719430806776, "learning_rate": 4.994439167121656e-06, "loss": 0.1826, "step": 24140 }, { "epoch": 0.7461245549127591, "grad_norm": 0.11397199967200321, "learning_rate": 4.983026269993225e-06, "loss": 0.1856, "step": 24150 }, { "epoch": 0.7464335091798037, "grad_norm": 5.363108996119458, "learning_rate": 4.971623829753788e-06, "loss": 0.2017, "step": 24160 }, { "epoch": 0.7467424634468482, "grad_norm": 0.09358995210467394, "learning_rate": 4.960231858306539e-06, "loss": 0.178, "step": 24170 }, { "epoch": 0.7470514177138929, "grad_norm": 0.09567347404863191, "learning_rate": 4.948850367543761e-06, "loss": 0.1817, "step": 24180 }, { "epoch": 0.7473603719809375, "grad_norm": 0.09588836437827075, "learning_rate": 4.937479369346772e-06, "loss": 0.1846, "step": 24190 }, { "epoch": 0.7476693262479821, "grad_norm": 0.1070859782676116, "learning_rate": 4.926118875585962e-06, "loss": 0.1789, "step": 24200 }, { "epoch": 0.7479782805150268, "grad_norm": 0.0972454782099022, "learning_rate": 4.914768898120735e-06, "loss": 0.1863, "step": 24210 }, { "epoch": 0.7482872347820714, "grad_norm": 0.08375075180811167, "learning_rate": 4.903429448799533e-06, "loss": 0.1815, "step": 24220 }, { "epoch": 0.748596189049116, "grad_norm": 0.09041052362777277, "learning_rate": 4.892100539459795e-06, "loss": 0.1808, "step": 24230 }, { "epoch": 0.7489051433161606, "grad_norm": 0.14079957159817338, "learning_rate": 4.880782181927958e-06, "loss": 0.1889, "step": 24240 }, { "epoch": 0.7492140975832052, "grad_norm": 0.08887254059664551, "learning_rate": 4.869474388019457e-06, "loss": 0.1898, "step": 24250 }, { "epoch": 0.7495230518502499, "grad_norm": 0.10819632457395752, "learning_rate": 4.85817716953868e-06, "loss": 0.1824, "step": 24260 }, { "epoch": 0.7498320061172945, "grad_norm": 0.09821873041202388, "learning_rate": 4.846890538278995e-06, "loss": 0.1862, "step": 24270 }, { "epoch": 0.7501409603843391, "grad_norm": 0.10241921776075516, "learning_rate": 4.835614506022698e-06, "loss": 0.1861, "step": 24280 }, { "epoch": 0.7504499146513838, "grad_norm": 0.12094620874012614, "learning_rate": 4.82434908454104e-06, "loss": 0.186, "step": 24290 }, { "epoch": 0.7507588689184284, "grad_norm": 0.10047937217461048, "learning_rate": 4.813094285594178e-06, "loss": 0.1822, "step": 24300 }, { "epoch": 0.7510678231854729, "grad_norm": 0.11834769443848707, "learning_rate": 4.801850120931198e-06, "loss": 0.1853, "step": 24310 }, { "epoch": 0.7513767774525176, "grad_norm": 0.09511623380065012, "learning_rate": 4.790616602290061e-06, "loss": 0.1815, "step": 24320 }, { "epoch": 0.7516857317195622, "grad_norm": 0.14707564861017536, "learning_rate": 4.77939374139764e-06, "loss": 0.1874, "step": 24330 }, { "epoch": 0.7519946859866068, "grad_norm": 0.09398758028656463, "learning_rate": 4.768181549969664e-06, "loss": 0.1823, "step": 24340 }, { "epoch": 0.7523036402536515, "grad_norm": 0.11256436160827804, "learning_rate": 4.756980039710728e-06, "loss": 0.1821, "step": 24350 }, { "epoch": 0.7526125945206961, "grad_norm": 0.09078154595153126, "learning_rate": 4.745789222314285e-06, "loss": 0.1926, "step": 24360 }, { "epoch": 0.7529215487877406, "grad_norm": 0.10197504872218038, "learning_rate": 4.734609109462617e-06, "loss": 0.1885, "step": 24370 }, { "epoch": 0.7532305030547853, "grad_norm": 0.09246076799905716, "learning_rate": 4.723439712826827e-06, "loss": 0.1841, "step": 24380 }, { "epoch": 0.7535394573218299, "grad_norm": 0.10161241559797542, "learning_rate": 4.7122810440668496e-06, "loss": 0.1768, "step": 24390 }, { "epoch": 0.7538484115888746, "grad_norm": 0.08090771359825617, "learning_rate": 4.701133114831396e-06, "loss": 0.1797, "step": 24400 }, { "epoch": 0.7541573658559192, "grad_norm": 0.09316268411410783, "learning_rate": 4.689995936757992e-06, "loss": 0.1798, "step": 24410 }, { "epoch": 0.7544663201229638, "grad_norm": 0.09417929681523575, "learning_rate": 4.678869521472913e-06, "loss": 0.1851, "step": 24420 }, { "epoch": 0.7547752743900085, "grad_norm": 0.11206295383355991, "learning_rate": 4.667753880591227e-06, "loss": 0.1799, "step": 24430 }, { "epoch": 0.755084228657053, "grad_norm": 0.11473768654406073, "learning_rate": 4.656649025716729e-06, "loss": 0.1855, "step": 24440 }, { "epoch": 0.7553931829240976, "grad_norm": 0.09069677363493305, "learning_rate": 4.645554968441974e-06, "loss": 0.1834, "step": 24450 }, { "epoch": 0.7557021371911423, "grad_norm": 0.10102245084622918, "learning_rate": 4.634471720348232e-06, "loss": 0.1868, "step": 24460 }, { "epoch": 0.7560110914581869, "grad_norm": 0.09449436609248286, "learning_rate": 4.623399293005492e-06, "loss": 0.1894, "step": 24470 }, { "epoch": 0.7563200457252315, "grad_norm": 0.10801735260821065, "learning_rate": 4.612337697972455e-06, "loss": 0.1879, "step": 24480 }, { "epoch": 0.7566289999922762, "grad_norm": 0.11635466347841847, "learning_rate": 4.601286946796502e-06, "loss": 0.188, "step": 24490 }, { "epoch": 0.7569379542593208, "grad_norm": 0.09888514655538418, "learning_rate": 4.590247051013706e-06, "loss": 0.1862, "step": 24500 }, { "epoch": 0.7572469085263653, "grad_norm": 0.10107275520320858, "learning_rate": 4.579218022148792e-06, "loss": 0.1897, "step": 24510 }, { "epoch": 0.75755586279341, "grad_norm": 0.10341155313562964, "learning_rate": 4.5681998717151616e-06, "loss": 0.1837, "step": 24520 }, { "epoch": 0.7578648170604546, "grad_norm": 0.10502438563128154, "learning_rate": 4.557192611214842e-06, "loss": 0.1799, "step": 24530 }, { "epoch": 0.7581737713274993, "grad_norm": 0.09835721540275669, "learning_rate": 4.5461962521385044e-06, "loss": 0.1828, "step": 24540 }, { "epoch": 0.7584827255945439, "grad_norm": 0.08791103137599914, "learning_rate": 4.535210805965427e-06, "loss": 0.1812, "step": 24550 }, { "epoch": 0.7587916798615885, "grad_norm": 0.10404060454522732, "learning_rate": 4.524236284163515e-06, "loss": 0.1818, "step": 24560 }, { "epoch": 0.7591006341286332, "grad_norm": 0.09153629747520635, "learning_rate": 4.513272698189251e-06, "loss": 0.1883, "step": 24570 }, { "epoch": 0.7594095883956777, "grad_norm": 0.11948459711738549, "learning_rate": 4.502320059487708e-06, "loss": 0.1828, "step": 24580 }, { "epoch": 0.7597185426627223, "grad_norm": 0.0945741265252532, "learning_rate": 4.49137837949253e-06, "loss": 0.1773, "step": 24590 }, { "epoch": 0.760027496929767, "grad_norm": 0.09549838705180028, "learning_rate": 4.480447669625929e-06, "loss": 0.1825, "step": 24600 }, { "epoch": 0.7603364511968116, "grad_norm": 0.11788533556362385, "learning_rate": 4.469527941298651e-06, "loss": 0.1817, "step": 24610 }, { "epoch": 0.7606454054638562, "grad_norm": 0.08925962238880984, "learning_rate": 4.458619205909993e-06, "loss": 0.1799, "step": 24620 }, { "epoch": 0.7609543597309009, "grad_norm": 0.10693628034532461, "learning_rate": 4.447721474847763e-06, "loss": 0.1868, "step": 24630 }, { "epoch": 0.7612633139979454, "grad_norm": 0.104860623770395, "learning_rate": 4.436834759488294e-06, "loss": 0.1862, "step": 24640 }, { "epoch": 0.76157226826499, "grad_norm": 0.10664557214643158, "learning_rate": 4.425959071196406e-06, "loss": 0.1785, "step": 24650 }, { "epoch": 0.7618812225320347, "grad_norm": 0.08596747890700626, "learning_rate": 4.4150944213254236e-06, "loss": 0.1823, "step": 24660 }, { "epoch": 0.7621901767990793, "grad_norm": 0.10739148452238738, "learning_rate": 4.4042408212171274e-06, "loss": 0.1792, "step": 24670 }, { "epoch": 0.762499131066124, "grad_norm": 0.09046668376876144, "learning_rate": 4.393398282201788e-06, "loss": 0.1864, "step": 24680 }, { "epoch": 0.7628080853331686, "grad_norm": 0.132524535699756, "learning_rate": 4.38256681559811e-06, "loss": 0.1887, "step": 24690 }, { "epoch": 0.7631170396002132, "grad_norm": 0.1341344024501761, "learning_rate": 4.371746432713239e-06, "loss": 0.1836, "step": 24700 }, { "epoch": 0.7634259938672578, "grad_norm": 0.10341488293705148, "learning_rate": 4.360937144842766e-06, "loss": 0.1795, "step": 24710 }, { "epoch": 0.7637349481343024, "grad_norm": 0.09795578826884338, "learning_rate": 4.350138963270683e-06, "loss": 0.1819, "step": 24720 }, { "epoch": 0.764043902401347, "grad_norm": 0.1039672908776288, "learning_rate": 4.339351899269403e-06, "loss": 0.1833, "step": 24730 }, { "epoch": 0.7643528566683917, "grad_norm": 0.10484384868946324, "learning_rate": 4.328575964099717e-06, "loss": 0.185, "step": 24740 }, { "epoch": 0.7646618109354363, "grad_norm": 0.10653571158897689, "learning_rate": 4.317811169010814e-06, "loss": 0.186, "step": 24750 }, { "epoch": 0.7649707652024809, "grad_norm": 0.08368143444105516, "learning_rate": 4.30705752524024e-06, "loss": 0.1883, "step": 24760 }, { "epoch": 0.7652797194695256, "grad_norm": 0.0890081773640572, "learning_rate": 4.296315044013913e-06, "loss": 0.1845, "step": 24770 }, { "epoch": 0.7655886737365701, "grad_norm": 0.09470903053284774, "learning_rate": 4.285583736546083e-06, "loss": 0.1793, "step": 24780 }, { "epoch": 0.7658976280036147, "grad_norm": 0.10418292073091513, "learning_rate": 4.274863614039354e-06, "loss": 0.1837, "step": 24790 }, { "epoch": 0.7662065822706594, "grad_norm": 0.09603698538873698, "learning_rate": 4.26415468768464e-06, "loss": 0.1841, "step": 24800 }, { "epoch": 0.766515536537704, "grad_norm": 0.09274982431505632, "learning_rate": 4.253456968661169e-06, "loss": 0.1829, "step": 24810 }, { "epoch": 0.7668244908047487, "grad_norm": 0.10597247125711916, "learning_rate": 4.242770468136468e-06, "loss": 0.188, "step": 24820 }, { "epoch": 0.7671334450717933, "grad_norm": 0.09575521592452313, "learning_rate": 4.232095197266367e-06, "loss": 0.1825, "step": 24830 }, { "epoch": 0.7674423993388378, "grad_norm": 0.09090213374277013, "learning_rate": 4.221431167194951e-06, "loss": 0.1812, "step": 24840 }, { "epoch": 0.7677513536058825, "grad_norm": 0.08056447313594031, "learning_rate": 4.210778389054593e-06, "loss": 0.1873, "step": 24850 }, { "epoch": 0.7680603078729271, "grad_norm": 0.09664373027301218, "learning_rate": 4.200136873965899e-06, "loss": 0.1809, "step": 24860 }, { "epoch": 0.7683692621399717, "grad_norm": 0.08286401524412505, "learning_rate": 4.1895066330377354e-06, "loss": 0.1814, "step": 24870 }, { "epoch": 0.7686782164070164, "grad_norm": 0.09756381686685321, "learning_rate": 4.1788876773671846e-06, "loss": 0.1831, "step": 24880 }, { "epoch": 0.768987170674061, "grad_norm": 0.10250866255103959, "learning_rate": 4.168280018039561e-06, "loss": 0.1956, "step": 24890 }, { "epoch": 0.7692961249411056, "grad_norm": 0.10863430594661831, "learning_rate": 4.1576836661283745e-06, "loss": 0.1796, "step": 24900 }, { "epoch": 0.7696050792081502, "grad_norm": 0.10923429111387054, "learning_rate": 4.147098632695344e-06, "loss": 0.1795, "step": 24910 }, { "epoch": 0.7699140334751948, "grad_norm": 0.09035054050720479, "learning_rate": 4.136524928790361e-06, "loss": 0.1839, "step": 24920 }, { "epoch": 0.7702229877422394, "grad_norm": 0.10010626082870915, "learning_rate": 4.125962565451491e-06, "loss": 0.1823, "step": 24930 }, { "epoch": 0.7705319420092841, "grad_norm": 0.1098501348723468, "learning_rate": 4.115411553704974e-06, "loss": 0.1862, "step": 24940 }, { "epoch": 0.7708408962763287, "grad_norm": 0.08330661053469954, "learning_rate": 4.104871904565177e-06, "loss": 0.1833, "step": 24950 }, { "epoch": 0.7711498505433734, "grad_norm": 0.09553979714716174, "learning_rate": 4.094343629034632e-06, "loss": 0.1776, "step": 24960 }, { "epoch": 0.771458804810418, "grad_norm": 0.09578591656101941, "learning_rate": 4.083826738103976e-06, "loss": 0.1789, "step": 24970 }, { "epoch": 0.7717677590774625, "grad_norm": 0.09682309178619027, "learning_rate": 4.0733212427519774e-06, "loss": 0.1838, "step": 24980 }, { "epoch": 0.7720767133445072, "grad_norm": 0.09677859659507333, "learning_rate": 4.062827153945491e-06, "loss": 0.1868, "step": 24990 }, { "epoch": 0.7723856676115518, "grad_norm": 0.130969337160774, "learning_rate": 4.052344482639487e-06, "loss": 0.1834, "step": 25000 }, { "epoch": 0.7726946218785964, "grad_norm": 0.10810964448034079, "learning_rate": 4.041873239776996e-06, "loss": 0.1822, "step": 25010 }, { "epoch": 0.7730035761456411, "grad_norm": 0.12253053442932915, "learning_rate": 4.031413436289123e-06, "loss": 0.1836, "step": 25020 }, { "epoch": 0.7733125304126857, "grad_norm": 0.09211352300720242, "learning_rate": 4.0209650830950445e-06, "loss": 0.1826, "step": 25030 }, { "epoch": 0.7736214846797302, "grad_norm": 0.11529614778976754, "learning_rate": 4.010528191101966e-06, "loss": 0.1869, "step": 25040 }, { "epoch": 0.7739304389467749, "grad_norm": 0.11543787517700385, "learning_rate": 4.000102771205137e-06, "loss": 0.1898, "step": 25050 }, { "epoch": 0.7742393932138195, "grad_norm": 0.10341416860828288, "learning_rate": 3.989688834287835e-06, "loss": 0.1827, "step": 25060 }, { "epoch": 0.7745483474808641, "grad_norm": 0.10971970533837791, "learning_rate": 3.979286391221338e-06, "loss": 0.1794, "step": 25070 }, { "epoch": 0.7748573017479088, "grad_norm": 0.09906974833820673, "learning_rate": 3.968895452864942e-06, "loss": 0.183, "step": 25080 }, { "epoch": 0.7751662560149534, "grad_norm": 0.10414415075035399, "learning_rate": 3.9585160300659155e-06, "loss": 0.1923, "step": 25090 }, { "epoch": 0.7754752102819981, "grad_norm": 0.0939762528781646, "learning_rate": 3.948148133659524e-06, "loss": 0.1871, "step": 25100 }, { "epoch": 0.7757841645490426, "grad_norm": 0.09046857533809367, "learning_rate": 3.937791774468978e-06, "loss": 0.1848, "step": 25110 }, { "epoch": 0.7760931188160872, "grad_norm": 0.08905404711090185, "learning_rate": 3.927446963305469e-06, "loss": 0.1861, "step": 25120 }, { "epoch": 0.7764020730831319, "grad_norm": 0.09020382783293679, "learning_rate": 3.917113710968113e-06, "loss": 0.1819, "step": 25130 }, { "epoch": 0.7767110273501765, "grad_norm": 0.09496973877682766, "learning_rate": 3.906792028243974e-06, "loss": 0.1796, "step": 25140 }, { "epoch": 0.7770199816172211, "grad_norm": 0.09796008618843378, "learning_rate": 3.896481925908027e-06, "loss": 0.1821, "step": 25150 }, { "epoch": 0.7773289358842658, "grad_norm": 0.11406156153732327, "learning_rate": 3.8861834147231615e-06, "loss": 0.182, "step": 25160 }, { "epoch": 0.7776378901513104, "grad_norm": 0.10670777254215298, "learning_rate": 3.875896505440174e-06, "loss": 0.1874, "step": 25170 }, { "epoch": 0.7779468444183549, "grad_norm": 0.11516002557890306, "learning_rate": 3.865621208797738e-06, "loss": 0.1927, "step": 25180 }, { "epoch": 0.7782557986853996, "grad_norm": 0.11069305984302626, "learning_rate": 3.855357535522415e-06, "loss": 0.1884, "step": 25190 }, { "epoch": 0.7785647529524442, "grad_norm": 0.11518501842536932, "learning_rate": 3.8451054963286245e-06, "loss": 0.1813, "step": 25200 }, { "epoch": 0.7788737072194888, "grad_norm": 0.12654688882798518, "learning_rate": 3.834865101918647e-06, "loss": 0.1855, "step": 25210 }, { "epoch": 0.7791826614865335, "grad_norm": 0.10198879893025203, "learning_rate": 3.824636362982599e-06, "loss": 0.1798, "step": 25220 }, { "epoch": 0.7794916157535781, "grad_norm": 0.09364822371009877, "learning_rate": 3.8144192901984426e-06, "loss": 0.1904, "step": 25230 }, { "epoch": 0.7798005700206228, "grad_norm": 0.09277265898743273, "learning_rate": 3.8042138942319504e-06, "loss": 0.1784, "step": 25240 }, { "epoch": 0.7801095242876673, "grad_norm": 0.1260953472460793, "learning_rate": 3.7940201857367025e-06, "loss": 0.1872, "step": 25250 }, { "epoch": 0.7804184785547119, "grad_norm": 0.11903159460229215, "learning_rate": 3.7838381753540947e-06, "loss": 0.1849, "step": 25260 }, { "epoch": 0.7807274328217566, "grad_norm": 0.10265665865549166, "learning_rate": 3.7736678737132947e-06, "loss": 0.1811, "step": 25270 }, { "epoch": 0.7810363870888012, "grad_norm": 0.10642599001532839, "learning_rate": 3.76350929143125e-06, "loss": 0.1859, "step": 25280 }, { "epoch": 0.7813453413558458, "grad_norm": 0.12096619689749742, "learning_rate": 3.7533624391126835e-06, "loss": 0.185, "step": 25290 }, { "epoch": 0.7816542956228905, "grad_norm": 0.10298733681963296, "learning_rate": 3.7432273273500615e-06, "loss": 0.1871, "step": 25300 }, { "epoch": 0.781963249889935, "grad_norm": 0.13644706559300795, "learning_rate": 3.7331039667236055e-06, "loss": 0.1802, "step": 25310 }, { "epoch": 0.7822722041569796, "grad_norm": 0.11522219296539363, "learning_rate": 3.722992367801255e-06, "loss": 0.191, "step": 25320 }, { "epoch": 0.7825811584240243, "grad_norm": 0.10725527487457508, "learning_rate": 3.712892541138688e-06, "loss": 0.1817, "step": 25330 }, { "epoch": 0.7828901126910689, "grad_norm": 0.10883563343966805, "learning_rate": 3.7028044972792795e-06, "loss": 0.1822, "step": 25340 }, { "epoch": 0.7831990669581135, "grad_norm": 0.11503872745234577, "learning_rate": 3.692728246754115e-06, "loss": 0.1833, "step": 25350 }, { "epoch": 0.7835080212251582, "grad_norm": 0.10575277807950564, "learning_rate": 3.6826638000819575e-06, "loss": 0.1792, "step": 25360 }, { "epoch": 0.7838169754922027, "grad_norm": 0.08601618952674542, "learning_rate": 3.672611167769262e-06, "loss": 0.1778, "step": 25370 }, { "epoch": 0.7841259297592474, "grad_norm": 0.09357258961607313, "learning_rate": 3.6625703603101347e-06, "loss": 0.1857, "step": 25380 }, { "epoch": 0.784434884026292, "grad_norm": 0.10233492578376589, "learning_rate": 3.652541388186344e-06, "loss": 0.1799, "step": 25390 }, { "epoch": 0.7847438382933366, "grad_norm": 0.10152795001207753, "learning_rate": 3.6425242618673123e-06, "loss": 0.1794, "step": 25400 }, { "epoch": 0.7850527925603813, "grad_norm": 0.10989492354317137, "learning_rate": 3.6325189918100794e-06, "loss": 0.1801, "step": 25410 }, { "epoch": 0.7853617468274259, "grad_norm": 0.10837134213569849, "learning_rate": 3.622525588459324e-06, "loss": 0.1808, "step": 25420 }, { "epoch": 0.7856707010944705, "grad_norm": 0.12192784770304528, "learning_rate": 3.612544062247321e-06, "loss": 0.1839, "step": 25430 }, { "epoch": 0.7859796553615152, "grad_norm": 0.09966334737502888, "learning_rate": 3.6025744235939638e-06, "loss": 0.1806, "step": 25440 }, { "epoch": 0.7862886096285597, "grad_norm": 0.10109780873211506, "learning_rate": 3.5926166829067207e-06, "loss": 0.1814, "step": 25450 }, { "epoch": 0.7865975638956043, "grad_norm": 0.09813537785873704, "learning_rate": 3.5826708505806533e-06, "loss": 0.18, "step": 25460 }, { "epoch": 0.786906518162649, "grad_norm": 0.10611977069156357, "learning_rate": 3.5727369369983828e-06, "loss": 0.1859, "step": 25470 }, { "epoch": 0.7872154724296936, "grad_norm": 0.10485472316535571, "learning_rate": 3.5628149525300817e-06, "loss": 0.1806, "step": 25480 }, { "epoch": 0.7875244266967382, "grad_norm": 0.09427157604721462, "learning_rate": 3.5529049075334906e-06, "loss": 0.1817, "step": 25490 }, { "epoch": 0.7878333809637829, "grad_norm": 0.08384182792045067, "learning_rate": 3.5430068123538674e-06, "loss": 0.1828, "step": 25500 }, { "epoch": 0.7881423352308274, "grad_norm": 0.09922342997052187, "learning_rate": 3.5331206773239978e-06, "loss": 0.1809, "step": 25510 }, { "epoch": 0.7884512894978721, "grad_norm": 0.0910035117708188, "learning_rate": 3.523246512764195e-06, "loss": 0.1879, "step": 25520 }, { "epoch": 0.7887602437649167, "grad_norm": 0.12024080516660918, "learning_rate": 3.513384328982259e-06, "loss": 0.1808, "step": 25530 }, { "epoch": 0.7890691980319613, "grad_norm": 0.09952543576053921, "learning_rate": 3.503534136273497e-06, "loss": 0.1805, "step": 25540 }, { "epoch": 0.789378152299006, "grad_norm": 0.09815265413663787, "learning_rate": 3.4936959449206873e-06, "loss": 0.1862, "step": 25550 }, { "epoch": 0.7896871065660506, "grad_norm": 0.10812201758935382, "learning_rate": 3.483869765194093e-06, "loss": 0.1822, "step": 25560 }, { "epoch": 0.7899960608330951, "grad_norm": 0.10562731468441008, "learning_rate": 3.474055607351419e-06, "loss": 0.1835, "step": 25570 }, { "epoch": 0.7903050151001398, "grad_norm": 0.09156005852258871, "learning_rate": 3.4642534816378414e-06, "loss": 0.1794, "step": 25580 }, { "epoch": 0.7906139693671844, "grad_norm": 0.08828434729860438, "learning_rate": 3.454463398285958e-06, "loss": 0.182, "step": 25590 }, { "epoch": 0.790922923634229, "grad_norm": 0.0908660503922225, "learning_rate": 3.444685367515809e-06, "loss": 0.1888, "step": 25600 }, { "epoch": 0.7912318779012737, "grad_norm": 0.10320120559463324, "learning_rate": 3.434919399534846e-06, "loss": 0.1785, "step": 25610 }, { "epoch": 0.7915408321683183, "grad_norm": 0.0841973932355569, "learning_rate": 3.4251655045379206e-06, "loss": 0.1806, "step": 25620 }, { "epoch": 0.7918497864353629, "grad_norm": 0.09528283527352377, "learning_rate": 3.4154236927073025e-06, "loss": 0.1795, "step": 25630 }, { "epoch": 0.7921587407024075, "grad_norm": 0.08683950961777887, "learning_rate": 3.405693974212622e-06, "loss": 0.1829, "step": 25640 }, { "epoch": 0.7924676949694521, "grad_norm": 0.104330368206283, "learning_rate": 3.395976359210908e-06, "loss": 0.1828, "step": 25650 }, { "epoch": 0.7927766492364968, "grad_norm": 0.10091920942922289, "learning_rate": 3.386270857846535e-06, "loss": 0.1914, "step": 25660 }, { "epoch": 0.7930856035035414, "grad_norm": 0.10033476065215786, "learning_rate": 3.3765774802512495e-06, "loss": 0.1858, "step": 25670 }, { "epoch": 0.793394557770586, "grad_norm": 0.09334760840161517, "learning_rate": 3.366896236544123e-06, "loss": 0.1822, "step": 25680 }, { "epoch": 0.7937035120376307, "grad_norm": 0.09678851010727342, "learning_rate": 3.3572271368315814e-06, "loss": 0.1817, "step": 25690 }, { "epoch": 0.7940124663046753, "grad_norm": 0.09973566574864082, "learning_rate": 3.347570191207354e-06, "loss": 0.1839, "step": 25700 }, { "epoch": 0.7943214205717198, "grad_norm": 0.08610035124366384, "learning_rate": 3.337925409752495e-06, "loss": 0.1909, "step": 25710 }, { "epoch": 0.7946303748387645, "grad_norm": 0.10119454927901549, "learning_rate": 3.3282928025353476e-06, "loss": 0.1827, "step": 25720 }, { "epoch": 0.7949393291058091, "grad_norm": 0.13967062222556495, "learning_rate": 3.318672379611564e-06, "loss": 0.189, "step": 25730 }, { "epoch": 0.7952482833728537, "grad_norm": 0.11231914811684597, "learning_rate": 3.309064151024058e-06, "loss": 0.1876, "step": 25740 }, { "epoch": 0.7955572376398984, "grad_norm": 0.10296738133323749, "learning_rate": 3.299468126803033e-06, "loss": 0.1844, "step": 25750 }, { "epoch": 0.795866191906943, "grad_norm": 0.11107133897996203, "learning_rate": 3.2898843169659304e-06, "loss": 0.1814, "step": 25760 }, { "epoch": 0.7961751461739875, "grad_norm": 0.09354877349047017, "learning_rate": 3.2803127315174625e-06, "loss": 0.1862, "step": 25770 }, { "epoch": 0.7964841004410322, "grad_norm": 0.09370649145844268, "learning_rate": 3.2707533804495597e-06, "loss": 0.182, "step": 25780 }, { "epoch": 0.7967930547080768, "grad_norm": 0.09180617405999886, "learning_rate": 3.2612062737414016e-06, "loss": 0.1838, "step": 25790 }, { "epoch": 0.7971020089751215, "grad_norm": 0.1015999856063287, "learning_rate": 3.2516714213593657e-06, "loss": 0.1846, "step": 25800 }, { "epoch": 0.7974109632421661, "grad_norm": 0.10029948167309428, "learning_rate": 3.242148833257053e-06, "loss": 0.1829, "step": 25810 }, { "epoch": 0.7977199175092107, "grad_norm": 0.09299225626189431, "learning_rate": 3.232638519375249e-06, "loss": 0.1841, "step": 25820 }, { "epoch": 0.7980288717762554, "grad_norm": 0.09937895416502573, "learning_rate": 3.2231404896419382e-06, "loss": 0.1845, "step": 25830 }, { "epoch": 0.7983378260433, "grad_norm": 0.10619821626272419, "learning_rate": 3.2136547539722726e-06, "loss": 0.1776, "step": 25840 }, { "epoch": 0.7986467803103445, "grad_norm": 0.08838672358304156, "learning_rate": 3.2041813222685683e-06, "loss": 0.1801, "step": 25850 }, { "epoch": 0.7989557345773892, "grad_norm": 0.11498206831516014, "learning_rate": 3.1947202044203093e-06, "loss": 0.1785, "step": 25860 }, { "epoch": 0.7992646888444338, "grad_norm": 0.0890786518364633, "learning_rate": 3.1852714103041103e-06, "loss": 0.1907, "step": 25870 }, { "epoch": 0.7995736431114784, "grad_norm": 0.1026602108963983, "learning_rate": 3.175834949783734e-06, "loss": 0.1781, "step": 25880 }, { "epoch": 0.7998825973785231, "grad_norm": 0.09939878605795972, "learning_rate": 3.1664108327100544e-06, "loss": 0.1795, "step": 25890 }, { "epoch": 0.8001915516455677, "grad_norm": 0.09105069229392243, "learning_rate": 3.1569990689210764e-06, "loss": 0.1878, "step": 25900 }, { "epoch": 0.8005005059126122, "grad_norm": 0.11413370928292792, "learning_rate": 3.1475996682418912e-06, "loss": 0.1837, "step": 25910 }, { "epoch": 0.8008094601796569, "grad_norm": 0.1160283753139262, "learning_rate": 3.1382126404846993e-06, "loss": 0.1804, "step": 25920 }, { "epoch": 0.8011184144467015, "grad_norm": 0.11437225315421572, "learning_rate": 3.128837995448778e-06, "loss": 0.179, "step": 25930 }, { "epoch": 0.8014273687137462, "grad_norm": 0.12904363174166586, "learning_rate": 3.119475742920476e-06, "loss": 0.1819, "step": 25940 }, { "epoch": 0.8017363229807908, "grad_norm": 0.1027034226463252, "learning_rate": 3.1101258926732036e-06, "loss": 0.1847, "step": 25950 }, { "epoch": 0.8020452772478354, "grad_norm": 0.10556362274882444, "learning_rate": 3.100788454467439e-06, "loss": 0.1829, "step": 25960 }, { "epoch": 0.8023542315148801, "grad_norm": 0.10631526692793652, "learning_rate": 3.0914634380506794e-06, "loss": 0.1859, "step": 25970 }, { "epoch": 0.8026631857819246, "grad_norm": 0.10186607706552472, "learning_rate": 3.082150853157481e-06, "loss": 0.1801, "step": 25980 }, { "epoch": 0.8029721400489692, "grad_norm": 0.09672703905085761, "learning_rate": 3.0728507095093985e-06, "loss": 0.1782, "step": 25990 }, { "epoch": 0.8032810943160139, "grad_norm": 0.09494310970245266, "learning_rate": 3.063563016815017e-06, "loss": 0.1772, "step": 26000 }, { "epoch": 0.8035900485830585, "grad_norm": 0.1261578815363774, "learning_rate": 3.0542877847699113e-06, "loss": 0.1871, "step": 26010 }, { "epoch": 0.8038990028501031, "grad_norm": 0.09858750860348602, "learning_rate": 3.0450250230566596e-06, "loss": 0.1824, "step": 26020 }, { "epoch": 0.8042079571171478, "grad_norm": 0.12924161690654817, "learning_rate": 3.03577474134481e-06, "loss": 0.1798, "step": 26030 }, { "epoch": 0.8045169113841923, "grad_norm": 0.12313499376004705, "learning_rate": 3.026536949290895e-06, "loss": 0.1804, "step": 26040 }, { "epoch": 0.8048258656512369, "grad_norm": 0.10588291446113972, "learning_rate": 3.0173116565383956e-06, "loss": 0.1786, "step": 26050 }, { "epoch": 0.8051348199182816, "grad_norm": 0.09779036024356536, "learning_rate": 3.00809887271776e-06, "loss": 0.1823, "step": 26060 }, { "epoch": 0.8054437741853262, "grad_norm": 0.08743651968312417, "learning_rate": 2.998898607446363e-06, "loss": 0.1839, "step": 26070 }, { "epoch": 0.8057527284523709, "grad_norm": 0.09384200809989345, "learning_rate": 2.989710870328516e-06, "loss": 0.1872, "step": 26080 }, { "epoch": 0.8060616827194155, "grad_norm": 0.0998580179141942, "learning_rate": 2.9805356709554626e-06, "loss": 0.19, "step": 26090 }, { "epoch": 0.80637063698646, "grad_norm": 0.09007294558796745, "learning_rate": 2.97137301890534e-06, "loss": 0.1784, "step": 26100 }, { "epoch": 0.8066795912535047, "grad_norm": 0.09481269282717532, "learning_rate": 2.9622229237432037e-06, "loss": 0.1801, "step": 26110 }, { "epoch": 0.8069885455205493, "grad_norm": 0.09856570964423522, "learning_rate": 2.953085395020986e-06, "loss": 0.1823, "step": 26120 }, { "epoch": 0.8072974997875939, "grad_norm": 0.09033459745834758, "learning_rate": 2.9439604422775163e-06, "loss": 0.1803, "step": 26130 }, { "epoch": 0.8076064540546386, "grad_norm": 0.0954125577341479, "learning_rate": 2.934848075038484e-06, "loss": 0.1826, "step": 26140 }, { "epoch": 0.8079154083216832, "grad_norm": 0.08962536980220799, "learning_rate": 2.9257483028164405e-06, "loss": 0.1811, "step": 26150 }, { "epoch": 0.8082243625887278, "grad_norm": 0.12086052233069301, "learning_rate": 2.9166611351107996e-06, "loss": 0.183, "step": 26160 }, { "epoch": 0.8085333168557725, "grad_norm": 0.09011053516911019, "learning_rate": 2.9075865814078064e-06, "loss": 0.1858, "step": 26170 }, { "epoch": 0.808842271122817, "grad_norm": 0.09099461955071193, "learning_rate": 2.8985246511805375e-06, "loss": 0.1814, "step": 26180 }, { "epoch": 0.8091512253898616, "grad_norm": 0.10358805842616044, "learning_rate": 2.889475353888903e-06, "loss": 0.1798, "step": 26190 }, { "epoch": 0.8094601796569063, "grad_norm": 0.10379934010578504, "learning_rate": 2.880438698979611e-06, "loss": 0.1889, "step": 26200 }, { "epoch": 0.8097691339239509, "grad_norm": 0.13552284107458618, "learning_rate": 2.8714146958861877e-06, "loss": 0.1787, "step": 26210 }, { "epoch": 0.8100780881909956, "grad_norm": 0.11537436983097167, "learning_rate": 2.8624033540289334e-06, "loss": 0.1869, "step": 26220 }, { "epoch": 0.8103870424580402, "grad_norm": 0.10467640868514755, "learning_rate": 2.8534046828149506e-06, "loss": 0.1831, "step": 26230 }, { "epoch": 0.8106959967250847, "grad_norm": 0.10466198430777411, "learning_rate": 2.8444186916380966e-06, "loss": 0.1826, "step": 26240 }, { "epoch": 0.8110049509921294, "grad_norm": 0.08690735946650803, "learning_rate": 2.8354453898790074e-06, "loss": 0.1804, "step": 26250 }, { "epoch": 0.811313905259174, "grad_norm": 0.10398113807428562, "learning_rate": 2.8264847869050564e-06, "loss": 0.1802, "step": 26260 }, { "epoch": 0.8116228595262186, "grad_norm": 0.09015814681323908, "learning_rate": 2.8175368920703797e-06, "loss": 0.1831, "step": 26270 }, { "epoch": 0.8119318137932633, "grad_norm": 0.09636705603596216, "learning_rate": 2.8086017147158273e-06, "loss": 0.1846, "step": 26280 }, { "epoch": 0.8122407680603079, "grad_norm": 0.13561475956548416, "learning_rate": 2.799679264168989e-06, "loss": 0.1811, "step": 26290 }, { "epoch": 0.8125497223273525, "grad_norm": 0.08541995734046805, "learning_rate": 2.790769549744161e-06, "loss": 0.1832, "step": 26300 }, { "epoch": 0.8128586765943971, "grad_norm": 0.10269967585773117, "learning_rate": 2.781872580742342e-06, "loss": 0.184, "step": 26310 }, { "epoch": 0.8131676308614417, "grad_norm": 0.10694390579247381, "learning_rate": 2.7729883664512334e-06, "loss": 0.1844, "step": 26320 }, { "epoch": 0.8134765851284863, "grad_norm": 0.10484432211326815, "learning_rate": 2.7641169161452122e-06, "loss": 0.185, "step": 26330 }, { "epoch": 0.813785539395531, "grad_norm": 0.08086703065465609, "learning_rate": 2.7552582390853436e-06, "loss": 0.1783, "step": 26340 }, { "epoch": 0.8140944936625756, "grad_norm": 0.08832847223338984, "learning_rate": 2.7464123445193434e-06, "loss": 0.1841, "step": 26350 }, { "epoch": 0.8144034479296203, "grad_norm": 0.096732950352893, "learning_rate": 2.737579241681598e-06, "loss": 0.1798, "step": 26360 }, { "epoch": 0.8147124021966649, "grad_norm": 0.1258447057640801, "learning_rate": 2.72875893979313e-06, "loss": 0.1906, "step": 26370 }, { "epoch": 0.8150213564637094, "grad_norm": 0.10718431506263308, "learning_rate": 2.719951448061599e-06, "loss": 0.1803, "step": 26380 }, { "epoch": 0.8153303107307541, "grad_norm": 0.0849113598593184, "learning_rate": 2.7111567756813016e-06, "loss": 0.1787, "step": 26390 }, { "epoch": 0.8156392649977987, "grad_norm": 0.09298630797136359, "learning_rate": 2.7023749318331424e-06, "loss": 0.1799, "step": 26400 }, { "epoch": 0.8159482192648433, "grad_norm": 0.10733842023666557, "learning_rate": 2.6936059256846353e-06, "loss": 0.1813, "step": 26410 }, { "epoch": 0.816257173531888, "grad_norm": 0.12126086751845842, "learning_rate": 2.6848497663899003e-06, "loss": 0.1862, "step": 26420 }, { "epoch": 0.8165661277989326, "grad_norm": 0.10450695004230649, "learning_rate": 2.6761064630896325e-06, "loss": 0.1878, "step": 26430 }, { "epoch": 0.8168750820659771, "grad_norm": 0.11306162621207497, "learning_rate": 2.6673760249111245e-06, "loss": 0.1833, "step": 26440 }, { "epoch": 0.8171840363330218, "grad_norm": 0.08166325960287754, "learning_rate": 2.6586584609682206e-06, "loss": 0.1829, "step": 26450 }, { "epoch": 0.8174929906000664, "grad_norm": 0.11217080019997884, "learning_rate": 2.64995378036134e-06, "loss": 0.1861, "step": 26460 }, { "epoch": 0.817801944867111, "grad_norm": 0.09301487805870076, "learning_rate": 2.6412619921774418e-06, "loss": 0.1795, "step": 26470 }, { "epoch": 0.8181108991341557, "grad_norm": 0.10610936078817064, "learning_rate": 2.632583105490036e-06, "loss": 0.182, "step": 26480 }, { "epoch": 0.8184198534012003, "grad_norm": 0.10507038732506645, "learning_rate": 2.623917129359153e-06, "loss": 0.1911, "step": 26490 }, { "epoch": 0.818728807668245, "grad_norm": 0.09456845841819382, "learning_rate": 2.615264072831362e-06, "loss": 0.1856, "step": 26500 }, { "epoch": 0.8190377619352895, "grad_norm": 0.10497736812272437, "learning_rate": 2.606623944939726e-06, "loss": 0.1813, "step": 26510 }, { "epoch": 0.8193467162023341, "grad_norm": 0.09871780872343341, "learning_rate": 2.5979967547038304e-06, "loss": 0.1815, "step": 26520 }, { "epoch": 0.8196556704693788, "grad_norm": 0.10846989193582626, "learning_rate": 2.5893825111297387e-06, "loss": 0.1828, "step": 26530 }, { "epoch": 0.8199646247364234, "grad_norm": 0.08585119211990397, "learning_rate": 2.5807812232100062e-06, "loss": 0.178, "step": 26540 }, { "epoch": 0.820273579003468, "grad_norm": 0.10086585109572187, "learning_rate": 2.5721928999236694e-06, "loss": 0.177, "step": 26550 }, { "epoch": 0.8205825332705127, "grad_norm": 0.11676183264373326, "learning_rate": 2.5636175502362157e-06, "loss": 0.1815, "step": 26560 }, { "epoch": 0.8208914875375573, "grad_norm": 0.10019563588066743, "learning_rate": 2.555055183099608e-06, "loss": 0.181, "step": 26570 }, { "epoch": 0.8212004418046018, "grad_norm": 0.11278864270987979, "learning_rate": 2.5465058074522403e-06, "loss": 0.1818, "step": 26580 }, { "epoch": 0.8215093960716465, "grad_norm": 0.12229159159220361, "learning_rate": 2.537969432218954e-06, "loss": 0.1831, "step": 26590 }, { "epoch": 0.8218183503386911, "grad_norm": 0.08952713642378439, "learning_rate": 2.5294460663110164e-06, "loss": 0.1826, "step": 26600 }, { "epoch": 0.8221273046057357, "grad_norm": 3.9091861062561364, "learning_rate": 2.5209357186261086e-06, "loss": 0.1873, "step": 26610 }, { "epoch": 0.8224362588727804, "grad_norm": 0.11250191251129393, "learning_rate": 2.512438398048335e-06, "loss": 0.1837, "step": 26620 }, { "epoch": 0.822745213139825, "grad_norm": 0.09671601902846973, "learning_rate": 2.5039541134481892e-06, "loss": 0.18, "step": 26630 }, { "epoch": 0.8230541674068697, "grad_norm": 0.11477264808943827, "learning_rate": 2.4954828736825558e-06, "loss": 0.1905, "step": 26640 }, { "epoch": 0.8233631216739142, "grad_norm": 0.10524931662881779, "learning_rate": 2.487024687594713e-06, "loss": 0.1779, "step": 26650 }, { "epoch": 0.8236720759409588, "grad_norm": 0.1361755300026651, "learning_rate": 2.478579564014299e-06, "loss": 0.1778, "step": 26660 }, { "epoch": 0.8239810302080035, "grad_norm": 0.09541713607264742, "learning_rate": 2.470147511757327e-06, "loss": 0.187, "step": 26670 }, { "epoch": 0.8242899844750481, "grad_norm": 0.09585684365974927, "learning_rate": 2.4617285396261542e-06, "loss": 0.1817, "step": 26680 }, { "epoch": 0.8245989387420927, "grad_norm": 0.0896669875629135, "learning_rate": 2.453322656409495e-06, "loss": 0.1785, "step": 26690 }, { "epoch": 0.8249078930091374, "grad_norm": 0.09318421443377176, "learning_rate": 2.444929870882386e-06, "loss": 0.1806, "step": 26700 }, { "epoch": 0.8252168472761819, "grad_norm": 0.09788279781486932, "learning_rate": 2.436550191806206e-06, "loss": 0.1872, "step": 26710 }, { "epoch": 0.8255258015432265, "grad_norm": 0.12644756525281617, "learning_rate": 2.4281836279286385e-06, "loss": 0.1821, "step": 26720 }, { "epoch": 0.8258347558102712, "grad_norm": 0.11951582000276369, "learning_rate": 2.4198301879836865e-06, "loss": 0.1812, "step": 26730 }, { "epoch": 0.8261437100773158, "grad_norm": 0.12389603664861762, "learning_rate": 2.411489880691643e-06, "loss": 0.1813, "step": 26740 }, { "epoch": 0.8264526643443604, "grad_norm": 0.0962938853711917, "learning_rate": 2.4031627147591e-06, "loss": 0.1872, "step": 26750 }, { "epoch": 0.8267616186114051, "grad_norm": 0.0928120806627728, "learning_rate": 2.3948486988789277e-06, "loss": 0.1876, "step": 26760 }, { "epoch": 0.8270705728784497, "grad_norm": 0.09627266159755687, "learning_rate": 2.386547841730263e-06, "loss": 0.1881, "step": 26770 }, { "epoch": 0.8273795271454943, "grad_norm": 0.10702124431748927, "learning_rate": 2.3782601519785193e-06, "loss": 0.189, "step": 26780 }, { "epoch": 0.8276884814125389, "grad_norm": 0.09047835459394946, "learning_rate": 2.369985638275349e-06, "loss": 0.1856, "step": 26790 }, { "epoch": 0.8279974356795835, "grad_norm": 0.11013789930904871, "learning_rate": 2.3617243092586654e-06, "loss": 0.183, "step": 26800 }, { "epoch": 0.8283063899466282, "grad_norm": 0.11244225824158363, "learning_rate": 2.3534761735526037e-06, "loss": 0.1799, "step": 26810 }, { "epoch": 0.8286153442136728, "grad_norm": 0.10482758814454557, "learning_rate": 2.345241239767537e-06, "loss": 0.1811, "step": 26820 }, { "epoch": 0.8289242984807174, "grad_norm": 0.11023236711917002, "learning_rate": 2.3370195165000517e-06, "loss": 0.1767, "step": 26830 }, { "epoch": 0.829233252747762, "grad_norm": 0.08547987864910896, "learning_rate": 2.3288110123329443e-06, "loss": 0.1839, "step": 26840 }, { "epoch": 0.8295422070148066, "grad_norm": 0.08199369439749662, "learning_rate": 2.3206157358352065e-06, "loss": 0.1844, "step": 26850 }, { "epoch": 0.8298511612818512, "grad_norm": 0.09945310347435897, "learning_rate": 2.3124336955620335e-06, "loss": 0.177, "step": 26860 }, { "epoch": 0.8301601155488959, "grad_norm": 0.11886878897328611, "learning_rate": 2.304264900054791e-06, "loss": 0.1846, "step": 26870 }, { "epoch": 0.8304690698159405, "grad_norm": 0.1053010572103629, "learning_rate": 2.29610935784103e-06, "loss": 0.1777, "step": 26880 }, { "epoch": 0.8307780240829851, "grad_norm": 0.10941271941189175, "learning_rate": 2.2879670774344524e-06, "loss": 0.1813, "step": 26890 }, { "epoch": 0.8310869783500298, "grad_norm": 0.1029952541818937, "learning_rate": 2.2798380673349286e-06, "loss": 0.1818, "step": 26900 }, { "epoch": 0.8313959326170743, "grad_norm": 0.09693366522100368, "learning_rate": 2.2717223360284645e-06, "loss": 0.18, "step": 26910 }, { "epoch": 0.831704886884119, "grad_norm": 0.1034666131190422, "learning_rate": 2.263619891987217e-06, "loss": 0.185, "step": 26920 }, { "epoch": 0.8320138411511636, "grad_norm": 0.10629781939319172, "learning_rate": 2.255530743669457e-06, "loss": 0.1832, "step": 26930 }, { "epoch": 0.8323227954182082, "grad_norm": 0.10468632450440872, "learning_rate": 2.2474548995195904e-06, "loss": 0.1795, "step": 26940 }, { "epoch": 0.8326317496852529, "grad_norm": 0.13031490570482887, "learning_rate": 2.2393923679681244e-06, "loss": 0.1883, "step": 26950 }, { "epoch": 0.8329407039522975, "grad_norm": 0.11430169231322544, "learning_rate": 2.2313431574316762e-06, "loss": 0.1784, "step": 26960 }, { "epoch": 0.833249658219342, "grad_norm": 0.10471546126512164, "learning_rate": 2.2233072763129474e-06, "loss": 0.1793, "step": 26970 }, { "epoch": 0.8335586124863867, "grad_norm": 0.10835248995073464, "learning_rate": 2.215284733000738e-06, "loss": 0.1885, "step": 26980 }, { "epoch": 0.8338675667534313, "grad_norm": 0.08384096342789985, "learning_rate": 2.2072755358699137e-06, "loss": 0.1836, "step": 26990 }, { "epoch": 0.8341765210204759, "grad_norm": 0.08235840301835703, "learning_rate": 2.199279693281408e-06, "loss": 0.1787, "step": 27000 }, { "epoch": 0.8344854752875206, "grad_norm": 0.09118788705424001, "learning_rate": 2.191297213582224e-06, "loss": 0.1797, "step": 27010 }, { "epoch": 0.8347944295545652, "grad_norm": 0.11678461718831075, "learning_rate": 2.1833281051053994e-06, "loss": 0.1785, "step": 27020 }, { "epoch": 0.8351033838216098, "grad_norm": 0.09495668764902095, "learning_rate": 2.1753723761700293e-06, "loss": 0.1844, "step": 27030 }, { "epoch": 0.8354123380886544, "grad_norm": 0.11089841329393607, "learning_rate": 2.167430035081229e-06, "loss": 0.1816, "step": 27040 }, { "epoch": 0.835721292355699, "grad_norm": 0.09614891688568993, "learning_rate": 2.159501090130148e-06, "loss": 0.1902, "step": 27050 }, { "epoch": 0.8360302466227437, "grad_norm": 0.10128158337378926, "learning_rate": 2.1515855495939453e-06, "loss": 0.1844, "step": 27060 }, { "epoch": 0.8363392008897883, "grad_norm": 0.10813236952458656, "learning_rate": 2.143683421735787e-06, "loss": 0.1783, "step": 27070 }, { "epoch": 0.8366481551568329, "grad_norm": 0.1138112838456115, "learning_rate": 2.1357947148048334e-06, "loss": 0.1792, "step": 27080 }, { "epoch": 0.8369571094238776, "grad_norm": 0.09877559199607756, "learning_rate": 2.1279194370362506e-06, "loss": 0.1842, "step": 27090 }, { "epoch": 0.8372660636909222, "grad_norm": 0.12273893003811713, "learning_rate": 2.120057596651163e-06, "loss": 0.1808, "step": 27100 }, { "epoch": 0.8375750179579667, "grad_norm": 0.1268366219120988, "learning_rate": 2.11220920185669e-06, "loss": 0.1798, "step": 27110 }, { "epoch": 0.8378839722250114, "grad_norm": 0.0893402573141918, "learning_rate": 2.104374260845896e-06, "loss": 0.1819, "step": 27120 }, { "epoch": 0.838192926492056, "grad_norm": 0.10505320963658311, "learning_rate": 2.096552781797815e-06, "loss": 0.1844, "step": 27130 }, { "epoch": 0.8385018807591006, "grad_norm": 0.11043174167319414, "learning_rate": 2.0887447728774167e-06, "loss": 0.1843, "step": 27140 }, { "epoch": 0.8388108350261453, "grad_norm": 0.10512945708601514, "learning_rate": 2.08095024223562e-06, "loss": 0.1823, "step": 27150 }, { "epoch": 0.8391197892931899, "grad_norm": 0.09004760228100747, "learning_rate": 2.07316919800926e-06, "loss": 0.1842, "step": 27160 }, { "epoch": 0.8394287435602344, "grad_norm": 0.10913220744232488, "learning_rate": 2.0654016483211096e-06, "loss": 0.1827, "step": 27170 }, { "epoch": 0.8397376978272791, "grad_norm": 0.09261622097557005, "learning_rate": 2.0576476012798383e-06, "loss": 0.181, "step": 27180 }, { "epoch": 0.8400466520943237, "grad_norm": 0.10220607992457943, "learning_rate": 2.0499070649800343e-06, "loss": 0.1915, "step": 27190 }, { "epoch": 0.8403556063613684, "grad_norm": 0.1064276272435423, "learning_rate": 2.042180047502169e-06, "loss": 0.1844, "step": 27200 }, { "epoch": 0.840664560628413, "grad_norm": 0.10315979384687024, "learning_rate": 2.0344665569126115e-06, "loss": 0.1803, "step": 27210 }, { "epoch": 0.8409735148954576, "grad_norm": 0.1091343413173612, "learning_rate": 2.0267666012636056e-06, "loss": 0.1802, "step": 27220 }, { "epoch": 0.8412824691625023, "grad_norm": 0.10350326492163203, "learning_rate": 2.019080188593258e-06, "loss": 0.1798, "step": 27230 }, { "epoch": 0.8415914234295468, "grad_norm": 0.0979584507564618, "learning_rate": 2.011407326925557e-06, "loss": 0.1793, "step": 27240 }, { "epoch": 0.8419003776965914, "grad_norm": 0.08892882254327583, "learning_rate": 2.0037480242703226e-06, "loss": 0.1866, "step": 27250 }, { "epoch": 0.8422093319636361, "grad_norm": 0.09177020165375138, "learning_rate": 1.99610228862324e-06, "loss": 0.1907, "step": 27260 }, { "epoch": 0.8425182862306807, "grad_norm": 0.1329194927914558, "learning_rate": 1.988470127965818e-06, "loss": 0.1792, "step": 27270 }, { "epoch": 0.8428272404977253, "grad_norm": 0.10081756590933665, "learning_rate": 1.9808515502653968e-06, "loss": 0.1868, "step": 27280 }, { "epoch": 0.84313619476477, "grad_norm": 0.08951332764717372, "learning_rate": 1.9732465634751444e-06, "loss": 0.1798, "step": 27290 }, { "epoch": 0.8434451490318146, "grad_norm": 0.09778559595819382, "learning_rate": 1.9656551755340308e-06, "loss": 0.1884, "step": 27300 }, { "epoch": 0.8437541032988591, "grad_norm": 0.11482275963719252, "learning_rate": 1.9580773943668346e-06, "loss": 0.1801, "step": 27310 }, { "epoch": 0.8440630575659038, "grad_norm": 0.12016895786340931, "learning_rate": 1.9505132278841364e-06, "loss": 0.1786, "step": 27320 }, { "epoch": 0.8443720118329484, "grad_norm": 0.09166064739050628, "learning_rate": 1.9429626839822886e-06, "loss": 0.1845, "step": 27330 }, { "epoch": 0.8446809660999931, "grad_norm": 0.09642933193736093, "learning_rate": 1.9354257705434417e-06, "loss": 0.1818, "step": 27340 }, { "epoch": 0.8449899203670377, "grad_norm": 0.08630323863341553, "learning_rate": 1.9279024954355e-06, "loss": 0.188, "step": 27350 }, { "epoch": 0.8452988746340823, "grad_norm": 0.09861229733485333, "learning_rate": 1.920392866512146e-06, "loss": 0.1831, "step": 27360 }, { "epoch": 0.845607828901127, "grad_norm": 0.12747007442421499, "learning_rate": 1.9128968916127993e-06, "loss": 0.186, "step": 27370 }, { "epoch": 0.8459167831681715, "grad_norm": 0.10135488698645409, "learning_rate": 1.9054145785626448e-06, "loss": 0.1839, "step": 27380 }, { "epoch": 0.8462257374352161, "grad_norm": 0.12100329091546554, "learning_rate": 1.8979459351725893e-06, "loss": 0.1782, "step": 27390 }, { "epoch": 0.8465346917022608, "grad_norm": 0.11794669930626465, "learning_rate": 1.8904909692392808e-06, "loss": 0.177, "step": 27400 }, { "epoch": 0.8468436459693054, "grad_norm": 0.10304527310776555, "learning_rate": 1.883049688545081e-06, "loss": 0.1834, "step": 27410 }, { "epoch": 0.84715260023635, "grad_norm": 0.11079993470676724, "learning_rate": 1.8756221008580742e-06, "loss": 0.178, "step": 27420 }, { "epoch": 0.8474615545033947, "grad_norm": 0.10529550594569145, "learning_rate": 1.86820821393204e-06, "loss": 0.1868, "step": 27430 }, { "epoch": 0.8477705087704392, "grad_norm": 0.09865252971963585, "learning_rate": 1.8608080355064654e-06, "loss": 0.1805, "step": 27440 }, { "epoch": 0.8480794630374838, "grad_norm": 0.11146268569008994, "learning_rate": 1.8534215733065195e-06, "loss": 0.187, "step": 27450 }, { "epoch": 0.8483884173045285, "grad_norm": 0.09285684930021669, "learning_rate": 1.8460488350430543e-06, "loss": 0.1798, "step": 27460 }, { "epoch": 0.8486973715715731, "grad_norm": 0.09441379882019936, "learning_rate": 1.8386898284126014e-06, "loss": 0.1825, "step": 27470 }, { "epoch": 0.8490063258386178, "grad_norm": 0.09828897913817153, "learning_rate": 1.8313445610973473e-06, "loss": 0.1798, "step": 27480 }, { "epoch": 0.8493152801056624, "grad_norm": 0.09712604689226159, "learning_rate": 1.8240130407651468e-06, "loss": 0.1811, "step": 27490 }, { "epoch": 0.849624234372707, "grad_norm": 0.09650863253736786, "learning_rate": 1.816695275069496e-06, "loss": 0.1903, "step": 27500 }, { "epoch": 0.8499331886397516, "grad_norm": 0.10159140277852872, "learning_rate": 1.8093912716495309e-06, "loss": 0.1823, "step": 27510 }, { "epoch": 0.8502421429067962, "grad_norm": 0.10257575057222768, "learning_rate": 1.8021010381300306e-06, "loss": 0.1842, "step": 27520 }, { "epoch": 0.8505510971738408, "grad_norm": 0.11416021507753527, "learning_rate": 1.7948245821213927e-06, "loss": 0.1819, "step": 27530 }, { "epoch": 0.8508600514408855, "grad_norm": 0.1096439031539581, "learning_rate": 1.7875619112196272e-06, "loss": 0.1806, "step": 27540 }, { "epoch": 0.8511690057079301, "grad_norm": 0.12544886724105817, "learning_rate": 1.7803130330063683e-06, "loss": 0.185, "step": 27550 }, { "epoch": 0.8514779599749747, "grad_norm": 0.08653214946357521, "learning_rate": 1.7730779550488357e-06, "loss": 0.1809, "step": 27560 }, { "epoch": 0.8517869142420194, "grad_norm": 0.09734798167744035, "learning_rate": 1.7658566848998564e-06, "loss": 0.179, "step": 27570 }, { "epoch": 0.8520958685090639, "grad_norm": 0.11337981038383727, "learning_rate": 1.7586492300978302e-06, "loss": 0.1794, "step": 27580 }, { "epoch": 0.8524048227761085, "grad_norm": 0.12537202799438474, "learning_rate": 1.7514555981667502e-06, "loss": 0.1771, "step": 27590 }, { "epoch": 0.8527137770431532, "grad_norm": 0.09630437412885685, "learning_rate": 1.744275796616161e-06, "loss": 0.1803, "step": 27600 }, { "epoch": 0.8530227313101978, "grad_norm": 0.13605555272709954, "learning_rate": 1.7371098329411889e-06, "loss": 0.1886, "step": 27610 }, { "epoch": 0.8533316855772425, "grad_norm": 0.10014710385574904, "learning_rate": 1.7299577146224965e-06, "loss": 0.1879, "step": 27620 }, { "epoch": 0.8536406398442871, "grad_norm": 0.1153995981947399, "learning_rate": 1.7228194491263083e-06, "loss": 0.192, "step": 27630 }, { "epoch": 0.8539495941113316, "grad_norm": 0.10125870202744387, "learning_rate": 1.7156950439043767e-06, "loss": 0.1804, "step": 27640 }, { "epoch": 0.8542585483783763, "grad_norm": 0.10037355542983763, "learning_rate": 1.7085845063939925e-06, "loss": 0.1795, "step": 27650 }, { "epoch": 0.8545675026454209, "grad_norm": 0.10366857000831413, "learning_rate": 1.7014878440179631e-06, "loss": 0.1815, "step": 27660 }, { "epoch": 0.8548764569124655, "grad_norm": 0.151792790296212, "learning_rate": 1.6944050641846193e-06, "loss": 0.1844, "step": 27670 }, { "epoch": 0.8551854111795102, "grad_norm": 0.11647008610684406, "learning_rate": 1.687336174287793e-06, "loss": 0.1799, "step": 27680 }, { "epoch": 0.8554943654465548, "grad_norm": 0.11979148085812992, "learning_rate": 1.6802811817068147e-06, "loss": 0.1862, "step": 27690 }, { "epoch": 0.8558033197135994, "grad_norm": 0.09533420254047427, "learning_rate": 1.6732400938065168e-06, "loss": 0.1822, "step": 27700 }, { "epoch": 0.856112273980644, "grad_norm": 0.09103012165005812, "learning_rate": 1.666212917937206e-06, "loss": 0.182, "step": 27710 }, { "epoch": 0.8564212282476886, "grad_norm": 0.10631327289049497, "learning_rate": 1.6591996614346743e-06, "loss": 0.1873, "step": 27720 }, { "epoch": 0.8567301825147332, "grad_norm": 0.09131778998475253, "learning_rate": 1.652200331620179e-06, "loss": 0.1856, "step": 27730 }, { "epoch": 0.8570391367817779, "grad_norm": 0.09480692081231779, "learning_rate": 1.645214935800438e-06, "loss": 0.1804, "step": 27740 }, { "epoch": 0.8573480910488225, "grad_norm": 0.10350323929665772, "learning_rate": 1.6382434812676206e-06, "loss": 0.184, "step": 27750 }, { "epoch": 0.8576570453158672, "grad_norm": 0.09218235957061148, "learning_rate": 1.631285975299353e-06, "loss": 0.1806, "step": 27760 }, { "epoch": 0.8579659995829118, "grad_norm": 0.10087742572076434, "learning_rate": 1.6243424251586892e-06, "loss": 0.1798, "step": 27770 }, { "epoch": 0.8582749538499563, "grad_norm": 0.10238011832918574, "learning_rate": 1.617412838094124e-06, "loss": 0.1898, "step": 27780 }, { "epoch": 0.858583908117001, "grad_norm": 0.10578972574701347, "learning_rate": 1.6104972213395658e-06, "loss": 0.1821, "step": 27790 }, { "epoch": 0.8588928623840456, "grad_norm": 0.09645345818839185, "learning_rate": 1.60359558211435e-06, "loss": 0.1796, "step": 27800 }, { "epoch": 0.8592018166510902, "grad_norm": 0.12193528467144923, "learning_rate": 1.5967079276232083e-06, "loss": 0.1865, "step": 27810 }, { "epoch": 0.8595107709181349, "grad_norm": 0.12029400636170644, "learning_rate": 1.5898342650562876e-06, "loss": 0.1811, "step": 27820 }, { "epoch": 0.8598197251851795, "grad_norm": 0.09458211987564656, "learning_rate": 1.5829746015891127e-06, "loss": 0.1804, "step": 27830 }, { "epoch": 0.860128679452224, "grad_norm": 0.10716056428475798, "learning_rate": 1.576128944382611e-06, "loss": 0.191, "step": 27840 }, { "epoch": 0.8604376337192687, "grad_norm": 0.11321245368661662, "learning_rate": 1.569297300583074e-06, "loss": 0.1854, "step": 27850 }, { "epoch": 0.8607465879863133, "grad_norm": 0.09429588332951563, "learning_rate": 1.5624796773221755e-06, "loss": 0.1846, "step": 27860 }, { "epoch": 0.8610555422533579, "grad_norm": 0.09995022017383527, "learning_rate": 1.5556760817169424e-06, "loss": 0.1876, "step": 27870 }, { "epoch": 0.8613644965204026, "grad_norm": 0.10694670781507666, "learning_rate": 1.5488865208697717e-06, "loss": 0.1828, "step": 27880 }, { "epoch": 0.8616734507874472, "grad_norm": 0.10492273533689082, "learning_rate": 1.542111001868392e-06, "loss": 0.1778, "step": 27890 }, { "epoch": 0.8619824050544919, "grad_norm": 0.09622600235705367, "learning_rate": 1.5353495317858923e-06, "loss": 0.1824, "step": 27900 }, { "epoch": 0.8622913593215364, "grad_norm": 0.09557007187218863, "learning_rate": 1.5286021176806797e-06, "loss": 0.1892, "step": 27910 }, { "epoch": 0.862600313588581, "grad_norm": 0.10837043908556614, "learning_rate": 1.5218687665964948e-06, "loss": 0.1813, "step": 27920 }, { "epoch": 0.8629092678556257, "grad_norm": 0.10418243323508425, "learning_rate": 1.5151494855623998e-06, "loss": 0.1833, "step": 27930 }, { "epoch": 0.8632182221226703, "grad_norm": 0.10555062612200378, "learning_rate": 1.5084442815927624e-06, "loss": 0.1838, "step": 27940 }, { "epoch": 0.8635271763897149, "grad_norm": 0.09854942097035761, "learning_rate": 1.5017531616872633e-06, "loss": 0.1847, "step": 27950 }, { "epoch": 0.8638361306567596, "grad_norm": 0.11210731673036076, "learning_rate": 1.4950761328308732e-06, "loss": 0.1798, "step": 27960 }, { "epoch": 0.8641450849238042, "grad_norm": 0.09776934590645367, "learning_rate": 1.4884132019938584e-06, "loss": 0.1787, "step": 27970 }, { "epoch": 0.8644540391908487, "grad_norm": 0.09231585913918622, "learning_rate": 1.4817643761317618e-06, "loss": 0.187, "step": 27980 }, { "epoch": 0.8647629934578934, "grad_norm": 0.09509297143759726, "learning_rate": 1.4751296621854093e-06, "loss": 0.1904, "step": 27990 }, { "epoch": 0.865071947724938, "grad_norm": 0.09832590235087585, "learning_rate": 1.4685090670808876e-06, "loss": 0.1876, "step": 28000 }, { "epoch": 0.8653809019919826, "grad_norm": 0.0848024447055945, "learning_rate": 1.461902597729553e-06, "loss": 0.1795, "step": 28010 }, { "epoch": 0.8656898562590273, "grad_norm": 0.0978696028892343, "learning_rate": 1.4553102610280056e-06, "loss": 0.1858, "step": 28020 }, { "epoch": 0.8659988105260719, "grad_norm": 0.10249821211189425, "learning_rate": 1.4487320638581053e-06, "loss": 0.1803, "step": 28030 }, { "epoch": 0.8663077647931166, "grad_norm": 0.10341479468088537, "learning_rate": 1.4421680130869342e-06, "loss": 0.1804, "step": 28040 }, { "epoch": 0.8666167190601611, "grad_norm": 0.10514413389059125, "learning_rate": 1.4356181155668258e-06, "loss": 0.183, "step": 28050 }, { "epoch": 0.8669256733272057, "grad_norm": 0.14211967646179546, "learning_rate": 1.4290823781353224e-06, "loss": 0.1889, "step": 28060 }, { "epoch": 0.8672346275942504, "grad_norm": 0.09157833309032826, "learning_rate": 1.4225608076151964e-06, "loss": 0.1815, "step": 28070 }, { "epoch": 0.867543581861295, "grad_norm": 0.1257052164852312, "learning_rate": 1.4160534108144218e-06, "loss": 0.1883, "step": 28080 }, { "epoch": 0.8678525361283396, "grad_norm": 0.11028424163893609, "learning_rate": 1.409560194526185e-06, "loss": 0.1843, "step": 28090 }, { "epoch": 0.8681614903953843, "grad_norm": 0.09495208514787493, "learning_rate": 1.4030811655288583e-06, "loss": 0.1795, "step": 28100 }, { "epoch": 0.8684704446624288, "grad_norm": 0.09347743867915254, "learning_rate": 1.3966163305860168e-06, "loss": 0.182, "step": 28110 }, { "epoch": 0.8687793989294734, "grad_norm": 0.10122675508013164, "learning_rate": 1.390165696446407e-06, "loss": 0.184, "step": 28120 }, { "epoch": 0.8690883531965181, "grad_norm": 0.08562657360948904, "learning_rate": 1.383729269843959e-06, "loss": 0.1847, "step": 28130 }, { "epoch": 0.8693973074635627, "grad_norm": 0.13636203679532566, "learning_rate": 1.3773070574977663e-06, "loss": 0.1896, "step": 28140 }, { "epoch": 0.8697062617306073, "grad_norm": 0.09964190624577135, "learning_rate": 1.3708990661120812e-06, "loss": 0.1868, "step": 28150 }, { "epoch": 0.870015215997652, "grad_norm": 0.08693331717577828, "learning_rate": 1.3645053023763227e-06, "loss": 0.1878, "step": 28160 }, { "epoch": 0.8703241702646966, "grad_norm": 0.11100386596048474, "learning_rate": 1.3581257729650448e-06, "loss": 0.1832, "step": 28170 }, { "epoch": 0.8706331245317412, "grad_norm": 0.1752273090891548, "learning_rate": 1.3517604845379455e-06, "loss": 0.1897, "step": 28180 }, { "epoch": 0.8709420787987858, "grad_norm": 0.08455187605101577, "learning_rate": 1.3454094437398595e-06, "loss": 0.1792, "step": 28190 }, { "epoch": 0.8712510330658304, "grad_norm": 0.11473654980052184, "learning_rate": 1.3390726572007483e-06, "loss": 0.1891, "step": 28200 }, { "epoch": 0.8715599873328751, "grad_norm": 0.09969500923639464, "learning_rate": 1.3327501315356838e-06, "loss": 0.1801, "step": 28210 }, { "epoch": 0.8718689415999197, "grad_norm": 0.13141736816992747, "learning_rate": 1.3264418733448647e-06, "loss": 0.1806, "step": 28220 }, { "epoch": 0.8721778958669643, "grad_norm": 0.09542391208819641, "learning_rate": 1.320147889213585e-06, "loss": 0.1784, "step": 28230 }, { "epoch": 0.872486850134009, "grad_norm": 0.10039030875895556, "learning_rate": 1.313868185712247e-06, "loss": 0.1783, "step": 28240 }, { "epoch": 0.8727958044010535, "grad_norm": 0.10101882821079176, "learning_rate": 1.3076027693963322e-06, "loss": 0.1851, "step": 28250 }, { "epoch": 0.8731047586680981, "grad_norm": 0.10052606067948769, "learning_rate": 1.3013516468064218e-06, "loss": 0.1783, "step": 28260 }, { "epoch": 0.8734137129351428, "grad_norm": 0.10399175254378158, "learning_rate": 1.2951148244681626e-06, "loss": 0.18, "step": 28270 }, { "epoch": 0.8737226672021874, "grad_norm": 0.10732087087718088, "learning_rate": 1.2888923088922866e-06, "loss": 0.1875, "step": 28280 }, { "epoch": 0.874031621469232, "grad_norm": 0.12140957936172334, "learning_rate": 1.2826841065745759e-06, "loss": 0.1799, "step": 28290 }, { "epoch": 0.8743405757362767, "grad_norm": 0.13048644050569663, "learning_rate": 1.2764902239958848e-06, "loss": 0.1965, "step": 28300 }, { "epoch": 0.8746495300033212, "grad_norm": 0.08823311951562303, "learning_rate": 1.2703106676221078e-06, "loss": 0.1899, "step": 28310 }, { "epoch": 0.8749584842703659, "grad_norm": 0.09118016851784427, "learning_rate": 1.2641454439041944e-06, "loss": 0.1795, "step": 28320 }, { "epoch": 0.8752674385374105, "grad_norm": 0.09712283858601073, "learning_rate": 1.2579945592781183e-06, "loss": 0.1779, "step": 28330 }, { "epoch": 0.8755763928044551, "grad_norm": 0.09555532218485613, "learning_rate": 1.2518580201649e-06, "loss": 0.1854, "step": 28340 }, { "epoch": 0.8758853470714998, "grad_norm": 0.10978721326611847, "learning_rate": 1.2457358329705714e-06, "loss": 0.1919, "step": 28350 }, { "epoch": 0.8761943013385444, "grad_norm": 0.09877770628622538, "learning_rate": 1.2396280040861923e-06, "loss": 0.1793, "step": 28360 }, { "epoch": 0.876503255605589, "grad_norm": 0.09840061995672636, "learning_rate": 1.2335345398878256e-06, "loss": 0.1849, "step": 28370 }, { "epoch": 0.8768122098726336, "grad_norm": 0.10842132193982282, "learning_rate": 1.2274554467365429e-06, "loss": 0.1859, "step": 28380 }, { "epoch": 0.8771211641396782, "grad_norm": 0.09453393531545135, "learning_rate": 1.2213907309784144e-06, "loss": 0.1812, "step": 28390 }, { "epoch": 0.8774301184067228, "grad_norm": 0.09734470135064001, "learning_rate": 1.2153403989444977e-06, "loss": 0.184, "step": 28400 }, { "epoch": 0.8777390726737675, "grad_norm": 0.10300073893610687, "learning_rate": 1.2093044569508371e-06, "loss": 0.1869, "step": 28410 }, { "epoch": 0.8780480269408121, "grad_norm": 0.09515505870941551, "learning_rate": 1.2032829112984573e-06, "loss": 0.1837, "step": 28420 }, { "epoch": 0.8783569812078567, "grad_norm": 0.10274256747449963, "learning_rate": 1.1972757682733497e-06, "loss": 0.1797, "step": 28430 }, { "epoch": 0.8786659354749013, "grad_norm": 0.11855058908400182, "learning_rate": 1.191283034146472e-06, "loss": 0.1818, "step": 28440 }, { "epoch": 0.8789748897419459, "grad_norm": 0.09452027487374964, "learning_rate": 1.1853047151737461e-06, "loss": 0.1805, "step": 28450 }, { "epoch": 0.8792838440089906, "grad_norm": 0.09298885395420112, "learning_rate": 1.179340817596035e-06, "loss": 0.1805, "step": 28460 }, { "epoch": 0.8795927982760352, "grad_norm": 0.11865235122609638, "learning_rate": 1.1733913476391572e-06, "loss": 0.1766, "step": 28470 }, { "epoch": 0.8799017525430798, "grad_norm": 0.09375499954975781, "learning_rate": 1.1674563115138609e-06, "loss": 0.1826, "step": 28480 }, { "epoch": 0.8802107068101245, "grad_norm": 0.10916748571180004, "learning_rate": 1.1615357154158367e-06, "loss": 0.1806, "step": 28490 }, { "epoch": 0.8805196610771691, "grad_norm": 0.10295313678788805, "learning_rate": 1.1556295655256894e-06, "loss": 0.1838, "step": 28500 }, { "epoch": 0.8808286153442136, "grad_norm": 0.1330241211205987, "learning_rate": 1.149737868008955e-06, "loss": 0.1912, "step": 28510 }, { "epoch": 0.8811375696112583, "grad_norm": 0.10395038689526555, "learning_rate": 1.1438606290160703e-06, "loss": 0.1808, "step": 28520 }, { "epoch": 0.8814465238783029, "grad_norm": 0.10450541582801408, "learning_rate": 1.1379978546823911e-06, "loss": 0.1851, "step": 28530 }, { "epoch": 0.8817554781453475, "grad_norm": 0.11449832943291852, "learning_rate": 1.1321495511281616e-06, "loss": 0.191, "step": 28540 }, { "epoch": 0.8820644324123922, "grad_norm": 0.088844180214581, "learning_rate": 1.1263157244585315e-06, "loss": 0.1818, "step": 28550 }, { "epoch": 0.8823733866794368, "grad_norm": 0.1045786256839124, "learning_rate": 1.1204963807635243e-06, "loss": 0.1819, "step": 28560 }, { "epoch": 0.8826823409464813, "grad_norm": 0.0917028822218501, "learning_rate": 1.1146915261180574e-06, "loss": 0.1862, "step": 28570 }, { "epoch": 0.882991295213526, "grad_norm": 0.09509239739242586, "learning_rate": 1.1089011665819132e-06, "loss": 0.1798, "step": 28580 }, { "epoch": 0.8833002494805706, "grad_norm": 0.08936307446247876, "learning_rate": 1.1031253081997517e-06, "loss": 0.1862, "step": 28590 }, { "epoch": 0.8836092037476153, "grad_norm": 0.09163361020269918, "learning_rate": 1.0973639570010845e-06, "loss": 0.1796, "step": 28600 }, { "epoch": 0.8839181580146599, "grad_norm": 0.10171996435464667, "learning_rate": 1.0916171190002844e-06, "loss": 0.179, "step": 28610 }, { "epoch": 0.8842271122817045, "grad_norm": 0.10084838396440059, "learning_rate": 1.085884800196577e-06, "loss": 0.1874, "step": 28620 }, { "epoch": 0.8845360665487492, "grad_norm": 0.09798025936235355, "learning_rate": 1.0801670065740226e-06, "loss": 0.1763, "step": 28630 }, { "epoch": 0.8848450208157937, "grad_norm": 0.09644348296261553, "learning_rate": 1.0744637441015248e-06, "loss": 0.1812, "step": 28640 }, { "epoch": 0.8851539750828383, "grad_norm": 0.09165574629779512, "learning_rate": 1.0687750187328176e-06, "loss": 0.1763, "step": 28650 }, { "epoch": 0.885462929349883, "grad_norm": 0.09126176597133256, "learning_rate": 1.0631008364064575e-06, "loss": 0.1819, "step": 28660 }, { "epoch": 0.8857718836169276, "grad_norm": 0.10177329966346589, "learning_rate": 1.0574412030458163e-06, "loss": 0.1843, "step": 28670 }, { "epoch": 0.8860808378839722, "grad_norm": 0.09142026877755322, "learning_rate": 1.0517961245590862e-06, "loss": 0.1783, "step": 28680 }, { "epoch": 0.8863897921510169, "grad_norm": 0.10315681333619384, "learning_rate": 1.0461656068392568e-06, "loss": 0.1794, "step": 28690 }, { "epoch": 0.8866987464180615, "grad_norm": 0.10808907562664705, "learning_rate": 1.0405496557641241e-06, "loss": 0.1844, "step": 28700 }, { "epoch": 0.887007700685106, "grad_norm": 0.08912185514779751, "learning_rate": 1.0349482771962727e-06, "loss": 0.1796, "step": 28710 }, { "epoch": 0.8873166549521507, "grad_norm": 0.10877406337780003, "learning_rate": 1.0293614769830794e-06, "loss": 0.1893, "step": 28720 }, { "epoch": 0.8876256092191953, "grad_norm": 0.10967357711705748, "learning_rate": 1.0237892609566962e-06, "loss": 0.1807, "step": 28730 }, { "epoch": 0.88793456348624, "grad_norm": 0.08563485597898633, "learning_rate": 1.0182316349340565e-06, "loss": 0.1811, "step": 28740 }, { "epoch": 0.8882435177532846, "grad_norm": 0.10322718457396875, "learning_rate": 1.0126886047168592e-06, "loss": 0.1772, "step": 28750 }, { "epoch": 0.8885524720203292, "grad_norm": 0.0910924622392542, "learning_rate": 1.007160176091571e-06, "loss": 0.1842, "step": 28760 }, { "epoch": 0.8888614262873739, "grad_norm": 0.09560388790456413, "learning_rate": 1.001646354829407e-06, "loss": 0.1827, "step": 28770 }, { "epoch": 0.8891703805544184, "grad_norm": 0.1110580301572007, "learning_rate": 9.961471466863442e-07, "loss": 0.1808, "step": 28780 }, { "epoch": 0.889479334821463, "grad_norm": 0.09320975049039067, "learning_rate": 9.906625574030942e-07, "loss": 0.1839, "step": 28790 }, { "epoch": 0.8897882890885077, "grad_norm": 0.08565687704294897, "learning_rate": 9.85192592705117e-07, "loss": 0.1838, "step": 28800 }, { "epoch": 0.8900972433555523, "grad_norm": 0.0949658311326804, "learning_rate": 9.797372583026e-07, "loss": 0.1865, "step": 28810 }, { "epoch": 0.8904061976225969, "grad_norm": 0.10439513176418971, "learning_rate": 9.742965598904608e-07, "loss": 0.1819, "step": 28820 }, { "epoch": 0.8907151518896416, "grad_norm": 0.10962135114351249, "learning_rate": 9.688705031483347e-07, "loss": 0.1847, "step": 28830 }, { "epoch": 0.8910241061566861, "grad_norm": 0.08592384740259364, "learning_rate": 9.634590937405742e-07, "loss": 0.1875, "step": 28840 }, { "epoch": 0.8913330604237307, "grad_norm": 0.08823603395008275, "learning_rate": 9.58062337316245e-07, "loss": 0.1824, "step": 28850 }, { "epoch": 0.8916420146907754, "grad_norm": 0.09501198504753797, "learning_rate": 9.526802395091111e-07, "loss": 0.1779, "step": 28860 }, { "epoch": 0.89195096895782, "grad_norm": 0.10401951929700107, "learning_rate": 9.473128059376345e-07, "loss": 0.1849, "step": 28870 }, { "epoch": 0.8922599232248647, "grad_norm": 0.09717035872126756, "learning_rate": 9.419600422049685e-07, "loss": 0.1753, "step": 28880 }, { "epoch": 0.8925688774919093, "grad_norm": 0.10169162217251684, "learning_rate": 9.366219538989595e-07, "loss": 0.1814, "step": 28890 }, { "epoch": 0.8928778317589539, "grad_norm": 0.11756627345092277, "learning_rate": 9.312985465921237e-07, "loss": 0.187, "step": 28900 }, { "epoch": 0.8931867860259985, "grad_norm": 0.09539420893620165, "learning_rate": 9.259898258416588e-07, "loss": 0.1878, "step": 28910 }, { "epoch": 0.8934957402930431, "grad_norm": 0.09122934330826722, "learning_rate": 9.20695797189427e-07, "loss": 0.1862, "step": 28920 }, { "epoch": 0.8938046945600877, "grad_norm": 0.10825014330310029, "learning_rate": 9.15416466161959e-07, "loss": 0.1807, "step": 28930 }, { "epoch": 0.8941136488271324, "grad_norm": 0.07922736658973209, "learning_rate": 9.101518382704316e-07, "loss": 0.1873, "step": 28940 }, { "epoch": 0.894422603094177, "grad_norm": 0.11104154065294454, "learning_rate": 9.04901919010685e-07, "loss": 0.1877, "step": 28950 }, { "epoch": 0.8947315573612216, "grad_norm": 0.08437513465972055, "learning_rate": 8.996667138631937e-07, "loss": 0.1798, "step": 28960 }, { "epoch": 0.8950405116282663, "grad_norm": 0.09917149705694504, "learning_rate": 8.944462282930827e-07, "loss": 0.1835, "step": 28970 }, { "epoch": 0.8953494658953108, "grad_norm": 0.09337580339209801, "learning_rate": 8.89240467750101e-07, "loss": 0.1829, "step": 28980 }, { "epoch": 0.8956584201623554, "grad_norm": 0.10589561235209922, "learning_rate": 8.840494376686348e-07, "loss": 0.1795, "step": 28990 }, { "epoch": 0.8959673744294001, "grad_norm": 0.09909898805593338, "learning_rate": 8.788731434676827e-07, "loss": 0.1784, "step": 29000 }, { "epoch": 0.8962763286964447, "grad_norm": 0.1153548371364837, "learning_rate": 8.737115905508741e-07, "loss": 0.18, "step": 29010 }, { "epoch": 0.8965852829634894, "grad_norm": 0.10130362018181113, "learning_rate": 8.685647843064337e-07, "loss": 0.1808, "step": 29020 }, { "epoch": 0.896894237230534, "grad_norm": 0.10985169077075072, "learning_rate": 8.634327301072054e-07, "loss": 0.1801, "step": 29030 }, { "epoch": 0.8972031914975785, "grad_norm": 0.08066976093560242, "learning_rate": 8.583154333106269e-07, "loss": 0.1851, "step": 29040 }, { "epoch": 0.8975121457646232, "grad_norm": 0.10468372757573076, "learning_rate": 8.532128992587268e-07, "loss": 0.1785, "step": 29050 }, { "epoch": 0.8978211000316678, "grad_norm": 0.1102368775175334, "learning_rate": 8.481251332781343e-07, "loss": 0.1826, "step": 29060 }, { "epoch": 0.8981300542987124, "grad_norm": 0.12688113008564225, "learning_rate": 8.430521406800473e-07, "loss": 0.1791, "step": 29070 }, { "epoch": 0.8984390085657571, "grad_norm": 0.10385916131286953, "learning_rate": 8.379939267602549e-07, "loss": 0.185, "step": 29080 }, { "epoch": 0.8987479628328017, "grad_norm": 0.09292387950499816, "learning_rate": 8.329504967991114e-07, "loss": 0.1811, "step": 29090 }, { "epoch": 0.8990569170998463, "grad_norm": 0.0910318387040759, "learning_rate": 8.279218560615354e-07, "loss": 0.1804, "step": 29100 }, { "epoch": 0.899365871366891, "grad_norm": 0.10269314685387879, "learning_rate": 8.229080097970126e-07, "loss": 0.1807, "step": 29110 }, { "epoch": 0.8996748256339355, "grad_norm": 0.12226884148247592, "learning_rate": 8.179089632395831e-07, "loss": 0.1813, "step": 29120 }, { "epoch": 0.8999837799009801, "grad_norm": 0.10890893639008979, "learning_rate": 8.129247216078339e-07, "loss": 0.1803, "step": 29130 }, { "epoch": 0.9002927341680248, "grad_norm": 0.10686181822274243, "learning_rate": 8.079552901049031e-07, "loss": 0.1792, "step": 29140 }, { "epoch": 0.9006016884350694, "grad_norm": 0.11025449599331412, "learning_rate": 8.030006739184609e-07, "loss": 0.1856, "step": 29150 }, { "epoch": 0.9009106427021141, "grad_norm": 0.09973545902540376, "learning_rate": 7.9806087822072e-07, "loss": 0.1786, "step": 29160 }, { "epoch": 0.9012195969691587, "grad_norm": 0.08963783098207974, "learning_rate": 7.93135908168412e-07, "loss": 0.1813, "step": 29170 }, { "epoch": 0.9015285512362032, "grad_norm": 0.11163810463198195, "learning_rate": 7.882257689028011e-07, "loss": 0.1841, "step": 29180 }, { "epoch": 0.9018375055032479, "grad_norm": 0.10297976402767278, "learning_rate": 7.833304655496604e-07, "loss": 0.1788, "step": 29190 }, { "epoch": 0.9021464597702925, "grad_norm": 0.11550709958897573, "learning_rate": 7.784500032192853e-07, "loss": 0.1788, "step": 29200 }, { "epoch": 0.9024554140373371, "grad_norm": 0.10279863424340445, "learning_rate": 7.735843870064674e-07, "loss": 0.1833, "step": 29210 }, { "epoch": 0.9027643683043818, "grad_norm": 0.09129452739293219, "learning_rate": 7.687336219905134e-07, "loss": 0.1879, "step": 29220 }, { "epoch": 0.9030733225714264, "grad_norm": 0.10351253777475707, "learning_rate": 7.638977132352127e-07, "loss": 0.187, "step": 29230 }, { "epoch": 0.9033822768384709, "grad_norm": 0.11887945138669731, "learning_rate": 7.59076665788857e-07, "loss": 0.183, "step": 29240 }, { "epoch": 0.9036912311055156, "grad_norm": 0.09319794345234407, "learning_rate": 7.542704846842157e-07, "loss": 0.1803, "step": 29250 }, { "epoch": 0.9040001853725602, "grad_norm": 0.11671175457831218, "learning_rate": 7.494791749385472e-07, "loss": 0.1942, "step": 29260 }, { "epoch": 0.9043091396396048, "grad_norm": 0.09578553004986752, "learning_rate": 7.447027415535784e-07, "loss": 0.1829, "step": 29270 }, { "epoch": 0.9046180939066495, "grad_norm": 0.10359140237182186, "learning_rate": 7.399411895155095e-07, "loss": 0.1816, "step": 29280 }, { "epoch": 0.9049270481736941, "grad_norm": 0.10032944776746226, "learning_rate": 7.351945237950059e-07, "loss": 0.187, "step": 29290 }, { "epoch": 0.9052360024407388, "grad_norm": 0.10687990697775257, "learning_rate": 7.304627493471938e-07, "loss": 0.1812, "step": 29300 }, { "epoch": 0.9055449567077833, "grad_norm": 0.09628847482176447, "learning_rate": 7.257458711116488e-07, "loss": 0.1808, "step": 29310 }, { "epoch": 0.9058539109748279, "grad_norm": 0.10458719491106042, "learning_rate": 7.210438940124054e-07, "loss": 0.1781, "step": 29320 }, { "epoch": 0.9061628652418726, "grad_norm": 0.10001730084102851, "learning_rate": 7.163568229579337e-07, "loss": 0.1825, "step": 29330 }, { "epoch": 0.9064718195089172, "grad_norm": 0.0991383494556449, "learning_rate": 7.116846628411467e-07, "loss": 0.1896, "step": 29340 }, { "epoch": 0.9067807737759618, "grad_norm": 0.09150038024499577, "learning_rate": 7.07027418539396e-07, "loss": 0.1774, "step": 29350 }, { "epoch": 0.9070897280430065, "grad_norm": 0.09839956558796607, "learning_rate": 7.023850949144511e-07, "loss": 0.1801, "step": 29360 }, { "epoch": 0.907398682310051, "grad_norm": 0.09434902663828376, "learning_rate": 6.977576968125205e-07, "loss": 0.1785, "step": 29370 }, { "epoch": 0.9077076365770956, "grad_norm": 0.1058722958615921, "learning_rate": 6.931452290642149e-07, "loss": 0.1854, "step": 29380 }, { "epoch": 0.9080165908441403, "grad_norm": 0.09113919344971512, "learning_rate": 6.885476964845761e-07, "loss": 0.1837, "step": 29390 }, { "epoch": 0.9083255451111849, "grad_norm": 0.08606955551639808, "learning_rate": 6.839651038730399e-07, "loss": 0.18, "step": 29400 }, { "epoch": 0.9086344993782295, "grad_norm": 0.10582500713480847, "learning_rate": 6.793974560134581e-07, "loss": 0.1823, "step": 29410 }, { "epoch": 0.9089434536452742, "grad_norm": 0.10987654427791921, "learning_rate": 6.748447576740713e-07, "loss": 0.1832, "step": 29420 }, { "epoch": 0.9092524079123188, "grad_norm": 0.10692373351150367, "learning_rate": 6.703070136075212e-07, "loss": 0.1787, "step": 29430 }, { "epoch": 0.9095613621793635, "grad_norm": 0.1023497268432669, "learning_rate": 6.657842285508337e-07, "loss": 0.1782, "step": 29440 }, { "epoch": 0.909870316446408, "grad_norm": 0.09836289116065891, "learning_rate": 6.612764072254251e-07, "loss": 0.1788, "step": 29450 }, { "epoch": 0.9101792707134526, "grad_norm": 0.10072022627718054, "learning_rate": 6.567835543370815e-07, "loss": 0.1873, "step": 29460 }, { "epoch": 0.9104882249804973, "grad_norm": 0.10000037604708514, "learning_rate": 6.523056745759726e-07, "loss": 0.1796, "step": 29470 }, { "epoch": 0.9107971792475419, "grad_norm": 0.09392768576244617, "learning_rate": 6.478427726166275e-07, "loss": 0.1812, "step": 29480 }, { "epoch": 0.9111061335145865, "grad_norm": 0.10380911407942348, "learning_rate": 6.433948531179529e-07, "loss": 0.1831, "step": 29490 }, { "epoch": 0.9114150877816312, "grad_norm": 0.11039595676455562, "learning_rate": 6.389619207232028e-07, "loss": 0.1816, "step": 29500 }, { "epoch": 0.9117240420486757, "grad_norm": 0.09300630064449593, "learning_rate": 6.345439800599906e-07, "loss": 0.1796, "step": 29510 }, { "epoch": 0.9120329963157203, "grad_norm": 0.09061544982607218, "learning_rate": 6.30141035740282e-07, "loss": 0.1854, "step": 29520 }, { "epoch": 0.912341950582765, "grad_norm": 0.09678414878726044, "learning_rate": 6.257530923603838e-07, "loss": 0.1913, "step": 29530 }, { "epoch": 0.9126509048498096, "grad_norm": 0.10063326496934295, "learning_rate": 6.213801545009417e-07, "loss": 0.1803, "step": 29540 }, { "epoch": 0.9129598591168542, "grad_norm": 0.08755798029903349, "learning_rate": 6.170222267269459e-07, "loss": 0.1808, "step": 29550 }, { "epoch": 0.9132688133838989, "grad_norm": 0.09012231097949266, "learning_rate": 6.126793135877074e-07, "loss": 0.1865, "step": 29560 }, { "epoch": 0.9135777676509435, "grad_norm": 0.102220680898261, "learning_rate": 6.083514196168661e-07, "loss": 0.1858, "step": 29570 }, { "epoch": 0.9138867219179881, "grad_norm": 0.10101546876552676, "learning_rate": 6.04038549332388e-07, "loss": 0.1816, "step": 29580 }, { "epoch": 0.9141956761850327, "grad_norm": 0.09639490661184655, "learning_rate": 5.997407072365452e-07, "loss": 0.1849, "step": 29590 }, { "epoch": 0.9145046304520773, "grad_norm": 0.10361945677028739, "learning_rate": 5.954578978159353e-07, "loss": 0.1789, "step": 29600 }, { "epoch": 0.914813584719122, "grad_norm": 0.10471801022044404, "learning_rate": 5.911901255414504e-07, "loss": 0.1931, "step": 29610 }, { "epoch": 0.9151225389861666, "grad_norm": 0.10896051572266999, "learning_rate": 5.869373948682949e-07, "loss": 0.1875, "step": 29620 }, { "epoch": 0.9154314932532112, "grad_norm": 0.08887670788550045, "learning_rate": 5.826997102359627e-07, "loss": 0.1757, "step": 29630 }, { "epoch": 0.9157404475202559, "grad_norm": 0.10405189299446264, "learning_rate": 5.784770760682467e-07, "loss": 0.178, "step": 29640 }, { "epoch": 0.9160494017873004, "grad_norm": 0.10633378942465, "learning_rate": 5.742694967732259e-07, "loss": 0.1862, "step": 29650 }, { "epoch": 0.916358356054345, "grad_norm": 0.10989376626685819, "learning_rate": 5.700769767432634e-07, "loss": 0.185, "step": 29660 }, { "epoch": 0.9166673103213897, "grad_norm": 0.10611869518374586, "learning_rate": 5.658995203550033e-07, "loss": 0.1957, "step": 29670 }, { "epoch": 0.9169762645884343, "grad_norm": 0.09543215073081474, "learning_rate": 5.61737131969362e-07, "loss": 0.1836, "step": 29680 }, { "epoch": 0.9172852188554789, "grad_norm": 0.10380110789519864, "learning_rate": 5.575898159315273e-07, "loss": 0.1867, "step": 29690 }, { "epoch": 0.9175941731225236, "grad_norm": 0.10505856858325727, "learning_rate": 5.534575765709559e-07, "loss": 0.1761, "step": 29700 }, { "epoch": 0.9179031273895681, "grad_norm": 0.0918711821906746, "learning_rate": 5.493404182013573e-07, "loss": 0.178, "step": 29710 }, { "epoch": 0.9182120816566128, "grad_norm": 0.09006996867686663, "learning_rate": 5.452383451207083e-07, "loss": 0.185, "step": 29720 }, { "epoch": 0.9185210359236574, "grad_norm": 0.08282612653706493, "learning_rate": 5.411513616112302e-07, "loss": 0.1803, "step": 29730 }, { "epoch": 0.918829990190702, "grad_norm": 0.1029773562128662, "learning_rate": 5.370794719393934e-07, "loss": 0.1792, "step": 29740 }, { "epoch": 0.9191389444577467, "grad_norm": 0.10228421669832138, "learning_rate": 5.330226803559146e-07, "loss": 0.183, "step": 29750 }, { "epoch": 0.9194478987247913, "grad_norm": 0.09512145208943297, "learning_rate": 5.289809910957477e-07, "loss": 0.1794, "step": 29760 }, { "epoch": 0.9197568529918358, "grad_norm": 0.10968131771617376, "learning_rate": 5.249544083780761e-07, "loss": 0.1793, "step": 29770 }, { "epoch": 0.9200658072588805, "grad_norm": 0.0944946814003296, "learning_rate": 5.209429364063256e-07, "loss": 0.183, "step": 29780 }, { "epoch": 0.9203747615259251, "grad_norm": 0.08907065647558841, "learning_rate": 5.16946579368135e-07, "loss": 0.1853, "step": 29790 }, { "epoch": 0.9206837157929697, "grad_norm": 0.10808489485030408, "learning_rate": 5.129653414353685e-07, "loss": 0.1823, "step": 29800 }, { "epoch": 0.9209926700600144, "grad_norm": 0.09297344698286493, "learning_rate": 5.089992267641119e-07, "loss": 0.1771, "step": 29810 }, { "epoch": 0.921301624327059, "grad_norm": 0.10569189195410038, "learning_rate": 5.050482394946549e-07, "loss": 0.1818, "step": 29820 }, { "epoch": 0.9216105785941036, "grad_norm": 0.1031636284663917, "learning_rate": 5.011123837515047e-07, "loss": 0.1797, "step": 29830 }, { "epoch": 0.9219195328611482, "grad_norm": 0.09695635680227692, "learning_rate": 4.971916636433633e-07, "loss": 0.184, "step": 29840 }, { "epoch": 0.9222284871281928, "grad_norm": 0.10368983285980565, "learning_rate": 4.932860832631431e-07, "loss": 0.1808, "step": 29850 }, { "epoch": 0.9225374413952375, "grad_norm": 0.12075062802191754, "learning_rate": 4.893956466879395e-07, "loss": 0.1828, "step": 29860 }, { "epoch": 0.9228463956622821, "grad_norm": 0.10468553119957653, "learning_rate": 4.855203579790506e-07, "loss": 0.1881, "step": 29870 }, { "epoch": 0.9231553499293267, "grad_norm": 0.08886598344406338, "learning_rate": 4.816602211819504e-07, "loss": 0.1813, "step": 29880 }, { "epoch": 0.9234643041963714, "grad_norm": 0.09755390173940867, "learning_rate": 4.778152403263087e-07, "loss": 0.1839, "step": 29890 }, { "epoch": 0.923773258463416, "grad_norm": 0.11342180876900834, "learning_rate": 4.7398541942595797e-07, "loss": 0.1881, "step": 29900 }, { "epoch": 0.9240822127304605, "grad_norm": 0.10265418369914854, "learning_rate": 4.701707624789203e-07, "loss": 0.1812, "step": 29910 }, { "epoch": 0.9243911669975052, "grad_norm": 0.0860545327122374, "learning_rate": 4.66371273467377e-07, "loss": 0.1829, "step": 29920 }, { "epoch": 0.9247001212645498, "grad_norm": 0.0869079032605448, "learning_rate": 4.6258695635768013e-07, "loss": 0.1831, "step": 29930 }, { "epoch": 0.9250090755315944, "grad_norm": 0.08322258554903587, "learning_rate": 4.5881781510034306e-07, "loss": 0.18, "step": 29940 }, { "epoch": 0.9253180297986391, "grad_norm": 0.11584902980690737, "learning_rate": 4.5506385363003666e-07, "loss": 0.1811, "step": 29950 }, { "epoch": 0.9256269840656837, "grad_norm": 0.11449546928077053, "learning_rate": 4.513250758655846e-07, "loss": 0.179, "step": 29960 }, { "epoch": 0.9259359383327282, "grad_norm": 0.09511237831508776, "learning_rate": 4.476014857099581e-07, "loss": 0.1808, "step": 29970 }, { "epoch": 0.9262448925997729, "grad_norm": 0.09777927733821344, "learning_rate": 4.438930870502794e-07, "loss": 0.1815, "step": 29980 }, { "epoch": 0.9265538468668175, "grad_norm": 0.09791697691167553, "learning_rate": 4.401998837578086e-07, "loss": 0.1795, "step": 29990 }, { "epoch": 0.9268628011338622, "grad_norm": 0.09410905844978593, "learning_rate": 4.3652187968794166e-07, "loss": 0.1823, "step": 30000 }, { "epoch": 0.9271717554009068, "grad_norm": 0.11233822212239762, "learning_rate": 4.328590786802089e-07, "loss": 0.187, "step": 30010 }, { "epoch": 0.9274807096679514, "grad_norm": 0.10961403369670594, "learning_rate": 4.292114845582751e-07, "loss": 0.1819, "step": 30020 }, { "epoch": 0.9277896639349961, "grad_norm": 0.09096732247698736, "learning_rate": 4.255791011299193e-07, "loss": 0.1821, "step": 30030 }, { "epoch": 0.9280986182020406, "grad_norm": 0.10444246248378831, "learning_rate": 4.2196193218705327e-07, "loss": 0.1812, "step": 30040 }, { "epoch": 0.9284075724690852, "grad_norm": 0.09909974058953407, "learning_rate": 4.183599815056982e-07, "loss": 0.1809, "step": 30050 }, { "epoch": 0.9287165267361299, "grad_norm": 0.10396932966086093, "learning_rate": 4.14773252845993e-07, "loss": 0.1814, "step": 30060 }, { "epoch": 0.9290254810031745, "grad_norm": 0.10829532855810024, "learning_rate": 4.1120174995218417e-07, "loss": 0.1786, "step": 30070 }, { "epoch": 0.9293344352702191, "grad_norm": 0.10929377514996606, "learning_rate": 4.076454765526244e-07, "loss": 0.1787, "step": 30080 }, { "epoch": 0.9296433895372638, "grad_norm": 0.0870955571350648, "learning_rate": 4.0410443635976737e-07, "loss": 0.1767, "step": 30090 }, { "epoch": 0.9299523438043084, "grad_norm": 0.09428167257410036, "learning_rate": 4.0057863307016775e-07, "loss": 0.1802, "step": 30100 }, { "epoch": 0.9302612980713529, "grad_norm": 0.12259391774896332, "learning_rate": 3.9706807036446635e-07, "loss": 0.1816, "step": 30110 }, { "epoch": 0.9305702523383976, "grad_norm": 0.10314358065032986, "learning_rate": 3.9357275190740503e-07, "loss": 0.1848, "step": 30120 }, { "epoch": 0.9308792066054422, "grad_norm": 0.10591627551081069, "learning_rate": 3.900926813478001e-07, "loss": 0.185, "step": 30130 }, { "epoch": 0.9311881608724869, "grad_norm": 0.08618325405762615, "learning_rate": 3.8662786231856204e-07, "loss": 0.1819, "step": 30140 }, { "epoch": 0.9314971151395315, "grad_norm": 0.11218447029041394, "learning_rate": 3.831782984366694e-07, "loss": 0.1844, "step": 30150 }, { "epoch": 0.9318060694065761, "grad_norm": 0.08629681276531403, "learning_rate": 3.7974399330318487e-07, "loss": 0.1824, "step": 30160 }, { "epoch": 0.9321150236736208, "grad_norm": 0.09666582244614211, "learning_rate": 3.76324950503234e-07, "loss": 0.183, "step": 30170 }, { "epoch": 0.9324239779406653, "grad_norm": 0.11795066468193195, "learning_rate": 3.7292117360601676e-07, "loss": 0.1799, "step": 30180 }, { "epoch": 0.9327329322077099, "grad_norm": 0.0907756439699543, "learning_rate": 3.6953266616479077e-07, "loss": 0.1783, "step": 30190 }, { "epoch": 0.9330418864747546, "grad_norm": 0.10095984370387073, "learning_rate": 3.661594317168782e-07, "loss": 0.1811, "step": 30200 }, { "epoch": 0.9333508407417992, "grad_norm": 0.09059533748283437, "learning_rate": 3.6280147378365714e-07, "loss": 0.1856, "step": 30210 }, { "epoch": 0.9336597950088438, "grad_norm": 0.10715326895262456, "learning_rate": 3.594587958705553e-07, "loss": 0.1763, "step": 30220 }, { "epoch": 0.9339687492758885, "grad_norm": 0.0961058305704472, "learning_rate": 3.5613140146705137e-07, "loss": 0.1782, "step": 30230 }, { "epoch": 0.934277703542933, "grad_norm": 0.09863720445187962, "learning_rate": 3.5281929404666515e-07, "loss": 0.1974, "step": 30240 }, { "epoch": 0.9345866578099776, "grad_norm": 0.09458104062093899, "learning_rate": 3.495224770669675e-07, "loss": 0.1791, "step": 30250 }, { "epoch": 0.9348956120770223, "grad_norm": 0.11277168452021748, "learning_rate": 3.462409539695588e-07, "loss": 0.1813, "step": 30260 }, { "epoch": 0.9352045663440669, "grad_norm": 0.09949895573709748, "learning_rate": 3.4297472818007714e-07, "loss": 0.1834, "step": 30270 }, { "epoch": 0.9355135206111116, "grad_norm": 0.09233645489688903, "learning_rate": 3.3972380310819174e-07, "loss": 0.1809, "step": 30280 }, { "epoch": 0.9358224748781562, "grad_norm": 0.10647168986460545, "learning_rate": 3.364881821475979e-07, "loss": 0.1804, "step": 30290 }, { "epoch": 0.9361314291452008, "grad_norm": 0.09382266832164135, "learning_rate": 3.332678686760138e-07, "loss": 0.1854, "step": 30300 }, { "epoch": 0.9364403834122454, "grad_norm": 0.12441950134822441, "learning_rate": 3.3006286605518356e-07, "loss": 0.1828, "step": 30310 }, { "epoch": 0.93674933767929, "grad_norm": 0.12066314839556808, "learning_rate": 3.2687317763085923e-07, "loss": 0.1792, "step": 30320 }, { "epoch": 0.9370582919463346, "grad_norm": 0.09219619759749195, "learning_rate": 3.236988067328139e-07, "loss": 0.1855, "step": 30330 }, { "epoch": 0.9373672462133793, "grad_norm": 0.10156500316665396, "learning_rate": 3.205397566748236e-07, "loss": 0.1804, "step": 30340 }, { "epoch": 0.9376762004804239, "grad_norm": 0.1350016992242699, "learning_rate": 3.173960307546786e-07, "loss": 0.1839, "step": 30350 }, { "epoch": 0.9379851547474685, "grad_norm": 0.10143674237682757, "learning_rate": 3.142676322541654e-07, "loss": 0.1822, "step": 30360 }, { "epoch": 0.9382941090145132, "grad_norm": 0.09664601462339328, "learning_rate": 3.111545644390751e-07, "loss": 0.1867, "step": 30370 }, { "epoch": 0.9386030632815577, "grad_norm": 0.11019493789410766, "learning_rate": 3.0805683055918633e-07, "loss": 0.1833, "step": 30380 }, { "epoch": 0.9389120175486023, "grad_norm": 0.11548150338207855, "learning_rate": 3.0497443384828237e-07, "loss": 0.1846, "step": 30390 }, { "epoch": 0.939220971815647, "grad_norm": 0.09089515510412766, "learning_rate": 3.0190737752412434e-07, "loss": 0.1798, "step": 30400 }, { "epoch": 0.9395299260826916, "grad_norm": 0.09324919353782359, "learning_rate": 2.9885566478846937e-07, "loss": 0.1834, "step": 30410 }, { "epoch": 0.9398388803497363, "grad_norm": 0.09823795553559096, "learning_rate": 2.9581929882704917e-07, "loss": 0.1798, "step": 30420 }, { "epoch": 0.9401478346167809, "grad_norm": 0.10024622202052019, "learning_rate": 2.9279828280957656e-07, "loss": 0.1906, "step": 30430 }, { "epoch": 0.9404567888838254, "grad_norm": 0.09194187185121717, "learning_rate": 2.897926198897438e-07, "loss": 0.1852, "step": 30440 }, { "epoch": 0.9407657431508701, "grad_norm": 0.08630621901403171, "learning_rate": 2.8680231320521276e-07, "loss": 0.1801, "step": 30450 }, { "epoch": 0.9410746974179147, "grad_norm": 0.10337890614170019, "learning_rate": 2.838273658776164e-07, "loss": 0.1802, "step": 30460 }, { "epoch": 0.9413836516849593, "grad_norm": 0.11286791158076939, "learning_rate": 2.808677810125504e-07, "loss": 0.1885, "step": 30470 }, { "epoch": 0.941692605952004, "grad_norm": 0.10328023624849803, "learning_rate": 2.779235616995784e-07, "loss": 0.1791, "step": 30480 }, { "epoch": 0.9420015602190486, "grad_norm": 0.08821982637772095, "learning_rate": 2.7499471101221696e-07, "loss": 0.1813, "step": 30490 }, { "epoch": 0.9423105144860932, "grad_norm": 0.09749760321832934, "learning_rate": 2.7208123200794856e-07, "loss": 0.1833, "step": 30500 }, { "epoch": 0.9426194687531378, "grad_norm": 0.10999300650270014, "learning_rate": 2.6918312772819866e-07, "loss": 0.1868, "step": 30510 }, { "epoch": 0.9429284230201824, "grad_norm": 0.10523617642925231, "learning_rate": 2.663004011983505e-07, "loss": 0.1831, "step": 30520 }, { "epoch": 0.943237377287227, "grad_norm": 0.12299685114607545, "learning_rate": 2.6343305542772865e-07, "loss": 0.1765, "step": 30530 }, { "epoch": 0.9435463315542717, "grad_norm": 0.08514869164601965, "learning_rate": 2.60581093409607e-07, "loss": 0.1793, "step": 30540 }, { "epoch": 0.9438552858213163, "grad_norm": 0.09729866740408408, "learning_rate": 2.5774451812119406e-07, "loss": 0.1788, "step": 30550 }, { "epoch": 0.944164240088361, "grad_norm": 0.09401990369983891, "learning_rate": 2.549233325236411e-07, "loss": 0.1869, "step": 30560 }, { "epoch": 0.9444731943554056, "grad_norm": 0.10212319595435185, "learning_rate": 2.5211753956203064e-07, "loss": 0.1798, "step": 30570 }, { "epoch": 0.9447821486224501, "grad_norm": 0.11541113039392047, "learning_rate": 2.493271421653798e-07, "loss": 0.1806, "step": 30580 }, { "epoch": 0.9450911028894948, "grad_norm": 0.09586379427206575, "learning_rate": 2.4655214324662835e-07, "loss": 0.1831, "step": 30590 }, { "epoch": 0.9454000571565394, "grad_norm": 0.10640865997994073, "learning_rate": 2.437925457026491e-07, "loss": 0.178, "step": 30600 }, { "epoch": 0.945709011423584, "grad_norm": 0.09314700680012072, "learning_rate": 2.410483524142276e-07, "loss": 0.1824, "step": 30610 }, { "epoch": 0.9460179656906287, "grad_norm": 0.10211712835299988, "learning_rate": 2.3831956624607744e-07, "loss": 0.1766, "step": 30620 }, { "epoch": 0.9463269199576733, "grad_norm": 0.08845261052946496, "learning_rate": 2.3560619004682316e-07, "loss": 0.1874, "step": 30630 }, { "epoch": 0.9466358742247178, "grad_norm": 0.09570351981970587, "learning_rate": 2.3290822664900568e-07, "loss": 0.1794, "step": 30640 }, { "epoch": 0.9469448284917625, "grad_norm": 0.11036580762220033, "learning_rate": 2.3022567886907543e-07, "loss": 0.183, "step": 30650 }, { "epoch": 0.9472537827588071, "grad_norm": 0.09655892173293927, "learning_rate": 2.275585495073873e-07, "loss": 0.1857, "step": 30660 }, { "epoch": 0.9475627370258517, "grad_norm": 0.09270858918674574, "learning_rate": 2.249068413482025e-07, "loss": 0.1858, "step": 30670 }, { "epoch": 0.9478716912928964, "grad_norm": 0.0999655019877312, "learning_rate": 2.222705571596867e-07, "loss": 0.1865, "step": 30680 }, { "epoch": 0.948180645559941, "grad_norm": 0.10282030825716987, "learning_rate": 2.1964969969390014e-07, "loss": 0.1782, "step": 30690 }, { "epoch": 0.9484895998269857, "grad_norm": 0.11895223395832247, "learning_rate": 2.1704427168680097e-07, "loss": 0.183, "step": 30700 }, { "epoch": 0.9487985540940302, "grad_norm": 0.09845930480604034, "learning_rate": 2.1445427585824019e-07, "loss": 0.1802, "step": 30710 }, { "epoch": 0.9491075083610748, "grad_norm": 0.12006083456041604, "learning_rate": 2.1187971491195668e-07, "loss": 0.1838, "step": 30720 }, { "epoch": 0.9494164626281195, "grad_norm": 0.09735950439066168, "learning_rate": 2.093205915355806e-07, "loss": 0.1827, "step": 30730 }, { "epoch": 0.9497254168951641, "grad_norm": 0.09079479696483302, "learning_rate": 2.0677690840062158e-07, "loss": 0.1813, "step": 30740 }, { "epoch": 0.9500343711622087, "grad_norm": 0.1008015348084071, "learning_rate": 2.0424866816247723e-07, "loss": 0.1815, "step": 30750 }, { "epoch": 0.9503433254292534, "grad_norm": 0.1016307295892918, "learning_rate": 2.01735873460418e-07, "loss": 0.1791, "step": 30760 }, { "epoch": 0.950652279696298, "grad_norm": 0.10985734853998837, "learning_rate": 1.9923852691759392e-07, "loss": 0.1787, "step": 30770 }, { "epoch": 0.9509612339633425, "grad_norm": 0.09154176746505865, "learning_rate": 1.9675663114102628e-07, "loss": 0.1781, "step": 30780 }, { "epoch": 0.9512701882303872, "grad_norm": 0.08927489490479662, "learning_rate": 1.9429018872161087e-07, "loss": 0.18, "step": 30790 }, { "epoch": 0.9515791424974318, "grad_norm": 0.10352047287878095, "learning_rate": 1.9183920223410479e-07, "loss": 0.1819, "step": 30800 }, { "epoch": 0.9518880967644764, "grad_norm": 0.08637391387427489, "learning_rate": 1.89403674237138e-07, "loss": 0.1849, "step": 30810 }, { "epoch": 0.9521970510315211, "grad_norm": 0.09406334007056122, "learning_rate": 1.8698360727319675e-07, "loss": 0.181, "step": 30820 }, { "epoch": 0.9525060052985657, "grad_norm": 0.08298020608762798, "learning_rate": 1.845790038686318e-07, "loss": 0.1797, "step": 30830 }, { "epoch": 0.9528149595656104, "grad_norm": 0.10134935237751644, "learning_rate": 1.8218986653365022e-07, "loss": 0.1903, "step": 30840 }, { "epoch": 0.9531239138326549, "grad_norm": 0.1158332743556505, "learning_rate": 1.7981619776231194e-07, "loss": 0.1841, "step": 30850 }, { "epoch": 0.9534328680996995, "grad_norm": 0.09956372543851025, "learning_rate": 1.7745800003252988e-07, "loss": 0.1796, "step": 30860 }, { "epoch": 0.9537418223667442, "grad_norm": 0.10284171298465782, "learning_rate": 1.7511527580606478e-07, "loss": 0.1803, "step": 30870 }, { "epoch": 0.9540507766337888, "grad_norm": 0.11365079076638167, "learning_rate": 1.727880275285304e-07, "loss": 0.1846, "step": 30880 }, { "epoch": 0.9543597309008334, "grad_norm": 0.09891187498037164, "learning_rate": 1.7047625762938003e-07, "loss": 0.1895, "step": 30890 }, { "epoch": 0.9546686851678781, "grad_norm": 0.09859577223295982, "learning_rate": 1.6817996852190653e-07, "loss": 0.1818, "step": 30900 }, { "epoch": 0.9549776394349226, "grad_norm": 0.10919210441048655, "learning_rate": 1.6589916260324744e-07, "loss": 0.1826, "step": 30910 }, { "epoch": 0.9552865937019672, "grad_norm": 0.138930233256035, "learning_rate": 1.6363384225437316e-07, "loss": 0.1878, "step": 30920 }, { "epoch": 0.9555955479690119, "grad_norm": 0.10440889364143319, "learning_rate": 1.6138400984009206e-07, "loss": 0.18, "step": 30930 }, { "epoch": 0.9559045022360565, "grad_norm": 0.10886040349441196, "learning_rate": 1.5914966770904204e-07, "loss": 0.1834, "step": 30940 }, { "epoch": 0.9562134565031011, "grad_norm": 0.09283335419920878, "learning_rate": 1.5693081819368736e-07, "loss": 0.1841, "step": 30950 }, { "epoch": 0.9565224107701458, "grad_norm": 0.09958767730349954, "learning_rate": 1.5472746361032852e-07, "loss": 0.1821, "step": 30960 }, { "epoch": 0.9568313650371904, "grad_norm": 0.10424635733138844, "learning_rate": 1.5253960625908058e-07, "loss": 0.1844, "step": 30970 }, { "epoch": 0.957140319304235, "grad_norm": 0.09634602265514253, "learning_rate": 1.503672484238866e-07, "loss": 0.181, "step": 30980 }, { "epoch": 0.9574492735712796, "grad_norm": 0.09673533450203761, "learning_rate": 1.482103923725059e-07, "loss": 0.1819, "step": 30990 }, { "epoch": 0.9577582278383242, "grad_norm": 0.09281362169035717, "learning_rate": 1.460690403565207e-07, "loss": 0.1802, "step": 31000 }, { "epoch": 0.9580671821053689, "grad_norm": 0.09907673820780423, "learning_rate": 1.4394319461132122e-07, "loss": 0.1796, "step": 31010 }, { "epoch": 0.9583761363724135, "grad_norm": 0.10754740225687179, "learning_rate": 1.4183285735611728e-07, "loss": 0.1844, "step": 31020 }, { "epoch": 0.9586850906394581, "grad_norm": 0.10687190844780521, "learning_rate": 1.3973803079392166e-07, "loss": 0.1847, "step": 31030 }, { "epoch": 0.9589940449065028, "grad_norm": 0.09590553826888838, "learning_rate": 1.376587171115634e-07, "loss": 0.1874, "step": 31040 }, { "epoch": 0.9593029991735473, "grad_norm": 0.09899282104756171, "learning_rate": 1.3559491847966953e-07, "loss": 0.1792, "step": 31050 }, { "epoch": 0.9596119534405919, "grad_norm": 0.09017603490684323, "learning_rate": 1.3354663705267833e-07, "loss": 0.1795, "step": 31060 }, { "epoch": 0.9599209077076366, "grad_norm": 0.10258765398510361, "learning_rate": 1.3151387496882272e-07, "loss": 0.1843, "step": 31070 }, { "epoch": 0.9602298619746812, "grad_norm": 0.10480588957737341, "learning_rate": 1.2949663435014024e-07, "loss": 0.1836, "step": 31080 }, { "epoch": 0.9605388162417258, "grad_norm": 0.12923547949842573, "learning_rate": 1.2749491730246144e-07, "loss": 0.187, "step": 31090 }, { "epoch": 0.9608477705087705, "grad_norm": 0.1086299013748916, "learning_rate": 1.2550872591540973e-07, "loss": 0.1773, "step": 31100 }, { "epoch": 0.961156724775815, "grad_norm": 0.09940283741691684, "learning_rate": 1.2353806226240826e-07, "loss": 0.1829, "step": 31110 }, { "epoch": 0.9614656790428597, "grad_norm": 0.1296964227102436, "learning_rate": 1.2158292840066476e-07, "loss": 0.186, "step": 31120 }, { "epoch": 0.9617746333099043, "grad_norm": 0.10238375975253859, "learning_rate": 1.1964332637117659e-07, "loss": 0.1825, "step": 31130 }, { "epoch": 0.9620835875769489, "grad_norm": 0.09661343115266914, "learning_rate": 1.1771925819872575e-07, "loss": 0.1854, "step": 31140 }, { "epoch": 0.9623925418439936, "grad_norm": 0.1032271540916861, "learning_rate": 1.1581072589188224e-07, "loss": 0.1778, "step": 31150 }, { "epoch": 0.9627014961110382, "grad_norm": 0.10483871215715546, "learning_rate": 1.13917731442994e-07, "loss": 0.1858, "step": 31160 }, { "epoch": 0.9630104503780827, "grad_norm": 0.09993807670422895, "learning_rate": 1.1204027682819195e-07, "loss": 0.1789, "step": 31170 }, { "epoch": 0.9633194046451274, "grad_norm": 0.08679339716029881, "learning_rate": 1.1017836400737835e-07, "loss": 0.1752, "step": 31180 }, { "epoch": 0.963628358912172, "grad_norm": 0.08653556817925051, "learning_rate": 1.0833199492424006e-07, "loss": 0.1878, "step": 31190 }, { "epoch": 0.9639373131792166, "grad_norm": 0.10718704382019693, "learning_rate": 1.0650117150623195e-07, "loss": 0.183, "step": 31200 }, { "epoch": 0.9642462674462613, "grad_norm": 0.0829982620588724, "learning_rate": 1.046858956645802e-07, "loss": 0.1898, "step": 31210 }, { "epoch": 0.9645552217133059, "grad_norm": 0.0992572624909408, "learning_rate": 1.0288616929428396e-07, "loss": 0.1839, "step": 31220 }, { "epoch": 0.9648641759803505, "grad_norm": 0.09805582122223254, "learning_rate": 1.0110199427410704e-07, "loss": 0.1839, "step": 31230 }, { "epoch": 0.9651731302473952, "grad_norm": 0.10440858101052101, "learning_rate": 9.933337246658125e-08, "loss": 0.1811, "step": 31240 }, { "epoch": 0.9654820845144397, "grad_norm": 0.10158246278329792, "learning_rate": 9.758030571799969e-08, "loss": 0.1834, "step": 31250 }, { "epoch": 0.9657910387814844, "grad_norm": 0.09828361638263758, "learning_rate": 9.584279585841682e-08, "loss": 0.1834, "step": 31260 }, { "epoch": 0.966099993048529, "grad_norm": 0.10025441577548697, "learning_rate": 9.412084470165339e-08, "loss": 0.176, "step": 31270 }, { "epoch": 0.9664089473155736, "grad_norm": 0.09825165491218962, "learning_rate": 9.241445404527982e-08, "loss": 0.1831, "step": 31280 }, { "epoch": 0.9667179015826183, "grad_norm": 0.09542113090603005, "learning_rate": 9.072362567062787e-08, "loss": 0.1816, "step": 31290 }, { "epoch": 0.9670268558496629, "grad_norm": 0.08671810627123758, "learning_rate": 8.90483613427806e-08, "loss": 0.1855, "step": 31300 }, { "epoch": 0.9673358101167074, "grad_norm": 0.0981594118534967, "learning_rate": 8.738866281057578e-08, "loss": 0.1883, "step": 31310 }, { "epoch": 0.9676447643837521, "grad_norm": 0.12426012121630006, "learning_rate": 8.574453180660246e-08, "loss": 0.1869, "step": 31320 }, { "epoch": 0.9679537186507967, "grad_norm": 0.09324399655838103, "learning_rate": 8.411597004719274e-08, "loss": 0.1893, "step": 31330 }, { "epoch": 0.9682626729178413, "grad_norm": 0.10012993605343407, "learning_rate": 8.250297923243333e-08, "loss": 0.1822, "step": 31340 }, { "epoch": 0.968571627184886, "grad_norm": 0.1056940359745438, "learning_rate": 8.090556104615232e-08, "loss": 0.1937, "step": 31350 }, { "epoch": 0.9688805814519306, "grad_norm": 0.09725261456385074, "learning_rate": 7.932371715592246e-08, "loss": 0.1776, "step": 31360 }, { "epoch": 0.9691895357189751, "grad_norm": 0.10154256931540719, "learning_rate": 7.77574492130545e-08, "loss": 0.1848, "step": 31370 }, { "epoch": 0.9694984899860198, "grad_norm": 0.09149603023414711, "learning_rate": 7.620675885260387e-08, "loss": 0.1804, "step": 31380 }, { "epoch": 0.9698074442530644, "grad_norm": 0.1007167048314042, "learning_rate": 7.467164769336232e-08, "loss": 0.185, "step": 31390 }, { "epoch": 0.9701163985201091, "grad_norm": 0.09307577434497681, "learning_rate": 7.315211733785965e-08, "loss": 0.1778, "step": 31400 }, { "epoch": 0.9704253527871537, "grad_norm": 0.0955782683346041, "learning_rate": 7.164816937235697e-08, "loss": 0.1829, "step": 31410 }, { "epoch": 0.9707343070541983, "grad_norm": 0.10212685387837438, "learning_rate": 7.01598053668534e-08, "loss": 0.1826, "step": 31420 }, { "epoch": 0.971043261321243, "grad_norm": 0.09672828605485725, "learning_rate": 6.868702687507777e-08, "loss": 0.1789, "step": 31430 }, { "epoch": 0.9713522155882875, "grad_norm": 0.11117230987639781, "learning_rate": 6.722983543448524e-08, "loss": 0.1893, "step": 31440 }, { "epoch": 0.9716611698553321, "grad_norm": 0.10042907643952784, "learning_rate": 6.578823256626565e-08, "loss": 0.1796, "step": 31450 }, { "epoch": 0.9719701241223768, "grad_norm": 0.08935027307421677, "learning_rate": 6.436221977533352e-08, "loss": 0.1819, "step": 31460 }, { "epoch": 0.9722790783894214, "grad_norm": 0.08751376649380807, "learning_rate": 6.295179855032306e-08, "loss": 0.1828, "step": 31470 }, { "epoch": 0.972588032656466, "grad_norm": 0.11205014031190086, "learning_rate": 6.15569703636032e-08, "loss": 0.1816, "step": 31480 }, { "epoch": 0.9728969869235107, "grad_norm": 0.10445823831525844, "learning_rate": 6.017773667125415e-08, "loss": 0.1812, "step": 31490 }, { "epoch": 0.9732059411905553, "grad_norm": 0.09412638655296177, "learning_rate": 5.8814098913082536e-08, "loss": 0.1849, "step": 31500 }, { "epoch": 0.9735148954575998, "grad_norm": 0.09574035116215587, "learning_rate": 5.7466058512612996e-08, "loss": 0.1756, "step": 31510 }, { "epoch": 0.9738238497246445, "grad_norm": 0.08973661460284516, "learning_rate": 5.613361687708985e-08, "loss": 0.1849, "step": 31520 }, { "epoch": 0.9741328039916891, "grad_norm": 0.10257262602884924, "learning_rate": 5.48167753974671e-08, "loss": 0.1878, "step": 31530 }, { "epoch": 0.9744417582587338, "grad_norm": 0.10515722185838597, "learning_rate": 5.351553544842347e-08, "loss": 0.1805, "step": 31540 }, { "epoch": 0.9747507125257784, "grad_norm": 0.10687366098704164, "learning_rate": 5.222989838834236e-08, "loss": 0.1759, "step": 31550 }, { "epoch": 0.975059666792823, "grad_norm": 0.1285002417425426, "learning_rate": 5.0959865559320195e-08, "loss": 0.1873, "step": 31560 }, { "epoch": 0.9753686210598677, "grad_norm": 0.12217417870223621, "learning_rate": 4.970543828716978e-08, "loss": 0.1834, "step": 31570 }, { "epoch": 0.9756775753269122, "grad_norm": 0.11109395563424355, "learning_rate": 4.846661788140694e-08, "loss": 0.1845, "step": 31580 }, { "epoch": 0.9759865295939568, "grad_norm": 0.09556218542673506, "learning_rate": 4.7243405635257196e-08, "loss": 0.1804, "step": 31590 }, { "epoch": 0.9762954838610015, "grad_norm": 0.21101658083209426, "learning_rate": 4.603580282565412e-08, "loss": 0.1835, "step": 31600 }, { "epoch": 0.9766044381280461, "grad_norm": 0.12027403171534985, "learning_rate": 4.4843810713232625e-08, "loss": 0.1794, "step": 31610 }, { "epoch": 0.9769133923950907, "grad_norm": 0.10372268218321802, "learning_rate": 4.366743054233402e-08, "loss": 0.1888, "step": 31620 }, { "epoch": 0.9772223466621354, "grad_norm": 0.12108822656519833, "learning_rate": 4.25066635410043e-08, "loss": 0.1849, "step": 31630 }, { "epoch": 0.97753130092918, "grad_norm": 0.11970145616282128, "learning_rate": 4.136151092098417e-08, "loss": 0.1939, "step": 31640 }, { "epoch": 0.9778402551962245, "grad_norm": 0.091423837825491, "learning_rate": 4.023197387771904e-08, "loss": 0.179, "step": 31650 }, { "epoch": 0.9781492094632692, "grad_norm": 0.1073034529224064, "learning_rate": 3.9118053590352365e-08, "loss": 0.1806, "step": 31660 }, { "epoch": 0.9784581637303138, "grad_norm": 0.09482698429636137, "learning_rate": 3.8019751221725627e-08, "loss": 0.1876, "step": 31670 }, { "epoch": 0.9787671179973585, "grad_norm": 0.09545180835913966, "learning_rate": 3.693706791837337e-08, "loss": 0.1815, "step": 31680 }, { "epoch": 0.9790760722644031, "grad_norm": 0.10220994595932353, "learning_rate": 3.587000481052649e-08, "loss": 0.1845, "step": 31690 }, { "epoch": 0.9793850265314477, "grad_norm": 0.10371370734131442, "learning_rate": 3.481856301211228e-08, "loss": 0.1835, "step": 31700 }, { "epoch": 0.9796939807984923, "grad_norm": 0.09816401526168778, "learning_rate": 3.37827436207494e-08, "loss": 0.1816, "step": 31710 }, { "epoch": 0.9800029350655369, "grad_norm": 0.12582193678874096, "learning_rate": 3.2762547717742895e-08, "loss": 0.1875, "step": 31720 }, { "epoch": 0.9803118893325815, "grad_norm": 0.10248223971322745, "learning_rate": 3.175797636809752e-08, "loss": 0.185, "step": 31730 }, { "epoch": 0.9806208435996262, "grad_norm": 0.0991056835340578, "learning_rate": 3.0769030620499404e-08, "loss": 0.1801, "step": 31740 }, { "epoch": 0.9809297978666708, "grad_norm": 0.10545917331151045, "learning_rate": 2.9795711507327738e-08, "loss": 0.1821, "step": 31750 }, { "epoch": 0.9812387521337154, "grad_norm": 0.09048129469489209, "learning_rate": 2.8838020044644754e-08, "loss": 0.1802, "step": 31760 }, { "epoch": 0.9815477064007601, "grad_norm": 0.13834450805184254, "learning_rate": 2.789595723220406e-08, "loss": 0.1865, "step": 31770 }, { "epoch": 0.9818566606678046, "grad_norm": 0.09663800478828244, "learning_rate": 2.6969524053437333e-08, "loss": 0.1845, "step": 31780 }, { "epoch": 0.9821656149348492, "grad_norm": 0.09261691444251614, "learning_rate": 2.6058721475465953e-08, "loss": 0.1858, "step": 31790 }, { "epoch": 0.9824745692018939, "grad_norm": 0.09601610429596383, "learning_rate": 2.516355044909102e-08, "loss": 0.1784, "step": 31800 }, { "epoch": 0.9827835234689385, "grad_norm": 0.09133263106149017, "learning_rate": 2.4284011908796698e-08, "loss": 0.186, "step": 31810 }, { "epoch": 0.9830924777359832, "grad_norm": 0.08744538727191584, "learning_rate": 2.3420106772750194e-08, "loss": 0.184, "step": 31820 }, { "epoch": 0.9834014320030278, "grad_norm": 0.0956606415326059, "learning_rate": 2.257183594279677e-08, "loss": 0.1818, "step": 31830 }, { "epoch": 0.9837103862700723, "grad_norm": 0.10678364015992076, "learning_rate": 2.1739200304458083e-08, "loss": 0.1844, "step": 31840 }, { "epoch": 0.984019340537117, "grad_norm": 0.1046628681183441, "learning_rate": 2.092220072693718e-08, "loss": 0.1866, "step": 31850 }, { "epoch": 0.9843282948041616, "grad_norm": 0.08722371357166642, "learning_rate": 2.012083806311682e-08, "loss": 0.1809, "step": 31860 }, { "epoch": 0.9846372490712062, "grad_norm": 0.1120488600333755, "learning_rate": 1.93351131495495e-08, "loss": 0.1829, "step": 31870 }, { "epoch": 0.9849462033382509, "grad_norm": 0.10191755512640596, "learning_rate": 1.8565026806469098e-08, "loss": 0.1793, "step": 31880 }, { "epoch": 0.9852551576052955, "grad_norm": 0.11345238523008164, "learning_rate": 1.781057983777923e-08, "loss": 0.1786, "step": 31890 }, { "epoch": 0.98556411187234, "grad_norm": 0.09688113374479147, "learning_rate": 1.707177303106322e-08, "loss": 0.178, "step": 31900 }, { "epoch": 0.9858730661393847, "grad_norm": 0.12737817120201278, "learning_rate": 1.634860715757247e-08, "loss": 0.1908, "step": 31910 }, { "epoch": 0.9861820204064293, "grad_norm": 0.10680908672211273, "learning_rate": 1.5641082972231435e-08, "loss": 0.1816, "step": 31920 }, { "epoch": 0.9864909746734739, "grad_norm": 0.11016360323064711, "learning_rate": 1.4949201213637632e-08, "loss": 0.2067, "step": 31930 }, { "epoch": 0.9867999289405186, "grad_norm": 0.09699509535315748, "learning_rate": 1.4272962604058303e-08, "loss": 0.1767, "step": 31940 }, { "epoch": 0.9871088832075632, "grad_norm": 0.10111583516920525, "learning_rate": 1.3612367849428764e-08, "loss": 0.1832, "step": 31950 }, { "epoch": 0.9874178374746079, "grad_norm": 0.11807058180440379, "learning_rate": 1.2967417639357381e-08, "loss": 0.1878, "step": 31960 }, { "epoch": 0.9877267917416525, "grad_norm": 0.08964540220012428, "learning_rate": 1.233811264711726e-08, "loss": 0.1793, "step": 31970 }, { "epoch": 0.988035746008697, "grad_norm": 0.09161961602329624, "learning_rate": 1.1724453529651236e-08, "loss": 0.1818, "step": 31980 }, { "epoch": 0.9883447002757417, "grad_norm": 0.10667775010404798, "learning_rate": 1.1126440927568538e-08, "loss": 0.1861, "step": 31990 }, { "epoch": 0.9886536545427863, "grad_norm": 0.11851523497796096, "learning_rate": 1.0544075465143133e-08, "loss": 0.1827, "step": 32000 }, { "epoch": 0.9889626088098309, "grad_norm": 0.12087866982137467, "learning_rate": 9.977357750318716e-09, "loss": 0.1815, "step": 32010 }, { "epoch": 0.9892715630768756, "grad_norm": 0.08792807386076792, "learning_rate": 9.426288374698721e-09, "loss": 0.1846, "step": 32020 }, { "epoch": 0.9895805173439202, "grad_norm": 0.0977213762999282, "learning_rate": 8.890867913556312e-09, "loss": 0.1808, "step": 32030 }, { "epoch": 0.9898894716109647, "grad_norm": 0.11257705339231296, "learning_rate": 8.371096925824384e-09, "loss": 0.1848, "step": 32040 }, { "epoch": 0.9901984258780094, "grad_norm": 0.15921899585788993, "learning_rate": 7.866975954100574e-09, "loss": 0.1867, "step": 32050 }, { "epoch": 0.990507380145054, "grad_norm": 0.0977229273406542, "learning_rate": 7.37850552464725e-09, "loss": 0.1798, "step": 32060 }, { "epoch": 0.9908163344120986, "grad_norm": 0.08003348434572334, "learning_rate": 6.905686147384849e-09, "loss": 0.1818, "step": 32070 }, { "epoch": 0.9911252886791433, "grad_norm": 0.10666315233146619, "learning_rate": 6.4485183158985436e-09, "loss": 0.187, "step": 32080 }, { "epoch": 0.9914342429461879, "grad_norm": 0.08181195577030377, "learning_rate": 6.007002507434911e-09, "loss": 0.1825, "step": 32090 }, { "epoch": 0.9917431972132326, "grad_norm": 0.0841212971747019, "learning_rate": 5.581139182896933e-09, "loss": 0.1848, "step": 32100 }, { "epoch": 0.9920521514802771, "grad_norm": 0.0948863518499954, "learning_rate": 5.170928786852325e-09, "loss": 0.1835, "step": 32110 }, { "epoch": 0.9923611057473217, "grad_norm": 0.10020880135974115, "learning_rate": 4.776371747526875e-09, "loss": 0.1816, "step": 32120 }, { "epoch": 0.9926700600143664, "grad_norm": 0.08954383451493052, "learning_rate": 4.3974684768044445e-09, "loss": 0.1875, "step": 32130 }, { "epoch": 0.992979014281411, "grad_norm": 0.10052940338941503, "learning_rate": 4.034219370228631e-09, "loss": 0.1834, "step": 32140 }, { "epoch": 0.9932879685484556, "grad_norm": 0.08533222400980797, "learning_rate": 3.6866248070027697e-09, "loss": 0.179, "step": 32150 }, { "epoch": 0.9935969228155003, "grad_norm": 0.11467637551400281, "learning_rate": 3.3546851499849373e-09, "loss": 0.1856, "step": 32160 }, { "epoch": 0.9939058770825449, "grad_norm": 0.10168311919267087, "learning_rate": 3.038400745694614e-09, "loss": 0.1819, "step": 32170 }, { "epoch": 0.9942148313495894, "grad_norm": 0.09641899091001285, "learning_rate": 2.73777192430269e-09, "loss": 0.1872, "step": 32180 }, { "epoch": 0.9945237856166341, "grad_norm": 0.1073219913352132, "learning_rate": 2.452798999646455e-09, "loss": 0.1816, "step": 32190 }, { "epoch": 0.9948327398836787, "grad_norm": 0.09670467218894499, "learning_rate": 2.183482269207948e-09, "loss": 0.1804, "step": 32200 }, { "epoch": 0.9951416941507233, "grad_norm": 0.08275465208071395, "learning_rate": 1.9298220141356073e-09, "loss": 0.1765, "step": 32210 }, { "epoch": 0.995450648417768, "grad_norm": 0.09641862390453226, "learning_rate": 1.6918184992276153e-09, "loss": 0.1819, "step": 32220 }, { "epoch": 0.9957596026848126, "grad_norm": 0.10014425397261856, "learning_rate": 1.4694719729418937e-09, "loss": 0.1867, "step": 32230 }, { "epoch": 0.9960685569518573, "grad_norm": 0.10490746975916102, "learning_rate": 1.2627826673877741e-09, "loss": 0.1909, "step": 32240 }, { "epoch": 0.9963775112189018, "grad_norm": 0.09579448254766845, "learning_rate": 1.0717507983326603e-09, "loss": 0.1812, "step": 32250 }, { "epoch": 0.9966864654859464, "grad_norm": 0.08367775215507442, "learning_rate": 8.963765651970324e-10, "loss": 0.1838, "step": 32260 }, { "epoch": 0.9969954197529911, "grad_norm": 0.07440305022188369, "learning_rate": 7.366601510577775e-10, "loss": 0.179, "step": 32270 }, { "epoch": 0.9973043740200357, "grad_norm": 0.12597024720194774, "learning_rate": 5.926017226465242e-10, "loss": 0.1869, "step": 32280 }, { "epoch": 0.9976133282870803, "grad_norm": 0.09689654259661641, "learning_rate": 4.642014303463116e-10, "loss": 0.1792, "step": 32290 }, { "epoch": 0.997922282554125, "grad_norm": 0.10967883052574684, "learning_rate": 3.5145940819825137e-10, "loss": 0.1922, "step": 32300 }, { "epoch": 0.9982312368211695, "grad_norm": 0.10385219130875901, "learning_rate": 2.5437577389486597e-10, "loss": 0.1865, "step": 32310 }, { "epoch": 0.9985401910882141, "grad_norm": 0.10349690468541285, "learning_rate": 1.7295062878341928e-10, "loss": 0.1812, "step": 32320 }, { "epoch": 0.9988491453552588, "grad_norm": 0.10267401157150524, "learning_rate": 1.0718405786425134e-10, "loss": 0.1787, "step": 32330 }, { "epoch": 0.9991580996223034, "grad_norm": 0.10773225191306192, "learning_rate": 5.707612979244381e-11, "loss": 0.1795, "step": 32340 }, { "epoch": 0.999467053889348, "grad_norm": 0.09825659917396752, "learning_rate": 2.2626896877819826e-11, "loss": 0.1849, "step": 32350 }, { "epoch": 0.9997760081563927, "grad_norm": 0.09261055154357052, "learning_rate": 3.836395081613375e-12, "loss": 0.1789, "step": 32360 }, { "epoch": 0.9999922761433239, "step": 32367, "total_flos": 8484206288044032.0, "train_loss": 0.19035680645326206, "train_runtime": 188551.2044, "train_samples_per_second": 5.493, "train_steps_per_second": 0.172 } ], "logging_steps": 10, "max_steps": 32367, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8484206288044032.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }