diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,87533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9019084382553483, + "eval_steps": 500, + "global_step": 12500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.215267506042787e-05, + "grad_norm": 0.4064047336578369, + "learning_rate": 4e-05, + "loss": 0.4446, + "step": 1 + }, + { + "epoch": 0.00014430535012085574, + "grad_norm": 0.4091169834136963, + "learning_rate": 8e-05, + "loss": 0.4237, + "step": 2 + }, + { + "epoch": 0.0002164580251812836, + "grad_norm": 0.3088938593864441, + "learning_rate": 0.00012, + "loss": 0.4564, + "step": 3 + }, + { + "epoch": 0.0002886107002417115, + "grad_norm": 0.2945241630077362, + "learning_rate": 0.00016, + "loss": 0.405, + "step": 4 + }, + { + "epoch": 0.0003607633753021393, + "grad_norm": 0.311696320772171, + "learning_rate": 0.0002, + "loss": 0.3555, + "step": 5 + }, + { + "epoch": 0.0004329160503625672, + "grad_norm": 0.26907986402511597, + "learning_rate": 0.00019999711358060327, + "loss": 0.3276, + "step": 6 + }, + { + "epoch": 0.000505068725422995, + "grad_norm": 0.43013739585876465, + "learning_rate": 0.00019999422716120654, + "loss": 0.2577, + "step": 7 + }, + { + "epoch": 0.000577221400483423, + "grad_norm": 0.5789860486984253, + "learning_rate": 0.0001999913407418098, + "loss": 0.328, + "step": 8 + }, + { + "epoch": 0.0006493740755438508, + "grad_norm": 0.22699680924415588, + "learning_rate": 0.00019998845432241306, + "loss": 0.2613, + "step": 9 + }, + { + "epoch": 0.0007215267506042786, + "grad_norm": 0.21423082053661346, + "learning_rate": 0.00019998556790301632, + "loss": 0.2668, + "step": 10 + }, + { + "epoch": 0.0007936794256647065, + "grad_norm": 0.22188498079776764, + "learning_rate": 0.0001999826814836196, + "loss": 0.2194, + "step": 11 + }, + { + "epoch": 0.0008658321007251344, + "grad_norm": 0.15280145406723022, + "learning_rate": 0.00019997979506422285, + "loss": 0.2203, + "step": 12 + }, + { + "epoch": 0.0009379847757855622, + "grad_norm": 0.19237686693668365, + "learning_rate": 0.0001999769086448261, + "loss": 0.2326, + "step": 13 + }, + { + "epoch": 0.00101013745084599, + "grad_norm": 0.16257072985172272, + "learning_rate": 0.00019997402222542935, + "loss": 0.2396, + "step": 14 + }, + { + "epoch": 0.001082290125906418, + "grad_norm": 0.16879022121429443, + "learning_rate": 0.0001999711358060326, + "loss": 0.2542, + "step": 15 + }, + { + "epoch": 0.001154442800966846, + "grad_norm": 0.15356074273586273, + "learning_rate": 0.0001999682493866359, + "loss": 0.2433, + "step": 16 + }, + { + "epoch": 0.0012265954760272736, + "grad_norm": 0.1393197476863861, + "learning_rate": 0.00019996536296723916, + "loss": 0.214, + "step": 17 + }, + { + "epoch": 0.0012987481510877016, + "grad_norm": 0.14472635090351105, + "learning_rate": 0.00019996247654784243, + "loss": 0.2378, + "step": 18 + }, + { + "epoch": 0.0013709008261481295, + "grad_norm": 0.2018187940120697, + "learning_rate": 0.00019995959012844566, + "loss": 0.2555, + "step": 19 + }, + { + "epoch": 0.0014430535012085572, + "grad_norm": 0.1205526664853096, + "learning_rate": 0.00019995670370904893, + "loss": 0.2554, + "step": 20 + }, + { + "epoch": 0.0015152061762689851, + "grad_norm": 0.17153723537921906, + "learning_rate": 0.0001999538172896522, + "loss": 0.2014, + "step": 21 + }, + { + "epoch": 0.001587358851329413, + "grad_norm": 0.1561480462551117, + "learning_rate": 0.00019995093087025545, + "loss": 0.2274, + "step": 22 + }, + { + "epoch": 0.0016595115263898408, + "grad_norm": 0.13652458786964417, + "learning_rate": 0.00019994804445085874, + "loss": 0.2294, + "step": 23 + }, + { + "epoch": 0.0017316642014502687, + "grad_norm": 0.13224199414253235, + "learning_rate": 0.00019994515803146198, + "loss": 0.2178, + "step": 24 + }, + { + "epoch": 0.0018038168765106967, + "grad_norm": 0.13510029017925262, + "learning_rate": 0.00019994227161206524, + "loss": 0.2151, + "step": 25 + }, + { + "epoch": 0.0018759695515711244, + "grad_norm": 0.11560554802417755, + "learning_rate": 0.0001999393851926685, + "loss": 0.2607, + "step": 26 + }, + { + "epoch": 0.0019481222266315523, + "grad_norm": 0.13390889763832092, + "learning_rate": 0.00019993649877327177, + "loss": 0.2261, + "step": 27 + }, + { + "epoch": 0.00202027490169198, + "grad_norm": 0.12282073497772217, + "learning_rate": 0.00019993361235387503, + "loss": 0.2208, + "step": 28 + }, + { + "epoch": 0.002092427576752408, + "grad_norm": 0.12827306985855103, + "learning_rate": 0.0001999307259344783, + "loss": 0.2006, + "step": 29 + }, + { + "epoch": 0.002164580251812836, + "grad_norm": 0.12924723327159882, + "learning_rate": 0.00019992783951508156, + "loss": 0.1889, + "step": 30 + }, + { + "epoch": 0.002236732926873264, + "grad_norm": 0.13463133573532104, + "learning_rate": 0.00019992495309568482, + "loss": 0.2547, + "step": 31 + }, + { + "epoch": 0.002308885601933692, + "grad_norm": 0.20450986921787262, + "learning_rate": 0.00019992206667628808, + "loss": 0.1662, + "step": 32 + }, + { + "epoch": 0.0023810382769941197, + "grad_norm": 0.16089263558387756, + "learning_rate": 0.00019991918025689134, + "loss": 0.2459, + "step": 33 + }, + { + "epoch": 0.0024531909520545472, + "grad_norm": 0.2842683792114258, + "learning_rate": 0.0001999162938374946, + "loss": 0.2433, + "step": 34 + }, + { + "epoch": 0.002525343627114975, + "grad_norm": 0.13188135623931885, + "learning_rate": 0.00019991340741809784, + "loss": 0.2258, + "step": 35 + }, + { + "epoch": 0.002597496302175403, + "grad_norm": 0.11618123203516006, + "learning_rate": 0.0001999105209987011, + "loss": 0.2393, + "step": 36 + }, + { + "epoch": 0.002669648977235831, + "grad_norm": 0.12183308601379395, + "learning_rate": 0.0001999076345793044, + "loss": 0.2065, + "step": 37 + }, + { + "epoch": 0.002741801652296259, + "grad_norm": 0.11567720770835876, + "learning_rate": 0.00019990474815990766, + "loss": 0.2237, + "step": 38 + }, + { + "epoch": 0.002813954327356687, + "grad_norm": 0.12731248140335083, + "learning_rate": 0.00019990186174051092, + "loss": 0.1821, + "step": 39 + }, + { + "epoch": 0.0028861070024171144, + "grad_norm": 0.11343208700418472, + "learning_rate": 0.00019989897532111416, + "loss": 0.2308, + "step": 40 + }, + { + "epoch": 0.0029582596774775424, + "grad_norm": 0.09807998687028885, + "learning_rate": 0.00019989608890171742, + "loss": 0.2089, + "step": 41 + }, + { + "epoch": 0.0030304123525379703, + "grad_norm": 0.11223408579826355, + "learning_rate": 0.00019989320248232068, + "loss": 0.2324, + "step": 42 + }, + { + "epoch": 0.0031025650275983982, + "grad_norm": 0.10409168899059296, + "learning_rate": 0.00019989031606292395, + "loss": 0.209, + "step": 43 + }, + { + "epoch": 0.003174717702658826, + "grad_norm": 0.3627258837223053, + "learning_rate": 0.00019988742964352724, + "loss": 0.1686, + "step": 44 + }, + { + "epoch": 0.003246870377719254, + "grad_norm": 0.11073557287454605, + "learning_rate": 0.00019988454322413047, + "loss": 0.1728, + "step": 45 + }, + { + "epoch": 0.0033190230527796816, + "grad_norm": 0.1800556778907776, + "learning_rate": 0.00019988165680473374, + "loss": 0.2397, + "step": 46 + }, + { + "epoch": 0.0033911757278401095, + "grad_norm": 0.11776737868785858, + "learning_rate": 0.000199878770385337, + "loss": 0.2432, + "step": 47 + }, + { + "epoch": 0.0034633284029005375, + "grad_norm": 0.11778844147920609, + "learning_rate": 0.00019987588396594026, + "loss": 0.2224, + "step": 48 + }, + { + "epoch": 0.0035354810779609654, + "grad_norm": 0.14010971784591675, + "learning_rate": 0.00019987299754654352, + "loss": 0.2071, + "step": 49 + }, + { + "epoch": 0.0036076337530213934, + "grad_norm": 0.20106203854084015, + "learning_rate": 0.0001998701111271468, + "loss": 0.215, + "step": 50 + }, + { + "epoch": 0.0036797864280818213, + "grad_norm": 0.2706080377101898, + "learning_rate": 0.00019986722470775005, + "loss": 0.2544, + "step": 51 + }, + { + "epoch": 0.003751939103142249, + "grad_norm": 0.3544776141643524, + "learning_rate": 0.0001998643382883533, + "loss": 0.2248, + "step": 52 + }, + { + "epoch": 0.0038240917782026767, + "grad_norm": 0.25506851077079773, + "learning_rate": 0.00019986145186895658, + "loss": 0.2055, + "step": 53 + }, + { + "epoch": 0.0038962444532631047, + "grad_norm": 0.14315347373485565, + "learning_rate": 0.00019985856544955984, + "loss": 0.2289, + "step": 54 + }, + { + "epoch": 0.003968397128323533, + "grad_norm": 0.11961130797863007, + "learning_rate": 0.0001998556790301631, + "loss": 0.1826, + "step": 55 + }, + { + "epoch": 0.00404054980338396, + "grad_norm": 0.09153182804584503, + "learning_rate": 0.00019985279261076634, + "loss": 0.2162, + "step": 56 + }, + { + "epoch": 0.0041127024784443885, + "grad_norm": 0.1307227462530136, + "learning_rate": 0.0001998499061913696, + "loss": 0.2662, + "step": 57 + }, + { + "epoch": 0.004184855153504816, + "grad_norm": 0.10103806108236313, + "learning_rate": 0.0001998470197719729, + "loss": 0.1798, + "step": 58 + }, + { + "epoch": 0.004257007828565244, + "grad_norm": 0.14712268114089966, + "learning_rate": 0.00019984413335257615, + "loss": 0.2556, + "step": 59 + }, + { + "epoch": 0.004329160503625672, + "grad_norm": 0.08857639878988266, + "learning_rate": 0.00019984124693317942, + "loss": 0.1957, + "step": 60 + }, + { + "epoch": 0.0044013131786861, + "grad_norm": 0.12688061594963074, + "learning_rate": 0.00019983836051378265, + "loss": 0.2482, + "step": 61 + }, + { + "epoch": 0.004473465853746528, + "grad_norm": 0.09386658668518066, + "learning_rate": 0.00019983547409438591, + "loss": 0.1949, + "step": 62 + }, + { + "epoch": 0.004545618528806955, + "grad_norm": 0.10285080969333649, + "learning_rate": 0.00019983258767498918, + "loss": 0.186, + "step": 63 + }, + { + "epoch": 0.004617771203867384, + "grad_norm": 0.08779972791671753, + "learning_rate": 0.00019982970125559244, + "loss": 0.178, + "step": 64 + }, + { + "epoch": 0.004689923878927811, + "grad_norm": 0.15989039838314056, + "learning_rate": 0.00019982681483619573, + "loss": 0.2476, + "step": 65 + }, + { + "epoch": 0.0047620765539882395, + "grad_norm": 0.14378827810287476, + "learning_rate": 0.00019982392841679897, + "loss": 0.2609, + "step": 66 + }, + { + "epoch": 0.004834229229048667, + "grad_norm": 0.10464286059141159, + "learning_rate": 0.00019982104199740223, + "loss": 0.1926, + "step": 67 + }, + { + "epoch": 0.0049063819041090945, + "grad_norm": 0.11753690242767334, + "learning_rate": 0.0001998181555780055, + "loss": 0.1781, + "step": 68 + }, + { + "epoch": 0.004978534579169523, + "grad_norm": 0.14723753929138184, + "learning_rate": 0.00019981526915860876, + "loss": 0.2429, + "step": 69 + }, + { + "epoch": 0.00505068725422995, + "grad_norm": 0.14722511172294617, + "learning_rate": 0.00019981238273921202, + "loss": 0.2199, + "step": 70 + }, + { + "epoch": 0.005122839929290379, + "grad_norm": 0.11642223596572876, + "learning_rate": 0.00019980949631981528, + "loss": 0.1837, + "step": 71 + }, + { + "epoch": 0.005194992604350806, + "grad_norm": 0.10480733215808868, + "learning_rate": 0.00019980660990041854, + "loss": 0.204, + "step": 72 + }, + { + "epoch": 0.005267145279411235, + "grad_norm": 0.09915363788604736, + "learning_rate": 0.0001998037234810218, + "loss": 0.1918, + "step": 73 + }, + { + "epoch": 0.005339297954471662, + "grad_norm": 0.1103401780128479, + "learning_rate": 0.00019980083706162507, + "loss": 0.165, + "step": 74 + }, + { + "epoch": 0.00541145062953209, + "grad_norm": 0.11738649010658264, + "learning_rate": 0.00019979795064222833, + "loss": 0.2065, + "step": 75 + }, + { + "epoch": 0.005483603304592518, + "grad_norm": 0.11540112644433975, + "learning_rate": 0.0001997950642228316, + "loss": 0.2224, + "step": 76 + }, + { + "epoch": 0.0055557559796529455, + "grad_norm": 0.08912346512079239, + "learning_rate": 0.00019979217780343483, + "loss": 0.182, + "step": 77 + }, + { + "epoch": 0.005627908654713374, + "grad_norm": 0.11579877883195877, + "learning_rate": 0.0001997892913840381, + "loss": 0.2424, + "step": 78 + }, + { + "epoch": 0.005700061329773801, + "grad_norm": 0.09065508097410202, + "learning_rate": 0.00019978640496464138, + "loss": 0.2211, + "step": 79 + }, + { + "epoch": 0.005772214004834229, + "grad_norm": 0.13304202258586884, + "learning_rate": 0.00019978351854524465, + "loss": 0.2186, + "step": 80 + }, + { + "epoch": 0.005844366679894657, + "grad_norm": 0.11378730833530426, + "learning_rate": 0.0001997806321258479, + "loss": 0.1959, + "step": 81 + }, + { + "epoch": 0.005916519354955085, + "grad_norm": 0.10128432512283325, + "learning_rate": 0.00019977774570645115, + "loss": 0.2345, + "step": 82 + }, + { + "epoch": 0.005988672030015513, + "grad_norm": 0.1330646276473999, + "learning_rate": 0.0001997748592870544, + "loss": 0.2207, + "step": 83 + }, + { + "epoch": 0.006060824705075941, + "grad_norm": 0.11121273785829544, + "learning_rate": 0.00019977197286765767, + "loss": 0.2075, + "step": 84 + }, + { + "epoch": 0.006132977380136369, + "grad_norm": 0.09679929167032242, + "learning_rate": 0.00019976908644826093, + "loss": 0.219, + "step": 85 + }, + { + "epoch": 0.0062051300551967965, + "grad_norm": 0.1264830380678177, + "learning_rate": 0.00019976620002886422, + "loss": 0.2433, + "step": 86 + }, + { + "epoch": 0.006277282730257224, + "grad_norm": 0.0941806361079216, + "learning_rate": 0.00019976331360946746, + "loss": 0.2117, + "step": 87 + }, + { + "epoch": 0.006349435405317652, + "grad_norm": 0.09046623110771179, + "learning_rate": 0.00019976042719007072, + "loss": 0.1889, + "step": 88 + }, + { + "epoch": 0.00642158808037808, + "grad_norm": 0.12082833051681519, + "learning_rate": 0.000199757540770674, + "loss": 0.227, + "step": 89 + }, + { + "epoch": 0.006493740755438508, + "grad_norm": 0.12156786024570465, + "learning_rate": 0.00019975465435127725, + "loss": 0.2019, + "step": 90 + }, + { + "epoch": 0.006565893430498936, + "grad_norm": 0.08919201791286469, + "learning_rate": 0.0001997517679318805, + "loss": 0.1537, + "step": 91 + }, + { + "epoch": 0.006638046105559363, + "grad_norm": 0.10201513767242432, + "learning_rate": 0.00019974888151248378, + "loss": 0.1898, + "step": 92 + }, + { + "epoch": 0.006710198780619792, + "grad_norm": 0.12830446660518646, + "learning_rate": 0.00019974599509308704, + "loss": 0.2381, + "step": 93 + }, + { + "epoch": 0.006782351455680219, + "grad_norm": 0.11241954565048218, + "learning_rate": 0.0001997431086736903, + "loss": 0.2021, + "step": 94 + }, + { + "epoch": 0.0068545041307406475, + "grad_norm": 0.11420100927352905, + "learning_rate": 0.00019974022225429356, + "loss": 0.188, + "step": 95 + }, + { + "epoch": 0.006926656805801075, + "grad_norm": 0.12041347473859787, + "learning_rate": 0.00019973733583489683, + "loss": 0.2477, + "step": 96 + }, + { + "epoch": 0.006998809480861503, + "grad_norm": 0.10603255778551102, + "learning_rate": 0.0001997344494155001, + "loss": 0.1971, + "step": 97 + }, + { + "epoch": 0.007070962155921931, + "grad_norm": 0.1029348373413086, + "learning_rate": 0.00019973156299610333, + "loss": 0.2247, + "step": 98 + }, + { + "epoch": 0.007143114830982358, + "grad_norm": 0.1543446034193039, + "learning_rate": 0.0001997286765767066, + "loss": 0.2284, + "step": 99 + }, + { + "epoch": 0.007215267506042787, + "grad_norm": 0.10962533950805664, + "learning_rate": 0.00019972579015730988, + "loss": 0.1839, + "step": 100 + }, + { + "epoch": 0.007287420181103214, + "grad_norm": 0.15612627565860748, + "learning_rate": 0.00019972290373791314, + "loss": 0.1894, + "step": 101 + }, + { + "epoch": 0.007359572856163643, + "grad_norm": 0.11387870460748672, + "learning_rate": 0.0001997200173185164, + "loss": 0.1916, + "step": 102 + }, + { + "epoch": 0.00743172553122407, + "grad_norm": 0.09269845485687256, + "learning_rate": 0.00019971713089911964, + "loss": 0.162, + "step": 103 + }, + { + "epoch": 0.007503878206284498, + "grad_norm": 0.10952046513557434, + "learning_rate": 0.0001997142444797229, + "loss": 0.1859, + "step": 104 + }, + { + "epoch": 0.007576030881344926, + "grad_norm": 0.0959896668791771, + "learning_rate": 0.00019971135806032617, + "loss": 0.2286, + "step": 105 + }, + { + "epoch": 0.0076481835564053535, + "grad_norm": 0.09485447406768799, + "learning_rate": 0.00019970847164092943, + "loss": 0.2151, + "step": 106 + }, + { + "epoch": 0.007720336231465782, + "grad_norm": 0.11261118948459625, + "learning_rate": 0.0001997055852215327, + "loss": 0.2022, + "step": 107 + }, + { + "epoch": 0.007792488906526209, + "grad_norm": 0.126130148768425, + "learning_rate": 0.00019970269880213595, + "loss": 0.2423, + "step": 108 + }, + { + "epoch": 0.007864641581586637, + "grad_norm": 0.10524505376815796, + "learning_rate": 0.00019969981238273922, + "loss": 0.2011, + "step": 109 + }, + { + "epoch": 0.007936794256647065, + "grad_norm": 0.08516758680343628, + "learning_rate": 0.00019969692596334248, + "loss": 0.1857, + "step": 110 + }, + { + "epoch": 0.008008946931707494, + "grad_norm": 0.10273415595293045, + "learning_rate": 0.00019969403954394574, + "loss": 0.2111, + "step": 111 + }, + { + "epoch": 0.00808109960676792, + "grad_norm": 0.11832364648580551, + "learning_rate": 0.000199691153124549, + "loss": 0.2038, + "step": 112 + }, + { + "epoch": 0.008153252281828349, + "grad_norm": 0.10919308662414551, + "learning_rate": 0.00019968826670515227, + "loss": 0.2113, + "step": 113 + }, + { + "epoch": 0.008225404956888777, + "grad_norm": 0.12120673805475235, + "learning_rate": 0.0001996853802857555, + "loss": 0.1866, + "step": 114 + }, + { + "epoch": 0.008297557631949205, + "grad_norm": 0.1033109650015831, + "learning_rate": 0.0001996824938663588, + "loss": 0.2323, + "step": 115 + }, + { + "epoch": 0.008369710307009632, + "grad_norm": 0.12112661451101303, + "learning_rate": 0.00019967960744696206, + "loss": 0.2291, + "step": 116 + }, + { + "epoch": 0.00844186298207006, + "grad_norm": 0.07624496519565582, + "learning_rate": 0.00019967672102756532, + "loss": 0.1695, + "step": 117 + }, + { + "epoch": 0.008514015657130489, + "grad_norm": 0.07618191838264465, + "learning_rate": 0.00019967383460816858, + "loss": 0.1675, + "step": 118 + }, + { + "epoch": 0.008586168332190915, + "grad_norm": 0.10014130920171738, + "learning_rate": 0.00019967094818877185, + "loss": 0.2024, + "step": 119 + }, + { + "epoch": 0.008658321007251344, + "grad_norm": 0.12303517013788223, + "learning_rate": 0.00019966806176937508, + "loss": 0.1653, + "step": 120 + }, + { + "epoch": 0.008730473682311772, + "grad_norm": 0.10565467923879623, + "learning_rate": 0.00019966517534997835, + "loss": 0.1942, + "step": 121 + }, + { + "epoch": 0.0088026263573722, + "grad_norm": 0.10748612135648727, + "learning_rate": 0.00019966228893058164, + "loss": 0.1983, + "step": 122 + }, + { + "epoch": 0.008874779032432627, + "grad_norm": 0.09169552475214005, + "learning_rate": 0.0001996594025111849, + "loss": 0.1889, + "step": 123 + }, + { + "epoch": 0.008946931707493055, + "grad_norm": 0.09497978538274765, + "learning_rate": 0.00019965651609178816, + "loss": 0.1736, + "step": 124 + }, + { + "epoch": 0.009019084382553484, + "grad_norm": 0.12305834144353867, + "learning_rate": 0.0001996536296723914, + "loss": 0.2279, + "step": 125 + }, + { + "epoch": 0.00909123705761391, + "grad_norm": 0.10519791394472122, + "learning_rate": 0.00019965074325299466, + "loss": 0.1727, + "step": 126 + }, + { + "epoch": 0.009163389732674339, + "grad_norm": 0.0940847247838974, + "learning_rate": 0.00019964785683359792, + "loss": 0.2071, + "step": 127 + }, + { + "epoch": 0.009235542407734767, + "grad_norm": 0.09172669053077698, + "learning_rate": 0.00019964497041420119, + "loss": 0.1632, + "step": 128 + }, + { + "epoch": 0.009307695082795194, + "grad_norm": 0.11957385390996933, + "learning_rate": 0.00019964208399480448, + "loss": 0.2209, + "step": 129 + }, + { + "epoch": 0.009379847757855622, + "grad_norm": 0.08616233617067337, + "learning_rate": 0.0001996391975754077, + "loss": 0.1936, + "step": 130 + }, + { + "epoch": 0.00945200043291605, + "grad_norm": 0.09599475562572479, + "learning_rate": 0.00019963631115601098, + "loss": 0.2139, + "step": 131 + }, + { + "epoch": 0.009524153107976479, + "grad_norm": 0.11431720107793808, + "learning_rate": 0.00019963342473661424, + "loss": 0.1974, + "step": 132 + }, + { + "epoch": 0.009596305783036906, + "grad_norm": 0.12138434499502182, + "learning_rate": 0.0001996305383172175, + "loss": 0.1607, + "step": 133 + }, + { + "epoch": 0.009668458458097334, + "grad_norm": 0.0832832008600235, + "learning_rate": 0.00019962765189782076, + "loss": 0.1999, + "step": 134 + }, + { + "epoch": 0.009740611133157762, + "grad_norm": 0.12049597501754761, + "learning_rate": 0.00019962476547842403, + "loss": 0.2017, + "step": 135 + }, + { + "epoch": 0.009812763808218189, + "grad_norm": 0.1307975798845291, + "learning_rate": 0.0001996218790590273, + "loss": 0.2409, + "step": 136 + }, + { + "epoch": 0.009884916483278617, + "grad_norm": 0.09753284603357315, + "learning_rate": 0.00019961899263963055, + "loss": 0.1278, + "step": 137 + }, + { + "epoch": 0.009957069158339046, + "grad_norm": 0.1166425570845604, + "learning_rate": 0.00019961610622023382, + "loss": 0.2263, + "step": 138 + }, + { + "epoch": 0.010029221833399474, + "grad_norm": 0.11471468210220337, + "learning_rate": 0.00019961321980083708, + "loss": 0.2244, + "step": 139 + }, + { + "epoch": 0.0101013745084599, + "grad_norm": 0.11294344067573547, + "learning_rate": 0.00019961033338144034, + "loss": 0.2296, + "step": 140 + }, + { + "epoch": 0.010173527183520329, + "grad_norm": 0.09066087007522583, + "learning_rate": 0.00019960744696204358, + "loss": 0.1726, + "step": 141 + }, + { + "epoch": 0.010245679858580757, + "grad_norm": 0.11471463739871979, + "learning_rate": 0.00019960456054264684, + "loss": 0.2075, + "step": 142 + }, + { + "epoch": 0.010317832533641184, + "grad_norm": 0.11662106215953827, + "learning_rate": 0.00019960167412325013, + "loss": 0.2538, + "step": 143 + }, + { + "epoch": 0.010389985208701612, + "grad_norm": 0.12267880141735077, + "learning_rate": 0.0001995987877038534, + "loss": 0.2094, + "step": 144 + }, + { + "epoch": 0.01046213788376204, + "grad_norm": 0.1315954029560089, + "learning_rate": 0.00019959590128445666, + "loss": 0.2061, + "step": 145 + }, + { + "epoch": 0.01053429055882247, + "grad_norm": 0.09367269277572632, + "learning_rate": 0.0001995930148650599, + "loss": 0.1829, + "step": 146 + }, + { + "epoch": 0.010606443233882896, + "grad_norm": 0.10474827885627747, + "learning_rate": 0.00019959012844566315, + "loss": 0.1826, + "step": 147 + }, + { + "epoch": 0.010678595908943324, + "grad_norm": 0.08622976392507553, + "learning_rate": 0.00019958724202626642, + "loss": 0.1816, + "step": 148 + }, + { + "epoch": 0.010750748584003753, + "grad_norm": 0.08372960984706879, + "learning_rate": 0.00019958435560686968, + "loss": 0.2197, + "step": 149 + }, + { + "epoch": 0.01082290125906418, + "grad_norm": 0.09493155032396317, + "learning_rate": 0.00019958146918747297, + "loss": 0.2209, + "step": 150 + }, + { + "epoch": 0.010895053934124608, + "grad_norm": 0.10592415183782578, + "learning_rate": 0.0001995785827680762, + "loss": 0.2309, + "step": 151 + }, + { + "epoch": 0.010967206609185036, + "grad_norm": 0.10627980530261993, + "learning_rate": 0.00019957569634867947, + "loss": 0.1596, + "step": 152 + }, + { + "epoch": 0.011039359284245463, + "grad_norm": 0.0862220972776413, + "learning_rate": 0.00019957280992928273, + "loss": 0.165, + "step": 153 + }, + { + "epoch": 0.011111511959305891, + "grad_norm": 0.07782924920320511, + "learning_rate": 0.000199569923509886, + "loss": 0.1936, + "step": 154 + }, + { + "epoch": 0.01118366463436632, + "grad_norm": 0.08833504468202591, + "learning_rate": 0.00019956703709048926, + "loss": 0.1869, + "step": 155 + }, + { + "epoch": 0.011255817309426748, + "grad_norm": 0.09978055208921432, + "learning_rate": 0.00019956415067109252, + "loss": 0.1544, + "step": 156 + }, + { + "epoch": 0.011327969984487174, + "grad_norm": 0.11412329226732254, + "learning_rate": 0.00019956126425169578, + "loss": 0.2143, + "step": 157 + }, + { + "epoch": 0.011400122659547603, + "grad_norm": 0.10396774858236313, + "learning_rate": 0.00019955837783229905, + "loss": 0.1566, + "step": 158 + }, + { + "epoch": 0.011472275334608031, + "grad_norm": 0.11875411868095398, + "learning_rate": 0.0001995554914129023, + "loss": 0.2081, + "step": 159 + }, + { + "epoch": 0.011544428009668458, + "grad_norm": 0.12535615265369415, + "learning_rate": 0.00019955260499350557, + "loss": 0.2066, + "step": 160 + }, + { + "epoch": 0.011616580684728886, + "grad_norm": 0.12918449938297272, + "learning_rate": 0.00019954971857410884, + "loss": 0.206, + "step": 161 + }, + { + "epoch": 0.011688733359789314, + "grad_norm": 0.11256776750087738, + "learning_rate": 0.00019954683215471207, + "loss": 0.1751, + "step": 162 + }, + { + "epoch": 0.011760886034849743, + "grad_norm": 0.13259808719158173, + "learning_rate": 0.00019954394573531533, + "loss": 0.2242, + "step": 163 + }, + { + "epoch": 0.01183303870991017, + "grad_norm": 0.12500351667404175, + "learning_rate": 0.00019954105931591862, + "loss": 0.2372, + "step": 164 + }, + { + "epoch": 0.011905191384970598, + "grad_norm": 0.08265276998281479, + "learning_rate": 0.0001995381728965219, + "loss": 0.1907, + "step": 165 + }, + { + "epoch": 0.011977344060031026, + "grad_norm": 0.09001553803682327, + "learning_rate": 0.00019953528647712515, + "loss": 0.1423, + "step": 166 + }, + { + "epoch": 0.012049496735091453, + "grad_norm": 0.12104904651641846, + "learning_rate": 0.00019953240005772839, + "loss": 0.191, + "step": 167 + }, + { + "epoch": 0.012121649410151881, + "grad_norm": 0.09061753004789352, + "learning_rate": 0.00019952951363833165, + "loss": 0.2026, + "step": 168 + }, + { + "epoch": 0.01219380208521231, + "grad_norm": 0.0975954681634903, + "learning_rate": 0.0001995266272189349, + "loss": 0.2199, + "step": 169 + }, + { + "epoch": 0.012265954760272738, + "grad_norm": 0.07146725058555603, + "learning_rate": 0.00019952374079953817, + "loss": 0.1557, + "step": 170 + }, + { + "epoch": 0.012338107435333165, + "grad_norm": 0.11715718358755112, + "learning_rate": 0.00019952085438014146, + "loss": 0.2198, + "step": 171 + }, + { + "epoch": 0.012410260110393593, + "grad_norm": 0.12954525649547577, + "learning_rate": 0.0001995179679607447, + "loss": 0.2212, + "step": 172 + }, + { + "epoch": 0.012482412785454021, + "grad_norm": 0.10778049379587173, + "learning_rate": 0.00019951508154134796, + "loss": 0.1775, + "step": 173 + }, + { + "epoch": 0.012554565460514448, + "grad_norm": 0.10183624178171158, + "learning_rate": 0.00019951219512195123, + "loss": 0.2009, + "step": 174 + }, + { + "epoch": 0.012626718135574876, + "grad_norm": 0.10871998220682144, + "learning_rate": 0.0001995093087025545, + "loss": 0.2297, + "step": 175 + }, + { + "epoch": 0.012698870810635305, + "grad_norm": 0.0910383015871048, + "learning_rate": 0.00019950642228315775, + "loss": 0.1571, + "step": 176 + }, + { + "epoch": 0.012771023485695733, + "grad_norm": 0.1175563782453537, + "learning_rate": 0.00019950353586376102, + "loss": 0.1874, + "step": 177 + }, + { + "epoch": 0.01284317616075616, + "grad_norm": 0.11125342547893524, + "learning_rate": 0.00019950064944436428, + "loss": 0.1946, + "step": 178 + }, + { + "epoch": 0.012915328835816588, + "grad_norm": 0.08524177223443985, + "learning_rate": 0.00019949776302496754, + "loss": 0.1982, + "step": 179 + }, + { + "epoch": 0.012987481510877016, + "grad_norm": 0.10269410908222198, + "learning_rate": 0.0001994948766055708, + "loss": 0.1501, + "step": 180 + }, + { + "epoch": 0.013059634185937443, + "grad_norm": 0.09117331355810165, + "learning_rate": 0.00019949199018617407, + "loss": 0.1941, + "step": 181 + }, + { + "epoch": 0.013131786860997871, + "grad_norm": 0.12752005457878113, + "learning_rate": 0.00019948910376677733, + "loss": 0.1972, + "step": 182 + }, + { + "epoch": 0.0132039395360583, + "grad_norm": 0.09392760694026947, + "learning_rate": 0.00019948621734738057, + "loss": 0.1545, + "step": 183 + }, + { + "epoch": 0.013276092211118726, + "grad_norm": 0.0918208435177803, + "learning_rate": 0.00019948333092798383, + "loss": 0.1953, + "step": 184 + }, + { + "epoch": 0.013348244886179155, + "grad_norm": 0.138113334774971, + "learning_rate": 0.00019948044450858712, + "loss": 0.2497, + "step": 185 + }, + { + "epoch": 0.013420397561239583, + "grad_norm": 0.10034741461277008, + "learning_rate": 0.00019947755808919038, + "loss": 0.205, + "step": 186 + }, + { + "epoch": 0.013492550236300012, + "grad_norm": 0.14482171833515167, + "learning_rate": 0.00019947467166979364, + "loss": 0.217, + "step": 187 + }, + { + "epoch": 0.013564702911360438, + "grad_norm": 0.15158987045288086, + "learning_rate": 0.00019947178525039688, + "loss": 0.2686, + "step": 188 + }, + { + "epoch": 0.013636855586420867, + "grad_norm": 0.10592421144247055, + "learning_rate": 0.00019946889883100014, + "loss": 0.2172, + "step": 189 + }, + { + "epoch": 0.013709008261481295, + "grad_norm": 0.10008352994918823, + "learning_rate": 0.0001994660124116034, + "loss": 0.211, + "step": 190 + }, + { + "epoch": 0.013781160936541722, + "grad_norm": 0.09804289788007736, + "learning_rate": 0.00019946312599220667, + "loss": 0.1999, + "step": 191 + }, + { + "epoch": 0.01385331361160215, + "grad_norm": 0.12120037525892258, + "learning_rate": 0.00019946023957280996, + "loss": 0.2195, + "step": 192 + }, + { + "epoch": 0.013925466286662578, + "grad_norm": 0.08471149951219559, + "learning_rate": 0.0001994573531534132, + "loss": 0.1866, + "step": 193 + }, + { + "epoch": 0.013997618961723007, + "grad_norm": 0.11269561201334, + "learning_rate": 0.00019945446673401646, + "loss": 0.1874, + "step": 194 + }, + { + "epoch": 0.014069771636783433, + "grad_norm": 0.160725399851799, + "learning_rate": 0.00019945158031461972, + "loss": 0.2352, + "step": 195 + }, + { + "epoch": 0.014141924311843862, + "grad_norm": 0.09515868127346039, + "learning_rate": 0.00019944869389522298, + "loss": 0.1824, + "step": 196 + }, + { + "epoch": 0.01421407698690429, + "grad_norm": 0.13595260679721832, + "learning_rate": 0.00019944580747582625, + "loss": 0.2187, + "step": 197 + }, + { + "epoch": 0.014286229661964717, + "grad_norm": 0.11196540296077728, + "learning_rate": 0.0001994429210564295, + "loss": 0.2037, + "step": 198 + }, + { + "epoch": 0.014358382337025145, + "grad_norm": 0.10346249490976334, + "learning_rate": 0.00019944003463703277, + "loss": 0.1673, + "step": 199 + }, + { + "epoch": 0.014430535012085573, + "grad_norm": 0.09775584191083908, + "learning_rate": 0.00019943714821763604, + "loss": 0.173, + "step": 200 + }, + { + "epoch": 0.014502687687146002, + "grad_norm": 0.1201501190662384, + "learning_rate": 0.0001994342617982393, + "loss": 0.1857, + "step": 201 + }, + { + "epoch": 0.014574840362206428, + "grad_norm": 0.12281841039657593, + "learning_rate": 0.00019943137537884256, + "loss": 0.1431, + "step": 202 + }, + { + "epoch": 0.014646993037266857, + "grad_norm": 0.10916811227798462, + "learning_rate": 0.00019942848895944582, + "loss": 0.2277, + "step": 203 + }, + { + "epoch": 0.014719145712327285, + "grad_norm": 0.08134673535823822, + "learning_rate": 0.00019942560254004906, + "loss": 0.2047, + "step": 204 + }, + { + "epoch": 0.014791298387387712, + "grad_norm": 0.12304705381393433, + "learning_rate": 0.00019942271612065232, + "loss": 0.209, + "step": 205 + }, + { + "epoch": 0.01486345106244814, + "grad_norm": 0.13313840329647064, + "learning_rate": 0.0001994198297012556, + "loss": 0.2049, + "step": 206 + }, + { + "epoch": 0.014935603737508569, + "grad_norm": 0.08853999525308609, + "learning_rate": 0.00019941694328185888, + "loss": 0.1843, + "step": 207 + }, + { + "epoch": 0.015007756412568995, + "grad_norm": 0.09260358661413193, + "learning_rate": 0.00019941405686246214, + "loss": 0.2099, + "step": 208 + }, + { + "epoch": 0.015079909087629424, + "grad_norm": 0.08907170593738556, + "learning_rate": 0.00019941117044306537, + "loss": 0.1899, + "step": 209 + }, + { + "epoch": 0.015152061762689852, + "grad_norm": 0.09744348376989365, + "learning_rate": 0.00019940828402366864, + "loss": 0.2257, + "step": 210 + }, + { + "epoch": 0.01522421443775028, + "grad_norm": 0.08101709187030792, + "learning_rate": 0.0001994053976042719, + "loss": 0.1763, + "step": 211 + }, + { + "epoch": 0.015296367112810707, + "grad_norm": 0.11687085032463074, + "learning_rate": 0.00019940251118487516, + "loss": 0.173, + "step": 212 + }, + { + "epoch": 0.015368519787871135, + "grad_norm": 0.11114943027496338, + "learning_rate": 0.00019939962476547845, + "loss": 0.1958, + "step": 213 + }, + { + "epoch": 0.015440672462931564, + "grad_norm": 0.17458894848823547, + "learning_rate": 0.0001993967383460817, + "loss": 0.2033, + "step": 214 + }, + { + "epoch": 0.01551282513799199, + "grad_norm": 0.10832903534173965, + "learning_rate": 0.00019939385192668495, + "loss": 0.1901, + "step": 215 + }, + { + "epoch": 0.015584977813052419, + "grad_norm": 0.10273167490959167, + "learning_rate": 0.00019939096550728821, + "loss": 0.2046, + "step": 216 + }, + { + "epoch": 0.015657130488112847, + "grad_norm": 0.10712496936321259, + "learning_rate": 0.00019938807908789148, + "loss": 0.1952, + "step": 217 + }, + { + "epoch": 0.015729283163173274, + "grad_norm": 0.09058426320552826, + "learning_rate": 0.00019938519266849474, + "loss": 0.1985, + "step": 218 + }, + { + "epoch": 0.015801435838233704, + "grad_norm": 0.10236865282058716, + "learning_rate": 0.000199382306249098, + "loss": 0.2118, + "step": 219 + }, + { + "epoch": 0.01587358851329413, + "grad_norm": 0.10352004319429398, + "learning_rate": 0.00019937941982970127, + "loss": 0.2027, + "step": 220 + }, + { + "epoch": 0.015945741188354557, + "grad_norm": 0.0897122472524643, + "learning_rate": 0.00019937653341030453, + "loss": 0.1682, + "step": 221 + }, + { + "epoch": 0.016017893863414987, + "grad_norm": 0.1268249899148941, + "learning_rate": 0.0001993736469909078, + "loss": 0.2118, + "step": 222 + }, + { + "epoch": 0.016090046538475414, + "grad_norm": 0.10709933191537857, + "learning_rate": 0.00019937076057151106, + "loss": 0.2165, + "step": 223 + }, + { + "epoch": 0.01616219921353584, + "grad_norm": 0.11887793987989426, + "learning_rate": 0.00019936787415211432, + "loss": 0.1918, + "step": 224 + }, + { + "epoch": 0.01623435188859627, + "grad_norm": 0.13047105073928833, + "learning_rate": 0.00019936498773271755, + "loss": 0.1893, + "step": 225 + }, + { + "epoch": 0.016306504563656697, + "grad_norm": 0.1016681045293808, + "learning_rate": 0.00019936210131332082, + "loss": 0.1479, + "step": 226 + }, + { + "epoch": 0.016378657238717124, + "grad_norm": 0.10665053129196167, + "learning_rate": 0.0001993592148939241, + "loss": 0.2123, + "step": 227 + }, + { + "epoch": 0.016450809913777554, + "grad_norm": 0.1363358050584793, + "learning_rate": 0.00019935632847452737, + "loss": 0.2246, + "step": 228 + }, + { + "epoch": 0.01652296258883798, + "grad_norm": 0.10417909920215607, + "learning_rate": 0.00019935344205513063, + "loss": 0.1528, + "step": 229 + }, + { + "epoch": 0.01659511526389841, + "grad_norm": 0.09790351241827011, + "learning_rate": 0.00019935055563573387, + "loss": 0.2257, + "step": 230 + }, + { + "epoch": 0.016667267938958837, + "grad_norm": 0.09754263609647751, + "learning_rate": 0.00019934766921633713, + "loss": 0.1574, + "step": 231 + }, + { + "epoch": 0.016739420614019264, + "grad_norm": 0.16002969443798065, + "learning_rate": 0.0001993447827969404, + "loss": 0.2083, + "step": 232 + }, + { + "epoch": 0.016811573289079694, + "grad_norm": 0.09000971168279648, + "learning_rate": 0.00019934189637754366, + "loss": 0.1675, + "step": 233 + }, + { + "epoch": 0.01688372596414012, + "grad_norm": 0.11512437462806702, + "learning_rate": 0.00019933900995814695, + "loss": 0.193, + "step": 234 + }, + { + "epoch": 0.016955878639200547, + "grad_norm": 0.11979632079601288, + "learning_rate": 0.00019933612353875018, + "loss": 0.2054, + "step": 235 + }, + { + "epoch": 0.017028031314260977, + "grad_norm": 0.11952786147594452, + "learning_rate": 0.00019933323711935345, + "loss": 0.1668, + "step": 236 + }, + { + "epoch": 0.017100183989321404, + "grad_norm": 0.10959208011627197, + "learning_rate": 0.0001993303506999567, + "loss": 0.216, + "step": 237 + }, + { + "epoch": 0.01717233666438183, + "grad_norm": 0.12177053093910217, + "learning_rate": 0.00019932746428055997, + "loss": 0.2314, + "step": 238 + }, + { + "epoch": 0.01724448933944226, + "grad_norm": 0.10955671221017838, + "learning_rate": 0.00019932457786116324, + "loss": 0.2267, + "step": 239 + }, + { + "epoch": 0.017316642014502687, + "grad_norm": 0.11161771416664124, + "learning_rate": 0.0001993216914417665, + "loss": 0.2044, + "step": 240 + }, + { + "epoch": 0.017388794689563114, + "grad_norm": 0.10793675482273102, + "learning_rate": 0.00019931880502236976, + "loss": 0.1914, + "step": 241 + }, + { + "epoch": 0.017460947364623544, + "grad_norm": 0.17880329489707947, + "learning_rate": 0.00019931591860297302, + "loss": 0.2513, + "step": 242 + }, + { + "epoch": 0.01753310003968397, + "grad_norm": 0.08590879291296005, + "learning_rate": 0.0001993130321835763, + "loss": 0.1594, + "step": 243 + }, + { + "epoch": 0.0176052527147444, + "grad_norm": 0.09276142716407776, + "learning_rate": 0.00019931014576417955, + "loss": 0.1619, + "step": 244 + }, + { + "epoch": 0.017677405389804828, + "grad_norm": 0.11142954975366592, + "learning_rate": 0.0001993072593447828, + "loss": 0.2105, + "step": 245 + }, + { + "epoch": 0.017749558064865254, + "grad_norm": 0.0828106701374054, + "learning_rate": 0.00019930437292538608, + "loss": 0.1931, + "step": 246 + }, + { + "epoch": 0.017821710739925684, + "grad_norm": 0.13978850841522217, + "learning_rate": 0.0001993014865059893, + "loss": 0.2162, + "step": 247 + }, + { + "epoch": 0.01789386341498611, + "grad_norm": 0.10149714350700378, + "learning_rate": 0.0001992986000865926, + "loss": 0.1972, + "step": 248 + }, + { + "epoch": 0.017966016090046538, + "grad_norm": 0.0819367840886116, + "learning_rate": 0.00019929571366719586, + "loss": 0.2244, + "step": 249 + }, + { + "epoch": 0.018038168765106968, + "grad_norm": 0.12735427916049957, + "learning_rate": 0.00019929282724779913, + "loss": 0.2066, + "step": 250 + }, + { + "epoch": 0.018110321440167394, + "grad_norm": 0.1065041646361351, + "learning_rate": 0.0001992899408284024, + "loss": 0.1747, + "step": 251 + }, + { + "epoch": 0.01818247411522782, + "grad_norm": 0.10140767693519592, + "learning_rate": 0.00019928705440900563, + "loss": 0.2042, + "step": 252 + }, + { + "epoch": 0.01825462679028825, + "grad_norm": 0.11508601158857346, + "learning_rate": 0.0001992841679896089, + "loss": 0.1923, + "step": 253 + }, + { + "epoch": 0.018326779465348678, + "grad_norm": 0.10136186331510544, + "learning_rate": 0.00019928128157021215, + "loss": 0.1708, + "step": 254 + }, + { + "epoch": 0.018398932140409104, + "grad_norm": 0.09931738674640656, + "learning_rate": 0.00019927839515081544, + "loss": 0.1829, + "step": 255 + }, + { + "epoch": 0.018471084815469534, + "grad_norm": 0.1116984635591507, + "learning_rate": 0.0001992755087314187, + "loss": 0.2105, + "step": 256 + }, + { + "epoch": 0.01854323749052996, + "grad_norm": 0.08264172822237015, + "learning_rate": 0.00019927262231202194, + "loss": 0.2176, + "step": 257 + }, + { + "epoch": 0.018615390165590388, + "grad_norm": 0.09286441653966904, + "learning_rate": 0.0001992697358926252, + "loss": 0.1286, + "step": 258 + }, + { + "epoch": 0.018687542840650818, + "grad_norm": 0.08997014909982681, + "learning_rate": 0.00019926684947322847, + "loss": 0.1848, + "step": 259 + }, + { + "epoch": 0.018759695515711244, + "grad_norm": 0.11541948467493057, + "learning_rate": 0.00019926396305383173, + "loss": 0.1909, + "step": 260 + }, + { + "epoch": 0.018831848190771674, + "grad_norm": 0.09805244952440262, + "learning_rate": 0.000199261076634435, + "loss": 0.2093, + "step": 261 + }, + { + "epoch": 0.0189040008658321, + "grad_norm": 0.09658953547477722, + "learning_rate": 0.00019925819021503826, + "loss": 0.1757, + "step": 262 + }, + { + "epoch": 0.018976153540892528, + "grad_norm": 0.09282110631465912, + "learning_rate": 0.00019925530379564152, + "loss": 0.1895, + "step": 263 + }, + { + "epoch": 0.019048306215952958, + "grad_norm": 0.11182378232479095, + "learning_rate": 0.00019925241737624478, + "loss": 0.1958, + "step": 264 + }, + { + "epoch": 0.019120458891013385, + "grad_norm": 0.09475011378526688, + "learning_rate": 0.00019924953095684804, + "loss": 0.1566, + "step": 265 + }, + { + "epoch": 0.01919261156607381, + "grad_norm": 0.09486480802297592, + "learning_rate": 0.0001992466445374513, + "loss": 0.1876, + "step": 266 + }, + { + "epoch": 0.01926476424113424, + "grad_norm": 0.11596336960792542, + "learning_rate": 0.00019924375811805457, + "loss": 0.2141, + "step": 267 + }, + { + "epoch": 0.019336916916194668, + "grad_norm": 0.10102172940969467, + "learning_rate": 0.0001992408716986578, + "loss": 0.2738, + "step": 268 + }, + { + "epoch": 0.019409069591255095, + "grad_norm": 0.10024615377187729, + "learning_rate": 0.0001992379852792611, + "loss": 0.1769, + "step": 269 + }, + { + "epoch": 0.019481222266315525, + "grad_norm": 0.1163974329829216, + "learning_rate": 0.00019923509885986436, + "loss": 0.248, + "step": 270 + }, + { + "epoch": 0.01955337494137595, + "grad_norm": 0.07875881344079971, + "learning_rate": 0.00019923221244046762, + "loss": 0.2059, + "step": 271 + }, + { + "epoch": 0.019625527616436378, + "grad_norm": 0.11664444208145142, + "learning_rate": 0.00019922932602107088, + "loss": 0.1966, + "step": 272 + }, + { + "epoch": 0.019697680291496808, + "grad_norm": 0.11768995225429535, + "learning_rate": 0.00019922643960167412, + "loss": 0.1731, + "step": 273 + }, + { + "epoch": 0.019769832966557235, + "grad_norm": 0.13091285526752472, + "learning_rate": 0.00019922355318227738, + "loss": 0.1955, + "step": 274 + }, + { + "epoch": 0.01984198564161766, + "grad_norm": 0.08766860514879227, + "learning_rate": 0.00019922066676288065, + "loss": 0.159, + "step": 275 + }, + { + "epoch": 0.01991413831667809, + "grad_norm": 0.08569549769163132, + "learning_rate": 0.00019921778034348394, + "loss": 0.1847, + "step": 276 + }, + { + "epoch": 0.019986290991738518, + "grad_norm": 0.08787121623754501, + "learning_rate": 0.0001992148939240872, + "loss": 0.1943, + "step": 277 + }, + { + "epoch": 0.020058443666798948, + "grad_norm": 0.10062495619058609, + "learning_rate": 0.00019921200750469043, + "loss": 0.2028, + "step": 278 + }, + { + "epoch": 0.020130596341859375, + "grad_norm": 0.11618790775537491, + "learning_rate": 0.0001992091210852937, + "loss": 0.1724, + "step": 279 + }, + { + "epoch": 0.0202027490169198, + "grad_norm": 0.08776410669088364, + "learning_rate": 0.00019920623466589696, + "loss": 0.2305, + "step": 280 + }, + { + "epoch": 0.02027490169198023, + "grad_norm": 0.10468615591526031, + "learning_rate": 0.00019920334824650022, + "loss": 0.171, + "step": 281 + }, + { + "epoch": 0.020347054367040658, + "grad_norm": 0.08151569217443466, + "learning_rate": 0.0001992004618271035, + "loss": 0.1512, + "step": 282 + }, + { + "epoch": 0.020419207042101085, + "grad_norm": 0.09557295590639114, + "learning_rate": 0.00019919757540770675, + "loss": 0.1675, + "step": 283 + }, + { + "epoch": 0.020491359717161515, + "grad_norm": 0.13301190733909607, + "learning_rate": 0.00019919468898831, + "loss": 0.1818, + "step": 284 + }, + { + "epoch": 0.02056351239222194, + "grad_norm": 0.10880567878484726, + "learning_rate": 0.00019919180256891328, + "loss": 0.2223, + "step": 285 + }, + { + "epoch": 0.020635665067282368, + "grad_norm": 0.13995389640331268, + "learning_rate": 0.00019918891614951654, + "loss": 0.1912, + "step": 286 + }, + { + "epoch": 0.020707817742342798, + "grad_norm": 0.11351920664310455, + "learning_rate": 0.0001991860297301198, + "loss": 0.1991, + "step": 287 + }, + { + "epoch": 0.020779970417403225, + "grad_norm": 0.1257031112909317, + "learning_rate": 0.00019918314331072306, + "loss": 0.1956, + "step": 288 + }, + { + "epoch": 0.02085212309246365, + "grad_norm": 0.12956175208091736, + "learning_rate": 0.0001991802568913263, + "loss": 0.2028, + "step": 289 + }, + { + "epoch": 0.02092427576752408, + "grad_norm": 0.11403210461139679, + "learning_rate": 0.0001991773704719296, + "loss": 0.2217, + "step": 290 + }, + { + "epoch": 0.020996428442584508, + "grad_norm": 0.08624578267335892, + "learning_rate": 0.00019917448405253285, + "loss": 0.2113, + "step": 291 + }, + { + "epoch": 0.02106858111764494, + "grad_norm": 0.07737656682729721, + "learning_rate": 0.00019917159763313612, + "loss": 0.1596, + "step": 292 + }, + { + "epoch": 0.021140733792705365, + "grad_norm": 0.107571080327034, + "learning_rate": 0.00019916871121373938, + "loss": 0.1912, + "step": 293 + }, + { + "epoch": 0.02121288646776579, + "grad_norm": 0.10967458039522171, + "learning_rate": 0.00019916582479434261, + "loss": 0.1997, + "step": 294 + }, + { + "epoch": 0.02128503914282622, + "grad_norm": 0.09726656973361969, + "learning_rate": 0.00019916293837494588, + "loss": 0.2109, + "step": 295 + }, + { + "epoch": 0.02135719181788665, + "grad_norm": 0.13646143674850464, + "learning_rate": 0.00019916005195554914, + "loss": 0.1724, + "step": 296 + }, + { + "epoch": 0.021429344492947075, + "grad_norm": 0.15287163853645325, + "learning_rate": 0.00019915716553615243, + "loss": 0.2461, + "step": 297 + }, + { + "epoch": 0.021501497168007505, + "grad_norm": 0.11280661821365356, + "learning_rate": 0.0001991542791167557, + "loss": 0.2093, + "step": 298 + }, + { + "epoch": 0.021573649843067932, + "grad_norm": 0.12336954474449158, + "learning_rate": 0.00019915139269735893, + "loss": 0.1926, + "step": 299 + }, + { + "epoch": 0.02164580251812836, + "grad_norm": 0.10354389250278473, + "learning_rate": 0.0001991485062779622, + "loss": 0.1568, + "step": 300 + }, + { + "epoch": 0.02171795519318879, + "grad_norm": 0.07334471493959427, + "learning_rate": 0.00019914561985856545, + "loss": 0.2174, + "step": 301 + }, + { + "epoch": 0.021790107868249215, + "grad_norm": 0.11739696562290192, + "learning_rate": 0.00019914273343916872, + "loss": 0.198, + "step": 302 + }, + { + "epoch": 0.021862260543309642, + "grad_norm": 0.08452331274747849, + "learning_rate": 0.00019913984701977198, + "loss": 0.2173, + "step": 303 + }, + { + "epoch": 0.021934413218370072, + "grad_norm": 0.09673911333084106, + "learning_rate": 0.00019913696060037524, + "loss": 0.1304, + "step": 304 + }, + { + "epoch": 0.0220065658934305, + "grad_norm": 0.08587480336427689, + "learning_rate": 0.0001991340741809785, + "loss": 0.1477, + "step": 305 + }, + { + "epoch": 0.022078718568490925, + "grad_norm": 0.0859738439321518, + "learning_rate": 0.00019913118776158177, + "loss": 0.1676, + "step": 306 + }, + { + "epoch": 0.022150871243551355, + "grad_norm": 0.11247943341732025, + "learning_rate": 0.00019912830134218503, + "loss": 0.1838, + "step": 307 + }, + { + "epoch": 0.022223023918611782, + "grad_norm": 0.09340526908636093, + "learning_rate": 0.0001991254149227883, + "loss": 0.1447, + "step": 308 + }, + { + "epoch": 0.022295176593672212, + "grad_norm": 0.099124975502491, + "learning_rate": 0.00019912252850339156, + "loss": 0.1725, + "step": 309 + }, + { + "epoch": 0.02236732926873264, + "grad_norm": 0.11203792691230774, + "learning_rate": 0.0001991196420839948, + "loss": 0.1841, + "step": 310 + }, + { + "epoch": 0.022439481943793065, + "grad_norm": 0.11389707773923874, + "learning_rate": 0.00019911675566459806, + "loss": 0.1977, + "step": 311 + }, + { + "epoch": 0.022511634618853495, + "grad_norm": 0.09892468899488449, + "learning_rate": 0.00019911386924520135, + "loss": 0.1473, + "step": 312 + }, + { + "epoch": 0.022583787293913922, + "grad_norm": 0.09083772450685501, + "learning_rate": 0.0001991109828258046, + "loss": 0.2101, + "step": 313 + }, + { + "epoch": 0.02265593996897435, + "grad_norm": 0.09250988066196442, + "learning_rate": 0.00019910809640640787, + "loss": 0.1889, + "step": 314 + }, + { + "epoch": 0.02272809264403478, + "grad_norm": 0.08847782015800476, + "learning_rate": 0.0001991052099870111, + "loss": 0.2198, + "step": 315 + }, + { + "epoch": 0.022800245319095205, + "grad_norm": 0.08334208279848099, + "learning_rate": 0.00019910232356761437, + "loss": 0.1677, + "step": 316 + }, + { + "epoch": 0.022872397994155632, + "grad_norm": 0.08387161046266556, + "learning_rate": 0.00019909943714821763, + "loss": 0.1982, + "step": 317 + }, + { + "epoch": 0.022944550669216062, + "grad_norm": 0.10020548105239868, + "learning_rate": 0.0001990965507288209, + "loss": 0.2098, + "step": 318 + }, + { + "epoch": 0.02301670334427649, + "grad_norm": 0.09410782158374786, + "learning_rate": 0.0001990936643094242, + "loss": 0.1557, + "step": 319 + }, + { + "epoch": 0.023088856019336915, + "grad_norm": 0.10243143886327744, + "learning_rate": 0.00019909077789002742, + "loss": 0.1781, + "step": 320 + }, + { + "epoch": 0.023161008694397345, + "grad_norm": 0.09203314036130905, + "learning_rate": 0.00019908789147063069, + "loss": 0.1438, + "step": 321 + }, + { + "epoch": 0.023233161369457772, + "grad_norm": 0.10731811821460724, + "learning_rate": 0.00019908500505123395, + "loss": 0.1463, + "step": 322 + }, + { + "epoch": 0.023305314044518202, + "grad_norm": 0.13198642432689667, + "learning_rate": 0.0001990821186318372, + "loss": 0.2845, + "step": 323 + }, + { + "epoch": 0.02337746671957863, + "grad_norm": 0.10663548856973648, + "learning_rate": 0.00019907923221244048, + "loss": 0.1802, + "step": 324 + }, + { + "epoch": 0.023449619394639055, + "grad_norm": 0.07761314511299133, + "learning_rate": 0.00019907634579304374, + "loss": 0.1921, + "step": 325 + }, + { + "epoch": 0.023521772069699486, + "grad_norm": 0.10983889549970627, + "learning_rate": 0.000199073459373647, + "loss": 0.1734, + "step": 326 + }, + { + "epoch": 0.023593924744759912, + "grad_norm": 0.10811863094568253, + "learning_rate": 0.00019907057295425026, + "loss": 0.1821, + "step": 327 + }, + { + "epoch": 0.02366607741982034, + "grad_norm": 0.09619367122650146, + "learning_rate": 0.00019906768653485353, + "loss": 0.1961, + "step": 328 + }, + { + "epoch": 0.02373823009488077, + "grad_norm": 0.1124391108751297, + "learning_rate": 0.0001990648001154568, + "loss": 0.1885, + "step": 329 + }, + { + "epoch": 0.023810382769941196, + "grad_norm": 0.12001251429319382, + "learning_rate": 0.00019906191369606005, + "loss": 0.2287, + "step": 330 + }, + { + "epoch": 0.023882535445001622, + "grad_norm": 0.08693698793649673, + "learning_rate": 0.0001990590272766633, + "loss": 0.229, + "step": 331 + }, + { + "epoch": 0.023954688120062052, + "grad_norm": 0.11607387661933899, + "learning_rate": 0.00019905614085726655, + "loss": 0.1936, + "step": 332 + }, + { + "epoch": 0.02402684079512248, + "grad_norm": 0.09263213723897934, + "learning_rate": 0.00019905325443786984, + "loss": 0.19, + "step": 333 + }, + { + "epoch": 0.024098993470182906, + "grad_norm": 0.09929122775793076, + "learning_rate": 0.0001990503680184731, + "loss": 0.1874, + "step": 334 + }, + { + "epoch": 0.024171146145243336, + "grad_norm": 0.09905881434679031, + "learning_rate": 0.00019904748159907637, + "loss": 0.1787, + "step": 335 + }, + { + "epoch": 0.024243298820303762, + "grad_norm": 0.11537329107522964, + "learning_rate": 0.0001990445951796796, + "loss": 0.2046, + "step": 336 + }, + { + "epoch": 0.02431545149536419, + "grad_norm": 0.1090816855430603, + "learning_rate": 0.00019904170876028287, + "loss": 0.1524, + "step": 337 + }, + { + "epoch": 0.02438760417042462, + "grad_norm": 0.09531082957983017, + "learning_rate": 0.00019903882234088613, + "loss": 0.1949, + "step": 338 + }, + { + "epoch": 0.024459756845485046, + "grad_norm": 0.11652404814958572, + "learning_rate": 0.0001990359359214894, + "loss": 0.1892, + "step": 339 + }, + { + "epoch": 0.024531909520545476, + "grad_norm": 0.08294610679149628, + "learning_rate": 0.00019903304950209268, + "loss": 0.1688, + "step": 340 + }, + { + "epoch": 0.024604062195605902, + "grad_norm": 0.07933833450078964, + "learning_rate": 0.00019903016308269592, + "loss": 0.1871, + "step": 341 + }, + { + "epoch": 0.02467621487066633, + "grad_norm": 0.09110282361507416, + "learning_rate": 0.00019902727666329918, + "loss": 0.2057, + "step": 342 + }, + { + "epoch": 0.02474836754572676, + "grad_norm": 0.0912768542766571, + "learning_rate": 0.00019902439024390244, + "loss": 0.1817, + "step": 343 + }, + { + "epoch": 0.024820520220787186, + "grad_norm": 0.12034342437982559, + "learning_rate": 0.0001990215038245057, + "loss": 0.2095, + "step": 344 + }, + { + "epoch": 0.024892672895847612, + "grad_norm": 0.08294276893138885, + "learning_rate": 0.00019901861740510897, + "loss": 0.2083, + "step": 345 + }, + { + "epoch": 0.024964825570908043, + "grad_norm": 0.08428749442100525, + "learning_rate": 0.00019901573098571223, + "loss": 0.1915, + "step": 346 + }, + { + "epoch": 0.02503697824596847, + "grad_norm": 0.09327653795480728, + "learning_rate": 0.0001990128445663155, + "loss": 0.1996, + "step": 347 + }, + { + "epoch": 0.025109130921028896, + "grad_norm": 0.1007128432393074, + "learning_rate": 0.00019900995814691876, + "loss": 0.2161, + "step": 348 + }, + { + "epoch": 0.025181283596089326, + "grad_norm": 0.07850717753171921, + "learning_rate": 0.00019900707172752202, + "loss": 0.1771, + "step": 349 + }, + { + "epoch": 0.025253436271149753, + "grad_norm": 0.09387306123971939, + "learning_rate": 0.00019900418530812528, + "loss": 0.1898, + "step": 350 + }, + { + "epoch": 0.02532558894621018, + "grad_norm": 0.11617643386125565, + "learning_rate": 0.00019900129888872855, + "loss": 0.2539, + "step": 351 + }, + { + "epoch": 0.02539774162127061, + "grad_norm": 0.08300226926803589, + "learning_rate": 0.0001989984124693318, + "loss": 0.1891, + "step": 352 + }, + { + "epoch": 0.025469894296331036, + "grad_norm": 0.09945572912693024, + "learning_rate": 0.00019899552604993505, + "loss": 0.1913, + "step": 353 + }, + { + "epoch": 0.025542046971391466, + "grad_norm": 0.09289573132991791, + "learning_rate": 0.00019899263963053834, + "loss": 0.1854, + "step": 354 + }, + { + "epoch": 0.025614199646451893, + "grad_norm": 0.08331576734781265, + "learning_rate": 0.0001989897532111416, + "loss": 0.126, + "step": 355 + }, + { + "epoch": 0.02568635232151232, + "grad_norm": 0.1058429703116417, + "learning_rate": 0.00019898686679174486, + "loss": 0.205, + "step": 356 + }, + { + "epoch": 0.02575850499657275, + "grad_norm": 0.11856674402952194, + "learning_rate": 0.00019898398037234812, + "loss": 0.1962, + "step": 357 + }, + { + "epoch": 0.025830657671633176, + "grad_norm": 0.09198472648859024, + "learning_rate": 0.00019898109395295136, + "loss": 0.1879, + "step": 358 + }, + { + "epoch": 0.025902810346693603, + "grad_norm": 0.09058953076601028, + "learning_rate": 0.00019897820753355462, + "loss": 0.1632, + "step": 359 + }, + { + "epoch": 0.025974963021754033, + "grad_norm": 0.08021187037229538, + "learning_rate": 0.00019897532111415789, + "loss": 0.165, + "step": 360 + }, + { + "epoch": 0.02604711569681446, + "grad_norm": 0.0965966060757637, + "learning_rate": 0.00019897243469476118, + "loss": 0.1542, + "step": 361 + }, + { + "epoch": 0.026119268371874886, + "grad_norm": 0.08407099545001984, + "learning_rate": 0.00019896954827536444, + "loss": 0.1826, + "step": 362 + }, + { + "epoch": 0.026191421046935316, + "grad_norm": 0.10092399269342422, + "learning_rate": 0.00019896666185596767, + "loss": 0.1754, + "step": 363 + }, + { + "epoch": 0.026263573721995743, + "grad_norm": 0.09448864310979843, + "learning_rate": 0.00019896377543657094, + "loss": 0.1839, + "step": 364 + }, + { + "epoch": 0.02633572639705617, + "grad_norm": 0.13143576681613922, + "learning_rate": 0.0001989608890171742, + "loss": 0.1908, + "step": 365 + }, + { + "epoch": 0.0264078790721166, + "grad_norm": 0.12041793018579483, + "learning_rate": 0.00019895800259777746, + "loss": 0.2097, + "step": 366 + }, + { + "epoch": 0.026480031747177026, + "grad_norm": 0.105339415371418, + "learning_rate": 0.00019895511617838073, + "loss": 0.18, + "step": 367 + }, + { + "epoch": 0.026552184422237453, + "grad_norm": 0.10656247287988663, + "learning_rate": 0.000198952229758984, + "loss": 0.2078, + "step": 368 + }, + { + "epoch": 0.026624337097297883, + "grad_norm": 0.11251800507307053, + "learning_rate": 0.00019894934333958725, + "loss": 0.1981, + "step": 369 + }, + { + "epoch": 0.02669648977235831, + "grad_norm": 0.12720337510108948, + "learning_rate": 0.00019894645692019052, + "loss": 0.2709, + "step": 370 + }, + { + "epoch": 0.02676864244741874, + "grad_norm": 0.09239493310451508, + "learning_rate": 0.00019894357050079378, + "loss": 0.2046, + "step": 371 + }, + { + "epoch": 0.026840795122479166, + "grad_norm": 0.10885576903820038, + "learning_rate": 0.00019894068408139704, + "loss": 0.1987, + "step": 372 + }, + { + "epoch": 0.026912947797539593, + "grad_norm": 0.11913982033729553, + "learning_rate": 0.0001989377976620003, + "loss": 0.1626, + "step": 373 + }, + { + "epoch": 0.026985100472600023, + "grad_norm": 0.12544821202754974, + "learning_rate": 0.00019893491124260354, + "loss": 0.2216, + "step": 374 + }, + { + "epoch": 0.02705725314766045, + "grad_norm": 0.13693448901176453, + "learning_rate": 0.00019893202482320683, + "loss": 0.2212, + "step": 375 + }, + { + "epoch": 0.027129405822720876, + "grad_norm": 0.08375770598649979, + "learning_rate": 0.0001989291384038101, + "loss": 0.1654, + "step": 376 + }, + { + "epoch": 0.027201558497781306, + "grad_norm": 0.11591768264770508, + "learning_rate": 0.00019892625198441336, + "loss": 0.1436, + "step": 377 + }, + { + "epoch": 0.027273711172841733, + "grad_norm": 0.11632189899682999, + "learning_rate": 0.00019892336556501662, + "loss": 0.2036, + "step": 378 + }, + { + "epoch": 0.02734586384790216, + "grad_norm": 0.10341499745845795, + "learning_rate": 0.00019892047914561985, + "loss": 0.2585, + "step": 379 + }, + { + "epoch": 0.02741801652296259, + "grad_norm": 0.10301853716373444, + "learning_rate": 0.00019891759272622312, + "loss": 0.1743, + "step": 380 + }, + { + "epoch": 0.027490169198023016, + "grad_norm": 0.10895366221666336, + "learning_rate": 0.00019891470630682638, + "loss": 0.2726, + "step": 381 + }, + { + "epoch": 0.027562321873083443, + "grad_norm": 0.09963389486074448, + "learning_rate": 0.00019891181988742967, + "loss": 0.2418, + "step": 382 + }, + { + "epoch": 0.027634474548143873, + "grad_norm": 0.09963802248239517, + "learning_rate": 0.00019890893346803293, + "loss": 0.2392, + "step": 383 + }, + { + "epoch": 0.0277066272232043, + "grad_norm": 0.08482670038938522, + "learning_rate": 0.00019890604704863617, + "loss": 0.1992, + "step": 384 + }, + { + "epoch": 0.027778779898264726, + "grad_norm": 0.09376131743192673, + "learning_rate": 0.00019890316062923943, + "loss": 0.1705, + "step": 385 + }, + { + "epoch": 0.027850932573325157, + "grad_norm": 0.12203828245401382, + "learning_rate": 0.0001989002742098427, + "loss": 0.1923, + "step": 386 + }, + { + "epoch": 0.027923085248385583, + "grad_norm": 0.09458067268133163, + "learning_rate": 0.00019889738779044596, + "loss": 0.1744, + "step": 387 + }, + { + "epoch": 0.027995237923446013, + "grad_norm": 0.09280339628458023, + "learning_rate": 0.00019889450137104922, + "loss": 0.1624, + "step": 388 + }, + { + "epoch": 0.02806739059850644, + "grad_norm": 0.11521881073713303, + "learning_rate": 0.00019889161495165248, + "loss": 0.2032, + "step": 389 + }, + { + "epoch": 0.028139543273566867, + "grad_norm": 0.10273708403110504, + "learning_rate": 0.00019888872853225575, + "loss": 0.1503, + "step": 390 + }, + { + "epoch": 0.028211695948627297, + "grad_norm": 0.12336786091327667, + "learning_rate": 0.000198885842112859, + "loss": 0.2347, + "step": 391 + }, + { + "epoch": 0.028283848623687723, + "grad_norm": 0.08287611603736877, + "learning_rate": 0.00019888295569346227, + "loss": 0.1682, + "step": 392 + }, + { + "epoch": 0.02835600129874815, + "grad_norm": 0.0861361101269722, + "learning_rate": 0.00019888006927406554, + "loss": 0.19, + "step": 393 + }, + { + "epoch": 0.02842815397380858, + "grad_norm": 0.1366668939590454, + "learning_rate": 0.0001988771828546688, + "loss": 0.1839, + "step": 394 + }, + { + "epoch": 0.028500306648869007, + "grad_norm": 0.1496826559305191, + "learning_rate": 0.00019887429643527203, + "loss": 0.204, + "step": 395 + }, + { + "epoch": 0.028572459323929433, + "grad_norm": 0.12759724259376526, + "learning_rate": 0.00019887141001587532, + "loss": 0.2151, + "step": 396 + }, + { + "epoch": 0.028644611998989863, + "grad_norm": 0.08731114119291306, + "learning_rate": 0.0001988685235964786, + "loss": 0.1315, + "step": 397 + }, + { + "epoch": 0.02871676467405029, + "grad_norm": 0.0812445878982544, + "learning_rate": 0.00019886563717708185, + "loss": 0.1613, + "step": 398 + }, + { + "epoch": 0.028788917349110717, + "grad_norm": 0.20545347034931183, + "learning_rate": 0.0001988627507576851, + "loss": 0.2087, + "step": 399 + }, + { + "epoch": 0.028861070024171147, + "grad_norm": 0.09925662726163864, + "learning_rate": 0.00019885986433828835, + "loss": 0.199, + "step": 400 + }, + { + "epoch": 0.028933222699231573, + "grad_norm": 0.08507594466209412, + "learning_rate": 0.0001988569779188916, + "loss": 0.1732, + "step": 401 + }, + { + "epoch": 0.029005375374292004, + "grad_norm": 0.12562596797943115, + "learning_rate": 0.00019885409149949487, + "loss": 0.2141, + "step": 402 + }, + { + "epoch": 0.02907752804935243, + "grad_norm": 0.07977975159883499, + "learning_rate": 0.00019885120508009816, + "loss": 0.1969, + "step": 403 + }, + { + "epoch": 0.029149680724412857, + "grad_norm": 0.10498231649398804, + "learning_rate": 0.00019884831866070143, + "loss": 0.1847, + "step": 404 + }, + { + "epoch": 0.029221833399473287, + "grad_norm": 0.10222010314464569, + "learning_rate": 0.00019884543224130466, + "loss": 0.1883, + "step": 405 + }, + { + "epoch": 0.029293986074533714, + "grad_norm": 0.1068202406167984, + "learning_rate": 0.00019884254582190793, + "loss": 0.1362, + "step": 406 + }, + { + "epoch": 0.02936613874959414, + "grad_norm": 0.13500475883483887, + "learning_rate": 0.0001988396594025112, + "loss": 0.2243, + "step": 407 + }, + { + "epoch": 0.02943829142465457, + "grad_norm": 0.11419196426868439, + "learning_rate": 0.00019883677298311445, + "loss": 0.2222, + "step": 408 + }, + { + "epoch": 0.029510444099714997, + "grad_norm": 0.09644989669322968, + "learning_rate": 0.00019883388656371771, + "loss": 0.1744, + "step": 409 + }, + { + "epoch": 0.029582596774775424, + "grad_norm": 0.09299837052822113, + "learning_rate": 0.00019883100014432098, + "loss": 0.1963, + "step": 410 + }, + { + "epoch": 0.029654749449835854, + "grad_norm": 0.08479341864585876, + "learning_rate": 0.00019882811372492424, + "loss": 0.2138, + "step": 411 + }, + { + "epoch": 0.02972690212489628, + "grad_norm": 0.12209715694189072, + "learning_rate": 0.0001988252273055275, + "loss": 0.2051, + "step": 412 + }, + { + "epoch": 0.029799054799956707, + "grad_norm": 0.1003933921456337, + "learning_rate": 0.00019882234088613077, + "loss": 0.176, + "step": 413 + }, + { + "epoch": 0.029871207475017137, + "grad_norm": 0.11374977976083755, + "learning_rate": 0.00019881945446673403, + "loss": 0.177, + "step": 414 + }, + { + "epoch": 0.029943360150077564, + "grad_norm": 0.13240540027618408, + "learning_rate": 0.0001988165680473373, + "loss": 0.1991, + "step": 415 + }, + { + "epoch": 0.03001551282513799, + "grad_norm": 0.082539863884449, + "learning_rate": 0.00019881368162794053, + "loss": 0.1831, + "step": 416 + }, + { + "epoch": 0.03008766550019842, + "grad_norm": 0.08623918890953064, + "learning_rate": 0.00019881079520854382, + "loss": 0.1593, + "step": 417 + }, + { + "epoch": 0.030159818175258847, + "grad_norm": 0.09880755096673965, + "learning_rate": 0.00019880790878914708, + "loss": 0.1569, + "step": 418 + }, + { + "epoch": 0.030231970850319277, + "grad_norm": 0.09099980443716049, + "learning_rate": 0.00019880502236975034, + "loss": 0.1791, + "step": 419 + }, + { + "epoch": 0.030304123525379704, + "grad_norm": 0.08195766806602478, + "learning_rate": 0.0001988021359503536, + "loss": 0.1759, + "step": 420 + }, + { + "epoch": 0.03037627620044013, + "grad_norm": 0.08706981688737869, + "learning_rate": 0.00019879924953095684, + "loss": 0.1947, + "step": 421 + }, + { + "epoch": 0.03044842887550056, + "grad_norm": 0.08890935033559799, + "learning_rate": 0.0001987963631115601, + "loss": 0.1449, + "step": 422 + }, + { + "epoch": 0.030520581550560987, + "grad_norm": 0.08960860967636108, + "learning_rate": 0.00019879347669216337, + "loss": 0.1971, + "step": 423 + }, + { + "epoch": 0.030592734225621414, + "grad_norm": 0.09105844795703888, + "learning_rate": 0.00019879059027276666, + "loss": 0.1579, + "step": 424 + }, + { + "epoch": 0.030664886900681844, + "grad_norm": 0.0840487852692604, + "learning_rate": 0.00019878770385336992, + "loss": 0.1593, + "step": 425 + }, + { + "epoch": 0.03073703957574227, + "grad_norm": 0.10095898807048798, + "learning_rate": 0.00019878481743397316, + "loss": 0.2121, + "step": 426 + }, + { + "epoch": 0.030809192250802697, + "grad_norm": 0.08656778931617737, + "learning_rate": 0.00019878193101457642, + "loss": 0.1855, + "step": 427 + }, + { + "epoch": 0.030881344925863127, + "grad_norm": 0.12554162740707397, + "learning_rate": 0.00019877904459517968, + "loss": 0.1579, + "step": 428 + }, + { + "epoch": 0.030953497600923554, + "grad_norm": 0.1407862901687622, + "learning_rate": 0.00019877615817578295, + "loss": 0.2155, + "step": 429 + }, + { + "epoch": 0.03102565027598398, + "grad_norm": 0.11547860503196716, + "learning_rate": 0.0001987732717563862, + "loss": 0.1828, + "step": 430 + }, + { + "epoch": 0.03109780295104441, + "grad_norm": 0.09747839719057083, + "learning_rate": 0.00019877038533698947, + "loss": 0.1626, + "step": 431 + }, + { + "epoch": 0.031169955626104837, + "grad_norm": 0.10276393592357635, + "learning_rate": 0.00019876749891759274, + "loss": 0.2393, + "step": 432 + }, + { + "epoch": 0.031242108301165267, + "grad_norm": 0.09598229825496674, + "learning_rate": 0.000198764612498196, + "loss": 0.1932, + "step": 433 + }, + { + "epoch": 0.031314260976225694, + "grad_norm": 0.08966390043497086, + "learning_rate": 0.00019876172607879926, + "loss": 0.1931, + "step": 434 + }, + { + "epoch": 0.031386413651286124, + "grad_norm": 0.10524100810289383, + "learning_rate": 0.00019875883965940252, + "loss": 0.1931, + "step": 435 + }, + { + "epoch": 0.03145856632634655, + "grad_norm": 0.09162607789039612, + "learning_rate": 0.0001987559532400058, + "loss": 0.1788, + "step": 436 + }, + { + "epoch": 0.03153071900140698, + "grad_norm": 0.13454949855804443, + "learning_rate": 0.00019875306682060902, + "loss": 0.1972, + "step": 437 + }, + { + "epoch": 0.03160287167646741, + "grad_norm": 0.13340908288955688, + "learning_rate": 0.0001987501804012123, + "loss": 0.2229, + "step": 438 + }, + { + "epoch": 0.03167502435152783, + "grad_norm": 0.12137065082788467, + "learning_rate": 0.00019874729398181558, + "loss": 0.1873, + "step": 439 + }, + { + "epoch": 0.03174717702658826, + "grad_norm": 0.08705832809209824, + "learning_rate": 0.00019874440756241884, + "loss": 0.1627, + "step": 440 + }, + { + "epoch": 0.03181932970164869, + "grad_norm": 0.10541950911283493, + "learning_rate": 0.0001987415211430221, + "loss": 0.1981, + "step": 441 + }, + { + "epoch": 0.031891482376709114, + "grad_norm": 0.07744797319173813, + "learning_rate": 0.00019873863472362534, + "loss": 0.2073, + "step": 442 + }, + { + "epoch": 0.031963635051769544, + "grad_norm": 0.09692710638046265, + "learning_rate": 0.0001987357483042286, + "loss": 0.179, + "step": 443 + }, + { + "epoch": 0.032035787726829974, + "grad_norm": 0.2575322389602661, + "learning_rate": 0.00019873286188483186, + "loss": 0.1625, + "step": 444 + }, + { + "epoch": 0.0321079404018904, + "grad_norm": 0.10213745385408401, + "learning_rate": 0.00019872997546543515, + "loss": 0.1663, + "step": 445 + }, + { + "epoch": 0.03218009307695083, + "grad_norm": 0.1628570258617401, + "learning_rate": 0.00019872708904603842, + "loss": 0.207, + "step": 446 + }, + { + "epoch": 0.03225224575201126, + "grad_norm": 0.23218366503715515, + "learning_rate": 0.00019872420262664165, + "loss": 0.1867, + "step": 447 + }, + { + "epoch": 0.03232439842707168, + "grad_norm": 0.09101596474647522, + "learning_rate": 0.00019872131620724491, + "loss": 0.2133, + "step": 448 + }, + { + "epoch": 0.03239655110213211, + "grad_norm": 0.10827759653329849, + "learning_rate": 0.00019871842978784818, + "loss": 0.1562, + "step": 449 + }, + { + "epoch": 0.03246870377719254, + "grad_norm": 0.08563897013664246, + "learning_rate": 0.00019871554336845144, + "loss": 0.1678, + "step": 450 + }, + { + "epoch": 0.032540856452252964, + "grad_norm": 0.13903899490833282, + "learning_rate": 0.0001987126569490547, + "loss": 0.1915, + "step": 451 + }, + { + "epoch": 0.032613009127313394, + "grad_norm": 0.13983823359012604, + "learning_rate": 0.00019870977052965797, + "loss": 0.1932, + "step": 452 + }, + { + "epoch": 0.032685161802373824, + "grad_norm": 0.06946656852960587, + "learning_rate": 0.00019870688411026123, + "loss": 0.1394, + "step": 453 + }, + { + "epoch": 0.03275731447743425, + "grad_norm": 0.08703076094388962, + "learning_rate": 0.0001987039976908645, + "loss": 0.1516, + "step": 454 + }, + { + "epoch": 0.03282946715249468, + "grad_norm": 0.0886911153793335, + "learning_rate": 0.00019870111127146776, + "loss": 0.1921, + "step": 455 + }, + { + "epoch": 0.03290161982755511, + "grad_norm": 0.10807514935731888, + "learning_rate": 0.00019869822485207102, + "loss": 0.2187, + "step": 456 + }, + { + "epoch": 0.03297377250261554, + "grad_norm": 0.10857339948415756, + "learning_rate": 0.00019869533843267428, + "loss": 0.183, + "step": 457 + }, + { + "epoch": 0.03304592517767596, + "grad_norm": 0.08645518869161606, + "learning_rate": 0.00019869245201327752, + "loss": 0.2359, + "step": 458 + }, + { + "epoch": 0.03311807785273639, + "grad_norm": 0.09541124105453491, + "learning_rate": 0.0001986895655938808, + "loss": 0.2041, + "step": 459 + }, + { + "epoch": 0.03319023052779682, + "grad_norm": 0.09739275276660919, + "learning_rate": 0.00019868667917448407, + "loss": 0.1494, + "step": 460 + }, + { + "epoch": 0.033262383202857244, + "grad_norm": 0.10426805168390274, + "learning_rate": 0.00019868379275508733, + "loss": 0.183, + "step": 461 + }, + { + "epoch": 0.033334535877917675, + "grad_norm": 0.13501611351966858, + "learning_rate": 0.0001986809063356906, + "loss": 0.1999, + "step": 462 + }, + { + "epoch": 0.033406688552978105, + "grad_norm": 0.1044807955622673, + "learning_rate": 0.00019867801991629386, + "loss": 0.1828, + "step": 463 + }, + { + "epoch": 0.03347884122803853, + "grad_norm": 0.09801997244358063, + "learning_rate": 0.0001986751334968971, + "loss": 0.19, + "step": 464 + }, + { + "epoch": 0.03355099390309896, + "grad_norm": 0.09675435721874237, + "learning_rate": 0.00019867224707750036, + "loss": 0.2026, + "step": 465 + }, + { + "epoch": 0.03362314657815939, + "grad_norm": 0.07665897905826569, + "learning_rate": 0.00019866936065810365, + "loss": 0.2056, + "step": 466 + }, + { + "epoch": 0.03369529925321981, + "grad_norm": 0.10023979097604752, + "learning_rate": 0.0001986664742387069, + "loss": 0.1935, + "step": 467 + }, + { + "epoch": 0.03376745192828024, + "grad_norm": 0.1360999494791031, + "learning_rate": 0.00019866358781931017, + "loss": 0.1964, + "step": 468 + }, + { + "epoch": 0.03383960460334067, + "grad_norm": 0.11151537299156189, + "learning_rate": 0.0001986607013999134, + "loss": 0.1422, + "step": 469 + }, + { + "epoch": 0.033911757278401095, + "grad_norm": 0.09877774864435196, + "learning_rate": 0.00019865781498051667, + "loss": 0.2298, + "step": 470 + }, + { + "epoch": 0.033983909953461525, + "grad_norm": 0.10054466873407364, + "learning_rate": 0.00019865492856111993, + "loss": 0.1612, + "step": 471 + }, + { + "epoch": 0.034056062628521955, + "grad_norm": 0.09187640994787216, + "learning_rate": 0.0001986520421417232, + "loss": 0.158, + "step": 472 + }, + { + "epoch": 0.03412821530358238, + "grad_norm": 0.0961819589138031, + "learning_rate": 0.0001986491557223265, + "loss": 0.1946, + "step": 473 + }, + { + "epoch": 0.03420036797864281, + "grad_norm": 0.12049927562475204, + "learning_rate": 0.00019864626930292972, + "loss": 0.1582, + "step": 474 + }, + { + "epoch": 0.03427252065370324, + "grad_norm": 0.09458652138710022, + "learning_rate": 0.00019864338288353299, + "loss": 0.1743, + "step": 475 + }, + { + "epoch": 0.03434467332876366, + "grad_norm": 0.10272986441850662, + "learning_rate": 0.00019864049646413625, + "loss": 0.2269, + "step": 476 + }, + { + "epoch": 0.03441682600382409, + "grad_norm": 0.12928758561611176, + "learning_rate": 0.0001986376100447395, + "loss": 0.1989, + "step": 477 + }, + { + "epoch": 0.03448897867888452, + "grad_norm": 0.0998743325471878, + "learning_rate": 0.00019863472362534278, + "loss": 0.1788, + "step": 478 + }, + { + "epoch": 0.034561131353944945, + "grad_norm": 0.11031027883291245, + "learning_rate": 0.00019863183720594604, + "loss": 0.2189, + "step": 479 + }, + { + "epoch": 0.034633284029005375, + "grad_norm": 0.08675051480531693, + "learning_rate": 0.0001986289507865493, + "loss": 0.1687, + "step": 480 + }, + { + "epoch": 0.034705436704065805, + "grad_norm": 0.08388551324605942, + "learning_rate": 0.00019862606436715256, + "loss": 0.1719, + "step": 481 + }, + { + "epoch": 0.03477758937912623, + "grad_norm": 0.09967280924320221, + "learning_rate": 0.00019862317794775583, + "loss": 0.1702, + "step": 482 + }, + { + "epoch": 0.03484974205418666, + "grad_norm": 0.13154242932796478, + "learning_rate": 0.0001986202915283591, + "loss": 0.1838, + "step": 483 + }, + { + "epoch": 0.03492189472924709, + "grad_norm": 0.08529820293188095, + "learning_rate": 0.00019861740510896235, + "loss": 0.1792, + "step": 484 + }, + { + "epoch": 0.03499404740430751, + "grad_norm": 0.09009307622909546, + "learning_rate": 0.0001986145186895656, + "loss": 0.189, + "step": 485 + }, + { + "epoch": 0.03506620007936794, + "grad_norm": 0.10295873880386353, + "learning_rate": 0.00019861163227016885, + "loss": 0.1849, + "step": 486 + }, + { + "epoch": 0.03513835275442837, + "grad_norm": 0.13797029852867126, + "learning_rate": 0.00019860874585077214, + "loss": 0.1871, + "step": 487 + }, + { + "epoch": 0.0352105054294888, + "grad_norm": 0.09551911801099777, + "learning_rate": 0.0001986058594313754, + "loss": 0.1643, + "step": 488 + }, + { + "epoch": 0.035282658104549225, + "grad_norm": 0.11506037414073944, + "learning_rate": 0.00019860297301197867, + "loss": 0.1496, + "step": 489 + }, + { + "epoch": 0.035354810779609655, + "grad_norm": 0.09025515615940094, + "learning_rate": 0.0001986000865925819, + "loss": 0.1772, + "step": 490 + }, + { + "epoch": 0.035426963454670085, + "grad_norm": 0.10068268328905106, + "learning_rate": 0.00019859720017318517, + "loss": 0.1583, + "step": 491 + }, + { + "epoch": 0.03549911612973051, + "grad_norm": 0.1159784346818924, + "learning_rate": 0.00019859431375378843, + "loss": 0.1908, + "step": 492 + }, + { + "epoch": 0.03557126880479094, + "grad_norm": 0.1259685754776001, + "learning_rate": 0.0001985914273343917, + "loss": 0.1654, + "step": 493 + }, + { + "epoch": 0.03564342147985137, + "grad_norm": 0.1246333047747612, + "learning_rate": 0.00019858854091499498, + "loss": 0.2091, + "step": 494 + }, + { + "epoch": 0.03571557415491179, + "grad_norm": 0.15852630138397217, + "learning_rate": 0.00019858565449559822, + "loss": 0.227, + "step": 495 + }, + { + "epoch": 0.03578772682997222, + "grad_norm": 0.10838798433542252, + "learning_rate": 0.00019858276807620148, + "loss": 0.1773, + "step": 496 + }, + { + "epoch": 0.03585987950503265, + "grad_norm": 0.09128434211015701, + "learning_rate": 0.00019857988165680474, + "loss": 0.1643, + "step": 497 + }, + { + "epoch": 0.035932032180093075, + "grad_norm": 0.17223364114761353, + "learning_rate": 0.000198576995237408, + "loss": 0.1859, + "step": 498 + }, + { + "epoch": 0.036004184855153505, + "grad_norm": 0.1001114621758461, + "learning_rate": 0.00019857410881801127, + "loss": 0.1668, + "step": 499 + }, + { + "epoch": 0.036076337530213935, + "grad_norm": 0.08362135291099548, + "learning_rate": 0.00019857122239861453, + "loss": 0.2022, + "step": 500 + }, + { + "epoch": 0.03614849020527436, + "grad_norm": 0.10879844427108765, + "learning_rate": 0.00019856833597921777, + "loss": 0.217, + "step": 501 + }, + { + "epoch": 0.03622064288033479, + "grad_norm": 0.1339205950498581, + "learning_rate": 0.00019856544955982106, + "loss": 0.1782, + "step": 502 + }, + { + "epoch": 0.03629279555539522, + "grad_norm": 0.11790255457162857, + "learning_rate": 0.00019856256314042432, + "loss": 0.21, + "step": 503 + }, + { + "epoch": 0.03636494823045564, + "grad_norm": 0.13036282360553741, + "learning_rate": 0.00019855967672102758, + "loss": 0.2059, + "step": 504 + }, + { + "epoch": 0.03643710090551607, + "grad_norm": 0.14188435673713684, + "learning_rate": 0.00019855679030163085, + "loss": 0.2295, + "step": 505 + }, + { + "epoch": 0.0365092535805765, + "grad_norm": 0.18794411420822144, + "learning_rate": 0.00019855390388223408, + "loss": 0.1999, + "step": 506 + }, + { + "epoch": 0.036581406255636925, + "grad_norm": 0.2316272109746933, + "learning_rate": 0.00019855101746283735, + "loss": 0.1816, + "step": 507 + }, + { + "epoch": 0.036653558930697355, + "grad_norm": 0.1288416087627411, + "learning_rate": 0.0001985481310434406, + "loss": 0.1966, + "step": 508 + }, + { + "epoch": 0.036725711605757785, + "grad_norm": 0.10679508745670319, + "learning_rate": 0.0001985452446240439, + "loss": 0.154, + "step": 509 + }, + { + "epoch": 0.03679786428081821, + "grad_norm": 0.0898621529340744, + "learning_rate": 0.00019854235820464716, + "loss": 0.1894, + "step": 510 + }, + { + "epoch": 0.03687001695587864, + "grad_norm": 0.0908215343952179, + "learning_rate": 0.0001985394717852504, + "loss": 0.1742, + "step": 511 + }, + { + "epoch": 0.03694216963093907, + "grad_norm": 0.17195391654968262, + "learning_rate": 0.00019853658536585366, + "loss": 0.2291, + "step": 512 + }, + { + "epoch": 0.03701432230599949, + "grad_norm": 0.12300444394350052, + "learning_rate": 0.00019853369894645692, + "loss": 0.1971, + "step": 513 + }, + { + "epoch": 0.03708647498105992, + "grad_norm": 0.10059063881635666, + "learning_rate": 0.00019853081252706019, + "loss": 0.1918, + "step": 514 + }, + { + "epoch": 0.03715862765612035, + "grad_norm": 0.10917804390192032, + "learning_rate": 0.00019852792610766345, + "loss": 0.1648, + "step": 515 + }, + { + "epoch": 0.037230780331180775, + "grad_norm": 0.10756178200244904, + "learning_rate": 0.0001985250396882667, + "loss": 0.1771, + "step": 516 + }, + { + "epoch": 0.037302933006241205, + "grad_norm": 0.10932893306016922, + "learning_rate": 0.00019852215326886997, + "loss": 0.1977, + "step": 517 + }, + { + "epoch": 0.037375085681301635, + "grad_norm": 0.15212245285511017, + "learning_rate": 0.00019851926684947324, + "loss": 0.2453, + "step": 518 + }, + { + "epoch": 0.03744723835636206, + "grad_norm": 0.08777447789907455, + "learning_rate": 0.0001985163804300765, + "loss": 0.1742, + "step": 519 + }, + { + "epoch": 0.03751939103142249, + "grad_norm": 0.10264381021261215, + "learning_rate": 0.00019851349401067976, + "loss": 0.205, + "step": 520 + }, + { + "epoch": 0.03759154370648292, + "grad_norm": 0.09408518671989441, + "learning_rate": 0.00019851060759128303, + "loss": 0.1924, + "step": 521 + }, + { + "epoch": 0.03766369638154335, + "grad_norm": 0.09707022458314896, + "learning_rate": 0.00019850772117188626, + "loss": 0.1457, + "step": 522 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 0.08716564625501633, + "learning_rate": 0.00019850483475248955, + "loss": 0.1853, + "step": 523 + }, + { + "epoch": 0.0378080017316642, + "grad_norm": 0.17085005342960358, + "learning_rate": 0.00019850194833309282, + "loss": 0.2285, + "step": 524 + }, + { + "epoch": 0.03788015440672463, + "grad_norm": 0.1418502926826477, + "learning_rate": 0.00019849906191369608, + "loss": 0.1484, + "step": 525 + }, + { + "epoch": 0.037952307081785056, + "grad_norm": 0.11434414982795715, + "learning_rate": 0.00019849617549429934, + "loss": 0.1856, + "step": 526 + }, + { + "epoch": 0.038024459756845486, + "grad_norm": 0.07442096620798111, + "learning_rate": 0.00019849328907490258, + "loss": 0.1715, + "step": 527 + }, + { + "epoch": 0.038096612431905916, + "grad_norm": 0.09527246654033661, + "learning_rate": 0.00019849040265550584, + "loss": 0.1711, + "step": 528 + }, + { + "epoch": 0.03816876510696634, + "grad_norm": 0.09395115822553635, + "learning_rate": 0.0001984875162361091, + "loss": 0.1553, + "step": 529 + }, + { + "epoch": 0.03824091778202677, + "grad_norm": 0.10012837499380112, + "learning_rate": 0.0001984846298167124, + "loss": 0.1637, + "step": 530 + }, + { + "epoch": 0.0383130704570872, + "grad_norm": 0.12191860377788544, + "learning_rate": 0.00019848174339731566, + "loss": 0.2048, + "step": 531 + }, + { + "epoch": 0.03838522313214762, + "grad_norm": 0.14264048635959625, + "learning_rate": 0.0001984788569779189, + "loss": 0.1559, + "step": 532 + }, + { + "epoch": 0.03845737580720805, + "grad_norm": 0.11062449216842651, + "learning_rate": 0.00019847597055852215, + "loss": 0.2168, + "step": 533 + }, + { + "epoch": 0.03852952848226848, + "grad_norm": 0.12129320949316025, + "learning_rate": 0.00019847308413912542, + "loss": 0.1881, + "step": 534 + }, + { + "epoch": 0.038601681157328906, + "grad_norm": 0.10533594340085983, + "learning_rate": 0.00019847019771972868, + "loss": 0.1531, + "step": 535 + }, + { + "epoch": 0.038673833832389336, + "grad_norm": 0.10378196835517883, + "learning_rate": 0.00019846731130033194, + "loss": 0.2205, + "step": 536 + }, + { + "epoch": 0.038745986507449766, + "grad_norm": 0.11916801333427429, + "learning_rate": 0.0001984644248809352, + "loss": 0.2088, + "step": 537 + }, + { + "epoch": 0.03881813918251019, + "grad_norm": 0.07917596399784088, + "learning_rate": 0.00019846153846153847, + "loss": 0.1905, + "step": 538 + }, + { + "epoch": 0.03889029185757062, + "grad_norm": 0.11136099696159363, + "learning_rate": 0.00019845865204214173, + "loss": 0.2169, + "step": 539 + }, + { + "epoch": 0.03896244453263105, + "grad_norm": 0.1269828975200653, + "learning_rate": 0.000198455765622745, + "loss": 0.2285, + "step": 540 + }, + { + "epoch": 0.03903459720769147, + "grad_norm": 0.10883588343858719, + "learning_rate": 0.00019845287920334826, + "loss": 0.1356, + "step": 541 + }, + { + "epoch": 0.0391067498827519, + "grad_norm": 0.12637732923030853, + "learning_rate": 0.00019844999278395152, + "loss": 0.1686, + "step": 542 + }, + { + "epoch": 0.03917890255781233, + "grad_norm": 0.08260592073202133, + "learning_rate": 0.00019844710636455476, + "loss": 0.247, + "step": 543 + }, + { + "epoch": 0.039251055232872756, + "grad_norm": 0.1035764068365097, + "learning_rate": 0.00019844421994515805, + "loss": 0.1732, + "step": 544 + }, + { + "epoch": 0.039323207907933186, + "grad_norm": 0.10097551345825195, + "learning_rate": 0.0001984413335257613, + "loss": 0.2054, + "step": 545 + }, + { + "epoch": 0.039395360582993616, + "grad_norm": 0.10047705471515656, + "learning_rate": 0.00019843844710636457, + "loss": 0.1334, + "step": 546 + }, + { + "epoch": 0.03946751325805404, + "grad_norm": 0.12177751958370209, + "learning_rate": 0.00019843556068696784, + "loss": 0.1698, + "step": 547 + }, + { + "epoch": 0.03953966593311447, + "grad_norm": 0.11659875512123108, + "learning_rate": 0.00019843267426757107, + "loss": 0.1794, + "step": 548 + }, + { + "epoch": 0.0396118186081749, + "grad_norm": 0.11983144283294678, + "learning_rate": 0.00019842978784817433, + "loss": 0.2204, + "step": 549 + }, + { + "epoch": 0.03968397128323532, + "grad_norm": 0.09205503761768341, + "learning_rate": 0.0001984269014287776, + "loss": 0.1878, + "step": 550 + }, + { + "epoch": 0.03975612395829575, + "grad_norm": 0.09256859123706818, + "learning_rate": 0.0001984240150093809, + "loss": 0.2018, + "step": 551 + }, + { + "epoch": 0.03982827663335618, + "grad_norm": 0.10241582244634628, + "learning_rate": 0.00019842112858998415, + "loss": 0.2129, + "step": 552 + }, + { + "epoch": 0.03990042930841661, + "grad_norm": 0.09551103413105011, + "learning_rate": 0.00019841824217058739, + "loss": 0.1742, + "step": 553 + }, + { + "epoch": 0.039972581983477036, + "grad_norm": 0.10912424325942993, + "learning_rate": 0.00019841535575119065, + "loss": 0.1939, + "step": 554 + }, + { + "epoch": 0.040044734658537466, + "grad_norm": 0.10542840510606766, + "learning_rate": 0.0001984124693317939, + "loss": 0.1925, + "step": 555 + }, + { + "epoch": 0.040116887333597896, + "grad_norm": 0.09988424926996231, + "learning_rate": 0.00019840958291239717, + "loss": 0.1617, + "step": 556 + }, + { + "epoch": 0.04018904000865832, + "grad_norm": 0.10071025788784027, + "learning_rate": 0.00019840669649300044, + "loss": 0.1637, + "step": 557 + }, + { + "epoch": 0.04026119268371875, + "grad_norm": 0.09819203615188599, + "learning_rate": 0.0001984038100736037, + "loss": 0.1398, + "step": 558 + }, + { + "epoch": 0.04033334535877918, + "grad_norm": 0.09179025143384933, + "learning_rate": 0.00019840092365420696, + "loss": 0.1722, + "step": 559 + }, + { + "epoch": 0.0404054980338396, + "grad_norm": 0.093745656311512, + "learning_rate": 0.00019839803723481023, + "loss": 0.1665, + "step": 560 + }, + { + "epoch": 0.04047765070890003, + "grad_norm": 0.12472759187221527, + "learning_rate": 0.0001983951508154135, + "loss": 0.142, + "step": 561 + }, + { + "epoch": 0.04054980338396046, + "grad_norm": 0.1199507862329483, + "learning_rate": 0.00019839226439601675, + "loss": 0.1699, + "step": 562 + }, + { + "epoch": 0.040621956059020886, + "grad_norm": 0.12098688632249832, + "learning_rate": 0.00019838937797662002, + "loss": 0.1944, + "step": 563 + }, + { + "epoch": 0.040694108734081316, + "grad_norm": 0.11055096238851547, + "learning_rate": 0.00019838649155722325, + "loss": 0.1638, + "step": 564 + }, + { + "epoch": 0.040766261409141746, + "grad_norm": 0.1174546554684639, + "learning_rate": 0.00019838360513782654, + "loss": 0.1746, + "step": 565 + }, + { + "epoch": 0.04083841408420217, + "grad_norm": 0.08360788226127625, + "learning_rate": 0.0001983807187184298, + "loss": 0.1667, + "step": 566 + }, + { + "epoch": 0.0409105667592626, + "grad_norm": 0.10997113585472107, + "learning_rate": 0.00019837783229903307, + "loss": 0.1781, + "step": 567 + }, + { + "epoch": 0.04098271943432303, + "grad_norm": 0.09272412210702896, + "learning_rate": 0.00019837494587963633, + "loss": 0.1386, + "step": 568 + }, + { + "epoch": 0.04105487210938345, + "grad_norm": 0.15601404011249542, + "learning_rate": 0.00019837205946023957, + "loss": 0.1773, + "step": 569 + }, + { + "epoch": 0.04112702478444388, + "grad_norm": 0.09089847654104233, + "learning_rate": 0.00019836917304084283, + "loss": 0.1821, + "step": 570 + }, + { + "epoch": 0.04119917745950431, + "grad_norm": 0.14816875755786896, + "learning_rate": 0.0001983662866214461, + "loss": 0.1865, + "step": 571 + }, + { + "epoch": 0.041271330134564736, + "grad_norm": 0.08765484392642975, + "learning_rate": 0.00019836340020204938, + "loss": 0.2037, + "step": 572 + }, + { + "epoch": 0.041343482809625166, + "grad_norm": 0.11936503648757935, + "learning_rate": 0.00019836051378265264, + "loss": 0.1625, + "step": 573 + }, + { + "epoch": 0.041415635484685596, + "grad_norm": 0.09595091640949249, + "learning_rate": 0.00019835762736325588, + "loss": 0.2067, + "step": 574 + }, + { + "epoch": 0.04148778815974602, + "grad_norm": 0.07997038960456848, + "learning_rate": 0.00019835474094385914, + "loss": 0.1594, + "step": 575 + }, + { + "epoch": 0.04155994083480645, + "grad_norm": 0.11834511905908585, + "learning_rate": 0.0001983518545244624, + "loss": 0.1785, + "step": 576 + }, + { + "epoch": 0.04163209350986688, + "grad_norm": 0.0922444686293602, + "learning_rate": 0.00019834896810506567, + "loss": 0.1694, + "step": 577 + }, + { + "epoch": 0.0417042461849273, + "grad_norm": 0.09497426450252533, + "learning_rate": 0.00019834608168566893, + "loss": 0.173, + "step": 578 + }, + { + "epoch": 0.04177639885998773, + "grad_norm": 0.11161245405673981, + "learning_rate": 0.0001983431952662722, + "loss": 0.1949, + "step": 579 + }, + { + "epoch": 0.04184855153504816, + "grad_norm": 0.09253062307834625, + "learning_rate": 0.00019834030884687546, + "loss": 0.1983, + "step": 580 + }, + { + "epoch": 0.041920704210108586, + "grad_norm": 0.10557336360216141, + "learning_rate": 0.00019833742242747872, + "loss": 0.1835, + "step": 581 + }, + { + "epoch": 0.041992856885169016, + "grad_norm": 0.15258745849132538, + "learning_rate": 0.00019833453600808198, + "loss": 0.1598, + "step": 582 + }, + { + "epoch": 0.04206500956022945, + "grad_norm": 0.09645214676856995, + "learning_rate": 0.00019833164958868525, + "loss": 0.1596, + "step": 583 + }, + { + "epoch": 0.04213716223528988, + "grad_norm": 0.10072506964206696, + "learning_rate": 0.0001983287631692885, + "loss": 0.191, + "step": 584 + }, + { + "epoch": 0.0422093149103503, + "grad_norm": 0.12612341344356537, + "learning_rate": 0.00019832587674989177, + "loss": 0.1558, + "step": 585 + }, + { + "epoch": 0.04228146758541073, + "grad_norm": 0.08388601988554001, + "learning_rate": 0.00019832299033049504, + "loss": 0.1369, + "step": 586 + }, + { + "epoch": 0.04235362026047116, + "grad_norm": 0.1265336275100708, + "learning_rate": 0.0001983201039110983, + "loss": 0.1663, + "step": 587 + }, + { + "epoch": 0.04242577293553158, + "grad_norm": 0.09034436196088791, + "learning_rate": 0.00019831721749170156, + "loss": 0.1655, + "step": 588 + }, + { + "epoch": 0.04249792561059201, + "grad_norm": 0.10816913843154907, + "learning_rate": 0.00019831433107230482, + "loss": 0.1637, + "step": 589 + }, + { + "epoch": 0.04257007828565244, + "grad_norm": 0.08399533480405807, + "learning_rate": 0.0001983114446529081, + "loss": 0.1515, + "step": 590 + }, + { + "epoch": 0.04264223096071287, + "grad_norm": 0.0992622897028923, + "learning_rate": 0.00019830855823351132, + "loss": 0.1813, + "step": 591 + }, + { + "epoch": 0.0427143836357733, + "grad_norm": 0.10754331201314926, + "learning_rate": 0.00019830567181411459, + "loss": 0.1651, + "step": 592 + }, + { + "epoch": 0.04278653631083373, + "grad_norm": 0.09457876533269882, + "learning_rate": 0.00019830278539471788, + "loss": 0.1797, + "step": 593 + }, + { + "epoch": 0.04285868898589415, + "grad_norm": 0.08354930579662323, + "learning_rate": 0.00019829989897532114, + "loss": 0.1702, + "step": 594 + }, + { + "epoch": 0.04293084166095458, + "grad_norm": 0.13465763628482819, + "learning_rate": 0.0001982970125559244, + "loss": 0.1844, + "step": 595 + }, + { + "epoch": 0.04300299433601501, + "grad_norm": 0.12427057325839996, + "learning_rate": 0.00019829412613652764, + "loss": 0.2262, + "step": 596 + }, + { + "epoch": 0.04307514701107543, + "grad_norm": 0.1210947260260582, + "learning_rate": 0.0001982912397171309, + "loss": 0.2213, + "step": 597 + }, + { + "epoch": 0.043147299686135863, + "grad_norm": 0.11663418263196945, + "learning_rate": 0.00019828835329773416, + "loss": 0.1689, + "step": 598 + }, + { + "epoch": 0.043219452361196294, + "grad_norm": 0.11826334148645401, + "learning_rate": 0.00019828546687833743, + "loss": 0.1991, + "step": 599 + }, + { + "epoch": 0.04329160503625672, + "grad_norm": 0.11704009026288986, + "learning_rate": 0.00019828258045894072, + "loss": 0.1546, + "step": 600 + }, + { + "epoch": 0.04336375771131715, + "grad_norm": 0.13563813269138336, + "learning_rate": 0.00019827969403954395, + "loss": 0.1606, + "step": 601 + }, + { + "epoch": 0.04343591038637758, + "grad_norm": 0.10411211103200912, + "learning_rate": 0.00019827680762014721, + "loss": 0.1432, + "step": 602 + }, + { + "epoch": 0.043508063061438, + "grad_norm": 0.11733534187078476, + "learning_rate": 0.00019827392120075048, + "loss": 0.1368, + "step": 603 + }, + { + "epoch": 0.04358021573649843, + "grad_norm": 0.1115807443857193, + "learning_rate": 0.00019827103478135374, + "loss": 0.1922, + "step": 604 + }, + { + "epoch": 0.04365236841155886, + "grad_norm": 0.1343086063861847, + "learning_rate": 0.000198268148361957, + "loss": 0.2196, + "step": 605 + }, + { + "epoch": 0.043724521086619283, + "grad_norm": 0.09392105042934418, + "learning_rate": 0.00019826526194256027, + "loss": 0.1787, + "step": 606 + }, + { + "epoch": 0.043796673761679714, + "grad_norm": 0.09433528035879135, + "learning_rate": 0.00019826237552316353, + "loss": 0.1883, + "step": 607 + }, + { + "epoch": 0.043868826436740144, + "grad_norm": 0.11469694972038269, + "learning_rate": 0.0001982594891037668, + "loss": 0.2255, + "step": 608 + }, + { + "epoch": 0.04394097911180057, + "grad_norm": 0.1101691946387291, + "learning_rate": 0.00019825660268437006, + "loss": 0.1569, + "step": 609 + }, + { + "epoch": 0.044013131786861, + "grad_norm": 0.10026239603757858, + "learning_rate": 0.00019825371626497332, + "loss": 0.1712, + "step": 610 + }, + { + "epoch": 0.04408528446192143, + "grad_norm": 0.15787126123905182, + "learning_rate": 0.00019825082984557658, + "loss": 0.186, + "step": 611 + }, + { + "epoch": 0.04415743713698185, + "grad_norm": 0.10418775677680969, + "learning_rate": 0.00019824794342617982, + "loss": 0.1462, + "step": 612 + }, + { + "epoch": 0.04422958981204228, + "grad_norm": 0.0792064517736435, + "learning_rate": 0.00019824505700678308, + "loss": 0.1684, + "step": 613 + }, + { + "epoch": 0.04430174248710271, + "grad_norm": 0.10991806536912918, + "learning_rate": 0.00019824217058738637, + "loss": 0.1581, + "step": 614 + }, + { + "epoch": 0.04437389516216314, + "grad_norm": 0.08922984451055527, + "learning_rate": 0.00019823928416798963, + "loss": 0.1659, + "step": 615 + }, + { + "epoch": 0.044446047837223564, + "grad_norm": 0.1666286438703537, + "learning_rate": 0.0001982363977485929, + "loss": 0.1732, + "step": 616 + }, + { + "epoch": 0.044518200512283994, + "grad_norm": 0.1010286882519722, + "learning_rate": 0.00019823351132919613, + "loss": 0.1621, + "step": 617 + }, + { + "epoch": 0.044590353187344424, + "grad_norm": 0.09831476211547852, + "learning_rate": 0.0001982306249097994, + "loss": 0.185, + "step": 618 + }, + { + "epoch": 0.04466250586240485, + "grad_norm": 0.10219982266426086, + "learning_rate": 0.00019822773849040266, + "loss": 0.1577, + "step": 619 + }, + { + "epoch": 0.04473465853746528, + "grad_norm": 0.08898953348398209, + "learning_rate": 0.00019822485207100592, + "loss": 0.1435, + "step": 620 + }, + { + "epoch": 0.04480681121252571, + "grad_norm": 0.11173056066036224, + "learning_rate": 0.0001982219656516092, + "loss": 0.2292, + "step": 621 + }, + { + "epoch": 0.04487896388758613, + "grad_norm": 0.11962389200925827, + "learning_rate": 0.00019821907923221245, + "loss": 0.1796, + "step": 622 + }, + { + "epoch": 0.04495111656264656, + "grad_norm": 0.09178833663463593, + "learning_rate": 0.0001982161928128157, + "loss": 0.1968, + "step": 623 + }, + { + "epoch": 0.04502326923770699, + "grad_norm": 0.09955684840679169, + "learning_rate": 0.00019821330639341897, + "loss": 0.2178, + "step": 624 + }, + { + "epoch": 0.045095421912767414, + "grad_norm": 0.08512556552886963, + "learning_rate": 0.00019821041997402223, + "loss": 0.1563, + "step": 625 + }, + { + "epoch": 0.045167574587827844, + "grad_norm": 0.10034400224685669, + "learning_rate": 0.0001982075335546255, + "loss": 0.2094, + "step": 626 + }, + { + "epoch": 0.045239727262888274, + "grad_norm": 0.1015506461262703, + "learning_rate": 0.00019820464713522876, + "loss": 0.2016, + "step": 627 + }, + { + "epoch": 0.0453118799379487, + "grad_norm": 0.08328647911548615, + "learning_rate": 0.00019820176071583202, + "loss": 0.1894, + "step": 628 + }, + { + "epoch": 0.04538403261300913, + "grad_norm": 0.09488657861948013, + "learning_rate": 0.0001981988742964353, + "loss": 0.1643, + "step": 629 + }, + { + "epoch": 0.04545618528806956, + "grad_norm": 0.13005909323692322, + "learning_rate": 0.00019819598787703855, + "loss": 0.1713, + "step": 630 + }, + { + "epoch": 0.04552833796312998, + "grad_norm": 0.09195161610841751, + "learning_rate": 0.0001981931014576418, + "loss": 0.1754, + "step": 631 + }, + { + "epoch": 0.04560049063819041, + "grad_norm": 0.09518012404441833, + "learning_rate": 0.00019819021503824508, + "loss": 0.1907, + "step": 632 + }, + { + "epoch": 0.04567264331325084, + "grad_norm": 0.12911607325077057, + "learning_rate": 0.0001981873286188483, + "loss": 0.1522, + "step": 633 + }, + { + "epoch": 0.045744795988311264, + "grad_norm": 0.09490635991096497, + "learning_rate": 0.00019818444219945157, + "loss": 0.1362, + "step": 634 + }, + { + "epoch": 0.045816948663371694, + "grad_norm": 0.10800332576036453, + "learning_rate": 0.00019818155578005486, + "loss": 0.1842, + "step": 635 + }, + { + "epoch": 0.045889101338432124, + "grad_norm": 0.10239040851593018, + "learning_rate": 0.00019817866936065813, + "loss": 0.1807, + "step": 636 + }, + { + "epoch": 0.04596125401349255, + "grad_norm": 0.11252974718809128, + "learning_rate": 0.0001981757829412614, + "loss": 0.1881, + "step": 637 + }, + { + "epoch": 0.04603340668855298, + "grad_norm": 0.10153605788946152, + "learning_rate": 0.00019817289652186463, + "loss": 0.2006, + "step": 638 + }, + { + "epoch": 0.04610555936361341, + "grad_norm": 0.09800973534584045, + "learning_rate": 0.0001981700101024679, + "loss": 0.1909, + "step": 639 + }, + { + "epoch": 0.04617771203867383, + "grad_norm": 0.08246664702892303, + "learning_rate": 0.00019816712368307115, + "loss": 0.1475, + "step": 640 + }, + { + "epoch": 0.04624986471373426, + "grad_norm": 0.08874426782131195, + "learning_rate": 0.00019816423726367441, + "loss": 0.1972, + "step": 641 + }, + { + "epoch": 0.04632201738879469, + "grad_norm": 0.08816955238580704, + "learning_rate": 0.0001981613508442777, + "loss": 0.1459, + "step": 642 + }, + { + "epoch": 0.046394170063855114, + "grad_norm": 0.10154946893453598, + "learning_rate": 0.00019815846442488094, + "loss": 0.2218, + "step": 643 + }, + { + "epoch": 0.046466322738915544, + "grad_norm": 0.1170118972659111, + "learning_rate": 0.0001981555780054842, + "loss": 0.1207, + "step": 644 + }, + { + "epoch": 0.046538475413975974, + "grad_norm": 0.1154472604393959, + "learning_rate": 0.00019815269158608747, + "loss": 0.1987, + "step": 645 + }, + { + "epoch": 0.046610628089036404, + "grad_norm": 0.12462370842695236, + "learning_rate": 0.00019814980516669073, + "loss": 0.1656, + "step": 646 + }, + { + "epoch": 0.04668278076409683, + "grad_norm": 0.0983177199959755, + "learning_rate": 0.000198146918747294, + "loss": 0.154, + "step": 647 + }, + { + "epoch": 0.04675493343915726, + "grad_norm": 0.10129359364509583, + "learning_rate": 0.00019814403232789726, + "loss": 0.1307, + "step": 648 + }, + { + "epoch": 0.04682708611421769, + "grad_norm": 0.16896426677703857, + "learning_rate": 0.00019814114590850052, + "loss": 0.1752, + "step": 649 + }, + { + "epoch": 0.04689923878927811, + "grad_norm": 0.09189584851264954, + "learning_rate": 0.00019813825948910378, + "loss": 0.2356, + "step": 650 + }, + { + "epoch": 0.04697139146433854, + "grad_norm": 0.10254523903131485, + "learning_rate": 0.00019813537306970704, + "loss": 0.2083, + "step": 651 + }, + { + "epoch": 0.04704354413939897, + "grad_norm": 0.14748401939868927, + "learning_rate": 0.0001981324866503103, + "loss": 0.2093, + "step": 652 + }, + { + "epoch": 0.047115696814459394, + "grad_norm": 0.08709222823381424, + "learning_rate": 0.00019812960023091357, + "loss": 0.2254, + "step": 653 + }, + { + "epoch": 0.047187849489519824, + "grad_norm": 0.10217154771089554, + "learning_rate": 0.0001981267138115168, + "loss": 0.2106, + "step": 654 + }, + { + "epoch": 0.047260002164580255, + "grad_norm": 0.08088697493076324, + "learning_rate": 0.00019812382739212007, + "loss": 0.1829, + "step": 655 + }, + { + "epoch": 0.04733215483964068, + "grad_norm": 0.08585620671510696, + "learning_rate": 0.00019812094097272336, + "loss": 0.2046, + "step": 656 + }, + { + "epoch": 0.04740430751470111, + "grad_norm": 0.0953826904296875, + "learning_rate": 0.00019811805455332662, + "loss": 0.2091, + "step": 657 + }, + { + "epoch": 0.04747646018976154, + "grad_norm": 0.0792909488081932, + "learning_rate": 0.00019811516813392988, + "loss": 0.2172, + "step": 658 + }, + { + "epoch": 0.04754861286482196, + "grad_norm": 0.10460888594388962, + "learning_rate": 0.00019811228171453312, + "loss": 0.1897, + "step": 659 + }, + { + "epoch": 0.04762076553988239, + "grad_norm": 0.10770123451948166, + "learning_rate": 0.00019810939529513638, + "loss": 0.1761, + "step": 660 + }, + { + "epoch": 0.04769291821494282, + "grad_norm": 0.16509467363357544, + "learning_rate": 0.00019810650887573965, + "loss": 0.2253, + "step": 661 + }, + { + "epoch": 0.047765070890003244, + "grad_norm": 0.11456334590911865, + "learning_rate": 0.0001981036224563429, + "loss": 0.1971, + "step": 662 + }, + { + "epoch": 0.047837223565063675, + "grad_norm": 0.10152343660593033, + "learning_rate": 0.0001981007360369462, + "loss": 0.1994, + "step": 663 + }, + { + "epoch": 0.047909376240124105, + "grad_norm": 0.12069497257471085, + "learning_rate": 0.00019809784961754943, + "loss": 0.2312, + "step": 664 + }, + { + "epoch": 0.04798152891518453, + "grad_norm": 0.10090707987546921, + "learning_rate": 0.0001980949631981527, + "loss": 0.1837, + "step": 665 + }, + { + "epoch": 0.04805368159024496, + "grad_norm": 0.09290478378534317, + "learning_rate": 0.00019809207677875596, + "loss": 0.2106, + "step": 666 + }, + { + "epoch": 0.04812583426530539, + "grad_norm": 0.11278660595417023, + "learning_rate": 0.00019808919035935922, + "loss": 0.2042, + "step": 667 + }, + { + "epoch": 0.04819798694036581, + "grad_norm": 0.08206578344106674, + "learning_rate": 0.00019808630393996249, + "loss": 0.2105, + "step": 668 + }, + { + "epoch": 0.04827013961542624, + "grad_norm": 0.08444409817457199, + "learning_rate": 0.00019808341752056575, + "loss": 0.1807, + "step": 669 + }, + { + "epoch": 0.04834229229048667, + "grad_norm": 0.09454233944416046, + "learning_rate": 0.000198080531101169, + "loss": 0.1319, + "step": 670 + }, + { + "epoch": 0.048414444965547095, + "grad_norm": 0.10502707213163376, + "learning_rate": 0.00019807764468177228, + "loss": 0.2048, + "step": 671 + }, + { + "epoch": 0.048486597640607525, + "grad_norm": 0.11612027138471603, + "learning_rate": 0.00019807475826237554, + "loss": 0.1878, + "step": 672 + }, + { + "epoch": 0.048558750315667955, + "grad_norm": 0.1150597631931305, + "learning_rate": 0.0001980718718429788, + "loss": 0.2027, + "step": 673 + }, + { + "epoch": 0.04863090299072838, + "grad_norm": 0.11744673550128937, + "learning_rate": 0.00019806898542358206, + "loss": 0.173, + "step": 674 + }, + { + "epoch": 0.04870305566578881, + "grad_norm": 0.1384035348892212, + "learning_rate": 0.0001980660990041853, + "loss": 0.2282, + "step": 675 + }, + { + "epoch": 0.04877520834084924, + "grad_norm": 0.12562517821788788, + "learning_rate": 0.00019806321258478856, + "loss": 0.1754, + "step": 676 + }, + { + "epoch": 0.04884736101590967, + "grad_norm": 0.13998053967952728, + "learning_rate": 0.00019806032616539185, + "loss": 0.1818, + "step": 677 + }, + { + "epoch": 0.04891951369097009, + "grad_norm": 0.1093667671084404, + "learning_rate": 0.00019805743974599512, + "loss": 0.167, + "step": 678 + }, + { + "epoch": 0.04899166636603052, + "grad_norm": 0.11169123649597168, + "learning_rate": 0.00019805455332659838, + "loss": 0.1963, + "step": 679 + }, + { + "epoch": 0.04906381904109095, + "grad_norm": 0.11249633878469467, + "learning_rate": 0.00019805166690720161, + "loss": 0.2211, + "step": 680 + }, + { + "epoch": 0.049135971716151375, + "grad_norm": 0.10238689184188843, + "learning_rate": 0.00019804878048780488, + "loss": 0.1911, + "step": 681 + }, + { + "epoch": 0.049208124391211805, + "grad_norm": 0.09815354645252228, + "learning_rate": 0.00019804589406840814, + "loss": 0.1566, + "step": 682 + }, + { + "epoch": 0.049280277066272235, + "grad_norm": 0.09693794697523117, + "learning_rate": 0.0001980430076490114, + "loss": 0.1876, + "step": 683 + }, + { + "epoch": 0.04935242974133266, + "grad_norm": 0.0989205613732338, + "learning_rate": 0.0001980401212296147, + "loss": 0.1785, + "step": 684 + }, + { + "epoch": 0.04942458241639309, + "grad_norm": 0.1322227567434311, + "learning_rate": 0.00019803723481021793, + "loss": 0.216, + "step": 685 + }, + { + "epoch": 0.04949673509145352, + "grad_norm": 0.08061541616916656, + "learning_rate": 0.0001980343483908212, + "loss": 0.1985, + "step": 686 + }, + { + "epoch": 0.04956888776651394, + "grad_norm": 0.07591982185840607, + "learning_rate": 0.00019803146197142445, + "loss": 0.1757, + "step": 687 + }, + { + "epoch": 0.04964104044157437, + "grad_norm": 0.08189503103494644, + "learning_rate": 0.00019802857555202772, + "loss": 0.1801, + "step": 688 + }, + { + "epoch": 0.0497131931166348, + "grad_norm": 0.09155537933111191, + "learning_rate": 0.00019802568913263098, + "loss": 0.1492, + "step": 689 + }, + { + "epoch": 0.049785345791695225, + "grad_norm": 0.09057817608118057, + "learning_rate": 0.00019802280271323424, + "loss": 0.1734, + "step": 690 + }, + { + "epoch": 0.049857498466755655, + "grad_norm": 0.07322797179222107, + "learning_rate": 0.0001980199162938375, + "loss": 0.161, + "step": 691 + }, + { + "epoch": 0.049929651141816085, + "grad_norm": 0.14538775384426117, + "learning_rate": 0.00019801702987444077, + "loss": 0.1784, + "step": 692 + }, + { + "epoch": 0.05000180381687651, + "grad_norm": 0.11068142205476761, + "learning_rate": 0.00019801414345504403, + "loss": 0.1881, + "step": 693 + }, + { + "epoch": 0.05007395649193694, + "grad_norm": 0.09967081993818283, + "learning_rate": 0.0001980112570356473, + "loss": 0.225, + "step": 694 + }, + { + "epoch": 0.05014610916699737, + "grad_norm": 0.08908654004335403, + "learning_rate": 0.00019800837061625056, + "loss": 0.185, + "step": 695 + }, + { + "epoch": 0.05021826184205779, + "grad_norm": 0.11593464761972427, + "learning_rate": 0.00019800548419685382, + "loss": 0.214, + "step": 696 + }, + { + "epoch": 0.05029041451711822, + "grad_norm": 0.08826512098312378, + "learning_rate": 0.00019800259777745706, + "loss": 0.1974, + "step": 697 + }, + { + "epoch": 0.05036256719217865, + "grad_norm": 0.12316125631332397, + "learning_rate": 0.00019799971135806032, + "loss": 0.1621, + "step": 698 + }, + { + "epoch": 0.050434719867239075, + "grad_norm": 0.11270264536142349, + "learning_rate": 0.0001979968249386636, + "loss": 0.1789, + "step": 699 + }, + { + "epoch": 0.050506872542299505, + "grad_norm": 0.08740193396806717, + "learning_rate": 0.00019799393851926687, + "loss": 0.1843, + "step": 700 + }, + { + "epoch": 0.050579025217359935, + "grad_norm": 0.09596771746873856, + "learning_rate": 0.00019799105209987014, + "loss": 0.1776, + "step": 701 + }, + { + "epoch": 0.05065117789242036, + "grad_norm": 0.08829741179943085, + "learning_rate": 0.00019798816568047337, + "loss": 0.1942, + "step": 702 + }, + { + "epoch": 0.05072333056748079, + "grad_norm": 0.11870314925909042, + "learning_rate": 0.00019798527926107663, + "loss": 0.2104, + "step": 703 + }, + { + "epoch": 0.05079548324254122, + "grad_norm": 0.08130889385938644, + "learning_rate": 0.0001979823928416799, + "loss": 0.1645, + "step": 704 + }, + { + "epoch": 0.05086763591760164, + "grad_norm": 0.10137905180454254, + "learning_rate": 0.00019797950642228316, + "loss": 0.1645, + "step": 705 + }, + { + "epoch": 0.05093978859266207, + "grad_norm": 0.11528564244508743, + "learning_rate": 0.00019797662000288645, + "loss": 0.1545, + "step": 706 + }, + { + "epoch": 0.0510119412677225, + "grad_norm": 0.11569073796272278, + "learning_rate": 0.00019797373358348969, + "loss": 0.1696, + "step": 707 + }, + { + "epoch": 0.05108409394278293, + "grad_norm": 0.08077122271060944, + "learning_rate": 0.00019797084716409295, + "loss": 0.1832, + "step": 708 + }, + { + "epoch": 0.051156246617843355, + "grad_norm": 0.08243294805288315, + "learning_rate": 0.0001979679607446962, + "loss": 0.1951, + "step": 709 + }, + { + "epoch": 0.051228399292903785, + "grad_norm": 0.10372677445411682, + "learning_rate": 0.00019796507432529947, + "loss": 0.1642, + "step": 710 + }, + { + "epoch": 0.051300551967964216, + "grad_norm": 0.10604501515626907, + "learning_rate": 0.00019796218790590274, + "loss": 0.2088, + "step": 711 + }, + { + "epoch": 0.05137270464302464, + "grad_norm": 0.10709025710821152, + "learning_rate": 0.000197959301486506, + "loss": 0.1519, + "step": 712 + }, + { + "epoch": 0.05144485731808507, + "grad_norm": 0.10411393642425537, + "learning_rate": 0.00019795641506710926, + "loss": 0.2138, + "step": 713 + }, + { + "epoch": 0.0515170099931455, + "grad_norm": 0.11297295987606049, + "learning_rate": 0.00019795352864771253, + "loss": 0.1728, + "step": 714 + }, + { + "epoch": 0.05158916266820592, + "grad_norm": 0.0939662978053093, + "learning_rate": 0.0001979506422283158, + "loss": 0.2185, + "step": 715 + }, + { + "epoch": 0.05166131534326635, + "grad_norm": 0.07480581849813461, + "learning_rate": 0.00019794775580891905, + "loss": 0.1902, + "step": 716 + }, + { + "epoch": 0.05173346801832678, + "grad_norm": 0.09393037110567093, + "learning_rate": 0.00019794486938952232, + "loss": 0.1825, + "step": 717 + }, + { + "epoch": 0.051805620693387205, + "grad_norm": 0.086279958486557, + "learning_rate": 0.00019794198297012555, + "loss": 0.2103, + "step": 718 + }, + { + "epoch": 0.051877773368447636, + "grad_norm": 0.09034014493227005, + "learning_rate": 0.00019793909655072881, + "loss": 0.1879, + "step": 719 + }, + { + "epoch": 0.051949926043508066, + "grad_norm": 0.09409305453300476, + "learning_rate": 0.0001979362101313321, + "loss": 0.1037, + "step": 720 + }, + { + "epoch": 0.05202207871856849, + "grad_norm": 0.10134784877300262, + "learning_rate": 0.00019793332371193537, + "loss": 0.1811, + "step": 721 + }, + { + "epoch": 0.05209423139362892, + "grad_norm": 0.10113094747066498, + "learning_rate": 0.00019793043729253863, + "loss": 0.1572, + "step": 722 + }, + { + "epoch": 0.05216638406868935, + "grad_norm": 0.126359760761261, + "learning_rate": 0.00019792755087314187, + "loss": 0.2265, + "step": 723 + }, + { + "epoch": 0.05223853674374977, + "grad_norm": 0.11764882504940033, + "learning_rate": 0.00019792466445374513, + "loss": 0.1743, + "step": 724 + }, + { + "epoch": 0.0523106894188102, + "grad_norm": 0.0821533054113388, + "learning_rate": 0.0001979217780343484, + "loss": 0.1595, + "step": 725 + }, + { + "epoch": 0.05238284209387063, + "grad_norm": 0.10342086851596832, + "learning_rate": 0.00019791889161495165, + "loss": 0.1718, + "step": 726 + }, + { + "epoch": 0.052454994768931056, + "grad_norm": 0.13683348894119263, + "learning_rate": 0.00019791600519555494, + "loss": 0.162, + "step": 727 + }, + { + "epoch": 0.052527147443991486, + "grad_norm": 0.08543882519006729, + "learning_rate": 0.00019791311877615818, + "loss": 0.1992, + "step": 728 + }, + { + "epoch": 0.052599300119051916, + "grad_norm": 0.09584735333919525, + "learning_rate": 0.00019791023235676144, + "loss": 0.1365, + "step": 729 + }, + { + "epoch": 0.05267145279411234, + "grad_norm": 0.14931128919124603, + "learning_rate": 0.0001979073459373647, + "loss": 0.1962, + "step": 730 + }, + { + "epoch": 0.05274360546917277, + "grad_norm": 0.10803002119064331, + "learning_rate": 0.00019790445951796797, + "loss": 0.13, + "step": 731 + }, + { + "epoch": 0.0528157581442332, + "grad_norm": 0.1050458624958992, + "learning_rate": 0.00019790157309857123, + "loss": 0.2151, + "step": 732 + }, + { + "epoch": 0.05288791081929362, + "grad_norm": 0.07899843156337738, + "learning_rate": 0.0001978986866791745, + "loss": 0.1776, + "step": 733 + }, + { + "epoch": 0.05296006349435405, + "grad_norm": 0.09396897256374359, + "learning_rate": 0.00019789580025977776, + "loss": 0.1372, + "step": 734 + }, + { + "epoch": 0.05303221616941448, + "grad_norm": 0.14268507063388824, + "learning_rate": 0.00019789291384038102, + "loss": 0.2246, + "step": 735 + }, + { + "epoch": 0.053104368844474906, + "grad_norm": 0.09302389621734619, + "learning_rate": 0.00019789002742098428, + "loss": 0.1988, + "step": 736 + }, + { + "epoch": 0.053176521519535336, + "grad_norm": 0.1272112876176834, + "learning_rate": 0.00019788714100158755, + "loss": 0.1537, + "step": 737 + }, + { + "epoch": 0.053248674194595766, + "grad_norm": 0.09457764029502869, + "learning_rate": 0.0001978842545821908, + "loss": 0.1599, + "step": 738 + }, + { + "epoch": 0.05332082686965619, + "grad_norm": 0.09066502749919891, + "learning_rate": 0.00019788136816279405, + "loss": 0.1865, + "step": 739 + }, + { + "epoch": 0.05339297954471662, + "grad_norm": 0.09228852391242981, + "learning_rate": 0.0001978784817433973, + "loss": 0.1781, + "step": 740 + }, + { + "epoch": 0.05346513221977705, + "grad_norm": 0.12406047433614731, + "learning_rate": 0.0001978755953240006, + "loss": 0.2168, + "step": 741 + }, + { + "epoch": 0.05353728489483748, + "grad_norm": 0.10978226363658905, + "learning_rate": 0.00019787270890460386, + "loss": 0.1879, + "step": 742 + }, + { + "epoch": 0.0536094375698979, + "grad_norm": 0.11857189983129501, + "learning_rate": 0.00019786982248520712, + "loss": 0.2438, + "step": 743 + }, + { + "epoch": 0.05368159024495833, + "grad_norm": 0.11693238466978073, + "learning_rate": 0.00019786693606581036, + "loss": 0.1973, + "step": 744 + }, + { + "epoch": 0.05375374292001876, + "grad_norm": 0.09075228124856949, + "learning_rate": 0.00019786404964641362, + "loss": 0.1361, + "step": 745 + }, + { + "epoch": 0.053825895595079186, + "grad_norm": 0.09898970276117325, + "learning_rate": 0.00019786116322701689, + "loss": 0.1743, + "step": 746 + }, + { + "epoch": 0.053898048270139616, + "grad_norm": 0.09719021618366241, + "learning_rate": 0.00019785827680762015, + "loss": 0.1934, + "step": 747 + }, + { + "epoch": 0.053970200945200046, + "grad_norm": 0.09431437402963638, + "learning_rate": 0.00019785539038822344, + "loss": 0.2014, + "step": 748 + }, + { + "epoch": 0.05404235362026047, + "grad_norm": 0.11232171952724457, + "learning_rate": 0.00019785250396882667, + "loss": 0.1365, + "step": 749 + }, + { + "epoch": 0.0541145062953209, + "grad_norm": 0.09927115589380264, + "learning_rate": 0.00019784961754942994, + "loss": 0.1727, + "step": 750 + }, + { + "epoch": 0.05418665897038133, + "grad_norm": 0.08680518716573715, + "learning_rate": 0.0001978467311300332, + "loss": 0.1887, + "step": 751 + }, + { + "epoch": 0.05425881164544175, + "grad_norm": 0.11436621099710464, + "learning_rate": 0.00019784384471063646, + "loss": 0.218, + "step": 752 + }, + { + "epoch": 0.05433096432050218, + "grad_norm": 0.11894845962524414, + "learning_rate": 0.00019784095829123973, + "loss": 0.1375, + "step": 753 + }, + { + "epoch": 0.05440311699556261, + "grad_norm": 0.09046079963445663, + "learning_rate": 0.000197838071871843, + "loss": 0.178, + "step": 754 + }, + { + "epoch": 0.054475269670623036, + "grad_norm": 0.35691019892692566, + "learning_rate": 0.00019783518545244625, + "loss": 0.1678, + "step": 755 + }, + { + "epoch": 0.054547422345683466, + "grad_norm": 0.15913482010364532, + "learning_rate": 0.00019783229903304952, + "loss": 0.1991, + "step": 756 + }, + { + "epoch": 0.054619575020743896, + "grad_norm": 0.13208337128162384, + "learning_rate": 0.00019782941261365278, + "loss": 0.1762, + "step": 757 + }, + { + "epoch": 0.05469172769580432, + "grad_norm": 0.1702233850955963, + "learning_rate": 0.00019782652619425604, + "loss": 0.1493, + "step": 758 + }, + { + "epoch": 0.05476388037086475, + "grad_norm": 0.1387334018945694, + "learning_rate": 0.0001978236397748593, + "loss": 0.194, + "step": 759 + }, + { + "epoch": 0.05483603304592518, + "grad_norm": 0.1383606642484665, + "learning_rate": 0.00019782075335546254, + "loss": 0.2381, + "step": 760 + }, + { + "epoch": 0.0549081857209856, + "grad_norm": 0.08609382063150406, + "learning_rate": 0.0001978178669360658, + "loss": 0.18, + "step": 761 + }, + { + "epoch": 0.05498033839604603, + "grad_norm": 0.13723242282867432, + "learning_rate": 0.0001978149805166691, + "loss": 0.1756, + "step": 762 + }, + { + "epoch": 0.05505249107110646, + "grad_norm": 0.09601780772209167, + "learning_rate": 0.00019781209409727236, + "loss": 0.1518, + "step": 763 + }, + { + "epoch": 0.055124643746166886, + "grad_norm": 0.28943732380867004, + "learning_rate": 0.00019780920767787562, + "loss": 0.1632, + "step": 764 + }, + { + "epoch": 0.055196796421227316, + "grad_norm": 0.11374638974666595, + "learning_rate": 0.00019780632125847885, + "loss": 0.1644, + "step": 765 + }, + { + "epoch": 0.055268949096287746, + "grad_norm": 0.1234770193696022, + "learning_rate": 0.00019780343483908212, + "loss": 0.2059, + "step": 766 + }, + { + "epoch": 0.05534110177134817, + "grad_norm": 0.09630633145570755, + "learning_rate": 0.00019780054841968538, + "loss": 0.189, + "step": 767 + }, + { + "epoch": 0.0554132544464086, + "grad_norm": 0.11028503626585007, + "learning_rate": 0.00019779766200028864, + "loss": 0.1629, + "step": 768 + }, + { + "epoch": 0.05548540712146903, + "grad_norm": 0.09348886460065842, + "learning_rate": 0.00019779477558089193, + "loss": 0.1601, + "step": 769 + }, + { + "epoch": 0.05555755979652945, + "grad_norm": 0.08891778439283371, + "learning_rate": 0.00019779188916149517, + "loss": 0.1983, + "step": 770 + }, + { + "epoch": 0.05562971247158988, + "grad_norm": 0.10887889564037323, + "learning_rate": 0.00019778900274209843, + "loss": 0.16, + "step": 771 + }, + { + "epoch": 0.05570186514665031, + "grad_norm": 0.09718909859657288, + "learning_rate": 0.0001977861163227017, + "loss": 0.1322, + "step": 772 + }, + { + "epoch": 0.05577401782171074, + "grad_norm": 0.11404313892126083, + "learning_rate": 0.00019778322990330496, + "loss": 0.2253, + "step": 773 + }, + { + "epoch": 0.055846170496771166, + "grad_norm": 0.12127923965454102, + "learning_rate": 0.00019778034348390822, + "loss": 0.1852, + "step": 774 + }, + { + "epoch": 0.055918323171831597, + "grad_norm": 0.11532068252563477, + "learning_rate": 0.00019777745706451148, + "loss": 0.2012, + "step": 775 + }, + { + "epoch": 0.05599047584689203, + "grad_norm": 0.09808876365423203, + "learning_rate": 0.00019777457064511475, + "loss": 0.1932, + "step": 776 + }, + { + "epoch": 0.05606262852195245, + "grad_norm": 0.1004500538110733, + "learning_rate": 0.000197771684225718, + "loss": 0.2526, + "step": 777 + }, + { + "epoch": 0.05613478119701288, + "grad_norm": 0.12450846284627914, + "learning_rate": 0.00019776879780632127, + "loss": 0.2019, + "step": 778 + }, + { + "epoch": 0.05620693387207331, + "grad_norm": 0.12701664865016937, + "learning_rate": 0.00019776591138692454, + "loss": 0.2032, + "step": 779 + }, + { + "epoch": 0.05627908654713373, + "grad_norm": 0.14145851135253906, + "learning_rate": 0.0001977630249675278, + "loss": 0.1964, + "step": 780 + }, + { + "epoch": 0.05635123922219416, + "grad_norm": 0.11641249805688858, + "learning_rate": 0.00019776013854813103, + "loss": 0.2092, + "step": 781 + }, + { + "epoch": 0.05642339189725459, + "grad_norm": 0.12057143449783325, + "learning_rate": 0.0001977572521287343, + "loss": 0.2071, + "step": 782 + }, + { + "epoch": 0.056495544572315017, + "grad_norm": 0.09990455210208893, + "learning_rate": 0.0001977543657093376, + "loss": 0.1968, + "step": 783 + }, + { + "epoch": 0.05656769724737545, + "grad_norm": 0.09977443516254425, + "learning_rate": 0.00019775147928994085, + "loss": 0.187, + "step": 784 + }, + { + "epoch": 0.05663984992243588, + "grad_norm": 0.11669214814901352, + "learning_rate": 0.0001977485928705441, + "loss": 0.2187, + "step": 785 + }, + { + "epoch": 0.0567120025974963, + "grad_norm": 0.11437924206256866, + "learning_rate": 0.00019774570645114735, + "loss": 0.205, + "step": 786 + }, + { + "epoch": 0.05678415527255673, + "grad_norm": 0.13286525011062622, + "learning_rate": 0.0001977428200317506, + "loss": 0.1939, + "step": 787 + }, + { + "epoch": 0.05685630794761716, + "grad_norm": 0.15068671107292175, + "learning_rate": 0.00019773993361235387, + "loss": 0.1648, + "step": 788 + }, + { + "epoch": 0.05692846062267758, + "grad_norm": 0.15105897188186646, + "learning_rate": 0.00019773704719295714, + "loss": 0.2111, + "step": 789 + }, + { + "epoch": 0.05700061329773801, + "grad_norm": 0.0838138684630394, + "learning_rate": 0.00019773416077356043, + "loss": 0.1812, + "step": 790 + }, + { + "epoch": 0.057072765972798443, + "grad_norm": 0.11074309051036835, + "learning_rate": 0.00019773127435416366, + "loss": 0.174, + "step": 791 + }, + { + "epoch": 0.05714491864785887, + "grad_norm": 0.12502411007881165, + "learning_rate": 0.00019772838793476693, + "loss": 0.1996, + "step": 792 + }, + { + "epoch": 0.0572170713229193, + "grad_norm": 0.11896955966949463, + "learning_rate": 0.0001977255015153702, + "loss": 0.1898, + "step": 793 + }, + { + "epoch": 0.05728922399797973, + "grad_norm": 0.10241451859474182, + "learning_rate": 0.00019772261509597345, + "loss": 0.16, + "step": 794 + }, + { + "epoch": 0.05736137667304015, + "grad_norm": 0.11450854688882828, + "learning_rate": 0.00019771972867657671, + "loss": 0.1798, + "step": 795 + }, + { + "epoch": 0.05743352934810058, + "grad_norm": 0.12698432803153992, + "learning_rate": 0.00019771684225717998, + "loss": 0.1536, + "step": 796 + }, + { + "epoch": 0.05750568202316101, + "grad_norm": 0.0907829999923706, + "learning_rate": 0.00019771395583778324, + "loss": 0.1776, + "step": 797 + }, + { + "epoch": 0.05757783469822143, + "grad_norm": 0.08737281709909439, + "learning_rate": 0.0001977110694183865, + "loss": 0.1806, + "step": 798 + }, + { + "epoch": 0.057649987373281864, + "grad_norm": 0.09027869254350662, + "learning_rate": 0.00019770818299898977, + "loss": 0.1328, + "step": 799 + }, + { + "epoch": 0.057722140048342294, + "grad_norm": 0.11647447943687439, + "learning_rate": 0.00019770529657959303, + "loss": 0.2173, + "step": 800 + }, + { + "epoch": 0.05779429272340272, + "grad_norm": 0.0926554799079895, + "learning_rate": 0.0001977024101601963, + "loss": 0.228, + "step": 801 + }, + { + "epoch": 0.05786644539846315, + "grad_norm": 0.1256769746541977, + "learning_rate": 0.00019769952374079953, + "loss": 0.2025, + "step": 802 + }, + { + "epoch": 0.05793859807352358, + "grad_norm": 0.1006527915596962, + "learning_rate": 0.0001976966373214028, + "loss": 0.1862, + "step": 803 + }, + { + "epoch": 0.05801075074858401, + "grad_norm": 0.10265506058931351, + "learning_rate": 0.00019769375090200608, + "loss": 0.173, + "step": 804 + }, + { + "epoch": 0.05808290342364443, + "grad_norm": 0.09494657814502716, + "learning_rate": 0.00019769086448260934, + "loss": 0.2186, + "step": 805 + }, + { + "epoch": 0.05815505609870486, + "grad_norm": 0.1186690479516983, + "learning_rate": 0.0001976879780632126, + "loss": 0.212, + "step": 806 + }, + { + "epoch": 0.05822720877376529, + "grad_norm": 0.09116118401288986, + "learning_rate": 0.00019768509164381584, + "loss": 0.2086, + "step": 807 + }, + { + "epoch": 0.058299361448825714, + "grad_norm": 0.10806886851787567, + "learning_rate": 0.0001976822052244191, + "loss": 0.158, + "step": 808 + }, + { + "epoch": 0.058371514123886144, + "grad_norm": 0.09465332329273224, + "learning_rate": 0.00019767931880502237, + "loss": 0.1479, + "step": 809 + }, + { + "epoch": 0.058443666798946574, + "grad_norm": 0.10689811408519745, + "learning_rate": 0.00019767643238562563, + "loss": 0.1648, + "step": 810 + }, + { + "epoch": 0.058515819474007, + "grad_norm": 0.08733882009983063, + "learning_rate": 0.00019767354596622892, + "loss": 0.1963, + "step": 811 + }, + { + "epoch": 0.05858797214906743, + "grad_norm": 0.10385697335004807, + "learning_rate": 0.00019767065954683218, + "loss": 0.1631, + "step": 812 + }, + { + "epoch": 0.05866012482412786, + "grad_norm": 0.1437450498342514, + "learning_rate": 0.00019766777312743542, + "loss": 0.1684, + "step": 813 + }, + { + "epoch": 0.05873227749918828, + "grad_norm": 0.08907853811979294, + "learning_rate": 0.00019766488670803868, + "loss": 0.1789, + "step": 814 + }, + { + "epoch": 0.05880443017424871, + "grad_norm": 0.08962389826774597, + "learning_rate": 0.00019766200028864195, + "loss": 0.17, + "step": 815 + }, + { + "epoch": 0.05887658284930914, + "grad_norm": 0.10819995403289795, + "learning_rate": 0.0001976591138692452, + "loss": 0.2291, + "step": 816 + }, + { + "epoch": 0.058948735524369564, + "grad_norm": 0.09052648395299911, + "learning_rate": 0.00019765622744984847, + "loss": 0.1897, + "step": 817 + }, + { + "epoch": 0.059020888199429994, + "grad_norm": 0.10309138894081116, + "learning_rate": 0.00019765334103045173, + "loss": 0.1634, + "step": 818 + }, + { + "epoch": 0.059093040874490424, + "grad_norm": 0.0909322202205658, + "learning_rate": 0.000197650454611055, + "loss": 0.1774, + "step": 819 + }, + { + "epoch": 0.05916519354955085, + "grad_norm": 0.0867924764752388, + "learning_rate": 0.00019764756819165826, + "loss": 0.1937, + "step": 820 + }, + { + "epoch": 0.05923734622461128, + "grad_norm": 0.08892381936311722, + "learning_rate": 0.00019764468177226152, + "loss": 0.1359, + "step": 821 + }, + { + "epoch": 0.05930949889967171, + "grad_norm": 0.101357601583004, + "learning_rate": 0.0001976417953528648, + "loss": 0.1597, + "step": 822 + }, + { + "epoch": 0.05938165157473213, + "grad_norm": 0.10044662654399872, + "learning_rate": 0.00019763890893346805, + "loss": 0.1363, + "step": 823 + }, + { + "epoch": 0.05945380424979256, + "grad_norm": 0.09395581483840942, + "learning_rate": 0.00019763602251407129, + "loss": 0.1674, + "step": 824 + }, + { + "epoch": 0.05952595692485299, + "grad_norm": 0.10966863483190536, + "learning_rate": 0.00019763313609467458, + "loss": 0.1458, + "step": 825 + }, + { + "epoch": 0.059598109599913414, + "grad_norm": 0.1251005083322525, + "learning_rate": 0.00019763024967527784, + "loss": 0.26, + "step": 826 + }, + { + "epoch": 0.059670262274973844, + "grad_norm": 0.09615496546030045, + "learning_rate": 0.0001976273632558811, + "loss": 0.1879, + "step": 827 + }, + { + "epoch": 0.059742414950034274, + "grad_norm": 0.10016679763793945, + "learning_rate": 0.00019762447683648436, + "loss": 0.1718, + "step": 828 + }, + { + "epoch": 0.0598145676250947, + "grad_norm": 0.11508508771657944, + "learning_rate": 0.0001976215904170876, + "loss": 0.1755, + "step": 829 + }, + { + "epoch": 0.05988672030015513, + "grad_norm": 0.08913619071245193, + "learning_rate": 0.00019761870399769086, + "loss": 0.148, + "step": 830 + }, + { + "epoch": 0.05995887297521556, + "grad_norm": 0.09995342791080475, + "learning_rate": 0.00019761581757829413, + "loss": 0.1599, + "step": 831 + }, + { + "epoch": 0.06003102565027598, + "grad_norm": 0.14899706840515137, + "learning_rate": 0.00019761293115889742, + "loss": 0.1411, + "step": 832 + }, + { + "epoch": 0.06010317832533641, + "grad_norm": 0.13075637817382812, + "learning_rate": 0.00019761004473950068, + "loss": 0.2015, + "step": 833 + }, + { + "epoch": 0.06017533100039684, + "grad_norm": 0.10823075473308563, + "learning_rate": 0.00019760715832010391, + "loss": 0.1733, + "step": 834 + }, + { + "epoch": 0.06024748367545727, + "grad_norm": 0.09528510272502899, + "learning_rate": 0.00019760427190070718, + "loss": 0.1984, + "step": 835 + }, + { + "epoch": 0.060319636350517694, + "grad_norm": 0.14064089953899384, + "learning_rate": 0.00019760138548131044, + "loss": 0.2143, + "step": 836 + }, + { + "epoch": 0.060391789025578124, + "grad_norm": 0.08362717926502228, + "learning_rate": 0.0001975984990619137, + "loss": 0.1659, + "step": 837 + }, + { + "epoch": 0.060463941700638554, + "grad_norm": 0.09632274508476257, + "learning_rate": 0.00019759561264251697, + "loss": 0.1424, + "step": 838 + }, + { + "epoch": 0.06053609437569898, + "grad_norm": 0.0978073701262474, + "learning_rate": 0.00019759272622312023, + "loss": 0.1318, + "step": 839 + }, + { + "epoch": 0.06060824705075941, + "grad_norm": 0.11037838459014893, + "learning_rate": 0.0001975898398037235, + "loss": 0.2055, + "step": 840 + }, + { + "epoch": 0.06068039972581984, + "grad_norm": 0.09025062620639801, + "learning_rate": 0.00019758695338432675, + "loss": 0.1897, + "step": 841 + }, + { + "epoch": 0.06075255240088026, + "grad_norm": 0.14597173035144806, + "learning_rate": 0.00019758406696493002, + "loss": 0.2609, + "step": 842 + }, + { + "epoch": 0.06082470507594069, + "grad_norm": 0.09020640701055527, + "learning_rate": 0.00019758118054553328, + "loss": 0.1906, + "step": 843 + }, + { + "epoch": 0.06089685775100112, + "grad_norm": 0.0924547016620636, + "learning_rate": 0.00019757829412613654, + "loss": 0.2269, + "step": 844 + }, + { + "epoch": 0.060969010426061544, + "grad_norm": 0.12248987704515457, + "learning_rate": 0.00019757540770673978, + "loss": 0.1816, + "step": 845 + }, + { + "epoch": 0.061041163101121974, + "grad_norm": 0.07481729239225388, + "learning_rate": 0.00019757252128734307, + "loss": 0.154, + "step": 846 + }, + { + "epoch": 0.061113315776182404, + "grad_norm": 0.08164883404970169, + "learning_rate": 0.00019756963486794633, + "loss": 0.1804, + "step": 847 + }, + { + "epoch": 0.06118546845124283, + "grad_norm": 0.09927359223365784, + "learning_rate": 0.0001975667484485496, + "loss": 0.2308, + "step": 848 + }, + { + "epoch": 0.06125762112630326, + "grad_norm": 0.14488515257835388, + "learning_rate": 0.00019756386202915286, + "loss": 0.228, + "step": 849 + }, + { + "epoch": 0.06132977380136369, + "grad_norm": 0.10332830250263214, + "learning_rate": 0.0001975609756097561, + "loss": 0.1946, + "step": 850 + }, + { + "epoch": 0.06140192647642411, + "grad_norm": 0.1296503245830536, + "learning_rate": 0.00019755808919035936, + "loss": 0.1589, + "step": 851 + }, + { + "epoch": 0.06147407915148454, + "grad_norm": 0.12913277745246887, + "learning_rate": 0.00019755520277096262, + "loss": 0.2017, + "step": 852 + }, + { + "epoch": 0.06154623182654497, + "grad_norm": 0.09249990433454514, + "learning_rate": 0.0001975523163515659, + "loss": 0.1749, + "step": 853 + }, + { + "epoch": 0.061618384501605394, + "grad_norm": 0.12817887961864471, + "learning_rate": 0.00019754942993216917, + "loss": 0.1677, + "step": 854 + }, + { + "epoch": 0.061690537176665824, + "grad_norm": 0.09516768902540207, + "learning_rate": 0.0001975465435127724, + "loss": 0.1482, + "step": 855 + }, + { + "epoch": 0.061762689851726255, + "grad_norm": 0.1048058345913887, + "learning_rate": 0.00019754365709337567, + "loss": 0.2181, + "step": 856 + }, + { + "epoch": 0.06183484252678668, + "grad_norm": 0.11138293892145157, + "learning_rate": 0.00019754077067397893, + "loss": 0.2056, + "step": 857 + }, + { + "epoch": 0.06190699520184711, + "grad_norm": 0.09336519241333008, + "learning_rate": 0.0001975378842545822, + "loss": 0.2204, + "step": 858 + }, + { + "epoch": 0.06197914787690754, + "grad_norm": 0.07680778205394745, + "learning_rate": 0.00019753499783518546, + "loss": 0.152, + "step": 859 + }, + { + "epoch": 0.06205130055196796, + "grad_norm": 0.11393015831708908, + "learning_rate": 0.00019753211141578872, + "loss": 0.1537, + "step": 860 + }, + { + "epoch": 0.06212345322702839, + "grad_norm": 0.09241417795419693, + "learning_rate": 0.00019752922499639199, + "loss": 0.2049, + "step": 861 + }, + { + "epoch": 0.06219560590208882, + "grad_norm": 0.08792451024055481, + "learning_rate": 0.00019752633857699525, + "loss": 0.179, + "step": 862 + }, + { + "epoch": 0.062267758577149245, + "grad_norm": 0.0883568525314331, + "learning_rate": 0.0001975234521575985, + "loss": 0.1811, + "step": 863 + }, + { + "epoch": 0.062339911252209675, + "grad_norm": 0.11310116201639175, + "learning_rate": 0.00019752056573820178, + "loss": 0.1885, + "step": 864 + }, + { + "epoch": 0.062412063927270105, + "grad_norm": 0.07407009601593018, + "learning_rate": 0.00019751767931880504, + "loss": 0.1885, + "step": 865 + }, + { + "epoch": 0.062484216602330535, + "grad_norm": 0.1365663707256317, + "learning_rate": 0.00019751479289940827, + "loss": 0.1182, + "step": 866 + }, + { + "epoch": 0.06255636927739096, + "grad_norm": 0.07840821146965027, + "learning_rate": 0.00019751190648001156, + "loss": 0.1688, + "step": 867 + }, + { + "epoch": 0.06262852195245139, + "grad_norm": 0.10654614120721817, + "learning_rate": 0.00019750902006061483, + "loss": 0.1288, + "step": 868 + }, + { + "epoch": 0.06270067462751182, + "grad_norm": 0.10260630398988724, + "learning_rate": 0.0001975061336412181, + "loss": 0.2249, + "step": 869 + }, + { + "epoch": 0.06277282730257225, + "grad_norm": 0.11337298899888992, + "learning_rate": 0.00019750324722182135, + "loss": 0.2393, + "step": 870 + }, + { + "epoch": 0.06284497997763266, + "grad_norm": 0.09580923616886139, + "learning_rate": 0.0001975003608024246, + "loss": 0.2109, + "step": 871 + }, + { + "epoch": 0.0629171326526931, + "grad_norm": 0.09207534044981003, + "learning_rate": 0.00019749747438302785, + "loss": 0.2034, + "step": 872 + }, + { + "epoch": 0.06298928532775352, + "grad_norm": 0.1136569008231163, + "learning_rate": 0.00019749458796363111, + "loss": 0.1593, + "step": 873 + }, + { + "epoch": 0.06306143800281395, + "grad_norm": 0.10841278731822968, + "learning_rate": 0.0001974917015442344, + "loss": 0.1655, + "step": 874 + }, + { + "epoch": 0.06313359067787438, + "grad_norm": 0.1440107375383377, + "learning_rate": 0.00019748881512483767, + "loss": 0.1858, + "step": 875 + }, + { + "epoch": 0.06320574335293482, + "grad_norm": 0.10138542205095291, + "learning_rate": 0.0001974859287054409, + "loss": 0.2196, + "step": 876 + }, + { + "epoch": 0.06327789602799523, + "grad_norm": 0.10658544301986694, + "learning_rate": 0.00019748304228604417, + "loss": 0.2105, + "step": 877 + }, + { + "epoch": 0.06335004870305566, + "grad_norm": 0.09705675393342972, + "learning_rate": 0.00019748015586664743, + "loss": 0.1708, + "step": 878 + }, + { + "epoch": 0.06342220137811609, + "grad_norm": 0.11171989887952805, + "learning_rate": 0.0001974772694472507, + "loss": 0.1685, + "step": 879 + }, + { + "epoch": 0.06349435405317652, + "grad_norm": 0.07426609098911285, + "learning_rate": 0.00019747438302785395, + "loss": 0.1925, + "step": 880 + }, + { + "epoch": 0.06356650672823695, + "grad_norm": 0.1063368171453476, + "learning_rate": 0.00019747149660845722, + "loss": 0.1822, + "step": 881 + }, + { + "epoch": 0.06363865940329738, + "grad_norm": 0.12676234543323517, + "learning_rate": 0.00019746861018906048, + "loss": 0.1912, + "step": 882 + }, + { + "epoch": 0.06371081207835781, + "grad_norm": 0.10321086645126343, + "learning_rate": 0.00019746572376966374, + "loss": 0.164, + "step": 883 + }, + { + "epoch": 0.06378296475341823, + "grad_norm": 0.0895160511136055, + "learning_rate": 0.000197462837350267, + "loss": 0.1928, + "step": 884 + }, + { + "epoch": 0.06385511742847866, + "grad_norm": 0.09870103001594543, + "learning_rate": 0.00019745995093087027, + "loss": 0.2003, + "step": 885 + }, + { + "epoch": 0.06392727010353909, + "grad_norm": 0.0940144807100296, + "learning_rate": 0.00019745706451147353, + "loss": 0.1727, + "step": 886 + }, + { + "epoch": 0.06399942277859952, + "grad_norm": 0.11259470134973526, + "learning_rate": 0.00019745417809207677, + "loss": 0.2041, + "step": 887 + }, + { + "epoch": 0.06407157545365995, + "grad_norm": 0.08136259019374847, + "learning_rate": 0.00019745129167268003, + "loss": 0.1506, + "step": 888 + }, + { + "epoch": 0.06414372812872038, + "grad_norm": 0.11417430639266968, + "learning_rate": 0.00019744840525328332, + "loss": 0.1939, + "step": 889 + }, + { + "epoch": 0.0642158808037808, + "grad_norm": 0.11523161828517914, + "learning_rate": 0.00019744551883388658, + "loss": 0.1678, + "step": 890 + }, + { + "epoch": 0.06428803347884122, + "grad_norm": 0.1095183789730072, + "learning_rate": 0.00019744263241448985, + "loss": 0.1182, + "step": 891 + }, + { + "epoch": 0.06436018615390166, + "grad_norm": 0.08293008804321289, + "learning_rate": 0.00019743974599509308, + "loss": 0.1762, + "step": 892 + }, + { + "epoch": 0.06443233882896209, + "grad_norm": 0.08579592406749725, + "learning_rate": 0.00019743685957569635, + "loss": 0.1645, + "step": 893 + }, + { + "epoch": 0.06450449150402252, + "grad_norm": 0.09980572015047073, + "learning_rate": 0.0001974339731562996, + "loss": 0.1829, + "step": 894 + }, + { + "epoch": 0.06457664417908295, + "grad_norm": 0.10224307328462601, + "learning_rate": 0.00019743108673690287, + "loss": 0.1701, + "step": 895 + }, + { + "epoch": 0.06464879685414336, + "grad_norm": 0.0910172089934349, + "learning_rate": 0.00019742820031750616, + "loss": 0.153, + "step": 896 + }, + { + "epoch": 0.06472094952920379, + "grad_norm": 0.09426335990428925, + "learning_rate": 0.0001974253138981094, + "loss": 0.1888, + "step": 897 + }, + { + "epoch": 0.06479310220426422, + "grad_norm": 0.11477576196193695, + "learning_rate": 0.00019742242747871266, + "loss": 0.1528, + "step": 898 + }, + { + "epoch": 0.06486525487932465, + "grad_norm": 0.09423889219760895, + "learning_rate": 0.00019741954105931592, + "loss": 0.1631, + "step": 899 + }, + { + "epoch": 0.06493740755438508, + "grad_norm": 0.08917507529258728, + "learning_rate": 0.00019741665463991919, + "loss": 0.1684, + "step": 900 + }, + { + "epoch": 0.06500956022944551, + "grad_norm": 0.10076791793107986, + "learning_rate": 0.00019741376822052245, + "loss": 0.1609, + "step": 901 + }, + { + "epoch": 0.06508171290450593, + "grad_norm": 0.1011313796043396, + "learning_rate": 0.0001974108818011257, + "loss": 0.1489, + "step": 902 + }, + { + "epoch": 0.06515386557956636, + "grad_norm": 0.09580199420452118, + "learning_rate": 0.00019740799538172897, + "loss": 0.1983, + "step": 903 + }, + { + "epoch": 0.06522601825462679, + "grad_norm": 0.1301235854625702, + "learning_rate": 0.00019740510896233224, + "loss": 0.2307, + "step": 904 + }, + { + "epoch": 0.06529817092968722, + "grad_norm": 0.11178337782621384, + "learning_rate": 0.0001974022225429355, + "loss": 0.1782, + "step": 905 + }, + { + "epoch": 0.06537032360474765, + "grad_norm": 0.108871228992939, + "learning_rate": 0.00019739933612353876, + "loss": 0.1613, + "step": 906 + }, + { + "epoch": 0.06544247627980808, + "grad_norm": 0.08900240808725357, + "learning_rate": 0.00019739644970414203, + "loss": 0.1448, + "step": 907 + }, + { + "epoch": 0.0655146289548685, + "grad_norm": 0.08600308746099472, + "learning_rate": 0.00019739356328474526, + "loss": 0.2338, + "step": 908 + }, + { + "epoch": 0.06558678162992893, + "grad_norm": 0.10805013030767441, + "learning_rate": 0.00019739067686534853, + "loss": 0.195, + "step": 909 + }, + { + "epoch": 0.06565893430498936, + "grad_norm": 0.13240516185760498, + "learning_rate": 0.00019738779044595182, + "loss": 0.2384, + "step": 910 + }, + { + "epoch": 0.06573108698004979, + "grad_norm": 0.09833045303821564, + "learning_rate": 0.00019738490402655508, + "loss": 0.1868, + "step": 911 + }, + { + "epoch": 0.06580323965511022, + "grad_norm": 0.09301164746284485, + "learning_rate": 0.00019738201760715834, + "loss": 0.1649, + "step": 912 + }, + { + "epoch": 0.06587539233017065, + "grad_norm": 0.10523568093776703, + "learning_rate": 0.00019737913118776158, + "loss": 0.2057, + "step": 913 + }, + { + "epoch": 0.06594754500523108, + "grad_norm": 0.1101023256778717, + "learning_rate": 0.00019737624476836484, + "loss": 0.1605, + "step": 914 + }, + { + "epoch": 0.06601969768029149, + "grad_norm": 0.1447688192129135, + "learning_rate": 0.0001973733583489681, + "loss": 0.1954, + "step": 915 + }, + { + "epoch": 0.06609185035535192, + "grad_norm": 0.1215805783867836, + "learning_rate": 0.00019737047192957137, + "loss": 0.1871, + "step": 916 + }, + { + "epoch": 0.06616400303041235, + "grad_norm": 0.1299552321434021, + "learning_rate": 0.00019736758551017466, + "loss": 0.182, + "step": 917 + }, + { + "epoch": 0.06623615570547278, + "grad_norm": 0.10295114666223526, + "learning_rate": 0.0001973646990907779, + "loss": 0.221, + "step": 918 + }, + { + "epoch": 0.06630830838053321, + "grad_norm": 0.10897116363048553, + "learning_rate": 0.00019736181267138115, + "loss": 0.1452, + "step": 919 + }, + { + "epoch": 0.06638046105559364, + "grad_norm": 0.09574306756258011, + "learning_rate": 0.00019735892625198442, + "loss": 0.1695, + "step": 920 + }, + { + "epoch": 0.06645261373065406, + "grad_norm": 0.12180160731077194, + "learning_rate": 0.00019735603983258768, + "loss": 0.1928, + "step": 921 + }, + { + "epoch": 0.06652476640571449, + "grad_norm": 0.09444516152143478, + "learning_rate": 0.00019735315341319094, + "loss": 0.1188, + "step": 922 + }, + { + "epoch": 0.06659691908077492, + "grad_norm": 0.11703677475452423, + "learning_rate": 0.0001973502669937942, + "loss": 0.1623, + "step": 923 + }, + { + "epoch": 0.06666907175583535, + "grad_norm": 0.08961249142885208, + "learning_rate": 0.00019734738057439747, + "loss": 0.21, + "step": 924 + }, + { + "epoch": 0.06674122443089578, + "grad_norm": 0.11890044808387756, + "learning_rate": 0.00019734449415500073, + "loss": 0.1562, + "step": 925 + }, + { + "epoch": 0.06681337710595621, + "grad_norm": 0.11046113073825836, + "learning_rate": 0.000197341607735604, + "loss": 0.1453, + "step": 926 + }, + { + "epoch": 0.06688552978101663, + "grad_norm": 0.0871260017156601, + "learning_rate": 0.00019733872131620726, + "loss": 0.1935, + "step": 927 + }, + { + "epoch": 0.06695768245607706, + "grad_norm": 0.08003012835979462, + "learning_rate": 0.00019733583489681052, + "loss": 0.1737, + "step": 928 + }, + { + "epoch": 0.06702983513113749, + "grad_norm": 0.10298404097557068, + "learning_rate": 0.00019733294847741378, + "loss": 0.1584, + "step": 929 + }, + { + "epoch": 0.06710198780619792, + "grad_norm": 0.12050239741802216, + "learning_rate": 0.00019733006205801702, + "loss": 0.2297, + "step": 930 + }, + { + "epoch": 0.06717414048125835, + "grad_norm": 0.09646282345056534, + "learning_rate": 0.0001973271756386203, + "loss": 0.1768, + "step": 931 + }, + { + "epoch": 0.06724629315631878, + "grad_norm": 0.09590929746627808, + "learning_rate": 0.00019732428921922357, + "loss": 0.1385, + "step": 932 + }, + { + "epoch": 0.06731844583137919, + "grad_norm": 0.10071932524442673, + "learning_rate": 0.00019732140279982684, + "loss": 0.2076, + "step": 933 + }, + { + "epoch": 0.06739059850643962, + "grad_norm": 0.10966494679450989, + "learning_rate": 0.0001973185163804301, + "loss": 0.2114, + "step": 934 + }, + { + "epoch": 0.06746275118150005, + "grad_norm": 0.08536006510257721, + "learning_rate": 0.00019731562996103333, + "loss": 0.1917, + "step": 935 + }, + { + "epoch": 0.06753490385656048, + "grad_norm": 0.089837945997715, + "learning_rate": 0.0001973127435416366, + "loss": 0.1454, + "step": 936 + }, + { + "epoch": 0.06760705653162091, + "grad_norm": 0.08100643008947372, + "learning_rate": 0.00019730985712223986, + "loss": 0.1833, + "step": 937 + }, + { + "epoch": 0.06767920920668134, + "grad_norm": 0.09855519980192184, + "learning_rate": 0.00019730697070284315, + "loss": 0.2072, + "step": 938 + }, + { + "epoch": 0.06775136188174176, + "grad_norm": 0.10398002713918686, + "learning_rate": 0.0001973040842834464, + "loss": 0.1914, + "step": 939 + }, + { + "epoch": 0.06782351455680219, + "grad_norm": 0.11511258035898209, + "learning_rate": 0.00019730119786404965, + "loss": 0.1788, + "step": 940 + }, + { + "epoch": 0.06789566723186262, + "grad_norm": 0.08235523104667664, + "learning_rate": 0.0001972983114446529, + "loss": 0.2027, + "step": 941 + }, + { + "epoch": 0.06796781990692305, + "grad_norm": 0.07562468945980072, + "learning_rate": 0.00019729542502525617, + "loss": 0.1686, + "step": 942 + }, + { + "epoch": 0.06803997258198348, + "grad_norm": 0.07198230177164078, + "learning_rate": 0.00019729253860585944, + "loss": 0.2053, + "step": 943 + }, + { + "epoch": 0.06811212525704391, + "grad_norm": 0.07216834276914597, + "learning_rate": 0.0001972896521864627, + "loss": 0.1892, + "step": 944 + }, + { + "epoch": 0.06818427793210434, + "grad_norm": 0.06897939741611481, + "learning_rate": 0.00019728676576706596, + "loss": 0.1581, + "step": 945 + }, + { + "epoch": 0.06825643060716476, + "grad_norm": 0.0761767327785492, + "learning_rate": 0.00019728387934766923, + "loss": 0.1801, + "step": 946 + }, + { + "epoch": 0.06832858328222519, + "grad_norm": 0.09585803747177124, + "learning_rate": 0.0001972809929282725, + "loss": 0.2046, + "step": 947 + }, + { + "epoch": 0.06840073595728562, + "grad_norm": 0.07439012825489044, + "learning_rate": 0.00019727810650887575, + "loss": 0.1596, + "step": 948 + }, + { + "epoch": 0.06847288863234605, + "grad_norm": 0.11946660280227661, + "learning_rate": 0.00019727522008947901, + "loss": 0.1758, + "step": 949 + }, + { + "epoch": 0.06854504130740648, + "grad_norm": 0.096490778028965, + "learning_rate": 0.00019727233367008228, + "loss": 0.1824, + "step": 950 + }, + { + "epoch": 0.0686171939824669, + "grad_norm": 0.12073836475610733, + "learning_rate": 0.00019726944725068551, + "loss": 0.1525, + "step": 951 + }, + { + "epoch": 0.06868934665752732, + "grad_norm": 0.137812539935112, + "learning_rate": 0.0001972665608312888, + "loss": 0.1564, + "step": 952 + }, + { + "epoch": 0.06876149933258775, + "grad_norm": 0.1103888601064682, + "learning_rate": 0.00019726367441189207, + "loss": 0.1475, + "step": 953 + }, + { + "epoch": 0.06883365200764818, + "grad_norm": 0.08909444510936737, + "learning_rate": 0.00019726078799249533, + "loss": 0.1765, + "step": 954 + }, + { + "epoch": 0.06890580468270861, + "grad_norm": 0.09811177104711533, + "learning_rate": 0.0001972579015730986, + "loss": 0.1684, + "step": 955 + }, + { + "epoch": 0.06897795735776904, + "grad_norm": 0.0942537784576416, + "learning_rate": 0.00019725501515370183, + "loss": 0.1621, + "step": 956 + }, + { + "epoch": 0.06905011003282947, + "grad_norm": 0.11584122478961945, + "learning_rate": 0.0001972521287343051, + "loss": 0.2448, + "step": 957 + }, + { + "epoch": 0.06912226270788989, + "grad_norm": 0.08744748681783676, + "learning_rate": 0.00019724924231490835, + "loss": 0.2051, + "step": 958 + }, + { + "epoch": 0.06919441538295032, + "grad_norm": 0.12525174021720886, + "learning_rate": 0.00019724635589551164, + "loss": 0.1545, + "step": 959 + }, + { + "epoch": 0.06926656805801075, + "grad_norm": 0.09919869899749756, + "learning_rate": 0.0001972434694761149, + "loss": 0.1952, + "step": 960 + }, + { + "epoch": 0.06933872073307118, + "grad_norm": 0.10349156707525253, + "learning_rate": 0.00019724058305671814, + "loss": 0.1692, + "step": 961 + }, + { + "epoch": 0.06941087340813161, + "grad_norm": 0.11499597877264023, + "learning_rate": 0.0001972376966373214, + "loss": 0.2181, + "step": 962 + }, + { + "epoch": 0.06948302608319204, + "grad_norm": 0.08199431002140045, + "learning_rate": 0.00019723481021792467, + "loss": 0.1664, + "step": 963 + }, + { + "epoch": 0.06955517875825246, + "grad_norm": 0.09471738338470459, + "learning_rate": 0.00019723192379852793, + "loss": 0.1669, + "step": 964 + }, + { + "epoch": 0.06962733143331289, + "grad_norm": 0.1080731600522995, + "learning_rate": 0.0001972290373791312, + "loss": 0.2029, + "step": 965 + }, + { + "epoch": 0.06969948410837332, + "grad_norm": 0.09761572629213333, + "learning_rate": 0.00019722615095973446, + "loss": 0.2349, + "step": 966 + }, + { + "epoch": 0.06977163678343375, + "grad_norm": 0.08952382206916809, + "learning_rate": 0.00019722326454033772, + "loss": 0.1348, + "step": 967 + }, + { + "epoch": 0.06984378945849418, + "grad_norm": 0.08425026386976242, + "learning_rate": 0.00019722037812094098, + "loss": 0.212, + "step": 968 + }, + { + "epoch": 0.0699159421335546, + "grad_norm": 0.12387498468160629, + "learning_rate": 0.00019721749170154425, + "loss": 0.1754, + "step": 969 + }, + { + "epoch": 0.06998809480861502, + "grad_norm": 0.12308784574270248, + "learning_rate": 0.0001972146052821475, + "loss": 0.183, + "step": 970 + }, + { + "epoch": 0.07006024748367545, + "grad_norm": 0.08441948890686035, + "learning_rate": 0.00019721171886275077, + "loss": 0.1386, + "step": 971 + }, + { + "epoch": 0.07013240015873588, + "grad_norm": 0.09757070243358612, + "learning_rate": 0.000197208832443354, + "loss": 0.177, + "step": 972 + }, + { + "epoch": 0.07020455283379631, + "grad_norm": 0.09631665050983429, + "learning_rate": 0.0001972059460239573, + "loss": 0.1936, + "step": 973 + }, + { + "epoch": 0.07027670550885674, + "grad_norm": 0.09656485170125961, + "learning_rate": 0.00019720305960456056, + "loss": 0.1684, + "step": 974 + }, + { + "epoch": 0.07034885818391717, + "grad_norm": 0.14445240795612335, + "learning_rate": 0.00019720017318516382, + "loss": 0.1749, + "step": 975 + }, + { + "epoch": 0.0704210108589776, + "grad_norm": 0.11951139569282532, + "learning_rate": 0.0001971972867657671, + "loss": 0.2132, + "step": 976 + }, + { + "epoch": 0.07049316353403802, + "grad_norm": 0.11091769486665726, + "learning_rate": 0.00019719440034637032, + "loss": 0.194, + "step": 977 + }, + { + "epoch": 0.07056531620909845, + "grad_norm": 0.18498805165290833, + "learning_rate": 0.00019719151392697359, + "loss": 0.1952, + "step": 978 + }, + { + "epoch": 0.07063746888415888, + "grad_norm": 0.1627284437417984, + "learning_rate": 0.00019718862750757685, + "loss": 0.211, + "step": 979 + }, + { + "epoch": 0.07070962155921931, + "grad_norm": 0.08590367436408997, + "learning_rate": 0.00019718574108818014, + "loss": 0.2123, + "step": 980 + }, + { + "epoch": 0.07078177423427974, + "grad_norm": 0.11220984905958176, + "learning_rate": 0.0001971828546687834, + "loss": 0.1771, + "step": 981 + }, + { + "epoch": 0.07085392690934017, + "grad_norm": 0.12736214697360992, + "learning_rate": 0.00019717996824938664, + "loss": 0.1532, + "step": 982 + }, + { + "epoch": 0.07092607958440059, + "grad_norm": 0.10040099918842316, + "learning_rate": 0.0001971770818299899, + "loss": 0.2033, + "step": 983 + }, + { + "epoch": 0.07099823225946102, + "grad_norm": 0.07447941601276398, + "learning_rate": 0.00019717419541059316, + "loss": 0.132, + "step": 984 + }, + { + "epoch": 0.07107038493452145, + "grad_norm": 0.11526428908109665, + "learning_rate": 0.00019717130899119643, + "loss": 0.1395, + "step": 985 + }, + { + "epoch": 0.07114253760958188, + "grad_norm": 0.12547869980335236, + "learning_rate": 0.0001971684225717997, + "loss": 0.1404, + "step": 986 + }, + { + "epoch": 0.0712146902846423, + "grad_norm": 0.17025943100452423, + "learning_rate": 0.00019716553615240295, + "loss": 0.1794, + "step": 987 + }, + { + "epoch": 0.07128684295970274, + "grad_norm": 0.10432759672403336, + "learning_rate": 0.00019716264973300621, + "loss": 0.1592, + "step": 988 + }, + { + "epoch": 0.07135899563476315, + "grad_norm": 0.15890909731388092, + "learning_rate": 0.00019715976331360948, + "loss": 0.262, + "step": 989 + }, + { + "epoch": 0.07143114830982358, + "grad_norm": 0.13272574543952942, + "learning_rate": 0.00019715687689421274, + "loss": 0.1584, + "step": 990 + }, + { + "epoch": 0.07150330098488401, + "grad_norm": 0.08162888139486313, + "learning_rate": 0.000197153990474816, + "loss": 0.1998, + "step": 991 + }, + { + "epoch": 0.07157545365994444, + "grad_norm": 0.09594608843326569, + "learning_rate": 0.00019715110405541927, + "loss": 0.1447, + "step": 992 + }, + { + "epoch": 0.07164760633500487, + "grad_norm": 0.11136913299560547, + "learning_rate": 0.0001971482176360225, + "loss": 0.192, + "step": 993 + }, + { + "epoch": 0.0717197590100653, + "grad_norm": 0.11597933620214462, + "learning_rate": 0.0001971453312166258, + "loss": 0.188, + "step": 994 + }, + { + "epoch": 0.07179191168512572, + "grad_norm": 0.10965341329574585, + "learning_rate": 0.00019714244479722906, + "loss": 0.1712, + "step": 995 + }, + { + "epoch": 0.07186406436018615, + "grad_norm": 0.09491154551506042, + "learning_rate": 0.00019713955837783232, + "loss": 0.1851, + "step": 996 + }, + { + "epoch": 0.07193621703524658, + "grad_norm": 0.10133666545152664, + "learning_rate": 0.00019713667195843558, + "loss": 0.194, + "step": 997 + }, + { + "epoch": 0.07200836971030701, + "grad_norm": 0.09461408853530884, + "learning_rate": 0.00019713378553903882, + "loss": 0.1784, + "step": 998 + }, + { + "epoch": 0.07208052238536744, + "grad_norm": 0.08958426862955093, + "learning_rate": 0.00019713089911964208, + "loss": 0.2179, + "step": 999 + }, + { + "epoch": 0.07215267506042787, + "grad_norm": 0.08880864828824997, + "learning_rate": 0.00019712801270024534, + "loss": 0.1661, + "step": 1000 + }, + { + "epoch": 0.07222482773548829, + "grad_norm": 0.11652393639087677, + "learning_rate": 0.00019712512628084863, + "loss": 0.1578, + "step": 1001 + }, + { + "epoch": 0.07229698041054872, + "grad_norm": 0.17131830751895905, + "learning_rate": 0.0001971222398614519, + "loss": 0.2147, + "step": 1002 + }, + { + "epoch": 0.07236913308560915, + "grad_norm": 0.10526955872774124, + "learning_rate": 0.00019711935344205513, + "loss": 0.2008, + "step": 1003 + }, + { + "epoch": 0.07244128576066958, + "grad_norm": 0.12301938980817795, + "learning_rate": 0.0001971164670226584, + "loss": 0.1413, + "step": 1004 + }, + { + "epoch": 0.07251343843573001, + "grad_norm": 0.09197913110256195, + "learning_rate": 0.00019711358060326166, + "loss": 0.1553, + "step": 1005 + }, + { + "epoch": 0.07258559111079044, + "grad_norm": 0.11922527104616165, + "learning_rate": 0.00019711069418386492, + "loss": 0.2427, + "step": 1006 + }, + { + "epoch": 0.07265774378585087, + "grad_norm": 0.10435183346271515, + "learning_rate": 0.00019710780776446818, + "loss": 0.1628, + "step": 1007 + }, + { + "epoch": 0.07272989646091128, + "grad_norm": 0.09419992566108704, + "learning_rate": 0.00019710492134507145, + "loss": 0.1985, + "step": 1008 + }, + { + "epoch": 0.07280204913597171, + "grad_norm": 0.11829221993684769, + "learning_rate": 0.0001971020349256747, + "loss": 0.1782, + "step": 1009 + }, + { + "epoch": 0.07287420181103214, + "grad_norm": 0.09988191723823547, + "learning_rate": 0.00019709914850627797, + "loss": 0.1895, + "step": 1010 + }, + { + "epoch": 0.07294635448609257, + "grad_norm": 0.1229916587471962, + "learning_rate": 0.00019709626208688123, + "loss": 0.1646, + "step": 1011 + }, + { + "epoch": 0.073018507161153, + "grad_norm": 0.08967486023902893, + "learning_rate": 0.0001970933756674845, + "loss": 0.1948, + "step": 1012 + }, + { + "epoch": 0.07309065983621343, + "grad_norm": 0.10921390354633331, + "learning_rate": 0.00019709048924808776, + "loss": 0.1574, + "step": 1013 + }, + { + "epoch": 0.07316281251127385, + "grad_norm": 0.13804689049720764, + "learning_rate": 0.000197087602828691, + "loss": 0.1984, + "step": 1014 + }, + { + "epoch": 0.07323496518633428, + "grad_norm": 0.11681187152862549, + "learning_rate": 0.0001970847164092943, + "loss": 0.1783, + "step": 1015 + }, + { + "epoch": 0.07330711786139471, + "grad_norm": 0.11250089108943939, + "learning_rate": 0.00019708182998989755, + "loss": 0.1954, + "step": 1016 + }, + { + "epoch": 0.07337927053645514, + "grad_norm": 0.10940463095903397, + "learning_rate": 0.0001970789435705008, + "loss": 0.1874, + "step": 1017 + }, + { + "epoch": 0.07345142321151557, + "grad_norm": 0.09099728614091873, + "learning_rate": 0.00019707605715110408, + "loss": 0.2076, + "step": 1018 + }, + { + "epoch": 0.073523575886576, + "grad_norm": 0.10192853212356567, + "learning_rate": 0.0001970731707317073, + "loss": 0.2001, + "step": 1019 + }, + { + "epoch": 0.07359572856163642, + "grad_norm": 0.09343042969703674, + "learning_rate": 0.00019707028431231057, + "loss": 0.2092, + "step": 1020 + }, + { + "epoch": 0.07366788123669685, + "grad_norm": 0.10208447277545929, + "learning_rate": 0.00019706739789291384, + "loss": 0.1896, + "step": 1021 + }, + { + "epoch": 0.07374003391175728, + "grad_norm": 0.09970434010028839, + "learning_rate": 0.00019706451147351713, + "loss": 0.1718, + "step": 1022 + }, + { + "epoch": 0.07381218658681771, + "grad_norm": 0.11564663052558899, + "learning_rate": 0.0001970616250541204, + "loss": 0.1859, + "step": 1023 + }, + { + "epoch": 0.07388433926187814, + "grad_norm": 0.11144092679023743, + "learning_rate": 0.00019705873863472363, + "loss": 0.1895, + "step": 1024 + }, + { + "epoch": 0.07395649193693857, + "grad_norm": 0.1214640662074089, + "learning_rate": 0.0001970558522153269, + "loss": 0.1183, + "step": 1025 + }, + { + "epoch": 0.07402864461199898, + "grad_norm": 0.13590200245380402, + "learning_rate": 0.00019705296579593015, + "loss": 0.2695, + "step": 1026 + }, + { + "epoch": 0.07410079728705941, + "grad_norm": 0.10062559694051743, + "learning_rate": 0.00019705007937653341, + "loss": 0.2017, + "step": 1027 + }, + { + "epoch": 0.07417294996211984, + "grad_norm": 0.09059272706508636, + "learning_rate": 0.00019704719295713668, + "loss": 0.1862, + "step": 1028 + }, + { + "epoch": 0.07424510263718027, + "grad_norm": 0.10175316780805588, + "learning_rate": 0.00019704430653773994, + "loss": 0.157, + "step": 1029 + }, + { + "epoch": 0.0743172553122407, + "grad_norm": 0.11331373453140259, + "learning_rate": 0.0001970414201183432, + "loss": 0.1846, + "step": 1030 + }, + { + "epoch": 0.07438940798730113, + "grad_norm": 0.07486823946237564, + "learning_rate": 0.00019703853369894647, + "loss": 0.1812, + "step": 1031 + }, + { + "epoch": 0.07446156066236155, + "grad_norm": 0.10093438625335693, + "learning_rate": 0.00019703564727954973, + "loss": 0.1548, + "step": 1032 + }, + { + "epoch": 0.07453371333742198, + "grad_norm": 0.10439279675483704, + "learning_rate": 0.000197032760860153, + "loss": 0.1327, + "step": 1033 + }, + { + "epoch": 0.07460586601248241, + "grad_norm": 0.08357524126768112, + "learning_rate": 0.00019702987444075625, + "loss": 0.1591, + "step": 1034 + }, + { + "epoch": 0.07467801868754284, + "grad_norm": 0.09035192430019379, + "learning_rate": 0.00019702698802135952, + "loss": 0.1327, + "step": 1035 + }, + { + "epoch": 0.07475017136260327, + "grad_norm": 0.08603281527757645, + "learning_rate": 0.00019702410160196278, + "loss": 0.2147, + "step": 1036 + }, + { + "epoch": 0.0748223240376637, + "grad_norm": 0.1261157989501953, + "learning_rate": 0.00019702121518256604, + "loss": 0.1838, + "step": 1037 + }, + { + "epoch": 0.07489447671272412, + "grad_norm": 0.11462076753377914, + "learning_rate": 0.0001970183287631693, + "loss": 0.1553, + "step": 1038 + }, + { + "epoch": 0.07496662938778455, + "grad_norm": 0.11107989400625229, + "learning_rate": 0.00019701544234377257, + "loss": 0.1559, + "step": 1039 + }, + { + "epoch": 0.07503878206284498, + "grad_norm": 0.12100755423307419, + "learning_rate": 0.00019701255592437583, + "loss": 0.1868, + "step": 1040 + }, + { + "epoch": 0.07511093473790541, + "grad_norm": 0.08341117948293686, + "learning_rate": 0.00019700966950497907, + "loss": 0.1399, + "step": 1041 + }, + { + "epoch": 0.07518308741296584, + "grad_norm": 0.09755191951990128, + "learning_rate": 0.00019700678308558233, + "loss": 0.1681, + "step": 1042 + }, + { + "epoch": 0.07525524008802627, + "grad_norm": 0.11107684671878815, + "learning_rate": 0.00019700389666618562, + "loss": 0.187, + "step": 1043 + }, + { + "epoch": 0.0753273927630867, + "grad_norm": 0.10081485658884048, + "learning_rate": 0.00019700101024678888, + "loss": 0.1734, + "step": 1044 + }, + { + "epoch": 0.07539954543814711, + "grad_norm": 0.09995072335004807, + "learning_rate": 0.00019699812382739215, + "loss": 0.1679, + "step": 1045 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 0.11626127362251282, + "learning_rate": 0.00019699523740799538, + "loss": 0.1877, + "step": 1046 + }, + { + "epoch": 0.07554385078826797, + "grad_norm": 0.13476620614528656, + "learning_rate": 0.00019699235098859865, + "loss": 0.1746, + "step": 1047 + }, + { + "epoch": 0.0756160034633284, + "grad_norm": 0.07211948931217194, + "learning_rate": 0.0001969894645692019, + "loss": 0.164, + "step": 1048 + }, + { + "epoch": 0.07568815613838883, + "grad_norm": 0.14787274599075317, + "learning_rate": 0.00019698657814980517, + "loss": 0.1627, + "step": 1049 + }, + { + "epoch": 0.07576030881344926, + "grad_norm": 0.13697263598442078, + "learning_rate": 0.00019698369173040846, + "loss": 0.2167, + "step": 1050 + }, + { + "epoch": 0.07583246148850968, + "grad_norm": 0.10592759400606155, + "learning_rate": 0.0001969808053110117, + "loss": 0.1767, + "step": 1051 + }, + { + "epoch": 0.07590461416357011, + "grad_norm": 0.11751779913902283, + "learning_rate": 0.00019697791889161496, + "loss": 0.1859, + "step": 1052 + }, + { + "epoch": 0.07597676683863054, + "grad_norm": 0.10343729704618454, + "learning_rate": 0.00019697503247221822, + "loss": 0.2388, + "step": 1053 + }, + { + "epoch": 0.07604891951369097, + "grad_norm": 0.09543080627918243, + "learning_rate": 0.00019697214605282149, + "loss": 0.1869, + "step": 1054 + }, + { + "epoch": 0.0761210721887514, + "grad_norm": 0.08784335851669312, + "learning_rate": 0.00019696925963342475, + "loss": 0.1665, + "step": 1055 + }, + { + "epoch": 0.07619322486381183, + "grad_norm": 0.08296474069356918, + "learning_rate": 0.000196966373214028, + "loss": 0.1704, + "step": 1056 + }, + { + "epoch": 0.07626537753887225, + "grad_norm": 0.11708515137434006, + "learning_rate": 0.00019696348679463127, + "loss": 0.1648, + "step": 1057 + }, + { + "epoch": 0.07633753021393268, + "grad_norm": 0.10343474894762039, + "learning_rate": 0.00019696060037523454, + "loss": 0.1674, + "step": 1058 + }, + { + "epoch": 0.07640968288899311, + "grad_norm": 0.11188158392906189, + "learning_rate": 0.0001969577139558378, + "loss": 0.1915, + "step": 1059 + }, + { + "epoch": 0.07648183556405354, + "grad_norm": 0.10264435410499573, + "learning_rate": 0.00019695482753644106, + "loss": 0.1711, + "step": 1060 + }, + { + "epoch": 0.07655398823911397, + "grad_norm": 0.11818800866603851, + "learning_rate": 0.00019695194111704433, + "loss": 0.1847, + "step": 1061 + }, + { + "epoch": 0.0766261409141744, + "grad_norm": 0.10329384356737137, + "learning_rate": 0.00019694905469764756, + "loss": 0.1858, + "step": 1062 + }, + { + "epoch": 0.07669829358923481, + "grad_norm": 0.10981201380491257, + "learning_rate": 0.00019694616827825083, + "loss": 0.2187, + "step": 1063 + }, + { + "epoch": 0.07677044626429524, + "grad_norm": 0.08695390820503235, + "learning_rate": 0.00019694328185885412, + "loss": 0.1824, + "step": 1064 + }, + { + "epoch": 0.07684259893935567, + "grad_norm": 0.09196928143501282, + "learning_rate": 0.00019694039543945738, + "loss": 0.1538, + "step": 1065 + }, + { + "epoch": 0.0769147516144161, + "grad_norm": 0.08134390413761139, + "learning_rate": 0.00019693750902006064, + "loss": 0.2178, + "step": 1066 + }, + { + "epoch": 0.07698690428947653, + "grad_norm": 0.08974107354879379, + "learning_rate": 0.00019693462260066388, + "loss": 0.176, + "step": 1067 + }, + { + "epoch": 0.07705905696453696, + "grad_norm": 0.09833265095949173, + "learning_rate": 0.00019693173618126714, + "loss": 0.1878, + "step": 1068 + }, + { + "epoch": 0.07713120963959738, + "grad_norm": 0.10632316023111343, + "learning_rate": 0.0001969288497618704, + "loss": 0.1922, + "step": 1069 + }, + { + "epoch": 0.07720336231465781, + "grad_norm": 0.1112053319811821, + "learning_rate": 0.00019692596334247367, + "loss": 0.1548, + "step": 1070 + }, + { + "epoch": 0.07727551498971824, + "grad_norm": 0.10864856839179993, + "learning_rate": 0.00019692307692307696, + "loss": 0.1791, + "step": 1071 + }, + { + "epoch": 0.07734766766477867, + "grad_norm": 0.07739702612161636, + "learning_rate": 0.0001969201905036802, + "loss": 0.1628, + "step": 1072 + }, + { + "epoch": 0.0774198203398391, + "grad_norm": 0.09403367340564728, + "learning_rate": 0.00019691730408428345, + "loss": 0.1485, + "step": 1073 + }, + { + "epoch": 0.07749197301489953, + "grad_norm": 0.10768328607082367, + "learning_rate": 0.00019691441766488672, + "loss": 0.156, + "step": 1074 + }, + { + "epoch": 0.07756412568995996, + "grad_norm": 0.10109016299247742, + "learning_rate": 0.00019691153124548998, + "loss": 0.1485, + "step": 1075 + }, + { + "epoch": 0.07763627836502038, + "grad_norm": 0.10066195577383041, + "learning_rate": 0.00019690864482609324, + "loss": 0.1837, + "step": 1076 + }, + { + "epoch": 0.07770843104008081, + "grad_norm": 0.11048019677400589, + "learning_rate": 0.0001969057584066965, + "loss": 0.1931, + "step": 1077 + }, + { + "epoch": 0.07778058371514124, + "grad_norm": 0.1042243242263794, + "learning_rate": 0.00019690287198729977, + "loss": 0.1405, + "step": 1078 + }, + { + "epoch": 0.07785273639020167, + "grad_norm": 0.08323122560977936, + "learning_rate": 0.00019689998556790303, + "loss": 0.1908, + "step": 1079 + }, + { + "epoch": 0.0779248890652621, + "grad_norm": 0.10694662481546402, + "learning_rate": 0.0001968970991485063, + "loss": 0.1343, + "step": 1080 + }, + { + "epoch": 0.07799704174032253, + "grad_norm": 0.12616930902004242, + "learning_rate": 0.00019689421272910956, + "loss": 0.2282, + "step": 1081 + }, + { + "epoch": 0.07806919441538294, + "grad_norm": 0.1291603147983551, + "learning_rate": 0.00019689132630971282, + "loss": 0.1912, + "step": 1082 + }, + { + "epoch": 0.07814134709044337, + "grad_norm": 0.11131034046411514, + "learning_rate": 0.00019688843989031606, + "loss": 0.1971, + "step": 1083 + }, + { + "epoch": 0.0782134997655038, + "grad_norm": 0.11821369081735611, + "learning_rate": 0.00019688555347091932, + "loss": 0.199, + "step": 1084 + }, + { + "epoch": 0.07828565244056424, + "grad_norm": 0.1535119265317917, + "learning_rate": 0.00019688266705152258, + "loss": 0.15, + "step": 1085 + }, + { + "epoch": 0.07835780511562467, + "grad_norm": 0.10242103785276413, + "learning_rate": 0.00019687978063212587, + "loss": 0.1796, + "step": 1086 + }, + { + "epoch": 0.0784299577906851, + "grad_norm": 0.0983833447098732, + "learning_rate": 0.00019687689421272914, + "loss": 0.1702, + "step": 1087 + }, + { + "epoch": 0.07850211046574551, + "grad_norm": 0.0991702526807785, + "learning_rate": 0.00019687400779333237, + "loss": 0.1483, + "step": 1088 + }, + { + "epoch": 0.07857426314080594, + "grad_norm": 0.10307537019252777, + "learning_rate": 0.00019687112137393563, + "loss": 0.1982, + "step": 1089 + }, + { + "epoch": 0.07864641581586637, + "grad_norm": 0.07653114944696426, + "learning_rate": 0.0001968682349545389, + "loss": 0.1948, + "step": 1090 + }, + { + "epoch": 0.0787185684909268, + "grad_norm": 0.09349947422742844, + "learning_rate": 0.00019686534853514216, + "loss": 0.1875, + "step": 1091 + }, + { + "epoch": 0.07879072116598723, + "grad_norm": 0.11042381823062897, + "learning_rate": 0.00019686246211574542, + "loss": 0.2184, + "step": 1092 + }, + { + "epoch": 0.07886287384104766, + "grad_norm": 0.09334763139486313, + "learning_rate": 0.00019685957569634869, + "loss": 0.1927, + "step": 1093 + }, + { + "epoch": 0.07893502651610808, + "grad_norm": 0.09462998807430267, + "learning_rate": 0.00019685668927695195, + "loss": 0.1587, + "step": 1094 + }, + { + "epoch": 0.07900717919116851, + "grad_norm": 0.08485822379589081, + "learning_rate": 0.0001968538028575552, + "loss": 0.1815, + "step": 1095 + }, + { + "epoch": 0.07907933186622894, + "grad_norm": 0.08465959876775742, + "learning_rate": 0.00019685091643815847, + "loss": 0.1805, + "step": 1096 + }, + { + "epoch": 0.07915148454128937, + "grad_norm": 0.08012067526578903, + "learning_rate": 0.00019684803001876174, + "loss": 0.1194, + "step": 1097 + }, + { + "epoch": 0.0792236372163498, + "grad_norm": 0.09432154148817062, + "learning_rate": 0.000196845143599365, + "loss": 0.1646, + "step": 1098 + }, + { + "epoch": 0.07929578989141023, + "grad_norm": 0.07500480860471725, + "learning_rate": 0.00019684225717996824, + "loss": 0.1987, + "step": 1099 + }, + { + "epoch": 0.07936794256647065, + "grad_norm": 0.08517158776521683, + "learning_rate": 0.00019683937076057153, + "loss": 0.1765, + "step": 1100 + }, + { + "epoch": 0.07944009524153108, + "grad_norm": 0.13203036785125732, + "learning_rate": 0.0001968364843411748, + "loss": 0.1771, + "step": 1101 + }, + { + "epoch": 0.0795122479165915, + "grad_norm": 0.11799517273902893, + "learning_rate": 0.00019683359792177805, + "loss": 0.1671, + "step": 1102 + }, + { + "epoch": 0.07958440059165194, + "grad_norm": 0.09855810552835464, + "learning_rate": 0.00019683071150238132, + "loss": 0.1851, + "step": 1103 + }, + { + "epoch": 0.07965655326671237, + "grad_norm": 0.09309118241071701, + "learning_rate": 0.00019682782508298455, + "loss": 0.2037, + "step": 1104 + }, + { + "epoch": 0.0797287059417728, + "grad_norm": 0.08551649749279022, + "learning_rate": 0.00019682493866358781, + "loss": 0.1802, + "step": 1105 + }, + { + "epoch": 0.07980085861683323, + "grad_norm": 0.10401365160942078, + "learning_rate": 0.00019682205224419108, + "loss": 0.2303, + "step": 1106 + }, + { + "epoch": 0.07987301129189364, + "grad_norm": 0.11717145144939423, + "learning_rate": 0.00019681916582479437, + "loss": 0.2169, + "step": 1107 + }, + { + "epoch": 0.07994516396695407, + "grad_norm": 0.11967819184064865, + "learning_rate": 0.00019681627940539763, + "loss": 0.1258, + "step": 1108 + }, + { + "epoch": 0.0800173166420145, + "grad_norm": 0.08622618764638901, + "learning_rate": 0.00019681339298600087, + "loss": 0.157, + "step": 1109 + }, + { + "epoch": 0.08008946931707493, + "grad_norm": 0.10485079884529114, + "learning_rate": 0.00019681050656660413, + "loss": 0.1563, + "step": 1110 + }, + { + "epoch": 0.08016162199213536, + "grad_norm": 0.08495404571294785, + "learning_rate": 0.0001968076201472074, + "loss": 0.1957, + "step": 1111 + }, + { + "epoch": 0.08023377466719579, + "grad_norm": 0.100055031478405, + "learning_rate": 0.00019680473372781065, + "loss": 0.1845, + "step": 1112 + }, + { + "epoch": 0.08030592734225621, + "grad_norm": 0.0924968346953392, + "learning_rate": 0.00019680184730841392, + "loss": 0.2243, + "step": 1113 + }, + { + "epoch": 0.08037808001731664, + "grad_norm": 0.13996759057044983, + "learning_rate": 0.00019679896088901718, + "loss": 0.2316, + "step": 1114 + }, + { + "epoch": 0.08045023269237707, + "grad_norm": 0.08373535424470901, + "learning_rate": 0.00019679607446962044, + "loss": 0.1856, + "step": 1115 + }, + { + "epoch": 0.0805223853674375, + "grad_norm": 0.0935160368680954, + "learning_rate": 0.0001967931880502237, + "loss": 0.19, + "step": 1116 + }, + { + "epoch": 0.08059453804249793, + "grad_norm": 0.11538738757371902, + "learning_rate": 0.00019679030163082697, + "loss": 0.1846, + "step": 1117 + }, + { + "epoch": 0.08066669071755836, + "grad_norm": 0.0877399742603302, + "learning_rate": 0.00019678741521143023, + "loss": 0.1535, + "step": 1118 + }, + { + "epoch": 0.08073884339261878, + "grad_norm": 0.08489222079515457, + "learning_rate": 0.0001967845287920335, + "loss": 0.1773, + "step": 1119 + }, + { + "epoch": 0.0808109960676792, + "grad_norm": 0.07990337163209915, + "learning_rate": 0.00019678164237263673, + "loss": 0.1776, + "step": 1120 + }, + { + "epoch": 0.08088314874273964, + "grad_norm": 0.10108862072229385, + "learning_rate": 0.00019677875595324002, + "loss": 0.2599, + "step": 1121 + }, + { + "epoch": 0.08095530141780007, + "grad_norm": 0.0793904960155487, + "learning_rate": 0.00019677586953384328, + "loss": 0.1569, + "step": 1122 + }, + { + "epoch": 0.0810274540928605, + "grad_norm": 0.1183282658457756, + "learning_rate": 0.00019677298311444655, + "loss": 0.1651, + "step": 1123 + }, + { + "epoch": 0.08109960676792093, + "grad_norm": 0.08836779743432999, + "learning_rate": 0.0001967700966950498, + "loss": 0.1462, + "step": 1124 + }, + { + "epoch": 0.08117175944298134, + "grad_norm": 0.11735153943300247, + "learning_rate": 0.00019676721027565305, + "loss": 0.2149, + "step": 1125 + }, + { + "epoch": 0.08124391211804177, + "grad_norm": 0.11171982437372208, + "learning_rate": 0.0001967643238562563, + "loss": 0.1914, + "step": 1126 + }, + { + "epoch": 0.0813160647931022, + "grad_norm": 0.137335866689682, + "learning_rate": 0.00019676143743685957, + "loss": 0.1839, + "step": 1127 + }, + { + "epoch": 0.08138821746816263, + "grad_norm": 0.11637444794178009, + "learning_rate": 0.00019675855101746286, + "loss": 0.1445, + "step": 1128 + }, + { + "epoch": 0.08146037014322306, + "grad_norm": 0.12163744121789932, + "learning_rate": 0.00019675566459806612, + "loss": 0.1604, + "step": 1129 + }, + { + "epoch": 0.08153252281828349, + "grad_norm": 0.12734180688858032, + "learning_rate": 0.00019675277817866936, + "loss": 0.2484, + "step": 1130 + }, + { + "epoch": 0.08160467549334391, + "grad_norm": 0.12239005416631699, + "learning_rate": 0.00019674989175927262, + "loss": 0.2532, + "step": 1131 + }, + { + "epoch": 0.08167682816840434, + "grad_norm": 0.12187651544809341, + "learning_rate": 0.00019674700533987589, + "loss": 0.1701, + "step": 1132 + }, + { + "epoch": 0.08174898084346477, + "grad_norm": 0.14173132181167603, + "learning_rate": 0.00019674411892047915, + "loss": 0.1907, + "step": 1133 + }, + { + "epoch": 0.0818211335185252, + "grad_norm": 0.10581263154745102, + "learning_rate": 0.0001967412325010824, + "loss": 0.2139, + "step": 1134 + }, + { + "epoch": 0.08189328619358563, + "grad_norm": 0.10544908046722412, + "learning_rate": 0.00019673834608168567, + "loss": 0.1428, + "step": 1135 + }, + { + "epoch": 0.08196543886864606, + "grad_norm": 0.096528060734272, + "learning_rate": 0.00019673545966228894, + "loss": 0.1651, + "step": 1136 + }, + { + "epoch": 0.08203759154370649, + "grad_norm": 0.08613178133964539, + "learning_rate": 0.0001967325732428922, + "loss": 0.1922, + "step": 1137 + }, + { + "epoch": 0.0821097442187669, + "grad_norm": 0.09921222925186157, + "learning_rate": 0.00019672968682349546, + "loss": 0.1547, + "step": 1138 + }, + { + "epoch": 0.08218189689382734, + "grad_norm": 0.08947184681892395, + "learning_rate": 0.00019672680040409873, + "loss": 0.1771, + "step": 1139 + }, + { + "epoch": 0.08225404956888777, + "grad_norm": 0.09903664886951447, + "learning_rate": 0.000196723913984702, + "loss": 0.1525, + "step": 1140 + }, + { + "epoch": 0.0823262022439482, + "grad_norm": 0.0916438177227974, + "learning_rate": 0.00019672102756530522, + "loss": 0.2297, + "step": 1141 + }, + { + "epoch": 0.08239835491900863, + "grad_norm": 0.08452580869197845, + "learning_rate": 0.00019671814114590851, + "loss": 0.1251, + "step": 1142 + }, + { + "epoch": 0.08247050759406906, + "grad_norm": 0.11102423816919327, + "learning_rate": 0.00019671525472651178, + "loss": 0.1875, + "step": 1143 + }, + { + "epoch": 0.08254266026912947, + "grad_norm": 0.08565377444028854, + "learning_rate": 0.00019671236830711504, + "loss": 0.1532, + "step": 1144 + }, + { + "epoch": 0.0826148129441899, + "grad_norm": 0.1014544740319252, + "learning_rate": 0.0001967094818877183, + "loss": 0.1683, + "step": 1145 + }, + { + "epoch": 0.08268696561925033, + "grad_norm": 0.10558659583330154, + "learning_rate": 0.00019670659546832154, + "loss": 0.1624, + "step": 1146 + }, + { + "epoch": 0.08275911829431076, + "grad_norm": 0.09612471610307693, + "learning_rate": 0.0001967037090489248, + "loss": 0.179, + "step": 1147 + }, + { + "epoch": 0.08283127096937119, + "grad_norm": 0.09432287514209747, + "learning_rate": 0.00019670082262952807, + "loss": 0.1544, + "step": 1148 + }, + { + "epoch": 0.08290342364443162, + "grad_norm": 0.10006943345069885, + "learning_rate": 0.00019669793621013136, + "loss": 0.1908, + "step": 1149 + }, + { + "epoch": 0.08297557631949204, + "grad_norm": 0.08191773295402527, + "learning_rate": 0.00019669504979073462, + "loss": 0.2056, + "step": 1150 + }, + { + "epoch": 0.08304772899455247, + "grad_norm": 0.12594346702098846, + "learning_rate": 0.00019669216337133785, + "loss": 0.1722, + "step": 1151 + }, + { + "epoch": 0.0831198816696129, + "grad_norm": 0.08761388808488846, + "learning_rate": 0.00019668927695194112, + "loss": 0.161, + "step": 1152 + }, + { + "epoch": 0.08319203434467333, + "grad_norm": 0.12430427223443985, + "learning_rate": 0.00019668639053254438, + "loss": 0.1482, + "step": 1153 + }, + { + "epoch": 0.08326418701973376, + "grad_norm": 0.12445410341024399, + "learning_rate": 0.00019668350411314764, + "loss": 0.1945, + "step": 1154 + }, + { + "epoch": 0.08333633969479419, + "grad_norm": 0.13118945062160492, + "learning_rate": 0.0001966806176937509, + "loss": 0.1537, + "step": 1155 + }, + { + "epoch": 0.0834084923698546, + "grad_norm": 0.07383442670106888, + "learning_rate": 0.00019667773127435417, + "loss": 0.1932, + "step": 1156 + }, + { + "epoch": 0.08348064504491504, + "grad_norm": 0.12663637101650238, + "learning_rate": 0.00019667484485495743, + "loss": 0.1956, + "step": 1157 + }, + { + "epoch": 0.08355279771997547, + "grad_norm": 0.0797528624534607, + "learning_rate": 0.0001966719584355607, + "loss": 0.2317, + "step": 1158 + }, + { + "epoch": 0.0836249503950359, + "grad_norm": 0.13396123051643372, + "learning_rate": 0.00019666907201616396, + "loss": 0.2296, + "step": 1159 + }, + { + "epoch": 0.08369710307009633, + "grad_norm": 0.08744298666715622, + "learning_rate": 0.00019666618559676722, + "loss": 0.1572, + "step": 1160 + }, + { + "epoch": 0.08376925574515676, + "grad_norm": 0.09590718150138855, + "learning_rate": 0.00019666329917737048, + "loss": 0.1724, + "step": 1161 + }, + { + "epoch": 0.08384140842021717, + "grad_norm": 0.08623688668012619, + "learning_rate": 0.00019666041275797375, + "loss": 0.156, + "step": 1162 + }, + { + "epoch": 0.0839135610952776, + "grad_norm": 0.14770671725273132, + "learning_rate": 0.000196657526338577, + "loss": 0.1919, + "step": 1163 + }, + { + "epoch": 0.08398571377033803, + "grad_norm": 0.11314375698566437, + "learning_rate": 0.00019665463991918027, + "loss": 0.1295, + "step": 1164 + }, + { + "epoch": 0.08405786644539846, + "grad_norm": 0.10177276283502579, + "learning_rate": 0.00019665175349978353, + "loss": 0.1967, + "step": 1165 + }, + { + "epoch": 0.0841300191204589, + "grad_norm": 0.17404918372631073, + "learning_rate": 0.0001966488670803868, + "loss": 0.2155, + "step": 1166 + }, + { + "epoch": 0.08420217179551932, + "grad_norm": 0.07917987555265427, + "learning_rate": 0.00019664598066099006, + "loss": 0.1971, + "step": 1167 + }, + { + "epoch": 0.08427432447057975, + "grad_norm": 0.10241827368736267, + "learning_rate": 0.0001966430942415933, + "loss": 0.173, + "step": 1168 + }, + { + "epoch": 0.08434647714564017, + "grad_norm": 0.09378177672624588, + "learning_rate": 0.00019664020782219656, + "loss": 0.1652, + "step": 1169 + }, + { + "epoch": 0.0844186298207006, + "grad_norm": 0.09627138823270798, + "learning_rate": 0.00019663732140279985, + "loss": 0.1774, + "step": 1170 + }, + { + "epoch": 0.08449078249576103, + "grad_norm": 0.1196071207523346, + "learning_rate": 0.0001966344349834031, + "loss": 0.1497, + "step": 1171 + }, + { + "epoch": 0.08456293517082146, + "grad_norm": 0.10868272185325623, + "learning_rate": 0.00019663154856400638, + "loss": 0.2377, + "step": 1172 + }, + { + "epoch": 0.08463508784588189, + "grad_norm": 0.10114963352680206, + "learning_rate": 0.0001966286621446096, + "loss": 0.2186, + "step": 1173 + }, + { + "epoch": 0.08470724052094232, + "grad_norm": 0.10729393362998962, + "learning_rate": 0.00019662577572521287, + "loss": 0.1126, + "step": 1174 + }, + { + "epoch": 0.08477939319600274, + "grad_norm": 0.10542712360620499, + "learning_rate": 0.00019662288930581614, + "loss": 0.1955, + "step": 1175 + }, + { + "epoch": 0.08485154587106317, + "grad_norm": 0.10797982662916183, + "learning_rate": 0.0001966200028864194, + "loss": 0.2124, + "step": 1176 + }, + { + "epoch": 0.0849236985461236, + "grad_norm": 0.09008362144231796, + "learning_rate": 0.0001966171164670227, + "loss": 0.1638, + "step": 1177 + }, + { + "epoch": 0.08499585122118403, + "grad_norm": 0.1290101408958435, + "learning_rate": 0.00019661423004762593, + "loss": 0.1333, + "step": 1178 + }, + { + "epoch": 0.08506800389624446, + "grad_norm": 0.13693585991859436, + "learning_rate": 0.0001966113436282292, + "loss": 0.1503, + "step": 1179 + }, + { + "epoch": 0.08514015657130489, + "grad_norm": 0.13027171790599823, + "learning_rate": 0.00019660845720883245, + "loss": 0.1838, + "step": 1180 + }, + { + "epoch": 0.0852123092463653, + "grad_norm": 0.08405692875385284, + "learning_rate": 0.00019660557078943571, + "loss": 0.2052, + "step": 1181 + }, + { + "epoch": 0.08528446192142573, + "grad_norm": 0.13299930095672607, + "learning_rate": 0.00019660268437003898, + "loss": 0.1651, + "step": 1182 + }, + { + "epoch": 0.08535661459648616, + "grad_norm": 0.10912304371595383, + "learning_rate": 0.00019659979795064224, + "loss": 0.1503, + "step": 1183 + }, + { + "epoch": 0.0854287672715466, + "grad_norm": 0.14194819331169128, + "learning_rate": 0.0001965969115312455, + "loss": 0.1891, + "step": 1184 + }, + { + "epoch": 0.08550091994660702, + "grad_norm": 0.13994021713733673, + "learning_rate": 0.00019659402511184877, + "loss": 0.2066, + "step": 1185 + }, + { + "epoch": 0.08557307262166745, + "grad_norm": 0.10049525648355484, + "learning_rate": 0.00019659113869245203, + "loss": 0.2056, + "step": 1186 + }, + { + "epoch": 0.08564522529672787, + "grad_norm": 0.08980170637369156, + "learning_rate": 0.0001965882522730553, + "loss": 0.1698, + "step": 1187 + }, + { + "epoch": 0.0857173779717883, + "grad_norm": 0.08638997375965118, + "learning_rate": 0.00019658536585365856, + "loss": 0.2183, + "step": 1188 + }, + { + "epoch": 0.08578953064684873, + "grad_norm": 0.09948911517858505, + "learning_rate": 0.0001965824794342618, + "loss": 0.1589, + "step": 1189 + }, + { + "epoch": 0.08586168332190916, + "grad_norm": 0.1180720329284668, + "learning_rate": 0.00019657959301486505, + "loss": 0.1794, + "step": 1190 + }, + { + "epoch": 0.08593383599696959, + "grad_norm": 0.11050887405872345, + "learning_rate": 0.00019657670659546834, + "loss": 0.1852, + "step": 1191 + }, + { + "epoch": 0.08600598867203002, + "grad_norm": 0.08088482171297073, + "learning_rate": 0.0001965738201760716, + "loss": 0.1616, + "step": 1192 + }, + { + "epoch": 0.08607814134709044, + "grad_norm": 0.11148399114608765, + "learning_rate": 0.00019657093375667487, + "loss": 0.142, + "step": 1193 + }, + { + "epoch": 0.08615029402215087, + "grad_norm": 0.12479886412620544, + "learning_rate": 0.0001965680473372781, + "loss": 0.2248, + "step": 1194 + }, + { + "epoch": 0.0862224466972113, + "grad_norm": 0.09896652400493622, + "learning_rate": 0.00019656516091788137, + "loss": 0.1401, + "step": 1195 + }, + { + "epoch": 0.08629459937227173, + "grad_norm": 0.11217128485441208, + "learning_rate": 0.00019656227449848463, + "loss": 0.1598, + "step": 1196 + }, + { + "epoch": 0.08636675204733216, + "grad_norm": 0.08890548348426819, + "learning_rate": 0.0001965593880790879, + "loss": 0.12, + "step": 1197 + }, + { + "epoch": 0.08643890472239259, + "grad_norm": 0.16628938913345337, + "learning_rate": 0.00019655650165969118, + "loss": 0.1875, + "step": 1198 + }, + { + "epoch": 0.08651105739745302, + "grad_norm": 0.11361134797334671, + "learning_rate": 0.00019655361524029442, + "loss": 0.2373, + "step": 1199 + }, + { + "epoch": 0.08658321007251343, + "grad_norm": 0.108740895986557, + "learning_rate": 0.00019655072882089768, + "loss": 0.1734, + "step": 1200 + }, + { + "epoch": 0.08665536274757386, + "grad_norm": 0.08382868021726608, + "learning_rate": 0.00019654784240150095, + "loss": 0.1664, + "step": 1201 + }, + { + "epoch": 0.0867275154226343, + "grad_norm": 0.10402888059616089, + "learning_rate": 0.0001965449559821042, + "loss": 0.1608, + "step": 1202 + }, + { + "epoch": 0.08679966809769472, + "grad_norm": 0.09176527708768845, + "learning_rate": 0.00019654206956270747, + "loss": 0.1876, + "step": 1203 + }, + { + "epoch": 0.08687182077275515, + "grad_norm": 0.08105053752660751, + "learning_rate": 0.00019653918314331073, + "loss": 0.1778, + "step": 1204 + }, + { + "epoch": 0.08694397344781558, + "grad_norm": 0.08214316517114639, + "learning_rate": 0.000196536296723914, + "loss": 0.1846, + "step": 1205 + }, + { + "epoch": 0.087016126122876, + "grad_norm": 0.1279810667037964, + "learning_rate": 0.00019653341030451726, + "loss": 0.1731, + "step": 1206 + }, + { + "epoch": 0.08708827879793643, + "grad_norm": 0.09262233972549438, + "learning_rate": 0.00019653052388512052, + "loss": 0.1863, + "step": 1207 + }, + { + "epoch": 0.08716043147299686, + "grad_norm": 0.12644372880458832, + "learning_rate": 0.00019652763746572379, + "loss": 0.1779, + "step": 1208 + }, + { + "epoch": 0.08723258414805729, + "grad_norm": 0.07663599401712418, + "learning_rate": 0.00019652475104632705, + "loss": 0.1311, + "step": 1209 + }, + { + "epoch": 0.08730473682311772, + "grad_norm": 0.10031388700008392, + "learning_rate": 0.00019652186462693029, + "loss": 0.1757, + "step": 1210 + }, + { + "epoch": 0.08737688949817815, + "grad_norm": 0.09371486306190491, + "learning_rate": 0.00019651897820753355, + "loss": 0.1984, + "step": 1211 + }, + { + "epoch": 0.08744904217323857, + "grad_norm": 0.10455991327762604, + "learning_rate": 0.00019651609178813684, + "loss": 0.1622, + "step": 1212 + }, + { + "epoch": 0.087521194848299, + "grad_norm": 0.1045440062880516, + "learning_rate": 0.0001965132053687401, + "loss": 0.1987, + "step": 1213 + }, + { + "epoch": 0.08759334752335943, + "grad_norm": 0.11814934015274048, + "learning_rate": 0.00019651031894934336, + "loss": 0.1644, + "step": 1214 + }, + { + "epoch": 0.08766550019841986, + "grad_norm": 0.09841133654117584, + "learning_rate": 0.0001965074325299466, + "loss": 0.1716, + "step": 1215 + }, + { + "epoch": 0.08773765287348029, + "grad_norm": 0.09436903148889542, + "learning_rate": 0.00019650454611054986, + "loss": 0.2072, + "step": 1216 + }, + { + "epoch": 0.08780980554854072, + "grad_norm": 0.11973531544208527, + "learning_rate": 0.00019650165969115313, + "loss": 0.2328, + "step": 1217 + }, + { + "epoch": 0.08788195822360113, + "grad_norm": 0.08357010781764984, + "learning_rate": 0.0001964987732717564, + "loss": 0.1726, + "step": 1218 + }, + { + "epoch": 0.08795411089866156, + "grad_norm": 0.09621085226535797, + "learning_rate": 0.00019649588685235968, + "loss": 0.1573, + "step": 1219 + }, + { + "epoch": 0.088026263573722, + "grad_norm": 0.08089716732501984, + "learning_rate": 0.00019649300043296291, + "loss": 0.1485, + "step": 1220 + }, + { + "epoch": 0.08809841624878242, + "grad_norm": 0.10134495049715042, + "learning_rate": 0.00019649011401356618, + "loss": 0.1628, + "step": 1221 + }, + { + "epoch": 0.08817056892384285, + "grad_norm": 0.10229915380477905, + "learning_rate": 0.00019648722759416944, + "loss": 0.164, + "step": 1222 + }, + { + "epoch": 0.08824272159890328, + "grad_norm": 0.1260552704334259, + "learning_rate": 0.0001964843411747727, + "loss": 0.1824, + "step": 1223 + }, + { + "epoch": 0.0883148742739637, + "grad_norm": 0.08896566182374954, + "learning_rate": 0.00019648145475537597, + "loss": 0.1831, + "step": 1224 + }, + { + "epoch": 0.08838702694902413, + "grad_norm": 0.12777185440063477, + "learning_rate": 0.00019647856833597923, + "loss": 0.1803, + "step": 1225 + }, + { + "epoch": 0.08845917962408456, + "grad_norm": 0.10827764123678207, + "learning_rate": 0.0001964756819165825, + "loss": 0.1915, + "step": 1226 + }, + { + "epoch": 0.08853133229914499, + "grad_norm": 0.09689563512802124, + "learning_rate": 0.00019647279549718575, + "loss": 0.1168, + "step": 1227 + }, + { + "epoch": 0.08860348497420542, + "grad_norm": 0.11655878275632858, + "learning_rate": 0.00019646990907778902, + "loss": 0.1583, + "step": 1228 + }, + { + "epoch": 0.08867563764926585, + "grad_norm": 0.10739599168300629, + "learning_rate": 0.00019646702265839228, + "loss": 0.1698, + "step": 1229 + }, + { + "epoch": 0.08874779032432628, + "grad_norm": 0.07532233744859695, + "learning_rate": 0.00019646413623899554, + "loss": 0.1712, + "step": 1230 + }, + { + "epoch": 0.0888199429993867, + "grad_norm": 0.10705269128084183, + "learning_rate": 0.00019646124981959878, + "loss": 0.2176, + "step": 1231 + }, + { + "epoch": 0.08889209567444713, + "grad_norm": 0.07377097010612488, + "learning_rate": 0.00019645836340020204, + "loss": 0.1904, + "step": 1232 + }, + { + "epoch": 0.08896424834950756, + "grad_norm": 0.08052223920822144, + "learning_rate": 0.00019645547698080533, + "loss": 0.1457, + "step": 1233 + }, + { + "epoch": 0.08903640102456799, + "grad_norm": 0.10867580771446228, + "learning_rate": 0.0001964525905614086, + "loss": 0.1826, + "step": 1234 + }, + { + "epoch": 0.08910855369962842, + "grad_norm": 0.07923214137554169, + "learning_rate": 0.00019644970414201186, + "loss": 0.1916, + "step": 1235 + }, + { + "epoch": 0.08918070637468885, + "grad_norm": 0.09806831926107407, + "learning_rate": 0.0001964468177226151, + "loss": 0.1921, + "step": 1236 + }, + { + "epoch": 0.08925285904974926, + "grad_norm": 0.08039659261703491, + "learning_rate": 0.00019644393130321836, + "loss": 0.1799, + "step": 1237 + }, + { + "epoch": 0.0893250117248097, + "grad_norm": 0.11143870651721954, + "learning_rate": 0.00019644104488382162, + "loss": 0.1252, + "step": 1238 + }, + { + "epoch": 0.08939716439987012, + "grad_norm": 0.11356040090322495, + "learning_rate": 0.00019643815846442488, + "loss": 0.198, + "step": 1239 + }, + { + "epoch": 0.08946931707493055, + "grad_norm": 0.10516027361154556, + "learning_rate": 0.00019643527204502817, + "loss": 0.2374, + "step": 1240 + }, + { + "epoch": 0.08954146974999098, + "grad_norm": 0.09420349448919296, + "learning_rate": 0.0001964323856256314, + "loss": 0.2187, + "step": 1241 + }, + { + "epoch": 0.08961362242505141, + "grad_norm": 0.11258124560117722, + "learning_rate": 0.00019642949920623467, + "loss": 0.1896, + "step": 1242 + }, + { + "epoch": 0.08968577510011183, + "grad_norm": 0.10528513789176941, + "learning_rate": 0.00019642661278683793, + "loss": 0.1911, + "step": 1243 + }, + { + "epoch": 0.08975792777517226, + "grad_norm": 0.10851096361875534, + "learning_rate": 0.0001964237263674412, + "loss": 0.1566, + "step": 1244 + }, + { + "epoch": 0.08983008045023269, + "grad_norm": 0.10526153445243835, + "learning_rate": 0.00019642083994804446, + "loss": 0.1836, + "step": 1245 + }, + { + "epoch": 0.08990223312529312, + "grad_norm": 0.09496541321277618, + "learning_rate": 0.00019641795352864772, + "loss": 0.1467, + "step": 1246 + }, + { + "epoch": 0.08997438580035355, + "grad_norm": 0.09573590755462646, + "learning_rate": 0.00019641506710925099, + "loss": 0.1629, + "step": 1247 + }, + { + "epoch": 0.09004653847541398, + "grad_norm": 0.09841755032539368, + "learning_rate": 0.00019641218068985425, + "loss": 0.1682, + "step": 1248 + }, + { + "epoch": 0.0901186911504744, + "grad_norm": 0.10681945830583572, + "learning_rate": 0.0001964092942704575, + "loss": 0.1816, + "step": 1249 + }, + { + "epoch": 0.09019084382553483, + "grad_norm": 0.12309513986110687, + "learning_rate": 0.00019640640785106077, + "loss": 0.1984, + "step": 1250 + }, + { + "epoch": 0.09026299650059526, + "grad_norm": 0.08377508074045181, + "learning_rate": 0.00019640352143166404, + "loss": 0.1832, + "step": 1251 + }, + { + "epoch": 0.09033514917565569, + "grad_norm": 0.10586296021938324, + "learning_rate": 0.00019640063501226727, + "loss": 0.1582, + "step": 1252 + }, + { + "epoch": 0.09040730185071612, + "grad_norm": 0.09107067435979843, + "learning_rate": 0.00019639774859287054, + "loss": 0.1677, + "step": 1253 + }, + { + "epoch": 0.09047945452577655, + "grad_norm": 0.09457964450120926, + "learning_rate": 0.00019639486217347383, + "loss": 0.1474, + "step": 1254 + }, + { + "epoch": 0.09055160720083696, + "grad_norm": 0.08729027211666107, + "learning_rate": 0.0001963919757540771, + "loss": 0.1455, + "step": 1255 + }, + { + "epoch": 0.0906237598758974, + "grad_norm": 0.1331973373889923, + "learning_rate": 0.00019638908933468035, + "loss": 0.191, + "step": 1256 + }, + { + "epoch": 0.09069591255095782, + "grad_norm": 0.07884123176336288, + "learning_rate": 0.0001963862029152836, + "loss": 0.161, + "step": 1257 + }, + { + "epoch": 0.09076806522601825, + "grad_norm": 0.09368634223937988, + "learning_rate": 0.00019638331649588685, + "loss": 0.1526, + "step": 1258 + }, + { + "epoch": 0.09084021790107868, + "grad_norm": 0.10609408468008041, + "learning_rate": 0.00019638043007649011, + "loss": 0.203, + "step": 1259 + }, + { + "epoch": 0.09091237057613911, + "grad_norm": 0.13501423597335815, + "learning_rate": 0.00019637754365709338, + "loss": 0.2078, + "step": 1260 + }, + { + "epoch": 0.09098452325119954, + "grad_norm": 0.08630816638469696, + "learning_rate": 0.00019637465723769667, + "loss": 0.1602, + "step": 1261 + }, + { + "epoch": 0.09105667592625996, + "grad_norm": 0.08650074899196625, + "learning_rate": 0.0001963717708182999, + "loss": 0.1545, + "step": 1262 + }, + { + "epoch": 0.09112882860132039, + "grad_norm": 0.08092677593231201, + "learning_rate": 0.00019636888439890317, + "loss": 0.1308, + "step": 1263 + }, + { + "epoch": 0.09120098127638082, + "grad_norm": 0.08196258544921875, + "learning_rate": 0.00019636599797950643, + "loss": 0.1415, + "step": 1264 + }, + { + "epoch": 0.09127313395144125, + "grad_norm": 0.08814175426959991, + "learning_rate": 0.0001963631115601097, + "loss": 0.169, + "step": 1265 + }, + { + "epoch": 0.09134528662650168, + "grad_norm": 0.11394612491130829, + "learning_rate": 0.00019636022514071295, + "loss": 0.2231, + "step": 1266 + }, + { + "epoch": 0.09141743930156211, + "grad_norm": 0.08286873996257782, + "learning_rate": 0.00019635733872131622, + "loss": 0.2207, + "step": 1267 + }, + { + "epoch": 0.09148959197662253, + "grad_norm": 0.11120649427175522, + "learning_rate": 0.00019635445230191948, + "loss": 0.1806, + "step": 1268 + }, + { + "epoch": 0.09156174465168296, + "grad_norm": 0.13481257855892181, + "learning_rate": 0.00019635156588252274, + "loss": 0.1771, + "step": 1269 + }, + { + "epoch": 0.09163389732674339, + "grad_norm": 0.08911901712417603, + "learning_rate": 0.000196348679463126, + "loss": 0.1489, + "step": 1270 + }, + { + "epoch": 0.09170605000180382, + "grad_norm": 0.12352734804153442, + "learning_rate": 0.00019634579304372927, + "loss": 0.1912, + "step": 1271 + }, + { + "epoch": 0.09177820267686425, + "grad_norm": 0.1041436716914177, + "learning_rate": 0.00019634290662433253, + "loss": 0.205, + "step": 1272 + }, + { + "epoch": 0.09185035535192468, + "grad_norm": 0.09959304332733154, + "learning_rate": 0.0001963400202049358, + "loss": 0.1857, + "step": 1273 + }, + { + "epoch": 0.0919225080269851, + "grad_norm": 0.09183462709188461, + "learning_rate": 0.00019633713378553903, + "loss": 0.1586, + "step": 1274 + }, + { + "epoch": 0.09199466070204552, + "grad_norm": 0.08344271034002304, + "learning_rate": 0.00019633424736614232, + "loss": 0.1967, + "step": 1275 + }, + { + "epoch": 0.09206681337710595, + "grad_norm": 0.08204984664916992, + "learning_rate": 0.00019633136094674558, + "loss": 0.1647, + "step": 1276 + }, + { + "epoch": 0.09213896605216639, + "grad_norm": 0.11258382350206375, + "learning_rate": 0.00019632847452734885, + "loss": 0.1751, + "step": 1277 + }, + { + "epoch": 0.09221111872722682, + "grad_norm": 0.10533545166254044, + "learning_rate": 0.0001963255881079521, + "loss": 0.1371, + "step": 1278 + }, + { + "epoch": 0.09228327140228725, + "grad_norm": 0.07517839968204498, + "learning_rate": 0.00019632270168855535, + "loss": 0.1603, + "step": 1279 + }, + { + "epoch": 0.09235542407734766, + "grad_norm": 0.09975581616163254, + "learning_rate": 0.0001963198152691586, + "loss": 0.1834, + "step": 1280 + }, + { + "epoch": 0.09242757675240809, + "grad_norm": 0.08929795026779175, + "learning_rate": 0.00019631692884976187, + "loss": 0.1156, + "step": 1281 + }, + { + "epoch": 0.09249972942746852, + "grad_norm": 0.10165119916200638, + "learning_rate": 0.00019631404243036513, + "loss": 0.1557, + "step": 1282 + }, + { + "epoch": 0.09257188210252895, + "grad_norm": 0.08109511435031891, + "learning_rate": 0.00019631115601096842, + "loss": 0.1985, + "step": 1283 + }, + { + "epoch": 0.09264403477758938, + "grad_norm": 0.11984848231077194, + "learning_rate": 0.00019630826959157166, + "loss": 0.2291, + "step": 1284 + }, + { + "epoch": 0.09271618745264981, + "grad_norm": 0.101690873503685, + "learning_rate": 0.00019630538317217492, + "loss": 0.1889, + "step": 1285 + }, + { + "epoch": 0.09278834012771023, + "grad_norm": 0.10042715072631836, + "learning_rate": 0.00019630249675277819, + "loss": 0.1608, + "step": 1286 + }, + { + "epoch": 0.09286049280277066, + "grad_norm": 0.110273078083992, + "learning_rate": 0.00019629961033338145, + "loss": 0.149, + "step": 1287 + }, + { + "epoch": 0.09293264547783109, + "grad_norm": 0.10497692972421646, + "learning_rate": 0.0001962967239139847, + "loss": 0.1997, + "step": 1288 + }, + { + "epoch": 0.09300479815289152, + "grad_norm": 0.08940223604440689, + "learning_rate": 0.00019629383749458797, + "loss": 0.1184, + "step": 1289 + }, + { + "epoch": 0.09307695082795195, + "grad_norm": 0.10932713001966476, + "learning_rate": 0.00019629095107519124, + "loss": 0.1967, + "step": 1290 + }, + { + "epoch": 0.09314910350301238, + "grad_norm": 0.08406266570091248, + "learning_rate": 0.0001962880646557945, + "loss": 0.1615, + "step": 1291 + }, + { + "epoch": 0.09322125617807281, + "grad_norm": 0.08657065033912659, + "learning_rate": 0.00019628517823639776, + "loss": 0.1742, + "step": 1292 + }, + { + "epoch": 0.09329340885313323, + "grad_norm": 0.09516967087984085, + "learning_rate": 0.00019628229181700103, + "loss": 0.1821, + "step": 1293 + }, + { + "epoch": 0.09336556152819366, + "grad_norm": 0.11135697364807129, + "learning_rate": 0.0001962794053976043, + "loss": 0.1969, + "step": 1294 + }, + { + "epoch": 0.09343771420325409, + "grad_norm": 0.12651841342449188, + "learning_rate": 0.00019627651897820753, + "loss": 0.2189, + "step": 1295 + }, + { + "epoch": 0.09350986687831452, + "grad_norm": 0.08434838056564331, + "learning_rate": 0.0001962736325588108, + "loss": 0.1582, + "step": 1296 + }, + { + "epoch": 0.09358201955337495, + "grad_norm": 0.10231063514947891, + "learning_rate": 0.00019627074613941408, + "loss": 0.1773, + "step": 1297 + }, + { + "epoch": 0.09365417222843538, + "grad_norm": 0.1403086632490158, + "learning_rate": 0.00019626785972001734, + "loss": 0.1989, + "step": 1298 + }, + { + "epoch": 0.09372632490349579, + "grad_norm": 0.10541427135467529, + "learning_rate": 0.0001962649733006206, + "loss": 0.2189, + "step": 1299 + }, + { + "epoch": 0.09379847757855622, + "grad_norm": 0.14472924172878265, + "learning_rate": 0.00019626208688122384, + "loss": 0.1733, + "step": 1300 + }, + { + "epoch": 0.09387063025361665, + "grad_norm": 0.09914866834878922, + "learning_rate": 0.0001962592004618271, + "loss": 0.1519, + "step": 1301 + }, + { + "epoch": 0.09394278292867708, + "grad_norm": 0.10584486275911331, + "learning_rate": 0.00019625631404243037, + "loss": 0.1786, + "step": 1302 + }, + { + "epoch": 0.09401493560373751, + "grad_norm": 0.13134770095348358, + "learning_rate": 0.00019625342762303363, + "loss": 0.1605, + "step": 1303 + }, + { + "epoch": 0.09408708827879794, + "grad_norm": 0.09906939417123795, + "learning_rate": 0.00019625054120363692, + "loss": 0.1696, + "step": 1304 + }, + { + "epoch": 0.09415924095385836, + "grad_norm": 0.10578920692205429, + "learning_rate": 0.00019624765478424015, + "loss": 0.1924, + "step": 1305 + }, + { + "epoch": 0.09423139362891879, + "grad_norm": 0.10673406720161438, + "learning_rate": 0.00019624476836484342, + "loss": 0.1971, + "step": 1306 + }, + { + "epoch": 0.09430354630397922, + "grad_norm": 0.11942119151353836, + "learning_rate": 0.00019624188194544668, + "loss": 0.1865, + "step": 1307 + }, + { + "epoch": 0.09437569897903965, + "grad_norm": 0.10493606328964233, + "learning_rate": 0.00019623899552604994, + "loss": 0.16, + "step": 1308 + }, + { + "epoch": 0.09444785165410008, + "grad_norm": 0.10441295802593231, + "learning_rate": 0.0001962361091066532, + "loss": 0.175, + "step": 1309 + }, + { + "epoch": 0.09452000432916051, + "grad_norm": 0.12301217764616013, + "learning_rate": 0.00019623322268725647, + "loss": 0.1717, + "step": 1310 + }, + { + "epoch": 0.09459215700422093, + "grad_norm": 0.07668205350637436, + "learning_rate": 0.00019623033626785973, + "loss": 0.1145, + "step": 1311 + }, + { + "epoch": 0.09466430967928136, + "grad_norm": 0.09936694800853729, + "learning_rate": 0.000196227449848463, + "loss": 0.1496, + "step": 1312 + }, + { + "epoch": 0.09473646235434179, + "grad_norm": 0.11265905201435089, + "learning_rate": 0.00019622456342906626, + "loss": 0.1435, + "step": 1313 + }, + { + "epoch": 0.09480861502940222, + "grad_norm": 0.10028904676437378, + "learning_rate": 0.00019622167700966952, + "loss": 0.197, + "step": 1314 + }, + { + "epoch": 0.09488076770446265, + "grad_norm": 0.07026661932468414, + "learning_rate": 0.00019621879059027278, + "loss": 0.2116, + "step": 1315 + }, + { + "epoch": 0.09495292037952308, + "grad_norm": 0.10615105926990509, + "learning_rate": 0.00019621590417087602, + "loss": 0.2152, + "step": 1316 + }, + { + "epoch": 0.09502507305458349, + "grad_norm": 0.1014433428645134, + "learning_rate": 0.00019621301775147928, + "loss": 0.1702, + "step": 1317 + }, + { + "epoch": 0.09509722572964392, + "grad_norm": 0.10013467073440552, + "learning_rate": 0.00019621013133208257, + "loss": 0.1884, + "step": 1318 + }, + { + "epoch": 0.09516937840470435, + "grad_norm": 0.11329302936792374, + "learning_rate": 0.00019620724491268584, + "loss": 0.1707, + "step": 1319 + }, + { + "epoch": 0.09524153107976478, + "grad_norm": 0.10168576240539551, + "learning_rate": 0.0001962043584932891, + "loss": 0.1698, + "step": 1320 + }, + { + "epoch": 0.09531368375482521, + "grad_norm": 0.09280646592378616, + "learning_rate": 0.00019620147207389233, + "loss": 0.1553, + "step": 1321 + }, + { + "epoch": 0.09538583642988564, + "grad_norm": 0.10105575621128082, + "learning_rate": 0.0001961985856544956, + "loss": 0.2239, + "step": 1322 + }, + { + "epoch": 0.09545798910494607, + "grad_norm": 0.08288126438856125, + "learning_rate": 0.00019619569923509886, + "loss": 0.193, + "step": 1323 + }, + { + "epoch": 0.09553014178000649, + "grad_norm": 0.1465429663658142, + "learning_rate": 0.00019619281281570212, + "loss": 0.2039, + "step": 1324 + }, + { + "epoch": 0.09560229445506692, + "grad_norm": 0.12137012183666229, + "learning_rate": 0.0001961899263963054, + "loss": 0.1393, + "step": 1325 + }, + { + "epoch": 0.09567444713012735, + "grad_norm": 0.09829273074865341, + "learning_rate": 0.00019618703997690865, + "loss": 0.2547, + "step": 1326 + }, + { + "epoch": 0.09574659980518778, + "grad_norm": 0.13386209309101105, + "learning_rate": 0.0001961841535575119, + "loss": 0.1183, + "step": 1327 + }, + { + "epoch": 0.09581875248024821, + "grad_norm": 0.12641870975494385, + "learning_rate": 0.00019618126713811517, + "loss": 0.1984, + "step": 1328 + }, + { + "epoch": 0.09589090515530864, + "grad_norm": 0.1062311977148056, + "learning_rate": 0.00019617838071871844, + "loss": 0.171, + "step": 1329 + }, + { + "epoch": 0.09596305783036906, + "grad_norm": 0.08198920637369156, + "learning_rate": 0.0001961754942993217, + "loss": 0.1579, + "step": 1330 + }, + { + "epoch": 0.09603521050542949, + "grad_norm": 0.09765391051769257, + "learning_rate": 0.00019617260787992496, + "loss": 0.1361, + "step": 1331 + }, + { + "epoch": 0.09610736318048992, + "grad_norm": 0.12339577823877335, + "learning_rate": 0.00019616972146052823, + "loss": 0.1762, + "step": 1332 + }, + { + "epoch": 0.09617951585555035, + "grad_norm": 0.11164752393960953, + "learning_rate": 0.0001961668350411315, + "loss": 0.1621, + "step": 1333 + }, + { + "epoch": 0.09625166853061078, + "grad_norm": 0.114966481924057, + "learning_rate": 0.00019616394862173475, + "loss": 0.1737, + "step": 1334 + }, + { + "epoch": 0.0963238212056712, + "grad_norm": 0.11052409559488297, + "learning_rate": 0.00019616106220233801, + "loss": 0.2064, + "step": 1335 + }, + { + "epoch": 0.09639597388073162, + "grad_norm": 0.08219371736049652, + "learning_rate": 0.00019615817578294128, + "loss": 0.196, + "step": 1336 + }, + { + "epoch": 0.09646812655579205, + "grad_norm": 0.12721070647239685, + "learning_rate": 0.0001961552893635445, + "loss": 0.1684, + "step": 1337 + }, + { + "epoch": 0.09654027923085248, + "grad_norm": 0.10690361261367798, + "learning_rate": 0.00019615240294414778, + "loss": 0.1265, + "step": 1338 + }, + { + "epoch": 0.09661243190591291, + "grad_norm": 0.0963577851653099, + "learning_rate": 0.00019614951652475107, + "loss": 0.1894, + "step": 1339 + }, + { + "epoch": 0.09668458458097334, + "grad_norm": 0.11837514489889145, + "learning_rate": 0.00019614663010535433, + "loss": 0.2115, + "step": 1340 + }, + { + "epoch": 0.09675673725603377, + "grad_norm": 0.09708958864212036, + "learning_rate": 0.0001961437436859576, + "loss": 0.1593, + "step": 1341 + }, + { + "epoch": 0.09682888993109419, + "grad_norm": 0.10255219042301178, + "learning_rate": 0.00019614085726656083, + "loss": 0.1921, + "step": 1342 + }, + { + "epoch": 0.09690104260615462, + "grad_norm": 0.12087555229663849, + "learning_rate": 0.0001961379708471641, + "loss": 0.2046, + "step": 1343 + }, + { + "epoch": 0.09697319528121505, + "grad_norm": 0.11907682567834854, + "learning_rate": 0.00019613508442776735, + "loss": 0.2054, + "step": 1344 + }, + { + "epoch": 0.09704534795627548, + "grad_norm": 0.09736671298742294, + "learning_rate": 0.00019613219800837062, + "loss": 0.1358, + "step": 1345 + }, + { + "epoch": 0.09711750063133591, + "grad_norm": 0.1320803314447403, + "learning_rate": 0.0001961293115889739, + "loss": 0.1826, + "step": 1346 + }, + { + "epoch": 0.09718965330639634, + "grad_norm": 0.1130470335483551, + "learning_rate": 0.00019612642516957714, + "loss": 0.1957, + "step": 1347 + }, + { + "epoch": 0.09726180598145676, + "grad_norm": 0.08177877962589264, + "learning_rate": 0.0001961235387501804, + "loss": 0.1725, + "step": 1348 + }, + { + "epoch": 0.09733395865651719, + "grad_norm": 0.0829310342669487, + "learning_rate": 0.00019612065233078367, + "loss": 0.1714, + "step": 1349 + }, + { + "epoch": 0.09740611133157762, + "grad_norm": 0.1222817674279213, + "learning_rate": 0.00019611776591138693, + "loss": 0.2691, + "step": 1350 + }, + { + "epoch": 0.09747826400663805, + "grad_norm": 0.07808643579483032, + "learning_rate": 0.0001961148794919902, + "loss": 0.2056, + "step": 1351 + }, + { + "epoch": 0.09755041668169848, + "grad_norm": 0.08548199385404587, + "learning_rate": 0.00019611199307259346, + "loss": 0.1849, + "step": 1352 + }, + { + "epoch": 0.0976225693567589, + "grad_norm": 0.10797590017318726, + "learning_rate": 0.00019610910665319672, + "loss": 0.1903, + "step": 1353 + }, + { + "epoch": 0.09769472203181934, + "grad_norm": 0.13164952397346497, + "learning_rate": 0.00019610622023379998, + "loss": 0.1855, + "step": 1354 + }, + { + "epoch": 0.09776687470687975, + "grad_norm": 0.08189984411001205, + "learning_rate": 0.00019610333381440325, + "loss": 0.1786, + "step": 1355 + }, + { + "epoch": 0.09783902738194018, + "grad_norm": 0.08770643919706345, + "learning_rate": 0.0001961004473950065, + "loss": 0.1575, + "step": 1356 + }, + { + "epoch": 0.09791118005700061, + "grad_norm": 0.09954666346311569, + "learning_rate": 0.00019609756097560977, + "loss": 0.1713, + "step": 1357 + }, + { + "epoch": 0.09798333273206104, + "grad_norm": 0.08523159474134445, + "learning_rate": 0.000196094674556213, + "loss": 0.1947, + "step": 1358 + }, + { + "epoch": 0.09805548540712147, + "grad_norm": 0.08452267944812775, + "learning_rate": 0.00019609178813681627, + "loss": 0.1361, + "step": 1359 + }, + { + "epoch": 0.0981276380821819, + "grad_norm": 0.0849485918879509, + "learning_rate": 0.00019608890171741956, + "loss": 0.1569, + "step": 1360 + }, + { + "epoch": 0.09819979075724232, + "grad_norm": 0.10117647051811218, + "learning_rate": 0.00019608601529802282, + "loss": 0.1745, + "step": 1361 + }, + { + "epoch": 0.09827194343230275, + "grad_norm": 0.10660509765148163, + "learning_rate": 0.0001960831288786261, + "loss": 0.1587, + "step": 1362 + }, + { + "epoch": 0.09834409610736318, + "grad_norm": 0.10959257185459137, + "learning_rate": 0.00019608024245922932, + "loss": 0.1666, + "step": 1363 + }, + { + "epoch": 0.09841624878242361, + "grad_norm": 0.11396210640668869, + "learning_rate": 0.00019607735603983259, + "loss": 0.1734, + "step": 1364 + }, + { + "epoch": 0.09848840145748404, + "grad_norm": 0.11623997986316681, + "learning_rate": 0.00019607446962043585, + "loss": 0.1813, + "step": 1365 + }, + { + "epoch": 0.09856055413254447, + "grad_norm": 0.08269207179546356, + "learning_rate": 0.0001960715832010391, + "loss": 0.1407, + "step": 1366 + }, + { + "epoch": 0.09863270680760489, + "grad_norm": 0.11096511036157608, + "learning_rate": 0.0001960686967816424, + "loss": 0.1442, + "step": 1367 + }, + { + "epoch": 0.09870485948266532, + "grad_norm": 0.07984766364097595, + "learning_rate": 0.00019606581036224564, + "loss": 0.1939, + "step": 1368 + }, + { + "epoch": 0.09877701215772575, + "grad_norm": 0.08203259110450745, + "learning_rate": 0.0001960629239428489, + "loss": 0.1831, + "step": 1369 + }, + { + "epoch": 0.09884916483278618, + "grad_norm": 0.09003271162509918, + "learning_rate": 0.00019606003752345216, + "loss": 0.1968, + "step": 1370 + }, + { + "epoch": 0.0989213175078466, + "grad_norm": 0.10677336156368256, + "learning_rate": 0.00019605715110405543, + "loss": 0.1771, + "step": 1371 + }, + { + "epoch": 0.09899347018290704, + "grad_norm": 0.08796536922454834, + "learning_rate": 0.0001960542646846587, + "loss": 0.1318, + "step": 1372 + }, + { + "epoch": 0.09906562285796745, + "grad_norm": 0.09083075076341629, + "learning_rate": 0.00019605137826526195, + "loss": 0.1625, + "step": 1373 + }, + { + "epoch": 0.09913777553302788, + "grad_norm": 0.0895177498459816, + "learning_rate": 0.00019604849184586521, + "loss": 0.1992, + "step": 1374 + }, + { + "epoch": 0.09920992820808831, + "grad_norm": 0.08662980794906616, + "learning_rate": 0.00019604560542646848, + "loss": 0.1895, + "step": 1375 + }, + { + "epoch": 0.09928208088314874, + "grad_norm": 0.11467055231332779, + "learning_rate": 0.00019604271900707174, + "loss": 0.1559, + "step": 1376 + }, + { + "epoch": 0.09935423355820917, + "grad_norm": 0.09523959457874298, + "learning_rate": 0.000196039832587675, + "loss": 0.1757, + "step": 1377 + }, + { + "epoch": 0.0994263862332696, + "grad_norm": 0.08383305370807648, + "learning_rate": 0.00019603694616827827, + "loss": 0.2024, + "step": 1378 + }, + { + "epoch": 0.09949853890833002, + "grad_norm": 0.09364939481019974, + "learning_rate": 0.00019603405974888153, + "loss": 0.2114, + "step": 1379 + }, + { + "epoch": 0.09957069158339045, + "grad_norm": 0.11210822314023972, + "learning_rate": 0.00019603117332948476, + "loss": 0.2107, + "step": 1380 + }, + { + "epoch": 0.09964284425845088, + "grad_norm": 0.11405359208583832, + "learning_rate": 0.00019602828691008805, + "loss": 0.1591, + "step": 1381 + }, + { + "epoch": 0.09971499693351131, + "grad_norm": 0.0980084240436554, + "learning_rate": 0.00019602540049069132, + "loss": 0.2372, + "step": 1382 + }, + { + "epoch": 0.09978714960857174, + "grad_norm": 0.08842863887548447, + "learning_rate": 0.00019602251407129458, + "loss": 0.1548, + "step": 1383 + }, + { + "epoch": 0.09985930228363217, + "grad_norm": 0.08733103424310684, + "learning_rate": 0.00019601962765189784, + "loss": 0.1947, + "step": 1384 + }, + { + "epoch": 0.0999314549586926, + "grad_norm": 0.1021757498383522, + "learning_rate": 0.00019601674123250108, + "loss": 0.1453, + "step": 1385 + }, + { + "epoch": 0.10000360763375302, + "grad_norm": 0.09830335527658463, + "learning_rate": 0.00019601385481310434, + "loss": 0.1332, + "step": 1386 + }, + { + "epoch": 0.10007576030881345, + "grad_norm": 0.12369471043348312, + "learning_rate": 0.0001960109683937076, + "loss": 0.1892, + "step": 1387 + }, + { + "epoch": 0.10014791298387388, + "grad_norm": 0.111944779753685, + "learning_rate": 0.0001960080819743109, + "loss": 0.1579, + "step": 1388 + }, + { + "epoch": 0.10022006565893431, + "grad_norm": 0.10621371865272522, + "learning_rate": 0.00019600519555491416, + "loss": 0.1645, + "step": 1389 + }, + { + "epoch": 0.10029221833399474, + "grad_norm": 0.14481788873672485, + "learning_rate": 0.0001960023091355174, + "loss": 0.1319, + "step": 1390 + }, + { + "epoch": 0.10036437100905517, + "grad_norm": 0.10226688534021378, + "learning_rate": 0.00019599942271612066, + "loss": 0.1653, + "step": 1391 + }, + { + "epoch": 0.10043652368411558, + "grad_norm": 0.2072441279888153, + "learning_rate": 0.00019599653629672392, + "loss": 0.1961, + "step": 1392 + }, + { + "epoch": 0.10050867635917601, + "grad_norm": 0.10382064431905746, + "learning_rate": 0.00019599364987732718, + "loss": 0.1648, + "step": 1393 + }, + { + "epoch": 0.10058082903423644, + "grad_norm": 0.12104596942663193, + "learning_rate": 0.00019599076345793045, + "loss": 0.1877, + "step": 1394 + }, + { + "epoch": 0.10065298170929687, + "grad_norm": 0.10014254599809647, + "learning_rate": 0.0001959878770385337, + "loss": 0.1435, + "step": 1395 + }, + { + "epoch": 0.1007251343843573, + "grad_norm": 0.09614317119121552, + "learning_rate": 0.00019598499061913697, + "loss": 0.1442, + "step": 1396 + }, + { + "epoch": 0.10079728705941773, + "grad_norm": 0.0885712131857872, + "learning_rate": 0.00019598210419974023, + "loss": 0.1523, + "step": 1397 + }, + { + "epoch": 0.10086943973447815, + "grad_norm": 0.10141981393098831, + "learning_rate": 0.0001959792177803435, + "loss": 0.1827, + "step": 1398 + }, + { + "epoch": 0.10094159240953858, + "grad_norm": 0.12346911430358887, + "learning_rate": 0.00019597633136094676, + "loss": 0.1847, + "step": 1399 + }, + { + "epoch": 0.10101374508459901, + "grad_norm": 0.0943661779165268, + "learning_rate": 0.00019597344494155002, + "loss": 0.1962, + "step": 1400 + }, + { + "epoch": 0.10108589775965944, + "grad_norm": 0.1327252984046936, + "learning_rate": 0.00019597055852215326, + "loss": 0.2174, + "step": 1401 + }, + { + "epoch": 0.10115805043471987, + "grad_norm": 0.09227154403924942, + "learning_rate": 0.00019596767210275655, + "loss": 0.1983, + "step": 1402 + }, + { + "epoch": 0.1012302031097803, + "grad_norm": 0.11135019361972809, + "learning_rate": 0.0001959647856833598, + "loss": 0.1826, + "step": 1403 + }, + { + "epoch": 0.10130235578484072, + "grad_norm": 0.10142301023006439, + "learning_rate": 0.00019596189926396308, + "loss": 0.1814, + "step": 1404 + }, + { + "epoch": 0.10137450845990115, + "grad_norm": 0.12080994993448257, + "learning_rate": 0.00019595901284456634, + "loss": 0.2221, + "step": 1405 + }, + { + "epoch": 0.10144666113496158, + "grad_norm": 0.09118806570768356, + "learning_rate": 0.00019595612642516957, + "loss": 0.1475, + "step": 1406 + }, + { + "epoch": 0.10151881381002201, + "grad_norm": 0.10375121235847473, + "learning_rate": 0.00019595324000577284, + "loss": 0.1894, + "step": 1407 + }, + { + "epoch": 0.10159096648508244, + "grad_norm": 0.10166884958744049, + "learning_rate": 0.0001959503535863761, + "loss": 0.1522, + "step": 1408 + }, + { + "epoch": 0.10166311916014287, + "grad_norm": 0.09178225696086884, + "learning_rate": 0.0001959474671669794, + "loss": 0.1603, + "step": 1409 + }, + { + "epoch": 0.10173527183520328, + "grad_norm": 0.09689350426197052, + "learning_rate": 0.00019594458074758265, + "loss": 0.1607, + "step": 1410 + }, + { + "epoch": 0.10180742451026371, + "grad_norm": 0.1341812163591385, + "learning_rate": 0.0001959416943281859, + "loss": 0.1851, + "step": 1411 + }, + { + "epoch": 0.10187957718532414, + "grad_norm": 0.10188182443380356, + "learning_rate": 0.00019593880790878915, + "loss": 0.1766, + "step": 1412 + }, + { + "epoch": 0.10195172986038457, + "grad_norm": 0.1186114251613617, + "learning_rate": 0.00019593592148939241, + "loss": 0.1471, + "step": 1413 + }, + { + "epoch": 0.102023882535445, + "grad_norm": 0.10667675733566284, + "learning_rate": 0.00019593303506999568, + "loss": 0.1608, + "step": 1414 + }, + { + "epoch": 0.10209603521050543, + "grad_norm": 0.09607324004173279, + "learning_rate": 0.00019593014865059894, + "loss": 0.1984, + "step": 1415 + }, + { + "epoch": 0.10216818788556586, + "grad_norm": 0.09958678483963013, + "learning_rate": 0.0001959272622312022, + "loss": 0.1772, + "step": 1416 + }, + { + "epoch": 0.10224034056062628, + "grad_norm": 0.09940075874328613, + "learning_rate": 0.00019592437581180547, + "loss": 0.1814, + "step": 1417 + }, + { + "epoch": 0.10231249323568671, + "grad_norm": 0.09523070603609085, + "learning_rate": 0.00019592148939240873, + "loss": 0.2013, + "step": 1418 + }, + { + "epoch": 0.10238464591074714, + "grad_norm": 0.10605626553297043, + "learning_rate": 0.000195918602973012, + "loss": 0.1434, + "step": 1419 + }, + { + "epoch": 0.10245679858580757, + "grad_norm": 0.11710896342992783, + "learning_rate": 0.00019591571655361525, + "loss": 0.1903, + "step": 1420 + }, + { + "epoch": 0.102528951260868, + "grad_norm": 0.07783154398202896, + "learning_rate": 0.00019591283013421852, + "loss": 0.1334, + "step": 1421 + }, + { + "epoch": 0.10260110393592843, + "grad_norm": 0.10030199587345123, + "learning_rate": 0.00019590994371482175, + "loss": 0.185, + "step": 1422 + }, + { + "epoch": 0.10267325661098885, + "grad_norm": 0.11000876873731613, + "learning_rate": 0.00019590705729542504, + "loss": 0.1661, + "step": 1423 + }, + { + "epoch": 0.10274540928604928, + "grad_norm": 0.10521721839904785, + "learning_rate": 0.0001959041708760283, + "loss": 0.2145, + "step": 1424 + }, + { + "epoch": 0.10281756196110971, + "grad_norm": 0.07904649525880814, + "learning_rate": 0.00019590128445663157, + "loss": 0.1375, + "step": 1425 + }, + { + "epoch": 0.10288971463617014, + "grad_norm": 0.0979461818933487, + "learning_rate": 0.00019589839803723483, + "loss": 0.1924, + "step": 1426 + }, + { + "epoch": 0.10296186731123057, + "grad_norm": 0.07898340374231339, + "learning_rate": 0.00019589551161783807, + "loss": 0.1944, + "step": 1427 + }, + { + "epoch": 0.103034019986291, + "grad_norm": 0.1010308638215065, + "learning_rate": 0.00019589262519844133, + "loss": 0.1628, + "step": 1428 + }, + { + "epoch": 0.10310617266135141, + "grad_norm": 0.10152356326580048, + "learning_rate": 0.0001958897387790446, + "loss": 0.1568, + "step": 1429 + }, + { + "epoch": 0.10317832533641184, + "grad_norm": 0.08201948553323746, + "learning_rate": 0.00019588685235964788, + "loss": 0.1974, + "step": 1430 + }, + { + "epoch": 0.10325047801147227, + "grad_norm": 0.11100131273269653, + "learning_rate": 0.00019588396594025115, + "loss": 0.1299, + "step": 1431 + }, + { + "epoch": 0.1033226306865327, + "grad_norm": 0.1156645119190216, + "learning_rate": 0.00019588107952085438, + "loss": 0.1461, + "step": 1432 + }, + { + "epoch": 0.10339478336159313, + "grad_norm": 0.13311316072940826, + "learning_rate": 0.00019587819310145765, + "loss": 0.1769, + "step": 1433 + }, + { + "epoch": 0.10346693603665356, + "grad_norm": 0.145923912525177, + "learning_rate": 0.0001958753066820609, + "loss": 0.1783, + "step": 1434 + }, + { + "epoch": 0.10353908871171398, + "grad_norm": 0.1302771270275116, + "learning_rate": 0.00019587242026266417, + "loss": 0.1602, + "step": 1435 + }, + { + "epoch": 0.10361124138677441, + "grad_norm": 0.11546266824007034, + "learning_rate": 0.00019586953384326743, + "loss": 0.1572, + "step": 1436 + }, + { + "epoch": 0.10368339406183484, + "grad_norm": 0.11403997242450714, + "learning_rate": 0.0001958666474238707, + "loss": 0.209, + "step": 1437 + }, + { + "epoch": 0.10375554673689527, + "grad_norm": 0.10422119498252869, + "learning_rate": 0.00019586376100447396, + "loss": 0.1745, + "step": 1438 + }, + { + "epoch": 0.1038276994119557, + "grad_norm": 0.10391309857368469, + "learning_rate": 0.00019586087458507722, + "loss": 0.1888, + "step": 1439 + }, + { + "epoch": 0.10389985208701613, + "grad_norm": 0.09338228404521942, + "learning_rate": 0.00019585798816568049, + "loss": 0.1704, + "step": 1440 + }, + { + "epoch": 0.10397200476207655, + "grad_norm": 0.12580829858779907, + "learning_rate": 0.00019585510174628375, + "loss": 0.2125, + "step": 1441 + }, + { + "epoch": 0.10404415743713698, + "grad_norm": 0.08979323506355286, + "learning_rate": 0.000195852215326887, + "loss": 0.1669, + "step": 1442 + }, + { + "epoch": 0.10411631011219741, + "grad_norm": 0.08331722021102905, + "learning_rate": 0.00019584932890749025, + "loss": 0.1573, + "step": 1443 + }, + { + "epoch": 0.10418846278725784, + "grad_norm": 0.09525292366743088, + "learning_rate": 0.00019584644248809354, + "loss": 0.1159, + "step": 1444 + }, + { + "epoch": 0.10426061546231827, + "grad_norm": 0.09168519824743271, + "learning_rate": 0.0001958435560686968, + "loss": 0.1735, + "step": 1445 + }, + { + "epoch": 0.1043327681373787, + "grad_norm": 0.1023101806640625, + "learning_rate": 0.00019584066964930006, + "loss": 0.1785, + "step": 1446 + }, + { + "epoch": 0.10440492081243911, + "grad_norm": 0.10439193993806839, + "learning_rate": 0.00019583778322990333, + "loss": 0.1829, + "step": 1447 + }, + { + "epoch": 0.10447707348749954, + "grad_norm": 0.13283102214336395, + "learning_rate": 0.00019583489681050656, + "loss": 0.2259, + "step": 1448 + }, + { + "epoch": 0.10454922616255997, + "grad_norm": 0.11387287080287933, + "learning_rate": 0.00019583201039110983, + "loss": 0.1728, + "step": 1449 + }, + { + "epoch": 0.1046213788376204, + "grad_norm": 0.10596080124378204, + "learning_rate": 0.0001958291239717131, + "loss": 0.1797, + "step": 1450 + }, + { + "epoch": 0.10469353151268083, + "grad_norm": 0.10325146466493607, + "learning_rate": 0.00019582623755231638, + "loss": 0.1658, + "step": 1451 + }, + { + "epoch": 0.10476568418774126, + "grad_norm": 0.10266296565532684, + "learning_rate": 0.00019582335113291964, + "loss": 0.1042, + "step": 1452 + }, + { + "epoch": 0.1048378368628017, + "grad_norm": 0.08678191900253296, + "learning_rate": 0.00019582046471352288, + "loss": 0.1679, + "step": 1453 + }, + { + "epoch": 0.10490998953786211, + "grad_norm": 0.07487280666828156, + "learning_rate": 0.00019581757829412614, + "loss": 0.1272, + "step": 1454 + }, + { + "epoch": 0.10498214221292254, + "grad_norm": 0.09336668252944946, + "learning_rate": 0.0001958146918747294, + "loss": 0.1733, + "step": 1455 + }, + { + "epoch": 0.10505429488798297, + "grad_norm": 0.10310094803571701, + "learning_rate": 0.00019581180545533267, + "loss": 0.2009, + "step": 1456 + }, + { + "epoch": 0.1051264475630434, + "grad_norm": 0.13999375700950623, + "learning_rate": 0.00019580891903593593, + "loss": 0.1613, + "step": 1457 + }, + { + "epoch": 0.10519860023810383, + "grad_norm": 0.1124359667301178, + "learning_rate": 0.0001958060326165392, + "loss": 0.1823, + "step": 1458 + }, + { + "epoch": 0.10527075291316426, + "grad_norm": 0.11623510718345642, + "learning_rate": 0.00019580314619714245, + "loss": 0.1867, + "step": 1459 + }, + { + "epoch": 0.10534290558822468, + "grad_norm": 0.11208704859018326, + "learning_rate": 0.00019580025977774572, + "loss": 0.1965, + "step": 1460 + }, + { + "epoch": 0.10541505826328511, + "grad_norm": 0.10755287855863571, + "learning_rate": 0.00019579737335834898, + "loss": 0.189, + "step": 1461 + }, + { + "epoch": 0.10548721093834554, + "grad_norm": 0.09834279119968414, + "learning_rate": 0.00019579448693895224, + "loss": 0.1299, + "step": 1462 + }, + { + "epoch": 0.10555936361340597, + "grad_norm": 0.09488647431135178, + "learning_rate": 0.0001957916005195555, + "loss": 0.1659, + "step": 1463 + }, + { + "epoch": 0.1056315162884664, + "grad_norm": 0.1207553967833519, + "learning_rate": 0.00019578871410015874, + "loss": 0.1402, + "step": 1464 + }, + { + "epoch": 0.10570366896352683, + "grad_norm": 0.11781546473503113, + "learning_rate": 0.00019578582768076203, + "loss": 0.1758, + "step": 1465 + }, + { + "epoch": 0.10577582163858724, + "grad_norm": 0.11281391978263855, + "learning_rate": 0.0001957829412613653, + "loss": 0.1741, + "step": 1466 + }, + { + "epoch": 0.10584797431364767, + "grad_norm": 0.1315668672323227, + "learning_rate": 0.00019578005484196856, + "loss": 0.1777, + "step": 1467 + }, + { + "epoch": 0.1059201269887081, + "grad_norm": 0.1111454963684082, + "learning_rate": 0.00019577716842257182, + "loss": 0.1368, + "step": 1468 + }, + { + "epoch": 0.10599227966376853, + "grad_norm": 0.11257988959550858, + "learning_rate": 0.00019577428200317506, + "loss": 0.1658, + "step": 1469 + }, + { + "epoch": 0.10606443233882897, + "grad_norm": 0.08634955435991287, + "learning_rate": 0.00019577139558377832, + "loss": 0.1979, + "step": 1470 + }, + { + "epoch": 0.1061365850138894, + "grad_norm": 0.08781882375478745, + "learning_rate": 0.00019576850916438158, + "loss": 0.1314, + "step": 1471 + }, + { + "epoch": 0.10620873768894981, + "grad_norm": 0.08333154767751694, + "learning_rate": 0.00019576562274498487, + "loss": 0.1581, + "step": 1472 + }, + { + "epoch": 0.10628089036401024, + "grad_norm": 0.10107365995645523, + "learning_rate": 0.00019576273632558814, + "loss": 0.1643, + "step": 1473 + }, + { + "epoch": 0.10635304303907067, + "grad_norm": 0.08048343658447266, + "learning_rate": 0.00019575984990619137, + "loss": 0.1469, + "step": 1474 + }, + { + "epoch": 0.1064251957141311, + "grad_norm": 0.08940555900335312, + "learning_rate": 0.00019575696348679463, + "loss": 0.2077, + "step": 1475 + }, + { + "epoch": 0.10649734838919153, + "grad_norm": 0.08923406898975372, + "learning_rate": 0.0001957540770673979, + "loss": 0.1601, + "step": 1476 + }, + { + "epoch": 0.10656950106425196, + "grad_norm": 0.07388593256473541, + "learning_rate": 0.00019575119064800116, + "loss": 0.1766, + "step": 1477 + }, + { + "epoch": 0.10664165373931238, + "grad_norm": 0.1094571202993393, + "learning_rate": 0.00019574830422860442, + "loss": 0.2324, + "step": 1478 + }, + { + "epoch": 0.10671380641437281, + "grad_norm": 0.10609541088342667, + "learning_rate": 0.00019574541780920769, + "loss": 0.1805, + "step": 1479 + }, + { + "epoch": 0.10678595908943324, + "grad_norm": 0.08510361611843109, + "learning_rate": 0.00019574253138981095, + "loss": 0.1686, + "step": 1480 + }, + { + "epoch": 0.10685811176449367, + "grad_norm": 0.11509072035551071, + "learning_rate": 0.0001957396449704142, + "loss": 0.1218, + "step": 1481 + }, + { + "epoch": 0.1069302644395541, + "grad_norm": 0.10149054229259491, + "learning_rate": 0.00019573675855101747, + "loss": 0.175, + "step": 1482 + }, + { + "epoch": 0.10700241711461453, + "grad_norm": 0.09114514291286469, + "learning_rate": 0.00019573387213162074, + "loss": 0.1839, + "step": 1483 + }, + { + "epoch": 0.10707456978967496, + "grad_norm": 0.08684788644313812, + "learning_rate": 0.000195730985712224, + "loss": 0.2015, + "step": 1484 + }, + { + "epoch": 0.10714672246473537, + "grad_norm": 0.09528748691082001, + "learning_rate": 0.00019572809929282724, + "loss": 0.2082, + "step": 1485 + }, + { + "epoch": 0.1072188751397958, + "grad_norm": 0.11334405094385147, + "learning_rate": 0.0001957252128734305, + "loss": 0.1956, + "step": 1486 + }, + { + "epoch": 0.10729102781485624, + "grad_norm": 0.1072673425078392, + "learning_rate": 0.0001957223264540338, + "loss": 0.1729, + "step": 1487 + }, + { + "epoch": 0.10736318048991667, + "grad_norm": 0.08538205176591873, + "learning_rate": 0.00019571944003463705, + "loss": 0.2203, + "step": 1488 + }, + { + "epoch": 0.1074353331649771, + "grad_norm": 0.11392519623041153, + "learning_rate": 0.00019571655361524031, + "loss": 0.1691, + "step": 1489 + }, + { + "epoch": 0.10750748584003753, + "grad_norm": 0.10131382197141647, + "learning_rate": 0.00019571366719584355, + "loss": 0.1073, + "step": 1490 + }, + { + "epoch": 0.10757963851509794, + "grad_norm": 0.09362057596445084, + "learning_rate": 0.00019571078077644681, + "loss": 0.1805, + "step": 1491 + }, + { + "epoch": 0.10765179119015837, + "grad_norm": 0.11688234657049179, + "learning_rate": 0.00019570789435705008, + "loss": 0.1806, + "step": 1492 + }, + { + "epoch": 0.1077239438652188, + "grad_norm": 0.09200257807970047, + "learning_rate": 0.00019570500793765334, + "loss": 0.2012, + "step": 1493 + }, + { + "epoch": 0.10779609654027923, + "grad_norm": 0.08040358126163483, + "learning_rate": 0.00019570212151825663, + "loss": 0.1734, + "step": 1494 + }, + { + "epoch": 0.10786824921533966, + "grad_norm": 0.09139800816774368, + "learning_rate": 0.00019569923509885987, + "loss": 0.1674, + "step": 1495 + }, + { + "epoch": 0.10794040189040009, + "grad_norm": 0.0903816744685173, + "learning_rate": 0.00019569634867946313, + "loss": 0.1911, + "step": 1496 + }, + { + "epoch": 0.10801255456546051, + "grad_norm": 0.10706637799739838, + "learning_rate": 0.0001956934622600664, + "loss": 0.177, + "step": 1497 + }, + { + "epoch": 0.10808470724052094, + "grad_norm": 0.12878459692001343, + "learning_rate": 0.00019569057584066965, + "loss": 0.1312, + "step": 1498 + }, + { + "epoch": 0.10815685991558137, + "grad_norm": 0.09276431053876877, + "learning_rate": 0.00019568768942127292, + "loss": 0.169, + "step": 1499 + }, + { + "epoch": 0.1082290125906418, + "grad_norm": 0.12874440848827362, + "learning_rate": 0.00019568480300187618, + "loss": 0.1653, + "step": 1500 + }, + { + "epoch": 0.10830116526570223, + "grad_norm": 0.09412311762571335, + "learning_rate": 0.00019568191658247944, + "loss": 0.1277, + "step": 1501 + }, + { + "epoch": 0.10837331794076266, + "grad_norm": 0.11970049142837524, + "learning_rate": 0.0001956790301630827, + "loss": 0.1501, + "step": 1502 + }, + { + "epoch": 0.10844547061582308, + "grad_norm": 0.10821323841810226, + "learning_rate": 0.00019567614374368597, + "loss": 0.1541, + "step": 1503 + }, + { + "epoch": 0.1085176232908835, + "grad_norm": 0.08639746904373169, + "learning_rate": 0.00019567325732428923, + "loss": 0.1369, + "step": 1504 + }, + { + "epoch": 0.10858977596594394, + "grad_norm": 0.12217281758785248, + "learning_rate": 0.0001956703709048925, + "loss": 0.1869, + "step": 1505 + }, + { + "epoch": 0.10866192864100437, + "grad_norm": 0.0951509103178978, + "learning_rate": 0.00019566748448549576, + "loss": 0.16, + "step": 1506 + }, + { + "epoch": 0.1087340813160648, + "grad_norm": 0.07016367465257645, + "learning_rate": 0.000195664598066099, + "loss": 0.1543, + "step": 1507 + }, + { + "epoch": 0.10880623399112523, + "grad_norm": 0.0987938717007637, + "learning_rate": 0.00019566171164670228, + "loss": 0.1773, + "step": 1508 + }, + { + "epoch": 0.10887838666618564, + "grad_norm": 0.10179755836725235, + "learning_rate": 0.00019565882522730555, + "loss": 0.1666, + "step": 1509 + }, + { + "epoch": 0.10895053934124607, + "grad_norm": 0.0916648656129837, + "learning_rate": 0.0001956559388079088, + "loss": 0.1513, + "step": 1510 + }, + { + "epoch": 0.1090226920163065, + "grad_norm": 0.11163599789142609, + "learning_rate": 0.00019565305238851207, + "loss": 0.1825, + "step": 1511 + }, + { + "epoch": 0.10909484469136693, + "grad_norm": 0.11444026231765747, + "learning_rate": 0.0001956501659691153, + "loss": 0.2145, + "step": 1512 + }, + { + "epoch": 0.10916699736642736, + "grad_norm": 0.09019803255796432, + "learning_rate": 0.00019564727954971857, + "loss": 0.2128, + "step": 1513 + }, + { + "epoch": 0.10923915004148779, + "grad_norm": 0.08074397593736649, + "learning_rate": 0.00019564439313032183, + "loss": 0.1521, + "step": 1514 + }, + { + "epoch": 0.10931130271654822, + "grad_norm": 0.09058935195207596, + "learning_rate": 0.00019564150671092512, + "loss": 0.1599, + "step": 1515 + }, + { + "epoch": 0.10938345539160864, + "grad_norm": 0.0978584885597229, + "learning_rate": 0.0001956386202915284, + "loss": 0.2016, + "step": 1516 + }, + { + "epoch": 0.10945560806666907, + "grad_norm": 0.0993848517537117, + "learning_rate": 0.00019563573387213162, + "loss": 0.145, + "step": 1517 + }, + { + "epoch": 0.1095277607417295, + "grad_norm": 0.09804774075746536, + "learning_rate": 0.00019563284745273489, + "loss": 0.1784, + "step": 1518 + }, + { + "epoch": 0.10959991341678993, + "grad_norm": 0.10656802356243134, + "learning_rate": 0.00019562996103333815, + "loss": 0.177, + "step": 1519 + }, + { + "epoch": 0.10967206609185036, + "grad_norm": 0.11479037255048752, + "learning_rate": 0.0001956270746139414, + "loss": 0.1352, + "step": 1520 + }, + { + "epoch": 0.10974421876691079, + "grad_norm": 0.09832447022199631, + "learning_rate": 0.00019562418819454467, + "loss": 0.1955, + "step": 1521 + }, + { + "epoch": 0.1098163714419712, + "grad_norm": 0.07826386392116547, + "learning_rate": 0.00019562130177514794, + "loss": 0.1794, + "step": 1522 + }, + { + "epoch": 0.10988852411703164, + "grad_norm": 0.10118061304092407, + "learning_rate": 0.0001956184153557512, + "loss": 0.1235, + "step": 1523 + }, + { + "epoch": 0.10996067679209207, + "grad_norm": 0.08155392110347748, + "learning_rate": 0.00019561552893635446, + "loss": 0.1757, + "step": 1524 + }, + { + "epoch": 0.1100328294671525, + "grad_norm": 0.10516910254955292, + "learning_rate": 0.00019561264251695773, + "loss": 0.1905, + "step": 1525 + }, + { + "epoch": 0.11010498214221293, + "grad_norm": 0.09425098448991776, + "learning_rate": 0.000195609756097561, + "loss": 0.1523, + "step": 1526 + }, + { + "epoch": 0.11017713481727336, + "grad_norm": 0.08117407560348511, + "learning_rate": 0.00019560686967816425, + "loss": 0.1487, + "step": 1527 + }, + { + "epoch": 0.11024928749233377, + "grad_norm": 0.07771728187799454, + "learning_rate": 0.0001956039832587675, + "loss": 0.1793, + "step": 1528 + }, + { + "epoch": 0.1103214401673942, + "grad_norm": 0.10780569165945053, + "learning_rate": 0.00019560109683937078, + "loss": 0.2023, + "step": 1529 + }, + { + "epoch": 0.11039359284245463, + "grad_norm": 0.08715084940195084, + "learning_rate": 0.00019559821041997404, + "loss": 0.1709, + "step": 1530 + }, + { + "epoch": 0.11046574551751506, + "grad_norm": 0.09841153770685196, + "learning_rate": 0.0001955953240005773, + "loss": 0.1775, + "step": 1531 + }, + { + "epoch": 0.11053789819257549, + "grad_norm": 0.08543652296066284, + "learning_rate": 0.00019559243758118057, + "loss": 0.1415, + "step": 1532 + }, + { + "epoch": 0.11061005086763592, + "grad_norm": 0.11666995286941528, + "learning_rate": 0.0001955895511617838, + "loss": 0.168, + "step": 1533 + }, + { + "epoch": 0.11068220354269634, + "grad_norm": 0.1458568274974823, + "learning_rate": 0.00019558666474238707, + "loss": 0.1881, + "step": 1534 + }, + { + "epoch": 0.11075435621775677, + "grad_norm": 0.08567783981561661, + "learning_rate": 0.00019558377832299033, + "loss": 0.1388, + "step": 1535 + }, + { + "epoch": 0.1108265088928172, + "grad_norm": 0.09290947765111923, + "learning_rate": 0.00019558089190359362, + "loss": 0.1727, + "step": 1536 + }, + { + "epoch": 0.11089866156787763, + "grad_norm": 0.09171988070011139, + "learning_rate": 0.00019557800548419688, + "loss": 0.1762, + "step": 1537 + }, + { + "epoch": 0.11097081424293806, + "grad_norm": 0.1283859759569168, + "learning_rate": 0.00019557511906480012, + "loss": 0.1589, + "step": 1538 + }, + { + "epoch": 0.11104296691799849, + "grad_norm": 0.10872713476419449, + "learning_rate": 0.00019557223264540338, + "loss": 0.1557, + "step": 1539 + }, + { + "epoch": 0.1111151195930589, + "grad_norm": 0.10511759668588638, + "learning_rate": 0.00019556934622600664, + "loss": 0.1625, + "step": 1540 + }, + { + "epoch": 0.11118727226811934, + "grad_norm": 0.13609780371189117, + "learning_rate": 0.0001955664598066099, + "loss": 0.2419, + "step": 1541 + }, + { + "epoch": 0.11125942494317977, + "grad_norm": 0.09898494929075241, + "learning_rate": 0.00019556357338721317, + "loss": 0.1473, + "step": 1542 + }, + { + "epoch": 0.1113315776182402, + "grad_norm": 0.09187084436416626, + "learning_rate": 0.00019556068696781643, + "loss": 0.158, + "step": 1543 + }, + { + "epoch": 0.11140373029330063, + "grad_norm": 0.08768919110298157, + "learning_rate": 0.0001955578005484197, + "loss": 0.1775, + "step": 1544 + }, + { + "epoch": 0.11147588296836106, + "grad_norm": 0.10175874084234238, + "learning_rate": 0.00019555491412902296, + "loss": 0.1393, + "step": 1545 + }, + { + "epoch": 0.11154803564342149, + "grad_norm": 0.07539024949073792, + "learning_rate": 0.00019555202770962622, + "loss": 0.1957, + "step": 1546 + }, + { + "epoch": 0.1116201883184819, + "grad_norm": 0.11006683856248856, + "learning_rate": 0.00019554914129022948, + "loss": 0.1453, + "step": 1547 + }, + { + "epoch": 0.11169234099354233, + "grad_norm": 0.1028423011302948, + "learning_rate": 0.00019554625487083275, + "loss": 0.2072, + "step": 1548 + }, + { + "epoch": 0.11176449366860276, + "grad_norm": 0.09654255211353302, + "learning_rate": 0.00019554336845143598, + "loss": 0.1575, + "step": 1549 + }, + { + "epoch": 0.11183664634366319, + "grad_norm": 0.09816180914640427, + "learning_rate": 0.00019554048203203927, + "loss": 0.1517, + "step": 1550 + }, + { + "epoch": 0.11190879901872362, + "grad_norm": 0.08485947549343109, + "learning_rate": 0.00019553759561264253, + "loss": 0.1849, + "step": 1551 + }, + { + "epoch": 0.11198095169378405, + "grad_norm": 0.09766913950443268, + "learning_rate": 0.0001955347091932458, + "loss": 0.1676, + "step": 1552 + }, + { + "epoch": 0.11205310436884447, + "grad_norm": 0.11628931015729904, + "learning_rate": 0.00019553182277384906, + "loss": 0.1864, + "step": 1553 + }, + { + "epoch": 0.1121252570439049, + "grad_norm": 0.10539587587118149, + "learning_rate": 0.0001955289363544523, + "loss": 0.1292, + "step": 1554 + }, + { + "epoch": 0.11219740971896533, + "grad_norm": 0.1077449768781662, + "learning_rate": 0.00019552604993505556, + "loss": 0.1268, + "step": 1555 + }, + { + "epoch": 0.11226956239402576, + "grad_norm": 0.11250253021717072, + "learning_rate": 0.00019552316351565882, + "loss": 0.1739, + "step": 1556 + }, + { + "epoch": 0.11234171506908619, + "grad_norm": 0.0985347256064415, + "learning_rate": 0.0001955202770962621, + "loss": 0.1718, + "step": 1557 + }, + { + "epoch": 0.11241386774414662, + "grad_norm": 0.10934750735759735, + "learning_rate": 0.00019551739067686538, + "loss": 0.1863, + "step": 1558 + }, + { + "epoch": 0.11248602041920704, + "grad_norm": 0.10609246790409088, + "learning_rate": 0.0001955145042574686, + "loss": 0.1707, + "step": 1559 + }, + { + "epoch": 0.11255817309426747, + "grad_norm": 0.10776393860578537, + "learning_rate": 0.00019551161783807187, + "loss": 0.181, + "step": 1560 + }, + { + "epoch": 0.1126303257693279, + "grad_norm": 0.1034221202135086, + "learning_rate": 0.00019550873141867514, + "loss": 0.1665, + "step": 1561 + }, + { + "epoch": 0.11270247844438833, + "grad_norm": 0.11250422894954681, + "learning_rate": 0.0001955058449992784, + "loss": 0.1729, + "step": 1562 + }, + { + "epoch": 0.11277463111944876, + "grad_norm": 0.10397494584321976, + "learning_rate": 0.00019550295857988166, + "loss": 0.1848, + "step": 1563 + }, + { + "epoch": 0.11284678379450919, + "grad_norm": 0.11455897241830826, + "learning_rate": 0.00019550007216048493, + "loss": 0.1534, + "step": 1564 + }, + { + "epoch": 0.1129189364695696, + "grad_norm": 0.07431942969560623, + "learning_rate": 0.0001954971857410882, + "loss": 0.1523, + "step": 1565 + }, + { + "epoch": 0.11299108914463003, + "grad_norm": 0.11153899133205414, + "learning_rate": 0.00019549429932169145, + "loss": 0.1257, + "step": 1566 + }, + { + "epoch": 0.11306324181969046, + "grad_norm": 0.1270129382610321, + "learning_rate": 0.00019549141290229471, + "loss": 0.1419, + "step": 1567 + }, + { + "epoch": 0.1131353944947509, + "grad_norm": 0.11523669213056564, + "learning_rate": 0.00019548852648289798, + "loss": 0.1652, + "step": 1568 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.11013611406087875, + "learning_rate": 0.00019548564006350124, + "loss": 0.1768, + "step": 1569 + }, + { + "epoch": 0.11327969984487175, + "grad_norm": 0.12150274962186813, + "learning_rate": 0.00019548275364410448, + "loss": 0.184, + "step": 1570 + }, + { + "epoch": 0.11335185251993217, + "grad_norm": 0.09226622432470322, + "learning_rate": 0.00019547986722470777, + "loss": 0.2027, + "step": 1571 + }, + { + "epoch": 0.1134240051949926, + "grad_norm": 0.1071082353591919, + "learning_rate": 0.00019547698080531103, + "loss": 0.1827, + "step": 1572 + }, + { + "epoch": 0.11349615787005303, + "grad_norm": 0.10402612388134003, + "learning_rate": 0.0001954740943859143, + "loss": 0.1321, + "step": 1573 + }, + { + "epoch": 0.11356831054511346, + "grad_norm": 0.09049825370311737, + "learning_rate": 0.00019547120796651755, + "loss": 0.1583, + "step": 1574 + }, + { + "epoch": 0.11364046322017389, + "grad_norm": 0.10656756907701492, + "learning_rate": 0.0001954683215471208, + "loss": 0.1589, + "step": 1575 + }, + { + "epoch": 0.11371261589523432, + "grad_norm": 0.0985848605632782, + "learning_rate": 0.00019546543512772405, + "loss": 0.1589, + "step": 1576 + }, + { + "epoch": 0.11378476857029475, + "grad_norm": 0.11201207339763641, + "learning_rate": 0.00019546254870832732, + "loss": 0.1625, + "step": 1577 + }, + { + "epoch": 0.11385692124535517, + "grad_norm": 0.12026341259479523, + "learning_rate": 0.0001954596622889306, + "loss": 0.1625, + "step": 1578 + }, + { + "epoch": 0.1139290739204156, + "grad_norm": 0.12375893443822861, + "learning_rate": 0.00019545677586953387, + "loss": 0.1375, + "step": 1579 + }, + { + "epoch": 0.11400122659547603, + "grad_norm": 0.10440582782030106, + "learning_rate": 0.0001954538894501371, + "loss": 0.1857, + "step": 1580 + }, + { + "epoch": 0.11407337927053646, + "grad_norm": 0.1234811544418335, + "learning_rate": 0.00019545100303074037, + "loss": 0.2006, + "step": 1581 + }, + { + "epoch": 0.11414553194559689, + "grad_norm": 0.07862017303705215, + "learning_rate": 0.00019544811661134363, + "loss": 0.1567, + "step": 1582 + }, + { + "epoch": 0.11421768462065732, + "grad_norm": 0.10096339136362076, + "learning_rate": 0.0001954452301919469, + "loss": 0.1572, + "step": 1583 + }, + { + "epoch": 0.11428983729571773, + "grad_norm": 0.08745244890451431, + "learning_rate": 0.00019544234377255016, + "loss": 0.1643, + "step": 1584 + }, + { + "epoch": 0.11436198997077816, + "grad_norm": 0.11481693387031555, + "learning_rate": 0.00019543945735315342, + "loss": 0.1985, + "step": 1585 + }, + { + "epoch": 0.1144341426458386, + "grad_norm": 0.09870748966932297, + "learning_rate": 0.00019543657093375668, + "loss": 0.2008, + "step": 1586 + }, + { + "epoch": 0.11450629532089902, + "grad_norm": 0.07589593529701233, + "learning_rate": 0.00019543368451435995, + "loss": 0.1787, + "step": 1587 + }, + { + "epoch": 0.11457844799595945, + "grad_norm": 0.08917496353387833, + "learning_rate": 0.0001954307980949632, + "loss": 0.1816, + "step": 1588 + }, + { + "epoch": 0.11465060067101988, + "grad_norm": 0.105776846408844, + "learning_rate": 0.00019542791167556647, + "loss": 0.1531, + "step": 1589 + }, + { + "epoch": 0.1147227533460803, + "grad_norm": 0.10567320138216019, + "learning_rate": 0.00019542502525616973, + "loss": 0.1735, + "step": 1590 + }, + { + "epoch": 0.11479490602114073, + "grad_norm": 0.12525592744350433, + "learning_rate": 0.00019542213883677297, + "loss": 0.2097, + "step": 1591 + }, + { + "epoch": 0.11486705869620116, + "grad_norm": 0.07959985733032227, + "learning_rate": 0.00019541925241737626, + "loss": 0.1516, + "step": 1592 + }, + { + "epoch": 0.11493921137126159, + "grad_norm": 0.08625461161136627, + "learning_rate": 0.00019541636599797952, + "loss": 0.1585, + "step": 1593 + }, + { + "epoch": 0.11501136404632202, + "grad_norm": 0.13847310841083527, + "learning_rate": 0.00019541347957858279, + "loss": 0.1463, + "step": 1594 + }, + { + "epoch": 0.11508351672138245, + "grad_norm": 0.10553795844316483, + "learning_rate": 0.00019541059315918605, + "loss": 0.1763, + "step": 1595 + }, + { + "epoch": 0.11515566939644287, + "grad_norm": 0.12277592718601227, + "learning_rate": 0.00019540770673978928, + "loss": 0.1423, + "step": 1596 + }, + { + "epoch": 0.1152278220715033, + "grad_norm": 0.11685627698898315, + "learning_rate": 0.00019540482032039255, + "loss": 0.1841, + "step": 1597 + }, + { + "epoch": 0.11529997474656373, + "grad_norm": 0.08955440670251846, + "learning_rate": 0.0001954019339009958, + "loss": 0.1408, + "step": 1598 + }, + { + "epoch": 0.11537212742162416, + "grad_norm": 0.10267322510480881, + "learning_rate": 0.0001953990474815991, + "loss": 0.1594, + "step": 1599 + }, + { + "epoch": 0.11544428009668459, + "grad_norm": 0.10553912818431854, + "learning_rate": 0.00019539616106220236, + "loss": 0.1533, + "step": 1600 + }, + { + "epoch": 0.11551643277174502, + "grad_norm": 0.11601997911930084, + "learning_rate": 0.0001953932746428056, + "loss": 0.1846, + "step": 1601 + }, + { + "epoch": 0.11558858544680543, + "grad_norm": 0.0766264870762825, + "learning_rate": 0.00019539038822340886, + "loss": 0.1782, + "step": 1602 + }, + { + "epoch": 0.11566073812186586, + "grad_norm": 0.09016980975866318, + "learning_rate": 0.00019538750180401213, + "loss": 0.1318, + "step": 1603 + }, + { + "epoch": 0.1157328907969263, + "grad_norm": 0.11082588136196136, + "learning_rate": 0.0001953846153846154, + "loss": 0.1902, + "step": 1604 + }, + { + "epoch": 0.11580504347198672, + "grad_norm": 0.11670586466789246, + "learning_rate": 0.00019538172896521865, + "loss": 0.1491, + "step": 1605 + }, + { + "epoch": 0.11587719614704715, + "grad_norm": 0.10419308394193649, + "learning_rate": 0.00019537884254582191, + "loss": 0.1363, + "step": 1606 + }, + { + "epoch": 0.11594934882210758, + "grad_norm": 0.08640503883361816, + "learning_rate": 0.00019537595612642518, + "loss": 0.1925, + "step": 1607 + }, + { + "epoch": 0.11602150149716801, + "grad_norm": 0.08510327339172363, + "learning_rate": 0.00019537306970702844, + "loss": 0.1509, + "step": 1608 + }, + { + "epoch": 0.11609365417222843, + "grad_norm": 0.08421725779771805, + "learning_rate": 0.0001953701832876317, + "loss": 0.1933, + "step": 1609 + }, + { + "epoch": 0.11616580684728886, + "grad_norm": 0.13186711072921753, + "learning_rate": 0.00019536729686823497, + "loss": 0.1862, + "step": 1610 + }, + { + "epoch": 0.11623795952234929, + "grad_norm": 0.10276197642087936, + "learning_rate": 0.00019536441044883823, + "loss": 0.1773, + "step": 1611 + }, + { + "epoch": 0.11631011219740972, + "grad_norm": 0.09136662632226944, + "learning_rate": 0.0001953615240294415, + "loss": 0.1462, + "step": 1612 + }, + { + "epoch": 0.11638226487247015, + "grad_norm": 0.10753851383924484, + "learning_rate": 0.00019535863761004475, + "loss": 0.1734, + "step": 1613 + }, + { + "epoch": 0.11645441754753058, + "grad_norm": 0.08567678928375244, + "learning_rate": 0.00019535575119064802, + "loss": 0.1715, + "step": 1614 + }, + { + "epoch": 0.116526570222591, + "grad_norm": 0.10128544270992279, + "learning_rate": 0.00019535286477125128, + "loss": 0.1622, + "step": 1615 + }, + { + "epoch": 0.11659872289765143, + "grad_norm": 0.09036710858345032, + "learning_rate": 0.00019534997835185454, + "loss": 0.1923, + "step": 1616 + }, + { + "epoch": 0.11667087557271186, + "grad_norm": 0.09953843057155609, + "learning_rate": 0.0001953470919324578, + "loss": 0.1499, + "step": 1617 + }, + { + "epoch": 0.11674302824777229, + "grad_norm": 0.10273440927267075, + "learning_rate": 0.00019534420551306104, + "loss": 0.1543, + "step": 1618 + }, + { + "epoch": 0.11681518092283272, + "grad_norm": 0.1256304830312729, + "learning_rate": 0.0001953413190936643, + "loss": 0.1577, + "step": 1619 + }, + { + "epoch": 0.11688733359789315, + "grad_norm": 0.1289820522069931, + "learning_rate": 0.0001953384326742676, + "loss": 0.1909, + "step": 1620 + }, + { + "epoch": 0.11695948627295356, + "grad_norm": 0.11304235458374023, + "learning_rate": 0.00019533554625487086, + "loss": 0.1849, + "step": 1621 + }, + { + "epoch": 0.117031638948014, + "grad_norm": 0.11579146981239319, + "learning_rate": 0.00019533265983547412, + "loss": 0.153, + "step": 1622 + }, + { + "epoch": 0.11710379162307442, + "grad_norm": 0.0958210751414299, + "learning_rate": 0.00019532977341607736, + "loss": 0.118, + "step": 1623 + }, + { + "epoch": 0.11717594429813485, + "grad_norm": 0.1111057698726654, + "learning_rate": 0.00019532688699668062, + "loss": 0.1566, + "step": 1624 + }, + { + "epoch": 0.11724809697319528, + "grad_norm": 0.12752115726470947, + "learning_rate": 0.00019532400057728388, + "loss": 0.163, + "step": 1625 + }, + { + "epoch": 0.11732024964825571, + "grad_norm": 0.09364619851112366, + "learning_rate": 0.00019532111415788715, + "loss": 0.1347, + "step": 1626 + }, + { + "epoch": 0.11739240232331613, + "grad_norm": 0.10146146267652512, + "learning_rate": 0.00019531822773849044, + "loss": 0.2043, + "step": 1627 + }, + { + "epoch": 0.11746455499837656, + "grad_norm": 0.09532111883163452, + "learning_rate": 0.00019531534131909367, + "loss": 0.1745, + "step": 1628 + }, + { + "epoch": 0.11753670767343699, + "grad_norm": 0.14005210995674133, + "learning_rate": 0.00019531245489969693, + "loss": 0.1598, + "step": 1629 + }, + { + "epoch": 0.11760886034849742, + "grad_norm": 0.08359965682029724, + "learning_rate": 0.0001953095684803002, + "loss": 0.1939, + "step": 1630 + }, + { + "epoch": 0.11768101302355785, + "grad_norm": 0.10559289902448654, + "learning_rate": 0.00019530668206090346, + "loss": 0.2026, + "step": 1631 + }, + { + "epoch": 0.11775316569861828, + "grad_norm": 0.12911143898963928, + "learning_rate": 0.00019530379564150672, + "loss": 0.1926, + "step": 1632 + }, + { + "epoch": 0.1178253183736787, + "grad_norm": 0.10912565886974335, + "learning_rate": 0.00019530090922210999, + "loss": 0.1911, + "step": 1633 + }, + { + "epoch": 0.11789747104873913, + "grad_norm": 0.0978940799832344, + "learning_rate": 0.00019529802280271325, + "loss": 0.1991, + "step": 1634 + }, + { + "epoch": 0.11796962372379956, + "grad_norm": 0.09110507369041443, + "learning_rate": 0.0001952951363833165, + "loss": 0.1923, + "step": 1635 + }, + { + "epoch": 0.11804177639885999, + "grad_norm": 0.1002359539270401, + "learning_rate": 0.00019529224996391977, + "loss": 0.1648, + "step": 1636 + }, + { + "epoch": 0.11811392907392042, + "grad_norm": 0.1239672303199768, + "learning_rate": 0.00019528936354452304, + "loss": 0.1559, + "step": 1637 + }, + { + "epoch": 0.11818608174898085, + "grad_norm": 0.08740098774433136, + "learning_rate": 0.0001952864771251263, + "loss": 0.1789, + "step": 1638 + }, + { + "epoch": 0.11825823442404128, + "grad_norm": 0.10871398448944092, + "learning_rate": 0.00019528359070572954, + "loss": 0.178, + "step": 1639 + }, + { + "epoch": 0.1183303870991017, + "grad_norm": 0.11047673970460892, + "learning_rate": 0.0001952807042863328, + "loss": 0.2121, + "step": 1640 + }, + { + "epoch": 0.11840253977416212, + "grad_norm": 0.10063087940216064, + "learning_rate": 0.0001952778178669361, + "loss": 0.167, + "step": 1641 + }, + { + "epoch": 0.11847469244922255, + "grad_norm": 0.09995663166046143, + "learning_rate": 0.00019527493144753935, + "loss": 0.1805, + "step": 1642 + }, + { + "epoch": 0.11854684512428298, + "grad_norm": 0.10936806350946426, + "learning_rate": 0.00019527204502814262, + "loss": 0.1924, + "step": 1643 + }, + { + "epoch": 0.11861899779934341, + "grad_norm": 0.08394020050764084, + "learning_rate": 0.00019526915860874585, + "loss": 0.154, + "step": 1644 + }, + { + "epoch": 0.11869115047440384, + "grad_norm": 0.11076677590608597, + "learning_rate": 0.00019526627218934911, + "loss": 0.1793, + "step": 1645 + }, + { + "epoch": 0.11876330314946426, + "grad_norm": 0.10604029893875122, + "learning_rate": 0.00019526338576995238, + "loss": 0.1375, + "step": 1646 + }, + { + "epoch": 0.11883545582452469, + "grad_norm": 0.1203065738081932, + "learning_rate": 0.00019526049935055564, + "loss": 0.2158, + "step": 1647 + }, + { + "epoch": 0.11890760849958512, + "grad_norm": 0.09037487953901291, + "learning_rate": 0.00019525761293115893, + "loss": 0.1883, + "step": 1648 + }, + { + "epoch": 0.11897976117464555, + "grad_norm": 0.07776744663715363, + "learning_rate": 0.00019525472651176217, + "loss": 0.1345, + "step": 1649 + }, + { + "epoch": 0.11905191384970598, + "grad_norm": 0.07779388129711151, + "learning_rate": 0.00019525184009236543, + "loss": 0.1715, + "step": 1650 + }, + { + "epoch": 0.11912406652476641, + "grad_norm": 0.08709803223609924, + "learning_rate": 0.0001952489536729687, + "loss": 0.1528, + "step": 1651 + }, + { + "epoch": 0.11919621919982683, + "grad_norm": 0.11145090311765671, + "learning_rate": 0.00019524606725357195, + "loss": 0.1767, + "step": 1652 + }, + { + "epoch": 0.11926837187488726, + "grad_norm": 0.10255642980337143, + "learning_rate": 0.00019524318083417522, + "loss": 0.1929, + "step": 1653 + }, + { + "epoch": 0.11934052454994769, + "grad_norm": 0.09450104832649231, + "learning_rate": 0.00019524029441477848, + "loss": 0.1896, + "step": 1654 + }, + { + "epoch": 0.11941267722500812, + "grad_norm": 0.11130858957767487, + "learning_rate": 0.00019523740799538174, + "loss": 0.1638, + "step": 1655 + }, + { + "epoch": 0.11948482990006855, + "grad_norm": 0.08688686043024063, + "learning_rate": 0.000195234521575985, + "loss": 0.1483, + "step": 1656 + }, + { + "epoch": 0.11955698257512898, + "grad_norm": 0.11365038901567459, + "learning_rate": 0.00019523163515658827, + "loss": 0.184, + "step": 1657 + }, + { + "epoch": 0.1196291352501894, + "grad_norm": 0.10233087092638016, + "learning_rate": 0.00019522874873719153, + "loss": 0.1884, + "step": 1658 + }, + { + "epoch": 0.11970128792524982, + "grad_norm": 0.09503836929798126, + "learning_rate": 0.0001952258623177948, + "loss": 0.1698, + "step": 1659 + }, + { + "epoch": 0.11977344060031025, + "grad_norm": 0.07523205131292343, + "learning_rate": 0.00019522297589839803, + "loss": 0.1337, + "step": 1660 + }, + { + "epoch": 0.11984559327537068, + "grad_norm": 0.08313417434692383, + "learning_rate": 0.0001952200894790013, + "loss": 0.1379, + "step": 1661 + }, + { + "epoch": 0.11991774595043111, + "grad_norm": 0.09840066730976105, + "learning_rate": 0.00019521720305960458, + "loss": 0.173, + "step": 1662 + }, + { + "epoch": 0.11998989862549155, + "grad_norm": 0.12853272259235382, + "learning_rate": 0.00019521431664020785, + "loss": 0.1646, + "step": 1663 + }, + { + "epoch": 0.12006205130055196, + "grad_norm": 0.10483130812644958, + "learning_rate": 0.0001952114302208111, + "loss": 0.1382, + "step": 1664 + }, + { + "epoch": 0.12013420397561239, + "grad_norm": 0.14726237952709198, + "learning_rate": 0.00019520854380141435, + "loss": 0.225, + "step": 1665 + }, + { + "epoch": 0.12020635665067282, + "grad_norm": 0.1127578616142273, + "learning_rate": 0.0001952056573820176, + "loss": 0.1695, + "step": 1666 + }, + { + "epoch": 0.12027850932573325, + "grad_norm": 0.11055461317300797, + "learning_rate": 0.00019520277096262087, + "loss": 0.1303, + "step": 1667 + }, + { + "epoch": 0.12035066200079368, + "grad_norm": 0.11091664433479309, + "learning_rate": 0.00019519988454322413, + "loss": 0.1941, + "step": 1668 + }, + { + "epoch": 0.12042281467585411, + "grad_norm": 0.10414857417345047, + "learning_rate": 0.0001951969981238274, + "loss": 0.1641, + "step": 1669 + }, + { + "epoch": 0.12049496735091454, + "grad_norm": 0.09459809213876724, + "learning_rate": 0.00019519411170443066, + "loss": 0.1808, + "step": 1670 + }, + { + "epoch": 0.12056712002597496, + "grad_norm": 0.09319958090782166, + "learning_rate": 0.00019519122528503392, + "loss": 0.1927, + "step": 1671 + }, + { + "epoch": 0.12063927270103539, + "grad_norm": 0.11097006499767303, + "learning_rate": 0.00019518833886563719, + "loss": 0.2317, + "step": 1672 + }, + { + "epoch": 0.12071142537609582, + "grad_norm": 0.11948166787624359, + "learning_rate": 0.00019518545244624045, + "loss": 0.1461, + "step": 1673 + }, + { + "epoch": 0.12078357805115625, + "grad_norm": 0.14143267273902893, + "learning_rate": 0.0001951825660268437, + "loss": 0.1854, + "step": 1674 + }, + { + "epoch": 0.12085573072621668, + "grad_norm": 0.08225080370903015, + "learning_rate": 0.00019517967960744697, + "loss": 0.1516, + "step": 1675 + }, + { + "epoch": 0.12092788340127711, + "grad_norm": 0.08731880784034729, + "learning_rate": 0.0001951767931880502, + "loss": 0.1624, + "step": 1676 + }, + { + "epoch": 0.12100003607633752, + "grad_norm": 0.11483090370893478, + "learning_rate": 0.0001951739067686535, + "loss": 0.2156, + "step": 1677 + }, + { + "epoch": 0.12107218875139795, + "grad_norm": 0.10485479235649109, + "learning_rate": 0.00019517102034925676, + "loss": 0.165, + "step": 1678 + }, + { + "epoch": 0.12114434142645839, + "grad_norm": 0.11514680087566376, + "learning_rate": 0.00019516813392986003, + "loss": 0.1726, + "step": 1679 + }, + { + "epoch": 0.12121649410151882, + "grad_norm": 0.10860565304756165, + "learning_rate": 0.0001951652475104633, + "loss": 0.1405, + "step": 1680 + }, + { + "epoch": 0.12128864677657925, + "grad_norm": 0.10668021440505981, + "learning_rate": 0.00019516236109106652, + "loss": 0.1898, + "step": 1681 + }, + { + "epoch": 0.12136079945163968, + "grad_norm": 0.0982346162199974, + "learning_rate": 0.0001951594746716698, + "loss": 0.1302, + "step": 1682 + }, + { + "epoch": 0.12143295212670009, + "grad_norm": 0.10028962790966034, + "learning_rate": 0.00019515658825227305, + "loss": 0.1509, + "step": 1683 + }, + { + "epoch": 0.12150510480176052, + "grad_norm": 0.10393217951059341, + "learning_rate": 0.00019515370183287634, + "loss": 0.1409, + "step": 1684 + }, + { + "epoch": 0.12157725747682095, + "grad_norm": 0.09106632322072983, + "learning_rate": 0.0001951508154134796, + "loss": 0.1388, + "step": 1685 + }, + { + "epoch": 0.12164941015188138, + "grad_norm": 0.10406849533319473, + "learning_rate": 0.00019514792899408284, + "loss": 0.2068, + "step": 1686 + }, + { + "epoch": 0.12172156282694181, + "grad_norm": 0.1421014368534088, + "learning_rate": 0.0001951450425746861, + "loss": 0.1819, + "step": 1687 + }, + { + "epoch": 0.12179371550200224, + "grad_norm": 0.09534769505262375, + "learning_rate": 0.00019514215615528937, + "loss": 0.1508, + "step": 1688 + }, + { + "epoch": 0.12186586817706266, + "grad_norm": 0.12619756162166595, + "learning_rate": 0.00019513926973589263, + "loss": 0.164, + "step": 1689 + }, + { + "epoch": 0.12193802085212309, + "grad_norm": 0.1002705991268158, + "learning_rate": 0.0001951363833164959, + "loss": 0.1623, + "step": 1690 + }, + { + "epoch": 0.12201017352718352, + "grad_norm": 0.08136925101280212, + "learning_rate": 0.00019513349689709915, + "loss": 0.1039, + "step": 1691 + }, + { + "epoch": 0.12208232620224395, + "grad_norm": 0.08619451522827148, + "learning_rate": 0.00019513061047770242, + "loss": 0.1516, + "step": 1692 + }, + { + "epoch": 0.12215447887730438, + "grad_norm": 0.0915481299161911, + "learning_rate": 0.00019512772405830568, + "loss": 0.163, + "step": 1693 + }, + { + "epoch": 0.12222663155236481, + "grad_norm": 0.10088937729597092, + "learning_rate": 0.00019512483763890894, + "loss": 0.1918, + "step": 1694 + }, + { + "epoch": 0.12229878422742523, + "grad_norm": 0.10313641279935837, + "learning_rate": 0.0001951219512195122, + "loss": 0.1931, + "step": 1695 + }, + { + "epoch": 0.12237093690248566, + "grad_norm": 0.11182332783937454, + "learning_rate": 0.00019511906480011547, + "loss": 0.1783, + "step": 1696 + }, + { + "epoch": 0.12244308957754609, + "grad_norm": 0.08624817430973053, + "learning_rate": 0.0001951161783807187, + "loss": 0.1667, + "step": 1697 + }, + { + "epoch": 0.12251524225260652, + "grad_norm": 0.09656798839569092, + "learning_rate": 0.000195113291961322, + "loss": 0.1418, + "step": 1698 + }, + { + "epoch": 0.12258739492766695, + "grad_norm": 0.10291645675897598, + "learning_rate": 0.00019511040554192526, + "loss": 0.1306, + "step": 1699 + }, + { + "epoch": 0.12265954760272738, + "grad_norm": 0.08263203501701355, + "learning_rate": 0.00019510751912252852, + "loss": 0.223, + "step": 1700 + }, + { + "epoch": 0.1227317002777878, + "grad_norm": 0.09930901229381561, + "learning_rate": 0.00019510463270313178, + "loss": 0.1213, + "step": 1701 + }, + { + "epoch": 0.12280385295284822, + "grad_norm": 0.08658596873283386, + "learning_rate": 0.00019510174628373502, + "loss": 0.1608, + "step": 1702 + }, + { + "epoch": 0.12287600562790865, + "grad_norm": 0.12086457759141922, + "learning_rate": 0.00019509885986433828, + "loss": 0.1929, + "step": 1703 + }, + { + "epoch": 0.12294815830296908, + "grad_norm": 0.09840729832649231, + "learning_rate": 0.00019509597344494154, + "loss": 0.1773, + "step": 1704 + }, + { + "epoch": 0.12302031097802951, + "grad_norm": 0.17573504149913788, + "learning_rate": 0.00019509308702554484, + "loss": 0.2211, + "step": 1705 + }, + { + "epoch": 0.12309246365308994, + "grad_norm": 0.11827810853719711, + "learning_rate": 0.0001950902006061481, + "loss": 0.1823, + "step": 1706 + }, + { + "epoch": 0.12316461632815037, + "grad_norm": 0.11534013599157333, + "learning_rate": 0.00019508731418675133, + "loss": 0.196, + "step": 1707 + }, + { + "epoch": 0.12323676900321079, + "grad_norm": 0.10985036194324493, + "learning_rate": 0.0001950844277673546, + "loss": 0.1829, + "step": 1708 + }, + { + "epoch": 0.12330892167827122, + "grad_norm": 0.09258478134870529, + "learning_rate": 0.00019508154134795786, + "loss": 0.2058, + "step": 1709 + }, + { + "epoch": 0.12338107435333165, + "grad_norm": 0.10214863717556, + "learning_rate": 0.00019507865492856112, + "loss": 0.2164, + "step": 1710 + }, + { + "epoch": 0.12345322702839208, + "grad_norm": 0.16184085607528687, + "learning_rate": 0.00019507576850916439, + "loss": 0.203, + "step": 1711 + }, + { + "epoch": 0.12352537970345251, + "grad_norm": 0.10491573065519333, + "learning_rate": 0.00019507288208976765, + "loss": 0.1248, + "step": 1712 + }, + { + "epoch": 0.12359753237851294, + "grad_norm": 0.11957216262817383, + "learning_rate": 0.0001950699956703709, + "loss": 0.1726, + "step": 1713 + }, + { + "epoch": 0.12366968505357336, + "grad_norm": 0.1047922745347023, + "learning_rate": 0.00019506710925097417, + "loss": 0.2082, + "step": 1714 + }, + { + "epoch": 0.12374183772863379, + "grad_norm": 0.10638313740491867, + "learning_rate": 0.00019506422283157744, + "loss": 0.1508, + "step": 1715 + }, + { + "epoch": 0.12381399040369422, + "grad_norm": 0.12371479719877243, + "learning_rate": 0.0001950613364121807, + "loss": 0.1322, + "step": 1716 + }, + { + "epoch": 0.12388614307875465, + "grad_norm": 0.13481566309928894, + "learning_rate": 0.00019505844999278396, + "loss": 0.1856, + "step": 1717 + }, + { + "epoch": 0.12395829575381508, + "grad_norm": 0.14135459065437317, + "learning_rate": 0.0001950555635733872, + "loss": 0.1781, + "step": 1718 + }, + { + "epoch": 0.1240304484288755, + "grad_norm": 0.13202592730522156, + "learning_rate": 0.0001950526771539905, + "loss": 0.1651, + "step": 1719 + }, + { + "epoch": 0.12410260110393592, + "grad_norm": 0.09290815889835358, + "learning_rate": 0.00019504979073459375, + "loss": 0.1565, + "step": 1720 + }, + { + "epoch": 0.12417475377899635, + "grad_norm": 0.10955124348402023, + "learning_rate": 0.00019504690431519701, + "loss": 0.1963, + "step": 1721 + }, + { + "epoch": 0.12424690645405678, + "grad_norm": 0.10231587290763855, + "learning_rate": 0.00019504401789580028, + "loss": 0.2123, + "step": 1722 + }, + { + "epoch": 0.12431905912911721, + "grad_norm": 0.1099744439125061, + "learning_rate": 0.0001950411314764035, + "loss": 0.1897, + "step": 1723 + }, + { + "epoch": 0.12439121180417764, + "grad_norm": 0.11467958241701126, + "learning_rate": 0.00019503824505700678, + "loss": 0.2005, + "step": 1724 + }, + { + "epoch": 0.12446336447923807, + "grad_norm": 0.10473181307315826, + "learning_rate": 0.00019503535863761004, + "loss": 0.2056, + "step": 1725 + }, + { + "epoch": 0.12453551715429849, + "grad_norm": 0.11245948821306229, + "learning_rate": 0.00019503247221821333, + "loss": 0.1327, + "step": 1726 + }, + { + "epoch": 0.12460766982935892, + "grad_norm": 0.10495847463607788, + "learning_rate": 0.0001950295857988166, + "loss": 0.1853, + "step": 1727 + }, + { + "epoch": 0.12467982250441935, + "grad_norm": 0.12626883387565613, + "learning_rate": 0.00019502669937941986, + "loss": 0.216, + "step": 1728 + }, + { + "epoch": 0.12475197517947978, + "grad_norm": 0.10064269602298737, + "learning_rate": 0.0001950238129600231, + "loss": 0.115, + "step": 1729 + }, + { + "epoch": 0.12482412785454021, + "grad_norm": 0.09452646970748901, + "learning_rate": 0.00019502092654062635, + "loss": 0.1808, + "step": 1730 + }, + { + "epoch": 0.12489628052960064, + "grad_norm": 0.1017569899559021, + "learning_rate": 0.00019501804012122962, + "loss": 0.1619, + "step": 1731 + }, + { + "epoch": 0.12496843320466107, + "grad_norm": 0.10112976282835007, + "learning_rate": 0.00019501515370183288, + "loss": 0.1686, + "step": 1732 + }, + { + "epoch": 0.1250405858797215, + "grad_norm": 0.10376875102519989, + "learning_rate": 0.00019501226728243617, + "loss": 0.1839, + "step": 1733 + }, + { + "epoch": 0.12511273855478192, + "grad_norm": 0.11961390823125839, + "learning_rate": 0.0001950093808630394, + "loss": 0.2095, + "step": 1734 + }, + { + "epoch": 0.12518489122984236, + "grad_norm": 0.10202952474355698, + "learning_rate": 0.00019500649444364267, + "loss": 0.1537, + "step": 1735 + }, + { + "epoch": 0.12525704390490278, + "grad_norm": 0.08299683779478073, + "learning_rate": 0.00019500360802424593, + "loss": 0.184, + "step": 1736 + }, + { + "epoch": 0.1253291965799632, + "grad_norm": 0.09358169138431549, + "learning_rate": 0.0001950007216048492, + "loss": 0.1424, + "step": 1737 + }, + { + "epoch": 0.12540134925502364, + "grad_norm": 0.08781565725803375, + "learning_rate": 0.00019499783518545246, + "loss": 0.1685, + "step": 1738 + }, + { + "epoch": 0.12547350193008405, + "grad_norm": 0.10147970169782639, + "learning_rate": 0.00019499494876605572, + "loss": 0.1333, + "step": 1739 + }, + { + "epoch": 0.1255456546051445, + "grad_norm": 0.12914858758449554, + "learning_rate": 0.00019499206234665898, + "loss": 0.1214, + "step": 1740 + }, + { + "epoch": 0.1256178072802049, + "grad_norm": 0.1447230875492096, + "learning_rate": 0.00019498917592726225, + "loss": 0.12, + "step": 1741 + }, + { + "epoch": 0.12568995995526533, + "grad_norm": 0.10296161472797394, + "learning_rate": 0.0001949862895078655, + "loss": 0.1166, + "step": 1742 + }, + { + "epoch": 0.12576211263032577, + "grad_norm": 0.1182079017162323, + "learning_rate": 0.00019498340308846877, + "loss": 0.1606, + "step": 1743 + }, + { + "epoch": 0.1258342653053862, + "grad_norm": 0.1383650302886963, + "learning_rate": 0.00019498051666907203, + "loss": 0.1936, + "step": 1744 + }, + { + "epoch": 0.12590641798044663, + "grad_norm": 0.09023472666740417, + "learning_rate": 0.00019497763024967527, + "loss": 0.1789, + "step": 1745 + }, + { + "epoch": 0.12597857065550705, + "grad_norm": 0.130020871758461, + "learning_rate": 0.00019497474383027853, + "loss": 0.213, + "step": 1746 + }, + { + "epoch": 0.1260507233305675, + "grad_norm": 0.10528901219367981, + "learning_rate": 0.00019497185741088182, + "loss": 0.1635, + "step": 1747 + }, + { + "epoch": 0.1261228760056279, + "grad_norm": 0.11983592808246613, + "learning_rate": 0.00019496897099148509, + "loss": 0.139, + "step": 1748 + }, + { + "epoch": 0.12619502868068833, + "grad_norm": 0.09582766890525818, + "learning_rate": 0.00019496608457208835, + "loss": 0.1395, + "step": 1749 + }, + { + "epoch": 0.12626718135574877, + "grad_norm": 0.12090190500020981, + "learning_rate": 0.00019496319815269159, + "loss": 0.1818, + "step": 1750 + }, + { + "epoch": 0.12633933403080919, + "grad_norm": 0.11621379107236862, + "learning_rate": 0.00019496031173329485, + "loss": 0.1754, + "step": 1751 + }, + { + "epoch": 0.12641148670586963, + "grad_norm": 0.1177566647529602, + "learning_rate": 0.0001949574253138981, + "loss": 0.1402, + "step": 1752 + }, + { + "epoch": 0.12648363938093005, + "grad_norm": 0.09794148057699203, + "learning_rate": 0.00019495453889450137, + "loss": 0.1986, + "step": 1753 + }, + { + "epoch": 0.12655579205599046, + "grad_norm": 0.11040861904621124, + "learning_rate": 0.00019495165247510466, + "loss": 0.145, + "step": 1754 + }, + { + "epoch": 0.1266279447310509, + "grad_norm": 0.08527898788452148, + "learning_rate": 0.0001949487660557079, + "loss": 0.158, + "step": 1755 + }, + { + "epoch": 0.12670009740611132, + "grad_norm": 0.11260313540697098, + "learning_rate": 0.00019494587963631116, + "loss": 0.1364, + "step": 1756 + }, + { + "epoch": 0.12677225008117177, + "grad_norm": 0.10999318212270737, + "learning_rate": 0.00019494299321691443, + "loss": 0.1579, + "step": 1757 + }, + { + "epoch": 0.12684440275623218, + "grad_norm": 0.11420169472694397, + "learning_rate": 0.0001949401067975177, + "loss": 0.1892, + "step": 1758 + }, + { + "epoch": 0.12691655543129263, + "grad_norm": 0.11526145786046982, + "learning_rate": 0.00019493722037812095, + "loss": 0.2043, + "step": 1759 + }, + { + "epoch": 0.12698870810635304, + "grad_norm": 0.1126892939209938, + "learning_rate": 0.00019493433395872421, + "loss": 0.1905, + "step": 1760 + }, + { + "epoch": 0.12706086078141346, + "grad_norm": 0.10353560000658035, + "learning_rate": 0.00019493144753932748, + "loss": 0.1542, + "step": 1761 + }, + { + "epoch": 0.1271330134564739, + "grad_norm": 0.0921681746840477, + "learning_rate": 0.00019492856111993074, + "loss": 0.1878, + "step": 1762 + }, + { + "epoch": 0.12720516613153432, + "grad_norm": 0.1161520853638649, + "learning_rate": 0.000194925674700534, + "loss": 0.1212, + "step": 1763 + }, + { + "epoch": 0.12727731880659476, + "grad_norm": 0.09371655434370041, + "learning_rate": 0.00019492278828113727, + "loss": 0.1605, + "step": 1764 + }, + { + "epoch": 0.12734947148165518, + "grad_norm": 0.07800782471895218, + "learning_rate": 0.00019491990186174053, + "loss": 0.1915, + "step": 1765 + }, + { + "epoch": 0.12742162415671562, + "grad_norm": 0.08994324505329132, + "learning_rate": 0.00019491701544234376, + "loss": 0.2124, + "step": 1766 + }, + { + "epoch": 0.12749377683177604, + "grad_norm": 0.09498997032642365, + "learning_rate": 0.00019491412902294703, + "loss": 0.1627, + "step": 1767 + }, + { + "epoch": 0.12756592950683646, + "grad_norm": 0.10948889702558517, + "learning_rate": 0.00019491124260355032, + "loss": 0.1443, + "step": 1768 + }, + { + "epoch": 0.1276380821818969, + "grad_norm": 0.10349909216165543, + "learning_rate": 0.00019490835618415358, + "loss": 0.1674, + "step": 1769 + }, + { + "epoch": 0.12771023485695732, + "grad_norm": 0.09799700230360031, + "learning_rate": 0.00019490546976475684, + "loss": 0.1926, + "step": 1770 + }, + { + "epoch": 0.12778238753201776, + "grad_norm": 0.11425378918647766, + "learning_rate": 0.00019490258334536008, + "loss": 0.1984, + "step": 1771 + }, + { + "epoch": 0.12785454020707818, + "grad_norm": 0.09551405161619186, + "learning_rate": 0.00019489969692596334, + "loss": 0.1712, + "step": 1772 + }, + { + "epoch": 0.1279266928821386, + "grad_norm": 0.1070886105298996, + "learning_rate": 0.0001948968105065666, + "loss": 0.1371, + "step": 1773 + }, + { + "epoch": 0.12799884555719904, + "grad_norm": 0.09951864928007126, + "learning_rate": 0.00019489392408716987, + "loss": 0.1281, + "step": 1774 + }, + { + "epoch": 0.12807099823225945, + "grad_norm": 0.10588033497333527, + "learning_rate": 0.00019489103766777316, + "loss": 0.1838, + "step": 1775 + }, + { + "epoch": 0.1281431509073199, + "grad_norm": 0.1081177219748497, + "learning_rate": 0.0001948881512483764, + "loss": 0.1113, + "step": 1776 + }, + { + "epoch": 0.1282153035823803, + "grad_norm": 0.10994046181440353, + "learning_rate": 0.00019488526482897966, + "loss": 0.1365, + "step": 1777 + }, + { + "epoch": 0.12828745625744076, + "grad_norm": 0.10076797008514404, + "learning_rate": 0.00019488237840958292, + "loss": 0.1585, + "step": 1778 + }, + { + "epoch": 0.12835960893250117, + "grad_norm": 0.129889577627182, + "learning_rate": 0.00019487949199018618, + "loss": 0.1926, + "step": 1779 + }, + { + "epoch": 0.1284317616075616, + "grad_norm": 0.10640796273946762, + "learning_rate": 0.00019487660557078945, + "loss": 0.2057, + "step": 1780 + }, + { + "epoch": 0.12850391428262203, + "grad_norm": 0.08477828651666641, + "learning_rate": 0.0001948737191513927, + "loss": 0.1444, + "step": 1781 + }, + { + "epoch": 0.12857606695768245, + "grad_norm": 0.08494777977466583, + "learning_rate": 0.00019487083273199597, + "loss": 0.1356, + "step": 1782 + }, + { + "epoch": 0.1286482196327429, + "grad_norm": 0.08862127363681793, + "learning_rate": 0.00019486794631259923, + "loss": 0.1545, + "step": 1783 + }, + { + "epoch": 0.1287203723078033, + "grad_norm": 0.08903194963932037, + "learning_rate": 0.0001948650598932025, + "loss": 0.1755, + "step": 1784 + }, + { + "epoch": 0.12879252498286373, + "grad_norm": 0.11961119621992111, + "learning_rate": 0.00019486217347380576, + "loss": 0.141, + "step": 1785 + }, + { + "epoch": 0.12886467765792417, + "grad_norm": 0.0972859337925911, + "learning_rate": 0.00019485928705440902, + "loss": 0.1719, + "step": 1786 + }, + { + "epoch": 0.1289368303329846, + "grad_norm": 0.11359525471925735, + "learning_rate": 0.00019485640063501226, + "loss": 0.1921, + "step": 1787 + }, + { + "epoch": 0.12900898300804503, + "grad_norm": 0.1031796783208847, + "learning_rate": 0.00019485351421561552, + "loss": 0.1678, + "step": 1788 + }, + { + "epoch": 0.12908113568310545, + "grad_norm": 0.1136065348982811, + "learning_rate": 0.0001948506277962188, + "loss": 0.1921, + "step": 1789 + }, + { + "epoch": 0.1291532883581659, + "grad_norm": 0.11536866426467896, + "learning_rate": 0.00019484774137682207, + "loss": 0.2159, + "step": 1790 + }, + { + "epoch": 0.1292254410332263, + "grad_norm": 0.11552587151527405, + "learning_rate": 0.00019484485495742534, + "loss": 0.1385, + "step": 1791 + }, + { + "epoch": 0.12929759370828672, + "grad_norm": 0.09561877697706223, + "learning_rate": 0.00019484196853802857, + "loss": 0.1334, + "step": 1792 + }, + { + "epoch": 0.12936974638334717, + "grad_norm": 0.11691746860742569, + "learning_rate": 0.00019483908211863184, + "loss": 0.1323, + "step": 1793 + }, + { + "epoch": 0.12944189905840758, + "grad_norm": 0.11047590523958206, + "learning_rate": 0.0001948361956992351, + "loss": 0.1599, + "step": 1794 + }, + { + "epoch": 0.12951405173346803, + "grad_norm": 0.12649111449718475, + "learning_rate": 0.00019483330927983836, + "loss": 0.1529, + "step": 1795 + }, + { + "epoch": 0.12958620440852844, + "grad_norm": 0.11031024903059006, + "learning_rate": 0.00019483042286044165, + "loss": 0.1502, + "step": 1796 + }, + { + "epoch": 0.1296583570835889, + "grad_norm": 0.11705251038074493, + "learning_rate": 0.0001948275364410449, + "loss": 0.1905, + "step": 1797 + }, + { + "epoch": 0.1297305097586493, + "grad_norm": 0.09784354269504547, + "learning_rate": 0.00019482465002164815, + "loss": 0.1761, + "step": 1798 + }, + { + "epoch": 0.12980266243370972, + "grad_norm": 0.0812259167432785, + "learning_rate": 0.00019482176360225141, + "loss": 0.2058, + "step": 1799 + }, + { + "epoch": 0.12987481510877016, + "grad_norm": 0.093953937292099, + "learning_rate": 0.00019481887718285468, + "loss": 0.1552, + "step": 1800 + }, + { + "epoch": 0.12994696778383058, + "grad_norm": 0.0977918803691864, + "learning_rate": 0.00019481599076345794, + "loss": 0.1953, + "step": 1801 + }, + { + "epoch": 0.13001912045889102, + "grad_norm": 0.08500286936759949, + "learning_rate": 0.0001948131043440612, + "loss": 0.1679, + "step": 1802 + }, + { + "epoch": 0.13009127313395144, + "grad_norm": 0.1158568263053894, + "learning_rate": 0.00019481021792466447, + "loss": 0.1697, + "step": 1803 + }, + { + "epoch": 0.13016342580901186, + "grad_norm": 0.09711387008428574, + "learning_rate": 0.00019480733150526773, + "loss": 0.1522, + "step": 1804 + }, + { + "epoch": 0.1302355784840723, + "grad_norm": 0.08754990249872208, + "learning_rate": 0.000194804445085871, + "loss": 0.1696, + "step": 1805 + }, + { + "epoch": 0.13030773115913272, + "grad_norm": 0.10196410864591599, + "learning_rate": 0.00019480155866647425, + "loss": 0.1643, + "step": 1806 + }, + { + "epoch": 0.13037988383419316, + "grad_norm": 0.1056194081902504, + "learning_rate": 0.00019479867224707752, + "loss": 0.1424, + "step": 1807 + }, + { + "epoch": 0.13045203650925358, + "grad_norm": 0.07937432825565338, + "learning_rate": 0.00019479578582768075, + "loss": 0.1661, + "step": 1808 + }, + { + "epoch": 0.13052418918431402, + "grad_norm": 0.11897121369838715, + "learning_rate": 0.00019479289940828402, + "loss": 0.146, + "step": 1809 + }, + { + "epoch": 0.13059634185937444, + "grad_norm": 0.08482971787452698, + "learning_rate": 0.0001947900129888873, + "loss": 0.1845, + "step": 1810 + }, + { + "epoch": 0.13066849453443485, + "grad_norm": 0.09962055087089539, + "learning_rate": 0.00019478712656949057, + "loss": 0.1175, + "step": 1811 + }, + { + "epoch": 0.1307406472094953, + "grad_norm": 0.1010812520980835, + "learning_rate": 0.00019478424015009383, + "loss": 0.1942, + "step": 1812 + }, + { + "epoch": 0.1308127998845557, + "grad_norm": 0.09674418717622757, + "learning_rate": 0.00019478135373069707, + "loss": 0.1658, + "step": 1813 + }, + { + "epoch": 0.13088495255961616, + "grad_norm": 0.11402438580989838, + "learning_rate": 0.00019477846731130033, + "loss": 0.1824, + "step": 1814 + }, + { + "epoch": 0.13095710523467657, + "grad_norm": 0.13043256103992462, + "learning_rate": 0.0001947755808919036, + "loss": 0.1987, + "step": 1815 + }, + { + "epoch": 0.131029257909737, + "grad_norm": 0.14432241022586823, + "learning_rate": 0.00019477269447250686, + "loss": 0.1455, + "step": 1816 + }, + { + "epoch": 0.13110141058479743, + "grad_norm": 0.10322273522615433, + "learning_rate": 0.00019476980805311015, + "loss": 0.2021, + "step": 1817 + }, + { + "epoch": 0.13117356325985785, + "grad_norm": 0.08912544697523117, + "learning_rate": 0.00019476692163371338, + "loss": 0.1385, + "step": 1818 + }, + { + "epoch": 0.1312457159349183, + "grad_norm": 0.10674835741519928, + "learning_rate": 0.00019476403521431665, + "loss": 0.202, + "step": 1819 + }, + { + "epoch": 0.1313178686099787, + "grad_norm": 0.10658169537782669, + "learning_rate": 0.0001947611487949199, + "loss": 0.2185, + "step": 1820 + }, + { + "epoch": 0.13139002128503915, + "grad_norm": 0.09862136840820312, + "learning_rate": 0.00019475826237552317, + "loss": 0.1156, + "step": 1821 + }, + { + "epoch": 0.13146217396009957, + "grad_norm": 0.10120377689599991, + "learning_rate": 0.00019475537595612643, + "loss": 0.1456, + "step": 1822 + }, + { + "epoch": 0.13153432663516, + "grad_norm": 0.08391633629798889, + "learning_rate": 0.0001947524895367297, + "loss": 0.1388, + "step": 1823 + }, + { + "epoch": 0.13160647931022043, + "grad_norm": 0.1270926147699356, + "learning_rate": 0.00019474960311733296, + "loss": 0.1729, + "step": 1824 + }, + { + "epoch": 0.13167863198528085, + "grad_norm": 0.10564117878675461, + "learning_rate": 0.00019474671669793622, + "loss": 0.1369, + "step": 1825 + }, + { + "epoch": 0.1317507846603413, + "grad_norm": 0.1271250993013382, + "learning_rate": 0.00019474383027853949, + "loss": 0.1726, + "step": 1826 + }, + { + "epoch": 0.1318229373354017, + "grad_norm": 0.09364531934261322, + "learning_rate": 0.00019474094385914275, + "loss": 0.159, + "step": 1827 + }, + { + "epoch": 0.13189509001046215, + "grad_norm": 0.1323820948600769, + "learning_rate": 0.000194738057439746, + "loss": 0.2438, + "step": 1828 + }, + { + "epoch": 0.13196724268552257, + "grad_norm": 0.11897341907024384, + "learning_rate": 0.00019473517102034925, + "loss": 0.1228, + "step": 1829 + }, + { + "epoch": 0.13203939536058298, + "grad_norm": 0.1225760281085968, + "learning_rate": 0.0001947322846009525, + "loss": 0.1622, + "step": 1830 + }, + { + "epoch": 0.13211154803564343, + "grad_norm": 0.10834752768278122, + "learning_rate": 0.0001947293981815558, + "loss": 0.1722, + "step": 1831 + }, + { + "epoch": 0.13218370071070384, + "grad_norm": 0.11604952067136765, + "learning_rate": 0.00019472651176215906, + "loss": 0.2066, + "step": 1832 + }, + { + "epoch": 0.1322558533857643, + "grad_norm": 0.1207766979932785, + "learning_rate": 0.00019472362534276233, + "loss": 0.1554, + "step": 1833 + }, + { + "epoch": 0.1323280060608247, + "grad_norm": 0.11709817498922348, + "learning_rate": 0.00019472073892336556, + "loss": 0.1966, + "step": 1834 + }, + { + "epoch": 0.13240015873588512, + "grad_norm": 0.09540904313325882, + "learning_rate": 0.00019471785250396883, + "loss": 0.1972, + "step": 1835 + }, + { + "epoch": 0.13247231141094556, + "grad_norm": 0.12114161998033524, + "learning_rate": 0.0001947149660845721, + "loss": 0.1502, + "step": 1836 + }, + { + "epoch": 0.13254446408600598, + "grad_norm": 0.08637981116771698, + "learning_rate": 0.00019471207966517535, + "loss": 0.2123, + "step": 1837 + }, + { + "epoch": 0.13261661676106642, + "grad_norm": 0.09737266600131989, + "learning_rate": 0.00019470919324577864, + "loss": 0.1474, + "step": 1838 + }, + { + "epoch": 0.13268876943612684, + "grad_norm": 0.16904398798942566, + "learning_rate": 0.00019470630682638188, + "loss": 0.1927, + "step": 1839 + }, + { + "epoch": 0.13276092211118729, + "grad_norm": 0.09032367169857025, + "learning_rate": 0.00019470342040698514, + "loss": 0.1916, + "step": 1840 + }, + { + "epoch": 0.1328330747862477, + "grad_norm": 0.15205013751983643, + "learning_rate": 0.0001947005339875884, + "loss": 0.2313, + "step": 1841 + }, + { + "epoch": 0.13290522746130812, + "grad_norm": 0.10285206884145737, + "learning_rate": 0.00019469764756819167, + "loss": 0.2256, + "step": 1842 + }, + { + "epoch": 0.13297738013636856, + "grad_norm": 0.07314669340848923, + "learning_rate": 0.00019469476114879493, + "loss": 0.1652, + "step": 1843 + }, + { + "epoch": 0.13304953281142898, + "grad_norm": 0.08424288779497147, + "learning_rate": 0.0001946918747293982, + "loss": 0.1769, + "step": 1844 + }, + { + "epoch": 0.13312168548648942, + "grad_norm": 0.1007574275135994, + "learning_rate": 0.00019468898831000145, + "loss": 0.1512, + "step": 1845 + }, + { + "epoch": 0.13319383816154984, + "grad_norm": 0.09143602102994919, + "learning_rate": 0.00019468610189060472, + "loss": 0.1753, + "step": 1846 + }, + { + "epoch": 0.13326599083661025, + "grad_norm": 0.09254894405603409, + "learning_rate": 0.00019468321547120798, + "loss": 0.1769, + "step": 1847 + }, + { + "epoch": 0.1333381435116707, + "grad_norm": 0.11452503502368927, + "learning_rate": 0.00019468032905181124, + "loss": 0.1905, + "step": 1848 + }, + { + "epoch": 0.13341029618673111, + "grad_norm": 0.09670179337263107, + "learning_rate": 0.0001946774426324145, + "loss": 0.1682, + "step": 1849 + }, + { + "epoch": 0.13348244886179156, + "grad_norm": 0.09185737371444702, + "learning_rate": 0.00019467455621301777, + "loss": 0.1554, + "step": 1850 + }, + { + "epoch": 0.13355460153685197, + "grad_norm": 0.11319153755903244, + "learning_rate": 0.000194671669793621, + "loss": 0.1502, + "step": 1851 + }, + { + "epoch": 0.13362675421191242, + "grad_norm": 0.09828914701938629, + "learning_rate": 0.0001946687833742243, + "loss": 0.1499, + "step": 1852 + }, + { + "epoch": 0.13369890688697283, + "grad_norm": 0.08897096663713455, + "learning_rate": 0.00019466589695482756, + "loss": 0.1501, + "step": 1853 + }, + { + "epoch": 0.13377105956203325, + "grad_norm": 0.12592235207557678, + "learning_rate": 0.00019466301053543082, + "loss": 0.1774, + "step": 1854 + }, + { + "epoch": 0.1338432122370937, + "grad_norm": 0.1032068282365799, + "learning_rate": 0.00019466012411603408, + "loss": 0.1933, + "step": 1855 + }, + { + "epoch": 0.1339153649121541, + "grad_norm": 0.09040278196334839, + "learning_rate": 0.00019465723769663732, + "loss": 0.1217, + "step": 1856 + }, + { + "epoch": 0.13398751758721456, + "grad_norm": 0.12092549353837967, + "learning_rate": 0.00019465435127724058, + "loss": 0.1422, + "step": 1857 + }, + { + "epoch": 0.13405967026227497, + "grad_norm": 0.11543098092079163, + "learning_rate": 0.00019465146485784385, + "loss": 0.2229, + "step": 1858 + }, + { + "epoch": 0.13413182293733542, + "grad_norm": 0.10631363093852997, + "learning_rate": 0.00019464857843844714, + "loss": 0.1798, + "step": 1859 + }, + { + "epoch": 0.13420397561239583, + "grad_norm": 0.11276703327894211, + "learning_rate": 0.0001946456920190504, + "loss": 0.1369, + "step": 1860 + }, + { + "epoch": 0.13427612828745625, + "grad_norm": 0.09238825738430023, + "learning_rate": 0.00019464280559965363, + "loss": 0.1926, + "step": 1861 + }, + { + "epoch": 0.1343482809625167, + "grad_norm": 0.10447818040847778, + "learning_rate": 0.0001946399191802569, + "loss": 0.1148, + "step": 1862 + }, + { + "epoch": 0.1344204336375771, + "grad_norm": 0.12700176239013672, + "learning_rate": 0.00019463703276086016, + "loss": 0.1665, + "step": 1863 + }, + { + "epoch": 0.13449258631263755, + "grad_norm": 0.10683518648147583, + "learning_rate": 0.00019463414634146342, + "loss": 0.139, + "step": 1864 + }, + { + "epoch": 0.13456473898769797, + "grad_norm": 0.08258277922868729, + "learning_rate": 0.00019463125992206669, + "loss": 0.1678, + "step": 1865 + }, + { + "epoch": 0.13463689166275838, + "grad_norm": 0.10914620757102966, + "learning_rate": 0.00019462837350266995, + "loss": 0.2019, + "step": 1866 + }, + { + "epoch": 0.13470904433781883, + "grad_norm": 0.09897346794605255, + "learning_rate": 0.0001946254870832732, + "loss": 0.1521, + "step": 1867 + }, + { + "epoch": 0.13478119701287924, + "grad_norm": 0.09971527755260468, + "learning_rate": 0.00019462260066387647, + "loss": 0.1715, + "step": 1868 + }, + { + "epoch": 0.1348533496879397, + "grad_norm": 0.11484746634960175, + "learning_rate": 0.00019461971424447974, + "loss": 0.2191, + "step": 1869 + }, + { + "epoch": 0.1349255023630001, + "grad_norm": 0.09504850953817368, + "learning_rate": 0.000194616827825083, + "loss": 0.2144, + "step": 1870 + }, + { + "epoch": 0.13499765503806055, + "grad_norm": 0.10230226814746857, + "learning_rate": 0.00019461394140568626, + "loss": 0.2234, + "step": 1871 + }, + { + "epoch": 0.13506980771312097, + "grad_norm": 0.09885798394680023, + "learning_rate": 0.0001946110549862895, + "loss": 0.1635, + "step": 1872 + }, + { + "epoch": 0.13514196038818138, + "grad_norm": 0.09148430079221725, + "learning_rate": 0.00019460816856689276, + "loss": 0.1906, + "step": 1873 + }, + { + "epoch": 0.13521411306324183, + "grad_norm": 0.10847075283527374, + "learning_rate": 0.00019460528214749605, + "loss": 0.1491, + "step": 1874 + }, + { + "epoch": 0.13528626573830224, + "grad_norm": 0.0958617776632309, + "learning_rate": 0.00019460239572809931, + "loss": 0.2183, + "step": 1875 + }, + { + "epoch": 0.13535841841336269, + "grad_norm": 0.09568075835704803, + "learning_rate": 0.00019459950930870258, + "loss": 0.1558, + "step": 1876 + }, + { + "epoch": 0.1354305710884231, + "grad_norm": 0.09332866221666336, + "learning_rate": 0.00019459662288930581, + "loss": 0.1599, + "step": 1877 + }, + { + "epoch": 0.13550272376348352, + "grad_norm": 0.09271859377622604, + "learning_rate": 0.00019459373646990908, + "loss": 0.1796, + "step": 1878 + }, + { + "epoch": 0.13557487643854396, + "grad_norm": 0.1295979619026184, + "learning_rate": 0.00019459085005051234, + "loss": 0.19, + "step": 1879 + }, + { + "epoch": 0.13564702911360438, + "grad_norm": 0.0919765904545784, + "learning_rate": 0.0001945879636311156, + "loss": 0.1655, + "step": 1880 + }, + { + "epoch": 0.13571918178866482, + "grad_norm": 0.08181504905223846, + "learning_rate": 0.0001945850772117189, + "loss": 0.1394, + "step": 1881 + }, + { + "epoch": 0.13579133446372524, + "grad_norm": 0.10545797646045685, + "learning_rate": 0.00019458219079232213, + "loss": 0.1501, + "step": 1882 + }, + { + "epoch": 0.13586348713878568, + "grad_norm": 0.09444499015808105, + "learning_rate": 0.0001945793043729254, + "loss": 0.1779, + "step": 1883 + }, + { + "epoch": 0.1359356398138461, + "grad_norm": 0.07674098014831543, + "learning_rate": 0.00019457641795352865, + "loss": 0.1684, + "step": 1884 + }, + { + "epoch": 0.13600779248890651, + "grad_norm": 0.11596457660198212, + "learning_rate": 0.00019457353153413192, + "loss": 0.1599, + "step": 1885 + }, + { + "epoch": 0.13607994516396696, + "grad_norm": 0.11078653484582901, + "learning_rate": 0.00019457064511473518, + "loss": 0.1681, + "step": 1886 + }, + { + "epoch": 0.13615209783902738, + "grad_norm": 0.11662109941244125, + "learning_rate": 0.00019456775869533844, + "loss": 0.1897, + "step": 1887 + }, + { + "epoch": 0.13622425051408782, + "grad_norm": 0.11415568739175797, + "learning_rate": 0.0001945648722759417, + "loss": 0.2309, + "step": 1888 + }, + { + "epoch": 0.13629640318914824, + "grad_norm": 0.105406254529953, + "learning_rate": 0.00019456198585654497, + "loss": 0.2058, + "step": 1889 + }, + { + "epoch": 0.13636855586420868, + "grad_norm": 0.11074227094650269, + "learning_rate": 0.00019455909943714823, + "loss": 0.1473, + "step": 1890 + }, + { + "epoch": 0.1364407085392691, + "grad_norm": 0.10308904945850372, + "learning_rate": 0.0001945562130177515, + "loss": 0.1371, + "step": 1891 + }, + { + "epoch": 0.1365128612143295, + "grad_norm": 0.07854129374027252, + "learning_rate": 0.00019455332659835476, + "loss": 0.1641, + "step": 1892 + }, + { + "epoch": 0.13658501388938996, + "grad_norm": 0.12944363057613373, + "learning_rate": 0.000194550440178958, + "loss": 0.171, + "step": 1893 + }, + { + "epoch": 0.13665716656445037, + "grad_norm": 0.08137572556734085, + "learning_rate": 0.00019454755375956126, + "loss": 0.2022, + "step": 1894 + }, + { + "epoch": 0.13672931923951082, + "grad_norm": 0.10226333141326904, + "learning_rate": 0.00019454466734016455, + "loss": 0.1736, + "step": 1895 + }, + { + "epoch": 0.13680147191457123, + "grad_norm": 0.12806734442710876, + "learning_rate": 0.0001945417809207678, + "loss": 0.2227, + "step": 1896 + }, + { + "epoch": 0.13687362458963165, + "grad_norm": 0.09774816781282425, + "learning_rate": 0.00019453889450137107, + "loss": 0.1582, + "step": 1897 + }, + { + "epoch": 0.1369457772646921, + "grad_norm": 0.10411229729652405, + "learning_rate": 0.0001945360080819743, + "loss": 0.1476, + "step": 1898 + }, + { + "epoch": 0.1370179299397525, + "grad_norm": 0.11847859621047974, + "learning_rate": 0.00019453312166257757, + "loss": 0.1689, + "step": 1899 + }, + { + "epoch": 0.13709008261481295, + "grad_norm": 0.11297795176506042, + "learning_rate": 0.00019453023524318083, + "loss": 0.2132, + "step": 1900 + }, + { + "epoch": 0.13716223528987337, + "grad_norm": 0.11359799653291702, + "learning_rate": 0.0001945273488237841, + "loss": 0.1307, + "step": 1901 + }, + { + "epoch": 0.1372343879649338, + "grad_norm": 0.13657835125923157, + "learning_rate": 0.0001945244624043874, + "loss": 0.2036, + "step": 1902 + }, + { + "epoch": 0.13730654063999423, + "grad_norm": 0.15571734309196472, + "learning_rate": 0.00019452157598499062, + "loss": 0.2117, + "step": 1903 + }, + { + "epoch": 0.13737869331505465, + "grad_norm": 0.1211012601852417, + "learning_rate": 0.00019451868956559389, + "loss": 0.1738, + "step": 1904 + }, + { + "epoch": 0.1374508459901151, + "grad_norm": 0.0977838858962059, + "learning_rate": 0.00019451580314619715, + "loss": 0.1783, + "step": 1905 + }, + { + "epoch": 0.1375229986651755, + "grad_norm": 0.09584508091211319, + "learning_rate": 0.0001945129167268004, + "loss": 0.1419, + "step": 1906 + }, + { + "epoch": 0.13759515134023595, + "grad_norm": 0.10878670960664749, + "learning_rate": 0.00019451003030740367, + "loss": 0.1396, + "step": 1907 + }, + { + "epoch": 0.13766730401529637, + "grad_norm": 0.09390582889318466, + "learning_rate": 0.00019450714388800694, + "loss": 0.1816, + "step": 1908 + }, + { + "epoch": 0.13773945669035678, + "grad_norm": 0.09813504666090012, + "learning_rate": 0.0001945042574686102, + "loss": 0.2146, + "step": 1909 + }, + { + "epoch": 0.13781160936541723, + "grad_norm": 0.08499909937381744, + "learning_rate": 0.00019450137104921346, + "loss": 0.163, + "step": 1910 + }, + { + "epoch": 0.13788376204047764, + "grad_norm": 0.08602595329284668, + "learning_rate": 0.00019449848462981673, + "loss": 0.167, + "step": 1911 + }, + { + "epoch": 0.13795591471553809, + "grad_norm": 0.08455024659633636, + "learning_rate": 0.00019449559821042, + "loss": 0.1724, + "step": 1912 + }, + { + "epoch": 0.1380280673905985, + "grad_norm": 0.08802422136068344, + "learning_rate": 0.00019449271179102325, + "loss": 0.1765, + "step": 1913 + }, + { + "epoch": 0.13810022006565895, + "grad_norm": 0.10855334252119064, + "learning_rate": 0.0001944898253716265, + "loss": 0.177, + "step": 1914 + }, + { + "epoch": 0.13817237274071936, + "grad_norm": 0.11316860467195511, + "learning_rate": 0.00019448693895222975, + "loss": 0.1573, + "step": 1915 + }, + { + "epoch": 0.13824452541577978, + "grad_norm": 0.10946952551603317, + "learning_rate": 0.00019448405253283304, + "loss": 0.156, + "step": 1916 + }, + { + "epoch": 0.13831667809084022, + "grad_norm": 0.09252781420946121, + "learning_rate": 0.0001944811661134363, + "loss": 0.1535, + "step": 1917 + }, + { + "epoch": 0.13838883076590064, + "grad_norm": 0.12185255438089371, + "learning_rate": 0.00019447827969403957, + "loss": 0.1442, + "step": 1918 + }, + { + "epoch": 0.13846098344096108, + "grad_norm": 0.09439793974161148, + "learning_rate": 0.0001944753932746428, + "loss": 0.1578, + "step": 1919 + }, + { + "epoch": 0.1385331361160215, + "grad_norm": 0.09366545081138611, + "learning_rate": 0.00019447250685524606, + "loss": 0.1105, + "step": 1920 + }, + { + "epoch": 0.13860528879108194, + "grad_norm": 0.14343176782131195, + "learning_rate": 0.00019446962043584933, + "loss": 0.1632, + "step": 1921 + }, + { + "epoch": 0.13867744146614236, + "grad_norm": 0.12436092644929886, + "learning_rate": 0.0001944667340164526, + "loss": 0.2113, + "step": 1922 + }, + { + "epoch": 0.13874959414120278, + "grad_norm": 0.0957041010260582, + "learning_rate": 0.00019446384759705588, + "loss": 0.1225, + "step": 1923 + }, + { + "epoch": 0.13882174681626322, + "grad_norm": 0.12467844784259796, + "learning_rate": 0.00019446096117765912, + "loss": 0.1908, + "step": 1924 + }, + { + "epoch": 0.13889389949132364, + "grad_norm": 0.14240126311779022, + "learning_rate": 0.00019445807475826238, + "loss": 0.2221, + "step": 1925 + }, + { + "epoch": 0.13896605216638408, + "grad_norm": 0.07637129724025726, + "learning_rate": 0.00019445518833886564, + "loss": 0.1789, + "step": 1926 + }, + { + "epoch": 0.1390382048414445, + "grad_norm": 0.09346475452184677, + "learning_rate": 0.0001944523019194689, + "loss": 0.1883, + "step": 1927 + }, + { + "epoch": 0.1391103575165049, + "grad_norm": 0.10540024191141129, + "learning_rate": 0.00019444941550007217, + "loss": 0.1405, + "step": 1928 + }, + { + "epoch": 0.13918251019156536, + "grad_norm": 0.09005219489336014, + "learning_rate": 0.00019444652908067543, + "loss": 0.185, + "step": 1929 + }, + { + "epoch": 0.13925466286662577, + "grad_norm": 0.12073545157909393, + "learning_rate": 0.0001944436426612787, + "loss": 0.1647, + "step": 1930 + }, + { + "epoch": 0.13932681554168622, + "grad_norm": 0.12057455629110336, + "learning_rate": 0.00019444075624188196, + "loss": 0.2004, + "step": 1931 + }, + { + "epoch": 0.13939896821674663, + "grad_norm": 0.107715904712677, + "learning_rate": 0.00019443786982248522, + "loss": 0.1593, + "step": 1932 + }, + { + "epoch": 0.13947112089180708, + "grad_norm": 0.0985242947936058, + "learning_rate": 0.00019443498340308848, + "loss": 0.1693, + "step": 1933 + }, + { + "epoch": 0.1395432735668675, + "grad_norm": 0.09869597107172012, + "learning_rate": 0.00019443209698369175, + "loss": 0.1748, + "step": 1934 + }, + { + "epoch": 0.1396154262419279, + "grad_norm": 0.11241772025823593, + "learning_rate": 0.00019442921056429498, + "loss": 0.137, + "step": 1935 + }, + { + "epoch": 0.13968757891698835, + "grad_norm": 0.11124883592128754, + "learning_rate": 0.00019442632414489824, + "loss": 0.1555, + "step": 1936 + }, + { + "epoch": 0.13975973159204877, + "grad_norm": 0.0856330394744873, + "learning_rate": 0.00019442343772550153, + "loss": 0.1238, + "step": 1937 + }, + { + "epoch": 0.1398318842671092, + "grad_norm": 0.10875482112169266, + "learning_rate": 0.0001944205513061048, + "loss": 0.1169, + "step": 1938 + }, + { + "epoch": 0.13990403694216963, + "grad_norm": 0.1266622692346573, + "learning_rate": 0.00019441766488670806, + "loss": 0.2094, + "step": 1939 + }, + { + "epoch": 0.13997618961723005, + "grad_norm": 0.08601401746273041, + "learning_rate": 0.0001944147784673113, + "loss": 0.1181, + "step": 1940 + }, + { + "epoch": 0.1400483422922905, + "grad_norm": 0.11201686412096024, + "learning_rate": 0.00019441189204791456, + "loss": 0.1631, + "step": 1941 + }, + { + "epoch": 0.1401204949673509, + "grad_norm": 0.08824379742145538, + "learning_rate": 0.00019440900562851782, + "loss": 0.1511, + "step": 1942 + }, + { + "epoch": 0.14019264764241135, + "grad_norm": 0.10340140759944916, + "learning_rate": 0.00019440611920912109, + "loss": 0.1422, + "step": 1943 + }, + { + "epoch": 0.14026480031747177, + "grad_norm": 0.10507574677467346, + "learning_rate": 0.00019440323278972438, + "loss": 0.1216, + "step": 1944 + }, + { + "epoch": 0.1403369529925322, + "grad_norm": 0.10035047680139542, + "learning_rate": 0.0001944003463703276, + "loss": 0.1783, + "step": 1945 + }, + { + "epoch": 0.14040910566759263, + "grad_norm": 0.09842780977487564, + "learning_rate": 0.00019439745995093087, + "loss": 0.1415, + "step": 1946 + }, + { + "epoch": 0.14048125834265304, + "grad_norm": 0.08447740972042084, + "learning_rate": 0.00019439457353153414, + "loss": 0.1409, + "step": 1947 + }, + { + "epoch": 0.1405534110177135, + "grad_norm": 0.10483802855014801, + "learning_rate": 0.0001943916871121374, + "loss": 0.1897, + "step": 1948 + }, + { + "epoch": 0.1406255636927739, + "grad_norm": 0.12013499438762665, + "learning_rate": 0.00019438880069274066, + "loss": 0.2073, + "step": 1949 + }, + { + "epoch": 0.14069771636783435, + "grad_norm": 0.11932390183210373, + "learning_rate": 0.00019438591427334393, + "loss": 0.1487, + "step": 1950 + }, + { + "epoch": 0.14076986904289476, + "grad_norm": 0.12911708652973175, + "learning_rate": 0.0001943830278539472, + "loss": 0.1897, + "step": 1951 + }, + { + "epoch": 0.1408420217179552, + "grad_norm": 0.09570632874965668, + "learning_rate": 0.00019438014143455045, + "loss": 0.202, + "step": 1952 + }, + { + "epoch": 0.14091417439301562, + "grad_norm": 0.13848714530467987, + "learning_rate": 0.00019437725501515371, + "loss": 0.1811, + "step": 1953 + }, + { + "epoch": 0.14098632706807604, + "grad_norm": 0.11773217469453812, + "learning_rate": 0.00019437436859575698, + "loss": 0.1762, + "step": 1954 + }, + { + "epoch": 0.14105847974313648, + "grad_norm": 0.11776330322027206, + "learning_rate": 0.00019437148217636024, + "loss": 0.1865, + "step": 1955 + }, + { + "epoch": 0.1411306324181969, + "grad_norm": 0.1038830578327179, + "learning_rate": 0.0001943685957569635, + "loss": 0.1383, + "step": 1956 + }, + { + "epoch": 0.14120278509325734, + "grad_norm": 0.09537827223539352, + "learning_rate": 0.00019436570933756674, + "loss": 0.1467, + "step": 1957 + }, + { + "epoch": 0.14127493776831776, + "grad_norm": 0.11215817928314209, + "learning_rate": 0.00019436282291817003, + "loss": 0.1738, + "step": 1958 + }, + { + "epoch": 0.14134709044337818, + "grad_norm": 0.08892468363046646, + "learning_rate": 0.0001943599364987733, + "loss": 0.19, + "step": 1959 + }, + { + "epoch": 0.14141924311843862, + "grad_norm": 0.1363980621099472, + "learning_rate": 0.00019435705007937655, + "loss": 0.1803, + "step": 1960 + }, + { + "epoch": 0.14149139579349904, + "grad_norm": 0.09271281212568283, + "learning_rate": 0.00019435416365997982, + "loss": 0.143, + "step": 1961 + }, + { + "epoch": 0.14156354846855948, + "grad_norm": 0.09465882927179337, + "learning_rate": 0.00019435127724058305, + "loss": 0.1877, + "step": 1962 + }, + { + "epoch": 0.1416357011436199, + "grad_norm": 0.16298234462738037, + "learning_rate": 0.00019434839082118632, + "loss": 0.2021, + "step": 1963 + }, + { + "epoch": 0.14170785381868034, + "grad_norm": 0.1135706827044487, + "learning_rate": 0.00019434550440178958, + "loss": 0.2151, + "step": 1964 + }, + { + "epoch": 0.14178000649374076, + "grad_norm": 0.08292524516582489, + "learning_rate": 0.00019434261798239287, + "loss": 0.1909, + "step": 1965 + }, + { + "epoch": 0.14185215916880117, + "grad_norm": 0.1263180375099182, + "learning_rate": 0.00019433973156299613, + "loss": 0.1869, + "step": 1966 + }, + { + "epoch": 0.14192431184386162, + "grad_norm": 0.12897604703903198, + "learning_rate": 0.00019433684514359937, + "loss": 0.1787, + "step": 1967 + }, + { + "epoch": 0.14199646451892203, + "grad_norm": 0.08955071121454239, + "learning_rate": 0.00019433395872420263, + "loss": 0.1883, + "step": 1968 + }, + { + "epoch": 0.14206861719398248, + "grad_norm": 0.09205218404531479, + "learning_rate": 0.0001943310723048059, + "loss": 0.2319, + "step": 1969 + }, + { + "epoch": 0.1421407698690429, + "grad_norm": 0.10378434509038925, + "learning_rate": 0.00019432818588540916, + "loss": 0.1748, + "step": 1970 + }, + { + "epoch": 0.1422129225441033, + "grad_norm": 0.10211426019668579, + "learning_rate": 0.00019432529946601242, + "loss": 0.1973, + "step": 1971 + }, + { + "epoch": 0.14228507521916375, + "grad_norm": 0.10617047548294067, + "learning_rate": 0.00019432241304661568, + "loss": 0.172, + "step": 1972 + }, + { + "epoch": 0.14235722789422417, + "grad_norm": 0.12520731985569, + "learning_rate": 0.00019431952662721895, + "loss": 0.1725, + "step": 1973 + }, + { + "epoch": 0.1424293805692846, + "grad_norm": 0.09489186853170395, + "learning_rate": 0.0001943166402078222, + "loss": 0.1586, + "step": 1974 + }, + { + "epoch": 0.14250153324434503, + "grad_norm": 0.09619477391242981, + "learning_rate": 0.00019431375378842547, + "loss": 0.1627, + "step": 1975 + }, + { + "epoch": 0.14257368591940547, + "grad_norm": 0.10977262258529663, + "learning_rate": 0.00019431086736902873, + "loss": 0.1902, + "step": 1976 + }, + { + "epoch": 0.1426458385944659, + "grad_norm": 0.11333739757537842, + "learning_rate": 0.000194307980949632, + "loss": 0.1579, + "step": 1977 + }, + { + "epoch": 0.1427179912695263, + "grad_norm": 0.10972239822149277, + "learning_rate": 0.00019430509453023523, + "loss": 0.1612, + "step": 1978 + }, + { + "epoch": 0.14279014394458675, + "grad_norm": 0.11612124741077423, + "learning_rate": 0.00019430220811083852, + "loss": 0.1494, + "step": 1979 + }, + { + "epoch": 0.14286229661964717, + "grad_norm": 0.11529509723186493, + "learning_rate": 0.00019429932169144179, + "loss": 0.1768, + "step": 1980 + }, + { + "epoch": 0.1429344492947076, + "grad_norm": 0.09690426290035248, + "learning_rate": 0.00019429643527204505, + "loss": 0.1722, + "step": 1981 + }, + { + "epoch": 0.14300660196976803, + "grad_norm": 0.10925997048616409, + "learning_rate": 0.0001942935488526483, + "loss": 0.1886, + "step": 1982 + }, + { + "epoch": 0.14307875464482847, + "grad_norm": 0.12713110446929932, + "learning_rate": 0.00019429066243325155, + "loss": 0.1769, + "step": 1983 + }, + { + "epoch": 0.1431509073198889, + "grad_norm": 0.1117352619767189, + "learning_rate": 0.0001942877760138548, + "loss": 0.1929, + "step": 1984 + }, + { + "epoch": 0.1432230599949493, + "grad_norm": 0.10470899194478989, + "learning_rate": 0.00019428488959445807, + "loss": 0.1455, + "step": 1985 + }, + { + "epoch": 0.14329521267000975, + "grad_norm": 0.12778021395206451, + "learning_rate": 0.00019428200317506136, + "loss": 0.1646, + "step": 1986 + }, + { + "epoch": 0.14336736534507016, + "grad_norm": 0.12582525610923767, + "learning_rate": 0.00019427911675566463, + "loss": 0.237, + "step": 1987 + }, + { + "epoch": 0.1434395180201306, + "grad_norm": 0.09902142733335495, + "learning_rate": 0.00019427623033626786, + "loss": 0.1767, + "step": 1988 + }, + { + "epoch": 0.14351167069519102, + "grad_norm": 0.09508440643548965, + "learning_rate": 0.00019427334391687113, + "loss": 0.1734, + "step": 1989 + }, + { + "epoch": 0.14358382337025144, + "grad_norm": 0.09515651315450668, + "learning_rate": 0.0001942704574974744, + "loss": 0.1623, + "step": 1990 + }, + { + "epoch": 0.14365597604531188, + "grad_norm": 0.10254396498203278, + "learning_rate": 0.00019426757107807765, + "loss": 0.154, + "step": 1991 + }, + { + "epoch": 0.1437281287203723, + "grad_norm": 0.0942922905087471, + "learning_rate": 0.00019426468465868091, + "loss": 0.1381, + "step": 1992 + }, + { + "epoch": 0.14380028139543274, + "grad_norm": 0.0835883840918541, + "learning_rate": 0.00019426179823928418, + "loss": 0.1729, + "step": 1993 + }, + { + "epoch": 0.14387243407049316, + "grad_norm": 0.11193643510341644, + "learning_rate": 0.00019425891181988744, + "loss": 0.1195, + "step": 1994 + }, + { + "epoch": 0.1439445867455536, + "grad_norm": 0.13489171862602234, + "learning_rate": 0.0001942560254004907, + "loss": 0.1484, + "step": 1995 + }, + { + "epoch": 0.14401673942061402, + "grad_norm": 0.09976416081190109, + "learning_rate": 0.00019425313898109397, + "loss": 0.1413, + "step": 1996 + }, + { + "epoch": 0.14408889209567444, + "grad_norm": 0.11297965794801712, + "learning_rate": 0.00019425025256169723, + "loss": 0.1699, + "step": 1997 + }, + { + "epoch": 0.14416104477073488, + "grad_norm": 0.08634833991527557, + "learning_rate": 0.0001942473661423005, + "loss": 0.1921, + "step": 1998 + }, + { + "epoch": 0.1442331974457953, + "grad_norm": 0.12895886600017548, + "learning_rate": 0.00019424447972290373, + "loss": 0.2203, + "step": 1999 + }, + { + "epoch": 0.14430535012085574, + "grad_norm": 0.10602421313524246, + "learning_rate": 0.00019424159330350702, + "loss": 0.1411, + "step": 2000 + }, + { + "epoch": 0.14437750279591616, + "grad_norm": 0.09780508279800415, + "learning_rate": 0.00019423870688411028, + "loss": 0.1217, + "step": 2001 + }, + { + "epoch": 0.14444965547097657, + "grad_norm": 0.10753864049911499, + "learning_rate": 0.00019423582046471354, + "loss": 0.1232, + "step": 2002 + }, + { + "epoch": 0.14452180814603702, + "grad_norm": 0.10284475982189178, + "learning_rate": 0.0001942329340453168, + "loss": 0.146, + "step": 2003 + }, + { + "epoch": 0.14459396082109743, + "grad_norm": 0.11914925277233124, + "learning_rate": 0.00019423004762592004, + "loss": 0.1712, + "step": 2004 + }, + { + "epoch": 0.14466611349615788, + "grad_norm": 0.10978496074676514, + "learning_rate": 0.0001942271612065233, + "loss": 0.2042, + "step": 2005 + }, + { + "epoch": 0.1447382661712183, + "grad_norm": 0.10618040710687637, + "learning_rate": 0.00019422427478712657, + "loss": 0.1746, + "step": 2006 + }, + { + "epoch": 0.14481041884627874, + "grad_norm": 0.11452309042215347, + "learning_rate": 0.00019422138836772986, + "loss": 0.1598, + "step": 2007 + }, + { + "epoch": 0.14488257152133915, + "grad_norm": 0.11929607391357422, + "learning_rate": 0.00019421850194833312, + "loss": 0.1393, + "step": 2008 + }, + { + "epoch": 0.14495472419639957, + "grad_norm": 0.10663137584924698, + "learning_rate": 0.00019421561552893636, + "loss": 0.1993, + "step": 2009 + }, + { + "epoch": 0.14502687687146001, + "grad_norm": 0.09824167937040329, + "learning_rate": 0.00019421272910953962, + "loss": 0.1712, + "step": 2010 + }, + { + "epoch": 0.14509902954652043, + "grad_norm": 0.10585514456033707, + "learning_rate": 0.00019420984269014288, + "loss": 0.1899, + "step": 2011 + }, + { + "epoch": 0.14517118222158087, + "grad_norm": 0.1193956658244133, + "learning_rate": 0.00019420695627074615, + "loss": 0.2205, + "step": 2012 + }, + { + "epoch": 0.1452433348966413, + "grad_norm": 0.15590347349643707, + "learning_rate": 0.0001942040698513494, + "loss": 0.2128, + "step": 2013 + }, + { + "epoch": 0.14531548757170173, + "grad_norm": 0.12089014053344727, + "learning_rate": 0.00019420118343195267, + "loss": 0.1908, + "step": 2014 + }, + { + "epoch": 0.14538764024676215, + "grad_norm": 0.09876030683517456, + "learning_rate": 0.00019419829701255593, + "loss": 0.1806, + "step": 2015 + }, + { + "epoch": 0.14545979292182257, + "grad_norm": 0.14723807573318481, + "learning_rate": 0.0001941954105931592, + "loss": 0.234, + "step": 2016 + }, + { + "epoch": 0.145531945596883, + "grad_norm": 0.11394286155700684, + "learning_rate": 0.00019419252417376246, + "loss": 0.1521, + "step": 2017 + }, + { + "epoch": 0.14560409827194343, + "grad_norm": 0.09444501250982285, + "learning_rate": 0.00019418963775436572, + "loss": 0.1686, + "step": 2018 + }, + { + "epoch": 0.14567625094700387, + "grad_norm": 0.11814798414707184, + "learning_rate": 0.00019418675133496899, + "loss": 0.1781, + "step": 2019 + }, + { + "epoch": 0.1457484036220643, + "grad_norm": 0.08465192466974258, + "learning_rate": 0.00019418386491557222, + "loss": 0.1583, + "step": 2020 + }, + { + "epoch": 0.1458205562971247, + "grad_norm": 0.10846804827451706, + "learning_rate": 0.0001941809784961755, + "loss": 0.1428, + "step": 2021 + }, + { + "epoch": 0.14589270897218515, + "grad_norm": 0.10981445759534836, + "learning_rate": 0.00019417809207677877, + "loss": 0.2108, + "step": 2022 + }, + { + "epoch": 0.14596486164724556, + "grad_norm": 0.10098119080066681, + "learning_rate": 0.00019417520565738204, + "loss": 0.155, + "step": 2023 + }, + { + "epoch": 0.146037014322306, + "grad_norm": 0.088902547955513, + "learning_rate": 0.0001941723192379853, + "loss": 0.1681, + "step": 2024 + }, + { + "epoch": 0.14610916699736642, + "grad_norm": 0.10116327553987503, + "learning_rate": 0.00019416943281858854, + "loss": 0.1989, + "step": 2025 + }, + { + "epoch": 0.14618131967242687, + "grad_norm": 0.09185565263032913, + "learning_rate": 0.0001941665463991918, + "loss": 0.1841, + "step": 2026 + }, + { + "epoch": 0.14625347234748728, + "grad_norm": 0.10266850143671036, + "learning_rate": 0.00019416365997979506, + "loss": 0.2001, + "step": 2027 + }, + { + "epoch": 0.1463256250225477, + "grad_norm": 0.1450365036725998, + "learning_rate": 0.00019416077356039835, + "loss": 0.1862, + "step": 2028 + }, + { + "epoch": 0.14639777769760814, + "grad_norm": 0.11125296354293823, + "learning_rate": 0.00019415788714100162, + "loss": 0.1762, + "step": 2029 + }, + { + "epoch": 0.14646993037266856, + "grad_norm": 0.09041983634233475, + "learning_rate": 0.00019415500072160485, + "loss": 0.1851, + "step": 2030 + }, + { + "epoch": 0.146542083047729, + "grad_norm": 0.10607034713029861, + "learning_rate": 0.00019415211430220811, + "loss": 0.1271, + "step": 2031 + }, + { + "epoch": 0.14661423572278942, + "grad_norm": 0.10995256155729294, + "learning_rate": 0.00019414922788281138, + "loss": 0.1591, + "step": 2032 + }, + { + "epoch": 0.14668638839784984, + "grad_norm": 0.11722482740879059, + "learning_rate": 0.00019414634146341464, + "loss": 0.1574, + "step": 2033 + }, + { + "epoch": 0.14675854107291028, + "grad_norm": 0.16310542821884155, + "learning_rate": 0.0001941434550440179, + "loss": 0.1847, + "step": 2034 + }, + { + "epoch": 0.1468306937479707, + "grad_norm": 0.1479705572128296, + "learning_rate": 0.00019414056862462117, + "loss": 0.198, + "step": 2035 + }, + { + "epoch": 0.14690284642303114, + "grad_norm": 0.09503147006034851, + "learning_rate": 0.00019413768220522443, + "loss": 0.1295, + "step": 2036 + }, + { + "epoch": 0.14697499909809156, + "grad_norm": 0.12894858419895172, + "learning_rate": 0.0001941347957858277, + "loss": 0.1788, + "step": 2037 + }, + { + "epoch": 0.147047151773152, + "grad_norm": 0.10748642683029175, + "learning_rate": 0.00019413190936643095, + "loss": 0.1436, + "step": 2038 + }, + { + "epoch": 0.14711930444821242, + "grad_norm": 0.097632497549057, + "learning_rate": 0.00019412902294703422, + "loss": 0.158, + "step": 2039 + }, + { + "epoch": 0.14719145712327283, + "grad_norm": 0.09195639938116074, + "learning_rate": 0.00019412613652763748, + "loss": 0.1506, + "step": 2040 + }, + { + "epoch": 0.14726360979833328, + "grad_norm": 0.11854150891304016, + "learning_rate": 0.00019412325010824072, + "loss": 0.1444, + "step": 2041 + }, + { + "epoch": 0.1473357624733937, + "grad_norm": 0.12206149101257324, + "learning_rate": 0.000194120363688844, + "loss": 0.2044, + "step": 2042 + }, + { + "epoch": 0.14740791514845414, + "grad_norm": 0.09963233768939972, + "learning_rate": 0.00019411747726944727, + "loss": 0.1322, + "step": 2043 + }, + { + "epoch": 0.14748006782351455, + "grad_norm": 0.09679488092660904, + "learning_rate": 0.00019411459085005053, + "loss": 0.1614, + "step": 2044 + }, + { + "epoch": 0.14755222049857497, + "grad_norm": 0.08919347077608109, + "learning_rate": 0.0001941117044306538, + "loss": 0.1638, + "step": 2045 + }, + { + "epoch": 0.14762437317363541, + "grad_norm": 0.10073181241750717, + "learning_rate": 0.00019410881801125703, + "loss": 0.1578, + "step": 2046 + }, + { + "epoch": 0.14769652584869583, + "grad_norm": 0.09946445375680923, + "learning_rate": 0.0001941059315918603, + "loss": 0.1246, + "step": 2047 + }, + { + "epoch": 0.14776867852375627, + "grad_norm": 0.12240669876337051, + "learning_rate": 0.00019410304517246356, + "loss": 0.1702, + "step": 2048 + }, + { + "epoch": 0.1478408311988167, + "grad_norm": 0.13062608242034912, + "learning_rate": 0.00019410015875306685, + "loss": 0.1688, + "step": 2049 + }, + { + "epoch": 0.14791298387387714, + "grad_norm": 0.10089318454265594, + "learning_rate": 0.0001940972723336701, + "loss": 0.1463, + "step": 2050 + }, + { + "epoch": 0.14798513654893755, + "grad_norm": 0.10280703753232956, + "learning_rate": 0.00019409438591427335, + "loss": 0.1733, + "step": 2051 + }, + { + "epoch": 0.14805728922399797, + "grad_norm": 0.10768252611160278, + "learning_rate": 0.0001940914994948766, + "loss": 0.1955, + "step": 2052 + }, + { + "epoch": 0.1481294418990584, + "grad_norm": 0.10142546147108078, + "learning_rate": 0.00019408861307547987, + "loss": 0.1717, + "step": 2053 + }, + { + "epoch": 0.14820159457411883, + "grad_norm": 0.08525700867176056, + "learning_rate": 0.00019408572665608313, + "loss": 0.1226, + "step": 2054 + }, + { + "epoch": 0.14827374724917927, + "grad_norm": 0.19013550877571106, + "learning_rate": 0.0001940828402366864, + "loss": 0.1807, + "step": 2055 + }, + { + "epoch": 0.1483458999242397, + "grad_norm": 0.08787938207387924, + "learning_rate": 0.00019407995381728966, + "loss": 0.1473, + "step": 2056 + }, + { + "epoch": 0.14841805259930013, + "grad_norm": 0.10118581354618073, + "learning_rate": 0.00019407706739789292, + "loss": 0.1856, + "step": 2057 + }, + { + "epoch": 0.14849020527436055, + "grad_norm": 0.10909256339073181, + "learning_rate": 0.00019407418097849619, + "loss": 0.1626, + "step": 2058 + }, + { + "epoch": 0.14856235794942096, + "grad_norm": 0.12503749132156372, + "learning_rate": 0.00019407129455909945, + "loss": 0.1626, + "step": 2059 + }, + { + "epoch": 0.1486345106244814, + "grad_norm": 0.0884174332022667, + "learning_rate": 0.0001940684081397027, + "loss": 0.1542, + "step": 2060 + }, + { + "epoch": 0.14870666329954182, + "grad_norm": 0.08488410711288452, + "learning_rate": 0.00019406552172030597, + "loss": 0.143, + "step": 2061 + }, + { + "epoch": 0.14877881597460227, + "grad_norm": 0.0907353013753891, + "learning_rate": 0.0001940626353009092, + "loss": 0.1397, + "step": 2062 + }, + { + "epoch": 0.14885096864966268, + "grad_norm": 0.10294927656650543, + "learning_rate": 0.00019405974888151247, + "loss": 0.2162, + "step": 2063 + }, + { + "epoch": 0.1489231213247231, + "grad_norm": 0.10255087167024612, + "learning_rate": 0.00019405686246211576, + "loss": 0.1783, + "step": 2064 + }, + { + "epoch": 0.14899527399978355, + "grad_norm": 0.10541793704032898, + "learning_rate": 0.00019405397604271903, + "loss": 0.1471, + "step": 2065 + }, + { + "epoch": 0.14906742667484396, + "grad_norm": 0.09721536934375763, + "learning_rate": 0.0001940510896233223, + "loss": 0.1421, + "step": 2066 + }, + { + "epoch": 0.1491395793499044, + "grad_norm": 0.09638044238090515, + "learning_rate": 0.00019404820320392552, + "loss": 0.1849, + "step": 2067 + }, + { + "epoch": 0.14921173202496482, + "grad_norm": 0.12224458903074265, + "learning_rate": 0.0001940453167845288, + "loss": 0.1846, + "step": 2068 + }, + { + "epoch": 0.14928388470002527, + "grad_norm": 0.13559360802173615, + "learning_rate": 0.00019404243036513205, + "loss": 0.1324, + "step": 2069 + }, + { + "epoch": 0.14935603737508568, + "grad_norm": 0.10047730058431625, + "learning_rate": 0.0001940395439457353, + "loss": 0.1733, + "step": 2070 + }, + { + "epoch": 0.1494281900501461, + "grad_norm": 0.10166924446821213, + "learning_rate": 0.0001940366575263386, + "loss": 0.1772, + "step": 2071 + }, + { + "epoch": 0.14950034272520654, + "grad_norm": 0.08213718980550766, + "learning_rate": 0.00019403377110694184, + "loss": 0.155, + "step": 2072 + }, + { + "epoch": 0.14957249540026696, + "grad_norm": 0.11648483574390411, + "learning_rate": 0.0001940308846875451, + "loss": 0.1453, + "step": 2073 + }, + { + "epoch": 0.1496446480753274, + "grad_norm": 0.09811491519212723, + "learning_rate": 0.00019402799826814837, + "loss": 0.1383, + "step": 2074 + }, + { + "epoch": 0.14971680075038782, + "grad_norm": 0.09943008422851562, + "learning_rate": 0.00019402511184875163, + "loss": 0.149, + "step": 2075 + }, + { + "epoch": 0.14978895342544823, + "grad_norm": 0.09095396101474762, + "learning_rate": 0.0001940222254293549, + "loss": 0.1747, + "step": 2076 + }, + { + "epoch": 0.14986110610050868, + "grad_norm": 0.09340521693229675, + "learning_rate": 0.00019401933900995815, + "loss": 0.1428, + "step": 2077 + }, + { + "epoch": 0.1499332587755691, + "grad_norm": 0.10803819447755814, + "learning_rate": 0.00019401645259056142, + "loss": 0.1745, + "step": 2078 + }, + { + "epoch": 0.15000541145062954, + "grad_norm": 0.0965447798371315, + "learning_rate": 0.00019401356617116468, + "loss": 0.173, + "step": 2079 + }, + { + "epoch": 0.15007756412568996, + "grad_norm": 0.09196187555789948, + "learning_rate": 0.00019401067975176794, + "loss": 0.2279, + "step": 2080 + }, + { + "epoch": 0.1501497168007504, + "grad_norm": 0.0951419547200203, + "learning_rate": 0.0001940077933323712, + "loss": 0.1623, + "step": 2081 + }, + { + "epoch": 0.15022186947581082, + "grad_norm": 0.09142082929611206, + "learning_rate": 0.00019400490691297447, + "loss": 0.1723, + "step": 2082 + }, + { + "epoch": 0.15029402215087123, + "grad_norm": 0.09847602993249893, + "learning_rate": 0.00019400202049357773, + "loss": 0.1311, + "step": 2083 + }, + { + "epoch": 0.15036617482593168, + "grad_norm": 0.11255521327257156, + "learning_rate": 0.00019399913407418097, + "loss": 0.1861, + "step": 2084 + }, + { + "epoch": 0.1504383275009921, + "grad_norm": 0.11026148498058319, + "learning_rate": 0.00019399624765478426, + "loss": 0.1332, + "step": 2085 + }, + { + "epoch": 0.15051048017605254, + "grad_norm": 0.09424188733100891, + "learning_rate": 0.00019399336123538752, + "loss": 0.1839, + "step": 2086 + }, + { + "epoch": 0.15058263285111295, + "grad_norm": 0.09201314300298691, + "learning_rate": 0.00019399047481599078, + "loss": 0.1479, + "step": 2087 + }, + { + "epoch": 0.1506547855261734, + "grad_norm": 0.12157163769006729, + "learning_rate": 0.00019398758839659405, + "loss": 0.1567, + "step": 2088 + }, + { + "epoch": 0.1507269382012338, + "grad_norm": 0.08465547859668732, + "learning_rate": 0.00019398470197719728, + "loss": 0.138, + "step": 2089 + }, + { + "epoch": 0.15079909087629423, + "grad_norm": 0.0765736773610115, + "learning_rate": 0.00019398181555780054, + "loss": 0.1473, + "step": 2090 + }, + { + "epoch": 0.15087124355135467, + "grad_norm": 0.09816624224185944, + "learning_rate": 0.0001939789291384038, + "loss": 0.1539, + "step": 2091 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 0.09986189752817154, + "learning_rate": 0.0001939760427190071, + "loss": 0.1679, + "step": 2092 + }, + { + "epoch": 0.15101554890147553, + "grad_norm": 0.11674615740776062, + "learning_rate": 0.00019397315629961036, + "loss": 0.1389, + "step": 2093 + }, + { + "epoch": 0.15108770157653595, + "grad_norm": 0.11395678669214249, + "learning_rate": 0.0001939702698802136, + "loss": 0.142, + "step": 2094 + }, + { + "epoch": 0.15115985425159637, + "grad_norm": 0.10832277685403824, + "learning_rate": 0.00019396738346081686, + "loss": 0.1535, + "step": 2095 + }, + { + "epoch": 0.1512320069266568, + "grad_norm": 0.11256370693445206, + "learning_rate": 0.00019396449704142012, + "loss": 0.1809, + "step": 2096 + }, + { + "epoch": 0.15130415960171723, + "grad_norm": 0.11528280377388, + "learning_rate": 0.00019396161062202339, + "loss": 0.1339, + "step": 2097 + }, + { + "epoch": 0.15137631227677767, + "grad_norm": 0.09868552535772324, + "learning_rate": 0.00019395872420262665, + "loss": 0.1552, + "step": 2098 + }, + { + "epoch": 0.15144846495183809, + "grad_norm": 0.1049208790063858, + "learning_rate": 0.0001939558377832299, + "loss": 0.1688, + "step": 2099 + }, + { + "epoch": 0.15152061762689853, + "grad_norm": 0.08519754558801651, + "learning_rate": 0.00019395295136383317, + "loss": 0.1878, + "step": 2100 + }, + { + "epoch": 0.15159277030195895, + "grad_norm": 0.09680404514074326, + "learning_rate": 0.00019395006494443644, + "loss": 0.1586, + "step": 2101 + }, + { + "epoch": 0.15166492297701936, + "grad_norm": 0.11290711164474487, + "learning_rate": 0.0001939471785250397, + "loss": 0.1597, + "step": 2102 + }, + { + "epoch": 0.1517370756520798, + "grad_norm": 0.09537848085165024, + "learning_rate": 0.00019394429210564296, + "loss": 0.2275, + "step": 2103 + }, + { + "epoch": 0.15180922832714022, + "grad_norm": 0.08265725523233414, + "learning_rate": 0.00019394140568624623, + "loss": 0.1428, + "step": 2104 + }, + { + "epoch": 0.15188138100220067, + "grad_norm": 0.09027748554944992, + "learning_rate": 0.00019393851926684946, + "loss": 0.1272, + "step": 2105 + }, + { + "epoch": 0.15195353367726108, + "grad_norm": 0.10689380019903183, + "learning_rate": 0.00019393563284745275, + "loss": 0.1369, + "step": 2106 + }, + { + "epoch": 0.1520256863523215, + "grad_norm": 0.10896710306406021, + "learning_rate": 0.00019393274642805601, + "loss": 0.1801, + "step": 2107 + }, + { + "epoch": 0.15209783902738194, + "grad_norm": 0.13935106992721558, + "learning_rate": 0.00019392986000865928, + "loss": 0.1628, + "step": 2108 + }, + { + "epoch": 0.15216999170244236, + "grad_norm": 0.12007452547550201, + "learning_rate": 0.00019392697358926254, + "loss": 0.1343, + "step": 2109 + }, + { + "epoch": 0.1522421443775028, + "grad_norm": 0.1309816837310791, + "learning_rate": 0.00019392408716986578, + "loss": 0.1575, + "step": 2110 + }, + { + "epoch": 0.15231429705256322, + "grad_norm": 0.13151468336582184, + "learning_rate": 0.00019392120075046904, + "loss": 0.1374, + "step": 2111 + }, + { + "epoch": 0.15238644972762366, + "grad_norm": 0.11823946237564087, + "learning_rate": 0.0001939183143310723, + "loss": 0.1567, + "step": 2112 + }, + { + "epoch": 0.15245860240268408, + "grad_norm": 0.1083201989531517, + "learning_rate": 0.0001939154279116756, + "loss": 0.1858, + "step": 2113 + }, + { + "epoch": 0.1525307550777445, + "grad_norm": 0.1123339906334877, + "learning_rate": 0.00019391254149227885, + "loss": 0.1731, + "step": 2114 + }, + { + "epoch": 0.15260290775280494, + "grad_norm": 0.08939201384782791, + "learning_rate": 0.0001939096550728821, + "loss": 0.1823, + "step": 2115 + }, + { + "epoch": 0.15267506042786536, + "grad_norm": 0.09028434008359909, + "learning_rate": 0.00019390676865348535, + "loss": 0.1502, + "step": 2116 + }, + { + "epoch": 0.1527472131029258, + "grad_norm": 0.1175919696688652, + "learning_rate": 0.00019390388223408862, + "loss": 0.1618, + "step": 2117 + }, + { + "epoch": 0.15281936577798622, + "grad_norm": 0.12090510874986649, + "learning_rate": 0.00019390099581469188, + "loss": 0.1444, + "step": 2118 + }, + { + "epoch": 0.15289151845304666, + "grad_norm": 0.1123272255063057, + "learning_rate": 0.00019389810939529514, + "loss": 0.1279, + "step": 2119 + }, + { + "epoch": 0.15296367112810708, + "grad_norm": 0.10467422753572464, + "learning_rate": 0.0001938952229758984, + "loss": 0.192, + "step": 2120 + }, + { + "epoch": 0.1530358238031675, + "grad_norm": 0.1312311738729477, + "learning_rate": 0.00019389233655650167, + "loss": 0.1893, + "step": 2121 + }, + { + "epoch": 0.15310797647822794, + "grad_norm": 0.1281033158302307, + "learning_rate": 0.00019388945013710493, + "loss": 0.1529, + "step": 2122 + }, + { + "epoch": 0.15318012915328835, + "grad_norm": 0.08082175254821777, + "learning_rate": 0.0001938865637177082, + "loss": 0.1717, + "step": 2123 + }, + { + "epoch": 0.1532522818283488, + "grad_norm": 0.15641988813877106, + "learning_rate": 0.00019388367729831146, + "loss": 0.1819, + "step": 2124 + }, + { + "epoch": 0.1533244345034092, + "grad_norm": 0.09862814843654633, + "learning_rate": 0.00019388079087891472, + "loss": 0.1349, + "step": 2125 + }, + { + "epoch": 0.15339658717846963, + "grad_norm": 0.09552069008350372, + "learning_rate": 0.00019387790445951796, + "loss": 0.2239, + "step": 2126 + }, + { + "epoch": 0.15346873985353007, + "grad_norm": 0.142461359500885, + "learning_rate": 0.00019387501804012125, + "loss": 0.1413, + "step": 2127 + }, + { + "epoch": 0.1535408925285905, + "grad_norm": 0.09516191482543945, + "learning_rate": 0.0001938721316207245, + "loss": 0.1565, + "step": 2128 + }, + { + "epoch": 0.15361304520365093, + "grad_norm": 0.09291546791791916, + "learning_rate": 0.00019386924520132777, + "loss": 0.1653, + "step": 2129 + }, + { + "epoch": 0.15368519787871135, + "grad_norm": 0.16194333136081696, + "learning_rate": 0.00019386635878193103, + "loss": 0.2035, + "step": 2130 + }, + { + "epoch": 0.1537573505537718, + "grad_norm": 0.11512897163629532, + "learning_rate": 0.00019386347236253427, + "loss": 0.1014, + "step": 2131 + }, + { + "epoch": 0.1538295032288322, + "grad_norm": 0.11555943638086319, + "learning_rate": 0.00019386058594313753, + "loss": 0.2309, + "step": 2132 + }, + { + "epoch": 0.15390165590389263, + "grad_norm": 0.10416556894779205, + "learning_rate": 0.0001938576995237408, + "loss": 0.1617, + "step": 2133 + }, + { + "epoch": 0.15397380857895307, + "grad_norm": 0.10757438093423843, + "learning_rate": 0.00019385481310434409, + "loss": 0.2139, + "step": 2134 + }, + { + "epoch": 0.15404596125401349, + "grad_norm": 0.10977016389369965, + "learning_rate": 0.00019385192668494735, + "loss": 0.2053, + "step": 2135 + }, + { + "epoch": 0.15411811392907393, + "grad_norm": 0.10284312814474106, + "learning_rate": 0.00019384904026555059, + "loss": 0.1563, + "step": 2136 + }, + { + "epoch": 0.15419026660413435, + "grad_norm": 0.11248984932899475, + "learning_rate": 0.00019384615384615385, + "loss": 0.1516, + "step": 2137 + }, + { + "epoch": 0.15426241927919476, + "grad_norm": 0.0976649597287178, + "learning_rate": 0.0001938432674267571, + "loss": 0.1447, + "step": 2138 + }, + { + "epoch": 0.1543345719542552, + "grad_norm": 0.12282803654670715, + "learning_rate": 0.00019384038100736037, + "loss": 0.1759, + "step": 2139 + }, + { + "epoch": 0.15440672462931562, + "grad_norm": 0.11239445954561234, + "learning_rate": 0.00019383749458796364, + "loss": 0.2223, + "step": 2140 + }, + { + "epoch": 0.15447887730437607, + "grad_norm": 0.11776033788919449, + "learning_rate": 0.0001938346081685669, + "loss": 0.2105, + "step": 2141 + }, + { + "epoch": 0.15455102997943648, + "grad_norm": 0.0812140703201294, + "learning_rate": 0.00019383172174917016, + "loss": 0.1552, + "step": 2142 + }, + { + "epoch": 0.15462318265449693, + "grad_norm": 0.12144932895898819, + "learning_rate": 0.00019382883532977343, + "loss": 0.165, + "step": 2143 + }, + { + "epoch": 0.15469533532955734, + "grad_norm": 0.12033209204673767, + "learning_rate": 0.0001938259489103767, + "loss": 0.1523, + "step": 2144 + }, + { + "epoch": 0.15476748800461776, + "grad_norm": 0.09720002114772797, + "learning_rate": 0.00019382306249097995, + "loss": 0.1637, + "step": 2145 + }, + { + "epoch": 0.1548396406796782, + "grad_norm": 0.12371323257684708, + "learning_rate": 0.00019382017607158321, + "loss": 0.2387, + "step": 2146 + }, + { + "epoch": 0.15491179335473862, + "grad_norm": 0.11101663112640381, + "learning_rate": 0.00019381728965218645, + "loss": 0.1638, + "step": 2147 + }, + { + "epoch": 0.15498394602979906, + "grad_norm": 0.10234714299440384, + "learning_rate": 0.00019381440323278974, + "loss": 0.1607, + "step": 2148 + }, + { + "epoch": 0.15505609870485948, + "grad_norm": 0.11836856603622437, + "learning_rate": 0.000193811516813393, + "loss": 0.1584, + "step": 2149 + }, + { + "epoch": 0.15512825137991992, + "grad_norm": 0.08731173723936081, + "learning_rate": 0.00019380863039399627, + "loss": 0.1681, + "step": 2150 + }, + { + "epoch": 0.15520040405498034, + "grad_norm": 0.09717448055744171, + "learning_rate": 0.00019380574397459953, + "loss": 0.1758, + "step": 2151 + }, + { + "epoch": 0.15527255673004076, + "grad_norm": 0.11712459474802017, + "learning_rate": 0.00019380285755520276, + "loss": 0.1758, + "step": 2152 + }, + { + "epoch": 0.1553447094051012, + "grad_norm": 0.08976966142654419, + "learning_rate": 0.00019379997113580603, + "loss": 0.1641, + "step": 2153 + }, + { + "epoch": 0.15541686208016162, + "grad_norm": 0.08658537268638611, + "learning_rate": 0.0001937970847164093, + "loss": 0.1674, + "step": 2154 + }, + { + "epoch": 0.15548901475522206, + "grad_norm": 0.09062183648347855, + "learning_rate": 0.00019379419829701258, + "loss": 0.143, + "step": 2155 + }, + { + "epoch": 0.15556116743028248, + "grad_norm": 0.10794931650161743, + "learning_rate": 0.00019379131187761584, + "loss": 0.2079, + "step": 2156 + }, + { + "epoch": 0.1556333201053429, + "grad_norm": 0.14580285549163818, + "learning_rate": 0.00019378842545821908, + "loss": 0.1764, + "step": 2157 + }, + { + "epoch": 0.15570547278040334, + "grad_norm": 0.0971672311425209, + "learning_rate": 0.00019378553903882234, + "loss": 0.1797, + "step": 2158 + }, + { + "epoch": 0.15577762545546375, + "grad_norm": 0.10162393748760223, + "learning_rate": 0.0001937826526194256, + "loss": 0.1689, + "step": 2159 + }, + { + "epoch": 0.1558497781305242, + "grad_norm": 0.09460558742284775, + "learning_rate": 0.00019377976620002887, + "loss": 0.158, + "step": 2160 + }, + { + "epoch": 0.1559219308055846, + "grad_norm": 0.11674042046070099, + "learning_rate": 0.00019377687978063213, + "loss": 0.1995, + "step": 2161 + }, + { + "epoch": 0.15599408348064506, + "grad_norm": 0.09924761950969696, + "learning_rate": 0.0001937739933612354, + "loss": 0.1761, + "step": 2162 + }, + { + "epoch": 0.15606623615570547, + "grad_norm": 0.12028750777244568, + "learning_rate": 0.00019377110694183866, + "loss": 0.1683, + "step": 2163 + }, + { + "epoch": 0.1561383888307659, + "grad_norm": 0.10652889311313629, + "learning_rate": 0.00019376822052244192, + "loss": 0.1868, + "step": 2164 + }, + { + "epoch": 0.15621054150582633, + "grad_norm": 0.1042298898100853, + "learning_rate": 0.00019376533410304518, + "loss": 0.1889, + "step": 2165 + }, + { + "epoch": 0.15628269418088675, + "grad_norm": 0.0930410623550415, + "learning_rate": 0.00019376244768364845, + "loss": 0.1612, + "step": 2166 + }, + { + "epoch": 0.1563548468559472, + "grad_norm": 0.08166609704494476, + "learning_rate": 0.0001937595612642517, + "loss": 0.1889, + "step": 2167 + }, + { + "epoch": 0.1564269995310076, + "grad_norm": 0.09500760585069656, + "learning_rate": 0.00019375667484485494, + "loss": 0.1847, + "step": 2168 + }, + { + "epoch": 0.15649915220606803, + "grad_norm": 0.10849113762378693, + "learning_rate": 0.00019375378842545823, + "loss": 0.1745, + "step": 2169 + }, + { + "epoch": 0.15657130488112847, + "grad_norm": 0.08661068975925446, + "learning_rate": 0.0001937509020060615, + "loss": 0.1455, + "step": 2170 + }, + { + "epoch": 0.1566434575561889, + "grad_norm": 0.11696840822696686, + "learning_rate": 0.00019374801558666476, + "loss": 0.1396, + "step": 2171 + }, + { + "epoch": 0.15671561023124933, + "grad_norm": 0.08419200032949448, + "learning_rate": 0.00019374512916726802, + "loss": 0.1242, + "step": 2172 + }, + { + "epoch": 0.15678776290630975, + "grad_norm": 0.08925656229257584, + "learning_rate": 0.00019374224274787126, + "loss": 0.1693, + "step": 2173 + }, + { + "epoch": 0.1568599155813702, + "grad_norm": 0.10732999444007874, + "learning_rate": 0.00019373935632847452, + "loss": 0.1577, + "step": 2174 + }, + { + "epoch": 0.1569320682564306, + "grad_norm": 0.08582869917154312, + "learning_rate": 0.00019373646990907778, + "loss": 0.128, + "step": 2175 + }, + { + "epoch": 0.15700422093149102, + "grad_norm": 0.10227958112955093, + "learning_rate": 0.00019373358348968107, + "loss": 0.1814, + "step": 2176 + }, + { + "epoch": 0.15707637360655147, + "grad_norm": 0.10264535993337631, + "learning_rate": 0.00019373069707028434, + "loss": 0.1629, + "step": 2177 + }, + { + "epoch": 0.15714852628161188, + "grad_norm": 0.08098578453063965, + "learning_rate": 0.00019372781065088757, + "loss": 0.1958, + "step": 2178 + }, + { + "epoch": 0.15722067895667233, + "grad_norm": 0.09199430793523788, + "learning_rate": 0.00019372492423149084, + "loss": 0.2053, + "step": 2179 + }, + { + "epoch": 0.15729283163173274, + "grad_norm": 0.10968173295259476, + "learning_rate": 0.0001937220378120941, + "loss": 0.1753, + "step": 2180 + }, + { + "epoch": 0.1573649843067932, + "grad_norm": 0.1071987971663475, + "learning_rate": 0.00019371915139269736, + "loss": 0.1894, + "step": 2181 + }, + { + "epoch": 0.1574371369818536, + "grad_norm": 0.1399054378271103, + "learning_rate": 0.00019371626497330063, + "loss": 0.1645, + "step": 2182 + }, + { + "epoch": 0.15750928965691402, + "grad_norm": 0.11898241192102432, + "learning_rate": 0.0001937133785539039, + "loss": 0.1419, + "step": 2183 + }, + { + "epoch": 0.15758144233197446, + "grad_norm": 0.12177052348852158, + "learning_rate": 0.00019371049213450715, + "loss": 0.2028, + "step": 2184 + }, + { + "epoch": 0.15765359500703488, + "grad_norm": 0.12613415718078613, + "learning_rate": 0.00019370760571511041, + "loss": 0.193, + "step": 2185 + }, + { + "epoch": 0.15772574768209532, + "grad_norm": 0.1313813477754593, + "learning_rate": 0.00019370471929571368, + "loss": 0.1802, + "step": 2186 + }, + { + "epoch": 0.15779790035715574, + "grad_norm": 0.10271768271923065, + "learning_rate": 0.00019370183287631694, + "loss": 0.1372, + "step": 2187 + }, + { + "epoch": 0.15787005303221616, + "grad_norm": 0.11866496503353119, + "learning_rate": 0.0001936989464569202, + "loss": 0.1582, + "step": 2188 + }, + { + "epoch": 0.1579422057072766, + "grad_norm": 0.08273667097091675, + "learning_rate": 0.00019369606003752347, + "loss": 0.2097, + "step": 2189 + }, + { + "epoch": 0.15801435838233702, + "grad_norm": 0.08744383603334427, + "learning_rate": 0.00019369317361812673, + "loss": 0.229, + "step": 2190 + }, + { + "epoch": 0.15808651105739746, + "grad_norm": 0.10181351006031036, + "learning_rate": 0.00019369028719873, + "loss": 0.1521, + "step": 2191 + }, + { + "epoch": 0.15815866373245788, + "grad_norm": 0.09820724278688431, + "learning_rate": 0.00019368740077933325, + "loss": 0.2011, + "step": 2192 + }, + { + "epoch": 0.15823081640751832, + "grad_norm": 0.09312766790390015, + "learning_rate": 0.00019368451435993652, + "loss": 0.1985, + "step": 2193 + }, + { + "epoch": 0.15830296908257874, + "grad_norm": 0.10480905324220657, + "learning_rate": 0.00019368162794053978, + "loss": 0.1862, + "step": 2194 + }, + { + "epoch": 0.15837512175763915, + "grad_norm": 0.10711691528558731, + "learning_rate": 0.00019367874152114302, + "loss": 0.1601, + "step": 2195 + }, + { + "epoch": 0.1584472744326996, + "grad_norm": 0.1277097761631012, + "learning_rate": 0.00019367585510174628, + "loss": 0.2108, + "step": 2196 + }, + { + "epoch": 0.15851942710776, + "grad_norm": 0.14156398177146912, + "learning_rate": 0.00019367296868234957, + "loss": 0.1836, + "step": 2197 + }, + { + "epoch": 0.15859157978282046, + "grad_norm": 0.0874231830239296, + "learning_rate": 0.00019367008226295283, + "loss": 0.202, + "step": 2198 + }, + { + "epoch": 0.15866373245788087, + "grad_norm": 0.08548527210950851, + "learning_rate": 0.0001936671958435561, + "loss": 0.1939, + "step": 2199 + }, + { + "epoch": 0.1587358851329413, + "grad_norm": 0.10133644193410873, + "learning_rate": 0.00019366430942415933, + "loss": 0.1292, + "step": 2200 + }, + { + "epoch": 0.15880803780800173, + "grad_norm": 0.09973689913749695, + "learning_rate": 0.0001936614230047626, + "loss": 0.1267, + "step": 2201 + }, + { + "epoch": 0.15888019048306215, + "grad_norm": 0.11650566756725311, + "learning_rate": 0.00019365853658536586, + "loss": 0.1686, + "step": 2202 + }, + { + "epoch": 0.1589523431581226, + "grad_norm": 0.09599412977695465, + "learning_rate": 0.00019365565016596912, + "loss": 0.1675, + "step": 2203 + }, + { + "epoch": 0.159024495833183, + "grad_norm": 0.12936724722385406, + "learning_rate": 0.0001936527637465724, + "loss": 0.1783, + "step": 2204 + }, + { + "epoch": 0.15909664850824345, + "grad_norm": 0.1200418621301651, + "learning_rate": 0.00019364987732717565, + "loss": 0.2125, + "step": 2205 + }, + { + "epoch": 0.15916880118330387, + "grad_norm": 0.12506982684135437, + "learning_rate": 0.0001936469909077789, + "loss": 0.1663, + "step": 2206 + }, + { + "epoch": 0.1592409538583643, + "grad_norm": 0.12607517838478088, + "learning_rate": 0.00019364410448838217, + "loss": 0.1716, + "step": 2207 + }, + { + "epoch": 0.15931310653342473, + "grad_norm": 0.12602943181991577, + "learning_rate": 0.00019364121806898543, + "loss": 0.1949, + "step": 2208 + }, + { + "epoch": 0.15938525920848515, + "grad_norm": 0.09044279903173447, + "learning_rate": 0.0001936383316495887, + "loss": 0.1656, + "step": 2209 + }, + { + "epoch": 0.1594574118835456, + "grad_norm": 0.1014801487326622, + "learning_rate": 0.00019363544523019196, + "loss": 0.1379, + "step": 2210 + }, + { + "epoch": 0.159529564558606, + "grad_norm": 0.143855020403862, + "learning_rate": 0.00019363255881079522, + "loss": 0.1992, + "step": 2211 + }, + { + "epoch": 0.15960171723366645, + "grad_norm": 0.09931483119726181, + "learning_rate": 0.00019362967239139849, + "loss": 0.1715, + "step": 2212 + }, + { + "epoch": 0.15967386990872687, + "grad_norm": 0.11490177363157272, + "learning_rate": 0.00019362678597200175, + "loss": 0.1756, + "step": 2213 + }, + { + "epoch": 0.15974602258378728, + "grad_norm": 0.10598494857549667, + "learning_rate": 0.000193623899552605, + "loss": 0.1875, + "step": 2214 + }, + { + "epoch": 0.15981817525884773, + "grad_norm": 0.109975166618824, + "learning_rate": 0.00019362101313320827, + "loss": 0.1705, + "step": 2215 + }, + { + "epoch": 0.15989032793390814, + "grad_norm": 0.1269775778055191, + "learning_rate": 0.0001936181267138115, + "loss": 0.2032, + "step": 2216 + }, + { + "epoch": 0.1599624806089686, + "grad_norm": 0.09906476736068726, + "learning_rate": 0.00019361524029441477, + "loss": 0.181, + "step": 2217 + }, + { + "epoch": 0.160034633284029, + "grad_norm": 0.09717681258916855, + "learning_rate": 0.00019361235387501806, + "loss": 0.1818, + "step": 2218 + }, + { + "epoch": 0.16010678595908942, + "grad_norm": 0.0946667492389679, + "learning_rate": 0.00019360946745562133, + "loss": 0.1486, + "step": 2219 + }, + { + "epoch": 0.16017893863414986, + "grad_norm": 0.1109989807009697, + "learning_rate": 0.0001936065810362246, + "loss": 0.1428, + "step": 2220 + }, + { + "epoch": 0.16025109130921028, + "grad_norm": 0.10091498494148254, + "learning_rate": 0.00019360369461682782, + "loss": 0.169, + "step": 2221 + }, + { + "epoch": 0.16032324398427072, + "grad_norm": 0.10776679217815399, + "learning_rate": 0.0001936008081974311, + "loss": 0.1712, + "step": 2222 + }, + { + "epoch": 0.16039539665933114, + "grad_norm": 0.11089781671762466, + "learning_rate": 0.00019359792177803435, + "loss": 0.2127, + "step": 2223 + }, + { + "epoch": 0.16046754933439158, + "grad_norm": 0.1164630874991417, + "learning_rate": 0.00019359503535863761, + "loss": 0.1456, + "step": 2224 + }, + { + "epoch": 0.160539702009452, + "grad_norm": 0.11441965401172638, + "learning_rate": 0.0001935921489392409, + "loss": 0.164, + "step": 2225 + }, + { + "epoch": 0.16061185468451242, + "grad_norm": 0.10857464373111725, + "learning_rate": 0.00019358926251984414, + "loss": 0.1597, + "step": 2226 + }, + { + "epoch": 0.16068400735957286, + "grad_norm": 0.11473193019628525, + "learning_rate": 0.0001935863761004474, + "loss": 0.1757, + "step": 2227 + }, + { + "epoch": 0.16075616003463328, + "grad_norm": 0.12451031059026718, + "learning_rate": 0.00019358348968105067, + "loss": 0.1495, + "step": 2228 + }, + { + "epoch": 0.16082831270969372, + "grad_norm": 0.12104455381631851, + "learning_rate": 0.00019358060326165393, + "loss": 0.1389, + "step": 2229 + }, + { + "epoch": 0.16090046538475414, + "grad_norm": 0.12772192060947418, + "learning_rate": 0.0001935777168422572, + "loss": 0.208, + "step": 2230 + }, + { + "epoch": 0.16097261805981455, + "grad_norm": 0.13302107155323029, + "learning_rate": 0.00019357483042286045, + "loss": 0.1717, + "step": 2231 + }, + { + "epoch": 0.161044770734875, + "grad_norm": 0.11099439114332199, + "learning_rate": 0.00019357194400346372, + "loss": 0.1563, + "step": 2232 + }, + { + "epoch": 0.16111692340993541, + "grad_norm": 0.13218525052070618, + "learning_rate": 0.00019356905758406698, + "loss": 0.1688, + "step": 2233 + }, + { + "epoch": 0.16118907608499586, + "grad_norm": 0.1205550953745842, + "learning_rate": 0.00019356617116467024, + "loss": 0.1322, + "step": 2234 + }, + { + "epoch": 0.16126122876005627, + "grad_norm": 0.16065192222595215, + "learning_rate": 0.0001935632847452735, + "loss": 0.1724, + "step": 2235 + }, + { + "epoch": 0.16133338143511672, + "grad_norm": 0.08642034977674484, + "learning_rate": 0.00019356039832587677, + "loss": 0.1477, + "step": 2236 + }, + { + "epoch": 0.16140553411017713, + "grad_norm": 0.10994623601436615, + "learning_rate": 0.00019355751190648, + "loss": 0.1642, + "step": 2237 + }, + { + "epoch": 0.16147768678523755, + "grad_norm": 0.12689784169197083, + "learning_rate": 0.00019355462548708327, + "loss": 0.1916, + "step": 2238 + }, + { + "epoch": 0.161549839460298, + "grad_norm": 0.09766849130392075, + "learning_rate": 0.00019355173906768656, + "loss": 0.1336, + "step": 2239 + }, + { + "epoch": 0.1616219921353584, + "grad_norm": 0.11204902082681656, + "learning_rate": 0.00019354885264828982, + "loss": 0.2044, + "step": 2240 + }, + { + "epoch": 0.16169414481041886, + "grad_norm": 0.11278552561998367, + "learning_rate": 0.00019354596622889308, + "loss": 0.153, + "step": 2241 + }, + { + "epoch": 0.16176629748547927, + "grad_norm": 0.10636945068836212, + "learning_rate": 0.00019354307980949632, + "loss": 0.1827, + "step": 2242 + }, + { + "epoch": 0.16183845016053972, + "grad_norm": 0.10965976119041443, + "learning_rate": 0.00019354019339009958, + "loss": 0.1321, + "step": 2243 + }, + { + "epoch": 0.16191060283560013, + "grad_norm": 0.11676888912916183, + "learning_rate": 0.00019353730697070285, + "loss": 0.1791, + "step": 2244 + }, + { + "epoch": 0.16198275551066055, + "grad_norm": 0.11976998299360275, + "learning_rate": 0.0001935344205513061, + "loss": 0.2231, + "step": 2245 + }, + { + "epoch": 0.162054908185721, + "grad_norm": 0.1357678771018982, + "learning_rate": 0.0001935315341319094, + "loss": 0.1573, + "step": 2246 + }, + { + "epoch": 0.1621270608607814, + "grad_norm": 0.09829548001289368, + "learning_rate": 0.00019352864771251263, + "loss": 0.1865, + "step": 2247 + }, + { + "epoch": 0.16219921353584185, + "grad_norm": 0.08333700150251389, + "learning_rate": 0.0001935257612931159, + "loss": 0.1929, + "step": 2248 + }, + { + "epoch": 0.16227136621090227, + "grad_norm": 0.09477890282869339, + "learning_rate": 0.00019352287487371916, + "loss": 0.1576, + "step": 2249 + }, + { + "epoch": 0.16234351888596268, + "grad_norm": 0.0912022739648819, + "learning_rate": 0.00019351998845432242, + "loss": 0.1589, + "step": 2250 + }, + { + "epoch": 0.16241567156102313, + "grad_norm": 0.11333397775888443, + "learning_rate": 0.00019351710203492569, + "loss": 0.1641, + "step": 2251 + }, + { + "epoch": 0.16248782423608354, + "grad_norm": 0.10127349197864532, + "learning_rate": 0.00019351421561552895, + "loss": 0.1438, + "step": 2252 + }, + { + "epoch": 0.162559976911144, + "grad_norm": 0.09116199612617493, + "learning_rate": 0.0001935113291961322, + "loss": 0.2014, + "step": 2253 + }, + { + "epoch": 0.1626321295862044, + "grad_norm": 0.09721777588129044, + "learning_rate": 0.00019350844277673547, + "loss": 0.1275, + "step": 2254 + }, + { + "epoch": 0.16270428226126485, + "grad_norm": 0.12168851494789124, + "learning_rate": 0.00019350555635733874, + "loss": 0.1581, + "step": 2255 + }, + { + "epoch": 0.16277643493632526, + "grad_norm": 0.1065860167145729, + "learning_rate": 0.000193502669937942, + "loss": 0.1792, + "step": 2256 + }, + { + "epoch": 0.16284858761138568, + "grad_norm": 0.11777549237012863, + "learning_rate": 0.00019349978351854526, + "loss": 0.1588, + "step": 2257 + }, + { + "epoch": 0.16292074028644613, + "grad_norm": 0.1272294819355011, + "learning_rate": 0.0001934968970991485, + "loss": 0.1635, + "step": 2258 + }, + { + "epoch": 0.16299289296150654, + "grad_norm": 0.12159106135368347, + "learning_rate": 0.00019349401067975176, + "loss": 0.1577, + "step": 2259 + }, + { + "epoch": 0.16306504563656699, + "grad_norm": 0.10105059295892715, + "learning_rate": 0.00019349112426035502, + "loss": 0.1884, + "step": 2260 + }, + { + "epoch": 0.1631371983116274, + "grad_norm": 0.1152854785323143, + "learning_rate": 0.00019348823784095831, + "loss": 0.1443, + "step": 2261 + }, + { + "epoch": 0.16320935098668782, + "grad_norm": 0.0869210883975029, + "learning_rate": 0.00019348535142156158, + "loss": 0.1848, + "step": 2262 + }, + { + "epoch": 0.16328150366174826, + "grad_norm": 0.1061026081442833, + "learning_rate": 0.0001934824650021648, + "loss": 0.1658, + "step": 2263 + }, + { + "epoch": 0.16335365633680868, + "grad_norm": 0.10692035406827927, + "learning_rate": 0.00019347957858276808, + "loss": 0.1439, + "step": 2264 + }, + { + "epoch": 0.16342580901186912, + "grad_norm": 0.0852183923125267, + "learning_rate": 0.00019347669216337134, + "loss": 0.1552, + "step": 2265 + }, + { + "epoch": 0.16349796168692954, + "grad_norm": 0.08484194427728653, + "learning_rate": 0.0001934738057439746, + "loss": 0.1303, + "step": 2266 + }, + { + "epoch": 0.16357011436198998, + "grad_norm": 0.09818069636821747, + "learning_rate": 0.00019347091932457787, + "loss": 0.1647, + "step": 2267 + }, + { + "epoch": 0.1636422670370504, + "grad_norm": 0.12069786339998245, + "learning_rate": 0.00019346803290518113, + "loss": 0.1827, + "step": 2268 + }, + { + "epoch": 0.16371441971211081, + "grad_norm": 0.10593380779027939, + "learning_rate": 0.0001934651464857844, + "loss": 0.1872, + "step": 2269 + }, + { + "epoch": 0.16378657238717126, + "grad_norm": 0.11723898351192474, + "learning_rate": 0.00019346226006638765, + "loss": 0.1629, + "step": 2270 + }, + { + "epoch": 0.16385872506223167, + "grad_norm": 0.09910007566213608, + "learning_rate": 0.00019345937364699092, + "loss": 0.1778, + "step": 2271 + }, + { + "epoch": 0.16393087773729212, + "grad_norm": 0.10543759912252426, + "learning_rate": 0.00019345648722759418, + "loss": 0.2005, + "step": 2272 + }, + { + "epoch": 0.16400303041235254, + "grad_norm": 0.10561928153038025, + "learning_rate": 0.00019345360080819744, + "loss": 0.1714, + "step": 2273 + }, + { + "epoch": 0.16407518308741298, + "grad_norm": 0.08312105387449265, + "learning_rate": 0.00019345071438880068, + "loss": 0.1865, + "step": 2274 + }, + { + "epoch": 0.1641473357624734, + "grad_norm": 0.08049603551626205, + "learning_rate": 0.00019344782796940397, + "loss": 0.1019, + "step": 2275 + }, + { + "epoch": 0.1642194884375338, + "grad_norm": 0.10335752367973328, + "learning_rate": 0.00019344494155000723, + "loss": 0.2238, + "step": 2276 + }, + { + "epoch": 0.16429164111259426, + "grad_norm": 0.10471571236848831, + "learning_rate": 0.0001934420551306105, + "loss": 0.1611, + "step": 2277 + }, + { + "epoch": 0.16436379378765467, + "grad_norm": 0.10903488099575043, + "learning_rate": 0.00019343916871121376, + "loss": 0.1799, + "step": 2278 + }, + { + "epoch": 0.16443594646271512, + "grad_norm": 0.10458233207464218, + "learning_rate": 0.000193436282291817, + "loss": 0.1665, + "step": 2279 + }, + { + "epoch": 0.16450809913777553, + "grad_norm": 0.12355650961399078, + "learning_rate": 0.00019343339587242026, + "loss": 0.2636, + "step": 2280 + }, + { + "epoch": 0.16458025181283595, + "grad_norm": 0.11420013010501862, + "learning_rate": 0.00019343050945302352, + "loss": 0.1303, + "step": 2281 + }, + { + "epoch": 0.1646524044878964, + "grad_norm": 0.11854225397109985, + "learning_rate": 0.0001934276230336268, + "loss": 0.1505, + "step": 2282 + }, + { + "epoch": 0.1647245571629568, + "grad_norm": 0.14060252904891968, + "learning_rate": 0.00019342473661423007, + "loss": 0.1488, + "step": 2283 + }, + { + "epoch": 0.16479670983801725, + "grad_norm": 0.11233499646186829, + "learning_rate": 0.0001934218501948333, + "loss": 0.1616, + "step": 2284 + }, + { + "epoch": 0.16486886251307767, + "grad_norm": 0.10339425504207611, + "learning_rate": 0.00019341896377543657, + "loss": 0.1892, + "step": 2285 + }, + { + "epoch": 0.1649410151881381, + "grad_norm": 0.13305120170116425, + "learning_rate": 0.00019341607735603983, + "loss": 0.212, + "step": 2286 + }, + { + "epoch": 0.16501316786319853, + "grad_norm": 0.09930705279111862, + "learning_rate": 0.0001934131909366431, + "loss": 0.1401, + "step": 2287 + }, + { + "epoch": 0.16508532053825895, + "grad_norm": 0.09781865030527115, + "learning_rate": 0.00019341030451724636, + "loss": 0.1714, + "step": 2288 + }, + { + "epoch": 0.1651574732133194, + "grad_norm": 0.10273580253124237, + "learning_rate": 0.00019340741809784962, + "loss": 0.1578, + "step": 2289 + }, + { + "epoch": 0.1652296258883798, + "grad_norm": 0.10975068807601929, + "learning_rate": 0.00019340453167845289, + "loss": 0.1728, + "step": 2290 + }, + { + "epoch": 0.16530177856344025, + "grad_norm": 0.10452619940042496, + "learning_rate": 0.00019340164525905615, + "loss": 0.1471, + "step": 2291 + }, + { + "epoch": 0.16537393123850067, + "grad_norm": 0.11115576326847076, + "learning_rate": 0.0001933987588396594, + "loss": 0.1697, + "step": 2292 + }, + { + "epoch": 0.16544608391356108, + "grad_norm": 0.11397110670804977, + "learning_rate": 0.00019339587242026267, + "loss": 0.1428, + "step": 2293 + }, + { + "epoch": 0.16551823658862153, + "grad_norm": 0.12284787744283676, + "learning_rate": 0.00019339298600086594, + "loss": 0.1353, + "step": 2294 + }, + { + "epoch": 0.16559038926368194, + "grad_norm": 0.10357421636581421, + "learning_rate": 0.0001933900995814692, + "loss": 0.1307, + "step": 2295 + }, + { + "epoch": 0.16566254193874239, + "grad_norm": 0.10247401148080826, + "learning_rate": 0.00019338721316207246, + "loss": 0.1461, + "step": 2296 + }, + { + "epoch": 0.1657346946138028, + "grad_norm": 0.09464439004659653, + "learning_rate": 0.00019338432674267573, + "loss": 0.1649, + "step": 2297 + }, + { + "epoch": 0.16580684728886325, + "grad_norm": 0.10897719860076904, + "learning_rate": 0.000193381440323279, + "loss": 0.2221, + "step": 2298 + }, + { + "epoch": 0.16587899996392366, + "grad_norm": 0.1040739193558693, + "learning_rate": 0.00019337855390388225, + "loss": 0.1588, + "step": 2299 + }, + { + "epoch": 0.16595115263898408, + "grad_norm": 0.11330358684062958, + "learning_rate": 0.00019337566748448551, + "loss": 0.172, + "step": 2300 + }, + { + "epoch": 0.16602330531404452, + "grad_norm": 0.10044106096029282, + "learning_rate": 0.00019337278106508875, + "loss": 0.1805, + "step": 2301 + }, + { + "epoch": 0.16609545798910494, + "grad_norm": 0.0981152132153511, + "learning_rate": 0.000193369894645692, + "loss": 0.1646, + "step": 2302 + }, + { + "epoch": 0.16616761066416538, + "grad_norm": 0.09416420757770538, + "learning_rate": 0.0001933670082262953, + "loss": 0.1378, + "step": 2303 + }, + { + "epoch": 0.1662397633392258, + "grad_norm": 0.14568422734737396, + "learning_rate": 0.00019336412180689857, + "loss": 0.1851, + "step": 2304 + }, + { + "epoch": 0.16631191601428624, + "grad_norm": 0.0925934836268425, + "learning_rate": 0.00019336123538750183, + "loss": 0.1784, + "step": 2305 + }, + { + "epoch": 0.16638406868934666, + "grad_norm": 0.11252991110086441, + "learning_rate": 0.00019335834896810506, + "loss": 0.1639, + "step": 2306 + }, + { + "epoch": 0.16645622136440708, + "grad_norm": 0.10067819058895111, + "learning_rate": 0.00019335546254870833, + "loss": 0.1457, + "step": 2307 + }, + { + "epoch": 0.16652837403946752, + "grad_norm": 0.10660653561353683, + "learning_rate": 0.0001933525761293116, + "loss": 0.1504, + "step": 2308 + }, + { + "epoch": 0.16660052671452794, + "grad_norm": 0.10056259483098984, + "learning_rate": 0.00019334968970991485, + "loss": 0.2131, + "step": 2309 + }, + { + "epoch": 0.16667267938958838, + "grad_norm": 0.10658378154039383, + "learning_rate": 0.00019334680329051814, + "loss": 0.174, + "step": 2310 + }, + { + "epoch": 0.1667448320646488, + "grad_norm": 0.12302286922931671, + "learning_rate": 0.00019334391687112138, + "loss": 0.1729, + "step": 2311 + }, + { + "epoch": 0.1668169847397092, + "grad_norm": 0.09981415420770645, + "learning_rate": 0.00019334103045172464, + "loss": 0.165, + "step": 2312 + }, + { + "epoch": 0.16688913741476966, + "grad_norm": 0.10932218283414841, + "learning_rate": 0.0001933381440323279, + "loss": 0.1452, + "step": 2313 + }, + { + "epoch": 0.16696129008983007, + "grad_norm": 0.09633667767047882, + "learning_rate": 0.00019333525761293117, + "loss": 0.1822, + "step": 2314 + }, + { + "epoch": 0.16703344276489052, + "grad_norm": 0.10372579097747803, + "learning_rate": 0.00019333237119353443, + "loss": 0.1534, + "step": 2315 + }, + { + "epoch": 0.16710559543995093, + "grad_norm": 0.1061662957072258, + "learning_rate": 0.0001933294847741377, + "loss": 0.1239, + "step": 2316 + }, + { + "epoch": 0.16717774811501138, + "grad_norm": 0.09024469554424286, + "learning_rate": 0.00019332659835474096, + "loss": 0.1143, + "step": 2317 + }, + { + "epoch": 0.1672499007900718, + "grad_norm": 0.10284189879894257, + "learning_rate": 0.00019332371193534422, + "loss": 0.1853, + "step": 2318 + }, + { + "epoch": 0.1673220534651322, + "grad_norm": 0.10474637895822525, + "learning_rate": 0.00019332082551594748, + "loss": 0.1611, + "step": 2319 + }, + { + "epoch": 0.16739420614019265, + "grad_norm": 0.10879724472761154, + "learning_rate": 0.00019331793909655075, + "loss": 0.1297, + "step": 2320 + }, + { + "epoch": 0.16746635881525307, + "grad_norm": 0.11982034891843796, + "learning_rate": 0.000193315052677154, + "loss": 0.1903, + "step": 2321 + }, + { + "epoch": 0.1675385114903135, + "grad_norm": 0.1146751120686531, + "learning_rate": 0.00019331216625775724, + "loss": 0.1926, + "step": 2322 + }, + { + "epoch": 0.16761066416537393, + "grad_norm": 0.12385010719299316, + "learning_rate": 0.0001933092798383605, + "loss": 0.1683, + "step": 2323 + }, + { + "epoch": 0.16768281684043435, + "grad_norm": 0.10480430722236633, + "learning_rate": 0.0001933063934189638, + "loss": 0.2119, + "step": 2324 + }, + { + "epoch": 0.1677549695154948, + "grad_norm": 0.15246498584747314, + "learning_rate": 0.00019330350699956706, + "loss": 0.1945, + "step": 2325 + }, + { + "epoch": 0.1678271221905552, + "grad_norm": 0.09928609430789948, + "learning_rate": 0.00019330062058017032, + "loss": 0.2073, + "step": 2326 + }, + { + "epoch": 0.16789927486561565, + "grad_norm": 0.10196902602910995, + "learning_rate": 0.00019329773416077356, + "loss": 0.1749, + "step": 2327 + }, + { + "epoch": 0.16797142754067607, + "grad_norm": 0.11232678592205048, + "learning_rate": 0.00019329484774137682, + "loss": 0.1576, + "step": 2328 + }, + { + "epoch": 0.1680435802157365, + "grad_norm": 0.10379444807767868, + "learning_rate": 0.00019329196132198008, + "loss": 0.1423, + "step": 2329 + }, + { + "epoch": 0.16811573289079693, + "grad_norm": 0.10844355821609497, + "learning_rate": 0.00019328907490258335, + "loss": 0.1809, + "step": 2330 + }, + { + "epoch": 0.16818788556585734, + "grad_norm": 0.09713105112314224, + "learning_rate": 0.00019328618848318664, + "loss": 0.1865, + "step": 2331 + }, + { + "epoch": 0.1682600382409178, + "grad_norm": 0.08709347248077393, + "learning_rate": 0.00019328330206378987, + "loss": 0.1612, + "step": 2332 + }, + { + "epoch": 0.1683321909159782, + "grad_norm": 0.09003622084856033, + "learning_rate": 0.00019328041564439314, + "loss": 0.1728, + "step": 2333 + }, + { + "epoch": 0.16840434359103865, + "grad_norm": 0.07673250883817673, + "learning_rate": 0.0001932775292249964, + "loss": 0.1515, + "step": 2334 + }, + { + "epoch": 0.16847649626609906, + "grad_norm": 0.1532372087240219, + "learning_rate": 0.00019327464280559966, + "loss": 0.1785, + "step": 2335 + }, + { + "epoch": 0.1685486489411595, + "grad_norm": 0.10392007976770401, + "learning_rate": 0.00019327175638620293, + "loss": 0.1887, + "step": 2336 + }, + { + "epoch": 0.16862080161621992, + "grad_norm": 0.15098121762275696, + "learning_rate": 0.0001932688699668062, + "loss": 0.1305, + "step": 2337 + }, + { + "epoch": 0.16869295429128034, + "grad_norm": 0.15759523212909698, + "learning_rate": 0.00019326598354740945, + "loss": 0.1842, + "step": 2338 + }, + { + "epoch": 0.16876510696634078, + "grad_norm": 0.12343751639127731, + "learning_rate": 0.00019326309712801271, + "loss": 0.1749, + "step": 2339 + }, + { + "epoch": 0.1688372596414012, + "grad_norm": 0.09935485571622849, + "learning_rate": 0.00019326021070861598, + "loss": 0.1787, + "step": 2340 + }, + { + "epoch": 0.16890941231646164, + "grad_norm": 0.10357770323753357, + "learning_rate": 0.00019325732428921924, + "loss": 0.1751, + "step": 2341 + }, + { + "epoch": 0.16898156499152206, + "grad_norm": 0.1449156552553177, + "learning_rate": 0.0001932544378698225, + "loss": 0.1437, + "step": 2342 + }, + { + "epoch": 0.16905371766658248, + "grad_norm": 0.10184285044670105, + "learning_rate": 0.00019325155145042574, + "loss": 0.1394, + "step": 2343 + }, + { + "epoch": 0.16912587034164292, + "grad_norm": 0.08780080825090408, + "learning_rate": 0.000193248665031029, + "loss": 0.1691, + "step": 2344 + }, + { + "epoch": 0.16919802301670334, + "grad_norm": 0.12476243823766708, + "learning_rate": 0.0001932457786116323, + "loss": 0.1498, + "step": 2345 + }, + { + "epoch": 0.16927017569176378, + "grad_norm": 0.11500517278909683, + "learning_rate": 0.00019324289219223555, + "loss": 0.2247, + "step": 2346 + }, + { + "epoch": 0.1693423283668242, + "grad_norm": 0.10371655970811844, + "learning_rate": 0.00019324000577283882, + "loss": 0.1764, + "step": 2347 + }, + { + "epoch": 0.16941448104188464, + "grad_norm": 0.1306602954864502, + "learning_rate": 0.00019323711935344205, + "loss": 0.1783, + "step": 2348 + }, + { + "epoch": 0.16948663371694506, + "grad_norm": 0.08568400889635086, + "learning_rate": 0.00019323423293404532, + "loss": 0.1726, + "step": 2349 + }, + { + "epoch": 0.16955878639200547, + "grad_norm": 0.13382382690906525, + "learning_rate": 0.00019323134651464858, + "loss": 0.1861, + "step": 2350 + }, + { + "epoch": 0.16963093906706592, + "grad_norm": 0.1182415708899498, + "learning_rate": 0.00019322846009525184, + "loss": 0.1278, + "step": 2351 + }, + { + "epoch": 0.16970309174212633, + "grad_norm": 0.11527302116155624, + "learning_rate": 0.00019322557367585513, + "loss": 0.1562, + "step": 2352 + }, + { + "epoch": 0.16977524441718678, + "grad_norm": 0.11253364384174347, + "learning_rate": 0.00019322268725645837, + "loss": 0.0954, + "step": 2353 + }, + { + "epoch": 0.1698473970922472, + "grad_norm": 0.12901516258716583, + "learning_rate": 0.00019321980083706163, + "loss": 0.1748, + "step": 2354 + }, + { + "epoch": 0.1699195497673076, + "grad_norm": 0.10245874524116516, + "learning_rate": 0.0001932169144176649, + "loss": 0.1775, + "step": 2355 + }, + { + "epoch": 0.16999170244236805, + "grad_norm": 0.09780610352754593, + "learning_rate": 0.00019321402799826816, + "loss": 0.155, + "step": 2356 + }, + { + "epoch": 0.17006385511742847, + "grad_norm": 0.09187139570713043, + "learning_rate": 0.00019321114157887142, + "loss": 0.1706, + "step": 2357 + }, + { + "epoch": 0.1701360077924889, + "grad_norm": 0.144823357462883, + "learning_rate": 0.00019320825515947468, + "loss": 0.1642, + "step": 2358 + }, + { + "epoch": 0.17020816046754933, + "grad_norm": 0.1118035688996315, + "learning_rate": 0.00019320536874007795, + "loss": 0.1515, + "step": 2359 + }, + { + "epoch": 0.17028031314260977, + "grad_norm": 0.10937260091304779, + "learning_rate": 0.0001932024823206812, + "loss": 0.165, + "step": 2360 + }, + { + "epoch": 0.1703524658176702, + "grad_norm": 0.12276444584131241, + "learning_rate": 0.00019319959590128447, + "loss": 0.1457, + "step": 2361 + }, + { + "epoch": 0.1704246184927306, + "grad_norm": 0.09414833784103394, + "learning_rate": 0.00019319670948188773, + "loss": 0.1957, + "step": 2362 + }, + { + "epoch": 0.17049677116779105, + "grad_norm": 0.13359293341636658, + "learning_rate": 0.000193193823062491, + "loss": 0.1163, + "step": 2363 + }, + { + "epoch": 0.17056892384285147, + "grad_norm": 0.12425096333026886, + "learning_rate": 0.00019319093664309423, + "loss": 0.1781, + "step": 2364 + }, + { + "epoch": 0.1706410765179119, + "grad_norm": 0.10758412629365921, + "learning_rate": 0.0001931880502236975, + "loss": 0.1899, + "step": 2365 + }, + { + "epoch": 0.17071322919297233, + "grad_norm": 0.09605345875024796, + "learning_rate": 0.00019318516380430079, + "loss": 0.1694, + "step": 2366 + }, + { + "epoch": 0.17078538186803277, + "grad_norm": 0.12173870205879211, + "learning_rate": 0.00019318227738490405, + "loss": 0.1703, + "step": 2367 + }, + { + "epoch": 0.1708575345430932, + "grad_norm": 0.12638935446739197, + "learning_rate": 0.0001931793909655073, + "loss": 0.2067, + "step": 2368 + }, + { + "epoch": 0.1709296872181536, + "grad_norm": 0.11101670563220978, + "learning_rate": 0.00019317650454611055, + "loss": 0.2279, + "step": 2369 + }, + { + "epoch": 0.17100183989321405, + "grad_norm": 0.09994448721408844, + "learning_rate": 0.0001931736181267138, + "loss": 0.218, + "step": 2370 + }, + { + "epoch": 0.17107399256827446, + "grad_norm": 0.1153847947716713, + "learning_rate": 0.00019317073170731707, + "loss": 0.2328, + "step": 2371 + }, + { + "epoch": 0.1711461452433349, + "grad_norm": 0.13979651033878326, + "learning_rate": 0.00019316784528792034, + "loss": 0.1631, + "step": 2372 + }, + { + "epoch": 0.17121829791839532, + "grad_norm": 0.11289111524820328, + "learning_rate": 0.00019316495886852363, + "loss": 0.1486, + "step": 2373 + }, + { + "epoch": 0.17129045059345574, + "grad_norm": 0.08866613358259201, + "learning_rate": 0.00019316207244912686, + "loss": 0.1804, + "step": 2374 + }, + { + "epoch": 0.17136260326851618, + "grad_norm": 0.12603819370269775, + "learning_rate": 0.00019315918602973013, + "loss": 0.1929, + "step": 2375 + }, + { + "epoch": 0.1714347559435766, + "grad_norm": 0.1151173859834671, + "learning_rate": 0.0001931562996103334, + "loss": 0.1452, + "step": 2376 + }, + { + "epoch": 0.17150690861863704, + "grad_norm": 0.10496576875448227, + "learning_rate": 0.00019315341319093665, + "loss": 0.1625, + "step": 2377 + }, + { + "epoch": 0.17157906129369746, + "grad_norm": 0.15824727714061737, + "learning_rate": 0.00019315052677153991, + "loss": 0.1282, + "step": 2378 + }, + { + "epoch": 0.1716512139687579, + "grad_norm": 0.08507123589515686, + "learning_rate": 0.00019314764035214318, + "loss": 0.147, + "step": 2379 + }, + { + "epoch": 0.17172336664381832, + "grad_norm": 0.11039454489946365, + "learning_rate": 0.00019314475393274644, + "loss": 0.1098, + "step": 2380 + }, + { + "epoch": 0.17179551931887874, + "grad_norm": 0.08755354583263397, + "learning_rate": 0.0001931418675133497, + "loss": 0.1515, + "step": 2381 + }, + { + "epoch": 0.17186767199393918, + "grad_norm": 0.11836013942956924, + "learning_rate": 0.00019313898109395297, + "loss": 0.1708, + "step": 2382 + }, + { + "epoch": 0.1719398246689996, + "grad_norm": 0.13358359038829803, + "learning_rate": 0.00019313609467455623, + "loss": 0.1909, + "step": 2383 + }, + { + "epoch": 0.17201197734406004, + "grad_norm": 0.09466656297445297, + "learning_rate": 0.0001931332082551595, + "loss": 0.1541, + "step": 2384 + }, + { + "epoch": 0.17208413001912046, + "grad_norm": 0.10225172340869904, + "learning_rate": 0.00019313032183576273, + "loss": 0.1458, + "step": 2385 + }, + { + "epoch": 0.17215628269418087, + "grad_norm": 0.09411462396383286, + "learning_rate": 0.000193127435416366, + "loss": 0.171, + "step": 2386 + }, + { + "epoch": 0.17222843536924132, + "grad_norm": 0.11404234915971756, + "learning_rate": 0.00019312454899696928, + "loss": 0.1751, + "step": 2387 + }, + { + "epoch": 0.17230058804430173, + "grad_norm": 0.1240854263305664, + "learning_rate": 0.00019312166257757254, + "loss": 0.2152, + "step": 2388 + }, + { + "epoch": 0.17237274071936218, + "grad_norm": 0.09119916707277298, + "learning_rate": 0.0001931187761581758, + "loss": 0.1732, + "step": 2389 + }, + { + "epoch": 0.1724448933944226, + "grad_norm": 0.12059430032968521, + "learning_rate": 0.00019311588973877904, + "loss": 0.2001, + "step": 2390 + }, + { + "epoch": 0.17251704606948304, + "grad_norm": 0.10738098621368408, + "learning_rate": 0.0001931130033193823, + "loss": 0.183, + "step": 2391 + }, + { + "epoch": 0.17258919874454345, + "grad_norm": 0.08014591783285141, + "learning_rate": 0.00019311011689998557, + "loss": 0.1185, + "step": 2392 + }, + { + "epoch": 0.17266135141960387, + "grad_norm": 0.15968464314937592, + "learning_rate": 0.00019310723048058883, + "loss": 0.1773, + "step": 2393 + }, + { + "epoch": 0.17273350409466431, + "grad_norm": 0.10324010252952576, + "learning_rate": 0.00019310434406119212, + "loss": 0.1825, + "step": 2394 + }, + { + "epoch": 0.17280565676972473, + "grad_norm": 0.12751446664333344, + "learning_rate": 0.00019310145764179536, + "loss": 0.158, + "step": 2395 + }, + { + "epoch": 0.17287780944478517, + "grad_norm": 0.10440117120742798, + "learning_rate": 0.00019309857122239862, + "loss": 0.1772, + "step": 2396 + }, + { + "epoch": 0.1729499621198456, + "grad_norm": 0.11839499324560165, + "learning_rate": 0.00019309568480300188, + "loss": 0.1507, + "step": 2397 + }, + { + "epoch": 0.17302211479490603, + "grad_norm": 0.09740056842565536, + "learning_rate": 0.00019309279838360515, + "loss": 0.1601, + "step": 2398 + }, + { + "epoch": 0.17309426746996645, + "grad_norm": 0.08723609149456024, + "learning_rate": 0.0001930899119642084, + "loss": 0.1731, + "step": 2399 + }, + { + "epoch": 0.17316642014502687, + "grad_norm": 0.1224941685795784, + "learning_rate": 0.00019308702554481167, + "loss": 0.1299, + "step": 2400 + }, + { + "epoch": 0.1732385728200873, + "grad_norm": 0.12198638170957565, + "learning_rate": 0.00019308413912541493, + "loss": 0.1758, + "step": 2401 + }, + { + "epoch": 0.17331072549514773, + "grad_norm": 0.11759866029024124, + "learning_rate": 0.0001930812527060182, + "loss": 0.1445, + "step": 2402 + }, + { + "epoch": 0.17338287817020817, + "grad_norm": 0.11716272681951523, + "learning_rate": 0.00019307836628662146, + "loss": 0.1983, + "step": 2403 + }, + { + "epoch": 0.1734550308452686, + "grad_norm": 0.11761771887540817, + "learning_rate": 0.00019307547986722472, + "loss": 0.1539, + "step": 2404 + }, + { + "epoch": 0.173527183520329, + "grad_norm": 0.11381730437278748, + "learning_rate": 0.00019307259344782799, + "loss": 0.1637, + "step": 2405 + }, + { + "epoch": 0.17359933619538945, + "grad_norm": 0.11773020029067993, + "learning_rate": 0.00019306970702843122, + "loss": 0.1539, + "step": 2406 + }, + { + "epoch": 0.17367148887044986, + "grad_norm": 0.08517422527074814, + "learning_rate": 0.00019306682060903448, + "loss": 0.2072, + "step": 2407 + }, + { + "epoch": 0.1737436415455103, + "grad_norm": 0.13268540799617767, + "learning_rate": 0.00019306393418963777, + "loss": 0.1786, + "step": 2408 + }, + { + "epoch": 0.17381579422057072, + "grad_norm": 0.10123803466558456, + "learning_rate": 0.00019306104777024104, + "loss": 0.1678, + "step": 2409 + }, + { + "epoch": 0.17388794689563117, + "grad_norm": 0.10507965087890625, + "learning_rate": 0.0001930581613508443, + "loss": 0.1454, + "step": 2410 + }, + { + "epoch": 0.17396009957069158, + "grad_norm": 0.10920926183462143, + "learning_rate": 0.00019305527493144754, + "loss": 0.1498, + "step": 2411 + }, + { + "epoch": 0.174032252245752, + "grad_norm": 0.10431352257728577, + "learning_rate": 0.0001930523885120508, + "loss": 0.1378, + "step": 2412 + }, + { + "epoch": 0.17410440492081244, + "grad_norm": 0.10723818838596344, + "learning_rate": 0.00019304950209265406, + "loss": 0.1334, + "step": 2413 + }, + { + "epoch": 0.17417655759587286, + "grad_norm": 0.10780136287212372, + "learning_rate": 0.00019304661567325732, + "loss": 0.1427, + "step": 2414 + }, + { + "epoch": 0.1742487102709333, + "grad_norm": 0.18205569684505463, + "learning_rate": 0.00019304372925386061, + "loss": 0.2193, + "step": 2415 + }, + { + "epoch": 0.17432086294599372, + "grad_norm": 0.15683774650096893, + "learning_rate": 0.00019304084283446385, + "loss": 0.1789, + "step": 2416 + }, + { + "epoch": 0.17439301562105414, + "grad_norm": 0.08884182572364807, + "learning_rate": 0.00019303795641506711, + "loss": 0.1223, + "step": 2417 + }, + { + "epoch": 0.17446516829611458, + "grad_norm": 0.11683040857315063, + "learning_rate": 0.00019303506999567038, + "loss": 0.1857, + "step": 2418 + }, + { + "epoch": 0.174537320971175, + "grad_norm": 0.15970462560653687, + "learning_rate": 0.00019303218357627364, + "loss": 0.1684, + "step": 2419 + }, + { + "epoch": 0.17460947364623544, + "grad_norm": 0.11105070263147354, + "learning_rate": 0.0001930292971568769, + "loss": 0.1835, + "step": 2420 + }, + { + "epoch": 0.17468162632129586, + "grad_norm": 0.12702690064907074, + "learning_rate": 0.00019302641073748017, + "loss": 0.1917, + "step": 2421 + }, + { + "epoch": 0.1747537789963563, + "grad_norm": 0.10951557010412216, + "learning_rate": 0.00019302352431808343, + "loss": 0.154, + "step": 2422 + }, + { + "epoch": 0.17482593167141672, + "grad_norm": 0.10169512778520584, + "learning_rate": 0.0001930206378986867, + "loss": 0.1557, + "step": 2423 + }, + { + "epoch": 0.17489808434647713, + "grad_norm": 0.1406065970659256, + "learning_rate": 0.00019301775147928995, + "loss": 0.1466, + "step": 2424 + }, + { + "epoch": 0.17497023702153758, + "grad_norm": 0.15568317472934723, + "learning_rate": 0.00019301486505989322, + "loss": 0.1885, + "step": 2425 + }, + { + "epoch": 0.175042389696598, + "grad_norm": 0.10522514581680298, + "learning_rate": 0.00019301197864049648, + "loss": 0.1622, + "step": 2426 + }, + { + "epoch": 0.17511454237165844, + "grad_norm": 0.1026366651058197, + "learning_rate": 0.00019300909222109974, + "loss": 0.1999, + "step": 2427 + }, + { + "epoch": 0.17518669504671885, + "grad_norm": 0.1487097442150116, + "learning_rate": 0.00019300620580170298, + "loss": 0.2077, + "step": 2428 + }, + { + "epoch": 0.1752588477217793, + "grad_norm": 0.10587791353464127, + "learning_rate": 0.00019300331938230627, + "loss": 0.1311, + "step": 2429 + }, + { + "epoch": 0.17533100039683971, + "grad_norm": 0.10131558030843735, + "learning_rate": 0.00019300043296290953, + "loss": 0.1353, + "step": 2430 + }, + { + "epoch": 0.17540315307190013, + "grad_norm": 0.11770360916852951, + "learning_rate": 0.0001929975465435128, + "loss": 0.19, + "step": 2431 + }, + { + "epoch": 0.17547530574696057, + "grad_norm": 0.10171361267566681, + "learning_rate": 0.00019299466012411606, + "loss": 0.1825, + "step": 2432 + }, + { + "epoch": 0.175547458422021, + "grad_norm": 0.12811347842216492, + "learning_rate": 0.0001929917737047193, + "loss": 0.1983, + "step": 2433 + }, + { + "epoch": 0.17561961109708144, + "grad_norm": 0.10407224297523499, + "learning_rate": 0.00019298888728532256, + "loss": 0.1566, + "step": 2434 + }, + { + "epoch": 0.17569176377214185, + "grad_norm": 0.09604757279157639, + "learning_rate": 0.00019298600086592582, + "loss": 0.1551, + "step": 2435 + }, + { + "epoch": 0.17576391644720227, + "grad_norm": 0.09429436177015305, + "learning_rate": 0.0001929831144465291, + "loss": 0.1617, + "step": 2436 + }, + { + "epoch": 0.1758360691222627, + "grad_norm": 0.09811285138130188, + "learning_rate": 0.00019298022802713237, + "loss": 0.1994, + "step": 2437 + }, + { + "epoch": 0.17590822179732313, + "grad_norm": 0.09046891331672668, + "learning_rate": 0.0001929773416077356, + "loss": 0.1091, + "step": 2438 + }, + { + "epoch": 0.17598037447238357, + "grad_norm": 0.130147784948349, + "learning_rate": 0.00019297445518833887, + "loss": 0.1649, + "step": 2439 + }, + { + "epoch": 0.176052527147444, + "grad_norm": 0.10541129112243652, + "learning_rate": 0.00019297156876894213, + "loss": 0.1769, + "step": 2440 + }, + { + "epoch": 0.17612467982250443, + "grad_norm": 0.12361346185207367, + "learning_rate": 0.0001929686823495454, + "loss": 0.1589, + "step": 2441 + }, + { + "epoch": 0.17619683249756485, + "grad_norm": 0.12085054069757462, + "learning_rate": 0.00019296579593014866, + "loss": 0.1732, + "step": 2442 + }, + { + "epoch": 0.17626898517262526, + "grad_norm": 0.10131417214870453, + "learning_rate": 0.00019296290951075192, + "loss": 0.1803, + "step": 2443 + }, + { + "epoch": 0.1763411378476857, + "grad_norm": 0.09739714860916138, + "learning_rate": 0.00019296002309135519, + "loss": 0.1738, + "step": 2444 + }, + { + "epoch": 0.17641329052274612, + "grad_norm": 0.09897683560848236, + "learning_rate": 0.00019295713667195845, + "loss": 0.187, + "step": 2445 + }, + { + "epoch": 0.17648544319780657, + "grad_norm": 0.09419049322605133, + "learning_rate": 0.0001929542502525617, + "loss": 0.1542, + "step": 2446 + }, + { + "epoch": 0.17655759587286698, + "grad_norm": 0.10498613864183426, + "learning_rate": 0.00019295136383316497, + "loss": 0.176, + "step": 2447 + }, + { + "epoch": 0.1766297485479274, + "grad_norm": 0.13526107370853424, + "learning_rate": 0.00019294847741376824, + "loss": 0.1597, + "step": 2448 + }, + { + "epoch": 0.17670190122298784, + "grad_norm": 0.09594099223613739, + "learning_rate": 0.00019294559099437147, + "loss": 0.1705, + "step": 2449 + }, + { + "epoch": 0.17677405389804826, + "grad_norm": 0.1210499256849289, + "learning_rate": 0.00019294270457497476, + "loss": 0.1298, + "step": 2450 + }, + { + "epoch": 0.1768462065731087, + "grad_norm": 0.10663893073797226, + "learning_rate": 0.00019293981815557803, + "loss": 0.1931, + "step": 2451 + }, + { + "epoch": 0.17691835924816912, + "grad_norm": 0.10738261789083481, + "learning_rate": 0.0001929369317361813, + "loss": 0.1941, + "step": 2452 + }, + { + "epoch": 0.17699051192322957, + "grad_norm": 0.10399194061756134, + "learning_rate": 0.00019293404531678455, + "loss": 0.2344, + "step": 2453 + }, + { + "epoch": 0.17706266459828998, + "grad_norm": 0.09402377903461456, + "learning_rate": 0.0001929311588973878, + "loss": 0.1542, + "step": 2454 + }, + { + "epoch": 0.1771348172733504, + "grad_norm": 0.11488594114780426, + "learning_rate": 0.00019292827247799105, + "loss": 0.1302, + "step": 2455 + }, + { + "epoch": 0.17720696994841084, + "grad_norm": 0.10841730237007141, + "learning_rate": 0.0001929253860585943, + "loss": 0.1471, + "step": 2456 + }, + { + "epoch": 0.17727912262347126, + "grad_norm": 0.09893527626991272, + "learning_rate": 0.00019292249963919758, + "loss": 0.1645, + "step": 2457 + }, + { + "epoch": 0.1773512752985317, + "grad_norm": 0.1275603026151657, + "learning_rate": 0.00019291961321980087, + "loss": 0.1024, + "step": 2458 + }, + { + "epoch": 0.17742342797359212, + "grad_norm": 0.10018088668584824, + "learning_rate": 0.0001929167268004041, + "loss": 0.1572, + "step": 2459 + }, + { + "epoch": 0.17749558064865256, + "grad_norm": 0.13826268911361694, + "learning_rate": 0.00019291384038100737, + "loss": 0.1963, + "step": 2460 + }, + { + "epoch": 0.17756773332371298, + "grad_norm": 0.12875942885875702, + "learning_rate": 0.00019291095396161063, + "loss": 0.1136, + "step": 2461 + }, + { + "epoch": 0.1776398859987734, + "grad_norm": 0.0969284251332283, + "learning_rate": 0.0001929080675422139, + "loss": 0.1702, + "step": 2462 + }, + { + "epoch": 0.17771203867383384, + "grad_norm": 0.09803799539804459, + "learning_rate": 0.00019290518112281715, + "loss": 0.193, + "step": 2463 + }, + { + "epoch": 0.17778419134889425, + "grad_norm": 0.13163645565509796, + "learning_rate": 0.00019290229470342042, + "loss": 0.1273, + "step": 2464 + }, + { + "epoch": 0.1778563440239547, + "grad_norm": 0.10827518999576569, + "learning_rate": 0.00019289940828402368, + "loss": 0.1639, + "step": 2465 + }, + { + "epoch": 0.17792849669901512, + "grad_norm": 0.10406279563903809, + "learning_rate": 0.00019289652186462694, + "loss": 0.1677, + "step": 2466 + }, + { + "epoch": 0.17800064937407553, + "grad_norm": 0.11019009351730347, + "learning_rate": 0.0001928936354452302, + "loss": 0.1992, + "step": 2467 + }, + { + "epoch": 0.17807280204913598, + "grad_norm": 0.1016637310385704, + "learning_rate": 0.00019289074902583347, + "loss": 0.1469, + "step": 2468 + }, + { + "epoch": 0.1781449547241964, + "grad_norm": 0.10010644793510437, + "learning_rate": 0.00019288786260643673, + "loss": 0.1226, + "step": 2469 + }, + { + "epoch": 0.17821710739925684, + "grad_norm": 0.19892233610153198, + "learning_rate": 0.00019288497618703997, + "loss": 0.2157, + "step": 2470 + }, + { + "epoch": 0.17828926007431725, + "grad_norm": 0.11943595111370087, + "learning_rate": 0.00019288208976764323, + "loss": 0.1482, + "step": 2471 + }, + { + "epoch": 0.1783614127493777, + "grad_norm": 0.09678040444850922, + "learning_rate": 0.00019287920334824652, + "loss": 0.1555, + "step": 2472 + }, + { + "epoch": 0.1784335654244381, + "grad_norm": 0.11203937232494354, + "learning_rate": 0.00019287631692884978, + "loss": 0.1725, + "step": 2473 + }, + { + "epoch": 0.17850571809949853, + "grad_norm": 0.10264791548252106, + "learning_rate": 0.00019287343050945305, + "loss": 0.1598, + "step": 2474 + }, + { + "epoch": 0.17857787077455897, + "grad_norm": 0.11397009342908859, + "learning_rate": 0.00019287054409005628, + "loss": 0.1627, + "step": 2475 + }, + { + "epoch": 0.1786500234496194, + "grad_norm": 0.11981376260519028, + "learning_rate": 0.00019286765767065954, + "loss": 0.1643, + "step": 2476 + }, + { + "epoch": 0.17872217612467983, + "grad_norm": 0.1312798261642456, + "learning_rate": 0.0001928647712512628, + "loss": 0.1575, + "step": 2477 + }, + { + "epoch": 0.17879432879974025, + "grad_norm": 0.140218123793602, + "learning_rate": 0.00019286188483186607, + "loss": 0.1921, + "step": 2478 + }, + { + "epoch": 0.17886648147480066, + "grad_norm": 0.1026497408747673, + "learning_rate": 0.00019285899841246936, + "loss": 0.2242, + "step": 2479 + }, + { + "epoch": 0.1789386341498611, + "grad_norm": 0.1413601189851761, + "learning_rate": 0.0001928561119930726, + "loss": 0.1298, + "step": 2480 + }, + { + "epoch": 0.17901078682492153, + "grad_norm": 0.09540276229381561, + "learning_rate": 0.00019285322557367586, + "loss": 0.1595, + "step": 2481 + }, + { + "epoch": 0.17908293949998197, + "grad_norm": 0.08501024544239044, + "learning_rate": 0.00019285033915427912, + "loss": 0.1599, + "step": 2482 + }, + { + "epoch": 0.17915509217504239, + "grad_norm": 0.10932368785142899, + "learning_rate": 0.00019284745273488239, + "loss": 0.2136, + "step": 2483 + }, + { + "epoch": 0.17922724485010283, + "grad_norm": 0.09894034266471863, + "learning_rate": 0.00019284456631548565, + "loss": 0.1926, + "step": 2484 + }, + { + "epoch": 0.17929939752516325, + "grad_norm": 0.09841787070035934, + "learning_rate": 0.0001928416798960889, + "loss": 0.1734, + "step": 2485 + }, + { + "epoch": 0.17937155020022366, + "grad_norm": 0.10464194416999817, + "learning_rate": 0.00019283879347669217, + "loss": 0.1717, + "step": 2486 + }, + { + "epoch": 0.1794437028752841, + "grad_norm": 0.10199958831071854, + "learning_rate": 0.00019283590705729544, + "loss": 0.1482, + "step": 2487 + }, + { + "epoch": 0.17951585555034452, + "grad_norm": 0.08751571178436279, + "learning_rate": 0.0001928330206378987, + "loss": 0.1094, + "step": 2488 + }, + { + "epoch": 0.17958800822540497, + "grad_norm": 0.11693772673606873, + "learning_rate": 0.00019283013421850196, + "loss": 0.1885, + "step": 2489 + }, + { + "epoch": 0.17966016090046538, + "grad_norm": 0.10919500887393951, + "learning_rate": 0.00019282724779910523, + "loss": 0.1584, + "step": 2490 + }, + { + "epoch": 0.17973231357552583, + "grad_norm": 0.11971744894981384, + "learning_rate": 0.00019282436137970846, + "loss": 0.157, + "step": 2491 + }, + { + "epoch": 0.17980446625058624, + "grad_norm": 0.15336687862873077, + "learning_rate": 0.00019282147496031172, + "loss": 0.1952, + "step": 2492 + }, + { + "epoch": 0.17987661892564666, + "grad_norm": 0.1436939239501953, + "learning_rate": 0.00019281858854091501, + "loss": 0.1478, + "step": 2493 + }, + { + "epoch": 0.1799487716007071, + "grad_norm": 0.1352475881576538, + "learning_rate": 0.00019281570212151828, + "loss": 0.1621, + "step": 2494 + }, + { + "epoch": 0.18002092427576752, + "grad_norm": 0.12049786746501923, + "learning_rate": 0.00019281281570212154, + "loss": 0.1737, + "step": 2495 + }, + { + "epoch": 0.18009307695082796, + "grad_norm": 0.11868083477020264, + "learning_rate": 0.00019280992928272478, + "loss": 0.1521, + "step": 2496 + }, + { + "epoch": 0.18016522962588838, + "grad_norm": 0.09464991092681885, + "learning_rate": 0.00019280704286332804, + "loss": 0.1801, + "step": 2497 + }, + { + "epoch": 0.1802373823009488, + "grad_norm": 0.11203427612781525, + "learning_rate": 0.0001928041564439313, + "loss": 0.1716, + "step": 2498 + }, + { + "epoch": 0.18030953497600924, + "grad_norm": 0.0979161262512207, + "learning_rate": 0.00019280127002453456, + "loss": 0.1732, + "step": 2499 + }, + { + "epoch": 0.18038168765106966, + "grad_norm": 0.09018220752477646, + "learning_rate": 0.00019279838360513785, + "loss": 0.1433, + "step": 2500 + }, + { + "epoch": 0.1804538403261301, + "grad_norm": 0.09370570629835129, + "learning_rate": 0.0001927954971857411, + "loss": 0.1446, + "step": 2501 + }, + { + "epoch": 0.18052599300119052, + "grad_norm": 0.10692581534385681, + "learning_rate": 0.00019279261076634435, + "loss": 0.1434, + "step": 2502 + }, + { + "epoch": 0.18059814567625096, + "grad_norm": 0.12523172795772552, + "learning_rate": 0.00019278972434694762, + "loss": 0.1837, + "step": 2503 + }, + { + "epoch": 0.18067029835131138, + "grad_norm": 0.08301182836294174, + "learning_rate": 0.00019278683792755088, + "loss": 0.2049, + "step": 2504 + }, + { + "epoch": 0.1807424510263718, + "grad_norm": 0.09529439359903336, + "learning_rate": 0.00019278395150815414, + "loss": 0.1667, + "step": 2505 + }, + { + "epoch": 0.18081460370143224, + "grad_norm": 0.1019364520907402, + "learning_rate": 0.0001927810650887574, + "loss": 0.2199, + "step": 2506 + }, + { + "epoch": 0.18088675637649265, + "grad_norm": 0.1121387854218483, + "learning_rate": 0.00019277817866936067, + "loss": 0.1385, + "step": 2507 + }, + { + "epoch": 0.1809589090515531, + "grad_norm": 0.0993325337767601, + "learning_rate": 0.00019277529224996393, + "loss": 0.135, + "step": 2508 + }, + { + "epoch": 0.1810310617266135, + "grad_norm": 0.08489389717578888, + "learning_rate": 0.0001927724058305672, + "loss": 0.1392, + "step": 2509 + }, + { + "epoch": 0.18110321440167393, + "grad_norm": 0.10872458666563034, + "learning_rate": 0.00019276951941117046, + "loss": 0.1647, + "step": 2510 + }, + { + "epoch": 0.18117536707673437, + "grad_norm": 0.12486433982849121, + "learning_rate": 0.00019276663299177372, + "loss": 0.1372, + "step": 2511 + }, + { + "epoch": 0.1812475197517948, + "grad_norm": 0.10761299729347229, + "learning_rate": 0.00019276374657237696, + "loss": 0.1716, + "step": 2512 + }, + { + "epoch": 0.18131967242685523, + "grad_norm": 0.10109484195709229, + "learning_rate": 0.00019276086015298022, + "loss": 0.1726, + "step": 2513 + }, + { + "epoch": 0.18139182510191565, + "grad_norm": 0.09445323795080185, + "learning_rate": 0.0001927579737335835, + "loss": 0.1212, + "step": 2514 + }, + { + "epoch": 0.1814639777769761, + "grad_norm": 0.12211337685585022, + "learning_rate": 0.00019275508731418677, + "loss": 0.1474, + "step": 2515 + }, + { + "epoch": 0.1815361304520365, + "grad_norm": 0.11376041173934937, + "learning_rate": 0.00019275220089479003, + "loss": 0.1504, + "step": 2516 + }, + { + "epoch": 0.18160828312709693, + "grad_norm": 0.130318284034729, + "learning_rate": 0.00019274931447539327, + "loss": 0.1754, + "step": 2517 + }, + { + "epoch": 0.18168043580215737, + "grad_norm": 0.10562107712030411, + "learning_rate": 0.00019274642805599653, + "loss": 0.1757, + "step": 2518 + }, + { + "epoch": 0.18175258847721779, + "grad_norm": 0.09059479832649231, + "learning_rate": 0.0001927435416365998, + "loss": 0.167, + "step": 2519 + }, + { + "epoch": 0.18182474115227823, + "grad_norm": 0.11078401654958725, + "learning_rate": 0.00019274065521720306, + "loss": 0.1871, + "step": 2520 + }, + { + "epoch": 0.18189689382733865, + "grad_norm": 0.13319344818592072, + "learning_rate": 0.00019273776879780635, + "loss": 0.1739, + "step": 2521 + }, + { + "epoch": 0.1819690465023991, + "grad_norm": 0.09682480245828629, + "learning_rate": 0.00019273488237840958, + "loss": 0.1527, + "step": 2522 + }, + { + "epoch": 0.1820411991774595, + "grad_norm": 0.09306000173091888, + "learning_rate": 0.00019273199595901285, + "loss": 0.1591, + "step": 2523 + }, + { + "epoch": 0.18211335185251992, + "grad_norm": 0.11338866502046585, + "learning_rate": 0.0001927291095396161, + "loss": 0.155, + "step": 2524 + }, + { + "epoch": 0.18218550452758037, + "grad_norm": 0.09607966244220734, + "learning_rate": 0.00019272622312021937, + "loss": 0.1522, + "step": 2525 + }, + { + "epoch": 0.18225765720264078, + "grad_norm": 0.13238127529621124, + "learning_rate": 0.00019272333670082264, + "loss": 0.136, + "step": 2526 + }, + { + "epoch": 0.18232980987770123, + "grad_norm": 0.13262353837490082, + "learning_rate": 0.0001927204502814259, + "loss": 0.167, + "step": 2527 + }, + { + "epoch": 0.18240196255276164, + "grad_norm": 0.10406520217657089, + "learning_rate": 0.00019271756386202916, + "loss": 0.135, + "step": 2528 + }, + { + "epoch": 0.18247411522782206, + "grad_norm": 0.1390426605939865, + "learning_rate": 0.00019271467744263243, + "loss": 0.1838, + "step": 2529 + }, + { + "epoch": 0.1825462679028825, + "grad_norm": 0.09890906512737274, + "learning_rate": 0.0001927117910232357, + "loss": 0.151, + "step": 2530 + }, + { + "epoch": 0.18261842057794292, + "grad_norm": 0.10099121183156967, + "learning_rate": 0.00019270890460383895, + "loss": 0.1531, + "step": 2531 + }, + { + "epoch": 0.18269057325300336, + "grad_norm": 0.1050528958439827, + "learning_rate": 0.00019270601818444221, + "loss": 0.1536, + "step": 2532 + }, + { + "epoch": 0.18276272592806378, + "grad_norm": 0.08454670011997223, + "learning_rate": 0.00019270313176504548, + "loss": 0.1672, + "step": 2533 + }, + { + "epoch": 0.18283487860312422, + "grad_norm": 0.09192633628845215, + "learning_rate": 0.0001927002453456487, + "loss": 0.1723, + "step": 2534 + }, + { + "epoch": 0.18290703127818464, + "grad_norm": 0.10295901447534561, + "learning_rate": 0.000192697358926252, + "loss": 0.1387, + "step": 2535 + }, + { + "epoch": 0.18297918395324506, + "grad_norm": 0.0948997363448143, + "learning_rate": 0.00019269447250685527, + "loss": 0.1655, + "step": 2536 + }, + { + "epoch": 0.1830513366283055, + "grad_norm": 0.10542097687721252, + "learning_rate": 0.00019269158608745853, + "loss": 0.1618, + "step": 2537 + }, + { + "epoch": 0.18312348930336592, + "grad_norm": 0.1150515228509903, + "learning_rate": 0.0001926886996680618, + "loss": 0.1504, + "step": 2538 + }, + { + "epoch": 0.18319564197842636, + "grad_norm": 0.11629487574100494, + "learning_rate": 0.00019268581324866503, + "loss": 0.1609, + "step": 2539 + }, + { + "epoch": 0.18326779465348678, + "grad_norm": 0.11165773868560791, + "learning_rate": 0.0001926829268292683, + "loss": 0.1511, + "step": 2540 + }, + { + "epoch": 0.1833399473285472, + "grad_norm": 0.11779014766216278, + "learning_rate": 0.00019268004040987155, + "loss": 0.1827, + "step": 2541 + }, + { + "epoch": 0.18341210000360764, + "grad_norm": 0.10018985718488693, + "learning_rate": 0.00019267715399047484, + "loss": 0.1803, + "step": 2542 + }, + { + "epoch": 0.18348425267866805, + "grad_norm": 0.12660762667655945, + "learning_rate": 0.0001926742675710781, + "loss": 0.1804, + "step": 2543 + }, + { + "epoch": 0.1835564053537285, + "grad_norm": 0.10073110461235046, + "learning_rate": 0.00019267138115168134, + "loss": 0.1949, + "step": 2544 + }, + { + "epoch": 0.1836285580287889, + "grad_norm": 0.15038368105888367, + "learning_rate": 0.0001926684947322846, + "loss": 0.2406, + "step": 2545 + }, + { + "epoch": 0.18370071070384936, + "grad_norm": 0.1125505194067955, + "learning_rate": 0.00019266560831288787, + "loss": 0.2075, + "step": 2546 + }, + { + "epoch": 0.18377286337890977, + "grad_norm": 0.0908120647072792, + "learning_rate": 0.00019266272189349113, + "loss": 0.1807, + "step": 2547 + }, + { + "epoch": 0.1838450160539702, + "grad_norm": 0.12920835614204407, + "learning_rate": 0.0001926598354740944, + "loss": 0.1864, + "step": 2548 + }, + { + "epoch": 0.18391716872903063, + "grad_norm": 0.1229652538895607, + "learning_rate": 0.00019265694905469766, + "loss": 0.1311, + "step": 2549 + }, + { + "epoch": 0.18398932140409105, + "grad_norm": 0.11898133158683777, + "learning_rate": 0.00019265406263530092, + "loss": 0.1303, + "step": 2550 + }, + { + "epoch": 0.1840614740791515, + "grad_norm": 0.09316210448741913, + "learning_rate": 0.00019265117621590418, + "loss": 0.1293, + "step": 2551 + }, + { + "epoch": 0.1841336267542119, + "grad_norm": 0.12110738456249237, + "learning_rate": 0.00019264828979650745, + "loss": 0.1711, + "step": 2552 + }, + { + "epoch": 0.18420577942927235, + "grad_norm": 0.10128328204154968, + "learning_rate": 0.0001926454033771107, + "loss": 0.1686, + "step": 2553 + }, + { + "epoch": 0.18427793210433277, + "grad_norm": 0.09798388183116913, + "learning_rate": 0.00019264251695771397, + "loss": 0.1716, + "step": 2554 + }, + { + "epoch": 0.1843500847793932, + "grad_norm": 0.12545473873615265, + "learning_rate": 0.0001926396305383172, + "loss": 0.1823, + "step": 2555 + }, + { + "epoch": 0.18442223745445363, + "grad_norm": 0.12609075009822845, + "learning_rate": 0.0001926367441189205, + "loss": 0.1552, + "step": 2556 + }, + { + "epoch": 0.18449439012951405, + "grad_norm": 0.1211591586470604, + "learning_rate": 0.00019263385769952376, + "loss": 0.1661, + "step": 2557 + }, + { + "epoch": 0.1845665428045745, + "grad_norm": 0.09905112534761429, + "learning_rate": 0.00019263097128012702, + "loss": 0.182, + "step": 2558 + }, + { + "epoch": 0.1846386954796349, + "grad_norm": 0.10378430783748627, + "learning_rate": 0.00019262808486073029, + "loss": 0.1729, + "step": 2559 + }, + { + "epoch": 0.18471084815469532, + "grad_norm": 0.1171564906835556, + "learning_rate": 0.00019262519844133352, + "loss": 0.1537, + "step": 2560 + }, + { + "epoch": 0.18478300082975577, + "grad_norm": 0.12228664755821228, + "learning_rate": 0.00019262231202193678, + "loss": 0.1954, + "step": 2561 + }, + { + "epoch": 0.18485515350481618, + "grad_norm": 0.10984918475151062, + "learning_rate": 0.00019261942560254005, + "loss": 0.1702, + "step": 2562 + }, + { + "epoch": 0.18492730617987663, + "grad_norm": 0.11387166380882263, + "learning_rate": 0.00019261653918314334, + "loss": 0.1633, + "step": 2563 + }, + { + "epoch": 0.18499945885493704, + "grad_norm": 0.1470084935426712, + "learning_rate": 0.0001926136527637466, + "loss": 0.1654, + "step": 2564 + }, + { + "epoch": 0.1850716115299975, + "grad_norm": 0.09996671974658966, + "learning_rate": 0.00019261076634434984, + "loss": 0.2324, + "step": 2565 + }, + { + "epoch": 0.1851437642050579, + "grad_norm": 0.13593421876430511, + "learning_rate": 0.0001926078799249531, + "loss": 0.2202, + "step": 2566 + }, + { + "epoch": 0.18521591688011832, + "grad_norm": 0.08700580149888992, + "learning_rate": 0.00019260499350555636, + "loss": 0.1401, + "step": 2567 + }, + { + "epoch": 0.18528806955517876, + "grad_norm": 0.11398850381374359, + "learning_rate": 0.00019260210708615963, + "loss": 0.1733, + "step": 2568 + }, + { + "epoch": 0.18536022223023918, + "grad_norm": 0.09895952045917511, + "learning_rate": 0.0001925992206667629, + "loss": 0.2249, + "step": 2569 + }, + { + "epoch": 0.18543237490529962, + "grad_norm": 0.12415144592523575, + "learning_rate": 0.00019259633424736615, + "loss": 0.2309, + "step": 2570 + }, + { + "epoch": 0.18550452758036004, + "grad_norm": 0.10959386825561523, + "learning_rate": 0.00019259344782796941, + "loss": 0.2302, + "step": 2571 + }, + { + "epoch": 0.18557668025542046, + "grad_norm": 0.0890393927693367, + "learning_rate": 0.00019259056140857268, + "loss": 0.1575, + "step": 2572 + }, + { + "epoch": 0.1856488329304809, + "grad_norm": 0.09315341711044312, + "learning_rate": 0.00019258767498917594, + "loss": 0.1541, + "step": 2573 + }, + { + "epoch": 0.18572098560554132, + "grad_norm": 0.10352786630392075, + "learning_rate": 0.0001925847885697792, + "loss": 0.1917, + "step": 2574 + }, + { + "epoch": 0.18579313828060176, + "grad_norm": 0.12139669060707092, + "learning_rate": 0.00019258190215038247, + "loss": 0.1942, + "step": 2575 + }, + { + "epoch": 0.18586529095566218, + "grad_norm": 0.10332541167736053, + "learning_rate": 0.0001925790157309857, + "loss": 0.1508, + "step": 2576 + }, + { + "epoch": 0.18593744363072262, + "grad_norm": 0.10060901939868927, + "learning_rate": 0.000192576129311589, + "loss": 0.1717, + "step": 2577 + }, + { + "epoch": 0.18600959630578304, + "grad_norm": 0.11494073271751404, + "learning_rate": 0.00019257324289219225, + "loss": 0.1392, + "step": 2578 + }, + { + "epoch": 0.18608174898084345, + "grad_norm": 0.10815019905567169, + "learning_rate": 0.00019257035647279552, + "loss": 0.2348, + "step": 2579 + }, + { + "epoch": 0.1861539016559039, + "grad_norm": 0.10394209623336792, + "learning_rate": 0.00019256747005339878, + "loss": 0.2208, + "step": 2580 + }, + { + "epoch": 0.1862260543309643, + "grad_norm": 0.09226703643798828, + "learning_rate": 0.00019256458363400202, + "loss": 0.1539, + "step": 2581 + }, + { + "epoch": 0.18629820700602476, + "grad_norm": 0.12375635653734207, + "learning_rate": 0.00019256169721460528, + "loss": 0.1241, + "step": 2582 + }, + { + "epoch": 0.18637035968108517, + "grad_norm": 0.10145562142133713, + "learning_rate": 0.00019255881079520854, + "loss": 0.1774, + "step": 2583 + }, + { + "epoch": 0.18644251235614562, + "grad_norm": 0.10398755967617035, + "learning_rate": 0.00019255592437581183, + "loss": 0.1931, + "step": 2584 + }, + { + "epoch": 0.18651466503120603, + "grad_norm": 0.10972506552934647, + "learning_rate": 0.0001925530379564151, + "loss": 0.1863, + "step": 2585 + }, + { + "epoch": 0.18658681770626645, + "grad_norm": 0.09676441550254822, + "learning_rate": 0.00019255015153701833, + "loss": 0.1602, + "step": 2586 + }, + { + "epoch": 0.1866589703813269, + "grad_norm": 0.09614020586013794, + "learning_rate": 0.0001925472651176216, + "loss": 0.149, + "step": 2587 + }, + { + "epoch": 0.1867311230563873, + "grad_norm": 0.09428024291992188, + "learning_rate": 0.00019254437869822486, + "loss": 0.1444, + "step": 2588 + }, + { + "epoch": 0.18680327573144775, + "grad_norm": 0.08294445276260376, + "learning_rate": 0.00019254149227882812, + "loss": 0.1203, + "step": 2589 + }, + { + "epoch": 0.18687542840650817, + "grad_norm": 0.09792113304138184, + "learning_rate": 0.00019253860585943138, + "loss": 0.1312, + "step": 2590 + }, + { + "epoch": 0.1869475810815686, + "grad_norm": 0.09841245412826538, + "learning_rate": 0.00019253571944003465, + "loss": 0.1562, + "step": 2591 + }, + { + "epoch": 0.18701973375662903, + "grad_norm": 0.09592325985431671, + "learning_rate": 0.0001925328330206379, + "loss": 0.1313, + "step": 2592 + }, + { + "epoch": 0.18709188643168945, + "grad_norm": 0.10660023242235184, + "learning_rate": 0.00019252994660124117, + "loss": 0.1376, + "step": 2593 + }, + { + "epoch": 0.1871640391067499, + "grad_norm": 0.11521776765584946, + "learning_rate": 0.00019252706018184443, + "loss": 0.1718, + "step": 2594 + }, + { + "epoch": 0.1872361917818103, + "grad_norm": 0.09450531005859375, + "learning_rate": 0.0001925241737624477, + "loss": 0.1841, + "step": 2595 + }, + { + "epoch": 0.18730834445687075, + "grad_norm": 0.13401101529598236, + "learning_rate": 0.00019252128734305096, + "loss": 0.2122, + "step": 2596 + }, + { + "epoch": 0.18738049713193117, + "grad_norm": 0.133869469165802, + "learning_rate": 0.0001925184009236542, + "loss": 0.163, + "step": 2597 + }, + { + "epoch": 0.18745264980699158, + "grad_norm": 0.08174880594015121, + "learning_rate": 0.00019251551450425749, + "loss": 0.1994, + "step": 2598 + }, + { + "epoch": 0.18752480248205203, + "grad_norm": 0.08731729537248611, + "learning_rate": 0.00019251262808486075, + "loss": 0.1421, + "step": 2599 + }, + { + "epoch": 0.18759695515711244, + "grad_norm": 0.11166815459728241, + "learning_rate": 0.000192509741665464, + "loss": 0.1765, + "step": 2600 + }, + { + "epoch": 0.1876691078321729, + "grad_norm": 0.1146954819560051, + "learning_rate": 0.00019250685524606727, + "loss": 0.1676, + "step": 2601 + }, + { + "epoch": 0.1877412605072333, + "grad_norm": 0.09924327582120895, + "learning_rate": 0.0001925039688266705, + "loss": 0.1573, + "step": 2602 + }, + { + "epoch": 0.18781341318229372, + "grad_norm": 0.10343486070632935, + "learning_rate": 0.00019250108240727377, + "loss": 0.1352, + "step": 2603 + }, + { + "epoch": 0.18788556585735416, + "grad_norm": 0.08584459125995636, + "learning_rate": 0.00019249819598787704, + "loss": 0.1924, + "step": 2604 + }, + { + "epoch": 0.18795771853241458, + "grad_norm": 0.10155855119228363, + "learning_rate": 0.00019249530956848033, + "loss": 0.1907, + "step": 2605 + }, + { + "epoch": 0.18802987120747502, + "grad_norm": 0.10088808834552765, + "learning_rate": 0.0001924924231490836, + "loss": 0.1457, + "step": 2606 + }, + { + "epoch": 0.18810202388253544, + "grad_norm": 0.0985383465886116, + "learning_rate": 0.00019248953672968682, + "loss": 0.1897, + "step": 2607 + }, + { + "epoch": 0.18817417655759588, + "grad_norm": 0.10378298163414001, + "learning_rate": 0.0001924866503102901, + "loss": 0.1524, + "step": 2608 + }, + { + "epoch": 0.1882463292326563, + "grad_norm": 0.12473829090595245, + "learning_rate": 0.00019248376389089335, + "loss": 0.1566, + "step": 2609 + }, + { + "epoch": 0.18831848190771672, + "grad_norm": 0.1098468080163002, + "learning_rate": 0.0001924808774714966, + "loss": 0.2182, + "step": 2610 + }, + { + "epoch": 0.18839063458277716, + "grad_norm": 0.09675396233797073, + "learning_rate": 0.00019247799105209988, + "loss": 0.1436, + "step": 2611 + }, + { + "epoch": 0.18846278725783758, + "grad_norm": 0.08573716878890991, + "learning_rate": 0.00019247510463270314, + "loss": 0.1762, + "step": 2612 + }, + { + "epoch": 0.18853493993289802, + "grad_norm": 0.08802764117717743, + "learning_rate": 0.0001924722182133064, + "loss": 0.1698, + "step": 2613 + }, + { + "epoch": 0.18860709260795844, + "grad_norm": 0.09013257920742035, + "learning_rate": 0.00019246933179390967, + "loss": 0.1848, + "step": 2614 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 0.08657081425189972, + "learning_rate": 0.00019246644537451293, + "loss": 0.1726, + "step": 2615 + }, + { + "epoch": 0.1887513979580793, + "grad_norm": 0.10190070420503616, + "learning_rate": 0.0001924635589551162, + "loss": 0.186, + "step": 2616 + }, + { + "epoch": 0.18882355063313971, + "grad_norm": 0.09923119097948074, + "learning_rate": 0.00019246067253571945, + "loss": 0.1401, + "step": 2617 + }, + { + "epoch": 0.18889570330820016, + "grad_norm": 0.09834610670804977, + "learning_rate": 0.0001924577861163227, + "loss": 0.184, + "step": 2618 + }, + { + "epoch": 0.18896785598326057, + "grad_norm": 0.11239048838615417, + "learning_rate": 0.00019245489969692598, + "loss": 0.1629, + "step": 2619 + }, + { + "epoch": 0.18904000865832102, + "grad_norm": 0.14011728763580322, + "learning_rate": 0.00019245201327752924, + "loss": 0.1499, + "step": 2620 + }, + { + "epoch": 0.18911216133338143, + "grad_norm": 0.10546540468931198, + "learning_rate": 0.0001924491268581325, + "loss": 0.1378, + "step": 2621 + }, + { + "epoch": 0.18918431400844185, + "grad_norm": 0.10639491677284241, + "learning_rate": 0.00019244624043873577, + "loss": 0.1846, + "step": 2622 + }, + { + "epoch": 0.1892564666835023, + "grad_norm": 0.12428712844848633, + "learning_rate": 0.000192443354019339, + "loss": 0.1778, + "step": 2623 + }, + { + "epoch": 0.1893286193585627, + "grad_norm": 0.09785983711481094, + "learning_rate": 0.00019244046759994227, + "loss": 0.1766, + "step": 2624 + }, + { + "epoch": 0.18940077203362315, + "grad_norm": 0.12053379416465759, + "learning_rate": 0.00019243758118054553, + "loss": 0.1428, + "step": 2625 + }, + { + "epoch": 0.18947292470868357, + "grad_norm": 0.11321604996919632, + "learning_rate": 0.00019243469476114882, + "loss": 0.1368, + "step": 2626 + }, + { + "epoch": 0.18954507738374402, + "grad_norm": 0.10080399364233017, + "learning_rate": 0.00019243180834175208, + "loss": 0.1684, + "step": 2627 + }, + { + "epoch": 0.18961723005880443, + "grad_norm": 0.12170656770467758, + "learning_rate": 0.00019242892192235532, + "loss": 0.164, + "step": 2628 + }, + { + "epoch": 0.18968938273386485, + "grad_norm": 0.12450167536735535, + "learning_rate": 0.00019242603550295858, + "loss": 0.1932, + "step": 2629 + }, + { + "epoch": 0.1897615354089253, + "grad_norm": 0.1402718424797058, + "learning_rate": 0.00019242314908356184, + "loss": 0.192, + "step": 2630 + }, + { + "epoch": 0.1898336880839857, + "grad_norm": 0.38308095932006836, + "learning_rate": 0.0001924202626641651, + "loss": 0.1575, + "step": 2631 + }, + { + "epoch": 0.18990584075904615, + "grad_norm": 0.11162221431732178, + "learning_rate": 0.00019241737624476837, + "loss": 0.1695, + "step": 2632 + }, + { + "epoch": 0.18997799343410657, + "grad_norm": 0.16652381420135498, + "learning_rate": 0.00019241448982537163, + "loss": 0.1193, + "step": 2633 + }, + { + "epoch": 0.19005014610916698, + "grad_norm": 0.16206100583076477, + "learning_rate": 0.0001924116034059749, + "loss": 0.2161, + "step": 2634 + }, + { + "epoch": 0.19012229878422743, + "grad_norm": 0.1144791916012764, + "learning_rate": 0.00019240871698657816, + "loss": 0.1752, + "step": 2635 + }, + { + "epoch": 0.19019445145928784, + "grad_norm": 0.14860175549983978, + "learning_rate": 0.00019240583056718142, + "loss": 0.1665, + "step": 2636 + }, + { + "epoch": 0.1902666041343483, + "grad_norm": 0.09630924463272095, + "learning_rate": 0.00019240294414778469, + "loss": 0.1827, + "step": 2637 + }, + { + "epoch": 0.1903387568094087, + "grad_norm": 0.1070321798324585, + "learning_rate": 0.00019240005772838795, + "loss": 0.1331, + "step": 2638 + }, + { + "epoch": 0.19041090948446915, + "grad_norm": 0.10391581058502197, + "learning_rate": 0.00019239717130899118, + "loss": 0.1732, + "step": 2639 + }, + { + "epoch": 0.19048306215952956, + "grad_norm": 0.10819260776042938, + "learning_rate": 0.00019239428488959447, + "loss": 0.1693, + "step": 2640 + }, + { + "epoch": 0.19055521483458998, + "grad_norm": 0.12624964118003845, + "learning_rate": 0.00019239139847019774, + "loss": 0.1855, + "step": 2641 + }, + { + "epoch": 0.19062736750965042, + "grad_norm": 0.11198997497558594, + "learning_rate": 0.000192388512050801, + "loss": 0.1963, + "step": 2642 + }, + { + "epoch": 0.19069952018471084, + "grad_norm": 0.09690425544977188, + "learning_rate": 0.00019238562563140426, + "loss": 0.1878, + "step": 2643 + }, + { + "epoch": 0.19077167285977129, + "grad_norm": 0.19683675467967987, + "learning_rate": 0.00019238273921200753, + "loss": 0.1794, + "step": 2644 + }, + { + "epoch": 0.1908438255348317, + "grad_norm": 0.10952795296907425, + "learning_rate": 0.00019237985279261076, + "loss": 0.1849, + "step": 2645 + }, + { + "epoch": 0.19091597820989215, + "grad_norm": 0.09718668460845947, + "learning_rate": 0.00019237696637321402, + "loss": 0.1835, + "step": 2646 + }, + { + "epoch": 0.19098813088495256, + "grad_norm": 0.11417441070079803, + "learning_rate": 0.00019237407995381731, + "loss": 0.1974, + "step": 2647 + }, + { + "epoch": 0.19106028356001298, + "grad_norm": 0.09637610614299774, + "learning_rate": 0.00019237119353442058, + "loss": 0.178, + "step": 2648 + }, + { + "epoch": 0.19113243623507342, + "grad_norm": 0.08902224153280258, + "learning_rate": 0.00019236830711502384, + "loss": 0.1692, + "step": 2649 + }, + { + "epoch": 0.19120458891013384, + "grad_norm": 0.12328056246042252, + "learning_rate": 0.00019236542069562708, + "loss": 0.1091, + "step": 2650 + }, + { + "epoch": 0.19127674158519428, + "grad_norm": 0.09165138751268387, + "learning_rate": 0.00019236253427623034, + "loss": 0.165, + "step": 2651 + }, + { + "epoch": 0.1913488942602547, + "grad_norm": 0.10049431025981903, + "learning_rate": 0.0001923596478568336, + "loss": 0.151, + "step": 2652 + }, + { + "epoch": 0.19142104693531511, + "grad_norm": 0.10753556340932846, + "learning_rate": 0.00019235676143743686, + "loss": 0.145, + "step": 2653 + }, + { + "epoch": 0.19149319961037556, + "grad_norm": 0.11923553794622421, + "learning_rate": 0.00019235387501804013, + "loss": 0.1713, + "step": 2654 + }, + { + "epoch": 0.19156535228543597, + "grad_norm": 0.12246944010257721, + "learning_rate": 0.0001923509885986434, + "loss": 0.1532, + "step": 2655 + }, + { + "epoch": 0.19163750496049642, + "grad_norm": 0.14121495187282562, + "learning_rate": 0.00019234810217924665, + "loss": 0.2189, + "step": 2656 + }, + { + "epoch": 0.19170965763555683, + "grad_norm": 0.11362232267856598, + "learning_rate": 0.00019234521575984992, + "loss": 0.2029, + "step": 2657 + }, + { + "epoch": 0.19178181031061728, + "grad_norm": 0.1333397775888443, + "learning_rate": 0.00019234232934045318, + "loss": 0.1472, + "step": 2658 + }, + { + "epoch": 0.1918539629856777, + "grad_norm": 0.10232511907815933, + "learning_rate": 0.00019233944292105644, + "loss": 0.1862, + "step": 2659 + }, + { + "epoch": 0.1919261156607381, + "grad_norm": 0.10242033749818802, + "learning_rate": 0.0001923365565016597, + "loss": 0.1907, + "step": 2660 + }, + { + "epoch": 0.19199826833579856, + "grad_norm": 0.12704692780971527, + "learning_rate": 0.00019233367008226294, + "loss": 0.1682, + "step": 2661 + }, + { + "epoch": 0.19207042101085897, + "grad_norm": 0.10353595018386841, + "learning_rate": 0.00019233078366286623, + "loss": 0.1588, + "step": 2662 + }, + { + "epoch": 0.19214257368591942, + "grad_norm": 0.1133752167224884, + "learning_rate": 0.0001923278972434695, + "loss": 0.1579, + "step": 2663 + }, + { + "epoch": 0.19221472636097983, + "grad_norm": 0.10782555490732193, + "learning_rate": 0.00019232501082407276, + "loss": 0.1505, + "step": 2664 + }, + { + "epoch": 0.19228687903604025, + "grad_norm": 0.1164044663310051, + "learning_rate": 0.00019232212440467602, + "loss": 0.1738, + "step": 2665 + }, + { + "epoch": 0.1923590317111007, + "grad_norm": 0.09039617329835892, + "learning_rate": 0.00019231923798527926, + "loss": 0.1875, + "step": 2666 + }, + { + "epoch": 0.1924311843861611, + "grad_norm": 0.1016000509262085, + "learning_rate": 0.00019231635156588252, + "loss": 0.1345, + "step": 2667 + }, + { + "epoch": 0.19250333706122155, + "grad_norm": 0.09287573397159576, + "learning_rate": 0.00019231346514648578, + "loss": 0.1379, + "step": 2668 + }, + { + "epoch": 0.19257548973628197, + "grad_norm": 0.12193844467401505, + "learning_rate": 0.00019231057872708907, + "loss": 0.1735, + "step": 2669 + }, + { + "epoch": 0.1926476424113424, + "grad_norm": 0.12390395998954773, + "learning_rate": 0.00019230769230769233, + "loss": 0.1557, + "step": 2670 + }, + { + "epoch": 0.19271979508640283, + "grad_norm": 0.09519024193286896, + "learning_rate": 0.00019230480588829557, + "loss": 0.1581, + "step": 2671 + }, + { + "epoch": 0.19279194776146324, + "grad_norm": 0.12861637771129608, + "learning_rate": 0.00019230191946889883, + "loss": 0.1445, + "step": 2672 + }, + { + "epoch": 0.1928641004365237, + "grad_norm": 0.1024019792675972, + "learning_rate": 0.0001922990330495021, + "loss": 0.1357, + "step": 2673 + }, + { + "epoch": 0.1929362531115841, + "grad_norm": 0.10311330854892731, + "learning_rate": 0.00019229614663010536, + "loss": 0.1821, + "step": 2674 + }, + { + "epoch": 0.19300840578664455, + "grad_norm": 0.10143375396728516, + "learning_rate": 0.00019229326021070862, + "loss": 0.1418, + "step": 2675 + }, + { + "epoch": 0.19308055846170497, + "grad_norm": 0.11127184331417084, + "learning_rate": 0.00019229037379131189, + "loss": 0.193, + "step": 2676 + }, + { + "epoch": 0.1931527111367654, + "grad_norm": 0.1024523377418518, + "learning_rate": 0.00019228748737191515, + "loss": 0.1124, + "step": 2677 + }, + { + "epoch": 0.19322486381182583, + "grad_norm": 0.10198451578617096, + "learning_rate": 0.0001922846009525184, + "loss": 0.1819, + "step": 2678 + }, + { + "epoch": 0.19329701648688624, + "grad_norm": 0.13140185177326202, + "learning_rate": 0.00019228171453312167, + "loss": 0.1434, + "step": 2679 + }, + { + "epoch": 0.19336916916194669, + "grad_norm": 0.08785970509052277, + "learning_rate": 0.00019227882811372494, + "loss": 0.1228, + "step": 2680 + }, + { + "epoch": 0.1934413218370071, + "grad_norm": 0.08911599218845367, + "learning_rate": 0.0001922759416943282, + "loss": 0.1648, + "step": 2681 + }, + { + "epoch": 0.19351347451206755, + "grad_norm": 0.1131206527352333, + "learning_rate": 0.00019227305527493144, + "loss": 0.2027, + "step": 2682 + }, + { + "epoch": 0.19358562718712796, + "grad_norm": 0.09311474859714508, + "learning_rate": 0.00019227016885553473, + "loss": 0.133, + "step": 2683 + }, + { + "epoch": 0.19365777986218838, + "grad_norm": 0.14921562373638153, + "learning_rate": 0.000192267282436138, + "loss": 0.1894, + "step": 2684 + }, + { + "epoch": 0.19372993253724882, + "grad_norm": 0.1435394287109375, + "learning_rate": 0.00019226439601674125, + "loss": 0.1447, + "step": 2685 + }, + { + "epoch": 0.19380208521230924, + "grad_norm": 0.11073862016201019, + "learning_rate": 0.00019226150959734451, + "loss": 0.1524, + "step": 2686 + }, + { + "epoch": 0.19387423788736968, + "grad_norm": 0.11298488080501556, + "learning_rate": 0.00019225862317794775, + "loss": 0.1576, + "step": 2687 + }, + { + "epoch": 0.1939463905624301, + "grad_norm": 0.11842662841081619, + "learning_rate": 0.000192255736758551, + "loss": 0.1702, + "step": 2688 + }, + { + "epoch": 0.19401854323749054, + "grad_norm": 0.1126062273979187, + "learning_rate": 0.00019225285033915428, + "loss": 0.1704, + "step": 2689 + }, + { + "epoch": 0.19409069591255096, + "grad_norm": 0.11057586222887039, + "learning_rate": 0.00019224996391975757, + "loss": 0.1566, + "step": 2690 + }, + { + "epoch": 0.19416284858761138, + "grad_norm": 0.11670485138893127, + "learning_rate": 0.00019224707750036083, + "loss": 0.1672, + "step": 2691 + }, + { + "epoch": 0.19423500126267182, + "grad_norm": 0.21490761637687683, + "learning_rate": 0.00019224419108096406, + "loss": 0.2064, + "step": 2692 + }, + { + "epoch": 0.19430715393773224, + "grad_norm": 0.11341680586338043, + "learning_rate": 0.00019224130466156733, + "loss": 0.1794, + "step": 2693 + }, + { + "epoch": 0.19437930661279268, + "grad_norm": 0.09577616304159164, + "learning_rate": 0.0001922384182421706, + "loss": 0.1553, + "step": 2694 + }, + { + "epoch": 0.1944514592878531, + "grad_norm": 0.10448767989873886, + "learning_rate": 0.00019223553182277385, + "loss": 0.188, + "step": 2695 + }, + { + "epoch": 0.1945236119629135, + "grad_norm": 0.11948369443416595, + "learning_rate": 0.00019223264540337712, + "loss": 0.1362, + "step": 2696 + }, + { + "epoch": 0.19459576463797396, + "grad_norm": 0.11946840584278107, + "learning_rate": 0.00019222975898398038, + "loss": 0.142, + "step": 2697 + }, + { + "epoch": 0.19466791731303437, + "grad_norm": 0.11634260416030884, + "learning_rate": 0.00019222687256458364, + "loss": 0.1802, + "step": 2698 + }, + { + "epoch": 0.19474006998809482, + "grad_norm": 0.09894415736198425, + "learning_rate": 0.0001922239861451869, + "loss": 0.171, + "step": 2699 + }, + { + "epoch": 0.19481222266315523, + "grad_norm": 0.15403254330158234, + "learning_rate": 0.00019222109972579017, + "loss": 0.1629, + "step": 2700 + }, + { + "epoch": 0.19488437533821568, + "grad_norm": 0.16679127514362335, + "learning_rate": 0.00019221821330639343, + "loss": 0.1866, + "step": 2701 + }, + { + "epoch": 0.1949565280132761, + "grad_norm": 0.11218617111444473, + "learning_rate": 0.0001922153268869967, + "loss": 0.1852, + "step": 2702 + }, + { + "epoch": 0.1950286806883365, + "grad_norm": 0.18190835416316986, + "learning_rate": 0.00019221244046759993, + "loss": 0.1637, + "step": 2703 + }, + { + "epoch": 0.19510083336339695, + "grad_norm": 0.08974639326334, + "learning_rate": 0.00019220955404820322, + "loss": 0.2067, + "step": 2704 + }, + { + "epoch": 0.19517298603845737, + "grad_norm": 0.1249222680926323, + "learning_rate": 0.00019220666762880648, + "loss": 0.18, + "step": 2705 + }, + { + "epoch": 0.1952451387135178, + "grad_norm": 0.09737565368413925, + "learning_rate": 0.00019220378120940975, + "loss": 0.1286, + "step": 2706 + }, + { + "epoch": 0.19531729138857823, + "grad_norm": 0.10697299987077713, + "learning_rate": 0.000192200894790013, + "loss": 0.1476, + "step": 2707 + }, + { + "epoch": 0.19538944406363867, + "grad_norm": 0.10189025849103928, + "learning_rate": 0.00019219800837061624, + "loss": 0.1714, + "step": 2708 + }, + { + "epoch": 0.1954615967386991, + "grad_norm": 0.11121237277984619, + "learning_rate": 0.0001921951219512195, + "loss": 0.1894, + "step": 2709 + }, + { + "epoch": 0.1955337494137595, + "grad_norm": 0.1351558119058609, + "learning_rate": 0.00019219223553182277, + "loss": 0.1662, + "step": 2710 + }, + { + "epoch": 0.19560590208881995, + "grad_norm": 0.10621657222509384, + "learning_rate": 0.00019218934911242606, + "loss": 0.1292, + "step": 2711 + }, + { + "epoch": 0.19567805476388037, + "grad_norm": 0.11616257578134537, + "learning_rate": 0.00019218646269302932, + "loss": 0.1725, + "step": 2712 + }, + { + "epoch": 0.1957502074389408, + "grad_norm": 0.09937336295843124, + "learning_rate": 0.00019218357627363256, + "loss": 0.1548, + "step": 2713 + }, + { + "epoch": 0.19582236011400123, + "grad_norm": 0.097499780356884, + "learning_rate": 0.00019218068985423582, + "loss": 0.1249, + "step": 2714 + }, + { + "epoch": 0.19589451278906164, + "grad_norm": 0.11946720629930496, + "learning_rate": 0.00019217780343483908, + "loss": 0.159, + "step": 2715 + }, + { + "epoch": 0.1959666654641221, + "grad_norm": 0.1145441010594368, + "learning_rate": 0.00019217491701544235, + "loss": 0.1583, + "step": 2716 + }, + { + "epoch": 0.1960388181391825, + "grad_norm": 0.12353675067424774, + "learning_rate": 0.0001921720305960456, + "loss": 0.1819, + "step": 2717 + }, + { + "epoch": 0.19611097081424295, + "grad_norm": 0.1489514261484146, + "learning_rate": 0.00019216914417664887, + "loss": 0.1741, + "step": 2718 + }, + { + "epoch": 0.19618312348930336, + "grad_norm": 0.08473455905914307, + "learning_rate": 0.00019216625775725214, + "loss": 0.1341, + "step": 2719 + }, + { + "epoch": 0.1962552761643638, + "grad_norm": 0.10539857298135757, + "learning_rate": 0.0001921633713378554, + "loss": 0.145, + "step": 2720 + }, + { + "epoch": 0.19632742883942422, + "grad_norm": 0.10995670408010483, + "learning_rate": 0.00019216048491845866, + "loss": 0.1665, + "step": 2721 + }, + { + "epoch": 0.19639958151448464, + "grad_norm": 0.12223058193922043, + "learning_rate": 0.00019215759849906193, + "loss": 0.1956, + "step": 2722 + }, + { + "epoch": 0.19647173418954508, + "grad_norm": 0.11050442606210709, + "learning_rate": 0.0001921547120796652, + "loss": 0.1682, + "step": 2723 + }, + { + "epoch": 0.1965438868646055, + "grad_norm": 0.11254192888736725, + "learning_rate": 0.00019215182566026842, + "loss": 0.1601, + "step": 2724 + }, + { + "epoch": 0.19661603953966594, + "grad_norm": 0.12485778331756592, + "learning_rate": 0.00019214893924087171, + "loss": 0.1618, + "step": 2725 + }, + { + "epoch": 0.19668819221472636, + "grad_norm": 0.09796317666769028, + "learning_rate": 0.00019214605282147498, + "loss": 0.1907, + "step": 2726 + }, + { + "epoch": 0.19676034488978678, + "grad_norm": 0.1087847352027893, + "learning_rate": 0.00019214316640207824, + "loss": 0.1331, + "step": 2727 + }, + { + "epoch": 0.19683249756484722, + "grad_norm": 0.10561531782150269, + "learning_rate": 0.0001921402799826815, + "loss": 0.1408, + "step": 2728 + }, + { + "epoch": 0.19690465023990764, + "grad_norm": 0.09105509519577026, + "learning_rate": 0.00019213739356328474, + "loss": 0.207, + "step": 2729 + }, + { + "epoch": 0.19697680291496808, + "grad_norm": 0.10997123271226883, + "learning_rate": 0.000192134507143888, + "loss": 0.1938, + "step": 2730 + }, + { + "epoch": 0.1970489555900285, + "grad_norm": 0.11604651063680649, + "learning_rate": 0.00019213162072449126, + "loss": 0.1932, + "step": 2731 + }, + { + "epoch": 0.19712110826508894, + "grad_norm": 0.0898117646574974, + "learning_rate": 0.00019212873430509455, + "loss": 0.1623, + "step": 2732 + }, + { + "epoch": 0.19719326094014936, + "grad_norm": 0.1296977400779724, + "learning_rate": 0.00019212584788569782, + "loss": 0.2296, + "step": 2733 + }, + { + "epoch": 0.19726541361520977, + "grad_norm": 0.09977329522371292, + "learning_rate": 0.00019212296146630105, + "loss": 0.1267, + "step": 2734 + }, + { + "epoch": 0.19733756629027022, + "grad_norm": 0.11098989844322205, + "learning_rate": 0.00019212007504690432, + "loss": 0.1853, + "step": 2735 + }, + { + "epoch": 0.19740971896533063, + "grad_norm": 0.12152834236621857, + "learning_rate": 0.00019211718862750758, + "loss": 0.1582, + "step": 2736 + }, + { + "epoch": 0.19748187164039108, + "grad_norm": 0.12584428489208221, + "learning_rate": 0.00019211430220811084, + "loss": 0.1826, + "step": 2737 + }, + { + "epoch": 0.1975540243154515, + "grad_norm": 0.11646226048469543, + "learning_rate": 0.0001921114157887141, + "loss": 0.1609, + "step": 2738 + }, + { + "epoch": 0.19762617699051194, + "grad_norm": 0.10537946224212646, + "learning_rate": 0.00019210852936931737, + "loss": 0.2103, + "step": 2739 + }, + { + "epoch": 0.19769832966557235, + "grad_norm": 0.11018376052379608, + "learning_rate": 0.00019210564294992063, + "loss": 0.1759, + "step": 2740 + }, + { + "epoch": 0.19777048234063277, + "grad_norm": 0.15066130459308624, + "learning_rate": 0.0001921027565305239, + "loss": 0.1411, + "step": 2741 + }, + { + "epoch": 0.1978426350156932, + "grad_norm": 0.10171657055616379, + "learning_rate": 0.00019209987011112716, + "loss": 0.1499, + "step": 2742 + }, + { + "epoch": 0.19791478769075363, + "grad_norm": 0.08922936022281647, + "learning_rate": 0.00019209698369173042, + "loss": 0.1409, + "step": 2743 + }, + { + "epoch": 0.19798694036581407, + "grad_norm": 0.1321311891078949, + "learning_rate": 0.00019209409727233368, + "loss": 0.2057, + "step": 2744 + }, + { + "epoch": 0.1980590930408745, + "grad_norm": 0.14305776357650757, + "learning_rate": 0.00019209121085293692, + "loss": 0.1298, + "step": 2745 + }, + { + "epoch": 0.1981312457159349, + "grad_norm": 0.08078758418560028, + "learning_rate": 0.0001920883244335402, + "loss": 0.1526, + "step": 2746 + }, + { + "epoch": 0.19820339839099535, + "grad_norm": 0.11699184775352478, + "learning_rate": 0.00019208543801414347, + "loss": 0.1661, + "step": 2747 + }, + { + "epoch": 0.19827555106605577, + "grad_norm": 0.11144742369651794, + "learning_rate": 0.00019208255159474673, + "loss": 0.1587, + "step": 2748 + }, + { + "epoch": 0.1983477037411162, + "grad_norm": 0.13285599648952484, + "learning_rate": 0.00019207966517535, + "loss": 0.227, + "step": 2749 + }, + { + "epoch": 0.19841985641617663, + "grad_norm": 0.11385666579008102, + "learning_rate": 0.00019207677875595323, + "loss": 0.1164, + "step": 2750 + }, + { + "epoch": 0.19849200909123707, + "grad_norm": 0.17403168976306915, + "learning_rate": 0.0001920738923365565, + "loss": 0.1651, + "step": 2751 + }, + { + "epoch": 0.1985641617662975, + "grad_norm": 0.10947877913713455, + "learning_rate": 0.00019207100591715976, + "loss": 0.1555, + "step": 2752 + }, + { + "epoch": 0.1986363144413579, + "grad_norm": 0.13744814693927765, + "learning_rate": 0.00019206811949776305, + "loss": 0.2367, + "step": 2753 + }, + { + "epoch": 0.19870846711641835, + "grad_norm": 0.09754101932048798, + "learning_rate": 0.0001920652330783663, + "loss": 0.1478, + "step": 2754 + }, + { + "epoch": 0.19878061979147876, + "grad_norm": 0.09394462406635284, + "learning_rate": 0.00019206234665896955, + "loss": 0.1497, + "step": 2755 + }, + { + "epoch": 0.1988527724665392, + "grad_norm": 0.11369165033102036, + "learning_rate": 0.0001920594602395728, + "loss": 0.2044, + "step": 2756 + }, + { + "epoch": 0.19892492514159962, + "grad_norm": 0.10330383479595184, + "learning_rate": 0.00019205657382017607, + "loss": 0.1497, + "step": 2757 + }, + { + "epoch": 0.19899707781666004, + "grad_norm": 0.10162612050771713, + "learning_rate": 0.00019205368740077934, + "loss": 0.1602, + "step": 2758 + }, + { + "epoch": 0.19906923049172048, + "grad_norm": 0.09283189475536346, + "learning_rate": 0.0001920508009813826, + "loss": 0.1232, + "step": 2759 + }, + { + "epoch": 0.1991413831667809, + "grad_norm": 0.09310007095336914, + "learning_rate": 0.00019204791456198586, + "loss": 0.1668, + "step": 2760 + }, + { + "epoch": 0.19921353584184134, + "grad_norm": 0.09826725721359253, + "learning_rate": 0.00019204502814258912, + "loss": 0.2198, + "step": 2761 + }, + { + "epoch": 0.19928568851690176, + "grad_norm": 0.08991654962301254, + "learning_rate": 0.0001920421417231924, + "loss": 0.1652, + "step": 2762 + }, + { + "epoch": 0.1993578411919622, + "grad_norm": 0.10206000506877899, + "learning_rate": 0.00019203925530379565, + "loss": 0.1718, + "step": 2763 + }, + { + "epoch": 0.19942999386702262, + "grad_norm": 0.11106408387422562, + "learning_rate": 0.00019203636888439891, + "loss": 0.1797, + "step": 2764 + }, + { + "epoch": 0.19950214654208304, + "grad_norm": 0.10109609365463257, + "learning_rate": 0.00019203348246500218, + "loss": 0.1481, + "step": 2765 + }, + { + "epoch": 0.19957429921714348, + "grad_norm": 0.11432026326656342, + "learning_rate": 0.00019203059604560544, + "loss": 0.1739, + "step": 2766 + }, + { + "epoch": 0.1996464518922039, + "grad_norm": 0.1216094046831131, + "learning_rate": 0.0001920277096262087, + "loss": 0.1703, + "step": 2767 + }, + { + "epoch": 0.19971860456726434, + "grad_norm": 0.1027732789516449, + "learning_rate": 0.00019202482320681197, + "loss": 0.1404, + "step": 2768 + }, + { + "epoch": 0.19979075724232476, + "grad_norm": 0.08884584158658981, + "learning_rate": 0.00019202193678741523, + "loss": 0.1589, + "step": 2769 + }, + { + "epoch": 0.1998629099173852, + "grad_norm": 0.18108291923999786, + "learning_rate": 0.0001920190503680185, + "loss": 0.2151, + "step": 2770 + }, + { + "epoch": 0.19993506259244562, + "grad_norm": 0.0910463035106659, + "learning_rate": 0.00019201616394862175, + "loss": 0.1887, + "step": 2771 + }, + { + "epoch": 0.20000721526750603, + "grad_norm": 0.09141043573617935, + "learning_rate": 0.000192013277529225, + "loss": 0.1428, + "step": 2772 + }, + { + "epoch": 0.20007936794256648, + "grad_norm": 0.10917165875434875, + "learning_rate": 0.00019201039110982825, + "loss": 0.1422, + "step": 2773 + }, + { + "epoch": 0.2001515206176269, + "grad_norm": 0.10315337777137756, + "learning_rate": 0.00019200750469043154, + "loss": 0.1784, + "step": 2774 + }, + { + "epoch": 0.20022367329268734, + "grad_norm": 0.10837302356958389, + "learning_rate": 0.0001920046182710348, + "loss": 0.1691, + "step": 2775 + }, + { + "epoch": 0.20029582596774775, + "grad_norm": 0.13153184950351715, + "learning_rate": 0.00019200173185163807, + "loss": 0.1985, + "step": 2776 + }, + { + "epoch": 0.20036797864280817, + "grad_norm": 0.10565356910228729, + "learning_rate": 0.0001919988454322413, + "loss": 0.1602, + "step": 2777 + }, + { + "epoch": 0.20044013131786861, + "grad_norm": 0.10361316055059433, + "learning_rate": 0.00019199595901284457, + "loss": 0.1682, + "step": 2778 + }, + { + "epoch": 0.20051228399292903, + "grad_norm": 0.12748175859451294, + "learning_rate": 0.00019199307259344783, + "loss": 0.1404, + "step": 2779 + }, + { + "epoch": 0.20058443666798947, + "grad_norm": 0.09716325998306274, + "learning_rate": 0.0001919901861740511, + "loss": 0.111, + "step": 2780 + }, + { + "epoch": 0.2006565893430499, + "grad_norm": 0.17050409317016602, + "learning_rate": 0.00019198729975465438, + "loss": 0.1974, + "step": 2781 + }, + { + "epoch": 0.20072874201811033, + "grad_norm": 0.09248002618551254, + "learning_rate": 0.00019198441333525762, + "loss": 0.1602, + "step": 2782 + }, + { + "epoch": 0.20080089469317075, + "grad_norm": 0.08953045308589935, + "learning_rate": 0.00019198152691586088, + "loss": 0.1698, + "step": 2783 + }, + { + "epoch": 0.20087304736823117, + "grad_norm": 0.12317956984043121, + "learning_rate": 0.00019197864049646415, + "loss": 0.2052, + "step": 2784 + }, + { + "epoch": 0.2009452000432916, + "grad_norm": 0.10364929586648941, + "learning_rate": 0.0001919757540770674, + "loss": 0.1787, + "step": 2785 + }, + { + "epoch": 0.20101735271835203, + "grad_norm": 0.1174645721912384, + "learning_rate": 0.00019197286765767067, + "loss": 0.2013, + "step": 2786 + }, + { + "epoch": 0.20108950539341247, + "grad_norm": 0.12904039025306702, + "learning_rate": 0.00019196998123827393, + "loss": 0.1326, + "step": 2787 + }, + { + "epoch": 0.2011616580684729, + "grad_norm": 0.1159813329577446, + "learning_rate": 0.0001919670948188772, + "loss": 0.1674, + "step": 2788 + }, + { + "epoch": 0.2012338107435333, + "grad_norm": 0.12107376009225845, + "learning_rate": 0.00019196420839948046, + "loss": 0.1459, + "step": 2789 + }, + { + "epoch": 0.20130596341859375, + "grad_norm": 0.11082377284765244, + "learning_rate": 0.00019196132198008372, + "loss": 0.1312, + "step": 2790 + }, + { + "epoch": 0.20137811609365416, + "grad_norm": 0.09276112914085388, + "learning_rate": 0.00019195843556068699, + "loss": 0.1522, + "step": 2791 + }, + { + "epoch": 0.2014502687687146, + "grad_norm": 0.09536813199520111, + "learning_rate": 0.00019195554914129025, + "loss": 0.1426, + "step": 2792 + }, + { + "epoch": 0.20152242144377502, + "grad_norm": 0.1258544921875, + "learning_rate": 0.00019195266272189348, + "loss": 0.1246, + "step": 2793 + }, + { + "epoch": 0.20159457411883547, + "grad_norm": 0.10447549819946289, + "learning_rate": 0.00019194977630249675, + "loss": 0.1726, + "step": 2794 + }, + { + "epoch": 0.20166672679389588, + "grad_norm": 0.10352654755115509, + "learning_rate": 0.00019194688988310004, + "loss": 0.1757, + "step": 2795 + }, + { + "epoch": 0.2017388794689563, + "grad_norm": 0.12527719140052795, + "learning_rate": 0.0001919440034637033, + "loss": 0.2088, + "step": 2796 + }, + { + "epoch": 0.20181103214401674, + "grad_norm": 0.09983530640602112, + "learning_rate": 0.00019194111704430656, + "loss": 0.161, + "step": 2797 + }, + { + "epoch": 0.20188318481907716, + "grad_norm": 0.09162452071905136, + "learning_rate": 0.0001919382306249098, + "loss": 0.18, + "step": 2798 + }, + { + "epoch": 0.2019553374941376, + "grad_norm": 0.109310083091259, + "learning_rate": 0.00019193534420551306, + "loss": 0.161, + "step": 2799 + }, + { + "epoch": 0.20202749016919802, + "grad_norm": 0.09814836084842682, + "learning_rate": 0.00019193245778611632, + "loss": 0.162, + "step": 2800 + }, + { + "epoch": 0.20209964284425846, + "grad_norm": 0.13046738505363464, + "learning_rate": 0.0001919295713667196, + "loss": 0.1757, + "step": 2801 + }, + { + "epoch": 0.20217179551931888, + "grad_norm": 0.11764957755804062, + "learning_rate": 0.00019192668494732288, + "loss": 0.173, + "step": 2802 + }, + { + "epoch": 0.2022439481943793, + "grad_norm": 0.11445638537406921, + "learning_rate": 0.0001919237985279261, + "loss": 0.1719, + "step": 2803 + }, + { + "epoch": 0.20231610086943974, + "grad_norm": 0.11463334411382675, + "learning_rate": 0.00019192091210852938, + "loss": 0.1545, + "step": 2804 + }, + { + "epoch": 0.20238825354450016, + "grad_norm": 0.09536374360322952, + "learning_rate": 0.00019191802568913264, + "loss": 0.1495, + "step": 2805 + }, + { + "epoch": 0.2024604062195606, + "grad_norm": 0.09586406499147415, + "learning_rate": 0.0001919151392697359, + "loss": 0.1777, + "step": 2806 + }, + { + "epoch": 0.20253255889462102, + "grad_norm": 0.10726621001958847, + "learning_rate": 0.00019191225285033917, + "loss": 0.1317, + "step": 2807 + }, + { + "epoch": 0.20260471156968143, + "grad_norm": 0.09684295207262039, + "learning_rate": 0.00019190936643094243, + "loss": 0.1371, + "step": 2808 + }, + { + "epoch": 0.20267686424474188, + "grad_norm": 0.108054518699646, + "learning_rate": 0.0001919064800115457, + "loss": 0.1505, + "step": 2809 + }, + { + "epoch": 0.2027490169198023, + "grad_norm": 0.1165657788515091, + "learning_rate": 0.00019190359359214895, + "loss": 0.1844, + "step": 2810 + }, + { + "epoch": 0.20282116959486274, + "grad_norm": 0.10754991322755814, + "learning_rate": 0.00019190070717275222, + "loss": 0.161, + "step": 2811 + }, + { + "epoch": 0.20289332226992315, + "grad_norm": 0.12590479850769043, + "learning_rate": 0.00019189782075335548, + "loss": 0.1631, + "step": 2812 + }, + { + "epoch": 0.2029654749449836, + "grad_norm": 0.09850574284791946, + "learning_rate": 0.00019189493433395874, + "loss": 0.1037, + "step": 2813 + }, + { + "epoch": 0.20303762762004401, + "grad_norm": 0.1069825142621994, + "learning_rate": 0.00019189204791456198, + "loss": 0.1669, + "step": 2814 + }, + { + "epoch": 0.20310978029510443, + "grad_norm": 0.09667236357927322, + "learning_rate": 0.00019188916149516524, + "loss": 0.1858, + "step": 2815 + }, + { + "epoch": 0.20318193297016487, + "grad_norm": 0.10493231564760208, + "learning_rate": 0.00019188627507576853, + "loss": 0.1521, + "step": 2816 + }, + { + "epoch": 0.2032540856452253, + "grad_norm": 0.11691664159297943, + "learning_rate": 0.0001918833886563718, + "loss": 0.1328, + "step": 2817 + }, + { + "epoch": 0.20332623832028573, + "grad_norm": 0.11871805042028427, + "learning_rate": 0.00019188050223697506, + "loss": 0.121, + "step": 2818 + }, + { + "epoch": 0.20339839099534615, + "grad_norm": 0.10238055884838104, + "learning_rate": 0.0001918776158175783, + "loss": 0.1265, + "step": 2819 + }, + { + "epoch": 0.20347054367040657, + "grad_norm": 0.12179987132549286, + "learning_rate": 0.00019187472939818156, + "loss": 0.1236, + "step": 2820 + }, + { + "epoch": 0.203542696345467, + "grad_norm": 0.13222713768482208, + "learning_rate": 0.00019187184297878482, + "loss": 0.1727, + "step": 2821 + }, + { + "epoch": 0.20361484902052743, + "grad_norm": 0.11437147855758667, + "learning_rate": 0.00019186895655938808, + "loss": 0.1206, + "step": 2822 + }, + { + "epoch": 0.20368700169558787, + "grad_norm": 0.14340893924236298, + "learning_rate": 0.00019186607013999137, + "loss": 0.1365, + "step": 2823 + }, + { + "epoch": 0.2037591543706483, + "grad_norm": 0.08930408209562302, + "learning_rate": 0.0001918631837205946, + "loss": 0.1495, + "step": 2824 + }, + { + "epoch": 0.20383130704570873, + "grad_norm": 0.09873627871274948, + "learning_rate": 0.00019186029730119787, + "loss": 0.1484, + "step": 2825 + }, + { + "epoch": 0.20390345972076915, + "grad_norm": 0.10518117249011993, + "learning_rate": 0.00019185741088180113, + "loss": 0.1574, + "step": 2826 + }, + { + "epoch": 0.20397561239582956, + "grad_norm": 0.12713289260864258, + "learning_rate": 0.0001918545244624044, + "loss": 0.152, + "step": 2827 + }, + { + "epoch": 0.20404776507089, + "grad_norm": 0.11654222011566162, + "learning_rate": 0.00019185163804300766, + "loss": 0.1762, + "step": 2828 + }, + { + "epoch": 0.20411991774595042, + "grad_norm": 0.11516798287630081, + "learning_rate": 0.00019184875162361092, + "loss": 0.138, + "step": 2829 + }, + { + "epoch": 0.20419207042101087, + "grad_norm": 0.08868873864412308, + "learning_rate": 0.00019184586520421419, + "loss": 0.1865, + "step": 2830 + }, + { + "epoch": 0.20426422309607128, + "grad_norm": 0.11917749792337418, + "learning_rate": 0.00019184297878481745, + "loss": 0.1713, + "step": 2831 + }, + { + "epoch": 0.20433637577113173, + "grad_norm": 0.1309102177619934, + "learning_rate": 0.0001918400923654207, + "loss": 0.1773, + "step": 2832 + }, + { + "epoch": 0.20440852844619214, + "grad_norm": 0.10248704999685287, + "learning_rate": 0.00019183720594602397, + "loss": 0.1367, + "step": 2833 + }, + { + "epoch": 0.20448068112125256, + "grad_norm": 0.11289657652378082, + "learning_rate": 0.00019183431952662724, + "loss": 0.1478, + "step": 2834 + }, + { + "epoch": 0.204552833796313, + "grad_norm": 0.10069409757852554, + "learning_rate": 0.00019183143310723047, + "loss": 0.1606, + "step": 2835 + }, + { + "epoch": 0.20462498647137342, + "grad_norm": 0.10064855217933655, + "learning_rate": 0.00019182854668783374, + "loss": 0.1771, + "step": 2836 + }, + { + "epoch": 0.20469713914643387, + "grad_norm": 0.0808321163058281, + "learning_rate": 0.00019182566026843703, + "loss": 0.1805, + "step": 2837 + }, + { + "epoch": 0.20476929182149428, + "grad_norm": 0.09613669663667679, + "learning_rate": 0.0001918227738490403, + "loss": 0.1979, + "step": 2838 + }, + { + "epoch": 0.2048414444965547, + "grad_norm": 0.10829374194145203, + "learning_rate": 0.00019181988742964355, + "loss": 0.1421, + "step": 2839 + }, + { + "epoch": 0.20491359717161514, + "grad_norm": 0.09369208663702011, + "learning_rate": 0.0001918170010102468, + "loss": 0.1579, + "step": 2840 + }, + { + "epoch": 0.20498574984667556, + "grad_norm": 0.12511524558067322, + "learning_rate": 0.00019181411459085005, + "loss": 0.1656, + "step": 2841 + }, + { + "epoch": 0.205057902521736, + "grad_norm": 0.11365848034620285, + "learning_rate": 0.0001918112281714533, + "loss": 0.1608, + "step": 2842 + }, + { + "epoch": 0.20513005519679642, + "grad_norm": 0.12248922139406204, + "learning_rate": 0.00019180834175205658, + "loss": 0.1933, + "step": 2843 + }, + { + "epoch": 0.20520220787185686, + "grad_norm": 0.08641145378351212, + "learning_rate": 0.00019180545533265984, + "loss": 0.1935, + "step": 2844 + }, + { + "epoch": 0.20527436054691728, + "grad_norm": 0.08721951395273209, + "learning_rate": 0.0001918025689132631, + "loss": 0.1815, + "step": 2845 + }, + { + "epoch": 0.2053465132219777, + "grad_norm": 0.11892024427652359, + "learning_rate": 0.00019179968249386636, + "loss": 0.1964, + "step": 2846 + }, + { + "epoch": 0.20541866589703814, + "grad_norm": 0.10294033586978912, + "learning_rate": 0.00019179679607446963, + "loss": 0.1705, + "step": 2847 + }, + { + "epoch": 0.20549081857209855, + "grad_norm": 0.10080970823764801, + "learning_rate": 0.0001917939096550729, + "loss": 0.1362, + "step": 2848 + }, + { + "epoch": 0.205562971247159, + "grad_norm": 0.11637333780527115, + "learning_rate": 0.00019179102323567615, + "loss": 0.1374, + "step": 2849 + }, + { + "epoch": 0.20563512392221941, + "grad_norm": 0.09417811781167984, + "learning_rate": 0.00019178813681627942, + "loss": 0.1653, + "step": 2850 + }, + { + "epoch": 0.20570727659727983, + "grad_norm": 0.08716203272342682, + "learning_rate": 0.00019178525039688265, + "loss": 0.1416, + "step": 2851 + }, + { + "epoch": 0.20577942927234028, + "grad_norm": 0.09685619920492172, + "learning_rate": 0.00019178236397748594, + "loss": 0.1677, + "step": 2852 + }, + { + "epoch": 0.2058515819474007, + "grad_norm": 0.11075788736343384, + "learning_rate": 0.0001917794775580892, + "loss": 0.1525, + "step": 2853 + }, + { + "epoch": 0.20592373462246114, + "grad_norm": 0.10047938674688339, + "learning_rate": 0.00019177659113869247, + "loss": 0.1717, + "step": 2854 + }, + { + "epoch": 0.20599588729752155, + "grad_norm": 0.1073223203420639, + "learning_rate": 0.00019177370471929573, + "loss": 0.1471, + "step": 2855 + }, + { + "epoch": 0.206068039972582, + "grad_norm": 0.11038212478160858, + "learning_rate": 0.00019177081829989897, + "loss": 0.1768, + "step": 2856 + }, + { + "epoch": 0.2061401926476424, + "grad_norm": 0.14250384271144867, + "learning_rate": 0.00019176793188050223, + "loss": 0.1612, + "step": 2857 + }, + { + "epoch": 0.20621234532270283, + "grad_norm": 0.09535922855138779, + "learning_rate": 0.0001917650454611055, + "loss": 0.1846, + "step": 2858 + }, + { + "epoch": 0.20628449799776327, + "grad_norm": 0.08837927877902985, + "learning_rate": 0.00019176215904170878, + "loss": 0.2123, + "step": 2859 + }, + { + "epoch": 0.2063566506728237, + "grad_norm": 0.13702549040317535, + "learning_rate": 0.00019175927262231205, + "loss": 0.1898, + "step": 2860 + }, + { + "epoch": 0.20642880334788413, + "grad_norm": 0.17124445736408234, + "learning_rate": 0.00019175638620291528, + "loss": 0.1385, + "step": 2861 + }, + { + "epoch": 0.20650095602294455, + "grad_norm": 0.11649008840322495, + "learning_rate": 0.00019175349978351854, + "loss": 0.1569, + "step": 2862 + }, + { + "epoch": 0.20657310869800496, + "grad_norm": 0.10121750086545944, + "learning_rate": 0.0001917506133641218, + "loss": 0.1926, + "step": 2863 + }, + { + "epoch": 0.2066452613730654, + "grad_norm": 0.09889732301235199, + "learning_rate": 0.00019174772694472507, + "loss": 0.1631, + "step": 2864 + }, + { + "epoch": 0.20671741404812582, + "grad_norm": 0.10092010349035263, + "learning_rate": 0.00019174484052532833, + "loss": 0.1761, + "step": 2865 + }, + { + "epoch": 0.20678956672318627, + "grad_norm": 0.12512940168380737, + "learning_rate": 0.0001917419541059316, + "loss": 0.1802, + "step": 2866 + }, + { + "epoch": 0.20686171939824669, + "grad_norm": 0.09495396912097931, + "learning_rate": 0.00019173906768653486, + "loss": 0.1253, + "step": 2867 + }, + { + "epoch": 0.20693387207330713, + "grad_norm": 0.16718077659606934, + "learning_rate": 0.00019173618126713812, + "loss": 0.1576, + "step": 2868 + }, + { + "epoch": 0.20700602474836755, + "grad_norm": 0.10851576924324036, + "learning_rate": 0.00019173329484774138, + "loss": 0.1319, + "step": 2869 + }, + { + "epoch": 0.20707817742342796, + "grad_norm": 0.1416240930557251, + "learning_rate": 0.00019173040842834465, + "loss": 0.1587, + "step": 2870 + }, + { + "epoch": 0.2071503300984884, + "grad_norm": 0.11564162373542786, + "learning_rate": 0.0001917275220089479, + "loss": 0.1991, + "step": 2871 + }, + { + "epoch": 0.20722248277354882, + "grad_norm": 0.114931121468544, + "learning_rate": 0.00019172463558955117, + "loss": 0.1375, + "step": 2872 + }, + { + "epoch": 0.20729463544860927, + "grad_norm": 0.12055815756320953, + "learning_rate": 0.00019172174917015444, + "loss": 0.205, + "step": 2873 + }, + { + "epoch": 0.20736678812366968, + "grad_norm": 0.09396502375602722, + "learning_rate": 0.0001917188627507577, + "loss": 0.1844, + "step": 2874 + }, + { + "epoch": 0.20743894079873013, + "grad_norm": 0.1035708636045456, + "learning_rate": 0.00019171597633136096, + "loss": 0.1642, + "step": 2875 + }, + { + "epoch": 0.20751109347379054, + "grad_norm": 0.1005023792386055, + "learning_rate": 0.00019171308991196423, + "loss": 0.1491, + "step": 2876 + }, + { + "epoch": 0.20758324614885096, + "grad_norm": 0.1094493642449379, + "learning_rate": 0.0001917102034925675, + "loss": 0.2038, + "step": 2877 + }, + { + "epoch": 0.2076553988239114, + "grad_norm": 0.0964755117893219, + "learning_rate": 0.00019170731707317072, + "loss": 0.1884, + "step": 2878 + }, + { + "epoch": 0.20772755149897182, + "grad_norm": 0.12796197831630707, + "learning_rate": 0.000191704430653774, + "loss": 0.1468, + "step": 2879 + }, + { + "epoch": 0.20779970417403226, + "grad_norm": 0.12494756281375885, + "learning_rate": 0.00019170154423437728, + "loss": 0.1768, + "step": 2880 + }, + { + "epoch": 0.20787185684909268, + "grad_norm": 0.12739567458629608, + "learning_rate": 0.00019169865781498054, + "loss": 0.1578, + "step": 2881 + }, + { + "epoch": 0.2079440095241531, + "grad_norm": 0.16343368589878082, + "learning_rate": 0.0001916957713955838, + "loss": 0.219, + "step": 2882 + }, + { + "epoch": 0.20801616219921354, + "grad_norm": 0.12768669426441193, + "learning_rate": 0.00019169288497618704, + "loss": 0.1762, + "step": 2883 + }, + { + "epoch": 0.20808831487427396, + "grad_norm": 0.13747148215770721, + "learning_rate": 0.0001916899985567903, + "loss": 0.1697, + "step": 2884 + }, + { + "epoch": 0.2081604675493344, + "grad_norm": 0.10046649724245071, + "learning_rate": 0.00019168711213739356, + "loss": 0.1539, + "step": 2885 + }, + { + "epoch": 0.20823262022439482, + "grad_norm": 0.10763157159090042, + "learning_rate": 0.00019168422571799683, + "loss": 0.127, + "step": 2886 + }, + { + "epoch": 0.20830477289945526, + "grad_norm": 0.13337762653827667, + "learning_rate": 0.00019168133929860012, + "loss": 0.1887, + "step": 2887 + }, + { + "epoch": 0.20837692557451568, + "grad_norm": 0.13826853036880493, + "learning_rate": 0.00019167845287920335, + "loss": 0.1365, + "step": 2888 + }, + { + "epoch": 0.2084490782495761, + "grad_norm": 0.10385999083518982, + "learning_rate": 0.00019167556645980662, + "loss": 0.1486, + "step": 2889 + }, + { + "epoch": 0.20852123092463654, + "grad_norm": 0.09602903574705124, + "learning_rate": 0.00019167268004040988, + "loss": 0.172, + "step": 2890 + }, + { + "epoch": 0.20859338359969695, + "grad_norm": 0.09549222141504288, + "learning_rate": 0.00019166979362101314, + "loss": 0.1291, + "step": 2891 + }, + { + "epoch": 0.2086655362747574, + "grad_norm": 0.09645403176546097, + "learning_rate": 0.0001916669072016164, + "loss": 0.1772, + "step": 2892 + }, + { + "epoch": 0.2087376889498178, + "grad_norm": 0.12109261751174927, + "learning_rate": 0.00019166402078221967, + "loss": 0.1943, + "step": 2893 + }, + { + "epoch": 0.20880984162487823, + "grad_norm": 0.11377090960741043, + "learning_rate": 0.00019166113436282293, + "loss": 0.1528, + "step": 2894 + }, + { + "epoch": 0.20888199429993867, + "grad_norm": 0.10664435476064682, + "learning_rate": 0.0001916582479434262, + "loss": 0.2075, + "step": 2895 + }, + { + "epoch": 0.2089541469749991, + "grad_norm": 0.14281053841114044, + "learning_rate": 0.00019165536152402946, + "loss": 0.1748, + "step": 2896 + }, + { + "epoch": 0.20902629965005953, + "grad_norm": 0.11265065521001816, + "learning_rate": 0.00019165247510463272, + "loss": 0.1176, + "step": 2897 + }, + { + "epoch": 0.20909845232511995, + "grad_norm": 0.10463102161884308, + "learning_rate": 0.00019164958868523598, + "loss": 0.1907, + "step": 2898 + }, + { + "epoch": 0.2091706050001804, + "grad_norm": 0.12577512860298157, + "learning_rate": 0.00019164670226583922, + "loss": 0.1556, + "step": 2899 + }, + { + "epoch": 0.2092427576752408, + "grad_norm": 0.12625141441822052, + "learning_rate": 0.00019164381584644248, + "loss": 0.1633, + "step": 2900 + }, + { + "epoch": 0.20931491035030123, + "grad_norm": 0.1196192130446434, + "learning_rate": 0.00019164092942704577, + "loss": 0.1529, + "step": 2901 + }, + { + "epoch": 0.20938706302536167, + "grad_norm": 0.11164863407611847, + "learning_rate": 0.00019163804300764903, + "loss": 0.1327, + "step": 2902 + }, + { + "epoch": 0.20945921570042209, + "grad_norm": 0.09533857554197311, + "learning_rate": 0.0001916351565882523, + "loss": 0.1671, + "step": 2903 + }, + { + "epoch": 0.20953136837548253, + "grad_norm": 0.10212215781211853, + "learning_rate": 0.00019163227016885553, + "loss": 0.1618, + "step": 2904 + }, + { + "epoch": 0.20960352105054295, + "grad_norm": 0.09262704104185104, + "learning_rate": 0.0001916293837494588, + "loss": 0.1759, + "step": 2905 + }, + { + "epoch": 0.2096756737256034, + "grad_norm": 0.08996018022298813, + "learning_rate": 0.00019162649733006206, + "loss": 0.1603, + "step": 2906 + }, + { + "epoch": 0.2097478264006638, + "grad_norm": 0.1032821536064148, + "learning_rate": 0.00019162361091066532, + "loss": 0.2387, + "step": 2907 + }, + { + "epoch": 0.20981997907572422, + "grad_norm": 0.0967317521572113, + "learning_rate": 0.0001916207244912686, + "loss": 0.1469, + "step": 2908 + }, + { + "epoch": 0.20989213175078467, + "grad_norm": 0.09922664612531662, + "learning_rate": 0.00019161783807187185, + "loss": 0.1919, + "step": 2909 + }, + { + "epoch": 0.20996428442584508, + "grad_norm": 0.08808860182762146, + "learning_rate": 0.0001916149516524751, + "loss": 0.1554, + "step": 2910 + }, + { + "epoch": 0.21003643710090553, + "grad_norm": 0.11175331473350525, + "learning_rate": 0.00019161206523307837, + "loss": 0.2047, + "step": 2911 + }, + { + "epoch": 0.21010858977596594, + "grad_norm": 0.1385175883769989, + "learning_rate": 0.00019160917881368164, + "loss": 0.1589, + "step": 2912 + }, + { + "epoch": 0.21018074245102636, + "grad_norm": 0.11769337952136993, + "learning_rate": 0.0001916062923942849, + "loss": 0.1651, + "step": 2913 + }, + { + "epoch": 0.2102528951260868, + "grad_norm": 0.10524415969848633, + "learning_rate": 0.00019160340597488816, + "loss": 0.1827, + "step": 2914 + }, + { + "epoch": 0.21032504780114722, + "grad_norm": 0.15402990579605103, + "learning_rate": 0.00019160051955549143, + "loss": 0.145, + "step": 2915 + }, + { + "epoch": 0.21039720047620766, + "grad_norm": 0.08767557144165039, + "learning_rate": 0.0001915976331360947, + "loss": 0.1558, + "step": 2916 + }, + { + "epoch": 0.21046935315126808, + "grad_norm": 0.13662488758563995, + "learning_rate": 0.00019159474671669795, + "loss": 0.1743, + "step": 2917 + }, + { + "epoch": 0.21054150582632852, + "grad_norm": 0.09334705770015717, + "learning_rate": 0.00019159186029730121, + "loss": 0.1683, + "step": 2918 + }, + { + "epoch": 0.21061365850138894, + "grad_norm": 0.19569367170333862, + "learning_rate": 0.00019158897387790448, + "loss": 0.1536, + "step": 2919 + }, + { + "epoch": 0.21068581117644936, + "grad_norm": 0.09817244112491608, + "learning_rate": 0.0001915860874585077, + "loss": 0.1578, + "step": 2920 + }, + { + "epoch": 0.2107579638515098, + "grad_norm": 0.10119809955358505, + "learning_rate": 0.00019158320103911098, + "loss": 0.1348, + "step": 2921 + }, + { + "epoch": 0.21083011652657022, + "grad_norm": 0.11312271654605865, + "learning_rate": 0.00019158031461971427, + "loss": 0.1169, + "step": 2922 + }, + { + "epoch": 0.21090226920163066, + "grad_norm": 0.09860809892416, + "learning_rate": 0.00019157742820031753, + "loss": 0.1417, + "step": 2923 + }, + { + "epoch": 0.21097442187669108, + "grad_norm": 0.17511418461799622, + "learning_rate": 0.0001915745417809208, + "loss": 0.1677, + "step": 2924 + }, + { + "epoch": 0.2110465745517515, + "grad_norm": 0.1460758000612259, + "learning_rate": 0.00019157165536152403, + "loss": 0.1962, + "step": 2925 + }, + { + "epoch": 0.21111872722681194, + "grad_norm": 0.10037209093570709, + "learning_rate": 0.0001915687689421273, + "loss": 0.1742, + "step": 2926 + }, + { + "epoch": 0.21119087990187235, + "grad_norm": 0.08584155887365341, + "learning_rate": 0.00019156588252273055, + "loss": 0.1463, + "step": 2927 + }, + { + "epoch": 0.2112630325769328, + "grad_norm": 0.14209772646427155, + "learning_rate": 0.00019156299610333382, + "loss": 0.1466, + "step": 2928 + }, + { + "epoch": 0.2113351852519932, + "grad_norm": 0.1284438967704773, + "learning_rate": 0.0001915601096839371, + "loss": 0.1588, + "step": 2929 + }, + { + "epoch": 0.21140733792705366, + "grad_norm": 0.09972860664129257, + "learning_rate": 0.00019155722326454034, + "loss": 0.1931, + "step": 2930 + }, + { + "epoch": 0.21147949060211407, + "grad_norm": 0.14195489883422852, + "learning_rate": 0.0001915543368451436, + "loss": 0.186, + "step": 2931 + }, + { + "epoch": 0.2115516432771745, + "grad_norm": 0.10553430765867233, + "learning_rate": 0.00019155145042574687, + "loss": 0.1062, + "step": 2932 + }, + { + "epoch": 0.21162379595223493, + "grad_norm": 0.1190626472234726, + "learning_rate": 0.00019154856400635013, + "loss": 0.1828, + "step": 2933 + }, + { + "epoch": 0.21169594862729535, + "grad_norm": 0.1097407415509224, + "learning_rate": 0.0001915456775869534, + "loss": 0.1551, + "step": 2934 + }, + { + "epoch": 0.2117681013023558, + "grad_norm": 0.11319176852703094, + "learning_rate": 0.00019154279116755666, + "loss": 0.1491, + "step": 2935 + }, + { + "epoch": 0.2118402539774162, + "grad_norm": 0.10741961747407913, + "learning_rate": 0.00019153990474815992, + "loss": 0.1723, + "step": 2936 + }, + { + "epoch": 0.21191240665247665, + "grad_norm": 0.1034005731344223, + "learning_rate": 0.00019153701832876318, + "loss": 0.184, + "step": 2937 + }, + { + "epoch": 0.21198455932753707, + "grad_norm": 0.12362311035394669, + "learning_rate": 0.00019153413190936645, + "loss": 0.1185, + "step": 2938 + }, + { + "epoch": 0.21205671200259749, + "grad_norm": 0.11626561731100082, + "learning_rate": 0.0001915312454899697, + "loss": 0.1414, + "step": 2939 + }, + { + "epoch": 0.21212886467765793, + "grad_norm": 0.10941056162118912, + "learning_rate": 0.00019152835907057297, + "loss": 0.1416, + "step": 2940 + }, + { + "epoch": 0.21220101735271835, + "grad_norm": 0.086311936378479, + "learning_rate": 0.0001915254726511762, + "loss": 0.1494, + "step": 2941 + }, + { + "epoch": 0.2122731700277788, + "grad_norm": 0.1236078068614006, + "learning_rate": 0.00019152258623177947, + "loss": 0.1859, + "step": 2942 + }, + { + "epoch": 0.2123453227028392, + "grad_norm": 0.14309728145599365, + "learning_rate": 0.00019151969981238276, + "loss": 0.2143, + "step": 2943 + }, + { + "epoch": 0.21241747537789962, + "grad_norm": 0.10707279294729233, + "learning_rate": 0.00019151681339298602, + "loss": 0.1604, + "step": 2944 + }, + { + "epoch": 0.21248962805296007, + "grad_norm": 0.11479876190423965, + "learning_rate": 0.00019151392697358929, + "loss": 0.1564, + "step": 2945 + }, + { + "epoch": 0.21256178072802048, + "grad_norm": 0.12447728961706161, + "learning_rate": 0.00019151104055419252, + "loss": 0.1724, + "step": 2946 + }, + { + "epoch": 0.21263393340308093, + "grad_norm": 0.12344259768724442, + "learning_rate": 0.00019150815413479578, + "loss": 0.1851, + "step": 2947 + }, + { + "epoch": 0.21270608607814134, + "grad_norm": 0.10869882255792618, + "learning_rate": 0.00019150526771539905, + "loss": 0.1996, + "step": 2948 + }, + { + "epoch": 0.2127782387532018, + "grad_norm": 0.08875145763158798, + "learning_rate": 0.0001915023812960023, + "loss": 0.1327, + "step": 2949 + }, + { + "epoch": 0.2128503914282622, + "grad_norm": 0.09537830203771591, + "learning_rate": 0.0001914994948766056, + "loss": 0.1982, + "step": 2950 + }, + { + "epoch": 0.21292254410332262, + "grad_norm": 0.1370457410812378, + "learning_rate": 0.00019149660845720884, + "loss": 0.1879, + "step": 2951 + }, + { + "epoch": 0.21299469677838306, + "grad_norm": 0.09743336588144302, + "learning_rate": 0.0001914937220378121, + "loss": 0.1397, + "step": 2952 + }, + { + "epoch": 0.21306684945344348, + "grad_norm": 0.09189710021018982, + "learning_rate": 0.00019149083561841536, + "loss": 0.1825, + "step": 2953 + }, + { + "epoch": 0.21313900212850392, + "grad_norm": 0.14623673260211945, + "learning_rate": 0.00019148794919901862, + "loss": 0.1614, + "step": 2954 + }, + { + "epoch": 0.21321115480356434, + "grad_norm": 0.12172803282737732, + "learning_rate": 0.0001914850627796219, + "loss": 0.223, + "step": 2955 + }, + { + "epoch": 0.21328330747862476, + "grad_norm": 0.11884385347366333, + "learning_rate": 0.00019148217636022515, + "loss": 0.1287, + "step": 2956 + }, + { + "epoch": 0.2133554601536852, + "grad_norm": 0.10667689889669418, + "learning_rate": 0.00019147928994082841, + "loss": 0.1682, + "step": 2957 + }, + { + "epoch": 0.21342761282874562, + "grad_norm": 0.09498683363199234, + "learning_rate": 0.00019147640352143168, + "loss": 0.1062, + "step": 2958 + }, + { + "epoch": 0.21349976550380606, + "grad_norm": 0.09856303781270981, + "learning_rate": 0.00019147351710203494, + "loss": 0.1967, + "step": 2959 + }, + { + "epoch": 0.21357191817886648, + "grad_norm": 0.11182020604610443, + "learning_rate": 0.0001914706306826382, + "loss": 0.207, + "step": 2960 + }, + { + "epoch": 0.21364407085392692, + "grad_norm": 0.11299645900726318, + "learning_rate": 0.00019146774426324147, + "loss": 0.2056, + "step": 2961 + }, + { + "epoch": 0.21371622352898734, + "grad_norm": 0.11871636658906937, + "learning_rate": 0.0001914648578438447, + "loss": 0.1811, + "step": 2962 + }, + { + "epoch": 0.21378837620404775, + "grad_norm": 0.10775674134492874, + "learning_rate": 0.00019146197142444796, + "loss": 0.1377, + "step": 2963 + }, + { + "epoch": 0.2138605288791082, + "grad_norm": 0.1358514130115509, + "learning_rate": 0.00019145908500505125, + "loss": 0.174, + "step": 2964 + }, + { + "epoch": 0.2139326815541686, + "grad_norm": 0.11490422487258911, + "learning_rate": 0.00019145619858565452, + "loss": 0.1962, + "step": 2965 + }, + { + "epoch": 0.21400483422922906, + "grad_norm": 0.1141917034983635, + "learning_rate": 0.00019145331216625778, + "loss": 0.1818, + "step": 2966 + }, + { + "epoch": 0.21407698690428947, + "grad_norm": 0.12067859619855881, + "learning_rate": 0.00019145042574686102, + "loss": 0.1473, + "step": 2967 + }, + { + "epoch": 0.21414913957934992, + "grad_norm": 0.10996080935001373, + "learning_rate": 0.00019144753932746428, + "loss": 0.1726, + "step": 2968 + }, + { + "epoch": 0.21422129225441033, + "grad_norm": 0.13538621366024017, + "learning_rate": 0.00019144465290806754, + "loss": 0.1607, + "step": 2969 + }, + { + "epoch": 0.21429344492947075, + "grad_norm": 0.09215865284204483, + "learning_rate": 0.0001914417664886708, + "loss": 0.1534, + "step": 2970 + }, + { + "epoch": 0.2143655976045312, + "grad_norm": 0.09540208429098129, + "learning_rate": 0.0001914388800692741, + "loss": 0.1916, + "step": 2971 + }, + { + "epoch": 0.2144377502795916, + "grad_norm": 0.10014095902442932, + "learning_rate": 0.00019143599364987733, + "loss": 0.1561, + "step": 2972 + }, + { + "epoch": 0.21450990295465205, + "grad_norm": 0.09593141078948975, + "learning_rate": 0.0001914331072304806, + "loss": 0.1565, + "step": 2973 + }, + { + "epoch": 0.21458205562971247, + "grad_norm": 0.10834231972694397, + "learning_rate": 0.00019143022081108386, + "loss": 0.154, + "step": 2974 + }, + { + "epoch": 0.2146542083047729, + "grad_norm": 0.11025810986757278, + "learning_rate": 0.00019142733439168712, + "loss": 0.2079, + "step": 2975 + }, + { + "epoch": 0.21472636097983333, + "grad_norm": 0.13344484567642212, + "learning_rate": 0.00019142444797229038, + "loss": 0.1311, + "step": 2976 + }, + { + "epoch": 0.21479851365489375, + "grad_norm": 0.13370724022388458, + "learning_rate": 0.00019142156155289364, + "loss": 0.2002, + "step": 2977 + }, + { + "epoch": 0.2148706663299542, + "grad_norm": 0.12002386152744293, + "learning_rate": 0.0001914186751334969, + "loss": 0.1451, + "step": 2978 + }, + { + "epoch": 0.2149428190050146, + "grad_norm": 0.128787562251091, + "learning_rate": 0.00019141578871410017, + "loss": 0.2031, + "step": 2979 + }, + { + "epoch": 0.21501497168007505, + "grad_norm": 0.11301092058420181, + "learning_rate": 0.00019141290229470343, + "loss": 0.1534, + "step": 2980 + }, + { + "epoch": 0.21508712435513547, + "grad_norm": 0.10730845481157303, + "learning_rate": 0.0001914100158753067, + "loss": 0.1985, + "step": 2981 + }, + { + "epoch": 0.21515927703019588, + "grad_norm": 0.10858388245105743, + "learning_rate": 0.00019140712945590996, + "loss": 0.1915, + "step": 2982 + }, + { + "epoch": 0.21523142970525633, + "grad_norm": 0.12959133088588715, + "learning_rate": 0.0001914042430365132, + "loss": 0.1395, + "step": 2983 + }, + { + "epoch": 0.21530358238031674, + "grad_norm": 0.13987712562084198, + "learning_rate": 0.00019140135661711646, + "loss": 0.1777, + "step": 2984 + }, + { + "epoch": 0.2153757350553772, + "grad_norm": 0.107297882437706, + "learning_rate": 0.00019139847019771975, + "loss": 0.153, + "step": 2985 + }, + { + "epoch": 0.2154478877304376, + "grad_norm": 0.0874083861708641, + "learning_rate": 0.000191395583778323, + "loss": 0.1361, + "step": 2986 + }, + { + "epoch": 0.21552004040549802, + "grad_norm": 0.08346877247095108, + "learning_rate": 0.00019139269735892627, + "loss": 0.1945, + "step": 2987 + }, + { + "epoch": 0.21559219308055846, + "grad_norm": 0.09217635542154312, + "learning_rate": 0.0001913898109395295, + "loss": 0.1037, + "step": 2988 + }, + { + "epoch": 0.21566434575561888, + "grad_norm": 0.09559168666601181, + "learning_rate": 0.00019138692452013277, + "loss": 0.1419, + "step": 2989 + }, + { + "epoch": 0.21573649843067932, + "grad_norm": 0.13188686966896057, + "learning_rate": 0.00019138403810073604, + "loss": 0.1728, + "step": 2990 + }, + { + "epoch": 0.21580865110573974, + "grad_norm": 0.13026142120361328, + "learning_rate": 0.0001913811516813393, + "loss": 0.1051, + "step": 2991 + }, + { + "epoch": 0.21588080378080018, + "grad_norm": 0.12346373498439789, + "learning_rate": 0.0001913782652619426, + "loss": 0.174, + "step": 2992 + }, + { + "epoch": 0.2159529564558606, + "grad_norm": 0.10167650878429413, + "learning_rate": 0.00019137537884254585, + "loss": 0.182, + "step": 2993 + }, + { + "epoch": 0.21602510913092102, + "grad_norm": 0.12183064967393875, + "learning_rate": 0.0001913724924231491, + "loss": 0.1568, + "step": 2994 + }, + { + "epoch": 0.21609726180598146, + "grad_norm": 0.1322290003299713, + "learning_rate": 0.00019136960600375235, + "loss": 0.1851, + "step": 2995 + }, + { + "epoch": 0.21616941448104188, + "grad_norm": 0.11985625326633453, + "learning_rate": 0.0001913667195843556, + "loss": 0.1265, + "step": 2996 + }, + { + "epoch": 0.21624156715610232, + "grad_norm": 0.12829148769378662, + "learning_rate": 0.00019136383316495888, + "loss": 0.1566, + "step": 2997 + }, + { + "epoch": 0.21631371983116274, + "grad_norm": 0.10552884638309479, + "learning_rate": 0.00019136094674556214, + "loss": 0.1533, + "step": 2998 + }, + { + "epoch": 0.21638587250622318, + "grad_norm": 0.12362322956323624, + "learning_rate": 0.0001913580603261654, + "loss": 0.2138, + "step": 2999 + }, + { + "epoch": 0.2164580251812836, + "grad_norm": 0.09862411767244339, + "learning_rate": 0.00019135517390676867, + "loss": 0.1708, + "step": 3000 + }, + { + "epoch": 0.216530177856344, + "grad_norm": 0.1280146986246109, + "learning_rate": 0.00019135228748737193, + "loss": 0.1904, + "step": 3001 + }, + { + "epoch": 0.21660233053140446, + "grad_norm": 0.1173025518655777, + "learning_rate": 0.0001913494010679752, + "loss": 0.1402, + "step": 3002 + }, + { + "epoch": 0.21667448320646487, + "grad_norm": 0.1203983724117279, + "learning_rate": 0.00019134651464857845, + "loss": 0.1818, + "step": 3003 + }, + { + "epoch": 0.21674663588152532, + "grad_norm": 0.12568947672843933, + "learning_rate": 0.00019134362822918172, + "loss": 0.1846, + "step": 3004 + }, + { + "epoch": 0.21681878855658573, + "grad_norm": 0.12149185687303543, + "learning_rate": 0.00019134074180978495, + "loss": 0.208, + "step": 3005 + }, + { + "epoch": 0.21689094123164615, + "grad_norm": 0.12697440385818481, + "learning_rate": 0.00019133785539038824, + "loss": 0.2275, + "step": 3006 + }, + { + "epoch": 0.2169630939067066, + "grad_norm": 0.10044363886117935, + "learning_rate": 0.0001913349689709915, + "loss": 0.1716, + "step": 3007 + }, + { + "epoch": 0.217035246581767, + "grad_norm": 0.11203659325838089, + "learning_rate": 0.00019133208255159477, + "loss": 0.1759, + "step": 3008 + }, + { + "epoch": 0.21710739925682745, + "grad_norm": 0.1191219836473465, + "learning_rate": 0.00019132919613219803, + "loss": 0.1691, + "step": 3009 + }, + { + "epoch": 0.21717955193188787, + "grad_norm": 0.12754131853580475, + "learning_rate": 0.00019132630971280127, + "loss": 0.126, + "step": 3010 + }, + { + "epoch": 0.21725170460694831, + "grad_norm": 0.13199105858802795, + "learning_rate": 0.00019132342329340453, + "loss": 0.1483, + "step": 3011 + }, + { + "epoch": 0.21732385728200873, + "grad_norm": 0.09693431109189987, + "learning_rate": 0.0001913205368740078, + "loss": 0.1428, + "step": 3012 + }, + { + "epoch": 0.21739600995706915, + "grad_norm": 0.1105511337518692, + "learning_rate": 0.00019131765045461108, + "loss": 0.1549, + "step": 3013 + }, + { + "epoch": 0.2174681626321296, + "grad_norm": 0.11166469007730484, + "learning_rate": 0.00019131476403521435, + "loss": 0.1292, + "step": 3014 + }, + { + "epoch": 0.21754031530719, + "grad_norm": 0.1035347580909729, + "learning_rate": 0.00019131187761581758, + "loss": 0.146, + "step": 3015 + }, + { + "epoch": 0.21761246798225045, + "grad_norm": 0.12998312711715698, + "learning_rate": 0.00019130899119642084, + "loss": 0.2104, + "step": 3016 + }, + { + "epoch": 0.21768462065731087, + "grad_norm": 0.10929346084594727, + "learning_rate": 0.0001913061047770241, + "loss": 0.186, + "step": 3017 + }, + { + "epoch": 0.21775677333237128, + "grad_norm": 0.11475662142038345, + "learning_rate": 0.00019130321835762737, + "loss": 0.1326, + "step": 3018 + }, + { + "epoch": 0.21782892600743173, + "grad_norm": 0.10569846630096436, + "learning_rate": 0.00019130033193823063, + "loss": 0.1797, + "step": 3019 + }, + { + "epoch": 0.21790107868249214, + "grad_norm": 0.09804397821426392, + "learning_rate": 0.0001912974455188339, + "loss": 0.1646, + "step": 3020 + }, + { + "epoch": 0.2179732313575526, + "grad_norm": 0.10693866014480591, + "learning_rate": 0.00019129455909943716, + "loss": 0.1377, + "step": 3021 + }, + { + "epoch": 0.218045384032613, + "grad_norm": 0.12969979643821716, + "learning_rate": 0.00019129167268004042, + "loss": 0.1416, + "step": 3022 + }, + { + "epoch": 0.21811753670767345, + "grad_norm": 0.11125360429286957, + "learning_rate": 0.00019128878626064369, + "loss": 0.1604, + "step": 3023 + }, + { + "epoch": 0.21818968938273386, + "grad_norm": 0.0953187644481659, + "learning_rate": 0.00019128589984124695, + "loss": 0.1974, + "step": 3024 + }, + { + "epoch": 0.21826184205779428, + "grad_norm": 0.11042159795761108, + "learning_rate": 0.0001912830134218502, + "loss": 0.1408, + "step": 3025 + }, + { + "epoch": 0.21833399473285472, + "grad_norm": 0.10423165559768677, + "learning_rate": 0.00019128012700245345, + "loss": 0.1304, + "step": 3026 + }, + { + "epoch": 0.21840614740791514, + "grad_norm": 0.12057065218687057, + "learning_rate": 0.00019127724058305674, + "loss": 0.2244, + "step": 3027 + }, + { + "epoch": 0.21847830008297559, + "grad_norm": 0.09170324355363846, + "learning_rate": 0.00019127435416366, + "loss": 0.2083, + "step": 3028 + }, + { + "epoch": 0.218550452758036, + "grad_norm": 0.12476850301027298, + "learning_rate": 0.00019127146774426326, + "loss": 0.1567, + "step": 3029 + }, + { + "epoch": 0.21862260543309645, + "grad_norm": 0.09885885566473007, + "learning_rate": 0.00019126858132486653, + "loss": 0.195, + "step": 3030 + }, + { + "epoch": 0.21869475810815686, + "grad_norm": 0.12118660658597946, + "learning_rate": 0.00019126569490546976, + "loss": 0.1311, + "step": 3031 + }, + { + "epoch": 0.21876691078321728, + "grad_norm": 0.13478021323680878, + "learning_rate": 0.00019126280848607302, + "loss": 0.1493, + "step": 3032 + }, + { + "epoch": 0.21883906345827772, + "grad_norm": 0.11218003183603287, + "learning_rate": 0.0001912599220666763, + "loss": 0.1607, + "step": 3033 + }, + { + "epoch": 0.21891121613333814, + "grad_norm": 0.12066203355789185, + "learning_rate": 0.00019125703564727958, + "loss": 0.1581, + "step": 3034 + }, + { + "epoch": 0.21898336880839858, + "grad_norm": 0.12402987480163574, + "learning_rate": 0.00019125414922788284, + "loss": 0.2097, + "step": 3035 + }, + { + "epoch": 0.219055521483459, + "grad_norm": 0.1115858182311058, + "learning_rate": 0.00019125126280848608, + "loss": 0.1534, + "step": 3036 + }, + { + "epoch": 0.21912767415851941, + "grad_norm": 0.09497125446796417, + "learning_rate": 0.00019124837638908934, + "loss": 0.1503, + "step": 3037 + }, + { + "epoch": 0.21919982683357986, + "grad_norm": 0.11514826864004135, + "learning_rate": 0.0001912454899696926, + "loss": 0.1594, + "step": 3038 + }, + { + "epoch": 0.21927197950864027, + "grad_norm": 0.0928627997636795, + "learning_rate": 0.00019124260355029586, + "loss": 0.193, + "step": 3039 + }, + { + "epoch": 0.21934413218370072, + "grad_norm": 0.113450787961483, + "learning_rate": 0.00019123971713089913, + "loss": 0.1353, + "step": 3040 + }, + { + "epoch": 0.21941628485876113, + "grad_norm": 0.09100908041000366, + "learning_rate": 0.0001912368307115024, + "loss": 0.1948, + "step": 3041 + }, + { + "epoch": 0.21948843753382158, + "grad_norm": 0.08781091123819351, + "learning_rate": 0.00019123394429210565, + "loss": 0.1401, + "step": 3042 + }, + { + "epoch": 0.219560590208882, + "grad_norm": 0.08657009154558182, + "learning_rate": 0.00019123105787270892, + "loss": 0.1571, + "step": 3043 + }, + { + "epoch": 0.2196327428839424, + "grad_norm": 0.10702019184827805, + "learning_rate": 0.00019122817145331218, + "loss": 0.1428, + "step": 3044 + }, + { + "epoch": 0.21970489555900286, + "grad_norm": 0.10281525552272797, + "learning_rate": 0.00019122528503391544, + "loss": 0.177, + "step": 3045 + }, + { + "epoch": 0.21977704823406327, + "grad_norm": 0.08736959099769592, + "learning_rate": 0.0001912223986145187, + "loss": 0.1571, + "step": 3046 + }, + { + "epoch": 0.21984920090912372, + "grad_norm": 0.11643321067094803, + "learning_rate": 0.00019121951219512194, + "loss": 0.1386, + "step": 3047 + }, + { + "epoch": 0.21992135358418413, + "grad_norm": 0.11699894815683365, + "learning_rate": 0.0001912166257757252, + "loss": 0.1531, + "step": 3048 + }, + { + "epoch": 0.21999350625924455, + "grad_norm": 0.12016183882951736, + "learning_rate": 0.0001912137393563285, + "loss": 0.1519, + "step": 3049 + }, + { + "epoch": 0.220065658934305, + "grad_norm": 0.09290867298841476, + "learning_rate": 0.00019121085293693176, + "loss": 0.1796, + "step": 3050 + }, + { + "epoch": 0.2201378116093654, + "grad_norm": 0.1268221139907837, + "learning_rate": 0.00019120796651753502, + "loss": 0.1367, + "step": 3051 + }, + { + "epoch": 0.22020996428442585, + "grad_norm": 0.10724075883626938, + "learning_rate": 0.00019120508009813826, + "loss": 0.1374, + "step": 3052 + }, + { + "epoch": 0.22028211695948627, + "grad_norm": 0.10854970663785934, + "learning_rate": 0.00019120219367874152, + "loss": 0.1701, + "step": 3053 + }, + { + "epoch": 0.2203542696345467, + "grad_norm": 0.12508173286914825, + "learning_rate": 0.00019119930725934478, + "loss": 0.1551, + "step": 3054 + }, + { + "epoch": 0.22042642230960713, + "grad_norm": 0.1480616331100464, + "learning_rate": 0.00019119642083994804, + "loss": 0.1589, + "step": 3055 + }, + { + "epoch": 0.22049857498466754, + "grad_norm": 0.11518403887748718, + "learning_rate": 0.00019119353442055133, + "loss": 0.2207, + "step": 3056 + }, + { + "epoch": 0.220570727659728, + "grad_norm": 0.14292766153812408, + "learning_rate": 0.00019119064800115457, + "loss": 0.1321, + "step": 3057 + }, + { + "epoch": 0.2206428803347884, + "grad_norm": 0.09787577390670776, + "learning_rate": 0.00019118776158175783, + "loss": 0.2041, + "step": 3058 + }, + { + "epoch": 0.22071503300984885, + "grad_norm": 0.12522047758102417, + "learning_rate": 0.0001911848751623611, + "loss": 0.1175, + "step": 3059 + }, + { + "epoch": 0.22078718568490927, + "grad_norm": 0.11362268030643463, + "learning_rate": 0.00019118198874296436, + "loss": 0.1269, + "step": 3060 + }, + { + "epoch": 0.2208593383599697, + "grad_norm": 0.1254872977733612, + "learning_rate": 0.00019117910232356762, + "loss": 0.122, + "step": 3061 + }, + { + "epoch": 0.22093149103503013, + "grad_norm": 0.12300896644592285, + "learning_rate": 0.00019117621590417088, + "loss": 0.1664, + "step": 3062 + }, + { + "epoch": 0.22100364371009054, + "grad_norm": 0.09971299767494202, + "learning_rate": 0.00019117332948477415, + "loss": 0.1542, + "step": 3063 + }, + { + "epoch": 0.22107579638515099, + "grad_norm": 0.09395595639944077, + "learning_rate": 0.0001911704430653774, + "loss": 0.1641, + "step": 3064 + }, + { + "epoch": 0.2211479490602114, + "grad_norm": 0.13465718924999237, + "learning_rate": 0.00019116755664598067, + "loss": 0.145, + "step": 3065 + }, + { + "epoch": 0.22122010173527185, + "grad_norm": 0.11642010509967804, + "learning_rate": 0.00019116467022658394, + "loss": 0.1488, + "step": 3066 + }, + { + "epoch": 0.22129225441033226, + "grad_norm": 0.12988223135471344, + "learning_rate": 0.0001911617838071872, + "loss": 0.2116, + "step": 3067 + }, + { + "epoch": 0.22136440708539268, + "grad_norm": 0.10000425577163696, + "learning_rate": 0.00019115889738779044, + "loss": 0.1726, + "step": 3068 + }, + { + "epoch": 0.22143655976045312, + "grad_norm": 0.11322981864213943, + "learning_rate": 0.0001911560109683937, + "loss": 0.2174, + "step": 3069 + }, + { + "epoch": 0.22150871243551354, + "grad_norm": 0.16038878262043, + "learning_rate": 0.000191153124548997, + "loss": 0.1741, + "step": 3070 + }, + { + "epoch": 0.22158086511057398, + "grad_norm": 0.11883338540792465, + "learning_rate": 0.00019115023812960025, + "loss": 0.1211, + "step": 3071 + }, + { + "epoch": 0.2216530177856344, + "grad_norm": 0.11840777099132538, + "learning_rate": 0.00019114735171020351, + "loss": 0.1558, + "step": 3072 + }, + { + "epoch": 0.22172517046069484, + "grad_norm": 0.09917975962162018, + "learning_rate": 0.00019114446529080675, + "loss": 0.1485, + "step": 3073 + }, + { + "epoch": 0.22179732313575526, + "grad_norm": 0.11423740535974503, + "learning_rate": 0.00019114157887141, + "loss": 0.1703, + "step": 3074 + }, + { + "epoch": 0.22186947581081568, + "grad_norm": 0.15673841536045074, + "learning_rate": 0.00019113869245201328, + "loss": 0.1398, + "step": 3075 + }, + { + "epoch": 0.22194162848587612, + "grad_norm": 0.09600669890642166, + "learning_rate": 0.00019113580603261654, + "loss": 0.1484, + "step": 3076 + }, + { + "epoch": 0.22201378116093654, + "grad_norm": 0.10209406912326813, + "learning_rate": 0.00019113291961321983, + "loss": 0.1613, + "step": 3077 + }, + { + "epoch": 0.22208593383599698, + "grad_norm": 0.10075556486845016, + "learning_rate": 0.00019113003319382306, + "loss": 0.1599, + "step": 3078 + }, + { + "epoch": 0.2221580865110574, + "grad_norm": 0.11392730474472046, + "learning_rate": 0.00019112714677442633, + "loss": 0.1592, + "step": 3079 + }, + { + "epoch": 0.2222302391861178, + "grad_norm": 0.11423641443252563, + "learning_rate": 0.0001911242603550296, + "loss": 0.1579, + "step": 3080 + }, + { + "epoch": 0.22230239186117826, + "grad_norm": 0.11044574528932571, + "learning_rate": 0.00019112137393563285, + "loss": 0.1136, + "step": 3081 + }, + { + "epoch": 0.22237454453623867, + "grad_norm": 0.10218565165996552, + "learning_rate": 0.00019111848751623612, + "loss": 0.1612, + "step": 3082 + }, + { + "epoch": 0.22244669721129912, + "grad_norm": 0.11987396329641342, + "learning_rate": 0.00019111560109683938, + "loss": 0.1751, + "step": 3083 + }, + { + "epoch": 0.22251884988635953, + "grad_norm": 0.12039399892091751, + "learning_rate": 0.00019111271467744264, + "loss": 0.1596, + "step": 3084 + }, + { + "epoch": 0.22259100256141998, + "grad_norm": 0.11418411880731583, + "learning_rate": 0.0001911098282580459, + "loss": 0.1349, + "step": 3085 + }, + { + "epoch": 0.2226631552364804, + "grad_norm": 0.10532049834728241, + "learning_rate": 0.00019110694183864917, + "loss": 0.1326, + "step": 3086 + }, + { + "epoch": 0.2227353079115408, + "grad_norm": 0.09723195433616638, + "learning_rate": 0.00019110405541925243, + "loss": 0.1468, + "step": 3087 + }, + { + "epoch": 0.22280746058660125, + "grad_norm": 0.09533675760030746, + "learning_rate": 0.0001911011689998557, + "loss": 0.1551, + "step": 3088 + }, + { + "epoch": 0.22287961326166167, + "grad_norm": 0.13404034078121185, + "learning_rate": 0.00019109828258045893, + "loss": 0.1659, + "step": 3089 + }, + { + "epoch": 0.2229517659367221, + "grad_norm": 0.10431547462940216, + "learning_rate": 0.0001910953961610622, + "loss": 0.1477, + "step": 3090 + }, + { + "epoch": 0.22302391861178253, + "grad_norm": 0.11172360181808472, + "learning_rate": 0.00019109250974166548, + "loss": 0.121, + "step": 3091 + }, + { + "epoch": 0.22309607128684297, + "grad_norm": 0.12391429394483566, + "learning_rate": 0.00019108962332226875, + "loss": 0.2049, + "step": 3092 + }, + { + "epoch": 0.2231682239619034, + "grad_norm": 0.10060824453830719, + "learning_rate": 0.000191086736902872, + "loss": 0.1771, + "step": 3093 + }, + { + "epoch": 0.2232403766369638, + "grad_norm": 0.10428790748119354, + "learning_rate": 0.00019108385048347524, + "loss": 0.175, + "step": 3094 + }, + { + "epoch": 0.22331252931202425, + "grad_norm": 0.09861437231302261, + "learning_rate": 0.0001910809640640785, + "loss": 0.1792, + "step": 3095 + }, + { + "epoch": 0.22338468198708467, + "grad_norm": 0.10132397711277008, + "learning_rate": 0.00019107807764468177, + "loss": 0.1169, + "step": 3096 + }, + { + "epoch": 0.2234568346621451, + "grad_norm": 0.11215358972549438, + "learning_rate": 0.00019107519122528503, + "loss": 0.1656, + "step": 3097 + }, + { + "epoch": 0.22352898733720553, + "grad_norm": 0.10548488795757294, + "learning_rate": 0.00019107230480588832, + "loss": 0.2087, + "step": 3098 + }, + { + "epoch": 0.22360114001226594, + "grad_norm": 0.10882903635501862, + "learning_rate": 0.00019106941838649156, + "loss": 0.1485, + "step": 3099 + }, + { + "epoch": 0.22367329268732639, + "grad_norm": 0.12354625761508942, + "learning_rate": 0.00019106653196709482, + "loss": 0.1761, + "step": 3100 + }, + { + "epoch": 0.2237454453623868, + "grad_norm": 0.09949934482574463, + "learning_rate": 0.00019106364554769808, + "loss": 0.1178, + "step": 3101 + }, + { + "epoch": 0.22381759803744725, + "grad_norm": 0.151101216673851, + "learning_rate": 0.00019106075912830135, + "loss": 0.1846, + "step": 3102 + }, + { + "epoch": 0.22388975071250766, + "grad_norm": 0.11479262262582779, + "learning_rate": 0.0001910578727089046, + "loss": 0.2069, + "step": 3103 + }, + { + "epoch": 0.2239619033875681, + "grad_norm": 0.10903146862983704, + "learning_rate": 0.00019105498628950787, + "loss": 0.1742, + "step": 3104 + }, + { + "epoch": 0.22403405606262852, + "grad_norm": 0.08431587368249893, + "learning_rate": 0.00019105209987011114, + "loss": 0.156, + "step": 3105 + }, + { + "epoch": 0.22410620873768894, + "grad_norm": 0.10032767057418823, + "learning_rate": 0.0001910492134507144, + "loss": 0.1504, + "step": 3106 + }, + { + "epoch": 0.22417836141274938, + "grad_norm": 0.1023092269897461, + "learning_rate": 0.00019104632703131766, + "loss": 0.1612, + "step": 3107 + }, + { + "epoch": 0.2242505140878098, + "grad_norm": 0.0954068973660469, + "learning_rate": 0.00019104344061192093, + "loss": 0.1334, + "step": 3108 + }, + { + "epoch": 0.22432266676287024, + "grad_norm": 0.11150647699832916, + "learning_rate": 0.0001910405541925242, + "loss": 0.1877, + "step": 3109 + }, + { + "epoch": 0.22439481943793066, + "grad_norm": 0.10485085099935532, + "learning_rate": 0.00019103766777312745, + "loss": 0.1624, + "step": 3110 + }, + { + "epoch": 0.22446697211299108, + "grad_norm": 0.10941198468208313, + "learning_rate": 0.0001910347813537307, + "loss": 0.1276, + "step": 3111 + }, + { + "epoch": 0.22453912478805152, + "grad_norm": 0.1089855507016182, + "learning_rate": 0.00019103189493433398, + "loss": 0.1347, + "step": 3112 + }, + { + "epoch": 0.22461127746311194, + "grad_norm": 0.11594249308109283, + "learning_rate": 0.00019102900851493724, + "loss": 0.1606, + "step": 3113 + }, + { + "epoch": 0.22468343013817238, + "grad_norm": 0.10764208436012268, + "learning_rate": 0.0001910261220955405, + "loss": 0.1326, + "step": 3114 + }, + { + "epoch": 0.2247555828132328, + "grad_norm": 0.124549500644207, + "learning_rate": 0.00019102323567614377, + "loss": 0.1678, + "step": 3115 + }, + { + "epoch": 0.22482773548829324, + "grad_norm": 0.1057669147849083, + "learning_rate": 0.000191020349256747, + "loss": 0.1681, + "step": 3116 + }, + { + "epoch": 0.22489988816335366, + "grad_norm": 0.12275435030460358, + "learning_rate": 0.00019101746283735026, + "loss": 0.1619, + "step": 3117 + }, + { + "epoch": 0.22497204083841407, + "grad_norm": 0.09398017078638077, + "learning_rate": 0.00019101457641795353, + "loss": 0.1691, + "step": 3118 + }, + { + "epoch": 0.22504419351347452, + "grad_norm": 0.09353601932525635, + "learning_rate": 0.00019101168999855682, + "loss": 0.1682, + "step": 3119 + }, + { + "epoch": 0.22511634618853493, + "grad_norm": 0.09263613820075989, + "learning_rate": 0.00019100880357916008, + "loss": 0.18, + "step": 3120 + }, + { + "epoch": 0.22518849886359538, + "grad_norm": 0.11538559198379517, + "learning_rate": 0.00019100591715976332, + "loss": 0.1233, + "step": 3121 + }, + { + "epoch": 0.2252606515386558, + "grad_norm": 0.09187686443328857, + "learning_rate": 0.00019100303074036658, + "loss": 0.1298, + "step": 3122 + }, + { + "epoch": 0.22533280421371624, + "grad_norm": 0.11960215121507645, + "learning_rate": 0.00019100014432096984, + "loss": 0.2008, + "step": 3123 + }, + { + "epoch": 0.22540495688877665, + "grad_norm": 0.10731208324432373, + "learning_rate": 0.0001909972579015731, + "loss": 0.1744, + "step": 3124 + }, + { + "epoch": 0.22547710956383707, + "grad_norm": 0.10698284208774567, + "learning_rate": 0.00019099437148217637, + "loss": 0.1521, + "step": 3125 + }, + { + "epoch": 0.2255492622388975, + "grad_norm": 0.12406127899885178, + "learning_rate": 0.00019099148506277963, + "loss": 0.1518, + "step": 3126 + }, + { + "epoch": 0.22562141491395793, + "grad_norm": 0.1567186415195465, + "learning_rate": 0.0001909885986433829, + "loss": 0.2005, + "step": 3127 + }, + { + "epoch": 0.22569356758901837, + "grad_norm": 0.10155721008777618, + "learning_rate": 0.00019098571222398616, + "loss": 0.1554, + "step": 3128 + }, + { + "epoch": 0.2257657202640788, + "grad_norm": 0.15428893268108368, + "learning_rate": 0.00019098282580458942, + "loss": 0.1672, + "step": 3129 + }, + { + "epoch": 0.2258378729391392, + "grad_norm": 0.11552326381206512, + "learning_rate": 0.00019097993938519268, + "loss": 0.14, + "step": 3130 + }, + { + "epoch": 0.22591002561419965, + "grad_norm": 0.1283775120973587, + "learning_rate": 0.00019097705296579595, + "loss": 0.2078, + "step": 3131 + }, + { + "epoch": 0.22598217828926007, + "grad_norm": 0.09192639589309692, + "learning_rate": 0.00019097416654639918, + "loss": 0.1366, + "step": 3132 + }, + { + "epoch": 0.2260543309643205, + "grad_norm": 0.09906932711601257, + "learning_rate": 0.00019097128012700247, + "loss": 0.1406, + "step": 3133 + }, + { + "epoch": 0.22612648363938093, + "grad_norm": 0.1220705658197403, + "learning_rate": 0.00019096839370760573, + "loss": 0.187, + "step": 3134 + }, + { + "epoch": 0.22619863631444137, + "grad_norm": 0.1256432682275772, + "learning_rate": 0.000190965507288209, + "loss": 0.1441, + "step": 3135 + }, + { + "epoch": 0.2262707889895018, + "grad_norm": 0.10777739435434341, + "learning_rate": 0.00019096262086881226, + "loss": 0.1595, + "step": 3136 + }, + { + "epoch": 0.2263429416645622, + "grad_norm": 0.09372089058160782, + "learning_rate": 0.0001909597344494155, + "loss": 0.1858, + "step": 3137 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.10302763432264328, + "learning_rate": 0.00019095684803001876, + "loss": 0.1532, + "step": 3138 + }, + { + "epoch": 0.22648724701468306, + "grad_norm": 0.10827448219060898, + "learning_rate": 0.00019095396161062202, + "loss": 0.1759, + "step": 3139 + }, + { + "epoch": 0.2265593996897435, + "grad_norm": 0.10127770155668259, + "learning_rate": 0.0001909510751912253, + "loss": 0.1424, + "step": 3140 + }, + { + "epoch": 0.22663155236480392, + "grad_norm": 0.1250356286764145, + "learning_rate": 0.00019094818877182857, + "loss": 0.14, + "step": 3141 + }, + { + "epoch": 0.22670370503986434, + "grad_norm": 0.09610163420438766, + "learning_rate": 0.0001909453023524318, + "loss": 0.1763, + "step": 3142 + }, + { + "epoch": 0.22677585771492478, + "grad_norm": 0.1547233760356903, + "learning_rate": 0.00019094241593303507, + "loss": 0.1494, + "step": 3143 + }, + { + "epoch": 0.2268480103899852, + "grad_norm": 0.10779442638158798, + "learning_rate": 0.00019093952951363834, + "loss": 0.1992, + "step": 3144 + }, + { + "epoch": 0.22692016306504564, + "grad_norm": 0.13055872917175293, + "learning_rate": 0.0001909366430942416, + "loss": 0.195, + "step": 3145 + }, + { + "epoch": 0.22699231574010606, + "grad_norm": 0.1157553419470787, + "learning_rate": 0.00019093375667484486, + "loss": 0.1614, + "step": 3146 + }, + { + "epoch": 0.2270644684151665, + "grad_norm": 0.17626427114009857, + "learning_rate": 0.00019093087025544812, + "loss": 0.1898, + "step": 3147 + }, + { + "epoch": 0.22713662109022692, + "grad_norm": 0.0990791916847229, + "learning_rate": 0.0001909279838360514, + "loss": 0.2164, + "step": 3148 + }, + { + "epoch": 0.22720877376528734, + "grad_norm": 0.19372622668743134, + "learning_rate": 0.00019092509741665465, + "loss": 0.1583, + "step": 3149 + }, + { + "epoch": 0.22728092644034778, + "grad_norm": 0.1041846051812172, + "learning_rate": 0.00019092221099725791, + "loss": 0.1766, + "step": 3150 + }, + { + "epoch": 0.2273530791154082, + "grad_norm": 0.1104101613163948, + "learning_rate": 0.00019091932457786118, + "loss": 0.1466, + "step": 3151 + }, + { + "epoch": 0.22742523179046864, + "grad_norm": 0.09540078043937683, + "learning_rate": 0.00019091643815846444, + "loss": 0.1366, + "step": 3152 + }, + { + "epoch": 0.22749738446552906, + "grad_norm": 0.14345785975456238, + "learning_rate": 0.00019091355173906768, + "loss": 0.1839, + "step": 3153 + }, + { + "epoch": 0.2275695371405895, + "grad_norm": 0.10636702924966812, + "learning_rate": 0.00019091066531967097, + "loss": 0.1484, + "step": 3154 + }, + { + "epoch": 0.22764168981564992, + "grad_norm": 0.11496289819478989, + "learning_rate": 0.00019090777890027423, + "loss": 0.1803, + "step": 3155 + }, + { + "epoch": 0.22771384249071033, + "grad_norm": 0.13613425195217133, + "learning_rate": 0.0001909048924808775, + "loss": 0.1839, + "step": 3156 + }, + { + "epoch": 0.22778599516577078, + "grad_norm": 0.10940530151128769, + "learning_rate": 0.00019090200606148075, + "loss": 0.1476, + "step": 3157 + }, + { + "epoch": 0.2278581478408312, + "grad_norm": 0.08880080282688141, + "learning_rate": 0.000190899119642084, + "loss": 0.1605, + "step": 3158 + }, + { + "epoch": 0.22793030051589164, + "grad_norm": 0.1113625019788742, + "learning_rate": 0.00019089623322268725, + "loss": 0.1393, + "step": 3159 + }, + { + "epoch": 0.22800245319095205, + "grad_norm": 0.12891051173210144, + "learning_rate": 0.00019089334680329052, + "loss": 0.194, + "step": 3160 + }, + { + "epoch": 0.22807460586601247, + "grad_norm": 0.0975043773651123, + "learning_rate": 0.0001908904603838938, + "loss": 0.1996, + "step": 3161 + }, + { + "epoch": 0.2281467585410729, + "grad_norm": 0.09029150754213333, + "learning_rate": 0.00019088757396449707, + "loss": 0.1742, + "step": 3162 + }, + { + "epoch": 0.22821891121613333, + "grad_norm": 0.07928082346916199, + "learning_rate": 0.0001908846875451003, + "loss": 0.1659, + "step": 3163 + }, + { + "epoch": 0.22829106389119377, + "grad_norm": 0.10646059364080429, + "learning_rate": 0.00019088180112570357, + "loss": 0.1513, + "step": 3164 + }, + { + "epoch": 0.2283632165662542, + "grad_norm": 0.10814210027456284, + "learning_rate": 0.00019087891470630683, + "loss": 0.1667, + "step": 3165 + }, + { + "epoch": 0.22843536924131463, + "grad_norm": 0.09975995868444443, + "learning_rate": 0.0001908760282869101, + "loss": 0.1305, + "step": 3166 + }, + { + "epoch": 0.22850752191637505, + "grad_norm": 0.12126446515321732, + "learning_rate": 0.00019087314186751336, + "loss": 0.2065, + "step": 3167 + }, + { + "epoch": 0.22857967459143547, + "grad_norm": 0.09508046507835388, + "learning_rate": 0.00019087025544811662, + "loss": 0.1939, + "step": 3168 + }, + { + "epoch": 0.2286518272664959, + "grad_norm": 0.11263048648834229, + "learning_rate": 0.00019086736902871988, + "loss": 0.1556, + "step": 3169 + }, + { + "epoch": 0.22872397994155633, + "grad_norm": 0.1208372488617897, + "learning_rate": 0.00019086448260932314, + "loss": 0.1911, + "step": 3170 + }, + { + "epoch": 0.22879613261661677, + "grad_norm": 0.11846572160720825, + "learning_rate": 0.0001908615961899264, + "loss": 0.1708, + "step": 3171 + }, + { + "epoch": 0.2288682852916772, + "grad_norm": 0.1360793560743332, + "learning_rate": 0.00019085870977052967, + "loss": 0.1534, + "step": 3172 + }, + { + "epoch": 0.2289404379667376, + "grad_norm": 0.09993666410446167, + "learning_rate": 0.00019085582335113293, + "loss": 0.1468, + "step": 3173 + }, + { + "epoch": 0.22901259064179805, + "grad_norm": 0.09834518283605576, + "learning_rate": 0.00019085293693173617, + "loss": 0.1472, + "step": 3174 + }, + { + "epoch": 0.22908474331685846, + "grad_norm": 0.08449302613735199, + "learning_rate": 0.00019085005051233946, + "loss": 0.1319, + "step": 3175 + }, + { + "epoch": 0.2291568959919189, + "grad_norm": 0.12296770513057709, + "learning_rate": 0.00019084716409294272, + "loss": 0.1943, + "step": 3176 + }, + { + "epoch": 0.22922904866697932, + "grad_norm": 0.10122761875391006, + "learning_rate": 0.00019084427767354599, + "loss": 0.1603, + "step": 3177 + }, + { + "epoch": 0.22930120134203977, + "grad_norm": 0.10421742498874664, + "learning_rate": 0.00019084139125414925, + "loss": 0.1439, + "step": 3178 + }, + { + "epoch": 0.22937335401710018, + "grad_norm": 0.11106200516223907, + "learning_rate": 0.00019083850483475248, + "loss": 0.1647, + "step": 3179 + }, + { + "epoch": 0.2294455066921606, + "grad_norm": 0.08921913802623749, + "learning_rate": 0.00019083561841535575, + "loss": 0.1295, + "step": 3180 + }, + { + "epoch": 0.22951765936722104, + "grad_norm": 0.10120546072721481, + "learning_rate": 0.000190832731995959, + "loss": 0.1552, + "step": 3181 + }, + { + "epoch": 0.22958981204228146, + "grad_norm": 0.09767742455005646, + "learning_rate": 0.0001908298455765623, + "loss": 0.1868, + "step": 3182 + }, + { + "epoch": 0.2296619647173419, + "grad_norm": 0.11393865942955017, + "learning_rate": 0.00019082695915716556, + "loss": 0.1778, + "step": 3183 + }, + { + "epoch": 0.22973411739240232, + "grad_norm": 0.12658797204494476, + "learning_rate": 0.0001908240727377688, + "loss": 0.1707, + "step": 3184 + }, + { + "epoch": 0.22980627006746276, + "grad_norm": 0.112017422914505, + "learning_rate": 0.00019082118631837206, + "loss": 0.1774, + "step": 3185 + }, + { + "epoch": 0.22987842274252318, + "grad_norm": 0.11038339883089066, + "learning_rate": 0.00019081829989897532, + "loss": 0.1414, + "step": 3186 + }, + { + "epoch": 0.2299505754175836, + "grad_norm": 0.10965468734502792, + "learning_rate": 0.0001908154134795786, + "loss": 0.1886, + "step": 3187 + }, + { + "epoch": 0.23002272809264404, + "grad_norm": 0.08305226266384125, + "learning_rate": 0.00019081252706018185, + "loss": 0.193, + "step": 3188 + }, + { + "epoch": 0.23009488076770446, + "grad_norm": 0.10485181957483292, + "learning_rate": 0.0001908096406407851, + "loss": 0.1421, + "step": 3189 + }, + { + "epoch": 0.2301670334427649, + "grad_norm": 0.09514374285936356, + "learning_rate": 0.00019080675422138838, + "loss": 0.1807, + "step": 3190 + }, + { + "epoch": 0.23023918611782532, + "grad_norm": 0.09134036302566528, + "learning_rate": 0.00019080386780199164, + "loss": 0.1273, + "step": 3191 + }, + { + "epoch": 0.23031133879288573, + "grad_norm": 0.11478359252214432, + "learning_rate": 0.0001908009813825949, + "loss": 0.1358, + "step": 3192 + }, + { + "epoch": 0.23038349146794618, + "grad_norm": 0.11170293390750885, + "learning_rate": 0.00019079809496319817, + "loss": 0.2004, + "step": 3193 + }, + { + "epoch": 0.2304556441430066, + "grad_norm": 0.13690118491649628, + "learning_rate": 0.00019079520854380143, + "loss": 0.1525, + "step": 3194 + }, + { + "epoch": 0.23052779681806704, + "grad_norm": 0.1377672702074051, + "learning_rate": 0.00019079232212440466, + "loss": 0.187, + "step": 3195 + }, + { + "epoch": 0.23059994949312745, + "grad_norm": 0.1272856742143631, + "learning_rate": 0.00019078943570500795, + "loss": 0.208, + "step": 3196 + }, + { + "epoch": 0.2306721021681879, + "grad_norm": 0.13685926795005798, + "learning_rate": 0.00019078654928561122, + "loss": 0.1613, + "step": 3197 + }, + { + "epoch": 0.23074425484324831, + "grad_norm": 0.11943011730909348, + "learning_rate": 0.00019078366286621448, + "loss": 0.1984, + "step": 3198 + }, + { + "epoch": 0.23081640751830873, + "grad_norm": 0.12155202031135559, + "learning_rate": 0.00019078077644681774, + "loss": 0.1649, + "step": 3199 + }, + { + "epoch": 0.23088856019336917, + "grad_norm": 0.09924157708883286, + "learning_rate": 0.00019077789002742098, + "loss": 0.1882, + "step": 3200 + }, + { + "epoch": 0.2309607128684296, + "grad_norm": 0.09386096149682999, + "learning_rate": 0.00019077500360802424, + "loss": 0.1452, + "step": 3201 + }, + { + "epoch": 0.23103286554349003, + "grad_norm": 0.11926762759685516, + "learning_rate": 0.0001907721171886275, + "loss": 0.13, + "step": 3202 + }, + { + "epoch": 0.23110501821855045, + "grad_norm": 0.11140086501836777, + "learning_rate": 0.0001907692307692308, + "loss": 0.1827, + "step": 3203 + }, + { + "epoch": 0.23117717089361087, + "grad_norm": 0.11237042397260666, + "learning_rate": 0.00019076634434983406, + "loss": 0.2004, + "step": 3204 + }, + { + "epoch": 0.2312493235686713, + "grad_norm": 0.09138461947441101, + "learning_rate": 0.0001907634579304373, + "loss": 0.1688, + "step": 3205 + }, + { + "epoch": 0.23132147624373173, + "grad_norm": 0.09387119114398956, + "learning_rate": 0.00019076057151104056, + "loss": 0.1438, + "step": 3206 + }, + { + "epoch": 0.23139362891879217, + "grad_norm": 0.1088188886642456, + "learning_rate": 0.00019075768509164382, + "loss": 0.2102, + "step": 3207 + }, + { + "epoch": 0.2314657815938526, + "grad_norm": 0.09382703900337219, + "learning_rate": 0.00019075479867224708, + "loss": 0.1768, + "step": 3208 + }, + { + "epoch": 0.23153793426891303, + "grad_norm": 0.09567832946777344, + "learning_rate": 0.00019075191225285034, + "loss": 0.1488, + "step": 3209 + }, + { + "epoch": 0.23161008694397345, + "grad_norm": 0.11694233864545822, + "learning_rate": 0.0001907490258334536, + "loss": 0.1808, + "step": 3210 + }, + { + "epoch": 0.23168223961903386, + "grad_norm": 0.14368030428886414, + "learning_rate": 0.00019074613941405687, + "loss": 0.1594, + "step": 3211 + }, + { + "epoch": 0.2317543922940943, + "grad_norm": 0.12889264523983002, + "learning_rate": 0.00019074325299466013, + "loss": 0.1941, + "step": 3212 + }, + { + "epoch": 0.23182654496915472, + "grad_norm": 0.09148585796356201, + "learning_rate": 0.0001907403665752634, + "loss": 0.1295, + "step": 3213 + }, + { + "epoch": 0.23189869764421517, + "grad_norm": 0.13587994873523712, + "learning_rate": 0.00019073748015586666, + "loss": 0.1788, + "step": 3214 + }, + { + "epoch": 0.23197085031927558, + "grad_norm": 0.12354405224323273, + "learning_rate": 0.00019073459373646992, + "loss": 0.1216, + "step": 3215 + }, + { + "epoch": 0.23204300299433603, + "grad_norm": 0.11031404882669449, + "learning_rate": 0.00019073170731707319, + "loss": 0.1764, + "step": 3216 + }, + { + "epoch": 0.23211515566939644, + "grad_norm": 0.14193016290664673, + "learning_rate": 0.00019072882089767645, + "loss": 0.1294, + "step": 3217 + }, + { + "epoch": 0.23218730834445686, + "grad_norm": 0.15411879122257233, + "learning_rate": 0.0001907259344782797, + "loss": 0.2023, + "step": 3218 + }, + { + "epoch": 0.2322594610195173, + "grad_norm": 0.11790381371974945, + "learning_rate": 0.00019072304805888297, + "loss": 0.1778, + "step": 3219 + }, + { + "epoch": 0.23233161369457772, + "grad_norm": 0.1162402480840683, + "learning_rate": 0.00019072016163948624, + "loss": 0.1825, + "step": 3220 + }, + { + "epoch": 0.23240376636963817, + "grad_norm": 0.11275645345449448, + "learning_rate": 0.0001907172752200895, + "loss": 0.1703, + "step": 3221 + }, + { + "epoch": 0.23247591904469858, + "grad_norm": 0.13524194061756134, + "learning_rate": 0.00019071438880069274, + "loss": 0.1836, + "step": 3222 + }, + { + "epoch": 0.232548071719759, + "grad_norm": 0.09859395027160645, + "learning_rate": 0.000190711502381296, + "loss": 0.2146, + "step": 3223 + }, + { + "epoch": 0.23262022439481944, + "grad_norm": 0.09172052145004272, + "learning_rate": 0.0001907086159618993, + "loss": 0.1919, + "step": 3224 + }, + { + "epoch": 0.23269237706987986, + "grad_norm": 0.10333575308322906, + "learning_rate": 0.00019070572954250255, + "loss": 0.1809, + "step": 3225 + }, + { + "epoch": 0.2327645297449403, + "grad_norm": 0.11006314307451248, + "learning_rate": 0.00019070284312310581, + "loss": 0.1058, + "step": 3226 + }, + { + "epoch": 0.23283668242000072, + "grad_norm": 0.09686331450939178, + "learning_rate": 0.00019069995670370905, + "loss": 0.1585, + "step": 3227 + }, + { + "epoch": 0.23290883509506116, + "grad_norm": 0.11920811980962753, + "learning_rate": 0.0001906970702843123, + "loss": 0.2086, + "step": 3228 + }, + { + "epoch": 0.23298098777012158, + "grad_norm": 0.12744984030723572, + "learning_rate": 0.00019069418386491558, + "loss": 0.1615, + "step": 3229 + }, + { + "epoch": 0.233053140445182, + "grad_norm": 0.10401850193738937, + "learning_rate": 0.00019069129744551884, + "loss": 0.1448, + "step": 3230 + }, + { + "epoch": 0.23312529312024244, + "grad_norm": 0.1329621970653534, + "learning_rate": 0.00019068841102612213, + "loss": 0.1623, + "step": 3231 + }, + { + "epoch": 0.23319744579530285, + "grad_norm": 0.14025752246379852, + "learning_rate": 0.00019068552460672536, + "loss": 0.1652, + "step": 3232 + }, + { + "epoch": 0.2332695984703633, + "grad_norm": 0.11895523220300674, + "learning_rate": 0.00019068263818732863, + "loss": 0.142, + "step": 3233 + }, + { + "epoch": 0.23334175114542371, + "grad_norm": 0.11003698408603668, + "learning_rate": 0.0001906797517679319, + "loss": 0.1355, + "step": 3234 + }, + { + "epoch": 0.23341390382048413, + "grad_norm": 0.1019037514925003, + "learning_rate": 0.00019067686534853515, + "loss": 0.1667, + "step": 3235 + }, + { + "epoch": 0.23348605649554457, + "grad_norm": 0.14654062688350677, + "learning_rate": 0.00019067397892913842, + "loss": 0.1235, + "step": 3236 + }, + { + "epoch": 0.233558209170605, + "grad_norm": 0.10918853431940079, + "learning_rate": 0.00019067109250974168, + "loss": 0.1502, + "step": 3237 + }, + { + "epoch": 0.23363036184566544, + "grad_norm": 0.13547173142433167, + "learning_rate": 0.00019066820609034492, + "loss": 0.1639, + "step": 3238 + }, + { + "epoch": 0.23370251452072585, + "grad_norm": 0.10551789402961731, + "learning_rate": 0.0001906653196709482, + "loss": 0.1407, + "step": 3239 + }, + { + "epoch": 0.2337746671957863, + "grad_norm": 0.1089998185634613, + "learning_rate": 0.00019066243325155147, + "loss": 0.1574, + "step": 3240 + }, + { + "epoch": 0.2338468198708467, + "grad_norm": 0.14473126828670502, + "learning_rate": 0.00019065954683215473, + "loss": 0.1954, + "step": 3241 + }, + { + "epoch": 0.23391897254590713, + "grad_norm": 0.14464539289474487, + "learning_rate": 0.000190656660412758, + "loss": 0.2045, + "step": 3242 + }, + { + "epoch": 0.23399112522096757, + "grad_norm": 0.1070796549320221, + "learning_rate": 0.00019065377399336123, + "loss": 0.1938, + "step": 3243 + }, + { + "epoch": 0.234063277896028, + "grad_norm": 0.0994974672794342, + "learning_rate": 0.0001906508875739645, + "loss": 0.1438, + "step": 3244 + }, + { + "epoch": 0.23413543057108843, + "grad_norm": 0.14842146635055542, + "learning_rate": 0.00019064800115456776, + "loss": 0.1839, + "step": 3245 + }, + { + "epoch": 0.23420758324614885, + "grad_norm": 0.1003551185131073, + "learning_rate": 0.00019064511473517105, + "loss": 0.1168, + "step": 3246 + }, + { + "epoch": 0.2342797359212093, + "grad_norm": 0.11478232592344284, + "learning_rate": 0.0001906422283157743, + "loss": 0.1729, + "step": 3247 + }, + { + "epoch": 0.2343518885962697, + "grad_norm": 0.09958324581384659, + "learning_rate": 0.00019063934189637754, + "loss": 0.1219, + "step": 3248 + }, + { + "epoch": 0.23442404127133012, + "grad_norm": 0.10597258061170578, + "learning_rate": 0.0001906364554769808, + "loss": 0.1747, + "step": 3249 + }, + { + "epoch": 0.23449619394639057, + "grad_norm": 0.14413249492645264, + "learning_rate": 0.00019063356905758407, + "loss": 0.1426, + "step": 3250 + }, + { + "epoch": 0.23456834662145098, + "grad_norm": 0.1292029619216919, + "learning_rate": 0.00019063068263818733, + "loss": 0.1527, + "step": 3251 + }, + { + "epoch": 0.23464049929651143, + "grad_norm": 0.1137336865067482, + "learning_rate": 0.0001906277962187906, + "loss": 0.1778, + "step": 3252 + }, + { + "epoch": 0.23471265197157185, + "grad_norm": 0.1345532238483429, + "learning_rate": 0.00019062490979939386, + "loss": 0.1664, + "step": 3253 + }, + { + "epoch": 0.23478480464663226, + "grad_norm": 0.14330095052719116, + "learning_rate": 0.00019062202337999712, + "loss": 0.1812, + "step": 3254 + }, + { + "epoch": 0.2348569573216927, + "grad_norm": 0.10712531208992004, + "learning_rate": 0.00019061913696060038, + "loss": 0.2, + "step": 3255 + }, + { + "epoch": 0.23492910999675312, + "grad_norm": 0.14136013388633728, + "learning_rate": 0.00019061625054120365, + "loss": 0.1447, + "step": 3256 + }, + { + "epoch": 0.23500126267181357, + "grad_norm": 0.11800672113895416, + "learning_rate": 0.0001906133641218069, + "loss": 0.162, + "step": 3257 + }, + { + "epoch": 0.23507341534687398, + "grad_norm": 0.10882166773080826, + "learning_rate": 0.00019061047770241017, + "loss": 0.1226, + "step": 3258 + }, + { + "epoch": 0.23514556802193443, + "grad_norm": 0.13625170290470123, + "learning_rate": 0.0001906075912830134, + "loss": 0.17, + "step": 3259 + }, + { + "epoch": 0.23521772069699484, + "grad_norm": 0.10706209391355515, + "learning_rate": 0.0001906047048636167, + "loss": 0.1387, + "step": 3260 + }, + { + "epoch": 0.23528987337205526, + "grad_norm": 0.10762275755405426, + "learning_rate": 0.00019060181844421996, + "loss": 0.1385, + "step": 3261 + }, + { + "epoch": 0.2353620260471157, + "grad_norm": 0.11794031411409378, + "learning_rate": 0.00019059893202482323, + "loss": 0.1782, + "step": 3262 + }, + { + "epoch": 0.23543417872217612, + "grad_norm": 0.10584767907857895, + "learning_rate": 0.0001905960456054265, + "loss": 0.1208, + "step": 3263 + }, + { + "epoch": 0.23550633139723656, + "grad_norm": 0.0948035717010498, + "learning_rate": 0.00019059315918602972, + "loss": 0.1484, + "step": 3264 + }, + { + "epoch": 0.23557848407229698, + "grad_norm": 0.12207180261611938, + "learning_rate": 0.000190590272766633, + "loss": 0.1708, + "step": 3265 + }, + { + "epoch": 0.2356506367473574, + "grad_norm": 0.1034015640616417, + "learning_rate": 0.00019058738634723625, + "loss": 0.1828, + "step": 3266 + }, + { + "epoch": 0.23572278942241784, + "grad_norm": 0.11126287281513214, + "learning_rate": 0.00019058449992783954, + "loss": 0.203, + "step": 3267 + }, + { + "epoch": 0.23579494209747826, + "grad_norm": 0.09588288515806198, + "learning_rate": 0.0001905816135084428, + "loss": 0.1663, + "step": 3268 + }, + { + "epoch": 0.2358670947725387, + "grad_norm": 0.10574699938297272, + "learning_rate": 0.00019057872708904604, + "loss": 0.147, + "step": 3269 + }, + { + "epoch": 0.23593924744759912, + "grad_norm": 0.1389194130897522, + "learning_rate": 0.0001905758406696493, + "loss": 0.1739, + "step": 3270 + }, + { + "epoch": 0.23601140012265956, + "grad_norm": 0.09925451129674911, + "learning_rate": 0.00019057295425025256, + "loss": 0.1984, + "step": 3271 + }, + { + "epoch": 0.23608355279771998, + "grad_norm": 0.12089280039072037, + "learning_rate": 0.00019057006783085583, + "loss": 0.1985, + "step": 3272 + }, + { + "epoch": 0.2361557054727804, + "grad_norm": 0.1208878830075264, + "learning_rate": 0.0001905671814114591, + "loss": 0.1474, + "step": 3273 + }, + { + "epoch": 0.23622785814784084, + "grad_norm": 0.11745128035545349, + "learning_rate": 0.00019056429499206235, + "loss": 0.1903, + "step": 3274 + }, + { + "epoch": 0.23630001082290125, + "grad_norm": 0.0823647603392601, + "learning_rate": 0.00019056140857266562, + "loss": 0.1603, + "step": 3275 + }, + { + "epoch": 0.2363721634979617, + "grad_norm": 0.10410257428884506, + "learning_rate": 0.00019055852215326888, + "loss": 0.1805, + "step": 3276 + }, + { + "epoch": 0.2364443161730221, + "grad_norm": 0.10450460016727448, + "learning_rate": 0.00019055563573387214, + "loss": 0.1593, + "step": 3277 + }, + { + "epoch": 0.23651646884808256, + "grad_norm": 0.0935407504439354, + "learning_rate": 0.0001905527493144754, + "loss": 0.1638, + "step": 3278 + }, + { + "epoch": 0.23658862152314297, + "grad_norm": 0.0930311381816864, + "learning_rate": 0.00019054986289507867, + "loss": 0.1332, + "step": 3279 + }, + { + "epoch": 0.2366607741982034, + "grad_norm": 0.09465755522251129, + "learning_rate": 0.0001905469764756819, + "loss": 0.1857, + "step": 3280 + }, + { + "epoch": 0.23673292687326383, + "grad_norm": 0.09868931770324707, + "learning_rate": 0.0001905440900562852, + "loss": 0.155, + "step": 3281 + }, + { + "epoch": 0.23680507954832425, + "grad_norm": 0.11090152710676193, + "learning_rate": 0.00019054120363688846, + "loss": 0.1929, + "step": 3282 + }, + { + "epoch": 0.2368772322233847, + "grad_norm": 0.10099887102842331, + "learning_rate": 0.00019053831721749172, + "loss": 0.1279, + "step": 3283 + }, + { + "epoch": 0.2369493848984451, + "grad_norm": 0.116920106112957, + "learning_rate": 0.00019053543079809498, + "loss": 0.1571, + "step": 3284 + }, + { + "epoch": 0.23702153757350553, + "grad_norm": 0.11076688766479492, + "learning_rate": 0.00019053254437869822, + "loss": 0.1592, + "step": 3285 + }, + { + "epoch": 0.23709369024856597, + "grad_norm": 0.1265278160572052, + "learning_rate": 0.00019052965795930148, + "loss": 0.1657, + "step": 3286 + }, + { + "epoch": 0.23716584292362639, + "grad_norm": 0.1301831156015396, + "learning_rate": 0.00019052677153990474, + "loss": 0.1941, + "step": 3287 + }, + { + "epoch": 0.23723799559868683, + "grad_norm": 0.1072435975074768, + "learning_rate": 0.00019052388512050803, + "loss": 0.1303, + "step": 3288 + }, + { + "epoch": 0.23731014827374725, + "grad_norm": 0.13425888121128082, + "learning_rate": 0.0001905209987011113, + "loss": 0.2139, + "step": 3289 + }, + { + "epoch": 0.2373823009488077, + "grad_norm": 0.09891893714666367, + "learning_rate": 0.00019051811228171453, + "loss": 0.1291, + "step": 3290 + }, + { + "epoch": 0.2374544536238681, + "grad_norm": 0.11196228861808777, + "learning_rate": 0.0001905152258623178, + "loss": 0.1684, + "step": 3291 + }, + { + "epoch": 0.23752660629892852, + "grad_norm": 0.0930328220129013, + "learning_rate": 0.00019051233944292106, + "loss": 0.1454, + "step": 3292 + }, + { + "epoch": 0.23759875897398897, + "grad_norm": 0.10707025229930878, + "learning_rate": 0.00019050945302352432, + "loss": 0.2065, + "step": 3293 + }, + { + "epoch": 0.23767091164904938, + "grad_norm": 0.11718625575304031, + "learning_rate": 0.00019050656660412758, + "loss": 0.1971, + "step": 3294 + }, + { + "epoch": 0.23774306432410983, + "grad_norm": 0.09377551078796387, + "learning_rate": 0.00019050368018473085, + "loss": 0.1582, + "step": 3295 + }, + { + "epoch": 0.23781521699917024, + "grad_norm": 0.101633220911026, + "learning_rate": 0.0001905007937653341, + "loss": 0.1712, + "step": 3296 + }, + { + "epoch": 0.23788736967423066, + "grad_norm": 0.09976430237293243, + "learning_rate": 0.00019049790734593737, + "loss": 0.1484, + "step": 3297 + }, + { + "epoch": 0.2379595223492911, + "grad_norm": 0.11419283598661423, + "learning_rate": 0.00019049502092654064, + "loss": 0.1674, + "step": 3298 + }, + { + "epoch": 0.23803167502435152, + "grad_norm": 0.10907099395990372, + "learning_rate": 0.0001904921345071439, + "loss": 0.1734, + "step": 3299 + }, + { + "epoch": 0.23810382769941196, + "grad_norm": 0.09745454788208008, + "learning_rate": 0.00019048924808774716, + "loss": 0.139, + "step": 3300 + }, + { + "epoch": 0.23817598037447238, + "grad_norm": 0.09598489105701447, + "learning_rate": 0.0001904863616683504, + "loss": 0.1545, + "step": 3301 + }, + { + "epoch": 0.23824813304953282, + "grad_norm": 0.10601425915956497, + "learning_rate": 0.0001904834752489537, + "loss": 0.1595, + "step": 3302 + }, + { + "epoch": 0.23832028572459324, + "grad_norm": 0.10679695010185242, + "learning_rate": 0.00019048058882955695, + "loss": 0.1384, + "step": 3303 + }, + { + "epoch": 0.23839243839965366, + "grad_norm": 0.10327916592359543, + "learning_rate": 0.00019047770241016021, + "loss": 0.1657, + "step": 3304 + }, + { + "epoch": 0.2384645910747141, + "grad_norm": 0.09387264400720596, + "learning_rate": 0.00019047481599076348, + "loss": 0.1098, + "step": 3305 + }, + { + "epoch": 0.23853674374977452, + "grad_norm": 0.1033811941742897, + "learning_rate": 0.0001904719295713667, + "loss": 0.1819, + "step": 3306 + }, + { + "epoch": 0.23860889642483496, + "grad_norm": 0.09968467056751251, + "learning_rate": 0.00019046904315196998, + "loss": 0.1609, + "step": 3307 + }, + { + "epoch": 0.23868104909989538, + "grad_norm": 0.0858912244439125, + "learning_rate": 0.00019046615673257324, + "loss": 0.0986, + "step": 3308 + }, + { + "epoch": 0.23875320177495582, + "grad_norm": 0.09396117925643921, + "learning_rate": 0.00019046327031317653, + "loss": 0.1328, + "step": 3309 + }, + { + "epoch": 0.23882535445001624, + "grad_norm": 0.11658573895692825, + "learning_rate": 0.0001904603838937798, + "loss": 0.1457, + "step": 3310 + }, + { + "epoch": 0.23889750712507665, + "grad_norm": 0.08398326486349106, + "learning_rate": 0.00019045749747438303, + "loss": 0.1687, + "step": 3311 + }, + { + "epoch": 0.2389696598001371, + "grad_norm": 0.10841450840234756, + "learning_rate": 0.0001904546110549863, + "loss": 0.1672, + "step": 3312 + }, + { + "epoch": 0.2390418124751975, + "grad_norm": 0.1065574437379837, + "learning_rate": 0.00019045172463558955, + "loss": 0.1538, + "step": 3313 + }, + { + "epoch": 0.23911396515025796, + "grad_norm": 0.1297336369752884, + "learning_rate": 0.00019044883821619282, + "loss": 0.1549, + "step": 3314 + }, + { + "epoch": 0.23918611782531837, + "grad_norm": 0.11699077486991882, + "learning_rate": 0.00019044595179679608, + "loss": 0.1442, + "step": 3315 + }, + { + "epoch": 0.2392582705003788, + "grad_norm": 0.14201852679252625, + "learning_rate": 0.00019044306537739934, + "loss": 0.1751, + "step": 3316 + }, + { + "epoch": 0.23933042317543923, + "grad_norm": 0.13381065428256989, + "learning_rate": 0.0001904401789580026, + "loss": 0.1672, + "step": 3317 + }, + { + "epoch": 0.23940257585049965, + "grad_norm": 0.10775529593229294, + "learning_rate": 0.00019043729253860587, + "loss": 0.1859, + "step": 3318 + }, + { + "epoch": 0.2394747285255601, + "grad_norm": 0.12804460525512695, + "learning_rate": 0.00019043440611920913, + "loss": 0.1927, + "step": 3319 + }, + { + "epoch": 0.2395468812006205, + "grad_norm": 0.13988925516605377, + "learning_rate": 0.0001904315196998124, + "loss": 0.2239, + "step": 3320 + }, + { + "epoch": 0.23961903387568095, + "grad_norm": 0.12417380511760712, + "learning_rate": 0.00019042863328041566, + "loss": 0.2235, + "step": 3321 + }, + { + "epoch": 0.23969118655074137, + "grad_norm": 0.12068769335746765, + "learning_rate": 0.0001904257468610189, + "loss": 0.1781, + "step": 3322 + }, + { + "epoch": 0.23976333922580179, + "grad_norm": 0.1000375971198082, + "learning_rate": 0.00019042286044162218, + "loss": 0.1527, + "step": 3323 + }, + { + "epoch": 0.23983549190086223, + "grad_norm": 0.0985485389828682, + "learning_rate": 0.00019041997402222545, + "loss": 0.1428, + "step": 3324 + }, + { + "epoch": 0.23990764457592265, + "grad_norm": 0.1019006073474884, + "learning_rate": 0.0001904170876028287, + "loss": 0.1431, + "step": 3325 + }, + { + "epoch": 0.2399797972509831, + "grad_norm": 0.11141661554574966, + "learning_rate": 0.00019041420118343197, + "loss": 0.1982, + "step": 3326 + }, + { + "epoch": 0.2400519499260435, + "grad_norm": 0.1347435712814331, + "learning_rate": 0.0001904113147640352, + "loss": 0.1341, + "step": 3327 + }, + { + "epoch": 0.24012410260110392, + "grad_norm": 0.09408620744943619, + "learning_rate": 0.00019040842834463847, + "loss": 0.1067, + "step": 3328 + }, + { + "epoch": 0.24019625527616437, + "grad_norm": 0.10930059850215912, + "learning_rate": 0.00019040554192524173, + "loss": 0.1691, + "step": 3329 + }, + { + "epoch": 0.24026840795122478, + "grad_norm": 0.0910833477973938, + "learning_rate": 0.00019040265550584502, + "loss": 0.2006, + "step": 3330 + }, + { + "epoch": 0.24034056062628523, + "grad_norm": 0.12467349320650101, + "learning_rate": 0.00019039976908644829, + "loss": 0.1754, + "step": 3331 + }, + { + "epoch": 0.24041271330134564, + "grad_norm": 0.11248049885034561, + "learning_rate": 0.00019039688266705152, + "loss": 0.1977, + "step": 3332 + }, + { + "epoch": 0.2404848659764061, + "grad_norm": 0.08451106399297714, + "learning_rate": 0.00019039399624765478, + "loss": 0.1796, + "step": 3333 + }, + { + "epoch": 0.2405570186514665, + "grad_norm": 0.0945352241396904, + "learning_rate": 0.00019039110982825805, + "loss": 0.2041, + "step": 3334 + }, + { + "epoch": 0.24062917132652692, + "grad_norm": 0.10887964814901352, + "learning_rate": 0.0001903882234088613, + "loss": 0.1232, + "step": 3335 + }, + { + "epoch": 0.24070132400158736, + "grad_norm": 0.09830755740404129, + "learning_rate": 0.00019038533698946457, + "loss": 0.1501, + "step": 3336 + }, + { + "epoch": 0.24077347667664778, + "grad_norm": 0.11321946978569031, + "learning_rate": 0.00019038245057006786, + "loss": 0.1731, + "step": 3337 + }, + { + "epoch": 0.24084562935170822, + "grad_norm": 0.09077171236276627, + "learning_rate": 0.0001903795641506711, + "loss": 0.1461, + "step": 3338 + }, + { + "epoch": 0.24091778202676864, + "grad_norm": 0.0952875092625618, + "learning_rate": 0.00019037667773127436, + "loss": 0.1817, + "step": 3339 + }, + { + "epoch": 0.24098993470182908, + "grad_norm": 0.12886834144592285, + "learning_rate": 0.00019037379131187762, + "loss": 0.1984, + "step": 3340 + }, + { + "epoch": 0.2410620873768895, + "grad_norm": 0.11286803334951401, + "learning_rate": 0.0001903709048924809, + "loss": 0.1888, + "step": 3341 + }, + { + "epoch": 0.24113424005194992, + "grad_norm": 0.11777891218662262, + "learning_rate": 0.00019036801847308415, + "loss": 0.2311, + "step": 3342 + }, + { + "epoch": 0.24120639272701036, + "grad_norm": 0.10411883145570755, + "learning_rate": 0.0001903651320536874, + "loss": 0.1985, + "step": 3343 + }, + { + "epoch": 0.24127854540207078, + "grad_norm": 0.12743018567562103, + "learning_rate": 0.00019036224563429068, + "loss": 0.1463, + "step": 3344 + }, + { + "epoch": 0.24135069807713122, + "grad_norm": 0.12642966210842133, + "learning_rate": 0.00019035935921489394, + "loss": 0.1371, + "step": 3345 + }, + { + "epoch": 0.24142285075219164, + "grad_norm": 0.09358017146587372, + "learning_rate": 0.0001903564727954972, + "loss": 0.1312, + "step": 3346 + }, + { + "epoch": 0.24149500342725205, + "grad_norm": 0.10470343381166458, + "learning_rate": 0.00019035358637610047, + "loss": 0.16, + "step": 3347 + }, + { + "epoch": 0.2415671561023125, + "grad_norm": 0.10156462341547012, + "learning_rate": 0.00019035069995670373, + "loss": 0.228, + "step": 3348 + }, + { + "epoch": 0.2416393087773729, + "grad_norm": 0.1206686943769455, + "learning_rate": 0.00019034781353730696, + "loss": 0.1816, + "step": 3349 + }, + { + "epoch": 0.24171146145243336, + "grad_norm": 0.11973816901445389, + "learning_rate": 0.00019034492711791023, + "loss": 0.1634, + "step": 3350 + }, + { + "epoch": 0.24178361412749377, + "grad_norm": 0.10858598351478577, + "learning_rate": 0.00019034204069851352, + "loss": 0.1473, + "step": 3351 + }, + { + "epoch": 0.24185576680255422, + "grad_norm": 0.10533450543880463, + "learning_rate": 0.00019033915427911678, + "loss": 0.1859, + "step": 3352 + }, + { + "epoch": 0.24192791947761463, + "grad_norm": 0.12177108228206635, + "learning_rate": 0.00019033626785972004, + "loss": 0.1484, + "step": 3353 + }, + { + "epoch": 0.24200007215267505, + "grad_norm": 0.14595696330070496, + "learning_rate": 0.00019033338144032328, + "loss": 0.1656, + "step": 3354 + }, + { + "epoch": 0.2420722248277355, + "grad_norm": 0.12721535563468933, + "learning_rate": 0.00019033049502092654, + "loss": 0.1897, + "step": 3355 + }, + { + "epoch": 0.2421443775027959, + "grad_norm": 0.11117599159479141, + "learning_rate": 0.0001903276086015298, + "loss": 0.1826, + "step": 3356 + }, + { + "epoch": 0.24221653017785635, + "grad_norm": 0.13842089474201202, + "learning_rate": 0.00019032472218213307, + "loss": 0.1662, + "step": 3357 + }, + { + "epoch": 0.24228868285291677, + "grad_norm": 0.12665043771266937, + "learning_rate": 0.00019032183576273636, + "loss": 0.1141, + "step": 3358 + }, + { + "epoch": 0.2423608355279772, + "grad_norm": 0.13461072742938995, + "learning_rate": 0.0001903189493433396, + "loss": 0.1817, + "step": 3359 + }, + { + "epoch": 0.24243298820303763, + "grad_norm": 0.12616859376430511, + "learning_rate": 0.00019031606292394286, + "loss": 0.1765, + "step": 3360 + }, + { + "epoch": 0.24250514087809805, + "grad_norm": 0.12274335324764252, + "learning_rate": 0.00019031317650454612, + "loss": 0.212, + "step": 3361 + }, + { + "epoch": 0.2425772935531585, + "grad_norm": 0.09972148388624191, + "learning_rate": 0.00019031029008514938, + "loss": 0.1656, + "step": 3362 + }, + { + "epoch": 0.2426494462282189, + "grad_norm": 0.1198907345533371, + "learning_rate": 0.00019030740366575264, + "loss": 0.1293, + "step": 3363 + }, + { + "epoch": 0.24272159890327935, + "grad_norm": 0.10844511538743973, + "learning_rate": 0.0001903045172463559, + "loss": 0.1771, + "step": 3364 + }, + { + "epoch": 0.24279375157833977, + "grad_norm": 0.10141333192586899, + "learning_rate": 0.00019030163082695917, + "loss": 0.1704, + "step": 3365 + }, + { + "epoch": 0.24286590425340018, + "grad_norm": 0.09217889606952667, + "learning_rate": 0.00019029874440756243, + "loss": 0.1328, + "step": 3366 + }, + { + "epoch": 0.24293805692846063, + "grad_norm": 0.10080622881650925, + "learning_rate": 0.0001902958579881657, + "loss": 0.1322, + "step": 3367 + }, + { + "epoch": 0.24301020960352104, + "grad_norm": 0.10517612099647522, + "learning_rate": 0.00019029297156876896, + "loss": 0.1661, + "step": 3368 + }, + { + "epoch": 0.2430823622785815, + "grad_norm": 0.11337990313768387, + "learning_rate": 0.00019029008514937222, + "loss": 0.1677, + "step": 3369 + }, + { + "epoch": 0.2431545149536419, + "grad_norm": 0.11382631957530975, + "learning_rate": 0.00019028719872997546, + "loss": 0.1728, + "step": 3370 + }, + { + "epoch": 0.24322666762870235, + "grad_norm": 0.12541048228740692, + "learning_rate": 0.00019028431231057872, + "loss": 0.1693, + "step": 3371 + }, + { + "epoch": 0.24329882030376276, + "grad_norm": 0.12730221450328827, + "learning_rate": 0.000190281425891182, + "loss": 0.1598, + "step": 3372 + }, + { + "epoch": 0.24337097297882318, + "grad_norm": 0.10680249333381653, + "learning_rate": 0.00019027853947178527, + "loss": 0.1749, + "step": 3373 + }, + { + "epoch": 0.24344312565388362, + "grad_norm": 0.1114056333899498, + "learning_rate": 0.00019027565305238854, + "loss": 0.1434, + "step": 3374 + }, + { + "epoch": 0.24351527832894404, + "grad_norm": 0.1178140789270401, + "learning_rate": 0.00019027276663299177, + "loss": 0.1457, + "step": 3375 + }, + { + "epoch": 0.24358743100400448, + "grad_norm": 0.10966756194829941, + "learning_rate": 0.00019026988021359504, + "loss": 0.1997, + "step": 3376 + }, + { + "epoch": 0.2436595836790649, + "grad_norm": 0.10562735795974731, + "learning_rate": 0.0001902669937941983, + "loss": 0.2047, + "step": 3377 + }, + { + "epoch": 0.24373173635412532, + "grad_norm": 0.09306290000677109, + "learning_rate": 0.00019026410737480156, + "loss": 0.1157, + "step": 3378 + }, + { + "epoch": 0.24380388902918576, + "grad_norm": 0.10318143665790558, + "learning_rate": 0.00019026122095540485, + "loss": 0.0993, + "step": 3379 + }, + { + "epoch": 0.24387604170424618, + "grad_norm": 0.15777446329593658, + "learning_rate": 0.0001902583345360081, + "loss": 0.2304, + "step": 3380 + }, + { + "epoch": 0.24394819437930662, + "grad_norm": 0.10680831223726273, + "learning_rate": 0.00019025544811661135, + "loss": 0.1504, + "step": 3381 + }, + { + "epoch": 0.24402034705436704, + "grad_norm": 0.12741363048553467, + "learning_rate": 0.0001902525616972146, + "loss": 0.1984, + "step": 3382 + }, + { + "epoch": 0.24409249972942748, + "grad_norm": 0.10081455856561661, + "learning_rate": 0.00019024967527781788, + "loss": 0.1591, + "step": 3383 + }, + { + "epoch": 0.2441646524044879, + "grad_norm": 0.11780191957950592, + "learning_rate": 0.00019024678885842114, + "loss": 0.1666, + "step": 3384 + }, + { + "epoch": 0.2442368050795483, + "grad_norm": 0.08748682588338852, + "learning_rate": 0.0001902439024390244, + "loss": 0.1822, + "step": 3385 + }, + { + "epoch": 0.24430895775460876, + "grad_norm": 0.09807809442281723, + "learning_rate": 0.00019024101601962766, + "loss": 0.184, + "step": 3386 + }, + { + "epoch": 0.24438111042966917, + "grad_norm": 0.10223960131406784, + "learning_rate": 0.00019023812960023093, + "loss": 0.1693, + "step": 3387 + }, + { + "epoch": 0.24445326310472962, + "grad_norm": 0.08662978559732437, + "learning_rate": 0.0001902352431808342, + "loss": 0.1766, + "step": 3388 + }, + { + "epoch": 0.24452541577979003, + "grad_norm": 0.08942607790231705, + "learning_rate": 0.00019023235676143745, + "loss": 0.1274, + "step": 3389 + }, + { + "epoch": 0.24459756845485045, + "grad_norm": 0.1172434389591217, + "learning_rate": 0.00019022947034204072, + "loss": 0.1108, + "step": 3390 + }, + { + "epoch": 0.2446697211299109, + "grad_norm": 0.1019260361790657, + "learning_rate": 0.00019022658392264395, + "loss": 0.1431, + "step": 3391 + }, + { + "epoch": 0.2447418738049713, + "grad_norm": 0.12758377194404602, + "learning_rate": 0.00019022369750324722, + "loss": 0.2125, + "step": 3392 + }, + { + "epoch": 0.24481402648003175, + "grad_norm": 0.12854115664958954, + "learning_rate": 0.0001902208110838505, + "loss": 0.175, + "step": 3393 + }, + { + "epoch": 0.24488617915509217, + "grad_norm": 0.08560032397508621, + "learning_rate": 0.00019021792466445377, + "loss": 0.1714, + "step": 3394 + }, + { + "epoch": 0.24495833183015261, + "grad_norm": 0.10468869656324387, + "learning_rate": 0.00019021503824505703, + "loss": 0.1782, + "step": 3395 + }, + { + "epoch": 0.24503048450521303, + "grad_norm": 0.1147155836224556, + "learning_rate": 0.00019021215182566027, + "loss": 0.1654, + "step": 3396 + }, + { + "epoch": 0.24510263718027345, + "grad_norm": 0.11898034811019897, + "learning_rate": 0.00019020926540626353, + "loss": 0.1781, + "step": 3397 + }, + { + "epoch": 0.2451747898553339, + "grad_norm": 0.09727160632610321, + "learning_rate": 0.0001902063789868668, + "loss": 0.1412, + "step": 3398 + }, + { + "epoch": 0.2452469425303943, + "grad_norm": 0.14130552113056183, + "learning_rate": 0.00019020349256747006, + "loss": 0.2089, + "step": 3399 + }, + { + "epoch": 0.24531909520545475, + "grad_norm": 0.15899445116519928, + "learning_rate": 0.00019020060614807335, + "loss": 0.1687, + "step": 3400 + }, + { + "epoch": 0.24539124788051517, + "grad_norm": 0.15017631649971008, + "learning_rate": 0.00019019771972867658, + "loss": 0.165, + "step": 3401 + }, + { + "epoch": 0.2454634005555756, + "grad_norm": 0.10212711244821548, + "learning_rate": 0.00019019483330927984, + "loss": 0.1294, + "step": 3402 + }, + { + "epoch": 0.24553555323063603, + "grad_norm": 0.11916355043649673, + "learning_rate": 0.0001901919468898831, + "loss": 0.1724, + "step": 3403 + }, + { + "epoch": 0.24560770590569644, + "grad_norm": 0.09966582804918289, + "learning_rate": 0.00019018906047048637, + "loss": 0.1529, + "step": 3404 + }, + { + "epoch": 0.2456798585807569, + "grad_norm": 0.10150911659002304, + "learning_rate": 0.00019018617405108963, + "loss": 0.1836, + "step": 3405 + }, + { + "epoch": 0.2457520112558173, + "grad_norm": 0.10047971457242966, + "learning_rate": 0.0001901832876316929, + "loss": 0.1581, + "step": 3406 + }, + { + "epoch": 0.24582416393087775, + "grad_norm": 0.11941426247358322, + "learning_rate": 0.00019018040121229616, + "loss": 0.1423, + "step": 3407 + }, + { + "epoch": 0.24589631660593816, + "grad_norm": 0.09558165073394775, + "learning_rate": 0.00019017751479289942, + "loss": 0.1712, + "step": 3408 + }, + { + "epoch": 0.24596846928099858, + "grad_norm": 0.10940393060445786, + "learning_rate": 0.00019017462837350269, + "loss": 0.1649, + "step": 3409 + }, + { + "epoch": 0.24604062195605902, + "grad_norm": 0.10541975498199463, + "learning_rate": 0.00019017174195410595, + "loss": 0.1764, + "step": 3410 + }, + { + "epoch": 0.24611277463111944, + "grad_norm": 0.11231281608343124, + "learning_rate": 0.0001901688555347092, + "loss": 0.163, + "step": 3411 + }, + { + "epoch": 0.24618492730617988, + "grad_norm": 0.0928051769733429, + "learning_rate": 0.00019016596911531245, + "loss": 0.1887, + "step": 3412 + }, + { + "epoch": 0.2462570799812403, + "grad_norm": 0.11161345988512039, + "learning_rate": 0.0001901630826959157, + "loss": 0.1469, + "step": 3413 + }, + { + "epoch": 0.24632923265630075, + "grad_norm": 0.11575374752283096, + "learning_rate": 0.000190160196276519, + "loss": 0.1466, + "step": 3414 + }, + { + "epoch": 0.24640138533136116, + "grad_norm": 0.10162404179573059, + "learning_rate": 0.00019015730985712226, + "loss": 0.1193, + "step": 3415 + }, + { + "epoch": 0.24647353800642158, + "grad_norm": 0.10334353148937225, + "learning_rate": 0.00019015442343772553, + "loss": 0.1586, + "step": 3416 + }, + { + "epoch": 0.24654569068148202, + "grad_norm": 0.08884553611278534, + "learning_rate": 0.00019015153701832876, + "loss": 0.1715, + "step": 3417 + }, + { + "epoch": 0.24661784335654244, + "grad_norm": 0.1290484517812729, + "learning_rate": 0.00019014865059893202, + "loss": 0.1596, + "step": 3418 + }, + { + "epoch": 0.24668999603160288, + "grad_norm": 0.12575063109397888, + "learning_rate": 0.0001901457641795353, + "loss": 0.1052, + "step": 3419 + }, + { + "epoch": 0.2467621487066633, + "grad_norm": 0.10406794399023056, + "learning_rate": 0.00019014287776013855, + "loss": 0.1728, + "step": 3420 + }, + { + "epoch": 0.24683430138172371, + "grad_norm": 0.109982430934906, + "learning_rate": 0.00019013999134074184, + "loss": 0.1586, + "step": 3421 + }, + { + "epoch": 0.24690645405678416, + "grad_norm": 0.11729655414819717, + "learning_rate": 0.00019013710492134508, + "loss": 0.1731, + "step": 3422 + }, + { + "epoch": 0.24697860673184457, + "grad_norm": 0.08943957835435867, + "learning_rate": 0.00019013421850194834, + "loss": 0.1607, + "step": 3423 + }, + { + "epoch": 0.24705075940690502, + "grad_norm": 0.13747897744178772, + "learning_rate": 0.0001901313320825516, + "loss": 0.1816, + "step": 3424 + }, + { + "epoch": 0.24712291208196543, + "grad_norm": 0.1077960953116417, + "learning_rate": 0.00019012844566315486, + "loss": 0.1282, + "step": 3425 + }, + { + "epoch": 0.24719506475702588, + "grad_norm": 0.10450281202793121, + "learning_rate": 0.00019012555924375813, + "loss": 0.19, + "step": 3426 + }, + { + "epoch": 0.2472672174320863, + "grad_norm": 0.10756547749042511, + "learning_rate": 0.0001901226728243614, + "loss": 0.1612, + "step": 3427 + }, + { + "epoch": 0.2473393701071467, + "grad_norm": 0.08175303786993027, + "learning_rate": 0.00019011978640496465, + "loss": 0.1479, + "step": 3428 + }, + { + "epoch": 0.24741152278220715, + "grad_norm": 0.10902395844459534, + "learning_rate": 0.00019011689998556792, + "loss": 0.1431, + "step": 3429 + }, + { + "epoch": 0.24748367545726757, + "grad_norm": 0.10441279411315918, + "learning_rate": 0.00019011401356617118, + "loss": 0.1727, + "step": 3430 + }, + { + "epoch": 0.24755582813232802, + "grad_norm": 0.08806274086236954, + "learning_rate": 0.00019011112714677444, + "loss": 0.1463, + "step": 3431 + }, + { + "epoch": 0.24762798080738843, + "grad_norm": 0.141118586063385, + "learning_rate": 0.0001901082407273777, + "loss": 0.1821, + "step": 3432 + }, + { + "epoch": 0.24770013348244888, + "grad_norm": 0.10650867223739624, + "learning_rate": 0.00019010535430798094, + "loss": 0.1422, + "step": 3433 + }, + { + "epoch": 0.2477722861575093, + "grad_norm": 0.11610765755176544, + "learning_rate": 0.0001901024678885842, + "loss": 0.1737, + "step": 3434 + }, + { + "epoch": 0.2478444388325697, + "grad_norm": 0.10462258756160736, + "learning_rate": 0.00019009958146918747, + "loss": 0.1156, + "step": 3435 + }, + { + "epoch": 0.24791659150763015, + "grad_norm": 0.11078470945358276, + "learning_rate": 0.00019009669504979076, + "loss": 0.1429, + "step": 3436 + }, + { + "epoch": 0.24798874418269057, + "grad_norm": 0.13857358694076538, + "learning_rate": 0.00019009380863039402, + "loss": 0.2146, + "step": 3437 + }, + { + "epoch": 0.248060896857751, + "grad_norm": 0.18643376231193542, + "learning_rate": 0.00019009092221099726, + "loss": 0.1982, + "step": 3438 + }, + { + "epoch": 0.24813304953281143, + "grad_norm": 0.0913734957575798, + "learning_rate": 0.00019008803579160052, + "loss": 0.1216, + "step": 3439 + }, + { + "epoch": 0.24820520220787184, + "grad_norm": 0.09168455004692078, + "learning_rate": 0.00019008514937220378, + "loss": 0.1903, + "step": 3440 + }, + { + "epoch": 0.2482773548829323, + "grad_norm": 0.10040730237960815, + "learning_rate": 0.00019008226295280704, + "loss": 0.148, + "step": 3441 + }, + { + "epoch": 0.2483495075579927, + "grad_norm": 0.10486883670091629, + "learning_rate": 0.0001900793765334103, + "loss": 0.1377, + "step": 3442 + }, + { + "epoch": 0.24842166023305315, + "grad_norm": 0.1028137058019638, + "learning_rate": 0.00019007649011401357, + "loss": 0.148, + "step": 3443 + }, + { + "epoch": 0.24849381290811356, + "grad_norm": 0.11558540910482407, + "learning_rate": 0.00019007360369461683, + "loss": 0.1617, + "step": 3444 + }, + { + "epoch": 0.248565965583174, + "grad_norm": 0.09808561950922012, + "learning_rate": 0.0001900707172752201, + "loss": 0.1421, + "step": 3445 + }, + { + "epoch": 0.24863811825823443, + "grad_norm": 0.13802699744701385, + "learning_rate": 0.00019006783085582336, + "loss": 0.1876, + "step": 3446 + }, + { + "epoch": 0.24871027093329484, + "grad_norm": 0.10349813848733902, + "learning_rate": 0.00019006494443642662, + "loss": 0.1674, + "step": 3447 + }, + { + "epoch": 0.24878242360835529, + "grad_norm": 0.10739285498857498, + "learning_rate": 0.00019006205801702988, + "loss": 0.1963, + "step": 3448 + }, + { + "epoch": 0.2488545762834157, + "grad_norm": 0.10991799831390381, + "learning_rate": 0.00019005917159763315, + "loss": 0.181, + "step": 3449 + }, + { + "epoch": 0.24892672895847615, + "grad_norm": 0.10631420463323593, + "learning_rate": 0.0001900562851782364, + "loss": 0.182, + "step": 3450 + }, + { + "epoch": 0.24899888163353656, + "grad_norm": 0.1767328828573227, + "learning_rate": 0.00019005339875883967, + "loss": 0.1831, + "step": 3451 + }, + { + "epoch": 0.24907103430859698, + "grad_norm": 0.09794510155916214, + "learning_rate": 0.00019005051233944294, + "loss": 0.1805, + "step": 3452 + }, + { + "epoch": 0.24914318698365742, + "grad_norm": 0.12249629199504852, + "learning_rate": 0.0001900476259200462, + "loss": 0.1658, + "step": 3453 + }, + { + "epoch": 0.24921533965871784, + "grad_norm": 0.13031835854053497, + "learning_rate": 0.00019004473950064946, + "loss": 0.1184, + "step": 3454 + }, + { + "epoch": 0.24928749233377828, + "grad_norm": 0.12428729981184006, + "learning_rate": 0.0001900418530812527, + "loss": 0.1604, + "step": 3455 + }, + { + "epoch": 0.2493596450088387, + "grad_norm": 0.09246498346328735, + "learning_rate": 0.00019003896666185596, + "loss": 0.1555, + "step": 3456 + }, + { + "epoch": 0.24943179768389914, + "grad_norm": 0.09440509974956512, + "learning_rate": 0.00019003608024245925, + "loss": 0.2146, + "step": 3457 + }, + { + "epoch": 0.24950395035895956, + "grad_norm": 0.08850990235805511, + "learning_rate": 0.00019003319382306251, + "loss": 0.2196, + "step": 3458 + }, + { + "epoch": 0.24957610303401997, + "grad_norm": 0.09511906653642654, + "learning_rate": 0.00019003030740366578, + "loss": 0.1117, + "step": 3459 + }, + { + "epoch": 0.24964825570908042, + "grad_norm": 0.11032702773809433, + "learning_rate": 0.000190027420984269, + "loss": 0.1305, + "step": 3460 + }, + { + "epoch": 0.24972040838414084, + "grad_norm": 0.10022285580635071, + "learning_rate": 0.00019002453456487228, + "loss": 0.1479, + "step": 3461 + }, + { + "epoch": 0.24979256105920128, + "grad_norm": 0.09211345762014389, + "learning_rate": 0.00019002164814547554, + "loss": 0.1409, + "step": 3462 + }, + { + "epoch": 0.2498647137342617, + "grad_norm": 0.10632354021072388, + "learning_rate": 0.0001900187617260788, + "loss": 0.1489, + "step": 3463 + }, + { + "epoch": 0.24993686640932214, + "grad_norm": 0.10797733068466187, + "learning_rate": 0.0001900158753066821, + "loss": 0.1376, + "step": 3464 + }, + { + "epoch": 0.25000901908438256, + "grad_norm": 0.12420342862606049, + "learning_rate": 0.00019001298888728533, + "loss": 0.2198, + "step": 3465 + }, + { + "epoch": 0.250081171759443, + "grad_norm": 0.1036577820777893, + "learning_rate": 0.0001900101024678886, + "loss": 0.1677, + "step": 3466 + }, + { + "epoch": 0.2501533244345034, + "grad_norm": 0.11751674860715866, + "learning_rate": 0.00019000721604849185, + "loss": 0.1746, + "step": 3467 + }, + { + "epoch": 0.25022547710956383, + "grad_norm": 0.11562912166118622, + "learning_rate": 0.00019000432962909512, + "loss": 0.166, + "step": 3468 + }, + { + "epoch": 0.2502976297846243, + "grad_norm": 0.11643790453672409, + "learning_rate": 0.00019000144320969838, + "loss": 0.167, + "step": 3469 + }, + { + "epoch": 0.2503697824596847, + "grad_norm": 0.12111453711986542, + "learning_rate": 0.00018999855679030164, + "loss": 0.1769, + "step": 3470 + }, + { + "epoch": 0.2504419351347451, + "grad_norm": 0.11380564421415329, + "learning_rate": 0.0001899956703709049, + "loss": 0.1852, + "step": 3471 + }, + { + "epoch": 0.25051408780980555, + "grad_norm": 0.1100783571600914, + "learning_rate": 0.00018999278395150817, + "loss": 0.1419, + "step": 3472 + }, + { + "epoch": 0.250586240484866, + "grad_norm": 0.12145480513572693, + "learning_rate": 0.00018998989753211143, + "loss": 0.1455, + "step": 3473 + }, + { + "epoch": 0.2506583931599264, + "grad_norm": 0.13141202926635742, + "learning_rate": 0.0001899870111127147, + "loss": 0.157, + "step": 3474 + }, + { + "epoch": 0.25073054583498683, + "grad_norm": 0.09900374710559845, + "learning_rate": 0.00018998412469331796, + "loss": 0.1379, + "step": 3475 + }, + { + "epoch": 0.2508026985100473, + "grad_norm": 0.09456542879343033, + "learning_rate": 0.0001899812382739212, + "loss": 0.1831, + "step": 3476 + }, + { + "epoch": 0.25087485118510766, + "grad_norm": 0.11174774914979935, + "learning_rate": 0.00018997835185452446, + "loss": 0.1477, + "step": 3477 + }, + { + "epoch": 0.2509470038601681, + "grad_norm": 0.1130797490477562, + "learning_rate": 0.00018997546543512775, + "loss": 0.1874, + "step": 3478 + }, + { + "epoch": 0.25101915653522855, + "grad_norm": 0.09996349364519119, + "learning_rate": 0.000189972579015731, + "loss": 0.1372, + "step": 3479 + }, + { + "epoch": 0.251091309210289, + "grad_norm": 0.09237122535705566, + "learning_rate": 0.00018996969259633427, + "loss": 0.131, + "step": 3480 + }, + { + "epoch": 0.2511634618853494, + "grad_norm": 0.11162920296192169, + "learning_rate": 0.0001899668061769375, + "loss": 0.1687, + "step": 3481 + }, + { + "epoch": 0.2512356145604098, + "grad_norm": 0.10475852340459824, + "learning_rate": 0.00018996391975754077, + "loss": 0.1336, + "step": 3482 + }, + { + "epoch": 0.25130776723547027, + "grad_norm": 0.09986816346645355, + "learning_rate": 0.00018996103333814403, + "loss": 0.1376, + "step": 3483 + }, + { + "epoch": 0.25137991991053066, + "grad_norm": 0.1363130658864975, + "learning_rate": 0.0001899581469187473, + "loss": 0.1732, + "step": 3484 + }, + { + "epoch": 0.2514520725855911, + "grad_norm": 0.12526144087314606, + "learning_rate": 0.00018995526049935059, + "loss": 0.1946, + "step": 3485 + }, + { + "epoch": 0.25152422526065155, + "grad_norm": 0.09262970089912415, + "learning_rate": 0.00018995237407995382, + "loss": 0.1687, + "step": 3486 + }, + { + "epoch": 0.251596377935712, + "grad_norm": 0.10301724821329117, + "learning_rate": 0.00018994948766055708, + "loss": 0.1549, + "step": 3487 + }, + { + "epoch": 0.2516685306107724, + "grad_norm": 0.11157647520303726, + "learning_rate": 0.00018994660124116035, + "loss": 0.1551, + "step": 3488 + }, + { + "epoch": 0.2517406832858328, + "grad_norm": 0.11049095541238785, + "learning_rate": 0.0001899437148217636, + "loss": 0.1465, + "step": 3489 + }, + { + "epoch": 0.25181283596089327, + "grad_norm": 0.1106208935379982, + "learning_rate": 0.00018994082840236687, + "loss": 0.1204, + "step": 3490 + }, + { + "epoch": 0.25188498863595365, + "grad_norm": 0.1438845843076706, + "learning_rate": 0.00018993794198297014, + "loss": 0.1313, + "step": 3491 + }, + { + "epoch": 0.2519571413110141, + "grad_norm": 0.11887115240097046, + "learning_rate": 0.0001899350555635734, + "loss": 0.1527, + "step": 3492 + }, + { + "epoch": 0.25202929398607454, + "grad_norm": 0.1293410360813141, + "learning_rate": 0.00018993216914417666, + "loss": 0.1478, + "step": 3493 + }, + { + "epoch": 0.252101446661135, + "grad_norm": 0.10676635801792145, + "learning_rate": 0.00018992928272477992, + "loss": 0.1604, + "step": 3494 + }, + { + "epoch": 0.2521735993361954, + "grad_norm": 0.13810612261295319, + "learning_rate": 0.0001899263963053832, + "loss": 0.1081, + "step": 3495 + }, + { + "epoch": 0.2522457520112558, + "grad_norm": 0.12804998457431793, + "learning_rate": 0.00018992350988598645, + "loss": 0.151, + "step": 3496 + }, + { + "epoch": 0.25231790468631626, + "grad_norm": 0.09013412892818451, + "learning_rate": 0.0001899206234665897, + "loss": 0.1803, + "step": 3497 + }, + { + "epoch": 0.25239005736137665, + "grad_norm": 0.11779167503118515, + "learning_rate": 0.00018991773704719295, + "loss": 0.2159, + "step": 3498 + }, + { + "epoch": 0.2524622100364371, + "grad_norm": 0.1196313127875328, + "learning_rate": 0.00018991485062779624, + "loss": 0.2082, + "step": 3499 + }, + { + "epoch": 0.25253436271149754, + "grad_norm": 0.10573788732290268, + "learning_rate": 0.0001899119642083995, + "loss": 0.1439, + "step": 3500 + }, + { + "epoch": 0.252606515386558, + "grad_norm": 0.16237285733222961, + "learning_rate": 0.00018990907778900277, + "loss": 0.1524, + "step": 3501 + }, + { + "epoch": 0.25267866806161837, + "grad_norm": 0.101177878677845, + "learning_rate": 0.000189906191369606, + "loss": 0.1778, + "step": 3502 + }, + { + "epoch": 0.2527508207366788, + "grad_norm": 0.14791998267173767, + "learning_rate": 0.00018990330495020926, + "loss": 0.207, + "step": 3503 + }, + { + "epoch": 0.25282297341173926, + "grad_norm": 0.12691575288772583, + "learning_rate": 0.00018990041853081253, + "loss": 0.15, + "step": 3504 + }, + { + "epoch": 0.25289512608679965, + "grad_norm": 0.10003713518381119, + "learning_rate": 0.0001898975321114158, + "loss": 0.13, + "step": 3505 + }, + { + "epoch": 0.2529672787618601, + "grad_norm": 0.08510126918554306, + "learning_rate": 0.00018989464569201908, + "loss": 0.1463, + "step": 3506 + }, + { + "epoch": 0.25303943143692054, + "grad_norm": 0.087553009390831, + "learning_rate": 0.00018989175927262232, + "loss": 0.1488, + "step": 3507 + }, + { + "epoch": 0.2531115841119809, + "grad_norm": 0.11553581058979034, + "learning_rate": 0.00018988887285322558, + "loss": 0.1294, + "step": 3508 + }, + { + "epoch": 0.25318373678704137, + "grad_norm": 0.1350739449262619, + "learning_rate": 0.00018988598643382884, + "loss": 0.1414, + "step": 3509 + }, + { + "epoch": 0.2532558894621018, + "grad_norm": 0.1024918481707573, + "learning_rate": 0.0001898831000144321, + "loss": 0.1101, + "step": 3510 + }, + { + "epoch": 0.25332804213716226, + "grad_norm": 0.11271020770072937, + "learning_rate": 0.00018988021359503537, + "loss": 0.1579, + "step": 3511 + }, + { + "epoch": 0.25340019481222265, + "grad_norm": 0.1149035319685936, + "learning_rate": 0.00018987732717563863, + "loss": 0.1487, + "step": 3512 + }, + { + "epoch": 0.2534723474872831, + "grad_norm": 0.13088317215442657, + "learning_rate": 0.0001898744407562419, + "loss": 0.1902, + "step": 3513 + }, + { + "epoch": 0.25354450016234353, + "grad_norm": 0.09780467301607132, + "learning_rate": 0.00018987155433684516, + "loss": 0.1394, + "step": 3514 + }, + { + "epoch": 0.2536166528374039, + "grad_norm": 0.11135496944189072, + "learning_rate": 0.00018986866791744842, + "loss": 0.1806, + "step": 3515 + }, + { + "epoch": 0.25368880551246437, + "grad_norm": 0.10907930880784988, + "learning_rate": 0.00018986578149805168, + "loss": 0.1606, + "step": 3516 + }, + { + "epoch": 0.2537609581875248, + "grad_norm": 0.09134551137685776, + "learning_rate": 0.00018986289507865495, + "loss": 0.1284, + "step": 3517 + }, + { + "epoch": 0.25383311086258525, + "grad_norm": 0.15910980105400085, + "learning_rate": 0.00018986000865925818, + "loss": 0.2182, + "step": 3518 + }, + { + "epoch": 0.25390526353764564, + "grad_norm": 0.11735296249389648, + "learning_rate": 0.00018985712223986144, + "loss": 0.2251, + "step": 3519 + }, + { + "epoch": 0.2539774162127061, + "grad_norm": 0.11389101296663284, + "learning_rate": 0.00018985423582046473, + "loss": 0.1897, + "step": 3520 + }, + { + "epoch": 0.25404956888776653, + "grad_norm": 0.11940047144889832, + "learning_rate": 0.000189851349401068, + "loss": 0.1513, + "step": 3521 + }, + { + "epoch": 0.2541217215628269, + "grad_norm": 0.11212513595819473, + "learning_rate": 0.00018984846298167126, + "loss": 0.1766, + "step": 3522 + }, + { + "epoch": 0.25419387423788736, + "grad_norm": 0.12622421979904175, + "learning_rate": 0.0001898455765622745, + "loss": 0.1799, + "step": 3523 + }, + { + "epoch": 0.2542660269129478, + "grad_norm": 0.11228041350841522, + "learning_rate": 0.00018984269014287776, + "loss": 0.1154, + "step": 3524 + }, + { + "epoch": 0.25433817958800825, + "grad_norm": 0.1204824447631836, + "learning_rate": 0.00018983980372348102, + "loss": 0.1532, + "step": 3525 + }, + { + "epoch": 0.25441033226306864, + "grad_norm": 0.13384214043617249, + "learning_rate": 0.00018983691730408428, + "loss": 0.2145, + "step": 3526 + }, + { + "epoch": 0.2544824849381291, + "grad_norm": 0.09223582595586777, + "learning_rate": 0.00018983403088468757, + "loss": 0.1159, + "step": 3527 + }, + { + "epoch": 0.2545546376131895, + "grad_norm": 0.11915893107652664, + "learning_rate": 0.0001898311444652908, + "loss": 0.1494, + "step": 3528 + }, + { + "epoch": 0.2546267902882499, + "grad_norm": 0.09959257394075394, + "learning_rate": 0.00018982825804589407, + "loss": 0.1784, + "step": 3529 + }, + { + "epoch": 0.25469894296331036, + "grad_norm": 0.1257259100675583, + "learning_rate": 0.00018982537162649734, + "loss": 0.1652, + "step": 3530 + }, + { + "epoch": 0.2547710956383708, + "grad_norm": 0.09877978265285492, + "learning_rate": 0.0001898224852071006, + "loss": 0.1677, + "step": 3531 + }, + { + "epoch": 0.25484324831343125, + "grad_norm": 0.10418039560317993, + "learning_rate": 0.00018981959878770386, + "loss": 0.2078, + "step": 3532 + }, + { + "epoch": 0.25491540098849164, + "grad_norm": 0.14371944963932037, + "learning_rate": 0.00018981671236830712, + "loss": 0.2339, + "step": 3533 + }, + { + "epoch": 0.2549875536635521, + "grad_norm": 0.12156295776367188, + "learning_rate": 0.0001898138259489104, + "loss": 0.1863, + "step": 3534 + }, + { + "epoch": 0.2550597063386125, + "grad_norm": 0.13483689725399017, + "learning_rate": 0.00018981093952951365, + "loss": 0.1332, + "step": 3535 + }, + { + "epoch": 0.2551318590136729, + "grad_norm": 0.07853590697050095, + "learning_rate": 0.0001898080531101169, + "loss": 0.141, + "step": 3536 + }, + { + "epoch": 0.25520401168873336, + "grad_norm": 0.10824296623468399, + "learning_rate": 0.00018980516669072018, + "loss": 0.164, + "step": 3537 + }, + { + "epoch": 0.2552761643637938, + "grad_norm": 0.1052640974521637, + "learning_rate": 0.00018980228027132344, + "loss": 0.124, + "step": 3538 + }, + { + "epoch": 0.2553483170388542, + "grad_norm": 0.11124280840158463, + "learning_rate": 0.00018979939385192668, + "loss": 0.1573, + "step": 3539 + }, + { + "epoch": 0.25542046971391463, + "grad_norm": 0.1169007271528244, + "learning_rate": 0.00018979650743252994, + "loss": 0.1753, + "step": 3540 + }, + { + "epoch": 0.2554926223889751, + "grad_norm": 0.10264059901237488, + "learning_rate": 0.00018979362101313323, + "loss": 0.157, + "step": 3541 + }, + { + "epoch": 0.2555647750640355, + "grad_norm": 0.12313199788331985, + "learning_rate": 0.0001897907345937365, + "loss": 0.1744, + "step": 3542 + }, + { + "epoch": 0.2556369277390959, + "grad_norm": 0.10002419352531433, + "learning_rate": 0.00018978784817433975, + "loss": 0.1586, + "step": 3543 + }, + { + "epoch": 0.25570908041415635, + "grad_norm": 0.11884690076112747, + "learning_rate": 0.000189784961754943, + "loss": 0.1761, + "step": 3544 + }, + { + "epoch": 0.2557812330892168, + "grad_norm": 0.11634685844182968, + "learning_rate": 0.00018978207533554625, + "loss": 0.1918, + "step": 3545 + }, + { + "epoch": 0.2558533857642772, + "grad_norm": 0.13089223206043243, + "learning_rate": 0.00018977918891614952, + "loss": 0.1446, + "step": 3546 + }, + { + "epoch": 0.25592553843933763, + "grad_norm": 0.1116749569773674, + "learning_rate": 0.00018977630249675278, + "loss": 0.1381, + "step": 3547 + }, + { + "epoch": 0.2559976911143981, + "grad_norm": 0.12038832157850266, + "learning_rate": 0.00018977341607735607, + "loss": 0.1312, + "step": 3548 + }, + { + "epoch": 0.2560698437894585, + "grad_norm": 0.11651638150215149, + "learning_rate": 0.0001897705296579593, + "loss": 0.1588, + "step": 3549 + }, + { + "epoch": 0.2561419964645189, + "grad_norm": 0.1340457946062088, + "learning_rate": 0.00018976764323856257, + "loss": 0.1697, + "step": 3550 + }, + { + "epoch": 0.25621414913957935, + "grad_norm": 0.11867430806159973, + "learning_rate": 0.00018976475681916583, + "loss": 0.1801, + "step": 3551 + }, + { + "epoch": 0.2562863018146398, + "grad_norm": 0.1400105208158493, + "learning_rate": 0.0001897618703997691, + "loss": 0.2189, + "step": 3552 + }, + { + "epoch": 0.2563584544897002, + "grad_norm": 0.0946824699640274, + "learning_rate": 0.00018975898398037236, + "loss": 0.1552, + "step": 3553 + }, + { + "epoch": 0.2564306071647606, + "grad_norm": 0.08589153736829758, + "learning_rate": 0.00018975609756097562, + "loss": 0.1707, + "step": 3554 + }, + { + "epoch": 0.25650275983982107, + "grad_norm": 0.12594255805015564, + "learning_rate": 0.00018975321114157888, + "loss": 0.1107, + "step": 3555 + }, + { + "epoch": 0.2565749125148815, + "grad_norm": 0.11282055079936981, + "learning_rate": 0.00018975032472218214, + "loss": 0.1644, + "step": 3556 + }, + { + "epoch": 0.2566470651899419, + "grad_norm": 0.12821117043495178, + "learning_rate": 0.0001897474383027854, + "loss": 0.1475, + "step": 3557 + }, + { + "epoch": 0.25671921786500235, + "grad_norm": 0.10788974910974503, + "learning_rate": 0.00018974455188338867, + "loss": 0.1369, + "step": 3558 + }, + { + "epoch": 0.2567913705400628, + "grad_norm": 0.12976844608783722, + "learning_rate": 0.00018974166546399193, + "loss": 0.1695, + "step": 3559 + }, + { + "epoch": 0.2568635232151232, + "grad_norm": 0.11886755377054214, + "learning_rate": 0.0001897387790445952, + "loss": 0.1728, + "step": 3560 + }, + { + "epoch": 0.2569356758901836, + "grad_norm": 0.12633520364761353, + "learning_rate": 0.00018973589262519843, + "loss": 0.1143, + "step": 3561 + }, + { + "epoch": 0.25700782856524407, + "grad_norm": 0.10834062844514847, + "learning_rate": 0.00018973300620580172, + "loss": 0.1467, + "step": 3562 + }, + { + "epoch": 0.2570799812403045, + "grad_norm": 0.14430686831474304, + "learning_rate": 0.00018973011978640499, + "loss": 0.2205, + "step": 3563 + }, + { + "epoch": 0.2571521339153649, + "grad_norm": 0.13141347467899323, + "learning_rate": 0.00018972723336700825, + "loss": 0.1295, + "step": 3564 + }, + { + "epoch": 0.25722428659042534, + "grad_norm": 0.1020764708518982, + "learning_rate": 0.0001897243469476115, + "loss": 0.1944, + "step": 3565 + }, + { + "epoch": 0.2572964392654858, + "grad_norm": 0.1081843450665474, + "learning_rate": 0.00018972146052821475, + "loss": 0.1463, + "step": 3566 + }, + { + "epoch": 0.2573685919405462, + "grad_norm": 0.10524830967187881, + "learning_rate": 0.000189718574108818, + "loss": 0.1747, + "step": 3567 + }, + { + "epoch": 0.2574407446156066, + "grad_norm": 0.10554425418376923, + "learning_rate": 0.00018971568768942127, + "loss": 0.1796, + "step": 3568 + }, + { + "epoch": 0.25751289729066706, + "grad_norm": 0.0920037180185318, + "learning_rate": 0.00018971280127002456, + "loss": 0.1876, + "step": 3569 + }, + { + "epoch": 0.25758504996572745, + "grad_norm": 0.09690480679273605, + "learning_rate": 0.00018970991485062783, + "loss": 0.1565, + "step": 3570 + }, + { + "epoch": 0.2576572026407879, + "grad_norm": 0.10666646808385849, + "learning_rate": 0.00018970702843123106, + "loss": 0.1836, + "step": 3571 + }, + { + "epoch": 0.25772935531584834, + "grad_norm": 0.10854260623455048, + "learning_rate": 0.00018970414201183432, + "loss": 0.2031, + "step": 3572 + }, + { + "epoch": 0.2578015079909088, + "grad_norm": 0.12234905362129211, + "learning_rate": 0.0001897012555924376, + "loss": 0.177, + "step": 3573 + }, + { + "epoch": 0.2578736606659692, + "grad_norm": 0.09497665613889694, + "learning_rate": 0.00018969836917304085, + "loss": 0.2068, + "step": 3574 + }, + { + "epoch": 0.2579458133410296, + "grad_norm": 0.17623476684093475, + "learning_rate": 0.0001896954827536441, + "loss": 0.1507, + "step": 3575 + }, + { + "epoch": 0.25801796601609006, + "grad_norm": 0.09530671685934067, + "learning_rate": 0.00018969259633424738, + "loss": 0.1634, + "step": 3576 + }, + { + "epoch": 0.25809011869115045, + "grad_norm": 0.10053442418575287, + "learning_rate": 0.00018968970991485064, + "loss": 0.1796, + "step": 3577 + }, + { + "epoch": 0.2581622713662109, + "grad_norm": 0.12041661888360977, + "learning_rate": 0.0001896868234954539, + "loss": 0.1869, + "step": 3578 + }, + { + "epoch": 0.25823442404127134, + "grad_norm": 0.10001831501722336, + "learning_rate": 0.00018968393707605716, + "loss": 0.1732, + "step": 3579 + }, + { + "epoch": 0.2583065767163318, + "grad_norm": 0.09121733903884888, + "learning_rate": 0.00018968105065666043, + "loss": 0.1112, + "step": 3580 + }, + { + "epoch": 0.25837872939139217, + "grad_norm": 0.11876653879880905, + "learning_rate": 0.0001896781642372637, + "loss": 0.1667, + "step": 3581 + }, + { + "epoch": 0.2584508820664526, + "grad_norm": 0.10456927120685577, + "learning_rate": 0.00018967527781786693, + "loss": 0.1559, + "step": 3582 + }, + { + "epoch": 0.25852303474151306, + "grad_norm": 0.1214747354388237, + "learning_rate": 0.00018967239139847022, + "loss": 0.1742, + "step": 3583 + }, + { + "epoch": 0.25859518741657345, + "grad_norm": 0.10480388253927231, + "learning_rate": 0.00018966950497907348, + "loss": 0.1585, + "step": 3584 + }, + { + "epoch": 0.2586673400916339, + "grad_norm": 0.09965568035840988, + "learning_rate": 0.00018966661855967674, + "loss": 0.1745, + "step": 3585 + }, + { + "epoch": 0.25873949276669433, + "grad_norm": 0.12093115597963333, + "learning_rate": 0.00018966373214028, + "loss": 0.1547, + "step": 3586 + }, + { + "epoch": 0.2588116454417548, + "grad_norm": 0.1278897076845169, + "learning_rate": 0.00018966084572088324, + "loss": 0.149, + "step": 3587 + }, + { + "epoch": 0.25888379811681517, + "grad_norm": 0.08690732717514038, + "learning_rate": 0.0001896579593014865, + "loss": 0.165, + "step": 3588 + }, + { + "epoch": 0.2589559507918756, + "grad_norm": 0.108178049325943, + "learning_rate": 0.00018965507288208977, + "loss": 0.1891, + "step": 3589 + }, + { + "epoch": 0.25902810346693605, + "grad_norm": 0.09039347618818283, + "learning_rate": 0.00018965218646269306, + "loss": 0.182, + "step": 3590 + }, + { + "epoch": 0.25910025614199644, + "grad_norm": 0.10826592147350311, + "learning_rate": 0.00018964930004329632, + "loss": 0.1582, + "step": 3591 + }, + { + "epoch": 0.2591724088170569, + "grad_norm": 0.09563131630420685, + "learning_rate": 0.00018964641362389956, + "loss": 0.1596, + "step": 3592 + }, + { + "epoch": 0.25924456149211733, + "grad_norm": 0.13261514902114868, + "learning_rate": 0.00018964352720450282, + "loss": 0.163, + "step": 3593 + }, + { + "epoch": 0.2593167141671778, + "grad_norm": 0.12152953445911407, + "learning_rate": 0.00018964064078510608, + "loss": 0.1316, + "step": 3594 + }, + { + "epoch": 0.25938886684223816, + "grad_norm": 0.11141938716173172, + "learning_rate": 0.00018963775436570934, + "loss": 0.172, + "step": 3595 + }, + { + "epoch": 0.2594610195172986, + "grad_norm": 0.13898305594921112, + "learning_rate": 0.0001896348679463126, + "loss": 0.2022, + "step": 3596 + }, + { + "epoch": 0.25953317219235905, + "grad_norm": 0.11607407033443451, + "learning_rate": 0.00018963198152691587, + "loss": 0.1919, + "step": 3597 + }, + { + "epoch": 0.25960532486741944, + "grad_norm": 0.12388519197702408, + "learning_rate": 0.00018962909510751913, + "loss": 0.161, + "step": 3598 + }, + { + "epoch": 0.2596774775424799, + "grad_norm": 0.09459841996431351, + "learning_rate": 0.0001896262086881224, + "loss": 0.1909, + "step": 3599 + }, + { + "epoch": 0.25974963021754033, + "grad_norm": 0.10941751301288605, + "learning_rate": 0.00018962332226872566, + "loss": 0.159, + "step": 3600 + }, + { + "epoch": 0.2598217828926007, + "grad_norm": 0.1132725328207016, + "learning_rate": 0.00018962043584932892, + "loss": 0.1276, + "step": 3601 + }, + { + "epoch": 0.25989393556766116, + "grad_norm": 0.11622916907072067, + "learning_rate": 0.00018961754942993218, + "loss": 0.1255, + "step": 3602 + }, + { + "epoch": 0.2599660882427216, + "grad_norm": 0.10722587257623672, + "learning_rate": 0.00018961466301053542, + "loss": 0.135, + "step": 3603 + }, + { + "epoch": 0.26003824091778205, + "grad_norm": 0.09481862187385559, + "learning_rate": 0.0001896117765911387, + "loss": 0.156, + "step": 3604 + }, + { + "epoch": 0.26011039359284244, + "grad_norm": 0.1511927843093872, + "learning_rate": 0.00018960889017174197, + "loss": 0.1911, + "step": 3605 + }, + { + "epoch": 0.2601825462679029, + "grad_norm": 0.11891227215528488, + "learning_rate": 0.00018960600375234524, + "loss": 0.1898, + "step": 3606 + }, + { + "epoch": 0.2602546989429633, + "grad_norm": 0.11076851189136505, + "learning_rate": 0.0001896031173329485, + "loss": 0.0932, + "step": 3607 + }, + { + "epoch": 0.2603268516180237, + "grad_norm": 0.09624168276786804, + "learning_rate": 0.00018960023091355174, + "loss": 0.1357, + "step": 3608 + }, + { + "epoch": 0.26039900429308416, + "grad_norm": 0.10173476487398148, + "learning_rate": 0.000189597344494155, + "loss": 0.1637, + "step": 3609 + }, + { + "epoch": 0.2604711569681446, + "grad_norm": 0.1195240169763565, + "learning_rate": 0.00018959445807475826, + "loss": 0.1517, + "step": 3610 + }, + { + "epoch": 0.26054330964320505, + "grad_norm": 0.13871614634990692, + "learning_rate": 0.00018959157165536155, + "loss": 0.2041, + "step": 3611 + }, + { + "epoch": 0.26061546231826543, + "grad_norm": 0.13739337027072906, + "learning_rate": 0.00018958868523596481, + "loss": 0.2188, + "step": 3612 + }, + { + "epoch": 0.2606876149933259, + "grad_norm": 0.12913809716701508, + "learning_rate": 0.00018958579881656805, + "loss": 0.1665, + "step": 3613 + }, + { + "epoch": 0.2607597676683863, + "grad_norm": 0.13116084039211273, + "learning_rate": 0.0001895829123971713, + "loss": 0.1634, + "step": 3614 + }, + { + "epoch": 0.2608319203434467, + "grad_norm": 0.12041059136390686, + "learning_rate": 0.00018958002597777458, + "loss": 0.1619, + "step": 3615 + }, + { + "epoch": 0.26090407301850715, + "grad_norm": 0.13146141171455383, + "learning_rate": 0.00018957713955837784, + "loss": 0.1554, + "step": 3616 + }, + { + "epoch": 0.2609762256935676, + "grad_norm": 0.10757823288440704, + "learning_rate": 0.0001895742531389811, + "loss": 0.2041, + "step": 3617 + }, + { + "epoch": 0.26104837836862804, + "grad_norm": 0.11519914865493774, + "learning_rate": 0.00018957136671958436, + "loss": 0.1857, + "step": 3618 + }, + { + "epoch": 0.26112053104368843, + "grad_norm": 0.12142335623502731, + "learning_rate": 0.00018956848030018763, + "loss": 0.1589, + "step": 3619 + }, + { + "epoch": 0.2611926837187489, + "grad_norm": 0.1048182025551796, + "learning_rate": 0.0001895655938807909, + "loss": 0.1479, + "step": 3620 + }, + { + "epoch": 0.2612648363938093, + "grad_norm": 0.09723210334777832, + "learning_rate": 0.00018956270746139415, + "loss": 0.1832, + "step": 3621 + }, + { + "epoch": 0.2613369890688697, + "grad_norm": 0.10473082959651947, + "learning_rate": 0.00018955982104199742, + "loss": 0.1649, + "step": 3622 + }, + { + "epoch": 0.26140914174393015, + "grad_norm": 0.11956755071878433, + "learning_rate": 0.00018955693462260068, + "loss": 0.1179, + "step": 3623 + }, + { + "epoch": 0.2614812944189906, + "grad_norm": 0.10788846760988235, + "learning_rate": 0.00018955404820320392, + "loss": 0.1832, + "step": 3624 + }, + { + "epoch": 0.26155344709405104, + "grad_norm": 0.10262494534254074, + "learning_rate": 0.0001895511617838072, + "loss": 0.1945, + "step": 3625 + }, + { + "epoch": 0.2616255997691114, + "grad_norm": 0.12115581333637238, + "learning_rate": 0.00018954827536441047, + "loss": 0.2295, + "step": 3626 + }, + { + "epoch": 0.26169775244417187, + "grad_norm": 0.13347704708576202, + "learning_rate": 0.00018954538894501373, + "loss": 0.1444, + "step": 3627 + }, + { + "epoch": 0.2617699051192323, + "grad_norm": 0.09329728782176971, + "learning_rate": 0.000189542502525617, + "loss": 0.1849, + "step": 3628 + }, + { + "epoch": 0.2618420577942927, + "grad_norm": 0.1016915887594223, + "learning_rate": 0.00018953961610622023, + "loss": 0.1416, + "step": 3629 + }, + { + "epoch": 0.26191421046935315, + "grad_norm": 0.1248478889465332, + "learning_rate": 0.0001895367296868235, + "loss": 0.1706, + "step": 3630 + }, + { + "epoch": 0.2619863631444136, + "grad_norm": 0.09999848157167435, + "learning_rate": 0.00018953384326742676, + "loss": 0.1512, + "step": 3631 + }, + { + "epoch": 0.262058515819474, + "grad_norm": 0.1144399642944336, + "learning_rate": 0.00018953095684803002, + "loss": 0.1192, + "step": 3632 + }, + { + "epoch": 0.2621306684945344, + "grad_norm": 0.1034519225358963, + "learning_rate": 0.0001895280704286333, + "loss": 0.166, + "step": 3633 + }, + { + "epoch": 0.26220282116959487, + "grad_norm": 0.14921246469020844, + "learning_rate": 0.00018952518400923654, + "loss": 0.1258, + "step": 3634 + }, + { + "epoch": 0.2622749738446553, + "grad_norm": 0.10548774152994156, + "learning_rate": 0.0001895222975898398, + "loss": 0.1511, + "step": 3635 + }, + { + "epoch": 0.2623471265197157, + "grad_norm": 0.12472370266914368, + "learning_rate": 0.00018951941117044307, + "loss": 0.1733, + "step": 3636 + }, + { + "epoch": 0.26241927919477614, + "grad_norm": 0.10761553794145584, + "learning_rate": 0.00018951652475104633, + "loss": 0.194, + "step": 3637 + }, + { + "epoch": 0.2624914318698366, + "grad_norm": 0.11572976410388947, + "learning_rate": 0.0001895136383316496, + "loss": 0.1905, + "step": 3638 + }, + { + "epoch": 0.262563584544897, + "grad_norm": 0.1269570291042328, + "learning_rate": 0.00018951075191225286, + "loss": 0.1376, + "step": 3639 + }, + { + "epoch": 0.2626357372199574, + "grad_norm": 0.10389404743909836, + "learning_rate": 0.00018950786549285612, + "loss": 0.1459, + "step": 3640 + }, + { + "epoch": 0.26270788989501787, + "grad_norm": 0.10264139622449875, + "learning_rate": 0.00018950497907345938, + "loss": 0.1984, + "step": 3641 + }, + { + "epoch": 0.2627800425700783, + "grad_norm": 0.1579146385192871, + "learning_rate": 0.00018950209265406265, + "loss": 0.163, + "step": 3642 + }, + { + "epoch": 0.2628521952451387, + "grad_norm": 0.12138903141021729, + "learning_rate": 0.0001894992062346659, + "loss": 0.1668, + "step": 3643 + }, + { + "epoch": 0.26292434792019914, + "grad_norm": 0.11000584810972214, + "learning_rate": 0.00018949631981526917, + "loss": 0.1787, + "step": 3644 + }, + { + "epoch": 0.2629965005952596, + "grad_norm": 0.11004672199487686, + "learning_rate": 0.0001894934333958724, + "loss": 0.1586, + "step": 3645 + }, + { + "epoch": 0.26306865327032, + "grad_norm": 0.13046477735042572, + "learning_rate": 0.00018949054697647567, + "loss": 0.196, + "step": 3646 + }, + { + "epoch": 0.2631408059453804, + "grad_norm": 0.1197001039981842, + "learning_rate": 0.00018948766055707896, + "loss": 0.1421, + "step": 3647 + }, + { + "epoch": 0.26321295862044086, + "grad_norm": 0.11334479600191116, + "learning_rate": 0.00018948477413768223, + "loss": 0.1675, + "step": 3648 + }, + { + "epoch": 0.2632851112955013, + "grad_norm": 0.11990846693515778, + "learning_rate": 0.0001894818877182855, + "loss": 0.1294, + "step": 3649 + }, + { + "epoch": 0.2633572639705617, + "grad_norm": 0.11262289434671402, + "learning_rate": 0.00018947900129888872, + "loss": 0.1453, + "step": 3650 + }, + { + "epoch": 0.26342941664562214, + "grad_norm": 0.1628699153661728, + "learning_rate": 0.000189476114879492, + "loss": 0.2236, + "step": 3651 + }, + { + "epoch": 0.2635015693206826, + "grad_norm": 0.13269752264022827, + "learning_rate": 0.00018947322846009525, + "loss": 0.2132, + "step": 3652 + }, + { + "epoch": 0.26357372199574297, + "grad_norm": 0.09555750340223312, + "learning_rate": 0.0001894703420406985, + "loss": 0.1869, + "step": 3653 + }, + { + "epoch": 0.2636458746708034, + "grad_norm": 0.1075914055109024, + "learning_rate": 0.0001894674556213018, + "loss": 0.1991, + "step": 3654 + }, + { + "epoch": 0.26371802734586386, + "grad_norm": 0.10003098845481873, + "learning_rate": 0.00018946456920190504, + "loss": 0.1083, + "step": 3655 + }, + { + "epoch": 0.2637901800209243, + "grad_norm": 0.11860654503107071, + "learning_rate": 0.0001894616827825083, + "loss": 0.2015, + "step": 3656 + }, + { + "epoch": 0.2638623326959847, + "grad_norm": 0.09559154510498047, + "learning_rate": 0.00018945879636311156, + "loss": 0.1851, + "step": 3657 + }, + { + "epoch": 0.26393448537104514, + "grad_norm": 0.10533902049064636, + "learning_rate": 0.00018945590994371483, + "loss": 0.163, + "step": 3658 + }, + { + "epoch": 0.2640066380461056, + "grad_norm": 0.10047412663698196, + "learning_rate": 0.0001894530235243181, + "loss": 0.1725, + "step": 3659 + }, + { + "epoch": 0.26407879072116597, + "grad_norm": 0.08557490259408951, + "learning_rate": 0.00018945013710492135, + "loss": 0.1207, + "step": 3660 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.10779914259910583, + "learning_rate": 0.00018944725068552462, + "loss": 0.1795, + "step": 3661 + }, + { + "epoch": 0.26422309607128686, + "grad_norm": 0.1896720826625824, + "learning_rate": 0.00018944436426612788, + "loss": 0.162, + "step": 3662 + }, + { + "epoch": 0.26429524874634724, + "grad_norm": 0.11711643636226654, + "learning_rate": 0.00018944147784673114, + "loss": 0.2044, + "step": 3663 + }, + { + "epoch": 0.2643674014214077, + "grad_norm": 0.0934373065829277, + "learning_rate": 0.0001894385914273344, + "loss": 0.1858, + "step": 3664 + }, + { + "epoch": 0.26443955409646813, + "grad_norm": 0.12046694755554199, + "learning_rate": 0.00018943570500793767, + "loss": 0.1324, + "step": 3665 + }, + { + "epoch": 0.2645117067715286, + "grad_norm": 0.11609145253896713, + "learning_rate": 0.0001894328185885409, + "loss": 0.1798, + "step": 3666 + }, + { + "epoch": 0.26458385944658896, + "grad_norm": 0.15844784677028656, + "learning_rate": 0.00018942993216914417, + "loss": 0.1913, + "step": 3667 + }, + { + "epoch": 0.2646560121216494, + "grad_norm": 0.09626557677984238, + "learning_rate": 0.00018942704574974746, + "loss": 0.1481, + "step": 3668 + }, + { + "epoch": 0.26472816479670985, + "grad_norm": 0.10835087299346924, + "learning_rate": 0.00018942415933035072, + "loss": 0.1756, + "step": 3669 + }, + { + "epoch": 0.26480031747177024, + "grad_norm": 0.10344719886779785, + "learning_rate": 0.00018942127291095398, + "loss": 0.1515, + "step": 3670 + }, + { + "epoch": 0.2648724701468307, + "grad_norm": 0.09639471769332886, + "learning_rate": 0.00018941838649155722, + "loss": 0.1462, + "step": 3671 + }, + { + "epoch": 0.26494462282189113, + "grad_norm": 0.08890553563833237, + "learning_rate": 0.00018941550007216048, + "loss": 0.1329, + "step": 3672 + }, + { + "epoch": 0.2650167754969516, + "grad_norm": 0.10019133985042572, + "learning_rate": 0.00018941261365276374, + "loss": 0.1715, + "step": 3673 + }, + { + "epoch": 0.26508892817201196, + "grad_norm": 0.106917604804039, + "learning_rate": 0.000189409727233367, + "loss": 0.1643, + "step": 3674 + }, + { + "epoch": 0.2651610808470724, + "grad_norm": 0.12348364293575287, + "learning_rate": 0.0001894068408139703, + "loss": 0.1493, + "step": 3675 + }, + { + "epoch": 0.26523323352213285, + "grad_norm": 0.11973647773265839, + "learning_rate": 0.00018940395439457353, + "loss": 0.1608, + "step": 3676 + }, + { + "epoch": 0.26530538619719324, + "grad_norm": 0.12044582515954971, + "learning_rate": 0.0001894010679751768, + "loss": 0.1581, + "step": 3677 + }, + { + "epoch": 0.2653775388722537, + "grad_norm": 0.11655533313751221, + "learning_rate": 0.00018939818155578006, + "loss": 0.1468, + "step": 3678 + }, + { + "epoch": 0.2654496915473141, + "grad_norm": 0.13247819244861603, + "learning_rate": 0.00018939529513638332, + "loss": 0.2026, + "step": 3679 + }, + { + "epoch": 0.26552184422237457, + "grad_norm": 0.10399710386991501, + "learning_rate": 0.00018939240871698658, + "loss": 0.1578, + "step": 3680 + }, + { + "epoch": 0.26559399689743496, + "grad_norm": 0.14507174491882324, + "learning_rate": 0.00018938952229758985, + "loss": 0.156, + "step": 3681 + }, + { + "epoch": 0.2656661495724954, + "grad_norm": 0.10088624060153961, + "learning_rate": 0.0001893866358781931, + "loss": 0.1309, + "step": 3682 + }, + { + "epoch": 0.26573830224755585, + "grad_norm": 0.10670887678861618, + "learning_rate": 0.00018938374945879637, + "loss": 0.186, + "step": 3683 + }, + { + "epoch": 0.26581045492261623, + "grad_norm": 0.10915090143680573, + "learning_rate": 0.00018938086303939964, + "loss": 0.1492, + "step": 3684 + }, + { + "epoch": 0.2658826075976767, + "grad_norm": 0.11927489191293716, + "learning_rate": 0.0001893779766200029, + "loss": 0.1882, + "step": 3685 + }, + { + "epoch": 0.2659547602727371, + "grad_norm": 0.09340450912714005, + "learning_rate": 0.00018937509020060616, + "loss": 0.1327, + "step": 3686 + }, + { + "epoch": 0.26602691294779757, + "grad_norm": 0.1078324019908905, + "learning_rate": 0.00018937220378120942, + "loss": 0.1279, + "step": 3687 + }, + { + "epoch": 0.26609906562285796, + "grad_norm": 0.09562065452337265, + "learning_rate": 0.00018936931736181266, + "loss": 0.1378, + "step": 3688 + }, + { + "epoch": 0.2661712182979184, + "grad_norm": 0.10261200368404388, + "learning_rate": 0.00018936643094241595, + "loss": 0.1695, + "step": 3689 + }, + { + "epoch": 0.26624337097297884, + "grad_norm": 0.10890012234449387, + "learning_rate": 0.00018936354452301921, + "loss": 0.1438, + "step": 3690 + }, + { + "epoch": 0.26631552364803923, + "grad_norm": 0.09240952134132385, + "learning_rate": 0.00018936065810362248, + "loss": 0.1076, + "step": 3691 + }, + { + "epoch": 0.2663876763230997, + "grad_norm": 0.1261812299489975, + "learning_rate": 0.00018935777168422574, + "loss": 0.166, + "step": 3692 + }, + { + "epoch": 0.2664598289981601, + "grad_norm": 0.12737056612968445, + "learning_rate": 0.00018935488526482898, + "loss": 0.1715, + "step": 3693 + }, + { + "epoch": 0.2665319816732205, + "grad_norm": 0.12897031009197235, + "learning_rate": 0.00018935199884543224, + "loss": 0.1753, + "step": 3694 + }, + { + "epoch": 0.26660413434828095, + "grad_norm": 0.11706625670194626, + "learning_rate": 0.0001893491124260355, + "loss": 0.1702, + "step": 3695 + }, + { + "epoch": 0.2666762870233414, + "grad_norm": 0.11923673748970032, + "learning_rate": 0.0001893462260066388, + "loss": 0.1451, + "step": 3696 + }, + { + "epoch": 0.26674843969840184, + "grad_norm": 0.09781758487224579, + "learning_rate": 0.00018934333958724205, + "loss": 0.1429, + "step": 3697 + }, + { + "epoch": 0.26682059237346223, + "grad_norm": 0.1293449103832245, + "learning_rate": 0.0001893404531678453, + "loss": 0.14, + "step": 3698 + }, + { + "epoch": 0.2668927450485227, + "grad_norm": 0.10683408379554749, + "learning_rate": 0.00018933756674844855, + "loss": 0.1595, + "step": 3699 + }, + { + "epoch": 0.2669648977235831, + "grad_norm": 0.10115299373865128, + "learning_rate": 0.00018933468032905182, + "loss": 0.1624, + "step": 3700 + }, + { + "epoch": 0.2670370503986435, + "grad_norm": 0.08999868482351303, + "learning_rate": 0.00018933179390965508, + "loss": 0.1804, + "step": 3701 + }, + { + "epoch": 0.26710920307370395, + "grad_norm": 0.09947782754898071, + "learning_rate": 0.00018932890749025834, + "loss": 0.1512, + "step": 3702 + }, + { + "epoch": 0.2671813557487644, + "grad_norm": 0.11422078311443329, + "learning_rate": 0.0001893260210708616, + "loss": 0.1508, + "step": 3703 + }, + { + "epoch": 0.26725350842382484, + "grad_norm": 0.10992544144392014, + "learning_rate": 0.00018932313465146487, + "loss": 0.1943, + "step": 3704 + }, + { + "epoch": 0.2673256610988852, + "grad_norm": 0.09266640245914459, + "learning_rate": 0.00018932024823206813, + "loss": 0.1889, + "step": 3705 + }, + { + "epoch": 0.26739781377394567, + "grad_norm": 0.11159055680036545, + "learning_rate": 0.0001893173618126714, + "loss": 0.1612, + "step": 3706 + }, + { + "epoch": 0.2674699664490061, + "grad_norm": 0.11246234178543091, + "learning_rate": 0.00018931447539327466, + "loss": 0.1306, + "step": 3707 + }, + { + "epoch": 0.2675421191240665, + "grad_norm": 0.11322403699159622, + "learning_rate": 0.00018931158897387792, + "loss": 0.1338, + "step": 3708 + }, + { + "epoch": 0.26761427179912695, + "grad_norm": 0.11647389084100723, + "learning_rate": 0.00018930870255448115, + "loss": 0.1287, + "step": 3709 + }, + { + "epoch": 0.2676864244741874, + "grad_norm": 0.10262130945920944, + "learning_rate": 0.00018930581613508444, + "loss": 0.1796, + "step": 3710 + }, + { + "epoch": 0.26775857714924783, + "grad_norm": 0.13843820989131927, + "learning_rate": 0.0001893029297156877, + "loss": 0.2134, + "step": 3711 + }, + { + "epoch": 0.2678307298243082, + "grad_norm": 0.09976322203874588, + "learning_rate": 0.00018930004329629097, + "loss": 0.1382, + "step": 3712 + }, + { + "epoch": 0.26790288249936867, + "grad_norm": 0.10742239654064178, + "learning_rate": 0.00018929715687689423, + "loss": 0.1685, + "step": 3713 + }, + { + "epoch": 0.2679750351744291, + "grad_norm": 0.13246366381645203, + "learning_rate": 0.00018929427045749747, + "loss": 0.2164, + "step": 3714 + }, + { + "epoch": 0.2680471878494895, + "grad_norm": 0.12941241264343262, + "learning_rate": 0.00018929138403810073, + "loss": 0.1481, + "step": 3715 + }, + { + "epoch": 0.26811934052454994, + "grad_norm": 0.09542439877986908, + "learning_rate": 0.000189288497618704, + "loss": 0.1766, + "step": 3716 + }, + { + "epoch": 0.2681914931996104, + "grad_norm": 0.09997952729463577, + "learning_rate": 0.00018928561119930729, + "loss": 0.145, + "step": 3717 + }, + { + "epoch": 0.26826364587467083, + "grad_norm": 0.09770691394805908, + "learning_rate": 0.00018928272477991055, + "loss": 0.1748, + "step": 3718 + }, + { + "epoch": 0.2683357985497312, + "grad_norm": 0.10574066638946533, + "learning_rate": 0.00018927983836051378, + "loss": 0.1815, + "step": 3719 + }, + { + "epoch": 0.26840795122479166, + "grad_norm": 0.1131848618388176, + "learning_rate": 0.00018927695194111705, + "loss": 0.1126, + "step": 3720 + }, + { + "epoch": 0.2684801038998521, + "grad_norm": 0.1036604568362236, + "learning_rate": 0.0001892740655217203, + "loss": 0.1304, + "step": 3721 + }, + { + "epoch": 0.2685522565749125, + "grad_norm": 0.111257404088974, + "learning_rate": 0.00018927117910232357, + "loss": 0.1506, + "step": 3722 + }, + { + "epoch": 0.26862440924997294, + "grad_norm": 0.0995304137468338, + "learning_rate": 0.00018926829268292684, + "loss": 0.1737, + "step": 3723 + }, + { + "epoch": 0.2686965619250334, + "grad_norm": 0.11207863688468933, + "learning_rate": 0.0001892654062635301, + "loss": 0.1918, + "step": 3724 + }, + { + "epoch": 0.26876871460009377, + "grad_norm": 0.139298215508461, + "learning_rate": 0.00018926251984413336, + "loss": 0.174, + "step": 3725 + }, + { + "epoch": 0.2688408672751542, + "grad_norm": 0.12480378895998001, + "learning_rate": 0.00018925963342473662, + "loss": 0.1476, + "step": 3726 + }, + { + "epoch": 0.26891301995021466, + "grad_norm": 0.11126791685819626, + "learning_rate": 0.0001892567470053399, + "loss": 0.1864, + "step": 3727 + }, + { + "epoch": 0.2689851726252751, + "grad_norm": 0.13878969848155975, + "learning_rate": 0.00018925386058594315, + "loss": 0.1637, + "step": 3728 + }, + { + "epoch": 0.2690573253003355, + "grad_norm": 0.11918002367019653, + "learning_rate": 0.0001892509741665464, + "loss": 0.182, + "step": 3729 + }, + { + "epoch": 0.26912947797539594, + "grad_norm": 0.1296512633562088, + "learning_rate": 0.00018924808774714965, + "loss": 0.1888, + "step": 3730 + }, + { + "epoch": 0.2692016306504564, + "grad_norm": 0.10598678141832352, + "learning_rate": 0.00018924520132775294, + "loss": 0.195, + "step": 3731 + }, + { + "epoch": 0.26927378332551677, + "grad_norm": 0.14444386959075928, + "learning_rate": 0.0001892423149083562, + "loss": 0.148, + "step": 3732 + }, + { + "epoch": 0.2693459360005772, + "grad_norm": 0.10955000668764114, + "learning_rate": 0.00018923942848895947, + "loss": 0.1118, + "step": 3733 + }, + { + "epoch": 0.26941808867563766, + "grad_norm": 0.13034574687480927, + "learning_rate": 0.00018923654206956273, + "loss": 0.1556, + "step": 3734 + }, + { + "epoch": 0.2694902413506981, + "grad_norm": 0.12457529455423355, + "learning_rate": 0.00018923365565016596, + "loss": 0.1635, + "step": 3735 + }, + { + "epoch": 0.2695623940257585, + "grad_norm": 0.13422954082489014, + "learning_rate": 0.00018923076923076923, + "loss": 0.1447, + "step": 3736 + }, + { + "epoch": 0.26963454670081893, + "grad_norm": 0.10142651200294495, + "learning_rate": 0.0001892278828113725, + "loss": 0.2051, + "step": 3737 + }, + { + "epoch": 0.2697066993758794, + "grad_norm": 0.13308723270893097, + "learning_rate": 0.00018922499639197578, + "loss": 0.2012, + "step": 3738 + }, + { + "epoch": 0.26977885205093977, + "grad_norm": 0.12129965424537659, + "learning_rate": 0.00018922210997257904, + "loss": 0.1591, + "step": 3739 + }, + { + "epoch": 0.2698510047260002, + "grad_norm": 0.1246509999036789, + "learning_rate": 0.00018921922355318228, + "loss": 0.1245, + "step": 3740 + }, + { + "epoch": 0.26992315740106065, + "grad_norm": 0.12997084856033325, + "learning_rate": 0.00018921633713378554, + "loss": 0.1159, + "step": 3741 + }, + { + "epoch": 0.2699953100761211, + "grad_norm": 0.10414636880159378, + "learning_rate": 0.0001892134507143888, + "loss": 0.1973, + "step": 3742 + }, + { + "epoch": 0.2700674627511815, + "grad_norm": 0.097514308989048, + "learning_rate": 0.00018921056429499207, + "loss": 0.1326, + "step": 3743 + }, + { + "epoch": 0.27013961542624193, + "grad_norm": 0.09992020577192307, + "learning_rate": 0.00018920767787559533, + "loss": 0.1957, + "step": 3744 + }, + { + "epoch": 0.2702117681013024, + "grad_norm": 0.10262467712163925, + "learning_rate": 0.0001892047914561986, + "loss": 0.1407, + "step": 3745 + }, + { + "epoch": 0.27028392077636276, + "grad_norm": 0.0982523113489151, + "learning_rate": 0.00018920190503680186, + "loss": 0.1374, + "step": 3746 + }, + { + "epoch": 0.2703560734514232, + "grad_norm": 0.10197173804044724, + "learning_rate": 0.00018919901861740512, + "loss": 0.1541, + "step": 3747 + }, + { + "epoch": 0.27042822612648365, + "grad_norm": 0.12074144929647446, + "learning_rate": 0.00018919613219800838, + "loss": 0.1624, + "step": 3748 + }, + { + "epoch": 0.2705003788015441, + "grad_norm": 0.11587917059659958, + "learning_rate": 0.00018919324577861164, + "loss": 0.2164, + "step": 3749 + }, + { + "epoch": 0.2705725314766045, + "grad_norm": 0.11887791752815247, + "learning_rate": 0.0001891903593592149, + "loss": 0.172, + "step": 3750 + }, + { + "epoch": 0.2706446841516649, + "grad_norm": 0.127192422747612, + "learning_rate": 0.00018918747293981814, + "loss": 0.1608, + "step": 3751 + }, + { + "epoch": 0.27071683682672537, + "grad_norm": 0.12658534944057465, + "learning_rate": 0.00018918458652042143, + "loss": 0.1763, + "step": 3752 + }, + { + "epoch": 0.27078898950178576, + "grad_norm": 0.11674284189939499, + "learning_rate": 0.0001891817001010247, + "loss": 0.1629, + "step": 3753 + }, + { + "epoch": 0.2708611421768462, + "grad_norm": 0.09319207817316055, + "learning_rate": 0.00018917881368162796, + "loss": 0.1449, + "step": 3754 + }, + { + "epoch": 0.27093329485190665, + "grad_norm": 0.11137358844280243, + "learning_rate": 0.00018917592726223122, + "loss": 0.1375, + "step": 3755 + }, + { + "epoch": 0.27100544752696704, + "grad_norm": 0.11353585869073868, + "learning_rate": 0.00018917304084283446, + "loss": 0.1249, + "step": 3756 + }, + { + "epoch": 0.2710776002020275, + "grad_norm": 0.09032389521598816, + "learning_rate": 0.00018917015442343772, + "loss": 0.1456, + "step": 3757 + }, + { + "epoch": 0.2711497528770879, + "grad_norm": 0.14036047458648682, + "learning_rate": 0.00018916726800404098, + "loss": 0.1609, + "step": 3758 + }, + { + "epoch": 0.27122190555214837, + "grad_norm": 0.13396713137626648, + "learning_rate": 0.00018916438158464427, + "loss": 0.1756, + "step": 3759 + }, + { + "epoch": 0.27129405822720876, + "grad_norm": 0.0830783098936081, + "learning_rate": 0.00018916149516524754, + "loss": 0.1528, + "step": 3760 + }, + { + "epoch": 0.2713662109022692, + "grad_norm": 0.12058752775192261, + "learning_rate": 0.00018915860874585077, + "loss": 0.1757, + "step": 3761 + }, + { + "epoch": 0.27143836357732964, + "grad_norm": 0.12686319649219513, + "learning_rate": 0.00018915572232645404, + "loss": 0.1608, + "step": 3762 + }, + { + "epoch": 0.27151051625239003, + "grad_norm": 0.12383957207202911, + "learning_rate": 0.0001891528359070573, + "loss": 0.1136, + "step": 3763 + }, + { + "epoch": 0.2715826689274505, + "grad_norm": 0.14094068109989166, + "learning_rate": 0.00018914994948766056, + "loss": 0.1892, + "step": 3764 + }, + { + "epoch": 0.2716548216025109, + "grad_norm": 0.10473047941923141, + "learning_rate": 0.00018914706306826382, + "loss": 0.1918, + "step": 3765 + }, + { + "epoch": 0.27172697427757136, + "grad_norm": 0.13546206057071686, + "learning_rate": 0.0001891441766488671, + "loss": 0.1736, + "step": 3766 + }, + { + "epoch": 0.27179912695263175, + "grad_norm": 0.1025552973151207, + "learning_rate": 0.00018914129022947035, + "loss": 0.1631, + "step": 3767 + }, + { + "epoch": 0.2718712796276922, + "grad_norm": 0.1197451502084732, + "learning_rate": 0.0001891384038100736, + "loss": 0.22, + "step": 3768 + }, + { + "epoch": 0.27194343230275264, + "grad_norm": 0.10611274093389511, + "learning_rate": 0.00018913551739067688, + "loss": 0.1648, + "step": 3769 + }, + { + "epoch": 0.27201558497781303, + "grad_norm": 0.12307115644216537, + "learning_rate": 0.00018913263097128014, + "loss": 0.1849, + "step": 3770 + }, + { + "epoch": 0.2720877376528735, + "grad_norm": 0.10913196206092834, + "learning_rate": 0.0001891297445518834, + "loss": 0.1617, + "step": 3771 + }, + { + "epoch": 0.2721598903279339, + "grad_norm": 0.09902161359786987, + "learning_rate": 0.00018912685813248664, + "loss": 0.1149, + "step": 3772 + }, + { + "epoch": 0.27223204300299436, + "grad_norm": 0.11324041336774826, + "learning_rate": 0.00018912397171308993, + "loss": 0.1259, + "step": 3773 + }, + { + "epoch": 0.27230419567805475, + "grad_norm": 0.10454663634300232, + "learning_rate": 0.0001891210852936932, + "loss": 0.1425, + "step": 3774 + }, + { + "epoch": 0.2723763483531152, + "grad_norm": 0.10768377780914307, + "learning_rate": 0.00018911819887429645, + "loss": 0.1837, + "step": 3775 + }, + { + "epoch": 0.27244850102817564, + "grad_norm": 0.11168798059225082, + "learning_rate": 0.00018911531245489972, + "loss": 0.1619, + "step": 3776 + }, + { + "epoch": 0.272520653703236, + "grad_norm": 0.11275070905685425, + "learning_rate": 0.00018911242603550295, + "loss": 0.1425, + "step": 3777 + }, + { + "epoch": 0.27259280637829647, + "grad_norm": 0.10287392884492874, + "learning_rate": 0.00018910953961610622, + "loss": 0.1216, + "step": 3778 + }, + { + "epoch": 0.2726649590533569, + "grad_norm": 0.11645212024450302, + "learning_rate": 0.00018910665319670948, + "loss": 0.1777, + "step": 3779 + }, + { + "epoch": 0.27273711172841736, + "grad_norm": 0.11161747574806213, + "learning_rate": 0.00018910376677731277, + "loss": 0.1717, + "step": 3780 + }, + { + "epoch": 0.27280926440347775, + "grad_norm": 0.11371717602014542, + "learning_rate": 0.00018910088035791603, + "loss": 0.1827, + "step": 3781 + }, + { + "epoch": 0.2728814170785382, + "grad_norm": 0.1188749298453331, + "learning_rate": 0.00018909799393851927, + "loss": 0.1374, + "step": 3782 + }, + { + "epoch": 0.27295356975359863, + "grad_norm": 0.1261162906885147, + "learning_rate": 0.00018909510751912253, + "loss": 0.1285, + "step": 3783 + }, + { + "epoch": 0.273025722428659, + "grad_norm": 0.120500348508358, + "learning_rate": 0.0001890922210997258, + "loss": 0.15, + "step": 3784 + }, + { + "epoch": 0.27309787510371947, + "grad_norm": 0.12924271821975708, + "learning_rate": 0.00018908933468032906, + "loss": 0.177, + "step": 3785 + }, + { + "epoch": 0.2731700277787799, + "grad_norm": 0.1256682574748993, + "learning_rate": 0.00018908644826093232, + "loss": 0.1975, + "step": 3786 + }, + { + "epoch": 0.2732421804538403, + "grad_norm": 0.10796575993299484, + "learning_rate": 0.00018908356184153558, + "loss": 0.1844, + "step": 3787 + }, + { + "epoch": 0.27331433312890074, + "grad_norm": 0.13851940631866455, + "learning_rate": 0.00018908067542213884, + "loss": 0.1671, + "step": 3788 + }, + { + "epoch": 0.2733864858039612, + "grad_norm": 0.11101972311735153, + "learning_rate": 0.0001890777890027421, + "loss": 0.1249, + "step": 3789 + }, + { + "epoch": 0.27345863847902163, + "grad_norm": 0.11235811561346054, + "learning_rate": 0.00018907490258334537, + "loss": 0.1172, + "step": 3790 + }, + { + "epoch": 0.273530791154082, + "grad_norm": 0.08978880196809769, + "learning_rate": 0.00018907201616394863, + "loss": 0.1713, + "step": 3791 + }, + { + "epoch": 0.27360294382914246, + "grad_norm": 0.11776753515005112, + "learning_rate": 0.0001890691297445519, + "loss": 0.1518, + "step": 3792 + }, + { + "epoch": 0.2736750965042029, + "grad_norm": 0.11837328225374222, + "learning_rate": 0.00018906624332515516, + "loss": 0.1939, + "step": 3793 + }, + { + "epoch": 0.2737472491792633, + "grad_norm": 0.1469045728445053, + "learning_rate": 0.00018906335690575842, + "loss": 0.1756, + "step": 3794 + }, + { + "epoch": 0.27381940185432374, + "grad_norm": 0.12482008337974548, + "learning_rate": 0.00018906047048636168, + "loss": 0.1506, + "step": 3795 + }, + { + "epoch": 0.2738915545293842, + "grad_norm": 0.12701426446437836, + "learning_rate": 0.00018905758406696495, + "loss": 0.1602, + "step": 3796 + }, + { + "epoch": 0.27396370720444463, + "grad_norm": 0.10903041064739227, + "learning_rate": 0.0001890546976475682, + "loss": 0.1266, + "step": 3797 + }, + { + "epoch": 0.274035859879505, + "grad_norm": 0.09815473854541779, + "learning_rate": 0.00018905181122817147, + "loss": 0.1691, + "step": 3798 + }, + { + "epoch": 0.27410801255456546, + "grad_norm": 0.09265413880348206, + "learning_rate": 0.0001890489248087747, + "loss": 0.149, + "step": 3799 + }, + { + "epoch": 0.2741801652296259, + "grad_norm": 0.09038549661636353, + "learning_rate": 0.00018904603838937797, + "loss": 0.1739, + "step": 3800 + }, + { + "epoch": 0.2742523179046863, + "grad_norm": 0.13240946829319, + "learning_rate": 0.00018904315196998126, + "loss": 0.1932, + "step": 3801 + }, + { + "epoch": 0.27432447057974674, + "grad_norm": 0.11062067002058029, + "learning_rate": 0.00018904026555058453, + "loss": 0.1538, + "step": 3802 + }, + { + "epoch": 0.2743966232548072, + "grad_norm": 0.1425238698720932, + "learning_rate": 0.0001890373791311878, + "loss": 0.145, + "step": 3803 + }, + { + "epoch": 0.2744687759298676, + "grad_norm": 0.1070200502872467, + "learning_rate": 0.00018903449271179102, + "loss": 0.1648, + "step": 3804 + }, + { + "epoch": 0.274540928604928, + "grad_norm": 0.11594266444444656, + "learning_rate": 0.0001890316062923943, + "loss": 0.2021, + "step": 3805 + }, + { + "epoch": 0.27461308127998846, + "grad_norm": 0.10769648104906082, + "learning_rate": 0.00018902871987299755, + "loss": 0.1322, + "step": 3806 + }, + { + "epoch": 0.2746852339550489, + "grad_norm": 0.1301393359899521, + "learning_rate": 0.0001890258334536008, + "loss": 0.181, + "step": 3807 + }, + { + "epoch": 0.2747573866301093, + "grad_norm": 0.11476437002420425, + "learning_rate": 0.0001890229470342041, + "loss": 0.1417, + "step": 3808 + }, + { + "epoch": 0.27482953930516973, + "grad_norm": 0.09497443586587906, + "learning_rate": 0.00018902006061480734, + "loss": 0.1801, + "step": 3809 + }, + { + "epoch": 0.2749016919802302, + "grad_norm": 0.09713861346244812, + "learning_rate": 0.0001890171741954106, + "loss": 0.1347, + "step": 3810 + }, + { + "epoch": 0.2749738446552906, + "grad_norm": 0.13135769963264465, + "learning_rate": 0.00018901428777601386, + "loss": 0.1606, + "step": 3811 + }, + { + "epoch": 0.275045997330351, + "grad_norm": 0.09864150732755661, + "learning_rate": 0.00018901140135661713, + "loss": 0.1105, + "step": 3812 + }, + { + "epoch": 0.27511815000541145, + "grad_norm": 0.1272011399269104, + "learning_rate": 0.0001890085149372204, + "loss": 0.128, + "step": 3813 + }, + { + "epoch": 0.2751903026804719, + "grad_norm": 0.1033303365111351, + "learning_rate": 0.00018900562851782365, + "loss": 0.1681, + "step": 3814 + }, + { + "epoch": 0.2752624553555323, + "grad_norm": 0.14046432077884674, + "learning_rate": 0.00018900274209842692, + "loss": 0.1473, + "step": 3815 + }, + { + "epoch": 0.27533460803059273, + "grad_norm": 0.10199149698019028, + "learning_rate": 0.00018899985567903018, + "loss": 0.1824, + "step": 3816 + }, + { + "epoch": 0.2754067607056532, + "grad_norm": 0.1071721762418747, + "learning_rate": 0.00018899696925963344, + "loss": 0.1544, + "step": 3817 + }, + { + "epoch": 0.27547891338071356, + "grad_norm": 0.08951333910226822, + "learning_rate": 0.0001889940828402367, + "loss": 0.2149, + "step": 3818 + }, + { + "epoch": 0.275551066055774, + "grad_norm": 0.11420239508152008, + "learning_rate": 0.00018899119642083997, + "loss": 0.1265, + "step": 3819 + }, + { + "epoch": 0.27562321873083445, + "grad_norm": 0.12129448354244232, + "learning_rate": 0.0001889883100014432, + "loss": 0.1559, + "step": 3820 + }, + { + "epoch": 0.2756953714058949, + "grad_norm": 0.15682537853717804, + "learning_rate": 0.00018898542358204647, + "loss": 0.1485, + "step": 3821 + }, + { + "epoch": 0.2757675240809553, + "grad_norm": 0.10432837903499603, + "learning_rate": 0.00018898253716264973, + "loss": 0.1827, + "step": 3822 + }, + { + "epoch": 0.27583967675601573, + "grad_norm": 0.12009520828723907, + "learning_rate": 0.00018897965074325302, + "loss": 0.1801, + "step": 3823 + }, + { + "epoch": 0.27591182943107617, + "grad_norm": 0.12202169746160507, + "learning_rate": 0.00018897676432385628, + "loss": 0.1873, + "step": 3824 + }, + { + "epoch": 0.27598398210613656, + "grad_norm": 0.12997078895568848, + "learning_rate": 0.00018897387790445952, + "loss": 0.1842, + "step": 3825 + }, + { + "epoch": 0.276056134781197, + "grad_norm": 0.09522189944982529, + "learning_rate": 0.00018897099148506278, + "loss": 0.1388, + "step": 3826 + }, + { + "epoch": 0.27612828745625745, + "grad_norm": 0.09096496552228928, + "learning_rate": 0.00018896810506566604, + "loss": 0.1958, + "step": 3827 + }, + { + "epoch": 0.2762004401313179, + "grad_norm": 0.11269880086183548, + "learning_rate": 0.0001889652186462693, + "loss": 0.203, + "step": 3828 + }, + { + "epoch": 0.2762725928063783, + "grad_norm": 0.10250306874513626, + "learning_rate": 0.00018896233222687257, + "loss": 0.1931, + "step": 3829 + }, + { + "epoch": 0.2763447454814387, + "grad_norm": 0.10111036151647568, + "learning_rate": 0.00018895944580747583, + "loss": 0.1257, + "step": 3830 + }, + { + "epoch": 0.27641689815649917, + "grad_norm": 0.09969083219766617, + "learning_rate": 0.0001889565593880791, + "loss": 0.1363, + "step": 3831 + }, + { + "epoch": 0.27648905083155956, + "grad_norm": 0.12945561110973358, + "learning_rate": 0.00018895367296868236, + "loss": 0.1663, + "step": 3832 + }, + { + "epoch": 0.27656120350662, + "grad_norm": 0.11256492137908936, + "learning_rate": 0.00018895078654928562, + "loss": 0.2091, + "step": 3833 + }, + { + "epoch": 0.27663335618168045, + "grad_norm": 0.11610375344753265, + "learning_rate": 0.00018894790012988888, + "loss": 0.1667, + "step": 3834 + }, + { + "epoch": 0.2767055088567409, + "grad_norm": 0.11551795899868011, + "learning_rate": 0.00018894501371049215, + "loss": 0.1446, + "step": 3835 + }, + { + "epoch": 0.2767776615318013, + "grad_norm": 0.10832656919956207, + "learning_rate": 0.00018894212729109538, + "loss": 0.202, + "step": 3836 + }, + { + "epoch": 0.2768498142068617, + "grad_norm": 0.09568379819393158, + "learning_rate": 0.00018893924087169867, + "loss": 0.1885, + "step": 3837 + }, + { + "epoch": 0.27692196688192217, + "grad_norm": 0.09933038800954819, + "learning_rate": 0.00018893635445230194, + "loss": 0.1824, + "step": 3838 + }, + { + "epoch": 0.27699411955698255, + "grad_norm": 0.08672601729631424, + "learning_rate": 0.0001889334680329052, + "loss": 0.1698, + "step": 3839 + }, + { + "epoch": 0.277066272232043, + "grad_norm": 0.10415983200073242, + "learning_rate": 0.00018893058161350846, + "loss": 0.16, + "step": 3840 + }, + { + "epoch": 0.27713842490710344, + "grad_norm": 0.11694998294115067, + "learning_rate": 0.0001889276951941117, + "loss": 0.1328, + "step": 3841 + }, + { + "epoch": 0.2772105775821639, + "grad_norm": 0.1068284884095192, + "learning_rate": 0.00018892480877471496, + "loss": 0.1654, + "step": 3842 + }, + { + "epoch": 0.2772827302572243, + "grad_norm": 0.11396096646785736, + "learning_rate": 0.00018892192235531822, + "loss": 0.153, + "step": 3843 + }, + { + "epoch": 0.2773548829322847, + "grad_norm": 0.1199285089969635, + "learning_rate": 0.00018891903593592151, + "loss": 0.157, + "step": 3844 + }, + { + "epoch": 0.27742703560734516, + "grad_norm": 0.12460009753704071, + "learning_rate": 0.00018891614951652478, + "loss": 0.1268, + "step": 3845 + }, + { + "epoch": 0.27749918828240555, + "grad_norm": 0.12122397124767303, + "learning_rate": 0.000188913263097128, + "loss": 0.1929, + "step": 3846 + }, + { + "epoch": 0.277571340957466, + "grad_norm": 0.12009304016828537, + "learning_rate": 0.00018891037667773128, + "loss": 0.1462, + "step": 3847 + }, + { + "epoch": 0.27764349363252644, + "grad_norm": 0.1184767559170723, + "learning_rate": 0.00018890749025833454, + "loss": 0.1794, + "step": 3848 + }, + { + "epoch": 0.2777156463075868, + "grad_norm": 0.11086839437484741, + "learning_rate": 0.0001889046038389378, + "loss": 0.2225, + "step": 3849 + }, + { + "epoch": 0.27778779898264727, + "grad_norm": 0.1261485517024994, + "learning_rate": 0.00018890171741954106, + "loss": 0.1344, + "step": 3850 + }, + { + "epoch": 0.2778599516577077, + "grad_norm": 0.10495270788669586, + "learning_rate": 0.00018889883100014433, + "loss": 0.1623, + "step": 3851 + }, + { + "epoch": 0.27793210433276816, + "grad_norm": 0.10916092246770859, + "learning_rate": 0.0001888959445807476, + "loss": 0.1888, + "step": 3852 + }, + { + "epoch": 0.27800425700782855, + "grad_norm": 0.12027977406978607, + "learning_rate": 0.00018889305816135085, + "loss": 0.1618, + "step": 3853 + }, + { + "epoch": 0.278076409682889, + "grad_norm": 0.09191562980413437, + "learning_rate": 0.00018889017174195412, + "loss": 0.1426, + "step": 3854 + }, + { + "epoch": 0.27814856235794944, + "grad_norm": 0.0947212353348732, + "learning_rate": 0.00018888728532255738, + "loss": 0.12, + "step": 3855 + }, + { + "epoch": 0.2782207150330098, + "grad_norm": 0.12886327505111694, + "learning_rate": 0.00018888439890316064, + "loss": 0.1192, + "step": 3856 + }, + { + "epoch": 0.27829286770807027, + "grad_norm": 0.09498050063848495, + "learning_rate": 0.00018888151248376388, + "loss": 0.1316, + "step": 3857 + }, + { + "epoch": 0.2783650203831307, + "grad_norm": 0.1054006963968277, + "learning_rate": 0.00018887862606436717, + "loss": 0.1113, + "step": 3858 + }, + { + "epoch": 0.27843717305819116, + "grad_norm": 0.1302202194929123, + "learning_rate": 0.00018887573964497043, + "loss": 0.1784, + "step": 3859 + }, + { + "epoch": 0.27850932573325154, + "grad_norm": 0.13806942105293274, + "learning_rate": 0.0001888728532255737, + "loss": 0.1905, + "step": 3860 + }, + { + "epoch": 0.278581478408312, + "grad_norm": 0.10921463370323181, + "learning_rate": 0.00018886996680617696, + "loss": 0.1385, + "step": 3861 + }, + { + "epoch": 0.27865363108337243, + "grad_norm": 0.10545797646045685, + "learning_rate": 0.0001888670803867802, + "loss": 0.1844, + "step": 3862 + }, + { + "epoch": 0.2787257837584328, + "grad_norm": 0.10389558225870132, + "learning_rate": 0.00018886419396738346, + "loss": 0.1429, + "step": 3863 + }, + { + "epoch": 0.27879793643349327, + "grad_norm": 0.11613049358129501, + "learning_rate": 0.00018886130754798672, + "loss": 0.1973, + "step": 3864 + }, + { + "epoch": 0.2788700891085537, + "grad_norm": 0.09906083345413208, + "learning_rate": 0.00018885842112859, + "loss": 0.1599, + "step": 3865 + }, + { + "epoch": 0.27894224178361415, + "grad_norm": 0.15463513135910034, + "learning_rate": 0.00018885553470919327, + "loss": 0.1318, + "step": 3866 + }, + { + "epoch": 0.27901439445867454, + "grad_norm": 0.09531057626008987, + "learning_rate": 0.0001888526482897965, + "loss": 0.1453, + "step": 3867 + }, + { + "epoch": 0.279086547133735, + "grad_norm": 0.09789328277111053, + "learning_rate": 0.00018884976187039977, + "loss": 0.1928, + "step": 3868 + }, + { + "epoch": 0.27915869980879543, + "grad_norm": 0.12422343343496323, + "learning_rate": 0.00018884687545100303, + "loss": 0.1469, + "step": 3869 + }, + { + "epoch": 0.2792308524838558, + "grad_norm": 0.10871326178312302, + "learning_rate": 0.0001888439890316063, + "loss": 0.14, + "step": 3870 + }, + { + "epoch": 0.27930300515891626, + "grad_norm": 0.10778743028640747, + "learning_rate": 0.00018884110261220956, + "loss": 0.1523, + "step": 3871 + }, + { + "epoch": 0.2793751578339767, + "grad_norm": 0.08779001981019974, + "learning_rate": 0.00018883821619281282, + "loss": 0.1802, + "step": 3872 + }, + { + "epoch": 0.27944731050903715, + "grad_norm": 0.10399382561445236, + "learning_rate": 0.00018883532977341608, + "loss": 0.162, + "step": 3873 + }, + { + "epoch": 0.27951946318409754, + "grad_norm": 0.1193578690290451, + "learning_rate": 0.00018883244335401935, + "loss": 0.1275, + "step": 3874 + }, + { + "epoch": 0.279591615859158, + "grad_norm": 0.0955120399594307, + "learning_rate": 0.0001888295569346226, + "loss": 0.1615, + "step": 3875 + }, + { + "epoch": 0.2796637685342184, + "grad_norm": 0.13648675382137299, + "learning_rate": 0.00018882667051522587, + "loss": 0.1867, + "step": 3876 + }, + { + "epoch": 0.2797359212092788, + "grad_norm": 0.11467991769313812, + "learning_rate": 0.00018882378409582914, + "loss": 0.1679, + "step": 3877 + }, + { + "epoch": 0.27980807388433926, + "grad_norm": 0.10961662977933884, + "learning_rate": 0.00018882089767643237, + "loss": 0.1773, + "step": 3878 + }, + { + "epoch": 0.2798802265593997, + "grad_norm": 0.11874514073133469, + "learning_rate": 0.00018881801125703566, + "loss": 0.1586, + "step": 3879 + }, + { + "epoch": 0.2799523792344601, + "grad_norm": 0.13788311183452606, + "learning_rate": 0.00018881512483763892, + "loss": 0.2045, + "step": 3880 + }, + { + "epoch": 0.28002453190952054, + "grad_norm": 0.1315164566040039, + "learning_rate": 0.0001888122384182422, + "loss": 0.1471, + "step": 3881 + }, + { + "epoch": 0.280096684584581, + "grad_norm": 0.10409369319677353, + "learning_rate": 0.00018880935199884545, + "loss": 0.1833, + "step": 3882 + }, + { + "epoch": 0.2801688372596414, + "grad_norm": 0.14403219521045685, + "learning_rate": 0.00018880646557944869, + "loss": 0.166, + "step": 3883 + }, + { + "epoch": 0.2802409899347018, + "grad_norm": 0.13879208266735077, + "learning_rate": 0.00018880357916005195, + "loss": 0.1389, + "step": 3884 + }, + { + "epoch": 0.28031314260976226, + "grad_norm": 0.12207477539777756, + "learning_rate": 0.0001888006927406552, + "loss": 0.1949, + "step": 3885 + }, + { + "epoch": 0.2803852952848227, + "grad_norm": 0.10924966633319855, + "learning_rate": 0.0001887978063212585, + "loss": 0.1432, + "step": 3886 + }, + { + "epoch": 0.2804574479598831, + "grad_norm": 0.11043348908424377, + "learning_rate": 0.00018879491990186177, + "loss": 0.1715, + "step": 3887 + }, + { + "epoch": 0.28052960063494353, + "grad_norm": 0.12954466044902802, + "learning_rate": 0.000188792033482465, + "loss": 0.1936, + "step": 3888 + }, + { + "epoch": 0.280601753310004, + "grad_norm": 0.12958849966526031, + "learning_rate": 0.00018878914706306826, + "loss": 0.1475, + "step": 3889 + }, + { + "epoch": 0.2806739059850644, + "grad_norm": 0.08727343380451202, + "learning_rate": 0.00018878626064367153, + "loss": 0.1755, + "step": 3890 + }, + { + "epoch": 0.2807460586601248, + "grad_norm": 0.09146881103515625, + "learning_rate": 0.0001887833742242748, + "loss": 0.1693, + "step": 3891 + }, + { + "epoch": 0.28081821133518525, + "grad_norm": 0.1067313477396965, + "learning_rate": 0.00018878048780487805, + "loss": 0.1402, + "step": 3892 + }, + { + "epoch": 0.2808903640102457, + "grad_norm": 0.11091002076864243, + "learning_rate": 0.00018877760138548132, + "loss": 0.143, + "step": 3893 + }, + { + "epoch": 0.2809625166853061, + "grad_norm": 0.11535555124282837, + "learning_rate": 0.00018877471496608458, + "loss": 0.1776, + "step": 3894 + }, + { + "epoch": 0.28103466936036653, + "grad_norm": 0.12002919614315033, + "learning_rate": 0.00018877182854668784, + "loss": 0.1982, + "step": 3895 + }, + { + "epoch": 0.281106822035427, + "grad_norm": 0.11106644570827484, + "learning_rate": 0.0001887689421272911, + "loss": 0.1349, + "step": 3896 + }, + { + "epoch": 0.2811789747104874, + "grad_norm": 0.11119067668914795, + "learning_rate": 0.00018876605570789437, + "loss": 0.1834, + "step": 3897 + }, + { + "epoch": 0.2812511273855478, + "grad_norm": 0.10117504000663757, + "learning_rate": 0.00018876316928849763, + "loss": 0.1251, + "step": 3898 + }, + { + "epoch": 0.28132328006060825, + "grad_norm": 0.08032572269439697, + "learning_rate": 0.00018876028286910087, + "loss": 0.1559, + "step": 3899 + }, + { + "epoch": 0.2813954327356687, + "grad_norm": 0.1198943629860878, + "learning_rate": 0.00018875739644970416, + "loss": 0.1174, + "step": 3900 + }, + { + "epoch": 0.2814675854107291, + "grad_norm": 0.11611675471067429, + "learning_rate": 0.00018875451003030742, + "loss": 0.2006, + "step": 3901 + }, + { + "epoch": 0.2815397380857895, + "grad_norm": 0.14057551324367523, + "learning_rate": 0.00018875162361091068, + "loss": 0.1047, + "step": 3902 + }, + { + "epoch": 0.28161189076084997, + "grad_norm": 0.09256484359502792, + "learning_rate": 0.00018874873719151394, + "loss": 0.1624, + "step": 3903 + }, + { + "epoch": 0.2816840434359104, + "grad_norm": 0.10634186118841171, + "learning_rate": 0.0001887458507721172, + "loss": 0.1478, + "step": 3904 + }, + { + "epoch": 0.2817561961109708, + "grad_norm": 0.11302176862955093, + "learning_rate": 0.00018874296435272044, + "loss": 0.1519, + "step": 3905 + }, + { + "epoch": 0.28182834878603125, + "grad_norm": 0.12342038750648499, + "learning_rate": 0.0001887400779333237, + "loss": 0.1651, + "step": 3906 + }, + { + "epoch": 0.2819005014610917, + "grad_norm": 0.10243307799100876, + "learning_rate": 0.000188737191513927, + "loss": 0.1709, + "step": 3907 + }, + { + "epoch": 0.2819726541361521, + "grad_norm": 0.14496873319149017, + "learning_rate": 0.00018873430509453026, + "loss": 0.1658, + "step": 3908 + }, + { + "epoch": 0.2820448068112125, + "grad_norm": 0.123151034116745, + "learning_rate": 0.00018873141867513352, + "loss": 0.1899, + "step": 3909 + }, + { + "epoch": 0.28211695948627297, + "grad_norm": 0.11360703408718109, + "learning_rate": 0.00018872853225573676, + "loss": 0.1467, + "step": 3910 + }, + { + "epoch": 0.28218911216133336, + "grad_norm": 0.09966452419757843, + "learning_rate": 0.00018872564583634002, + "loss": 0.1606, + "step": 3911 + }, + { + "epoch": 0.2822612648363938, + "grad_norm": 0.1278088390827179, + "learning_rate": 0.00018872275941694328, + "loss": 0.1878, + "step": 3912 + }, + { + "epoch": 0.28233341751145424, + "grad_norm": 0.08595690876245499, + "learning_rate": 0.00018871987299754655, + "loss": 0.1312, + "step": 3913 + }, + { + "epoch": 0.2824055701865147, + "grad_norm": 0.10899940878152847, + "learning_rate": 0.00018871698657814984, + "loss": 0.1792, + "step": 3914 + }, + { + "epoch": 0.2824777228615751, + "grad_norm": 0.11173483729362488, + "learning_rate": 0.00018871410015875307, + "loss": 0.1327, + "step": 3915 + }, + { + "epoch": 0.2825498755366355, + "grad_norm": 0.08659325540065765, + "learning_rate": 0.00018871121373935634, + "loss": 0.1844, + "step": 3916 + }, + { + "epoch": 0.28262202821169596, + "grad_norm": 0.10769529640674591, + "learning_rate": 0.0001887083273199596, + "loss": 0.1749, + "step": 3917 + }, + { + "epoch": 0.28269418088675635, + "grad_norm": 0.10912948846817017, + "learning_rate": 0.00018870544090056286, + "loss": 0.2171, + "step": 3918 + }, + { + "epoch": 0.2827663335618168, + "grad_norm": 0.0865086242556572, + "learning_rate": 0.00018870255448116612, + "loss": 0.1299, + "step": 3919 + }, + { + "epoch": 0.28283848623687724, + "grad_norm": 0.11570220440626144, + "learning_rate": 0.0001886996680617694, + "loss": 0.1479, + "step": 3920 + }, + { + "epoch": 0.2829106389119377, + "grad_norm": 0.11451957374811172, + "learning_rate": 0.00018869678164237265, + "loss": 0.2033, + "step": 3921 + }, + { + "epoch": 0.2829827915869981, + "grad_norm": 0.10973011702299118, + "learning_rate": 0.0001886938952229759, + "loss": 0.1507, + "step": 3922 + }, + { + "epoch": 0.2830549442620585, + "grad_norm": 0.10192826390266418, + "learning_rate": 0.00018869100880357918, + "loss": 0.1661, + "step": 3923 + }, + { + "epoch": 0.28312709693711896, + "grad_norm": 0.11305376887321472, + "learning_rate": 0.00018868812238418244, + "loss": 0.1885, + "step": 3924 + }, + { + "epoch": 0.28319924961217935, + "grad_norm": 0.11453882604837418, + "learning_rate": 0.0001886852359647857, + "loss": 0.1129, + "step": 3925 + }, + { + "epoch": 0.2832714022872398, + "grad_norm": 0.14879728853702545, + "learning_rate": 0.00018868234954538894, + "loss": 0.1632, + "step": 3926 + }, + { + "epoch": 0.28334355496230024, + "grad_norm": 0.10178165137767792, + "learning_rate": 0.0001886794631259922, + "loss": 0.1971, + "step": 3927 + }, + { + "epoch": 0.2834157076373607, + "grad_norm": 0.1239686906337738, + "learning_rate": 0.0001886765767065955, + "loss": 0.1824, + "step": 3928 + }, + { + "epoch": 0.28348786031242107, + "grad_norm": 0.11629156023263931, + "learning_rate": 0.00018867369028719875, + "loss": 0.2027, + "step": 3929 + }, + { + "epoch": 0.2835600129874815, + "grad_norm": 0.1069471687078476, + "learning_rate": 0.00018867080386780202, + "loss": 0.2064, + "step": 3930 + }, + { + "epoch": 0.28363216566254196, + "grad_norm": 0.12007010728120804, + "learning_rate": 0.00018866791744840525, + "loss": 0.1326, + "step": 3931 + }, + { + "epoch": 0.28370431833760235, + "grad_norm": 0.0960017666220665, + "learning_rate": 0.00018866503102900852, + "loss": 0.1152, + "step": 3932 + }, + { + "epoch": 0.2837764710126628, + "grad_norm": 0.12916745245456696, + "learning_rate": 0.00018866214460961178, + "loss": 0.1586, + "step": 3933 + }, + { + "epoch": 0.28384862368772323, + "grad_norm": 0.10532090812921524, + "learning_rate": 0.00018865925819021504, + "loss": 0.1473, + "step": 3934 + }, + { + "epoch": 0.2839207763627837, + "grad_norm": 0.11855144798755646, + "learning_rate": 0.00018865637177081833, + "loss": 0.1472, + "step": 3935 + }, + { + "epoch": 0.28399292903784407, + "grad_norm": 0.09988211840391159, + "learning_rate": 0.00018865348535142157, + "loss": 0.1344, + "step": 3936 + }, + { + "epoch": 0.2840650817129045, + "grad_norm": 0.2618612051010132, + "learning_rate": 0.00018865059893202483, + "loss": 0.1355, + "step": 3937 + }, + { + "epoch": 0.28413723438796495, + "grad_norm": 0.1132231205701828, + "learning_rate": 0.0001886477125126281, + "loss": 0.1693, + "step": 3938 + }, + { + "epoch": 0.28420938706302534, + "grad_norm": 0.12109064310789108, + "learning_rate": 0.00018864482609323136, + "loss": 0.1847, + "step": 3939 + }, + { + "epoch": 0.2842815397380858, + "grad_norm": 0.13896405696868896, + "learning_rate": 0.00018864193967383462, + "loss": 0.1732, + "step": 3940 + }, + { + "epoch": 0.28435369241314623, + "grad_norm": 0.11479993909597397, + "learning_rate": 0.00018863905325443788, + "loss": 0.1557, + "step": 3941 + }, + { + "epoch": 0.2844258450882066, + "grad_norm": 0.10381763428449631, + "learning_rate": 0.00018863616683504114, + "loss": 0.1921, + "step": 3942 + }, + { + "epoch": 0.28449799776326706, + "grad_norm": 0.11239650100469589, + "learning_rate": 0.0001886332804156444, + "loss": 0.1338, + "step": 3943 + }, + { + "epoch": 0.2845701504383275, + "grad_norm": 0.11402308195829391, + "learning_rate": 0.00018863039399624767, + "loss": 0.1415, + "step": 3944 + }, + { + "epoch": 0.28464230311338795, + "grad_norm": 0.19553448259830475, + "learning_rate": 0.00018862750757685093, + "loss": 0.131, + "step": 3945 + }, + { + "epoch": 0.28471445578844834, + "grad_norm": 0.44547462463378906, + "learning_rate": 0.0001886246211574542, + "loss": 0.1869, + "step": 3946 + }, + { + "epoch": 0.2847866084635088, + "grad_norm": 0.4798113703727722, + "learning_rate": 0.00018862173473805743, + "loss": 0.1679, + "step": 3947 + }, + { + "epoch": 0.2848587611385692, + "grad_norm": 0.15806955099105835, + "learning_rate": 0.0001886188483186607, + "loss": 0.1831, + "step": 3948 + }, + { + "epoch": 0.2849309138136296, + "grad_norm": 0.24062255024909973, + "learning_rate": 0.00018861596189926399, + "loss": 0.1884, + "step": 3949 + }, + { + "epoch": 0.28500306648869006, + "grad_norm": 0.3865458071231842, + "learning_rate": 0.00018861307547986725, + "loss": 0.1539, + "step": 3950 + }, + { + "epoch": 0.2850752191637505, + "grad_norm": 0.41536572575569153, + "learning_rate": 0.0001886101890604705, + "loss": 0.12, + "step": 3951 + }, + { + "epoch": 0.28514737183881095, + "grad_norm": 0.26495084166526794, + "learning_rate": 0.00018860730264107375, + "loss": 0.1189, + "step": 3952 + }, + { + "epoch": 0.28521952451387134, + "grad_norm": 0.14461775124073029, + "learning_rate": 0.000188604416221677, + "loss": 0.1612, + "step": 3953 + }, + { + "epoch": 0.2852916771889318, + "grad_norm": 0.3562706410884857, + "learning_rate": 0.00018860152980228027, + "loss": 0.2264, + "step": 3954 + }, + { + "epoch": 0.2853638298639922, + "grad_norm": 0.1219395101070404, + "learning_rate": 0.00018859864338288354, + "loss": 0.1723, + "step": 3955 + }, + { + "epoch": 0.2854359825390526, + "grad_norm": 0.14860716462135315, + "learning_rate": 0.00018859575696348683, + "loss": 0.1644, + "step": 3956 + }, + { + "epoch": 0.28550813521411306, + "grad_norm": 0.5113199353218079, + "learning_rate": 0.00018859287054409006, + "loss": 0.1648, + "step": 3957 + }, + { + "epoch": 0.2855802878891735, + "grad_norm": 0.12189047038555145, + "learning_rate": 0.00018858998412469332, + "loss": 0.2147, + "step": 3958 + }, + { + "epoch": 0.28565244056423394, + "grad_norm": 0.10717660188674927, + "learning_rate": 0.0001885870977052966, + "loss": 0.157, + "step": 3959 + }, + { + "epoch": 0.28572459323929433, + "grad_norm": 0.18845148384571075, + "learning_rate": 0.00018858421128589985, + "loss": 0.1717, + "step": 3960 + }, + { + "epoch": 0.2857967459143548, + "grad_norm": 0.10378044098615646, + "learning_rate": 0.0001885813248665031, + "loss": 0.1588, + "step": 3961 + }, + { + "epoch": 0.2858688985894152, + "grad_norm": 0.11192762851715088, + "learning_rate": 0.00018857843844710638, + "loss": 0.186, + "step": 3962 + }, + { + "epoch": 0.2859410512644756, + "grad_norm": 0.10364487767219543, + "learning_rate": 0.00018857555202770964, + "loss": 0.1886, + "step": 3963 + }, + { + "epoch": 0.28601320393953605, + "grad_norm": 0.11991937458515167, + "learning_rate": 0.0001885726656083129, + "loss": 0.1597, + "step": 3964 + }, + { + "epoch": 0.2860853566145965, + "grad_norm": 0.11884426325559616, + "learning_rate": 0.00018856977918891616, + "loss": 0.1297, + "step": 3965 + }, + { + "epoch": 0.28615750928965694, + "grad_norm": 0.16260336339473724, + "learning_rate": 0.00018856689276951943, + "loss": 0.1478, + "step": 3966 + }, + { + "epoch": 0.28622966196471733, + "grad_norm": 0.12283408641815186, + "learning_rate": 0.0001885640063501227, + "loss": 0.1809, + "step": 3967 + }, + { + "epoch": 0.2863018146397778, + "grad_norm": 0.1470719277858734, + "learning_rate": 0.00018856111993072593, + "loss": 0.1567, + "step": 3968 + }, + { + "epoch": 0.2863739673148382, + "grad_norm": 0.13070467114448547, + "learning_rate": 0.0001885582335113292, + "loss": 0.1354, + "step": 3969 + }, + { + "epoch": 0.2864461199898986, + "grad_norm": 0.13749848306179047, + "learning_rate": 0.00018855534709193248, + "loss": 0.1292, + "step": 3970 + }, + { + "epoch": 0.28651827266495905, + "grad_norm": 0.09411299228668213, + "learning_rate": 0.00018855246067253574, + "loss": 0.1691, + "step": 3971 + }, + { + "epoch": 0.2865904253400195, + "grad_norm": 0.12227313220500946, + "learning_rate": 0.000188549574253139, + "loss": 0.1288, + "step": 3972 + }, + { + "epoch": 0.2866625780150799, + "grad_norm": 0.10804979503154755, + "learning_rate": 0.00018854668783374224, + "loss": 0.1803, + "step": 3973 + }, + { + "epoch": 0.2867347306901403, + "grad_norm": 0.10891221463680267, + "learning_rate": 0.0001885438014143455, + "loss": 0.1358, + "step": 3974 + }, + { + "epoch": 0.28680688336520077, + "grad_norm": 0.10148027539253235, + "learning_rate": 0.00018854091499494877, + "loss": 0.1354, + "step": 3975 + }, + { + "epoch": 0.2868790360402612, + "grad_norm": 0.11599002033472061, + "learning_rate": 0.00018853802857555203, + "loss": 0.1266, + "step": 3976 + }, + { + "epoch": 0.2869511887153216, + "grad_norm": 0.10220354050397873, + "learning_rate": 0.00018853514215615532, + "loss": 0.1739, + "step": 3977 + }, + { + "epoch": 0.28702334139038205, + "grad_norm": 0.13676996529102325, + "learning_rate": 0.00018853225573675856, + "loss": 0.1635, + "step": 3978 + }, + { + "epoch": 0.2870954940654425, + "grad_norm": 0.108549565076828, + "learning_rate": 0.00018852936931736182, + "loss": 0.1409, + "step": 3979 + }, + { + "epoch": 0.2871676467405029, + "grad_norm": 0.11575684696435928, + "learning_rate": 0.00018852648289796508, + "loss": 0.1629, + "step": 3980 + }, + { + "epoch": 0.2872397994155633, + "grad_norm": 0.10409197956323624, + "learning_rate": 0.00018852359647856834, + "loss": 0.1423, + "step": 3981 + }, + { + "epoch": 0.28731195209062377, + "grad_norm": 0.12796637415885925, + "learning_rate": 0.0001885207100591716, + "loss": 0.1709, + "step": 3982 + }, + { + "epoch": 0.2873841047656842, + "grad_norm": 0.13579745590686798, + "learning_rate": 0.00018851782363977487, + "loss": 0.1449, + "step": 3983 + }, + { + "epoch": 0.2874562574407446, + "grad_norm": 0.16057123243808746, + "learning_rate": 0.00018851493722037813, + "loss": 0.1708, + "step": 3984 + }, + { + "epoch": 0.28752841011580504, + "grad_norm": 0.12024898827075958, + "learning_rate": 0.0001885120508009814, + "loss": 0.1512, + "step": 3985 + }, + { + "epoch": 0.2876005627908655, + "grad_norm": 0.13466006517410278, + "learning_rate": 0.00018850916438158466, + "loss": 0.1547, + "step": 3986 + }, + { + "epoch": 0.2876727154659259, + "grad_norm": 0.2027030736207962, + "learning_rate": 0.00018850627796218792, + "loss": 0.2007, + "step": 3987 + }, + { + "epoch": 0.2877448681409863, + "grad_norm": 0.10973954200744629, + "learning_rate": 0.00018850339154279118, + "loss": 0.1728, + "step": 3988 + }, + { + "epoch": 0.28781702081604676, + "grad_norm": 0.10212751477956772, + "learning_rate": 0.00018850050512339442, + "loss": 0.203, + "step": 3989 + }, + { + "epoch": 0.2878891734911072, + "grad_norm": 0.10394073277711868, + "learning_rate": 0.00018849761870399768, + "loss": 0.195, + "step": 3990 + }, + { + "epoch": 0.2879613261661676, + "grad_norm": 0.15195779502391815, + "learning_rate": 0.00018849473228460097, + "loss": 0.1347, + "step": 3991 + }, + { + "epoch": 0.28803347884122804, + "grad_norm": 0.1137838140130043, + "learning_rate": 0.00018849184586520424, + "loss": 0.1627, + "step": 3992 + }, + { + "epoch": 0.2881056315162885, + "grad_norm": 1.0346198081970215, + "learning_rate": 0.0001884889594458075, + "loss": 0.1806, + "step": 3993 + }, + { + "epoch": 0.2881777841913489, + "grad_norm": 0.12991052865982056, + "learning_rate": 0.00018848607302641074, + "loss": 0.1341, + "step": 3994 + }, + { + "epoch": 0.2882499368664093, + "grad_norm": 0.11583830416202545, + "learning_rate": 0.000188483186607014, + "loss": 0.135, + "step": 3995 + }, + { + "epoch": 0.28832208954146976, + "grad_norm": 2.006385087966919, + "learning_rate": 0.00018848030018761726, + "loss": 0.1573, + "step": 3996 + }, + { + "epoch": 0.2883942422165302, + "grad_norm": 0.24502481520175934, + "learning_rate": 0.00018847741376822052, + "loss": 0.1416, + "step": 3997 + }, + { + "epoch": 0.2884663948915906, + "grad_norm": 0.6196821928024292, + "learning_rate": 0.00018847452734882381, + "loss": 0.1717, + "step": 3998 + }, + { + "epoch": 0.28853854756665104, + "grad_norm": 0.23879577219486237, + "learning_rate": 0.00018847164092942705, + "loss": 0.1534, + "step": 3999 + }, + { + "epoch": 0.2886107002417115, + "grad_norm": 0.13388881087303162, + "learning_rate": 0.0001884687545100303, + "loss": 0.1506, + "step": 4000 + }, + { + "epoch": 0.28868285291677187, + "grad_norm": 0.11309870332479477, + "learning_rate": 0.00018846586809063358, + "loss": 0.1596, + "step": 4001 + }, + { + "epoch": 0.2887550055918323, + "grad_norm": 0.12458118051290512, + "learning_rate": 0.00018846298167123684, + "loss": 0.1693, + "step": 4002 + }, + { + "epoch": 0.28882715826689276, + "grad_norm": 0.1627064347267151, + "learning_rate": 0.0001884600952518401, + "loss": 0.1911, + "step": 4003 + }, + { + "epoch": 0.28889931094195315, + "grad_norm": 0.13531793653964996, + "learning_rate": 0.00018845720883244336, + "loss": 0.1798, + "step": 4004 + }, + { + "epoch": 0.2889714636170136, + "grad_norm": 0.235118567943573, + "learning_rate": 0.00018845432241304663, + "loss": 0.1795, + "step": 4005 + }, + { + "epoch": 0.28904361629207403, + "grad_norm": 0.13340897858142853, + "learning_rate": 0.0001884514359936499, + "loss": 0.13, + "step": 4006 + }, + { + "epoch": 0.2891157689671345, + "grad_norm": 0.13717308640480042, + "learning_rate": 0.00018844854957425315, + "loss": 0.1247, + "step": 4007 + }, + { + "epoch": 0.28918792164219487, + "grad_norm": 0.12918154895305634, + "learning_rate": 0.00018844566315485642, + "loss": 0.1828, + "step": 4008 + }, + { + "epoch": 0.2892600743172553, + "grad_norm": 0.12834350764751434, + "learning_rate": 0.00018844277673545968, + "loss": 0.1492, + "step": 4009 + }, + { + "epoch": 0.28933222699231576, + "grad_norm": 0.1537393033504486, + "learning_rate": 0.00018843989031606291, + "loss": 0.1459, + "step": 4010 + }, + { + "epoch": 0.28940437966737614, + "grad_norm": 0.2367478758096695, + "learning_rate": 0.00018843700389666618, + "loss": 0.1588, + "step": 4011 + }, + { + "epoch": 0.2894765323424366, + "grad_norm": 0.1759425699710846, + "learning_rate": 0.00018843411747726947, + "loss": 0.1786, + "step": 4012 + }, + { + "epoch": 0.28954868501749703, + "grad_norm": 0.12869958579540253, + "learning_rate": 0.00018843123105787273, + "loss": 0.17, + "step": 4013 + }, + { + "epoch": 0.2896208376925575, + "grad_norm": 0.13942141830921173, + "learning_rate": 0.000188428344638476, + "loss": 0.1487, + "step": 4014 + }, + { + "epoch": 0.28969299036761786, + "grad_norm": 0.13664652407169342, + "learning_rate": 0.00018842545821907923, + "loss": 0.1811, + "step": 4015 + }, + { + "epoch": 0.2897651430426783, + "grad_norm": 0.15287500619888306, + "learning_rate": 0.0001884225717996825, + "loss": 0.2197, + "step": 4016 + }, + { + "epoch": 0.28983729571773875, + "grad_norm": 0.11458779126405716, + "learning_rate": 0.00018841968538028576, + "loss": 0.1378, + "step": 4017 + }, + { + "epoch": 0.28990944839279914, + "grad_norm": 0.1235150545835495, + "learning_rate": 0.00018841679896088902, + "loss": 0.156, + "step": 4018 + }, + { + "epoch": 0.2899816010678596, + "grad_norm": 0.17336517572402954, + "learning_rate": 0.00018841391254149228, + "loss": 0.159, + "step": 4019 + }, + { + "epoch": 0.29005375374292003, + "grad_norm": 0.1811176836490631, + "learning_rate": 0.00018841102612209554, + "loss": 0.1585, + "step": 4020 + }, + { + "epoch": 0.2901259064179805, + "grad_norm": 0.11761893332004547, + "learning_rate": 0.0001884081397026988, + "loss": 0.1588, + "step": 4021 + }, + { + "epoch": 0.29019805909304086, + "grad_norm": 0.12895290553569794, + "learning_rate": 0.00018840525328330207, + "loss": 0.1059, + "step": 4022 + }, + { + "epoch": 0.2902702117681013, + "grad_norm": 0.1484558880329132, + "learning_rate": 0.00018840236686390533, + "loss": 0.1516, + "step": 4023 + }, + { + "epoch": 0.29034236444316175, + "grad_norm": 0.11516424268484116, + "learning_rate": 0.0001883994804445086, + "loss": 0.1209, + "step": 4024 + }, + { + "epoch": 0.29041451711822214, + "grad_norm": 0.17815934121608734, + "learning_rate": 0.00018839659402511186, + "loss": 0.1315, + "step": 4025 + }, + { + "epoch": 0.2904866697932826, + "grad_norm": 0.12998010218143463, + "learning_rate": 0.00018839370760571512, + "loss": 0.1297, + "step": 4026 + }, + { + "epoch": 0.290558822468343, + "grad_norm": 0.15977595746517181, + "learning_rate": 0.00018839082118631838, + "loss": 0.1891, + "step": 4027 + }, + { + "epoch": 0.29063097514340347, + "grad_norm": 0.12049231678247452, + "learning_rate": 0.00018838793476692165, + "loss": 0.1191, + "step": 4028 + }, + { + "epoch": 0.29070312781846386, + "grad_norm": 0.14954523742198944, + "learning_rate": 0.0001883850483475249, + "loss": 0.1315, + "step": 4029 + }, + { + "epoch": 0.2907752804935243, + "grad_norm": 0.1778295636177063, + "learning_rate": 0.00018838216192812817, + "loss": 0.1761, + "step": 4030 + }, + { + "epoch": 0.29084743316858475, + "grad_norm": 0.1837385594844818, + "learning_rate": 0.00018837927550873144, + "loss": 0.168, + "step": 4031 + }, + { + "epoch": 0.29091958584364513, + "grad_norm": 0.12899182736873627, + "learning_rate": 0.00018837638908933467, + "loss": 0.1394, + "step": 4032 + }, + { + "epoch": 0.2909917385187056, + "grad_norm": 0.15641450881958008, + "learning_rate": 0.00018837350266993793, + "loss": 0.1565, + "step": 4033 + }, + { + "epoch": 0.291063891193766, + "grad_norm": 0.12071597576141357, + "learning_rate": 0.00018837061625054122, + "loss": 0.1602, + "step": 4034 + }, + { + "epoch": 0.2911360438688264, + "grad_norm": 0.12751080095767975, + "learning_rate": 0.0001883677298311445, + "loss": 0.1713, + "step": 4035 + }, + { + "epoch": 0.29120819654388685, + "grad_norm": 0.13059987127780914, + "learning_rate": 0.00018836484341174775, + "loss": 0.1678, + "step": 4036 + }, + { + "epoch": 0.2912803492189473, + "grad_norm": 0.11258640140295029, + "learning_rate": 0.000188361956992351, + "loss": 0.1422, + "step": 4037 + }, + { + "epoch": 0.29135250189400774, + "grad_norm": 0.12477847933769226, + "learning_rate": 0.00018835907057295425, + "loss": 0.1751, + "step": 4038 + }, + { + "epoch": 0.29142465456906813, + "grad_norm": 0.10548390448093414, + "learning_rate": 0.0001883561841535575, + "loss": 0.1278, + "step": 4039 + }, + { + "epoch": 0.2914968072441286, + "grad_norm": 0.11356746405363083, + "learning_rate": 0.00018835329773416078, + "loss": 0.186, + "step": 4040 + }, + { + "epoch": 0.291568959919189, + "grad_norm": 0.11771997064352036, + "learning_rate": 0.00018835041131476407, + "loss": 0.1914, + "step": 4041 + }, + { + "epoch": 0.2916411125942494, + "grad_norm": 0.10375703871250153, + "learning_rate": 0.0001883475248953673, + "loss": 0.1668, + "step": 4042 + }, + { + "epoch": 0.29171326526930985, + "grad_norm": 0.1297137439250946, + "learning_rate": 0.00018834463847597056, + "loss": 0.1757, + "step": 4043 + }, + { + "epoch": 0.2917854179443703, + "grad_norm": 0.122823566198349, + "learning_rate": 0.00018834175205657383, + "loss": 0.1312, + "step": 4044 + }, + { + "epoch": 0.29185757061943074, + "grad_norm": 0.11158803850412369, + "learning_rate": 0.0001883388656371771, + "loss": 0.1748, + "step": 4045 + }, + { + "epoch": 0.29192972329449113, + "grad_norm": 0.12954041361808777, + "learning_rate": 0.00018833597921778035, + "loss": 0.1422, + "step": 4046 + }, + { + "epoch": 0.29200187596955157, + "grad_norm": 0.11176861077547073, + "learning_rate": 0.00018833309279838362, + "loss": 0.1899, + "step": 4047 + }, + { + "epoch": 0.292074028644612, + "grad_norm": 0.13838274776935577, + "learning_rate": 0.00018833020637898688, + "loss": 0.1687, + "step": 4048 + }, + { + "epoch": 0.2921461813196724, + "grad_norm": 0.11651098728179932, + "learning_rate": 0.00018832731995959014, + "loss": 0.1347, + "step": 4049 + }, + { + "epoch": 0.29221833399473285, + "grad_norm": 0.1457776576280594, + "learning_rate": 0.0001883244335401934, + "loss": 0.1436, + "step": 4050 + }, + { + "epoch": 0.2922904866697933, + "grad_norm": 0.16298915445804596, + "learning_rate": 0.00018832154712079667, + "loss": 0.1437, + "step": 4051 + }, + { + "epoch": 0.29236263934485374, + "grad_norm": 0.12889905273914337, + "learning_rate": 0.00018831866070139993, + "loss": 0.1518, + "step": 4052 + }, + { + "epoch": 0.2924347920199141, + "grad_norm": 0.102033831179142, + "learning_rate": 0.00018831577428200317, + "loss": 0.1563, + "step": 4053 + }, + { + "epoch": 0.29250694469497457, + "grad_norm": 0.1378796100616455, + "learning_rate": 0.00018831288786260643, + "loss": 0.16, + "step": 4054 + }, + { + "epoch": 0.292579097370035, + "grad_norm": 0.12615390121936798, + "learning_rate": 0.00018831000144320972, + "loss": 0.1797, + "step": 4055 + }, + { + "epoch": 0.2926512500450954, + "grad_norm": 0.12225451320409775, + "learning_rate": 0.00018830711502381298, + "loss": 0.1335, + "step": 4056 + }, + { + "epoch": 0.29272340272015585, + "grad_norm": 0.1552281379699707, + "learning_rate": 0.00018830422860441625, + "loss": 0.1351, + "step": 4057 + }, + { + "epoch": 0.2927955553952163, + "grad_norm": 0.17103096842765808, + "learning_rate": 0.00018830134218501948, + "loss": 0.1857, + "step": 4058 + }, + { + "epoch": 0.29286770807027673, + "grad_norm": 0.14693373441696167, + "learning_rate": 0.00018829845576562274, + "loss": 0.1428, + "step": 4059 + }, + { + "epoch": 0.2929398607453371, + "grad_norm": 0.13027967512607574, + "learning_rate": 0.000188295569346226, + "loss": 0.1567, + "step": 4060 + }, + { + "epoch": 0.29301201342039757, + "grad_norm": 0.09253823757171631, + "learning_rate": 0.00018829268292682927, + "loss": 0.1417, + "step": 4061 + }, + { + "epoch": 0.293084166095458, + "grad_norm": 0.10826113075017929, + "learning_rate": 0.00018828979650743256, + "loss": 0.1642, + "step": 4062 + }, + { + "epoch": 0.2931563187705184, + "grad_norm": 0.12423907965421677, + "learning_rate": 0.0001882869100880358, + "loss": 0.159, + "step": 4063 + }, + { + "epoch": 0.29322847144557884, + "grad_norm": 0.10775986313819885, + "learning_rate": 0.00018828402366863906, + "loss": 0.1565, + "step": 4064 + }, + { + "epoch": 0.2933006241206393, + "grad_norm": 0.12115251272916794, + "learning_rate": 0.00018828113724924232, + "loss": 0.1536, + "step": 4065 + }, + { + "epoch": 0.2933727767956997, + "grad_norm": 0.10422796756029129, + "learning_rate": 0.00018827825082984558, + "loss": 0.1733, + "step": 4066 + }, + { + "epoch": 0.2934449294707601, + "grad_norm": 0.1318529099225998, + "learning_rate": 0.00018827536441044885, + "loss": 0.187, + "step": 4067 + }, + { + "epoch": 0.29351708214582056, + "grad_norm": 0.10229651629924774, + "learning_rate": 0.0001882724779910521, + "loss": 0.149, + "step": 4068 + }, + { + "epoch": 0.293589234820881, + "grad_norm": 0.17193888127803802, + "learning_rate": 0.00018826959157165537, + "loss": 0.2011, + "step": 4069 + }, + { + "epoch": 0.2936613874959414, + "grad_norm": 0.11080505698919296, + "learning_rate": 0.00018826670515225864, + "loss": 0.1973, + "step": 4070 + }, + { + "epoch": 0.29373354017100184, + "grad_norm": 0.164872944355011, + "learning_rate": 0.0001882638187328619, + "loss": 0.177, + "step": 4071 + }, + { + "epoch": 0.2938056928460623, + "grad_norm": 0.12402703613042831, + "learning_rate": 0.00018826093231346516, + "loss": 0.1304, + "step": 4072 + }, + { + "epoch": 0.29387784552112267, + "grad_norm": 0.12431920319795609, + "learning_rate": 0.00018825804589406842, + "loss": 0.1985, + "step": 4073 + }, + { + "epoch": 0.2939499981961831, + "grad_norm": 0.11424025893211365, + "learning_rate": 0.00018825515947467166, + "loss": 0.166, + "step": 4074 + }, + { + "epoch": 0.29402215087124356, + "grad_norm": 0.1481512039899826, + "learning_rate": 0.00018825227305527492, + "loss": 0.1308, + "step": 4075 + }, + { + "epoch": 0.294094303546304, + "grad_norm": 0.13233418762683868, + "learning_rate": 0.0001882493866358782, + "loss": 0.1528, + "step": 4076 + }, + { + "epoch": 0.2941664562213644, + "grad_norm": 0.1373155415058136, + "learning_rate": 0.00018824650021648148, + "loss": 0.1981, + "step": 4077 + }, + { + "epoch": 0.29423860889642484, + "grad_norm": 0.15324561297893524, + "learning_rate": 0.00018824361379708474, + "loss": 0.1932, + "step": 4078 + }, + { + "epoch": 0.2943107615714853, + "grad_norm": 0.10070636123418808, + "learning_rate": 0.00018824072737768798, + "loss": 0.1665, + "step": 4079 + }, + { + "epoch": 0.29438291424654567, + "grad_norm": 0.14961004257202148, + "learning_rate": 0.00018823784095829124, + "loss": 0.166, + "step": 4080 + }, + { + "epoch": 0.2944550669216061, + "grad_norm": 0.11874378472566605, + "learning_rate": 0.0001882349545388945, + "loss": 0.1582, + "step": 4081 + }, + { + "epoch": 0.29452721959666656, + "grad_norm": 0.11802510917186737, + "learning_rate": 0.00018823206811949776, + "loss": 0.1271, + "step": 4082 + }, + { + "epoch": 0.294599372271727, + "grad_norm": 0.1156727597117424, + "learning_rate": 0.00018822918170010105, + "loss": 0.1494, + "step": 4083 + }, + { + "epoch": 0.2946715249467874, + "grad_norm": 0.13092243671417236, + "learning_rate": 0.0001882262952807043, + "loss": 0.1736, + "step": 4084 + }, + { + "epoch": 0.29474367762184783, + "grad_norm": 0.1102990210056305, + "learning_rate": 0.00018822340886130755, + "loss": 0.1765, + "step": 4085 + }, + { + "epoch": 0.2948158302969083, + "grad_norm": 0.13617543876171112, + "learning_rate": 0.00018822052244191082, + "loss": 0.1995, + "step": 4086 + }, + { + "epoch": 0.29488798297196867, + "grad_norm": 0.17690233886241913, + "learning_rate": 0.00018821763602251408, + "loss": 0.1253, + "step": 4087 + }, + { + "epoch": 0.2949601356470291, + "grad_norm": 0.1378060132265091, + "learning_rate": 0.00018821474960311734, + "loss": 0.1225, + "step": 4088 + }, + { + "epoch": 0.29503228832208955, + "grad_norm": 0.10630947351455688, + "learning_rate": 0.0001882118631837206, + "loss": 0.0867, + "step": 4089 + }, + { + "epoch": 0.29510444099714994, + "grad_norm": 0.17138361930847168, + "learning_rate": 0.00018820897676432387, + "loss": 0.1546, + "step": 4090 + }, + { + "epoch": 0.2951765936722104, + "grad_norm": 0.1295410692691803, + "learning_rate": 0.00018820609034492713, + "loss": 0.194, + "step": 4091 + }, + { + "epoch": 0.29524874634727083, + "grad_norm": 0.13298285007476807, + "learning_rate": 0.0001882032039255304, + "loss": 0.1998, + "step": 4092 + }, + { + "epoch": 0.2953208990223313, + "grad_norm": 0.11028098315000534, + "learning_rate": 0.00018820031750613366, + "loss": 0.1298, + "step": 4093 + }, + { + "epoch": 0.29539305169739166, + "grad_norm": 0.11302671581506729, + "learning_rate": 0.00018819743108673692, + "loss": 0.107, + "step": 4094 + }, + { + "epoch": 0.2954652043724521, + "grad_norm": 0.12494118511676788, + "learning_rate": 0.00018819454466734015, + "loss": 0.1429, + "step": 4095 + }, + { + "epoch": 0.29553735704751255, + "grad_norm": 0.15362714231014252, + "learning_rate": 0.00018819165824794342, + "loss": 0.1242, + "step": 4096 + }, + { + "epoch": 0.29560950972257294, + "grad_norm": 0.12694497406482697, + "learning_rate": 0.0001881887718285467, + "loss": 0.1348, + "step": 4097 + }, + { + "epoch": 0.2956816623976334, + "grad_norm": 0.10499429702758789, + "learning_rate": 0.00018818588540914997, + "loss": 0.1069, + "step": 4098 + }, + { + "epoch": 0.2957538150726938, + "grad_norm": 0.11628948152065277, + "learning_rate": 0.00018818299898975323, + "loss": 0.1158, + "step": 4099 + }, + { + "epoch": 0.29582596774775427, + "grad_norm": 0.09497885406017303, + "learning_rate": 0.00018818011257035647, + "loss": 0.2063, + "step": 4100 + }, + { + "epoch": 0.29589812042281466, + "grad_norm": 0.11997506767511368, + "learning_rate": 0.00018817722615095973, + "loss": 0.1884, + "step": 4101 + }, + { + "epoch": 0.2959702730978751, + "grad_norm": 0.11558733880519867, + "learning_rate": 0.000188174339731563, + "loss": 0.1529, + "step": 4102 + }, + { + "epoch": 0.29604242577293555, + "grad_norm": 0.11531398445367813, + "learning_rate": 0.00018817145331216626, + "loss": 0.1241, + "step": 4103 + }, + { + "epoch": 0.29611457844799594, + "grad_norm": 0.11987558007240295, + "learning_rate": 0.00018816856689276955, + "loss": 0.1177, + "step": 4104 + }, + { + "epoch": 0.2961867311230564, + "grad_norm": 0.13128049671649933, + "learning_rate": 0.00018816568047337278, + "loss": 0.1799, + "step": 4105 + }, + { + "epoch": 0.2962588837981168, + "grad_norm": 0.1083531603217125, + "learning_rate": 0.00018816279405397605, + "loss": 0.1408, + "step": 4106 + }, + { + "epoch": 0.29633103647317727, + "grad_norm": 0.14563891291618347, + "learning_rate": 0.0001881599076345793, + "loss": 0.1904, + "step": 4107 + }, + { + "epoch": 0.29640318914823766, + "grad_norm": 0.1262306571006775, + "learning_rate": 0.00018815702121518257, + "loss": 0.175, + "step": 4108 + }, + { + "epoch": 0.2964753418232981, + "grad_norm": 0.11391209810972214, + "learning_rate": 0.00018815413479578584, + "loss": 0.1828, + "step": 4109 + }, + { + "epoch": 0.29654749449835854, + "grad_norm": 0.1466747671365738, + "learning_rate": 0.0001881512483763891, + "loss": 0.1566, + "step": 4110 + }, + { + "epoch": 0.29661964717341893, + "grad_norm": 0.09921132773160934, + "learning_rate": 0.00018814836195699236, + "loss": 0.2183, + "step": 4111 + }, + { + "epoch": 0.2966917998484794, + "grad_norm": 0.1014072522521019, + "learning_rate": 0.00018814547553759562, + "loss": 0.1362, + "step": 4112 + }, + { + "epoch": 0.2967639525235398, + "grad_norm": 0.1419476717710495, + "learning_rate": 0.0001881425891181989, + "loss": 0.1797, + "step": 4113 + }, + { + "epoch": 0.29683610519860026, + "grad_norm": 0.14102686941623688, + "learning_rate": 0.00018813970269880215, + "loss": 0.1651, + "step": 4114 + }, + { + "epoch": 0.29690825787366065, + "grad_norm": 0.11882190406322479, + "learning_rate": 0.0001881368162794054, + "loss": 0.1252, + "step": 4115 + }, + { + "epoch": 0.2969804105487211, + "grad_norm": 0.10842740535736084, + "learning_rate": 0.00018813392986000865, + "loss": 0.1866, + "step": 4116 + }, + { + "epoch": 0.29705256322378154, + "grad_norm": 0.12704412639141083, + "learning_rate": 0.0001881310434406119, + "loss": 0.1764, + "step": 4117 + }, + { + "epoch": 0.29712471589884193, + "grad_norm": 0.09052251279354095, + "learning_rate": 0.0001881281570212152, + "loss": 0.1465, + "step": 4118 + }, + { + "epoch": 0.2971968685739024, + "grad_norm": 0.10798463970422745, + "learning_rate": 0.00018812527060181846, + "loss": 0.1193, + "step": 4119 + }, + { + "epoch": 0.2972690212489628, + "grad_norm": 0.149445578455925, + "learning_rate": 0.00018812238418242173, + "loss": 0.1578, + "step": 4120 + }, + { + "epoch": 0.2973411739240232, + "grad_norm": 0.10103411972522736, + "learning_rate": 0.00018811949776302496, + "loss": 0.1184, + "step": 4121 + }, + { + "epoch": 0.29741332659908365, + "grad_norm": 0.14422334730625153, + "learning_rate": 0.00018811661134362823, + "loss": 0.1892, + "step": 4122 + }, + { + "epoch": 0.2974854792741441, + "grad_norm": 0.14310158789157867, + "learning_rate": 0.0001881137249242315, + "loss": 0.1733, + "step": 4123 + }, + { + "epoch": 0.29755763194920454, + "grad_norm": 0.1744057536125183, + "learning_rate": 0.00018811083850483475, + "loss": 0.1628, + "step": 4124 + }, + { + "epoch": 0.2976297846242649, + "grad_norm": 0.16436494886875153, + "learning_rate": 0.00018810795208543804, + "loss": 0.1426, + "step": 4125 + }, + { + "epoch": 0.29770193729932537, + "grad_norm": 0.10014980286359787, + "learning_rate": 0.00018810506566604128, + "loss": 0.1185, + "step": 4126 + }, + { + "epoch": 0.2977740899743858, + "grad_norm": 0.12728647887706757, + "learning_rate": 0.00018810217924664454, + "loss": 0.1495, + "step": 4127 + }, + { + "epoch": 0.2978462426494462, + "grad_norm": 0.09811389446258545, + "learning_rate": 0.0001880992928272478, + "loss": 0.1469, + "step": 4128 + }, + { + "epoch": 0.29791839532450665, + "grad_norm": 0.10743153095245361, + "learning_rate": 0.00018809640640785107, + "loss": 0.1497, + "step": 4129 + }, + { + "epoch": 0.2979905479995671, + "grad_norm": 0.11010617017745972, + "learning_rate": 0.00018809351998845433, + "loss": 0.1747, + "step": 4130 + }, + { + "epoch": 0.29806270067462753, + "grad_norm": 0.11767455190420151, + "learning_rate": 0.0001880906335690576, + "loss": 0.1553, + "step": 4131 + }, + { + "epoch": 0.2981348533496879, + "grad_norm": 0.13255800306797028, + "learning_rate": 0.00018808774714966086, + "loss": 0.1649, + "step": 4132 + }, + { + "epoch": 0.29820700602474837, + "grad_norm": 0.10767663270235062, + "learning_rate": 0.00018808486073026412, + "loss": 0.2007, + "step": 4133 + }, + { + "epoch": 0.2982791586998088, + "grad_norm": 0.11623447388410568, + "learning_rate": 0.00018808197431086738, + "loss": 0.1141, + "step": 4134 + }, + { + "epoch": 0.2983513113748692, + "grad_norm": 0.12203383445739746, + "learning_rate": 0.00018807908789147064, + "loss": 0.1472, + "step": 4135 + }, + { + "epoch": 0.29842346404992964, + "grad_norm": 0.1434355527162552, + "learning_rate": 0.0001880762014720739, + "loss": 0.2322, + "step": 4136 + }, + { + "epoch": 0.2984956167249901, + "grad_norm": 0.11646952480077744, + "learning_rate": 0.00018807331505267717, + "loss": 0.1597, + "step": 4137 + }, + { + "epoch": 0.29856776940005053, + "grad_norm": 0.12211567163467407, + "learning_rate": 0.0001880704286332804, + "loss": 0.1774, + "step": 4138 + }, + { + "epoch": 0.2986399220751109, + "grad_norm": 0.09796212613582611, + "learning_rate": 0.0001880675422138837, + "loss": 0.1474, + "step": 4139 + }, + { + "epoch": 0.29871207475017136, + "grad_norm": 0.11343994736671448, + "learning_rate": 0.00018806465579448696, + "loss": 0.1741, + "step": 4140 + }, + { + "epoch": 0.2987842274252318, + "grad_norm": 0.10891371220350266, + "learning_rate": 0.00018806176937509022, + "loss": 0.1269, + "step": 4141 + }, + { + "epoch": 0.2988563801002922, + "grad_norm": 0.1293962597846985, + "learning_rate": 0.00018805888295569348, + "loss": 0.2021, + "step": 4142 + }, + { + "epoch": 0.29892853277535264, + "grad_norm": 0.10513212531805038, + "learning_rate": 0.00018805599653629672, + "loss": 0.1557, + "step": 4143 + }, + { + "epoch": 0.2990006854504131, + "grad_norm": 0.10921186208724976, + "learning_rate": 0.00018805311011689998, + "loss": 0.1242, + "step": 4144 + }, + { + "epoch": 0.29907283812547353, + "grad_norm": 0.11698474735021591, + "learning_rate": 0.00018805022369750325, + "loss": 0.1721, + "step": 4145 + }, + { + "epoch": 0.2991449908005339, + "grad_norm": 0.10917928814888, + "learning_rate": 0.00018804733727810654, + "loss": 0.1267, + "step": 4146 + }, + { + "epoch": 0.29921714347559436, + "grad_norm": 0.10929599404335022, + "learning_rate": 0.0001880444508587098, + "loss": 0.1566, + "step": 4147 + }, + { + "epoch": 0.2992892961506548, + "grad_norm": 0.12578366696834564, + "learning_rate": 0.00018804156443931304, + "loss": 0.1473, + "step": 4148 + }, + { + "epoch": 0.2993614488257152, + "grad_norm": 0.11903616040945053, + "learning_rate": 0.0001880386780199163, + "loss": 0.1594, + "step": 4149 + }, + { + "epoch": 0.29943360150077564, + "grad_norm": 0.1281002014875412, + "learning_rate": 0.00018803579160051956, + "loss": 0.1956, + "step": 4150 + }, + { + "epoch": 0.2995057541758361, + "grad_norm": 0.1199919581413269, + "learning_rate": 0.00018803290518112282, + "loss": 0.1469, + "step": 4151 + }, + { + "epoch": 0.29957790685089647, + "grad_norm": 0.11935439705848694, + "learning_rate": 0.0001880300187617261, + "loss": 0.1396, + "step": 4152 + }, + { + "epoch": 0.2996500595259569, + "grad_norm": 0.11172063648700714, + "learning_rate": 0.00018802713234232935, + "loss": 0.147, + "step": 4153 + }, + { + "epoch": 0.29972221220101736, + "grad_norm": 0.11805318295955658, + "learning_rate": 0.0001880242459229326, + "loss": 0.1475, + "step": 4154 + }, + { + "epoch": 0.2997943648760778, + "grad_norm": 0.12204427272081375, + "learning_rate": 0.00018802135950353588, + "loss": 0.1779, + "step": 4155 + }, + { + "epoch": 0.2998665175511382, + "grad_norm": 0.10106804221868515, + "learning_rate": 0.00018801847308413914, + "loss": 0.1628, + "step": 4156 + }, + { + "epoch": 0.29993867022619863, + "grad_norm": 0.1267825812101364, + "learning_rate": 0.0001880155866647424, + "loss": 0.1184, + "step": 4157 + }, + { + "epoch": 0.3000108229012591, + "grad_norm": 0.11733009666204453, + "learning_rate": 0.00018801270024534566, + "loss": 0.1502, + "step": 4158 + }, + { + "epoch": 0.30008297557631947, + "grad_norm": 0.13267752528190613, + "learning_rate": 0.0001880098138259489, + "loss": 0.1694, + "step": 4159 + }, + { + "epoch": 0.3001551282513799, + "grad_norm": 0.14454156160354614, + "learning_rate": 0.0001880069274065522, + "loss": 0.1879, + "step": 4160 + }, + { + "epoch": 0.30022728092644035, + "grad_norm": 0.10851837694644928, + "learning_rate": 0.00018800404098715545, + "loss": 0.1671, + "step": 4161 + }, + { + "epoch": 0.3002994336015008, + "grad_norm": 0.12852200865745544, + "learning_rate": 0.00018800115456775872, + "loss": 0.1832, + "step": 4162 + }, + { + "epoch": 0.3003715862765612, + "grad_norm": 0.11237096041440964, + "learning_rate": 0.00018799826814836198, + "loss": 0.1271, + "step": 4163 + }, + { + "epoch": 0.30044373895162163, + "grad_norm": 0.11820393800735474, + "learning_rate": 0.00018799538172896522, + "loss": 0.1859, + "step": 4164 + }, + { + "epoch": 0.3005158916266821, + "grad_norm": 0.13072800636291504, + "learning_rate": 0.00018799249530956848, + "loss": 0.2141, + "step": 4165 + }, + { + "epoch": 0.30058804430174246, + "grad_norm": 0.11412015557289124, + "learning_rate": 0.00018798960889017174, + "loss": 0.1604, + "step": 4166 + }, + { + "epoch": 0.3006601969768029, + "grad_norm": 0.10667741298675537, + "learning_rate": 0.00018798672247077503, + "loss": 0.1529, + "step": 4167 + }, + { + "epoch": 0.30073234965186335, + "grad_norm": 0.1238722950220108, + "learning_rate": 0.0001879838360513783, + "loss": 0.1632, + "step": 4168 + }, + { + "epoch": 0.3008045023269238, + "grad_norm": 0.1128937378525734, + "learning_rate": 0.00018798094963198153, + "loss": 0.1618, + "step": 4169 + }, + { + "epoch": 0.3008766550019842, + "grad_norm": 0.09414263069629669, + "learning_rate": 0.0001879780632125848, + "loss": 0.1546, + "step": 4170 + }, + { + "epoch": 0.3009488076770446, + "grad_norm": 0.12279564142227173, + "learning_rate": 0.00018797517679318806, + "loss": 0.1514, + "step": 4171 + }, + { + "epoch": 0.30102096035210507, + "grad_norm": 0.11160296201705933, + "learning_rate": 0.00018797229037379132, + "loss": 0.1497, + "step": 4172 + }, + { + "epoch": 0.30109311302716546, + "grad_norm": 0.12374529242515564, + "learning_rate": 0.00018796940395439458, + "loss": 0.2017, + "step": 4173 + }, + { + "epoch": 0.3011652657022259, + "grad_norm": 0.10067132115364075, + "learning_rate": 0.00018796651753499784, + "loss": 0.1761, + "step": 4174 + }, + { + "epoch": 0.30123741837728635, + "grad_norm": 0.15694798529148102, + "learning_rate": 0.0001879636311156011, + "loss": 0.1506, + "step": 4175 + }, + { + "epoch": 0.3013095710523468, + "grad_norm": 0.09987813234329224, + "learning_rate": 0.00018796074469620437, + "loss": 0.1449, + "step": 4176 + }, + { + "epoch": 0.3013817237274072, + "grad_norm": 0.11210080236196518, + "learning_rate": 0.00018795785827680763, + "loss": 0.1088, + "step": 4177 + }, + { + "epoch": 0.3014538764024676, + "grad_norm": 0.14822888374328613, + "learning_rate": 0.0001879549718574109, + "loss": 0.1419, + "step": 4178 + }, + { + "epoch": 0.30152602907752807, + "grad_norm": 0.1467704325914383, + "learning_rate": 0.00018795208543801416, + "loss": 0.1969, + "step": 4179 + }, + { + "epoch": 0.30159818175258846, + "grad_norm": 0.11436786502599716, + "learning_rate": 0.0001879491990186174, + "loss": 0.1627, + "step": 4180 + }, + { + "epoch": 0.3016703344276489, + "grad_norm": 0.13047438859939575, + "learning_rate": 0.00018794631259922068, + "loss": 0.134, + "step": 4181 + }, + { + "epoch": 0.30174248710270934, + "grad_norm": 0.12848497927188873, + "learning_rate": 0.00018794342617982395, + "loss": 0.1177, + "step": 4182 + }, + { + "epoch": 0.30181463977776973, + "grad_norm": 0.12149526923894882, + "learning_rate": 0.0001879405397604272, + "loss": 0.2049, + "step": 4183 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.130044087767601, + "learning_rate": 0.00018793765334103047, + "loss": 0.1751, + "step": 4184 + }, + { + "epoch": 0.3019589451278906, + "grad_norm": 0.12724927067756653, + "learning_rate": 0.0001879347669216337, + "loss": 0.1456, + "step": 4185 + }, + { + "epoch": 0.30203109780295107, + "grad_norm": 0.09408117085695267, + "learning_rate": 0.00018793188050223697, + "loss": 0.1419, + "step": 4186 + }, + { + "epoch": 0.30210325047801145, + "grad_norm": 0.13772109150886536, + "learning_rate": 0.00018792899408284024, + "loss": 0.1553, + "step": 4187 + }, + { + "epoch": 0.3021754031530719, + "grad_norm": 0.11674950271844864, + "learning_rate": 0.00018792610766344353, + "loss": 0.152, + "step": 4188 + }, + { + "epoch": 0.30224755582813234, + "grad_norm": 0.1148289367556572, + "learning_rate": 0.0001879232212440468, + "loss": 0.1891, + "step": 4189 + }, + { + "epoch": 0.30231970850319273, + "grad_norm": 0.14175260066986084, + "learning_rate": 0.00018792033482465002, + "loss": 0.1409, + "step": 4190 + }, + { + "epoch": 0.3023918611782532, + "grad_norm": 0.09403803199529648, + "learning_rate": 0.0001879174484052533, + "loss": 0.1584, + "step": 4191 + }, + { + "epoch": 0.3024640138533136, + "grad_norm": 0.13788004219532013, + "learning_rate": 0.00018791456198585655, + "loss": 0.1308, + "step": 4192 + }, + { + "epoch": 0.30253616652837406, + "grad_norm": 0.0977247878909111, + "learning_rate": 0.0001879116755664598, + "loss": 0.1536, + "step": 4193 + }, + { + "epoch": 0.30260831920343445, + "grad_norm": 0.1072482168674469, + "learning_rate": 0.00018790878914706308, + "loss": 0.2268, + "step": 4194 + }, + { + "epoch": 0.3026804718784949, + "grad_norm": 0.115860216319561, + "learning_rate": 0.00018790590272766634, + "loss": 0.1732, + "step": 4195 + }, + { + "epoch": 0.30275262455355534, + "grad_norm": 0.11402580887079239, + "learning_rate": 0.0001879030163082696, + "loss": 0.1448, + "step": 4196 + }, + { + "epoch": 0.3028247772286157, + "grad_norm": 0.1256372034549713, + "learning_rate": 0.00018790012988887286, + "loss": 0.1335, + "step": 4197 + }, + { + "epoch": 0.30289692990367617, + "grad_norm": 0.11045928299427032, + "learning_rate": 0.00018789724346947613, + "loss": 0.1777, + "step": 4198 + }, + { + "epoch": 0.3029690825787366, + "grad_norm": 0.10243780165910721, + "learning_rate": 0.0001878943570500794, + "loss": 0.1781, + "step": 4199 + }, + { + "epoch": 0.30304123525379706, + "grad_norm": 0.09597732871770859, + "learning_rate": 0.00018789147063068265, + "loss": 0.1631, + "step": 4200 + }, + { + "epoch": 0.30311338792885745, + "grad_norm": 0.11148419231176376, + "learning_rate": 0.0001878885842112859, + "loss": 0.1736, + "step": 4201 + }, + { + "epoch": 0.3031855406039179, + "grad_norm": 0.11032175272703171, + "learning_rate": 0.00018788569779188918, + "loss": 0.1119, + "step": 4202 + }, + { + "epoch": 0.30325769327897834, + "grad_norm": 0.09522320330142975, + "learning_rate": 0.00018788281137249244, + "loss": 0.1259, + "step": 4203 + }, + { + "epoch": 0.3033298459540387, + "grad_norm": 0.11687321960926056, + "learning_rate": 0.0001878799249530957, + "loss": 0.1904, + "step": 4204 + }, + { + "epoch": 0.30340199862909917, + "grad_norm": 0.10842274129390717, + "learning_rate": 0.00018787703853369897, + "loss": 0.1194, + "step": 4205 + }, + { + "epoch": 0.3034741513041596, + "grad_norm": 0.11892799288034439, + "learning_rate": 0.0001878741521143022, + "loss": 0.2106, + "step": 4206 + }, + { + "epoch": 0.30354630397922006, + "grad_norm": 0.10375183075666428, + "learning_rate": 0.00018787126569490547, + "loss": 0.136, + "step": 4207 + }, + { + "epoch": 0.30361845665428044, + "grad_norm": 0.14736315608024597, + "learning_rate": 0.00018786837927550873, + "loss": 0.18, + "step": 4208 + }, + { + "epoch": 0.3036906093293409, + "grad_norm": 0.1258639693260193, + "learning_rate": 0.00018786549285611202, + "loss": 0.1328, + "step": 4209 + }, + { + "epoch": 0.30376276200440133, + "grad_norm": 0.13408122956752777, + "learning_rate": 0.00018786260643671528, + "loss": 0.122, + "step": 4210 + }, + { + "epoch": 0.3038349146794617, + "grad_norm": 0.1173291951417923, + "learning_rate": 0.00018785972001731852, + "loss": 0.1513, + "step": 4211 + }, + { + "epoch": 0.30390706735452216, + "grad_norm": 0.10346346348524094, + "learning_rate": 0.00018785683359792178, + "loss": 0.1413, + "step": 4212 + }, + { + "epoch": 0.3039792200295826, + "grad_norm": 0.21302101016044617, + "learning_rate": 0.00018785394717852504, + "loss": 0.1854, + "step": 4213 + }, + { + "epoch": 0.304051372704643, + "grad_norm": 0.16414475440979004, + "learning_rate": 0.0001878510607591283, + "loss": 0.182, + "step": 4214 + }, + { + "epoch": 0.30412352537970344, + "grad_norm": 0.1431964635848999, + "learning_rate": 0.00018784817433973157, + "loss": 0.1724, + "step": 4215 + }, + { + "epoch": 0.3041956780547639, + "grad_norm": 0.11071380227804184, + "learning_rate": 0.00018784528792033483, + "loss": 0.1084, + "step": 4216 + }, + { + "epoch": 0.30426783072982433, + "grad_norm": 0.10251831263303757, + "learning_rate": 0.0001878424015009381, + "loss": 0.131, + "step": 4217 + }, + { + "epoch": 0.3043399834048847, + "grad_norm": 0.1020776629447937, + "learning_rate": 0.00018783951508154136, + "loss": 0.1733, + "step": 4218 + }, + { + "epoch": 0.30441213607994516, + "grad_norm": 0.10754634439945221, + "learning_rate": 0.00018783662866214462, + "loss": 0.1544, + "step": 4219 + }, + { + "epoch": 0.3044842887550056, + "grad_norm": 0.12796087563037872, + "learning_rate": 0.00018783374224274788, + "loss": 0.1478, + "step": 4220 + }, + { + "epoch": 0.304556441430066, + "grad_norm": 0.10638104379177094, + "learning_rate": 0.00018783085582335115, + "loss": 0.1373, + "step": 4221 + }, + { + "epoch": 0.30462859410512644, + "grad_norm": 0.09254088252782822, + "learning_rate": 0.00018782796940395438, + "loss": 0.2043, + "step": 4222 + }, + { + "epoch": 0.3047007467801869, + "grad_norm": 0.144930899143219, + "learning_rate": 0.00018782508298455765, + "loss": 0.1252, + "step": 4223 + }, + { + "epoch": 0.3047728994552473, + "grad_norm": 0.0938224047422409, + "learning_rate": 0.00018782219656516094, + "loss": 0.1678, + "step": 4224 + }, + { + "epoch": 0.3048450521303077, + "grad_norm": 0.11682211607694626, + "learning_rate": 0.0001878193101457642, + "loss": 0.1498, + "step": 4225 + }, + { + "epoch": 0.30491720480536816, + "grad_norm": 0.10960418730974197, + "learning_rate": 0.00018781642372636746, + "loss": 0.1359, + "step": 4226 + }, + { + "epoch": 0.3049893574804286, + "grad_norm": 0.12347632646560669, + "learning_rate": 0.0001878135373069707, + "loss": 0.1275, + "step": 4227 + }, + { + "epoch": 0.305061510155489, + "grad_norm": 0.14290280640125275, + "learning_rate": 0.00018781065088757396, + "loss": 0.1448, + "step": 4228 + }, + { + "epoch": 0.30513366283054943, + "grad_norm": 0.1424276977777481, + "learning_rate": 0.00018780776446817722, + "loss": 0.1475, + "step": 4229 + }, + { + "epoch": 0.3052058155056099, + "grad_norm": 0.11387716233730316, + "learning_rate": 0.0001878048780487805, + "loss": 0.1347, + "step": 4230 + }, + { + "epoch": 0.3052779681806703, + "grad_norm": 0.10514809191226959, + "learning_rate": 0.00018780199162938378, + "loss": 0.1378, + "step": 4231 + }, + { + "epoch": 0.3053501208557307, + "grad_norm": 0.15583384037017822, + "learning_rate": 0.000187799105209987, + "loss": 0.1515, + "step": 4232 + }, + { + "epoch": 0.30542227353079116, + "grad_norm": 0.11437677592039108, + "learning_rate": 0.00018779621879059028, + "loss": 0.184, + "step": 4233 + }, + { + "epoch": 0.3054944262058516, + "grad_norm": 0.10012994706630707, + "learning_rate": 0.00018779333237119354, + "loss": 0.1504, + "step": 4234 + }, + { + "epoch": 0.305566578880912, + "grad_norm": 0.11126403510570526, + "learning_rate": 0.0001877904459517968, + "loss": 0.1747, + "step": 4235 + }, + { + "epoch": 0.30563873155597243, + "grad_norm": 0.1091986745595932, + "learning_rate": 0.00018778755953240006, + "loss": 0.1769, + "step": 4236 + }, + { + "epoch": 0.3057108842310329, + "grad_norm": 0.09372757375240326, + "learning_rate": 0.00018778467311300333, + "loss": 0.1175, + "step": 4237 + }, + { + "epoch": 0.3057830369060933, + "grad_norm": 0.08955243974924088, + "learning_rate": 0.0001877817866936066, + "loss": 0.1118, + "step": 4238 + }, + { + "epoch": 0.3058551895811537, + "grad_norm": 0.12780213356018066, + "learning_rate": 0.00018777890027420985, + "loss": 0.1612, + "step": 4239 + }, + { + "epoch": 0.30592734225621415, + "grad_norm": 0.12503564357757568, + "learning_rate": 0.00018777601385481312, + "loss": 0.1476, + "step": 4240 + }, + { + "epoch": 0.3059994949312746, + "grad_norm": 0.12395293265581131, + "learning_rate": 0.00018777312743541638, + "loss": 0.1497, + "step": 4241 + }, + { + "epoch": 0.306071647606335, + "grad_norm": 0.13468153774738312, + "learning_rate": 0.00018777024101601964, + "loss": 0.1438, + "step": 4242 + }, + { + "epoch": 0.30614380028139543, + "grad_norm": 0.12461975961923599, + "learning_rate": 0.00018776735459662288, + "loss": 0.1979, + "step": 4243 + }, + { + "epoch": 0.3062159529564559, + "grad_norm": 0.1354757845401764, + "learning_rate": 0.00018776446817722614, + "loss": 0.1441, + "step": 4244 + }, + { + "epoch": 0.30628810563151626, + "grad_norm": 0.12288720160722733, + "learning_rate": 0.00018776158175782943, + "loss": 0.1898, + "step": 4245 + }, + { + "epoch": 0.3063602583065767, + "grad_norm": 0.13993801176548004, + "learning_rate": 0.0001877586953384327, + "loss": 0.1258, + "step": 4246 + }, + { + "epoch": 0.30643241098163715, + "grad_norm": 0.1167026087641716, + "learning_rate": 0.00018775580891903596, + "loss": 0.1559, + "step": 4247 + }, + { + "epoch": 0.3065045636566976, + "grad_norm": 0.12541060149669647, + "learning_rate": 0.0001877529224996392, + "loss": 0.1728, + "step": 4248 + }, + { + "epoch": 0.306576716331758, + "grad_norm": 0.1256195604801178, + "learning_rate": 0.00018775003608024245, + "loss": 0.1408, + "step": 4249 + }, + { + "epoch": 0.3066488690068184, + "grad_norm": 0.14515994489192963, + "learning_rate": 0.00018774714966084572, + "loss": 0.1522, + "step": 4250 + }, + { + "epoch": 0.30672102168187887, + "grad_norm": 0.10798481106758118, + "learning_rate": 0.00018774426324144898, + "loss": 0.1522, + "step": 4251 + }, + { + "epoch": 0.30679317435693926, + "grad_norm": 0.11664534360170364, + "learning_rate": 0.00018774137682205227, + "loss": 0.1785, + "step": 4252 + }, + { + "epoch": 0.3068653270319997, + "grad_norm": 0.10946546494960785, + "learning_rate": 0.00018773849040265553, + "loss": 0.1899, + "step": 4253 + }, + { + "epoch": 0.30693747970706015, + "grad_norm": 0.09712180495262146, + "learning_rate": 0.00018773560398325877, + "loss": 0.1512, + "step": 4254 + }, + { + "epoch": 0.3070096323821206, + "grad_norm": 0.1304282695055008, + "learning_rate": 0.00018773271756386203, + "loss": 0.1517, + "step": 4255 + }, + { + "epoch": 0.307081785057181, + "grad_norm": 0.09476155787706375, + "learning_rate": 0.0001877298311444653, + "loss": 0.1469, + "step": 4256 + }, + { + "epoch": 0.3071539377322414, + "grad_norm": 0.12522584199905396, + "learning_rate": 0.00018772694472506856, + "loss": 0.1569, + "step": 4257 + }, + { + "epoch": 0.30722609040730187, + "grad_norm": 0.12130534648895264, + "learning_rate": 0.00018772405830567182, + "loss": 0.163, + "step": 4258 + }, + { + "epoch": 0.30729824308236225, + "grad_norm": 0.0961449146270752, + "learning_rate": 0.00018772117188627508, + "loss": 0.1792, + "step": 4259 + }, + { + "epoch": 0.3073703957574227, + "grad_norm": 0.11592444032430649, + "learning_rate": 0.00018771828546687835, + "loss": 0.1572, + "step": 4260 + }, + { + "epoch": 0.30744254843248314, + "grad_norm": 0.11982754617929459, + "learning_rate": 0.0001877153990474816, + "loss": 0.201, + "step": 4261 + }, + { + "epoch": 0.3075147011075436, + "grad_norm": 0.09742818772792816, + "learning_rate": 0.00018771251262808487, + "loss": 0.1445, + "step": 4262 + }, + { + "epoch": 0.307586853782604, + "grad_norm": 0.11305420100688934, + "learning_rate": 0.00018770962620868814, + "loss": 0.1776, + "step": 4263 + }, + { + "epoch": 0.3076590064576644, + "grad_norm": 0.12675850093364716, + "learning_rate": 0.0001877067397892914, + "loss": 0.2095, + "step": 4264 + }, + { + "epoch": 0.30773115913272486, + "grad_norm": 0.12010957300662994, + "learning_rate": 0.00018770385336989463, + "loss": 0.1617, + "step": 4265 + }, + { + "epoch": 0.30780331180778525, + "grad_norm": 0.11646157503128052, + "learning_rate": 0.00018770096695049792, + "loss": 0.1465, + "step": 4266 + }, + { + "epoch": 0.3078754644828457, + "grad_norm": 0.09521958976984024, + "learning_rate": 0.0001876980805311012, + "loss": 0.1499, + "step": 4267 + }, + { + "epoch": 0.30794761715790614, + "grad_norm": 0.13130347430706024, + "learning_rate": 0.00018769519411170445, + "loss": 0.1362, + "step": 4268 + }, + { + "epoch": 0.3080197698329666, + "grad_norm": 0.1242891252040863, + "learning_rate": 0.0001876923076923077, + "loss": 0.1916, + "step": 4269 + }, + { + "epoch": 0.30809192250802697, + "grad_norm": 0.13037200272083282, + "learning_rate": 0.00018768942127291095, + "loss": 0.1363, + "step": 4270 + }, + { + "epoch": 0.3081640751830874, + "grad_norm": 0.10391830652952194, + "learning_rate": 0.0001876865348535142, + "loss": 0.1717, + "step": 4271 + }, + { + "epoch": 0.30823622785814786, + "grad_norm": 0.10626058280467987, + "learning_rate": 0.00018768364843411748, + "loss": 0.1713, + "step": 4272 + }, + { + "epoch": 0.30830838053320825, + "grad_norm": 0.08672137558460236, + "learning_rate": 0.00018768076201472077, + "loss": 0.1517, + "step": 4273 + }, + { + "epoch": 0.3083805332082687, + "grad_norm": 0.10902436077594757, + "learning_rate": 0.00018767787559532403, + "loss": 0.1841, + "step": 4274 + }, + { + "epoch": 0.30845268588332914, + "grad_norm": 0.10874659568071365, + "learning_rate": 0.00018767498917592726, + "loss": 0.115, + "step": 4275 + }, + { + "epoch": 0.3085248385583895, + "grad_norm": 0.11341419816017151, + "learning_rate": 0.00018767210275653053, + "loss": 0.2206, + "step": 4276 + }, + { + "epoch": 0.30859699123344997, + "grad_norm": 0.13141369819641113, + "learning_rate": 0.0001876692163371338, + "loss": 0.1607, + "step": 4277 + }, + { + "epoch": 0.3086691439085104, + "grad_norm": 0.14365047216415405, + "learning_rate": 0.00018766632991773705, + "loss": 0.1518, + "step": 4278 + }, + { + "epoch": 0.30874129658357086, + "grad_norm": 0.10291138291358948, + "learning_rate": 0.00018766344349834032, + "loss": 0.1475, + "step": 4279 + }, + { + "epoch": 0.30881344925863125, + "grad_norm": 0.13778726756572723, + "learning_rate": 0.00018766055707894358, + "loss": 0.1579, + "step": 4280 + }, + { + "epoch": 0.3088856019336917, + "grad_norm": 0.10906017571687698, + "learning_rate": 0.00018765767065954684, + "loss": 0.169, + "step": 4281 + }, + { + "epoch": 0.30895775460875213, + "grad_norm": 0.1113433763384819, + "learning_rate": 0.0001876547842401501, + "loss": 0.1795, + "step": 4282 + }, + { + "epoch": 0.3090299072838125, + "grad_norm": 0.14330008625984192, + "learning_rate": 0.00018765189782075337, + "loss": 0.1835, + "step": 4283 + }, + { + "epoch": 0.30910205995887297, + "grad_norm": 0.11915328353643417, + "learning_rate": 0.00018764901140135663, + "loss": 0.1822, + "step": 4284 + }, + { + "epoch": 0.3091742126339334, + "grad_norm": 0.13964983820915222, + "learning_rate": 0.0001876461249819599, + "loss": 0.1775, + "step": 4285 + }, + { + "epoch": 0.30924636530899385, + "grad_norm": 0.12686780095100403, + "learning_rate": 0.00018764323856256313, + "loss": 0.1364, + "step": 4286 + }, + { + "epoch": 0.30931851798405424, + "grad_norm": 0.11067734658718109, + "learning_rate": 0.00018764035214316642, + "loss": 0.1721, + "step": 4287 + }, + { + "epoch": 0.3093906706591147, + "grad_norm": 0.09381629526615143, + "learning_rate": 0.00018763746572376968, + "loss": 0.1681, + "step": 4288 + }, + { + "epoch": 0.30946282333417513, + "grad_norm": 0.0986705869436264, + "learning_rate": 0.00018763457930437294, + "loss": 0.1756, + "step": 4289 + }, + { + "epoch": 0.3095349760092355, + "grad_norm": 0.09282074123620987, + "learning_rate": 0.0001876316928849762, + "loss": 0.1794, + "step": 4290 + }, + { + "epoch": 0.30960712868429596, + "grad_norm": 0.12449940294027328, + "learning_rate": 0.00018762880646557944, + "loss": 0.1104, + "step": 4291 + }, + { + "epoch": 0.3096792813593564, + "grad_norm": 0.10148288309574127, + "learning_rate": 0.0001876259200461827, + "loss": 0.116, + "step": 4292 + }, + { + "epoch": 0.30975143403441685, + "grad_norm": 0.1262681633234024, + "learning_rate": 0.00018762303362678597, + "loss": 0.146, + "step": 4293 + }, + { + "epoch": 0.30982358670947724, + "grad_norm": 0.13044936954975128, + "learning_rate": 0.00018762014720738926, + "loss": 0.1289, + "step": 4294 + }, + { + "epoch": 0.3098957393845377, + "grad_norm": 0.08965712040662766, + "learning_rate": 0.00018761726078799252, + "loss": 0.1356, + "step": 4295 + }, + { + "epoch": 0.3099678920595981, + "grad_norm": 0.14585722982883453, + "learning_rate": 0.00018761437436859576, + "loss": 0.1672, + "step": 4296 + }, + { + "epoch": 0.3100400447346585, + "grad_norm": 0.13562092185020447, + "learning_rate": 0.00018761148794919902, + "loss": 0.1959, + "step": 4297 + }, + { + "epoch": 0.31011219740971896, + "grad_norm": 0.09663914144039154, + "learning_rate": 0.00018760860152980228, + "loss": 0.15, + "step": 4298 + }, + { + "epoch": 0.3101843500847794, + "grad_norm": 0.09218167513608932, + "learning_rate": 0.00018760571511040555, + "loss": 0.149, + "step": 4299 + }, + { + "epoch": 0.31025650275983985, + "grad_norm": 0.10205177962779999, + "learning_rate": 0.0001876028286910088, + "loss": 0.1779, + "step": 4300 + }, + { + "epoch": 0.31032865543490024, + "grad_norm": 0.12574578821659088, + "learning_rate": 0.00018759994227161207, + "loss": 0.172, + "step": 4301 + }, + { + "epoch": 0.3104008081099607, + "grad_norm": 0.12000858038663864, + "learning_rate": 0.00018759705585221534, + "loss": 0.1089, + "step": 4302 + }, + { + "epoch": 0.3104729607850211, + "grad_norm": 0.1477157026529312, + "learning_rate": 0.0001875941694328186, + "loss": 0.1796, + "step": 4303 + }, + { + "epoch": 0.3105451134600815, + "grad_norm": 0.1305723488330841, + "learning_rate": 0.00018759128301342186, + "loss": 0.1477, + "step": 4304 + }, + { + "epoch": 0.31061726613514196, + "grad_norm": 0.10859954357147217, + "learning_rate": 0.00018758839659402512, + "loss": 0.1771, + "step": 4305 + }, + { + "epoch": 0.3106894188102024, + "grad_norm": 0.0939202830195427, + "learning_rate": 0.0001875855101746284, + "loss": 0.1413, + "step": 4306 + }, + { + "epoch": 0.3107615714852628, + "grad_norm": 0.22470423579216003, + "learning_rate": 0.00018758262375523162, + "loss": 0.1255, + "step": 4307 + }, + { + "epoch": 0.31083372416032323, + "grad_norm": 0.10599032789468765, + "learning_rate": 0.0001875797373358349, + "loss": 0.19, + "step": 4308 + }, + { + "epoch": 0.3109058768353837, + "grad_norm": 0.11063870042562485, + "learning_rate": 0.00018757685091643818, + "loss": 0.1338, + "step": 4309 + }, + { + "epoch": 0.3109780295104441, + "grad_norm": 0.12211477011442184, + "learning_rate": 0.00018757396449704144, + "loss": 0.1426, + "step": 4310 + }, + { + "epoch": 0.3110501821855045, + "grad_norm": 0.11353323608636856, + "learning_rate": 0.0001875710780776447, + "loss": 0.1478, + "step": 4311 + }, + { + "epoch": 0.31112233486056495, + "grad_norm": 0.11034446209669113, + "learning_rate": 0.00018756819165824794, + "loss": 0.1592, + "step": 4312 + }, + { + "epoch": 0.3111944875356254, + "grad_norm": 0.12288372963666916, + "learning_rate": 0.0001875653052388512, + "loss": 0.1839, + "step": 4313 + }, + { + "epoch": 0.3112666402106858, + "grad_norm": 0.10573863238096237, + "learning_rate": 0.00018756241881945446, + "loss": 0.1885, + "step": 4314 + }, + { + "epoch": 0.31133879288574623, + "grad_norm": 0.15434828400611877, + "learning_rate": 0.00018755953240005775, + "loss": 0.1639, + "step": 4315 + }, + { + "epoch": 0.3114109455608067, + "grad_norm": 0.1456459015607834, + "learning_rate": 0.00018755664598066102, + "loss": 0.2052, + "step": 4316 + }, + { + "epoch": 0.3114830982358671, + "grad_norm": 0.1549510806798935, + "learning_rate": 0.00018755375956126425, + "loss": 0.1343, + "step": 4317 + }, + { + "epoch": 0.3115552509109275, + "grad_norm": 0.11198796331882477, + "learning_rate": 0.00018755087314186752, + "loss": 0.1419, + "step": 4318 + }, + { + "epoch": 0.31162740358598795, + "grad_norm": 0.1253986954689026, + "learning_rate": 0.00018754798672247078, + "loss": 0.1541, + "step": 4319 + }, + { + "epoch": 0.3116995562610484, + "grad_norm": 0.1058746874332428, + "learning_rate": 0.00018754510030307404, + "loss": 0.2107, + "step": 4320 + }, + { + "epoch": 0.3117717089361088, + "grad_norm": 0.1150989979505539, + "learning_rate": 0.0001875422138836773, + "loss": 0.2103, + "step": 4321 + }, + { + "epoch": 0.3118438616111692, + "grad_norm": 0.09693560004234314, + "learning_rate": 0.00018753932746428057, + "loss": 0.1357, + "step": 4322 + }, + { + "epoch": 0.31191601428622967, + "grad_norm": 0.1558159440755844, + "learning_rate": 0.00018753644104488383, + "loss": 0.1587, + "step": 4323 + }, + { + "epoch": 0.3119881669612901, + "grad_norm": 0.1341172456741333, + "learning_rate": 0.0001875335546254871, + "loss": 0.1185, + "step": 4324 + }, + { + "epoch": 0.3120603196363505, + "grad_norm": 0.11654019355773926, + "learning_rate": 0.00018753066820609036, + "loss": 0.1479, + "step": 4325 + }, + { + "epoch": 0.31213247231141095, + "grad_norm": 0.1901504397392273, + "learning_rate": 0.00018752778178669362, + "loss": 0.1115, + "step": 4326 + }, + { + "epoch": 0.3122046249864714, + "grad_norm": 0.1711183488368988, + "learning_rate": 0.00018752489536729688, + "loss": 0.1711, + "step": 4327 + }, + { + "epoch": 0.3122767776615318, + "grad_norm": 0.12417960911989212, + "learning_rate": 0.00018752200894790012, + "loss": 0.1292, + "step": 4328 + }, + { + "epoch": 0.3123489303365922, + "grad_norm": 0.12994439899921417, + "learning_rate": 0.0001875191225285034, + "loss": 0.1137, + "step": 4329 + }, + { + "epoch": 0.31242108301165267, + "grad_norm": 0.12198687344789505, + "learning_rate": 0.00018751623610910667, + "loss": 0.1479, + "step": 4330 + }, + { + "epoch": 0.3124932356867131, + "grad_norm": 0.1255475878715515, + "learning_rate": 0.00018751334968970993, + "loss": 0.2342, + "step": 4331 + }, + { + "epoch": 0.3125653883617735, + "grad_norm": 0.11571196466684341, + "learning_rate": 0.0001875104632703132, + "loss": 0.1098, + "step": 4332 + }, + { + "epoch": 0.31263754103683394, + "grad_norm": 0.15391094982624054, + "learning_rate": 0.00018750757685091643, + "loss": 0.1983, + "step": 4333 + }, + { + "epoch": 0.3127096937118944, + "grad_norm": 0.10191630572080612, + "learning_rate": 0.0001875046904315197, + "loss": 0.1628, + "step": 4334 + }, + { + "epoch": 0.3127818463869548, + "grad_norm": 0.13359738886356354, + "learning_rate": 0.00018750180401212296, + "loss": 0.1369, + "step": 4335 + }, + { + "epoch": 0.3128539990620152, + "grad_norm": 0.1361343115568161, + "learning_rate": 0.00018749891759272625, + "loss": 0.1614, + "step": 4336 + }, + { + "epoch": 0.31292615173707566, + "grad_norm": 0.11173868179321289, + "learning_rate": 0.0001874960311733295, + "loss": 0.1474, + "step": 4337 + }, + { + "epoch": 0.31299830441213605, + "grad_norm": 0.1364803910255432, + "learning_rate": 0.00018749314475393275, + "loss": 0.1616, + "step": 4338 + }, + { + "epoch": 0.3130704570871965, + "grad_norm": 0.1421414315700531, + "learning_rate": 0.000187490258334536, + "loss": 0.1945, + "step": 4339 + }, + { + "epoch": 0.31314260976225694, + "grad_norm": 0.11378195881843567, + "learning_rate": 0.00018748737191513927, + "loss": 0.1365, + "step": 4340 + }, + { + "epoch": 0.3132147624373174, + "grad_norm": 0.10104478150606155, + "learning_rate": 0.00018748448549574254, + "loss": 0.0923, + "step": 4341 + }, + { + "epoch": 0.3132869151123778, + "grad_norm": 0.14223428070545197, + "learning_rate": 0.0001874815990763458, + "loss": 0.2485, + "step": 4342 + }, + { + "epoch": 0.3133590677874382, + "grad_norm": 0.11533410847187042, + "learning_rate": 0.00018747871265694906, + "loss": 0.1847, + "step": 4343 + }, + { + "epoch": 0.31343122046249866, + "grad_norm": 0.09127724170684814, + "learning_rate": 0.00018747582623755232, + "loss": 0.162, + "step": 4344 + }, + { + "epoch": 0.31350337313755905, + "grad_norm": 0.10632152855396271, + "learning_rate": 0.0001874729398181556, + "loss": 0.1239, + "step": 4345 + }, + { + "epoch": 0.3135755258126195, + "grad_norm": 0.20816880464553833, + "learning_rate": 0.00018747005339875885, + "loss": 0.1647, + "step": 4346 + }, + { + "epoch": 0.31364767848767994, + "grad_norm": 0.12501870095729828, + "learning_rate": 0.0001874671669793621, + "loss": 0.1703, + "step": 4347 + }, + { + "epoch": 0.3137198311627404, + "grad_norm": 0.11986255645751953, + "learning_rate": 0.00018746428055996538, + "loss": 0.1496, + "step": 4348 + }, + { + "epoch": 0.31379198383780077, + "grad_norm": 0.13503970205783844, + "learning_rate": 0.0001874613941405686, + "loss": 0.1829, + "step": 4349 + }, + { + "epoch": 0.3138641365128612, + "grad_norm": 0.16656002402305603, + "learning_rate": 0.0001874585077211719, + "loss": 0.1814, + "step": 4350 + }, + { + "epoch": 0.31393628918792166, + "grad_norm": 0.12464180588722229, + "learning_rate": 0.00018745562130177516, + "loss": 0.112, + "step": 4351 + }, + { + "epoch": 0.31400844186298205, + "grad_norm": 0.13612239062786102, + "learning_rate": 0.00018745273488237843, + "loss": 0.1877, + "step": 4352 + }, + { + "epoch": 0.3140805945380425, + "grad_norm": 0.11188091337680817, + "learning_rate": 0.0001874498484629817, + "loss": 0.1682, + "step": 4353 + }, + { + "epoch": 0.31415274721310293, + "grad_norm": 0.1170855462551117, + "learning_rate": 0.00018744696204358493, + "loss": 0.1764, + "step": 4354 + }, + { + "epoch": 0.3142248998881634, + "grad_norm": 0.10502547025680542, + "learning_rate": 0.0001874440756241882, + "loss": 0.1801, + "step": 4355 + }, + { + "epoch": 0.31429705256322377, + "grad_norm": 0.12084412574768066, + "learning_rate": 0.00018744118920479145, + "loss": 0.1332, + "step": 4356 + }, + { + "epoch": 0.3143692052382842, + "grad_norm": 0.11734385043382645, + "learning_rate": 0.00018743830278539474, + "loss": 0.1831, + "step": 4357 + }, + { + "epoch": 0.31444135791334465, + "grad_norm": 0.10435345768928528, + "learning_rate": 0.000187435416365998, + "loss": 0.1646, + "step": 4358 + }, + { + "epoch": 0.31451351058840504, + "grad_norm": 0.11109847575426102, + "learning_rate": 0.00018743252994660124, + "loss": 0.1603, + "step": 4359 + }, + { + "epoch": 0.3145856632634655, + "grad_norm": 0.12884214520454407, + "learning_rate": 0.0001874296435272045, + "loss": 0.1502, + "step": 4360 + }, + { + "epoch": 0.31465781593852593, + "grad_norm": 0.14958150684833527, + "learning_rate": 0.00018742675710780777, + "loss": 0.1851, + "step": 4361 + }, + { + "epoch": 0.3147299686135864, + "grad_norm": 0.11282835155725479, + "learning_rate": 0.00018742387068841103, + "loss": 0.1913, + "step": 4362 + }, + { + "epoch": 0.31480212128864676, + "grad_norm": 0.15314820408821106, + "learning_rate": 0.0001874209842690143, + "loss": 0.167, + "step": 4363 + }, + { + "epoch": 0.3148742739637072, + "grad_norm": 0.09965931624174118, + "learning_rate": 0.00018741809784961756, + "loss": 0.1009, + "step": 4364 + }, + { + "epoch": 0.31494642663876765, + "grad_norm": 0.11220397800207138, + "learning_rate": 0.00018741521143022082, + "loss": 0.1251, + "step": 4365 + }, + { + "epoch": 0.31501857931382804, + "grad_norm": 0.11934738606214523, + "learning_rate": 0.00018741232501082408, + "loss": 0.1763, + "step": 4366 + }, + { + "epoch": 0.3150907319888885, + "grad_norm": 0.10464148968458176, + "learning_rate": 0.00018740943859142734, + "loss": 0.13, + "step": 4367 + }, + { + "epoch": 0.31516288466394893, + "grad_norm": 0.11204349994659424, + "learning_rate": 0.0001874065521720306, + "loss": 0.1716, + "step": 4368 + }, + { + "epoch": 0.3152350373390093, + "grad_norm": 0.11725107580423355, + "learning_rate": 0.00018740366575263387, + "loss": 0.1576, + "step": 4369 + }, + { + "epoch": 0.31530719001406976, + "grad_norm": 0.09801217913627625, + "learning_rate": 0.00018740077933323713, + "loss": 0.145, + "step": 4370 + }, + { + "epoch": 0.3153793426891302, + "grad_norm": 0.11340101063251495, + "learning_rate": 0.0001873978929138404, + "loss": 0.1408, + "step": 4371 + }, + { + "epoch": 0.31545149536419065, + "grad_norm": 0.11271858215332031, + "learning_rate": 0.00018739500649444366, + "loss": 0.1525, + "step": 4372 + }, + { + "epoch": 0.31552364803925104, + "grad_norm": 0.14412888884544373, + "learning_rate": 0.00018739212007504692, + "loss": 0.1077, + "step": 4373 + }, + { + "epoch": 0.3155958007143115, + "grad_norm": 0.14130175113677979, + "learning_rate": 0.00018738923365565018, + "loss": 0.1687, + "step": 4374 + }, + { + "epoch": 0.3156679533893719, + "grad_norm": 0.1080450788140297, + "learning_rate": 0.00018738634723625345, + "loss": 0.155, + "step": 4375 + }, + { + "epoch": 0.3157401060644323, + "grad_norm": 0.1382875144481659, + "learning_rate": 0.00018738346081685668, + "loss": 0.1675, + "step": 4376 + }, + { + "epoch": 0.31581225873949276, + "grad_norm": 0.10196302831172943, + "learning_rate": 0.00018738057439745995, + "loss": 0.1696, + "step": 4377 + }, + { + "epoch": 0.3158844114145532, + "grad_norm": 0.1086023822426796, + "learning_rate": 0.00018737768797806324, + "loss": 0.1256, + "step": 4378 + }, + { + "epoch": 0.31595656408961365, + "grad_norm": 0.10549721866846085, + "learning_rate": 0.0001873748015586665, + "loss": 0.1294, + "step": 4379 + }, + { + "epoch": 0.31602871676467403, + "grad_norm": 0.10880477726459503, + "learning_rate": 0.00018737191513926976, + "loss": 0.1322, + "step": 4380 + }, + { + "epoch": 0.3161008694397345, + "grad_norm": 0.12142103910446167, + "learning_rate": 0.000187369028719873, + "loss": 0.18, + "step": 4381 + }, + { + "epoch": 0.3161730221147949, + "grad_norm": 0.10394002497196198, + "learning_rate": 0.00018736614230047626, + "loss": 0.134, + "step": 4382 + }, + { + "epoch": 0.3162451747898553, + "grad_norm": 0.12098486721515656, + "learning_rate": 0.00018736325588107952, + "loss": 0.1417, + "step": 4383 + }, + { + "epoch": 0.31631732746491575, + "grad_norm": 0.16527141630649567, + "learning_rate": 0.0001873603694616828, + "loss": 0.1611, + "step": 4384 + }, + { + "epoch": 0.3163894801399762, + "grad_norm": 0.13548526167869568, + "learning_rate": 0.00018735748304228608, + "loss": 0.1987, + "step": 4385 + }, + { + "epoch": 0.31646163281503664, + "grad_norm": 0.10220380872488022, + "learning_rate": 0.0001873545966228893, + "loss": 0.1748, + "step": 4386 + }, + { + "epoch": 0.31653378549009703, + "grad_norm": 0.11551573127508163, + "learning_rate": 0.00018735171020349258, + "loss": 0.1566, + "step": 4387 + }, + { + "epoch": 0.3166059381651575, + "grad_norm": 0.111538365483284, + "learning_rate": 0.00018734882378409584, + "loss": 0.1734, + "step": 4388 + }, + { + "epoch": 0.3166780908402179, + "grad_norm": 0.1431824266910553, + "learning_rate": 0.0001873459373646991, + "loss": 0.16, + "step": 4389 + }, + { + "epoch": 0.3167502435152783, + "grad_norm": 0.11930502206087112, + "learning_rate": 0.00018734305094530236, + "loss": 0.1276, + "step": 4390 + }, + { + "epoch": 0.31682239619033875, + "grad_norm": 0.1125396341085434, + "learning_rate": 0.00018734016452590563, + "loss": 0.1517, + "step": 4391 + }, + { + "epoch": 0.3168945488653992, + "grad_norm": 0.1314370483160019, + "learning_rate": 0.0001873372781065089, + "loss": 0.1755, + "step": 4392 + }, + { + "epoch": 0.31696670154045964, + "grad_norm": 0.1070464625954628, + "learning_rate": 0.00018733439168711215, + "loss": 0.1557, + "step": 4393 + }, + { + "epoch": 0.31703885421552, + "grad_norm": 0.1102706640958786, + "learning_rate": 0.00018733150526771542, + "loss": 0.1643, + "step": 4394 + }, + { + "epoch": 0.31711100689058047, + "grad_norm": 0.13626664876937866, + "learning_rate": 0.00018732861884831868, + "loss": 0.1673, + "step": 4395 + }, + { + "epoch": 0.3171831595656409, + "grad_norm": 0.14753460884094238, + "learning_rate": 0.00018732573242892194, + "loss": 0.1865, + "step": 4396 + }, + { + "epoch": 0.3172553122407013, + "grad_norm": 0.08600473403930664, + "learning_rate": 0.00018732284600952518, + "loss": 0.164, + "step": 4397 + }, + { + "epoch": 0.31732746491576175, + "grad_norm": 0.08636843413114548, + "learning_rate": 0.00018731995959012844, + "loss": 0.1115, + "step": 4398 + }, + { + "epoch": 0.3173996175908222, + "grad_norm": 0.1168600469827652, + "learning_rate": 0.00018731707317073173, + "loss": 0.1659, + "step": 4399 + }, + { + "epoch": 0.3174717702658826, + "grad_norm": 0.1677643358707428, + "learning_rate": 0.000187314186751335, + "loss": 0.2004, + "step": 4400 + }, + { + "epoch": 0.317543922940943, + "grad_norm": 0.13780860602855682, + "learning_rate": 0.00018731130033193826, + "loss": 0.1379, + "step": 4401 + }, + { + "epoch": 0.31761607561600347, + "grad_norm": 0.09523440152406693, + "learning_rate": 0.0001873084139125415, + "loss": 0.1681, + "step": 4402 + }, + { + "epoch": 0.3176882282910639, + "grad_norm": 0.09516411274671555, + "learning_rate": 0.00018730552749314476, + "loss": 0.1347, + "step": 4403 + }, + { + "epoch": 0.3177603809661243, + "grad_norm": 0.11729392409324646, + "learning_rate": 0.00018730264107374802, + "loss": 0.1378, + "step": 4404 + }, + { + "epoch": 0.31783253364118474, + "grad_norm": 0.10459394752979279, + "learning_rate": 0.00018729975465435128, + "loss": 0.1994, + "step": 4405 + }, + { + "epoch": 0.3179046863162452, + "grad_norm": 0.09694515913724899, + "learning_rate": 0.00018729686823495457, + "loss": 0.0951, + "step": 4406 + }, + { + "epoch": 0.3179768389913056, + "grad_norm": 0.10586842894554138, + "learning_rate": 0.0001872939818155578, + "loss": 0.1866, + "step": 4407 + }, + { + "epoch": 0.318048991666366, + "grad_norm": 0.10234203189611435, + "learning_rate": 0.00018729109539616107, + "loss": 0.1486, + "step": 4408 + }, + { + "epoch": 0.31812114434142647, + "grad_norm": 0.10403174161911011, + "learning_rate": 0.00018728820897676433, + "loss": 0.1718, + "step": 4409 + }, + { + "epoch": 0.3181932970164869, + "grad_norm": 0.11329304426908493, + "learning_rate": 0.0001872853225573676, + "loss": 0.1913, + "step": 4410 + }, + { + "epoch": 0.3182654496915473, + "grad_norm": 0.16083496809005737, + "learning_rate": 0.00018728243613797086, + "loss": 0.1544, + "step": 4411 + }, + { + "epoch": 0.31833760236660774, + "grad_norm": 0.14580407738685608, + "learning_rate": 0.00018727954971857412, + "loss": 0.151, + "step": 4412 + }, + { + "epoch": 0.3184097550416682, + "grad_norm": 0.13236120343208313, + "learning_rate": 0.00018727666329917736, + "loss": 0.1948, + "step": 4413 + }, + { + "epoch": 0.3184819077167286, + "grad_norm": 0.14633306860923767, + "learning_rate": 0.00018727377687978065, + "loss": 0.1695, + "step": 4414 + }, + { + "epoch": 0.318554060391789, + "grad_norm": 0.14905905723571777, + "learning_rate": 0.0001872708904603839, + "loss": 0.1486, + "step": 4415 + }, + { + "epoch": 0.31862621306684946, + "grad_norm": 0.14201393723487854, + "learning_rate": 0.00018726800404098717, + "loss": 0.161, + "step": 4416 + }, + { + "epoch": 0.3186983657419099, + "grad_norm": 0.12765076756477356, + "learning_rate": 0.00018726511762159044, + "loss": 0.1649, + "step": 4417 + }, + { + "epoch": 0.3187705184169703, + "grad_norm": 0.093240886926651, + "learning_rate": 0.00018726223120219367, + "loss": 0.1571, + "step": 4418 + }, + { + "epoch": 0.31884267109203074, + "grad_norm": 0.11160070449113846, + "learning_rate": 0.00018725934478279693, + "loss": 0.1692, + "step": 4419 + }, + { + "epoch": 0.3189148237670912, + "grad_norm": 0.10448220372200012, + "learning_rate": 0.0001872564583634002, + "loss": 0.1845, + "step": 4420 + }, + { + "epoch": 0.31898697644215157, + "grad_norm": 0.11860977113246918, + "learning_rate": 0.0001872535719440035, + "loss": 0.2073, + "step": 4421 + }, + { + "epoch": 0.319059129117212, + "grad_norm": 0.1062135323882103, + "learning_rate": 0.00018725068552460675, + "loss": 0.1474, + "step": 4422 + }, + { + "epoch": 0.31913128179227246, + "grad_norm": 0.11262713372707367, + "learning_rate": 0.00018724779910521, + "loss": 0.1567, + "step": 4423 + }, + { + "epoch": 0.3192034344673329, + "grad_norm": 0.09404128044843674, + "learning_rate": 0.00018724491268581325, + "loss": 0.1307, + "step": 4424 + }, + { + "epoch": 0.3192755871423933, + "grad_norm": 0.1066492572426796, + "learning_rate": 0.0001872420262664165, + "loss": 0.1214, + "step": 4425 + }, + { + "epoch": 0.31934773981745374, + "grad_norm": 0.11451132595539093, + "learning_rate": 0.00018723913984701978, + "loss": 0.1679, + "step": 4426 + }, + { + "epoch": 0.3194198924925142, + "grad_norm": 0.1330243945121765, + "learning_rate": 0.00018723625342762304, + "loss": 0.1941, + "step": 4427 + }, + { + "epoch": 0.31949204516757457, + "grad_norm": 0.10715876519680023, + "learning_rate": 0.0001872333670082263, + "loss": 0.1643, + "step": 4428 + }, + { + "epoch": 0.319564197842635, + "grad_norm": 0.09910948574542999, + "learning_rate": 0.00018723048058882956, + "loss": 0.2107, + "step": 4429 + }, + { + "epoch": 0.31963635051769546, + "grad_norm": 0.1132940798997879, + "learning_rate": 0.00018722759416943283, + "loss": 0.1718, + "step": 4430 + }, + { + "epoch": 0.31970850319275584, + "grad_norm": 0.12930969893932343, + "learning_rate": 0.0001872247077500361, + "loss": 0.2033, + "step": 4431 + }, + { + "epoch": 0.3197806558678163, + "grad_norm": 0.10411523282527924, + "learning_rate": 0.00018722182133063935, + "loss": 0.1732, + "step": 4432 + }, + { + "epoch": 0.31985280854287673, + "grad_norm": 0.09339303523302078, + "learning_rate": 0.00018721893491124262, + "loss": 0.1742, + "step": 4433 + }, + { + "epoch": 0.3199249612179372, + "grad_norm": 0.13078728318214417, + "learning_rate": 0.00018721604849184585, + "loss": 0.1439, + "step": 4434 + }, + { + "epoch": 0.31999711389299756, + "grad_norm": 0.15128646790981293, + "learning_rate": 0.00018721316207244914, + "loss": 0.2077, + "step": 4435 + }, + { + "epoch": 0.320069266568058, + "grad_norm": 0.1379484236240387, + "learning_rate": 0.0001872102756530524, + "loss": 0.1768, + "step": 4436 + }, + { + "epoch": 0.32014141924311845, + "grad_norm": 0.09854443371295929, + "learning_rate": 0.00018720738923365567, + "loss": 0.1505, + "step": 4437 + }, + { + "epoch": 0.32021357191817884, + "grad_norm": 0.1198776438832283, + "learning_rate": 0.00018720450281425893, + "loss": 0.1648, + "step": 4438 + }, + { + "epoch": 0.3202857245932393, + "grad_norm": 0.1183851808309555, + "learning_rate": 0.00018720161639486217, + "loss": 0.1767, + "step": 4439 + }, + { + "epoch": 0.32035787726829973, + "grad_norm": 0.11320844292640686, + "learning_rate": 0.00018719872997546543, + "loss": 0.1562, + "step": 4440 + }, + { + "epoch": 0.3204300299433602, + "grad_norm": 0.11692800372838974, + "learning_rate": 0.0001871958435560687, + "loss": 0.1715, + "step": 4441 + }, + { + "epoch": 0.32050218261842056, + "grad_norm": 0.11715764552354813, + "learning_rate": 0.00018719295713667198, + "loss": 0.1428, + "step": 4442 + }, + { + "epoch": 0.320574335293481, + "grad_norm": 0.10514655709266663, + "learning_rate": 0.00018719007071727524, + "loss": 0.13, + "step": 4443 + }, + { + "epoch": 0.32064648796854145, + "grad_norm": 0.10924189537763596, + "learning_rate": 0.00018718718429787848, + "loss": 0.1128, + "step": 4444 + }, + { + "epoch": 0.32071864064360184, + "grad_norm": 0.0940491333603859, + "learning_rate": 0.00018718429787848174, + "loss": 0.1718, + "step": 4445 + }, + { + "epoch": 0.3207907933186623, + "grad_norm": 0.10690921545028687, + "learning_rate": 0.000187181411459085, + "loss": 0.1282, + "step": 4446 + }, + { + "epoch": 0.3208629459937227, + "grad_norm": 0.09734002500772476, + "learning_rate": 0.00018717852503968827, + "loss": 0.1351, + "step": 4447 + }, + { + "epoch": 0.32093509866878317, + "grad_norm": 0.11302919685840607, + "learning_rate": 0.00018717563862029153, + "loss": 0.1471, + "step": 4448 + }, + { + "epoch": 0.32100725134384356, + "grad_norm": 0.1114354357123375, + "learning_rate": 0.0001871727522008948, + "loss": 0.1741, + "step": 4449 + }, + { + "epoch": 0.321079404018904, + "grad_norm": 0.09100287407636642, + "learning_rate": 0.00018716986578149806, + "loss": 0.1366, + "step": 4450 + }, + { + "epoch": 0.32115155669396445, + "grad_norm": 0.11997717618942261, + "learning_rate": 0.00018716697936210132, + "loss": 0.2065, + "step": 4451 + }, + { + "epoch": 0.32122370936902483, + "grad_norm": 0.11886655539274216, + "learning_rate": 0.00018716409294270458, + "loss": 0.1109, + "step": 4452 + }, + { + "epoch": 0.3212958620440853, + "grad_norm": 0.13030387461185455, + "learning_rate": 0.00018716120652330785, + "loss": 0.1574, + "step": 4453 + }, + { + "epoch": 0.3213680147191457, + "grad_norm": 0.11604657024145126, + "learning_rate": 0.0001871583201039111, + "loss": 0.1803, + "step": 4454 + }, + { + "epoch": 0.32144016739420617, + "grad_norm": 0.13347670435905457, + "learning_rate": 0.00018715543368451435, + "loss": 0.19, + "step": 4455 + }, + { + "epoch": 0.32151232006926656, + "grad_norm": 0.09421198815107346, + "learning_rate": 0.00018715254726511764, + "loss": 0.1779, + "step": 4456 + }, + { + "epoch": 0.321584472744327, + "grad_norm": 0.11267413198947906, + "learning_rate": 0.0001871496608457209, + "loss": 0.1803, + "step": 4457 + }, + { + "epoch": 0.32165662541938744, + "grad_norm": 0.12656359374523163, + "learning_rate": 0.00018714677442632416, + "loss": 0.1604, + "step": 4458 + }, + { + "epoch": 0.32172877809444783, + "grad_norm": 0.1371208280324936, + "learning_rate": 0.00018714388800692742, + "loss": 0.1612, + "step": 4459 + }, + { + "epoch": 0.3218009307695083, + "grad_norm": 0.08244256675243378, + "learning_rate": 0.00018714100158753066, + "loss": 0.1081, + "step": 4460 + }, + { + "epoch": 0.3218730834445687, + "grad_norm": 0.10778731852769852, + "learning_rate": 0.00018713811516813392, + "loss": 0.2065, + "step": 4461 + }, + { + "epoch": 0.3219452361196291, + "grad_norm": 0.11527123302221298, + "learning_rate": 0.00018713522874873719, + "loss": 0.1305, + "step": 4462 + }, + { + "epoch": 0.32201738879468955, + "grad_norm": 0.10875330120325089, + "learning_rate": 0.00018713234232934048, + "loss": 0.1715, + "step": 4463 + }, + { + "epoch": 0.32208954146975, + "grad_norm": 0.09810987859964371, + "learning_rate": 0.00018712945590994374, + "loss": 0.1827, + "step": 4464 + }, + { + "epoch": 0.32216169414481044, + "grad_norm": 0.1507887840270996, + "learning_rate": 0.00018712656949054697, + "loss": 0.1881, + "step": 4465 + }, + { + "epoch": 0.32223384681987083, + "grad_norm": 0.10882745683193207, + "learning_rate": 0.00018712368307115024, + "loss": 0.1538, + "step": 4466 + }, + { + "epoch": 0.3223059994949313, + "grad_norm": 0.14391076564788818, + "learning_rate": 0.0001871207966517535, + "loss": 0.1758, + "step": 4467 + }, + { + "epoch": 0.3223781521699917, + "grad_norm": 0.10498519986867905, + "learning_rate": 0.00018711791023235676, + "loss": 0.1741, + "step": 4468 + }, + { + "epoch": 0.3224503048450521, + "grad_norm": 0.11114256083965302, + "learning_rate": 0.00018711502381296003, + "loss": 0.1855, + "step": 4469 + }, + { + "epoch": 0.32252245752011255, + "grad_norm": 0.1319652646780014, + "learning_rate": 0.0001871121373935633, + "loss": 0.1523, + "step": 4470 + }, + { + "epoch": 0.322594610195173, + "grad_norm": 0.13710534572601318, + "learning_rate": 0.00018710925097416655, + "loss": 0.144, + "step": 4471 + }, + { + "epoch": 0.32266676287023344, + "grad_norm": 0.11060652136802673, + "learning_rate": 0.00018710636455476982, + "loss": 0.1513, + "step": 4472 + }, + { + "epoch": 0.3227389155452938, + "grad_norm": 0.13275280594825745, + "learning_rate": 0.00018710347813537308, + "loss": 0.1882, + "step": 4473 + }, + { + "epoch": 0.32281106822035427, + "grad_norm": 0.09653960913419724, + "learning_rate": 0.00018710059171597634, + "loss": 0.1658, + "step": 4474 + }, + { + "epoch": 0.3228832208954147, + "grad_norm": 0.11430240422487259, + "learning_rate": 0.0001870977052965796, + "loss": 0.1951, + "step": 4475 + }, + { + "epoch": 0.3229553735704751, + "grad_norm": 0.12474402785301208, + "learning_rate": 0.00018709481887718287, + "loss": 0.1901, + "step": 4476 + }, + { + "epoch": 0.32302752624553555, + "grad_norm": 0.138519287109375, + "learning_rate": 0.00018709193245778613, + "loss": 0.1691, + "step": 4477 + }, + { + "epoch": 0.323099678920596, + "grad_norm": 0.13189628720283508, + "learning_rate": 0.0001870890460383894, + "loss": 0.1214, + "step": 4478 + }, + { + "epoch": 0.32317183159565643, + "grad_norm": 0.12156566232442856, + "learning_rate": 0.00018708615961899266, + "loss": 0.1632, + "step": 4479 + }, + { + "epoch": 0.3232439842707168, + "grad_norm": 0.1292414367198944, + "learning_rate": 0.00018708327319959592, + "loss": 0.1777, + "step": 4480 + }, + { + "epoch": 0.32331613694577727, + "grad_norm": 0.1226365715265274, + "learning_rate": 0.00018708038678019918, + "loss": 0.1383, + "step": 4481 + }, + { + "epoch": 0.3233882896208377, + "grad_norm": 0.11465481668710709, + "learning_rate": 0.00018707750036080242, + "loss": 0.1404, + "step": 4482 + }, + { + "epoch": 0.3234604422958981, + "grad_norm": 0.12336615473031998, + "learning_rate": 0.00018707461394140568, + "loss": 0.1984, + "step": 4483 + }, + { + "epoch": 0.32353259497095854, + "grad_norm": 0.10501537472009659, + "learning_rate": 0.00018707172752200897, + "loss": 0.1069, + "step": 4484 + }, + { + "epoch": 0.323604747646019, + "grad_norm": 0.12265104800462723, + "learning_rate": 0.00018706884110261223, + "loss": 0.1748, + "step": 4485 + }, + { + "epoch": 0.32367690032107943, + "grad_norm": 0.1346326619386673, + "learning_rate": 0.0001870659546832155, + "loss": 0.1089, + "step": 4486 + }, + { + "epoch": 0.3237490529961398, + "grad_norm": 0.1227821558713913, + "learning_rate": 0.00018706306826381873, + "loss": 0.1666, + "step": 4487 + }, + { + "epoch": 0.32382120567120026, + "grad_norm": 0.1063651442527771, + "learning_rate": 0.000187060181844422, + "loss": 0.1539, + "step": 4488 + }, + { + "epoch": 0.3238933583462607, + "grad_norm": 0.09898777306079865, + "learning_rate": 0.00018705729542502526, + "loss": 0.1503, + "step": 4489 + }, + { + "epoch": 0.3239655110213211, + "grad_norm": 0.12139619141817093, + "learning_rate": 0.00018705440900562852, + "loss": 0.1826, + "step": 4490 + }, + { + "epoch": 0.32403766369638154, + "grad_norm": 0.11227799952030182, + "learning_rate": 0.0001870515225862318, + "loss": 0.2026, + "step": 4491 + }, + { + "epoch": 0.324109816371442, + "grad_norm": 0.09174603223800659, + "learning_rate": 0.00018704863616683505, + "loss": 0.1523, + "step": 4492 + }, + { + "epoch": 0.32418196904650237, + "grad_norm": 0.08484648168087006, + "learning_rate": 0.0001870457497474383, + "loss": 0.149, + "step": 4493 + }, + { + "epoch": 0.3242541217215628, + "grad_norm": 0.13810379803180695, + "learning_rate": 0.00018704286332804157, + "loss": 0.1605, + "step": 4494 + }, + { + "epoch": 0.32432627439662326, + "grad_norm": 0.10621599107980728, + "learning_rate": 0.00018703997690864484, + "loss": 0.1376, + "step": 4495 + }, + { + "epoch": 0.3243984270716837, + "grad_norm": 0.11542864143848419, + "learning_rate": 0.0001870370904892481, + "loss": 0.1821, + "step": 4496 + }, + { + "epoch": 0.3244705797467441, + "grad_norm": 0.0962272360920906, + "learning_rate": 0.00018703420406985136, + "loss": 0.1415, + "step": 4497 + }, + { + "epoch": 0.32454273242180454, + "grad_norm": 0.13142697513103485, + "learning_rate": 0.00018703131765045462, + "loss": 0.1624, + "step": 4498 + }, + { + "epoch": 0.324614885096865, + "grad_norm": 0.1143009141087532, + "learning_rate": 0.0001870284312310579, + "loss": 0.1921, + "step": 4499 + }, + { + "epoch": 0.32468703777192537, + "grad_norm": 0.10246312618255615, + "learning_rate": 0.00018702554481166115, + "loss": 0.1578, + "step": 4500 + }, + { + "epoch": 0.3247591904469858, + "grad_norm": 0.11688818782567978, + "learning_rate": 0.0001870226583922644, + "loss": 0.1536, + "step": 4501 + }, + { + "epoch": 0.32483134312204626, + "grad_norm": 0.1425468921661377, + "learning_rate": 0.00018701977197286768, + "loss": 0.1644, + "step": 4502 + }, + { + "epoch": 0.3249034957971067, + "grad_norm": 0.10924214869737625, + "learning_rate": 0.0001870168855534709, + "loss": 0.2426, + "step": 4503 + }, + { + "epoch": 0.3249756484721671, + "grad_norm": 0.11056023836135864, + "learning_rate": 0.00018701399913407417, + "loss": 0.1529, + "step": 4504 + }, + { + "epoch": 0.32504780114722753, + "grad_norm": 0.08694283664226532, + "learning_rate": 0.00018701111271467746, + "loss": 0.1748, + "step": 4505 + }, + { + "epoch": 0.325119953822288, + "grad_norm": 0.10328497737646103, + "learning_rate": 0.00018700822629528073, + "loss": 0.141, + "step": 4506 + }, + { + "epoch": 0.32519210649734837, + "grad_norm": 0.1171845868229866, + "learning_rate": 0.000187005339875884, + "loss": 0.1579, + "step": 4507 + }, + { + "epoch": 0.3252642591724088, + "grad_norm": 0.09219054132699966, + "learning_rate": 0.00018700245345648723, + "loss": 0.1664, + "step": 4508 + }, + { + "epoch": 0.32533641184746925, + "grad_norm": 0.1340274214744568, + "learning_rate": 0.0001869995670370905, + "loss": 0.1391, + "step": 4509 + }, + { + "epoch": 0.3254085645225297, + "grad_norm": 0.1330026537179947, + "learning_rate": 0.00018699668061769375, + "loss": 0.1515, + "step": 4510 + }, + { + "epoch": 0.3254807171975901, + "grad_norm": 0.10810331255197525, + "learning_rate": 0.00018699379419829702, + "loss": 0.1008, + "step": 4511 + }, + { + "epoch": 0.32555286987265053, + "grad_norm": 0.10727111250162125, + "learning_rate": 0.0001869909077789003, + "loss": 0.142, + "step": 4512 + }, + { + "epoch": 0.325625022547711, + "grad_norm": 0.14349889755249023, + "learning_rate": 0.00018698802135950354, + "loss": 0.1649, + "step": 4513 + }, + { + "epoch": 0.32569717522277136, + "grad_norm": 0.09554945677518845, + "learning_rate": 0.0001869851349401068, + "loss": 0.1674, + "step": 4514 + }, + { + "epoch": 0.3257693278978318, + "grad_norm": 0.09096377342939377, + "learning_rate": 0.00018698224852071007, + "loss": 0.1298, + "step": 4515 + }, + { + "epoch": 0.32584148057289225, + "grad_norm": 0.1272154152393341, + "learning_rate": 0.00018697936210131333, + "loss": 0.1946, + "step": 4516 + }, + { + "epoch": 0.3259136332479527, + "grad_norm": 0.10733834654092789, + "learning_rate": 0.0001869764756819166, + "loss": 0.1779, + "step": 4517 + }, + { + "epoch": 0.3259857859230131, + "grad_norm": 0.10842615365982056, + "learning_rate": 0.00018697358926251986, + "loss": 0.1721, + "step": 4518 + }, + { + "epoch": 0.3260579385980735, + "grad_norm": 0.10522483289241791, + "learning_rate": 0.00018697070284312312, + "loss": 0.1295, + "step": 4519 + }, + { + "epoch": 0.32613009127313397, + "grad_norm": 0.1101028248667717, + "learning_rate": 0.00018696781642372638, + "loss": 0.1372, + "step": 4520 + }, + { + "epoch": 0.32620224394819436, + "grad_norm": 0.10599458962678909, + "learning_rate": 0.00018696493000432964, + "loss": 0.1429, + "step": 4521 + }, + { + "epoch": 0.3262743966232548, + "grad_norm": 0.10401427745819092, + "learning_rate": 0.0001869620435849329, + "loss": 0.1497, + "step": 4522 + }, + { + "epoch": 0.32634654929831525, + "grad_norm": 0.11688731610774994, + "learning_rate": 0.00018695915716553617, + "loss": 0.1729, + "step": 4523 + }, + { + "epoch": 0.32641870197337564, + "grad_norm": 0.11261221021413803, + "learning_rate": 0.0001869562707461394, + "loss": 0.2113, + "step": 4524 + }, + { + "epoch": 0.3264908546484361, + "grad_norm": 0.1077963262796402, + "learning_rate": 0.00018695338432674267, + "loss": 0.1665, + "step": 4525 + }, + { + "epoch": 0.3265630073234965, + "grad_norm": 0.11620333045721054, + "learning_rate": 0.00018695049790734596, + "loss": 0.1312, + "step": 4526 + }, + { + "epoch": 0.32663515999855697, + "grad_norm": 0.10986893624067307, + "learning_rate": 0.00018694761148794922, + "loss": 0.1637, + "step": 4527 + }, + { + "epoch": 0.32670731267361736, + "grad_norm": 0.12115433067083359, + "learning_rate": 0.00018694472506855248, + "loss": 0.1028, + "step": 4528 + }, + { + "epoch": 0.3267794653486778, + "grad_norm": 0.0897984430193901, + "learning_rate": 0.00018694183864915572, + "loss": 0.13, + "step": 4529 + }, + { + "epoch": 0.32685161802373824, + "grad_norm": 0.11984968930482864, + "learning_rate": 0.00018693895222975898, + "loss": 0.1606, + "step": 4530 + }, + { + "epoch": 0.32692377069879863, + "grad_norm": 0.09984946250915527, + "learning_rate": 0.00018693606581036225, + "loss": 0.1656, + "step": 4531 + }, + { + "epoch": 0.3269959233738591, + "grad_norm": 0.11888393014669418, + "learning_rate": 0.0001869331793909655, + "loss": 0.1907, + "step": 4532 + }, + { + "epoch": 0.3270680760489195, + "grad_norm": 0.11285512894392014, + "learning_rate": 0.0001869302929715688, + "loss": 0.1239, + "step": 4533 + }, + { + "epoch": 0.32714022872397996, + "grad_norm": 0.12616856396198273, + "learning_rate": 0.00018692740655217204, + "loss": 0.1966, + "step": 4534 + }, + { + "epoch": 0.32721238139904035, + "grad_norm": 0.12473762035369873, + "learning_rate": 0.0001869245201327753, + "loss": 0.1923, + "step": 4535 + }, + { + "epoch": 0.3272845340741008, + "grad_norm": 0.09554195404052734, + "learning_rate": 0.00018692163371337856, + "loss": 0.143, + "step": 4536 + }, + { + "epoch": 0.32735668674916124, + "grad_norm": 0.10972704738378525, + "learning_rate": 0.00018691874729398182, + "loss": 0.106, + "step": 4537 + }, + { + "epoch": 0.32742883942422163, + "grad_norm": 0.10398998856544495, + "learning_rate": 0.0001869158608745851, + "loss": 0.174, + "step": 4538 + }, + { + "epoch": 0.3275009920992821, + "grad_norm": 0.1194574236869812, + "learning_rate": 0.00018691297445518835, + "loss": 0.1593, + "step": 4539 + }, + { + "epoch": 0.3275731447743425, + "grad_norm": 0.09499070048332214, + "learning_rate": 0.0001869100880357916, + "loss": 0.1614, + "step": 4540 + }, + { + "epoch": 0.32764529744940296, + "grad_norm": 0.11292389035224915, + "learning_rate": 0.00018690720161639488, + "loss": 0.1857, + "step": 4541 + }, + { + "epoch": 0.32771745012446335, + "grad_norm": 0.10467950254678726, + "learning_rate": 0.00018690431519699814, + "loss": 0.1776, + "step": 4542 + }, + { + "epoch": 0.3277896027995238, + "grad_norm": 0.13810518383979797, + "learning_rate": 0.0001869014287776014, + "loss": 0.1982, + "step": 4543 + }, + { + "epoch": 0.32786175547458424, + "grad_norm": 0.09907660633325577, + "learning_rate": 0.00018689854235820466, + "loss": 0.1903, + "step": 4544 + }, + { + "epoch": 0.3279339081496446, + "grad_norm": 0.09725797176361084, + "learning_rate": 0.0001868956559388079, + "loss": 0.1079, + "step": 4545 + }, + { + "epoch": 0.32800606082470507, + "grad_norm": 0.09951073676347733, + "learning_rate": 0.00018689276951941116, + "loss": 0.1275, + "step": 4546 + }, + { + "epoch": 0.3280782134997655, + "grad_norm": 0.11599034070968628, + "learning_rate": 0.00018688988310001445, + "loss": 0.1522, + "step": 4547 + }, + { + "epoch": 0.32815036617482596, + "grad_norm": 0.11120975017547607, + "learning_rate": 0.00018688699668061772, + "loss": 0.2065, + "step": 4548 + }, + { + "epoch": 0.32822251884988635, + "grad_norm": 0.1209036186337471, + "learning_rate": 0.00018688411026122098, + "loss": 0.1301, + "step": 4549 + }, + { + "epoch": 0.3282946715249468, + "grad_norm": 0.13418681919574738, + "learning_rate": 0.00018688122384182421, + "loss": 0.143, + "step": 4550 + }, + { + "epoch": 0.32836682420000723, + "grad_norm": 0.12810713052749634, + "learning_rate": 0.00018687833742242748, + "loss": 0.185, + "step": 4551 + }, + { + "epoch": 0.3284389768750676, + "grad_norm": 0.12670108675956726, + "learning_rate": 0.00018687545100303074, + "loss": 0.1891, + "step": 4552 + }, + { + "epoch": 0.32851112955012807, + "grad_norm": 0.09912601858377457, + "learning_rate": 0.000186872564583634, + "loss": 0.189, + "step": 4553 + }, + { + "epoch": 0.3285832822251885, + "grad_norm": 0.10828938335180283, + "learning_rate": 0.0001868696781642373, + "loss": 0.1458, + "step": 4554 + }, + { + "epoch": 0.3286554349002489, + "grad_norm": 0.12141595780849457, + "learning_rate": 0.00018686679174484053, + "loss": 0.1673, + "step": 4555 + }, + { + "epoch": 0.32872758757530934, + "grad_norm": 0.09101445972919464, + "learning_rate": 0.0001868639053254438, + "loss": 0.177, + "step": 4556 + }, + { + "epoch": 0.3287997402503698, + "grad_norm": 0.10998333990573883, + "learning_rate": 0.00018686101890604706, + "loss": 0.1473, + "step": 4557 + }, + { + "epoch": 0.32887189292543023, + "grad_norm": 0.10678628832101822, + "learning_rate": 0.00018685813248665032, + "loss": 0.1801, + "step": 4558 + }, + { + "epoch": 0.3289440456004906, + "grad_norm": 0.12848390638828278, + "learning_rate": 0.00018685524606725358, + "loss": 0.184, + "step": 4559 + }, + { + "epoch": 0.32901619827555106, + "grad_norm": 0.10749983787536621, + "learning_rate": 0.00018685235964785684, + "loss": 0.1449, + "step": 4560 + }, + { + "epoch": 0.3290883509506115, + "grad_norm": 0.1044686958193779, + "learning_rate": 0.0001868494732284601, + "loss": 0.1846, + "step": 4561 + }, + { + "epoch": 0.3291605036256719, + "grad_norm": 0.1108483299612999, + "learning_rate": 0.00018684658680906337, + "loss": 0.184, + "step": 4562 + }, + { + "epoch": 0.32923265630073234, + "grad_norm": 0.13553328812122345, + "learning_rate": 0.00018684370038966663, + "loss": 0.1676, + "step": 4563 + }, + { + "epoch": 0.3293048089757928, + "grad_norm": 0.10779878497123718, + "learning_rate": 0.0001868408139702699, + "loss": 0.111, + "step": 4564 + }, + { + "epoch": 0.32937696165085323, + "grad_norm": 0.10668893903493881, + "learning_rate": 0.00018683792755087316, + "loss": 0.1699, + "step": 4565 + }, + { + "epoch": 0.3294491143259136, + "grad_norm": 0.10057191550731659, + "learning_rate": 0.0001868350411314764, + "loss": 0.1336, + "step": 4566 + }, + { + "epoch": 0.32952126700097406, + "grad_norm": 0.10133133828639984, + "learning_rate": 0.00018683215471207966, + "loss": 0.1269, + "step": 4567 + }, + { + "epoch": 0.3295934196760345, + "grad_norm": 0.11621999740600586, + "learning_rate": 0.00018682926829268295, + "loss": 0.1926, + "step": 4568 + }, + { + "epoch": 0.3296655723510949, + "grad_norm": 0.10976468026638031, + "learning_rate": 0.0001868263818732862, + "loss": 0.1501, + "step": 4569 + }, + { + "epoch": 0.32973772502615534, + "grad_norm": 0.1117042601108551, + "learning_rate": 0.00018682349545388947, + "loss": 0.1374, + "step": 4570 + }, + { + "epoch": 0.3298098777012158, + "grad_norm": 0.12507273256778717, + "learning_rate": 0.0001868206090344927, + "loss": 0.1484, + "step": 4571 + }, + { + "epoch": 0.3298820303762762, + "grad_norm": 0.1241648867726326, + "learning_rate": 0.00018681772261509597, + "loss": 0.1712, + "step": 4572 + }, + { + "epoch": 0.3299541830513366, + "grad_norm": 0.1172139123082161, + "learning_rate": 0.00018681483619569923, + "loss": 0.1523, + "step": 4573 + }, + { + "epoch": 0.33002633572639706, + "grad_norm": 0.14421804249286652, + "learning_rate": 0.0001868119497763025, + "loss": 0.1611, + "step": 4574 + }, + { + "epoch": 0.3300984884014575, + "grad_norm": 0.12426183372735977, + "learning_rate": 0.0001868090633569058, + "loss": 0.1317, + "step": 4575 + }, + { + "epoch": 0.3301706410765179, + "grad_norm": 0.1330544501543045, + "learning_rate": 0.00018680617693750902, + "loss": 0.1573, + "step": 4576 + }, + { + "epoch": 0.33024279375157833, + "grad_norm": 0.10799801349639893, + "learning_rate": 0.0001868032905181123, + "loss": 0.18, + "step": 4577 + }, + { + "epoch": 0.3303149464266388, + "grad_norm": 0.11260545998811722, + "learning_rate": 0.00018680040409871555, + "loss": 0.1513, + "step": 4578 + }, + { + "epoch": 0.3303870991016992, + "grad_norm": 0.12739793956279755, + "learning_rate": 0.0001867975176793188, + "loss": 0.1609, + "step": 4579 + }, + { + "epoch": 0.3304592517767596, + "grad_norm": 0.10804455727338791, + "learning_rate": 0.00018679463125992208, + "loss": 0.1712, + "step": 4580 + }, + { + "epoch": 0.33053140445182005, + "grad_norm": 0.1295986771583557, + "learning_rate": 0.00018679174484052534, + "loss": 0.1722, + "step": 4581 + }, + { + "epoch": 0.3306035571268805, + "grad_norm": 0.10611939430236816, + "learning_rate": 0.0001867888584211286, + "loss": 0.1396, + "step": 4582 + }, + { + "epoch": 0.3306757098019409, + "grad_norm": 0.1253766566514969, + "learning_rate": 0.00018678597200173186, + "loss": 0.1414, + "step": 4583 + }, + { + "epoch": 0.33074786247700133, + "grad_norm": 0.10080047696828842, + "learning_rate": 0.00018678308558233513, + "loss": 0.1893, + "step": 4584 + }, + { + "epoch": 0.3308200151520618, + "grad_norm": 0.1438768357038498, + "learning_rate": 0.0001867801991629384, + "loss": 0.187, + "step": 4585 + }, + { + "epoch": 0.33089216782712216, + "grad_norm": 0.1150486096739769, + "learning_rate": 0.00018677731274354165, + "loss": 0.1851, + "step": 4586 + }, + { + "epoch": 0.3309643205021826, + "grad_norm": 0.11817629635334015, + "learning_rate": 0.0001867744263241449, + "loss": 0.1796, + "step": 4587 + }, + { + "epoch": 0.33103647317724305, + "grad_norm": 0.11434055864810944, + "learning_rate": 0.00018677153990474815, + "loss": 0.1736, + "step": 4588 + }, + { + "epoch": 0.3311086258523035, + "grad_norm": 0.12488023936748505, + "learning_rate": 0.00018676865348535144, + "loss": 0.1656, + "step": 4589 + }, + { + "epoch": 0.3311807785273639, + "grad_norm": 0.11203234642744064, + "learning_rate": 0.0001867657670659547, + "loss": 0.1106, + "step": 4590 + }, + { + "epoch": 0.33125293120242433, + "grad_norm": 0.1347704976797104, + "learning_rate": 0.00018676288064655797, + "loss": 0.1662, + "step": 4591 + }, + { + "epoch": 0.33132508387748477, + "grad_norm": 0.11631891131401062, + "learning_rate": 0.0001867599942271612, + "loss": 0.1512, + "step": 4592 + }, + { + "epoch": 0.33139723655254516, + "grad_norm": 0.1219472661614418, + "learning_rate": 0.00018675710780776447, + "loss": 0.1522, + "step": 4593 + }, + { + "epoch": 0.3314693892276056, + "grad_norm": 0.09370163828134537, + "learning_rate": 0.00018675422138836773, + "loss": 0.1296, + "step": 4594 + }, + { + "epoch": 0.33154154190266605, + "grad_norm": 0.12474989145994186, + "learning_rate": 0.000186751334968971, + "loss": 0.169, + "step": 4595 + }, + { + "epoch": 0.3316136945777265, + "grad_norm": 0.10887616872787476, + "learning_rate": 0.00018674844854957428, + "loss": 0.1326, + "step": 4596 + }, + { + "epoch": 0.3316858472527869, + "grad_norm": 0.14505040645599365, + "learning_rate": 0.00018674556213017752, + "loss": 0.2107, + "step": 4597 + }, + { + "epoch": 0.3317579999278473, + "grad_norm": 0.10402046889066696, + "learning_rate": 0.00018674267571078078, + "loss": 0.1095, + "step": 4598 + }, + { + "epoch": 0.33183015260290777, + "grad_norm": 0.10427999496459961, + "learning_rate": 0.00018673978929138404, + "loss": 0.145, + "step": 4599 + }, + { + "epoch": 0.33190230527796816, + "grad_norm": 0.1119854524731636, + "learning_rate": 0.0001867369028719873, + "loss": 0.1437, + "step": 4600 + }, + { + "epoch": 0.3319744579530286, + "grad_norm": 0.12867394089698792, + "learning_rate": 0.00018673401645259057, + "loss": 0.1505, + "step": 4601 + }, + { + "epoch": 0.33204661062808905, + "grad_norm": 0.11001761257648468, + "learning_rate": 0.00018673113003319383, + "loss": 0.1703, + "step": 4602 + }, + { + "epoch": 0.3321187633031495, + "grad_norm": 0.11156858503818512, + "learning_rate": 0.0001867282436137971, + "loss": 0.1937, + "step": 4603 + }, + { + "epoch": 0.3321909159782099, + "grad_norm": 0.11975778639316559, + "learning_rate": 0.00018672535719440036, + "loss": 0.1671, + "step": 4604 + }, + { + "epoch": 0.3322630686532703, + "grad_norm": 0.12916690111160278, + "learning_rate": 0.00018672247077500362, + "loss": 0.1152, + "step": 4605 + }, + { + "epoch": 0.33233522132833077, + "grad_norm": 0.10326454043388367, + "learning_rate": 0.00018671958435560688, + "loss": 0.1481, + "step": 4606 + }, + { + "epoch": 0.33240737400339115, + "grad_norm": 0.10354454070329666, + "learning_rate": 0.00018671669793621015, + "loss": 0.1201, + "step": 4607 + }, + { + "epoch": 0.3324795266784516, + "grad_norm": 0.13755939900875092, + "learning_rate": 0.0001867138115168134, + "loss": 0.1908, + "step": 4608 + }, + { + "epoch": 0.33255167935351204, + "grad_norm": 0.1091679260134697, + "learning_rate": 0.00018671092509741665, + "loss": 0.1395, + "step": 4609 + }, + { + "epoch": 0.3326238320285725, + "grad_norm": 0.10698618739843369, + "learning_rate": 0.0001867080386780199, + "loss": 0.1705, + "step": 4610 + }, + { + "epoch": 0.3326959847036329, + "grad_norm": 0.12497153133153915, + "learning_rate": 0.0001867051522586232, + "loss": 0.1841, + "step": 4611 + }, + { + "epoch": 0.3327681373786933, + "grad_norm": 0.10098680853843689, + "learning_rate": 0.00018670226583922646, + "loss": 0.1874, + "step": 4612 + }, + { + "epoch": 0.33284029005375376, + "grad_norm": 0.0964135080575943, + "learning_rate": 0.00018669937941982972, + "loss": 0.1674, + "step": 4613 + }, + { + "epoch": 0.33291244272881415, + "grad_norm": 0.10805868357419968, + "learning_rate": 0.00018669649300043296, + "loss": 0.1431, + "step": 4614 + }, + { + "epoch": 0.3329845954038746, + "grad_norm": 0.11034578830003738, + "learning_rate": 0.00018669360658103622, + "loss": 0.1658, + "step": 4615 + }, + { + "epoch": 0.33305674807893504, + "grad_norm": 0.09507668018341064, + "learning_rate": 0.00018669072016163949, + "loss": 0.1955, + "step": 4616 + }, + { + "epoch": 0.3331289007539954, + "grad_norm": 0.09780636429786682, + "learning_rate": 0.00018668783374224275, + "loss": 0.1609, + "step": 4617 + }, + { + "epoch": 0.33320105342905587, + "grad_norm": 0.09780305624008179, + "learning_rate": 0.00018668494732284604, + "loss": 0.1506, + "step": 4618 + }, + { + "epoch": 0.3332732061041163, + "grad_norm": 0.10436023026704788, + "learning_rate": 0.00018668206090344928, + "loss": 0.1397, + "step": 4619 + }, + { + "epoch": 0.33334535877917676, + "grad_norm": 0.13126035034656525, + "learning_rate": 0.00018667917448405254, + "loss": 0.1509, + "step": 4620 + }, + { + "epoch": 0.33341751145423715, + "grad_norm": 0.10076587647199631, + "learning_rate": 0.0001866762880646558, + "loss": 0.1777, + "step": 4621 + }, + { + "epoch": 0.3334896641292976, + "grad_norm": 0.10679809749126434, + "learning_rate": 0.00018667340164525906, + "loss": 0.1579, + "step": 4622 + }, + { + "epoch": 0.33356181680435804, + "grad_norm": 0.0949108749628067, + "learning_rate": 0.00018667051522586233, + "loss": 0.2041, + "step": 4623 + }, + { + "epoch": 0.3336339694794184, + "grad_norm": 0.1757906824350357, + "learning_rate": 0.0001866676288064656, + "loss": 0.1807, + "step": 4624 + }, + { + "epoch": 0.33370612215447887, + "grad_norm": 0.11894873529672623, + "learning_rate": 0.00018666474238706885, + "loss": 0.1407, + "step": 4625 + }, + { + "epoch": 0.3337782748295393, + "grad_norm": 0.12828341126441956, + "learning_rate": 0.00018666185596767212, + "loss": 0.2167, + "step": 4626 + }, + { + "epoch": 0.33385042750459976, + "grad_norm": 0.10132212191820145, + "learning_rate": 0.00018665896954827538, + "loss": 0.1045, + "step": 4627 + }, + { + "epoch": 0.33392258017966014, + "grad_norm": 0.1280662566423416, + "learning_rate": 0.00018665608312887864, + "loss": 0.1172, + "step": 4628 + }, + { + "epoch": 0.3339947328547206, + "grad_norm": 0.09809806197881699, + "learning_rate": 0.0001866531967094819, + "loss": 0.0899, + "step": 4629 + }, + { + "epoch": 0.33406688552978103, + "grad_norm": 0.13381366431713104, + "learning_rate": 0.00018665031029008514, + "loss": 0.1653, + "step": 4630 + }, + { + "epoch": 0.3341390382048414, + "grad_norm": 0.12938711047172546, + "learning_rate": 0.0001866474238706884, + "loss": 0.1243, + "step": 4631 + }, + { + "epoch": 0.33421119087990186, + "grad_norm": 0.21836337447166443, + "learning_rate": 0.0001866445374512917, + "loss": 0.1444, + "step": 4632 + }, + { + "epoch": 0.3342833435549623, + "grad_norm": 0.10769309848546982, + "learning_rate": 0.00018664165103189496, + "loss": 0.09, + "step": 4633 + }, + { + "epoch": 0.33435549623002275, + "grad_norm": 0.13226410746574402, + "learning_rate": 0.00018663876461249822, + "loss": 0.159, + "step": 4634 + }, + { + "epoch": 0.33442764890508314, + "grad_norm": 0.1118188351392746, + "learning_rate": 0.00018663587819310145, + "loss": 0.1398, + "step": 4635 + }, + { + "epoch": 0.3344998015801436, + "grad_norm": 0.11711820214986801, + "learning_rate": 0.00018663299177370472, + "loss": 0.1487, + "step": 4636 + }, + { + "epoch": 0.33457195425520403, + "grad_norm": 0.10661720484495163, + "learning_rate": 0.00018663010535430798, + "loss": 0.0891, + "step": 4637 + }, + { + "epoch": 0.3346441069302644, + "grad_norm": 0.11856275051832199, + "learning_rate": 0.00018662721893491124, + "loss": 0.1348, + "step": 4638 + }, + { + "epoch": 0.33471625960532486, + "grad_norm": 0.1158871129155159, + "learning_rate": 0.00018662433251551453, + "loss": 0.158, + "step": 4639 + }, + { + "epoch": 0.3347884122803853, + "grad_norm": 0.11150803416967392, + "learning_rate": 0.00018662144609611777, + "loss": 0.16, + "step": 4640 + }, + { + "epoch": 0.33486056495544575, + "grad_norm": 0.09448430687189102, + "learning_rate": 0.00018661855967672103, + "loss": 0.1971, + "step": 4641 + }, + { + "epoch": 0.33493271763050614, + "grad_norm": 0.10407508164644241, + "learning_rate": 0.0001866156732573243, + "loss": 0.1422, + "step": 4642 + }, + { + "epoch": 0.3350048703055666, + "grad_norm": 0.1328696757555008, + "learning_rate": 0.00018661278683792756, + "loss": 0.1603, + "step": 4643 + }, + { + "epoch": 0.335077022980627, + "grad_norm": 0.11169770359992981, + "learning_rate": 0.00018660990041853082, + "loss": 0.2227, + "step": 4644 + }, + { + "epoch": 0.3351491756556874, + "grad_norm": 0.12166395038366318, + "learning_rate": 0.00018660701399913408, + "loss": 0.129, + "step": 4645 + }, + { + "epoch": 0.33522132833074786, + "grad_norm": 0.17166155576705933, + "learning_rate": 0.00018660412757973735, + "loss": 0.1548, + "step": 4646 + }, + { + "epoch": 0.3352934810058083, + "grad_norm": 0.11148369312286377, + "learning_rate": 0.0001866012411603406, + "loss": 0.1503, + "step": 4647 + }, + { + "epoch": 0.3353656336808687, + "grad_norm": 0.09874880313873291, + "learning_rate": 0.00018659835474094387, + "loss": 0.1783, + "step": 4648 + }, + { + "epoch": 0.33543778635592914, + "grad_norm": 0.11467466503381729, + "learning_rate": 0.00018659546832154714, + "loss": 0.1721, + "step": 4649 + }, + { + "epoch": 0.3355099390309896, + "grad_norm": 0.14663182199001312, + "learning_rate": 0.0001865925819021504, + "loss": 0.1608, + "step": 4650 + }, + { + "epoch": 0.33558209170605, + "grad_norm": 0.10331515222787857, + "learning_rate": 0.00018658969548275363, + "loss": 0.1141, + "step": 4651 + }, + { + "epoch": 0.3356542443811104, + "grad_norm": 0.11265163868665695, + "learning_rate": 0.0001865868090633569, + "loss": 0.1506, + "step": 4652 + }, + { + "epoch": 0.33572639705617086, + "grad_norm": 0.12390236556529999, + "learning_rate": 0.0001865839226439602, + "loss": 0.2137, + "step": 4653 + }, + { + "epoch": 0.3357985497312313, + "grad_norm": 0.11463173478841782, + "learning_rate": 0.00018658103622456345, + "loss": 0.18, + "step": 4654 + }, + { + "epoch": 0.3358707024062917, + "grad_norm": 0.10468676686286926, + "learning_rate": 0.0001865781498051667, + "loss": 0.1382, + "step": 4655 + }, + { + "epoch": 0.33594285508135213, + "grad_norm": 0.12385626882314682, + "learning_rate": 0.00018657526338576995, + "loss": 0.1558, + "step": 4656 + }, + { + "epoch": 0.3360150077564126, + "grad_norm": 0.11396743357181549, + "learning_rate": 0.0001865723769663732, + "loss": 0.1245, + "step": 4657 + }, + { + "epoch": 0.336087160431473, + "grad_norm": 0.10428833216428757, + "learning_rate": 0.00018656949054697647, + "loss": 0.134, + "step": 4658 + }, + { + "epoch": 0.3361593131065334, + "grad_norm": 0.11879660189151764, + "learning_rate": 0.00018656660412757974, + "loss": 0.2264, + "step": 4659 + }, + { + "epoch": 0.33623146578159385, + "grad_norm": 0.09917629510164261, + "learning_rate": 0.00018656371770818303, + "loss": 0.1545, + "step": 4660 + }, + { + "epoch": 0.3363036184566543, + "grad_norm": 0.11942600458860397, + "learning_rate": 0.00018656083128878626, + "loss": 0.1021, + "step": 4661 + }, + { + "epoch": 0.3363757711317147, + "grad_norm": 0.11812159419059753, + "learning_rate": 0.00018655794486938953, + "loss": 0.1416, + "step": 4662 + }, + { + "epoch": 0.33644792380677513, + "grad_norm": 0.1421084702014923, + "learning_rate": 0.0001865550584499928, + "loss": 0.1365, + "step": 4663 + }, + { + "epoch": 0.3365200764818356, + "grad_norm": 0.142777681350708, + "learning_rate": 0.00018655217203059605, + "loss": 0.1786, + "step": 4664 + }, + { + "epoch": 0.336592229156896, + "grad_norm": 0.11653783172369003, + "learning_rate": 0.00018654928561119932, + "loss": 0.1993, + "step": 4665 + }, + { + "epoch": 0.3366643818319564, + "grad_norm": 0.13131196796894073, + "learning_rate": 0.00018654639919180258, + "loss": 0.1696, + "step": 4666 + }, + { + "epoch": 0.33673653450701685, + "grad_norm": 0.10151226073503494, + "learning_rate": 0.00018654351277240584, + "loss": 0.2048, + "step": 4667 + }, + { + "epoch": 0.3368086871820773, + "grad_norm": 0.10702992230653763, + "learning_rate": 0.0001865406263530091, + "loss": 0.1293, + "step": 4668 + }, + { + "epoch": 0.3368808398571377, + "grad_norm": 0.09864121675491333, + "learning_rate": 0.00018653773993361237, + "loss": 0.1224, + "step": 4669 + }, + { + "epoch": 0.3369529925321981, + "grad_norm": 0.121495820581913, + "learning_rate": 0.00018653485351421563, + "loss": 0.1651, + "step": 4670 + }, + { + "epoch": 0.33702514520725857, + "grad_norm": 0.10459835827350616, + "learning_rate": 0.0001865319670948189, + "loss": 0.158, + "step": 4671 + }, + { + "epoch": 0.337097297882319, + "grad_norm": 0.11049709469079971, + "learning_rate": 0.00018652908067542213, + "loss": 0.1203, + "step": 4672 + }, + { + "epoch": 0.3371694505573794, + "grad_norm": 0.10447237640619278, + "learning_rate": 0.0001865261942560254, + "loss": 0.1551, + "step": 4673 + }, + { + "epoch": 0.33724160323243985, + "grad_norm": 0.11688422411680222, + "learning_rate": 0.00018652330783662868, + "loss": 0.1868, + "step": 4674 + }, + { + "epoch": 0.3373137559075003, + "grad_norm": 0.1138431653380394, + "learning_rate": 0.00018652042141723194, + "loss": 0.1653, + "step": 4675 + }, + { + "epoch": 0.3373859085825607, + "grad_norm": 0.11060623079538345, + "learning_rate": 0.0001865175349978352, + "loss": 0.1488, + "step": 4676 + }, + { + "epoch": 0.3374580612576211, + "grad_norm": 0.12014255672693253, + "learning_rate": 0.00018651464857843844, + "loss": 0.16, + "step": 4677 + }, + { + "epoch": 0.33753021393268157, + "grad_norm": 0.1315694898366928, + "learning_rate": 0.0001865117621590417, + "loss": 0.1181, + "step": 4678 + }, + { + "epoch": 0.33760236660774195, + "grad_norm": 0.11970925331115723, + "learning_rate": 0.00018650887573964497, + "loss": 0.1441, + "step": 4679 + }, + { + "epoch": 0.3376745192828024, + "grad_norm": 0.11607395112514496, + "learning_rate": 0.00018650598932024823, + "loss": 0.1245, + "step": 4680 + }, + { + "epoch": 0.33774667195786284, + "grad_norm": 0.09468183666467667, + "learning_rate": 0.00018650310290085152, + "loss": 0.1351, + "step": 4681 + }, + { + "epoch": 0.3378188246329233, + "grad_norm": 0.1312519758939743, + "learning_rate": 0.00018650021648145476, + "loss": 0.1716, + "step": 4682 + }, + { + "epoch": 0.3378909773079837, + "grad_norm": 0.10811814665794373, + "learning_rate": 0.00018649733006205802, + "loss": 0.1249, + "step": 4683 + }, + { + "epoch": 0.3379631299830441, + "grad_norm": 0.11948755383491516, + "learning_rate": 0.00018649444364266128, + "loss": 0.1462, + "step": 4684 + }, + { + "epoch": 0.33803528265810456, + "grad_norm": 0.11248423159122467, + "learning_rate": 0.00018649155722326455, + "loss": 0.1231, + "step": 4685 + }, + { + "epoch": 0.33810743533316495, + "grad_norm": 0.10225424915552139, + "learning_rate": 0.0001864886708038678, + "loss": 0.1394, + "step": 4686 + }, + { + "epoch": 0.3381795880082254, + "grad_norm": 0.15257570147514343, + "learning_rate": 0.00018648578438447107, + "loss": 0.24, + "step": 4687 + }, + { + "epoch": 0.33825174068328584, + "grad_norm": 0.11088695377111435, + "learning_rate": 0.00018648289796507434, + "loss": 0.2021, + "step": 4688 + }, + { + "epoch": 0.3383238933583463, + "grad_norm": 0.13657927513122559, + "learning_rate": 0.0001864800115456776, + "loss": 0.1179, + "step": 4689 + }, + { + "epoch": 0.33839604603340667, + "grad_norm": 0.1033518984913826, + "learning_rate": 0.00018647712512628086, + "loss": 0.1355, + "step": 4690 + }, + { + "epoch": 0.3384681987084671, + "grad_norm": 0.13475528359413147, + "learning_rate": 0.00018647423870688412, + "loss": 0.1449, + "step": 4691 + }, + { + "epoch": 0.33854035138352756, + "grad_norm": 0.13904184103012085, + "learning_rate": 0.0001864713522874874, + "loss": 0.1383, + "step": 4692 + }, + { + "epoch": 0.33861250405858795, + "grad_norm": 0.10915260761976242, + "learning_rate": 0.00018646846586809062, + "loss": 0.114, + "step": 4693 + }, + { + "epoch": 0.3386846567336484, + "grad_norm": 0.11143431812524796, + "learning_rate": 0.00018646557944869389, + "loss": 0.1838, + "step": 4694 + }, + { + "epoch": 0.33875680940870884, + "grad_norm": 0.10445766896009445, + "learning_rate": 0.00018646269302929718, + "loss": 0.1545, + "step": 4695 + }, + { + "epoch": 0.3388289620837693, + "grad_norm": 0.1109762191772461, + "learning_rate": 0.00018645980660990044, + "loss": 0.1145, + "step": 4696 + }, + { + "epoch": 0.33890111475882967, + "grad_norm": 0.09423503279685974, + "learning_rate": 0.0001864569201905037, + "loss": 0.1932, + "step": 4697 + }, + { + "epoch": 0.3389732674338901, + "grad_norm": 0.09955920279026031, + "learning_rate": 0.00018645403377110694, + "loss": 0.1493, + "step": 4698 + }, + { + "epoch": 0.33904542010895056, + "grad_norm": 0.11547043174505234, + "learning_rate": 0.0001864511473517102, + "loss": 0.1392, + "step": 4699 + }, + { + "epoch": 0.33911757278401095, + "grad_norm": 0.11824516952037811, + "learning_rate": 0.00018644826093231346, + "loss": 0.1677, + "step": 4700 + }, + { + "epoch": 0.3391897254590714, + "grad_norm": 0.10412485152482986, + "learning_rate": 0.00018644537451291673, + "loss": 0.1393, + "step": 4701 + }, + { + "epoch": 0.33926187813413183, + "grad_norm": 0.10375717282295227, + "learning_rate": 0.00018644248809352002, + "loss": 0.1385, + "step": 4702 + }, + { + "epoch": 0.3393340308091923, + "grad_norm": 0.10352824628353119, + "learning_rate": 0.00018643960167412325, + "loss": 0.1416, + "step": 4703 + }, + { + "epoch": 0.33940618348425267, + "grad_norm": 0.10737777501344681, + "learning_rate": 0.00018643671525472652, + "loss": 0.1398, + "step": 4704 + }, + { + "epoch": 0.3394783361593131, + "grad_norm": 0.1165878176689148, + "learning_rate": 0.00018643382883532978, + "loss": 0.1748, + "step": 4705 + }, + { + "epoch": 0.33955048883437355, + "grad_norm": 0.10642968118190765, + "learning_rate": 0.00018643094241593304, + "loss": 0.1579, + "step": 4706 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.11588136106729507, + "learning_rate": 0.0001864280559965363, + "loss": 0.1507, + "step": 4707 + }, + { + "epoch": 0.3396947941844944, + "grad_norm": 0.0921846553683281, + "learning_rate": 0.00018642516957713957, + "loss": 0.1795, + "step": 4708 + }, + { + "epoch": 0.33976694685955483, + "grad_norm": 0.1140107586979866, + "learning_rate": 0.00018642228315774283, + "loss": 0.1674, + "step": 4709 + }, + { + "epoch": 0.3398390995346152, + "grad_norm": 0.09212062507867813, + "learning_rate": 0.0001864193967383461, + "loss": 0.1469, + "step": 4710 + }, + { + "epoch": 0.33991125220967566, + "grad_norm": 0.11071064323186874, + "learning_rate": 0.00018641651031894936, + "loss": 0.1464, + "step": 4711 + }, + { + "epoch": 0.3399834048847361, + "grad_norm": 0.09902884811162949, + "learning_rate": 0.00018641362389955262, + "loss": 0.1397, + "step": 4712 + }, + { + "epoch": 0.34005555755979655, + "grad_norm": 0.10399525612592697, + "learning_rate": 0.00018641073748015588, + "loss": 0.1804, + "step": 4713 + }, + { + "epoch": 0.34012771023485694, + "grad_norm": 0.1262943297624588, + "learning_rate": 0.00018640785106075914, + "loss": 0.15, + "step": 4714 + }, + { + "epoch": 0.3401998629099174, + "grad_norm": 0.10293149948120117, + "learning_rate": 0.00018640496464136238, + "loss": 0.1787, + "step": 4715 + }, + { + "epoch": 0.3402720155849778, + "grad_norm": 0.11217711120843887, + "learning_rate": 0.00018640207822196567, + "loss": 0.1346, + "step": 4716 + }, + { + "epoch": 0.3403441682600382, + "grad_norm": 0.12639397382736206, + "learning_rate": 0.00018639919180256893, + "loss": 0.1688, + "step": 4717 + }, + { + "epoch": 0.34041632093509866, + "grad_norm": 0.1675584316253662, + "learning_rate": 0.0001863963053831722, + "loss": 0.1863, + "step": 4718 + }, + { + "epoch": 0.3404884736101591, + "grad_norm": 0.12408801913261414, + "learning_rate": 0.00018639341896377546, + "loss": 0.2211, + "step": 4719 + }, + { + "epoch": 0.34056062628521955, + "grad_norm": 0.13109129667282104, + "learning_rate": 0.0001863905325443787, + "loss": 0.1582, + "step": 4720 + }, + { + "epoch": 0.34063277896027994, + "grad_norm": 0.10773653537034988, + "learning_rate": 0.00018638764612498196, + "loss": 0.1646, + "step": 4721 + }, + { + "epoch": 0.3407049316353404, + "grad_norm": 0.13233451545238495, + "learning_rate": 0.00018638475970558522, + "loss": 0.1902, + "step": 4722 + }, + { + "epoch": 0.3407770843104008, + "grad_norm": 0.12965644896030426, + "learning_rate": 0.0001863818732861885, + "loss": 0.1846, + "step": 4723 + }, + { + "epoch": 0.3408492369854612, + "grad_norm": 0.10962004214525223, + "learning_rate": 0.00018637898686679177, + "loss": 0.1629, + "step": 4724 + }, + { + "epoch": 0.34092138966052166, + "grad_norm": 0.0985800251364708, + "learning_rate": 0.000186376100447395, + "loss": 0.1614, + "step": 4725 + }, + { + "epoch": 0.3409935423355821, + "grad_norm": 0.10292194038629532, + "learning_rate": 0.00018637321402799827, + "loss": 0.1586, + "step": 4726 + }, + { + "epoch": 0.34106569501064254, + "grad_norm": 0.08835239708423615, + "learning_rate": 0.00018637032760860154, + "loss": 0.1115, + "step": 4727 + }, + { + "epoch": 0.34113784768570293, + "grad_norm": 0.09853382408618927, + "learning_rate": 0.0001863674411892048, + "loss": 0.1714, + "step": 4728 + }, + { + "epoch": 0.3412100003607634, + "grad_norm": 0.11794517189264297, + "learning_rate": 0.00018636455476980806, + "loss": 0.143, + "step": 4729 + }, + { + "epoch": 0.3412821530358238, + "grad_norm": 0.11417117714881897, + "learning_rate": 0.00018636166835041132, + "loss": 0.1719, + "step": 4730 + }, + { + "epoch": 0.3413543057108842, + "grad_norm": 0.11145757138729095, + "learning_rate": 0.0001863587819310146, + "loss": 0.1621, + "step": 4731 + }, + { + "epoch": 0.34142645838594465, + "grad_norm": 0.12171374261379242, + "learning_rate": 0.00018635589551161785, + "loss": 0.1081, + "step": 4732 + }, + { + "epoch": 0.3414986110610051, + "grad_norm": 0.14451739192008972, + "learning_rate": 0.0001863530090922211, + "loss": 0.1434, + "step": 4733 + }, + { + "epoch": 0.34157076373606554, + "grad_norm": 0.12776045501232147, + "learning_rate": 0.00018635012267282438, + "loss": 0.1579, + "step": 4734 + }, + { + "epoch": 0.34164291641112593, + "grad_norm": 0.13398566842079163, + "learning_rate": 0.00018634723625342764, + "loss": 0.1978, + "step": 4735 + }, + { + "epoch": 0.3417150690861864, + "grad_norm": 0.15044716000556946, + "learning_rate": 0.00018634434983403087, + "loss": 0.1786, + "step": 4736 + }, + { + "epoch": 0.3417872217612468, + "grad_norm": 0.10238110274076462, + "learning_rate": 0.00018634146341463416, + "loss": 0.161, + "step": 4737 + }, + { + "epoch": 0.3418593744363072, + "grad_norm": 0.13163591921329498, + "learning_rate": 0.00018633857699523743, + "loss": 0.2066, + "step": 4738 + }, + { + "epoch": 0.34193152711136765, + "grad_norm": 0.12700612843036652, + "learning_rate": 0.0001863356905758407, + "loss": 0.1646, + "step": 4739 + }, + { + "epoch": 0.3420036797864281, + "grad_norm": 0.09891757369041443, + "learning_rate": 0.00018633280415644395, + "loss": 0.1366, + "step": 4740 + }, + { + "epoch": 0.3420758324614885, + "grad_norm": 0.10575391352176666, + "learning_rate": 0.0001863299177370472, + "loss": 0.1348, + "step": 4741 + }, + { + "epoch": 0.3421479851365489, + "grad_norm": 0.10323916375637054, + "learning_rate": 0.00018632703131765045, + "loss": 0.1507, + "step": 4742 + }, + { + "epoch": 0.34222013781160937, + "grad_norm": 0.13781608641147614, + "learning_rate": 0.00018632414489825371, + "loss": 0.115, + "step": 4743 + }, + { + "epoch": 0.3422922904866698, + "grad_norm": 0.07461650669574738, + "learning_rate": 0.000186321258478857, + "loss": 0.1986, + "step": 4744 + }, + { + "epoch": 0.3423644431617302, + "grad_norm": 0.11913321167230606, + "learning_rate": 0.00018631837205946027, + "loss": 0.1754, + "step": 4745 + }, + { + "epoch": 0.34243659583679065, + "grad_norm": 0.1495174616575241, + "learning_rate": 0.0001863154856400635, + "loss": 0.1403, + "step": 4746 + }, + { + "epoch": 0.3425087485118511, + "grad_norm": 0.10709115117788315, + "learning_rate": 0.00018631259922066677, + "loss": 0.1544, + "step": 4747 + }, + { + "epoch": 0.3425809011869115, + "grad_norm": 0.10365457087755203, + "learning_rate": 0.00018630971280127003, + "loss": 0.1524, + "step": 4748 + }, + { + "epoch": 0.3426530538619719, + "grad_norm": 0.13240467011928558, + "learning_rate": 0.0001863068263818733, + "loss": 0.1697, + "step": 4749 + }, + { + "epoch": 0.34272520653703237, + "grad_norm": 0.15050604939460754, + "learning_rate": 0.00018630393996247656, + "loss": 0.1907, + "step": 4750 + }, + { + "epoch": 0.3427973592120928, + "grad_norm": 0.09507954865694046, + "learning_rate": 0.00018630105354307982, + "loss": 0.1742, + "step": 4751 + }, + { + "epoch": 0.3428695118871532, + "grad_norm": 0.12495091557502747, + "learning_rate": 0.00018629816712368308, + "loss": 0.1267, + "step": 4752 + }, + { + "epoch": 0.34294166456221364, + "grad_norm": 0.09712611883878708, + "learning_rate": 0.00018629528070428634, + "loss": 0.1199, + "step": 4753 + }, + { + "epoch": 0.3430138172372741, + "grad_norm": 0.21932922303676605, + "learning_rate": 0.0001862923942848896, + "loss": 0.1807, + "step": 4754 + }, + { + "epoch": 0.3430859699123345, + "grad_norm": 0.1500842273235321, + "learning_rate": 0.00018628950786549287, + "loss": 0.1607, + "step": 4755 + }, + { + "epoch": 0.3431581225873949, + "grad_norm": 0.11836665123701096, + "learning_rate": 0.00018628662144609613, + "loss": 0.1686, + "step": 4756 + }, + { + "epoch": 0.34323027526245536, + "grad_norm": 0.14937469363212585, + "learning_rate": 0.00018628373502669937, + "loss": 0.1744, + "step": 4757 + }, + { + "epoch": 0.3433024279375158, + "grad_norm": 0.1330706626176834, + "learning_rate": 0.00018628084860730266, + "loss": 0.1232, + "step": 4758 + }, + { + "epoch": 0.3433745806125762, + "grad_norm": 0.17299704253673553, + "learning_rate": 0.00018627796218790592, + "loss": 0.2073, + "step": 4759 + }, + { + "epoch": 0.34344673328763664, + "grad_norm": 0.1117033064365387, + "learning_rate": 0.00018627507576850918, + "loss": 0.1562, + "step": 4760 + }, + { + "epoch": 0.3435188859626971, + "grad_norm": 0.12291192263364792, + "learning_rate": 0.00018627218934911245, + "loss": 0.2009, + "step": 4761 + }, + { + "epoch": 0.3435910386377575, + "grad_norm": 0.17998120188713074, + "learning_rate": 0.00018626930292971568, + "loss": 0.1629, + "step": 4762 + }, + { + "epoch": 0.3436631913128179, + "grad_norm": 0.13401764631271362, + "learning_rate": 0.00018626641651031895, + "loss": 0.1175, + "step": 4763 + }, + { + "epoch": 0.34373534398787836, + "grad_norm": 0.09569345414638519, + "learning_rate": 0.0001862635300909222, + "loss": 0.1052, + "step": 4764 + }, + { + "epoch": 0.3438074966629388, + "grad_norm": 0.21904654800891876, + "learning_rate": 0.0001862606436715255, + "loss": 0.1244, + "step": 4765 + }, + { + "epoch": 0.3438796493379992, + "grad_norm": 0.11808110773563385, + "learning_rate": 0.00018625775725212876, + "loss": 0.1374, + "step": 4766 + }, + { + "epoch": 0.34395180201305964, + "grad_norm": 0.12179733067750931, + "learning_rate": 0.000186254870832732, + "loss": 0.1814, + "step": 4767 + }, + { + "epoch": 0.3440239546881201, + "grad_norm": 0.16868746280670166, + "learning_rate": 0.00018625198441333526, + "loss": 0.1689, + "step": 4768 + }, + { + "epoch": 0.34409610736318047, + "grad_norm": 0.17062485218048096, + "learning_rate": 0.00018624909799393852, + "loss": 0.2181, + "step": 4769 + }, + { + "epoch": 0.3441682600382409, + "grad_norm": 0.13485071063041687, + "learning_rate": 0.0001862462115745418, + "loss": 0.1138, + "step": 4770 + }, + { + "epoch": 0.34424041271330136, + "grad_norm": 0.13490718603134155, + "learning_rate": 0.00018624332515514505, + "loss": 0.1584, + "step": 4771 + }, + { + "epoch": 0.34431256538836175, + "grad_norm": 0.10937722772359848, + "learning_rate": 0.0001862404387357483, + "loss": 0.1474, + "step": 4772 + }, + { + "epoch": 0.3443847180634222, + "grad_norm": 0.11925653368234634, + "learning_rate": 0.00018623755231635158, + "loss": 0.1239, + "step": 4773 + }, + { + "epoch": 0.34445687073848263, + "grad_norm": 0.12353496998548508, + "learning_rate": 0.00018623466589695484, + "loss": 0.1302, + "step": 4774 + }, + { + "epoch": 0.3445290234135431, + "grad_norm": 0.10578591376543045, + "learning_rate": 0.0001862317794775581, + "loss": 0.1839, + "step": 4775 + }, + { + "epoch": 0.34460117608860347, + "grad_norm": 0.12525229156017303, + "learning_rate": 0.00018622889305816136, + "loss": 0.2055, + "step": 4776 + }, + { + "epoch": 0.3446733287636639, + "grad_norm": 0.09258376061916351, + "learning_rate": 0.00018622600663876463, + "loss": 0.1966, + "step": 4777 + }, + { + "epoch": 0.34474548143872435, + "grad_norm": 0.09338001906871796, + "learning_rate": 0.00018622312021936786, + "loss": 0.1321, + "step": 4778 + }, + { + "epoch": 0.34481763411378474, + "grad_norm": 0.10230891406536102, + "learning_rate": 0.00018622023379997115, + "loss": 0.1247, + "step": 4779 + }, + { + "epoch": 0.3448897867888452, + "grad_norm": 0.12309479713439941, + "learning_rate": 0.00018621734738057442, + "loss": 0.1612, + "step": 4780 + }, + { + "epoch": 0.34496193946390563, + "grad_norm": 0.12477600574493408, + "learning_rate": 0.00018621446096117768, + "loss": 0.1614, + "step": 4781 + }, + { + "epoch": 0.3450340921389661, + "grad_norm": 0.137281596660614, + "learning_rate": 0.00018621157454178094, + "loss": 0.1415, + "step": 4782 + }, + { + "epoch": 0.34510624481402646, + "grad_norm": 0.14565874636173248, + "learning_rate": 0.00018620868812238418, + "loss": 0.1686, + "step": 4783 + }, + { + "epoch": 0.3451783974890869, + "grad_norm": 0.13980144262313843, + "learning_rate": 0.00018620580170298744, + "loss": 0.1582, + "step": 4784 + }, + { + "epoch": 0.34525055016414735, + "grad_norm": 0.11648563295602798, + "learning_rate": 0.0001862029152835907, + "loss": 0.1464, + "step": 4785 + }, + { + "epoch": 0.34532270283920774, + "grad_norm": 0.09814045578241348, + "learning_rate": 0.000186200028864194, + "loss": 0.1122, + "step": 4786 + }, + { + "epoch": 0.3453948555142682, + "grad_norm": 0.11648702621459961, + "learning_rate": 0.00018619714244479726, + "loss": 0.1814, + "step": 4787 + }, + { + "epoch": 0.34546700818932863, + "grad_norm": 0.09737619012594223, + "learning_rate": 0.0001861942560254005, + "loss": 0.0984, + "step": 4788 + }, + { + "epoch": 0.34553916086438907, + "grad_norm": 0.10784975439310074, + "learning_rate": 0.00018619136960600376, + "loss": 0.1379, + "step": 4789 + }, + { + "epoch": 0.34561131353944946, + "grad_norm": 0.1196584552526474, + "learning_rate": 0.00018618848318660702, + "loss": 0.2279, + "step": 4790 + }, + { + "epoch": 0.3456834662145099, + "grad_norm": 0.10751967877149582, + "learning_rate": 0.00018618559676721028, + "loss": 0.1506, + "step": 4791 + }, + { + "epoch": 0.34575561888957035, + "grad_norm": 0.15543009340763092, + "learning_rate": 0.00018618271034781354, + "loss": 0.2072, + "step": 4792 + }, + { + "epoch": 0.34582777156463074, + "grad_norm": 0.12283115088939667, + "learning_rate": 0.0001861798239284168, + "loss": 0.1385, + "step": 4793 + }, + { + "epoch": 0.3458999242396912, + "grad_norm": 0.11519939452409744, + "learning_rate": 0.00018617693750902007, + "loss": 0.1916, + "step": 4794 + }, + { + "epoch": 0.3459720769147516, + "grad_norm": 0.11674375087022781, + "learning_rate": 0.00018617405108962333, + "loss": 0.1635, + "step": 4795 + }, + { + "epoch": 0.34604422958981207, + "grad_norm": 0.1331339031457901, + "learning_rate": 0.0001861711646702266, + "loss": 0.1459, + "step": 4796 + }, + { + "epoch": 0.34611638226487246, + "grad_norm": 0.11357157677412033, + "learning_rate": 0.00018616827825082986, + "loss": 0.1733, + "step": 4797 + }, + { + "epoch": 0.3461885349399329, + "grad_norm": 0.10810473561286926, + "learning_rate": 0.00018616539183143312, + "loss": 0.208, + "step": 4798 + }, + { + "epoch": 0.34626068761499335, + "grad_norm": 0.1509094536304474, + "learning_rate": 0.00018616250541203636, + "loss": 0.1517, + "step": 4799 + }, + { + "epoch": 0.34633284029005373, + "grad_norm": 0.10608479380607605, + "learning_rate": 0.00018615961899263965, + "loss": 0.1997, + "step": 4800 + }, + { + "epoch": 0.3464049929651142, + "grad_norm": 0.12887756526470184, + "learning_rate": 0.0001861567325732429, + "loss": 0.1386, + "step": 4801 + }, + { + "epoch": 0.3464771456401746, + "grad_norm": 0.1120765283703804, + "learning_rate": 0.00018615384615384617, + "loss": 0.099, + "step": 4802 + }, + { + "epoch": 0.346549298315235, + "grad_norm": 0.12339556962251663, + "learning_rate": 0.00018615095973444944, + "loss": 0.1679, + "step": 4803 + }, + { + "epoch": 0.34662145099029545, + "grad_norm": 0.09008228033781052, + "learning_rate": 0.00018614807331505267, + "loss": 0.1299, + "step": 4804 + }, + { + "epoch": 0.3466936036653559, + "grad_norm": 0.11359957605600357, + "learning_rate": 0.00018614518689565593, + "loss": 0.1226, + "step": 4805 + }, + { + "epoch": 0.34676575634041634, + "grad_norm": 0.10467953234910965, + "learning_rate": 0.0001861423004762592, + "loss": 0.2249, + "step": 4806 + }, + { + "epoch": 0.34683790901547673, + "grad_norm": 0.1343858242034912, + "learning_rate": 0.00018613941405686246, + "loss": 0.1881, + "step": 4807 + }, + { + "epoch": 0.3469100616905372, + "grad_norm": 0.15307852625846863, + "learning_rate": 0.00018613652763746575, + "loss": 0.1139, + "step": 4808 + }, + { + "epoch": 0.3469822143655976, + "grad_norm": 0.11613091826438904, + "learning_rate": 0.00018613364121806899, + "loss": 0.2056, + "step": 4809 + }, + { + "epoch": 0.347054367040658, + "grad_norm": 0.11698072403669357, + "learning_rate": 0.00018613075479867225, + "loss": 0.1305, + "step": 4810 + }, + { + "epoch": 0.34712651971571845, + "grad_norm": 0.09609155356884003, + "learning_rate": 0.0001861278683792755, + "loss": 0.1496, + "step": 4811 + }, + { + "epoch": 0.3471986723907789, + "grad_norm": 0.11466329544782639, + "learning_rate": 0.00018612498195987878, + "loss": 0.1632, + "step": 4812 + }, + { + "epoch": 0.34727082506583934, + "grad_norm": 0.11565911769866943, + "learning_rate": 0.00018612209554048204, + "loss": 0.1647, + "step": 4813 + }, + { + "epoch": 0.3473429777408997, + "grad_norm": 0.12109330296516418, + "learning_rate": 0.0001861192091210853, + "loss": 0.1659, + "step": 4814 + }, + { + "epoch": 0.34741513041596017, + "grad_norm": 0.12918664515018463, + "learning_rate": 0.00018611632270168856, + "loss": 0.1974, + "step": 4815 + }, + { + "epoch": 0.3474872830910206, + "grad_norm": 0.1266615092754364, + "learning_rate": 0.00018611343628229183, + "loss": 0.1636, + "step": 4816 + }, + { + "epoch": 0.347559435766081, + "grad_norm": 0.12520161271095276, + "learning_rate": 0.0001861105498628951, + "loss": 0.157, + "step": 4817 + }, + { + "epoch": 0.34763158844114145, + "grad_norm": 0.10957998037338257, + "learning_rate": 0.00018610766344349835, + "loss": 0.1716, + "step": 4818 + }, + { + "epoch": 0.3477037411162019, + "grad_norm": 0.1544824093580246, + "learning_rate": 0.00018610477702410162, + "loss": 0.1614, + "step": 4819 + }, + { + "epoch": 0.34777589379126234, + "grad_norm": 0.11876598745584488, + "learning_rate": 0.00018610189060470488, + "loss": 0.1301, + "step": 4820 + }, + { + "epoch": 0.3478480464663227, + "grad_norm": 0.13351014256477356, + "learning_rate": 0.00018609900418530811, + "loss": 0.1828, + "step": 4821 + }, + { + "epoch": 0.34792019914138317, + "grad_norm": 0.10807938128709793, + "learning_rate": 0.0001860961177659114, + "loss": 0.1599, + "step": 4822 + }, + { + "epoch": 0.3479923518164436, + "grad_norm": 0.16282778978347778, + "learning_rate": 0.00018609323134651467, + "loss": 0.1566, + "step": 4823 + }, + { + "epoch": 0.348064504491504, + "grad_norm": 0.09783818572759628, + "learning_rate": 0.00018609034492711793, + "loss": 0.1339, + "step": 4824 + }, + { + "epoch": 0.34813665716656444, + "grad_norm": 0.15799443423748016, + "learning_rate": 0.0001860874585077212, + "loss": 0.1464, + "step": 4825 + }, + { + "epoch": 0.3482088098416249, + "grad_norm": 0.13499031960964203, + "learning_rate": 0.00018608457208832443, + "loss": 0.1794, + "step": 4826 + }, + { + "epoch": 0.34828096251668533, + "grad_norm": 0.12276072800159454, + "learning_rate": 0.0001860816856689277, + "loss": 0.1579, + "step": 4827 + }, + { + "epoch": 0.3483531151917457, + "grad_norm": 0.10281578451395035, + "learning_rate": 0.00018607879924953095, + "loss": 0.1766, + "step": 4828 + }, + { + "epoch": 0.34842526786680617, + "grad_norm": 0.10461385548114777, + "learning_rate": 0.00018607591283013424, + "loss": 0.189, + "step": 4829 + }, + { + "epoch": 0.3484974205418666, + "grad_norm": 0.12931039929389954, + "learning_rate": 0.0001860730264107375, + "loss": 0.1358, + "step": 4830 + }, + { + "epoch": 0.348569573216927, + "grad_norm": 0.1350351870059967, + "learning_rate": 0.00018607013999134074, + "loss": 0.1733, + "step": 4831 + }, + { + "epoch": 0.34864172589198744, + "grad_norm": 0.0907088965177536, + "learning_rate": 0.000186067253571944, + "loss": 0.1813, + "step": 4832 + }, + { + "epoch": 0.3487138785670479, + "grad_norm": 0.1410246193408966, + "learning_rate": 0.00018606436715254727, + "loss": 0.1817, + "step": 4833 + }, + { + "epoch": 0.3487860312421083, + "grad_norm": 0.10278214514255524, + "learning_rate": 0.00018606148073315053, + "loss": 0.1448, + "step": 4834 + }, + { + "epoch": 0.3488581839171687, + "grad_norm": 0.10022161900997162, + "learning_rate": 0.0001860585943137538, + "loss": 0.1403, + "step": 4835 + }, + { + "epoch": 0.34893033659222916, + "grad_norm": 0.09270060807466507, + "learning_rate": 0.00018605570789435706, + "loss": 0.1678, + "step": 4836 + }, + { + "epoch": 0.3490024892672896, + "grad_norm": 0.09166289865970612, + "learning_rate": 0.00018605282147496032, + "loss": 0.1091, + "step": 4837 + }, + { + "epoch": 0.34907464194235, + "grad_norm": 0.12794873118400574, + "learning_rate": 0.00018604993505556358, + "loss": 0.1895, + "step": 4838 + }, + { + "epoch": 0.34914679461741044, + "grad_norm": 0.11265773326158524, + "learning_rate": 0.00018604704863616685, + "loss": 0.158, + "step": 4839 + }, + { + "epoch": 0.3492189472924709, + "grad_norm": 0.1151571273803711, + "learning_rate": 0.0001860441622167701, + "loss": 0.1398, + "step": 4840 + }, + { + "epoch": 0.34929109996753127, + "grad_norm": 0.15330448746681213, + "learning_rate": 0.00018604127579737337, + "loss": 0.1456, + "step": 4841 + }, + { + "epoch": 0.3493632526425917, + "grad_norm": 0.1384754776954651, + "learning_rate": 0.0001860383893779766, + "loss": 0.1718, + "step": 4842 + }, + { + "epoch": 0.34943540531765216, + "grad_norm": 0.125054270029068, + "learning_rate": 0.0001860355029585799, + "loss": 0.1426, + "step": 4843 + }, + { + "epoch": 0.3495075579927126, + "grad_norm": 0.11646536737680435, + "learning_rate": 0.00018603261653918316, + "loss": 0.1718, + "step": 4844 + }, + { + "epoch": 0.349579710667773, + "grad_norm": 0.11603065580129623, + "learning_rate": 0.00018602973011978642, + "loss": 0.1669, + "step": 4845 + }, + { + "epoch": 0.34965186334283344, + "grad_norm": 0.11983395367860794, + "learning_rate": 0.0001860268437003897, + "loss": 0.1548, + "step": 4846 + }, + { + "epoch": 0.3497240160178939, + "grad_norm": 0.09510248154401779, + "learning_rate": 0.00018602395728099292, + "loss": 0.1876, + "step": 4847 + }, + { + "epoch": 0.34979616869295427, + "grad_norm": 0.11014638096094131, + "learning_rate": 0.00018602107086159619, + "loss": 0.153, + "step": 4848 + }, + { + "epoch": 0.3498683213680147, + "grad_norm": 0.11330801248550415, + "learning_rate": 0.00018601818444219945, + "loss": 0.1617, + "step": 4849 + }, + { + "epoch": 0.34994047404307516, + "grad_norm": 0.12342897802591324, + "learning_rate": 0.00018601529802280274, + "loss": 0.1629, + "step": 4850 + }, + { + "epoch": 0.3500126267181356, + "grad_norm": 0.11456283926963806, + "learning_rate": 0.000186012411603406, + "loss": 0.1526, + "step": 4851 + }, + { + "epoch": 0.350084779393196, + "grad_norm": 0.11536956578493118, + "learning_rate": 0.00018600952518400924, + "loss": 0.2178, + "step": 4852 + }, + { + "epoch": 0.35015693206825643, + "grad_norm": 0.09011700004339218, + "learning_rate": 0.0001860066387646125, + "loss": 0.1739, + "step": 4853 + }, + { + "epoch": 0.3502290847433169, + "grad_norm": 0.1924978345632553, + "learning_rate": 0.00018600375234521576, + "loss": 0.1629, + "step": 4854 + }, + { + "epoch": 0.35030123741837726, + "grad_norm": 0.11129790544509888, + "learning_rate": 0.00018600086592581903, + "loss": 0.139, + "step": 4855 + }, + { + "epoch": 0.3503733900934377, + "grad_norm": 0.13411857187747955, + "learning_rate": 0.0001859979795064223, + "loss": 0.1842, + "step": 4856 + }, + { + "epoch": 0.35044554276849815, + "grad_norm": 0.1135907843708992, + "learning_rate": 0.00018599509308702555, + "loss": 0.1568, + "step": 4857 + }, + { + "epoch": 0.3505176954435586, + "grad_norm": 0.11942192167043686, + "learning_rate": 0.00018599220666762882, + "loss": 0.1048, + "step": 4858 + }, + { + "epoch": 0.350589848118619, + "grad_norm": 0.11281799525022507, + "learning_rate": 0.00018598932024823208, + "loss": 0.16, + "step": 4859 + }, + { + "epoch": 0.35066200079367943, + "grad_norm": 0.13526779413223267, + "learning_rate": 0.00018598643382883534, + "loss": 0.1716, + "step": 4860 + }, + { + "epoch": 0.3507341534687399, + "grad_norm": 0.0978962704539299, + "learning_rate": 0.0001859835474094386, + "loss": 0.1704, + "step": 4861 + }, + { + "epoch": 0.35080630614380026, + "grad_norm": 0.1300303190946579, + "learning_rate": 0.00018598066099004187, + "loss": 0.1538, + "step": 4862 + }, + { + "epoch": 0.3508784588188607, + "grad_norm": 0.10521839559078217, + "learning_rate": 0.0001859777745706451, + "loss": 0.1361, + "step": 4863 + }, + { + "epoch": 0.35095061149392115, + "grad_norm": 0.13387595117092133, + "learning_rate": 0.0001859748881512484, + "loss": 0.202, + "step": 4864 + }, + { + "epoch": 0.35102276416898154, + "grad_norm": 0.11061757802963257, + "learning_rate": 0.00018597200173185166, + "loss": 0.1007, + "step": 4865 + }, + { + "epoch": 0.351094916844042, + "grad_norm": 0.12662525475025177, + "learning_rate": 0.00018596911531245492, + "loss": 0.1581, + "step": 4866 + }, + { + "epoch": 0.3511670695191024, + "grad_norm": 0.11428170651197433, + "learning_rate": 0.00018596622889305818, + "loss": 0.1513, + "step": 4867 + }, + { + "epoch": 0.35123922219416287, + "grad_norm": 0.09728476405143738, + "learning_rate": 0.00018596334247366142, + "loss": 0.1806, + "step": 4868 + }, + { + "epoch": 0.35131137486922326, + "grad_norm": 0.11465656012296677, + "learning_rate": 0.00018596045605426468, + "loss": 0.1413, + "step": 4869 + }, + { + "epoch": 0.3513835275442837, + "grad_norm": 0.14767242968082428, + "learning_rate": 0.00018595756963486794, + "loss": 0.1908, + "step": 4870 + }, + { + "epoch": 0.35145568021934415, + "grad_norm": 0.15032465755939484, + "learning_rate": 0.00018595468321547123, + "loss": 0.1018, + "step": 4871 + }, + { + "epoch": 0.35152783289440453, + "grad_norm": 0.11787731200456619, + "learning_rate": 0.0001859517967960745, + "loss": 0.1997, + "step": 4872 + }, + { + "epoch": 0.351599985569465, + "grad_norm": 0.14257623255252838, + "learning_rate": 0.00018594891037667773, + "loss": 0.1838, + "step": 4873 + }, + { + "epoch": 0.3516721382445254, + "grad_norm": 0.10114888846874237, + "learning_rate": 0.000185946023957281, + "loss": 0.1321, + "step": 4874 + }, + { + "epoch": 0.35174429091958587, + "grad_norm": 0.1027207151055336, + "learning_rate": 0.00018594313753788426, + "loss": 0.1783, + "step": 4875 + }, + { + "epoch": 0.35181644359464626, + "grad_norm": 0.11875282227993011, + "learning_rate": 0.00018594025111848752, + "loss": 0.2001, + "step": 4876 + }, + { + "epoch": 0.3518885962697067, + "grad_norm": 0.10851060599088669, + "learning_rate": 0.00018593736469909078, + "loss": 0.1755, + "step": 4877 + }, + { + "epoch": 0.35196074894476714, + "grad_norm": 0.1544347107410431, + "learning_rate": 0.00018593447827969405, + "loss": 0.1528, + "step": 4878 + }, + { + "epoch": 0.35203290161982753, + "grad_norm": 0.12431411445140839, + "learning_rate": 0.0001859315918602973, + "loss": 0.1784, + "step": 4879 + }, + { + "epoch": 0.352105054294888, + "grad_norm": 0.10366930812597275, + "learning_rate": 0.00018592870544090057, + "loss": 0.1516, + "step": 4880 + }, + { + "epoch": 0.3521772069699484, + "grad_norm": 0.11398233473300934, + "learning_rate": 0.00018592581902150384, + "loss": 0.1817, + "step": 4881 + }, + { + "epoch": 0.35224935964500886, + "grad_norm": 0.14207541942596436, + "learning_rate": 0.0001859229326021071, + "loss": 0.1364, + "step": 4882 + }, + { + "epoch": 0.35232151232006925, + "grad_norm": 0.10279354453086853, + "learning_rate": 0.00018592004618271036, + "loss": 0.1584, + "step": 4883 + }, + { + "epoch": 0.3523936649951297, + "grad_norm": 0.11522091180086136, + "learning_rate": 0.0001859171597633136, + "loss": 0.1577, + "step": 4884 + }, + { + "epoch": 0.35246581767019014, + "grad_norm": 0.13115240633487701, + "learning_rate": 0.0001859142733439169, + "loss": 0.1583, + "step": 4885 + }, + { + "epoch": 0.35253797034525053, + "grad_norm": 0.12494537979364395, + "learning_rate": 0.00018591138692452015, + "loss": 0.2014, + "step": 4886 + }, + { + "epoch": 0.352610123020311, + "grad_norm": 0.11419171094894409, + "learning_rate": 0.0001859085005051234, + "loss": 0.1129, + "step": 4887 + }, + { + "epoch": 0.3526822756953714, + "grad_norm": 0.10616102814674377, + "learning_rate": 0.00018590561408572668, + "loss": 0.1373, + "step": 4888 + }, + { + "epoch": 0.35275442837043186, + "grad_norm": 0.12825287878513336, + "learning_rate": 0.0001859027276663299, + "loss": 0.1472, + "step": 4889 + }, + { + "epoch": 0.35282658104549225, + "grad_norm": 0.10453600436449051, + "learning_rate": 0.00018589984124693317, + "loss": 0.1502, + "step": 4890 + }, + { + "epoch": 0.3528987337205527, + "grad_norm": 0.11971019953489304, + "learning_rate": 0.00018589695482753644, + "loss": 0.1735, + "step": 4891 + }, + { + "epoch": 0.35297088639561314, + "grad_norm": 0.12282633781433105, + "learning_rate": 0.00018589406840813973, + "loss": 0.1804, + "step": 4892 + }, + { + "epoch": 0.3530430390706735, + "grad_norm": 0.11366037279367447, + "learning_rate": 0.000185891181988743, + "loss": 0.1439, + "step": 4893 + }, + { + "epoch": 0.35311519174573397, + "grad_norm": 0.10512767732143402, + "learning_rate": 0.00018588829556934623, + "loss": 0.1536, + "step": 4894 + }, + { + "epoch": 0.3531873444207944, + "grad_norm": 0.11639894545078278, + "learning_rate": 0.0001858854091499495, + "loss": 0.1007, + "step": 4895 + }, + { + "epoch": 0.3532594970958548, + "grad_norm": 0.16471442580223083, + "learning_rate": 0.00018588252273055275, + "loss": 0.1693, + "step": 4896 + }, + { + "epoch": 0.35333164977091525, + "grad_norm": 0.1491038203239441, + "learning_rate": 0.00018587963631115602, + "loss": 0.1459, + "step": 4897 + }, + { + "epoch": 0.3534038024459757, + "grad_norm": 0.10994590073823929, + "learning_rate": 0.00018587674989175928, + "loss": 0.1671, + "step": 4898 + }, + { + "epoch": 0.35347595512103613, + "grad_norm": 0.11387878656387329, + "learning_rate": 0.00018587386347236254, + "loss": 0.089, + "step": 4899 + }, + { + "epoch": 0.3535481077960965, + "grad_norm": 0.11015894263982773, + "learning_rate": 0.0001858709770529658, + "loss": 0.1384, + "step": 4900 + }, + { + "epoch": 0.35362026047115697, + "grad_norm": 0.11075104773044586, + "learning_rate": 0.00018586809063356907, + "loss": 0.1223, + "step": 4901 + }, + { + "epoch": 0.3536924131462174, + "grad_norm": 0.13021506369113922, + "learning_rate": 0.00018586520421417233, + "loss": 0.159, + "step": 4902 + }, + { + "epoch": 0.3537645658212778, + "grad_norm": 0.16010746359825134, + "learning_rate": 0.0001858623177947756, + "loss": 0.1793, + "step": 4903 + }, + { + "epoch": 0.35383671849633824, + "grad_norm": 0.12425374984741211, + "learning_rate": 0.00018585943137537886, + "loss": 0.1644, + "step": 4904 + }, + { + "epoch": 0.3539088711713987, + "grad_norm": 0.17232443392276764, + "learning_rate": 0.0001858565449559821, + "loss": 0.1742, + "step": 4905 + }, + { + "epoch": 0.35398102384645913, + "grad_norm": 0.12240596860647202, + "learning_rate": 0.00018585365853658538, + "loss": 0.1562, + "step": 4906 + }, + { + "epoch": 0.3540531765215195, + "grad_norm": 0.11358831077814102, + "learning_rate": 0.00018585077211718864, + "loss": 0.1793, + "step": 4907 + }, + { + "epoch": 0.35412532919657996, + "grad_norm": 0.12762053310871124, + "learning_rate": 0.0001858478856977919, + "loss": 0.1609, + "step": 4908 + }, + { + "epoch": 0.3541974818716404, + "grad_norm": 0.11020836979150772, + "learning_rate": 0.00018584499927839517, + "loss": 0.1631, + "step": 4909 + }, + { + "epoch": 0.3542696345467008, + "grad_norm": 0.14324946701526642, + "learning_rate": 0.0001858421128589984, + "loss": 0.139, + "step": 4910 + }, + { + "epoch": 0.35434178722176124, + "grad_norm": 0.09634080529212952, + "learning_rate": 0.00018583922643960167, + "loss": 0.1669, + "step": 4911 + }, + { + "epoch": 0.3544139398968217, + "grad_norm": 0.11602307856082916, + "learning_rate": 0.00018583634002020493, + "loss": 0.1946, + "step": 4912 + }, + { + "epoch": 0.3544860925718821, + "grad_norm": 0.09764589369297028, + "learning_rate": 0.00018583345360080822, + "loss": 0.1496, + "step": 4913 + }, + { + "epoch": 0.3545582452469425, + "grad_norm": 0.1315620392560959, + "learning_rate": 0.00018583056718141148, + "loss": 0.1426, + "step": 4914 + }, + { + "epoch": 0.35463039792200296, + "grad_norm": 0.10375626385211945, + "learning_rate": 0.00018582768076201472, + "loss": 0.1783, + "step": 4915 + }, + { + "epoch": 0.3547025505970634, + "grad_norm": 0.13591431081295013, + "learning_rate": 0.00018582479434261798, + "loss": 0.1795, + "step": 4916 + }, + { + "epoch": 0.3547747032721238, + "grad_norm": 0.11436531692743301, + "learning_rate": 0.00018582190792322125, + "loss": 0.1377, + "step": 4917 + }, + { + "epoch": 0.35484685594718424, + "grad_norm": 0.13248689472675323, + "learning_rate": 0.0001858190215038245, + "loss": 0.1669, + "step": 4918 + }, + { + "epoch": 0.3549190086222447, + "grad_norm": 0.10617823898792267, + "learning_rate": 0.00018581613508442777, + "loss": 0.1722, + "step": 4919 + }, + { + "epoch": 0.3549911612973051, + "grad_norm": 0.13824215531349182, + "learning_rate": 0.00018581324866503104, + "loss": 0.1169, + "step": 4920 + }, + { + "epoch": 0.3550633139723655, + "grad_norm": 0.12319400161504745, + "learning_rate": 0.0001858103622456343, + "loss": 0.1784, + "step": 4921 + }, + { + "epoch": 0.35513546664742596, + "grad_norm": 0.12189222872257233, + "learning_rate": 0.00018580747582623756, + "loss": 0.1435, + "step": 4922 + }, + { + "epoch": 0.3552076193224864, + "grad_norm": 0.13751690089702606, + "learning_rate": 0.00018580458940684082, + "loss": 0.1789, + "step": 4923 + }, + { + "epoch": 0.3552797719975468, + "grad_norm": 0.11428214609622955, + "learning_rate": 0.0001858017029874441, + "loss": 0.2127, + "step": 4924 + }, + { + "epoch": 0.35535192467260723, + "grad_norm": 0.10802461951971054, + "learning_rate": 0.00018579881656804735, + "loss": 0.1921, + "step": 4925 + }, + { + "epoch": 0.3554240773476677, + "grad_norm": 0.12341997027397156, + "learning_rate": 0.00018579593014865059, + "loss": 0.1457, + "step": 4926 + }, + { + "epoch": 0.35549623002272807, + "grad_norm": 0.1213202252984047, + "learning_rate": 0.00018579304372925388, + "loss": 0.1195, + "step": 4927 + }, + { + "epoch": 0.3555683826977885, + "grad_norm": 0.11273522675037384, + "learning_rate": 0.00018579015730985714, + "loss": 0.1636, + "step": 4928 + }, + { + "epoch": 0.35564053537284895, + "grad_norm": 0.10054554790258408, + "learning_rate": 0.0001857872708904604, + "loss": 0.1404, + "step": 4929 + }, + { + "epoch": 0.3557126880479094, + "grad_norm": 0.12420728802680969, + "learning_rate": 0.00018578438447106366, + "loss": 0.1534, + "step": 4930 + }, + { + "epoch": 0.3557848407229698, + "grad_norm": 0.11710210889577866, + "learning_rate": 0.0001857814980516669, + "loss": 0.1284, + "step": 4931 + }, + { + "epoch": 0.35585699339803023, + "grad_norm": 0.11963210999965668, + "learning_rate": 0.00018577861163227016, + "loss": 0.166, + "step": 4932 + }, + { + "epoch": 0.3559291460730907, + "grad_norm": 0.12775014340877533, + "learning_rate": 0.00018577572521287343, + "loss": 0.1592, + "step": 4933 + }, + { + "epoch": 0.35600129874815106, + "grad_norm": 0.16686271131038666, + "learning_rate": 0.00018577283879347672, + "loss": 0.1688, + "step": 4934 + }, + { + "epoch": 0.3560734514232115, + "grad_norm": 0.15730160474777222, + "learning_rate": 0.00018576995237407998, + "loss": 0.214, + "step": 4935 + }, + { + "epoch": 0.35614560409827195, + "grad_norm": 0.13442355394363403, + "learning_rate": 0.00018576706595468321, + "loss": 0.136, + "step": 4936 + }, + { + "epoch": 0.3562177567733324, + "grad_norm": 0.11043615639209747, + "learning_rate": 0.00018576417953528648, + "loss": 0.1737, + "step": 4937 + }, + { + "epoch": 0.3562899094483928, + "grad_norm": 0.10412963479757309, + "learning_rate": 0.00018576129311588974, + "loss": 0.2177, + "step": 4938 + }, + { + "epoch": 0.3563620621234532, + "grad_norm": 0.1325763612985611, + "learning_rate": 0.000185758406696493, + "loss": 0.1786, + "step": 4939 + }, + { + "epoch": 0.35643421479851367, + "grad_norm": 0.12171344459056854, + "learning_rate": 0.00018575552027709627, + "loss": 0.1551, + "step": 4940 + }, + { + "epoch": 0.35650636747357406, + "grad_norm": 0.11393021047115326, + "learning_rate": 0.00018575263385769953, + "loss": 0.1474, + "step": 4941 + }, + { + "epoch": 0.3565785201486345, + "grad_norm": 0.11627914011478424, + "learning_rate": 0.0001857497474383028, + "loss": 0.1918, + "step": 4942 + }, + { + "epoch": 0.35665067282369495, + "grad_norm": 0.11078419536352158, + "learning_rate": 0.00018574686101890606, + "loss": 0.1807, + "step": 4943 + }, + { + "epoch": 0.3567228254987554, + "grad_norm": 0.11737526208162308, + "learning_rate": 0.00018574397459950932, + "loss": 0.1269, + "step": 4944 + }, + { + "epoch": 0.3567949781738158, + "grad_norm": 0.12382479012012482, + "learning_rate": 0.00018574108818011258, + "loss": 0.1615, + "step": 4945 + }, + { + "epoch": 0.3568671308488762, + "grad_norm": 0.12704169750213623, + "learning_rate": 0.00018573820176071584, + "loss": 0.1949, + "step": 4946 + }, + { + "epoch": 0.35693928352393667, + "grad_norm": 0.17338639497756958, + "learning_rate": 0.0001857353153413191, + "loss": 0.2353, + "step": 4947 + }, + { + "epoch": 0.35701143619899706, + "grad_norm": 0.10502665489912033, + "learning_rate": 0.00018573242892192237, + "loss": 0.1754, + "step": 4948 + }, + { + "epoch": 0.3570835888740575, + "grad_norm": 0.1206195130944252, + "learning_rate": 0.00018572954250252563, + "loss": 0.1575, + "step": 4949 + }, + { + "epoch": 0.35715574154911794, + "grad_norm": 0.1553194224834442, + "learning_rate": 0.0001857266560831289, + "loss": 0.1897, + "step": 4950 + }, + { + "epoch": 0.3572278942241784, + "grad_norm": 0.102202408015728, + "learning_rate": 0.00018572376966373216, + "loss": 0.1697, + "step": 4951 + }, + { + "epoch": 0.3573000468992388, + "grad_norm": 0.10409127920866013, + "learning_rate": 0.00018572088324433542, + "loss": 0.1864, + "step": 4952 + }, + { + "epoch": 0.3573721995742992, + "grad_norm": 0.10515742003917694, + "learning_rate": 0.00018571799682493866, + "loss": 0.1261, + "step": 4953 + }, + { + "epoch": 0.35744435224935966, + "grad_norm": 0.13130493462085724, + "learning_rate": 0.00018571511040554192, + "loss": 0.1506, + "step": 4954 + }, + { + "epoch": 0.35751650492442005, + "grad_norm": 0.1422574669122696, + "learning_rate": 0.0001857122239861452, + "loss": 0.1385, + "step": 4955 + }, + { + "epoch": 0.3575886575994805, + "grad_norm": 0.13222919404506683, + "learning_rate": 0.00018570933756674847, + "loss": 0.142, + "step": 4956 + }, + { + "epoch": 0.35766081027454094, + "grad_norm": 0.14456702768802643, + "learning_rate": 0.00018570645114735174, + "loss": 0.1845, + "step": 4957 + }, + { + "epoch": 0.35773296294960133, + "grad_norm": 0.10612176358699799, + "learning_rate": 0.00018570356472795497, + "loss": 0.1913, + "step": 4958 + }, + { + "epoch": 0.3578051156246618, + "grad_norm": 0.12707382440567017, + "learning_rate": 0.00018570067830855823, + "loss": 0.1965, + "step": 4959 + }, + { + "epoch": 0.3578772682997222, + "grad_norm": 0.11566483974456787, + "learning_rate": 0.0001856977918891615, + "loss": 0.1653, + "step": 4960 + }, + { + "epoch": 0.35794942097478266, + "grad_norm": 0.12770824134349823, + "learning_rate": 0.00018569490546976476, + "loss": 0.171, + "step": 4961 + }, + { + "epoch": 0.35802157364984305, + "grad_norm": 0.13529135286808014, + "learning_rate": 0.00018569201905036805, + "loss": 0.1826, + "step": 4962 + }, + { + "epoch": 0.3580937263249035, + "grad_norm": 0.162160724401474, + "learning_rate": 0.0001856891326309713, + "loss": 0.1945, + "step": 4963 + }, + { + "epoch": 0.35816587899996394, + "grad_norm": 0.1113724559545517, + "learning_rate": 0.00018568624621157455, + "loss": 0.1742, + "step": 4964 + }, + { + "epoch": 0.3582380316750243, + "grad_norm": 0.13558605313301086, + "learning_rate": 0.0001856833597921778, + "loss": 0.1912, + "step": 4965 + }, + { + "epoch": 0.35831018435008477, + "grad_norm": 0.22403933107852936, + "learning_rate": 0.00018568047337278108, + "loss": 0.1738, + "step": 4966 + }, + { + "epoch": 0.3583823370251452, + "grad_norm": 0.10644828528165817, + "learning_rate": 0.00018567758695338434, + "loss": 0.158, + "step": 4967 + }, + { + "epoch": 0.35845448970020566, + "grad_norm": 0.12324364483356476, + "learning_rate": 0.0001856747005339876, + "loss": 0.1726, + "step": 4968 + }, + { + "epoch": 0.35852664237526605, + "grad_norm": 0.1327822208404541, + "learning_rate": 0.00018567181411459086, + "loss": 0.1701, + "step": 4969 + }, + { + "epoch": 0.3585987950503265, + "grad_norm": 0.10459893196821213, + "learning_rate": 0.00018566892769519413, + "loss": 0.1236, + "step": 4970 + }, + { + "epoch": 0.35867094772538693, + "grad_norm": 0.1374405175447464, + "learning_rate": 0.0001856660412757974, + "loss": 0.1443, + "step": 4971 + }, + { + "epoch": 0.3587431004004473, + "grad_norm": 0.11382637917995453, + "learning_rate": 0.00018566315485640065, + "loss": 0.1148, + "step": 4972 + }, + { + "epoch": 0.35881525307550777, + "grad_norm": 0.14948365092277527, + "learning_rate": 0.00018566026843700392, + "loss": 0.1815, + "step": 4973 + }, + { + "epoch": 0.3588874057505682, + "grad_norm": 0.12349545955657959, + "learning_rate": 0.00018565738201760715, + "loss": 0.1543, + "step": 4974 + }, + { + "epoch": 0.35895955842562866, + "grad_norm": 0.15273398160934448, + "learning_rate": 0.00018565449559821041, + "loss": 0.1415, + "step": 4975 + }, + { + "epoch": 0.35903171110068904, + "grad_norm": 0.16976992785930634, + "learning_rate": 0.0001856516091788137, + "loss": 0.1221, + "step": 4976 + }, + { + "epoch": 0.3591038637757495, + "grad_norm": 0.1320040225982666, + "learning_rate": 0.00018564872275941697, + "loss": 0.1559, + "step": 4977 + }, + { + "epoch": 0.35917601645080993, + "grad_norm": 0.1515548974275589, + "learning_rate": 0.00018564583634002023, + "loss": 0.1812, + "step": 4978 + }, + { + "epoch": 0.3592481691258703, + "grad_norm": 0.1064198911190033, + "learning_rate": 0.00018564294992062347, + "loss": 0.1855, + "step": 4979 + }, + { + "epoch": 0.35932032180093076, + "grad_norm": 0.10020937025547028, + "learning_rate": 0.00018564006350122673, + "loss": 0.1363, + "step": 4980 + }, + { + "epoch": 0.3593924744759912, + "grad_norm": 0.11039954423904419, + "learning_rate": 0.00018563717708183, + "loss": 0.227, + "step": 4981 + }, + { + "epoch": 0.35946462715105165, + "grad_norm": 0.11901956051588058, + "learning_rate": 0.00018563429066243325, + "loss": 0.1784, + "step": 4982 + }, + { + "epoch": 0.35953677982611204, + "grad_norm": 0.13375921547412872, + "learning_rate": 0.00018563140424303654, + "loss": 0.1272, + "step": 4983 + }, + { + "epoch": 0.3596089325011725, + "grad_norm": 0.12381310015916824, + "learning_rate": 0.00018562851782363978, + "loss": 0.1631, + "step": 4984 + }, + { + "epoch": 0.35968108517623293, + "grad_norm": 0.11995392292737961, + "learning_rate": 0.00018562563140424304, + "loss": 0.1261, + "step": 4985 + }, + { + "epoch": 0.3597532378512933, + "grad_norm": 0.14222565293312073, + "learning_rate": 0.0001856227449848463, + "loss": 0.1373, + "step": 4986 + }, + { + "epoch": 0.35982539052635376, + "grad_norm": 0.09233032912015915, + "learning_rate": 0.00018561985856544957, + "loss": 0.1233, + "step": 4987 + }, + { + "epoch": 0.3598975432014142, + "grad_norm": 0.11498374491930008, + "learning_rate": 0.00018561697214605283, + "loss": 0.1335, + "step": 4988 + }, + { + "epoch": 0.3599696958764746, + "grad_norm": 0.1523871123790741, + "learning_rate": 0.0001856140857266561, + "loss": 0.1507, + "step": 4989 + }, + { + "epoch": 0.36004184855153504, + "grad_norm": 0.1467117816209793, + "learning_rate": 0.00018561119930725936, + "loss": 0.1534, + "step": 4990 + }, + { + "epoch": 0.3601140012265955, + "grad_norm": 0.12076793611049652, + "learning_rate": 0.00018560831288786262, + "loss": 0.1503, + "step": 4991 + }, + { + "epoch": 0.3601861539016559, + "grad_norm": 0.13146507740020752, + "learning_rate": 0.00018560542646846588, + "loss": 0.1526, + "step": 4992 + }, + { + "epoch": 0.3602583065767163, + "grad_norm": 0.12270855158567429, + "learning_rate": 0.00018560254004906915, + "loss": 0.1191, + "step": 4993 + }, + { + "epoch": 0.36033045925177676, + "grad_norm": 0.13664329051971436, + "learning_rate": 0.0001855996536296724, + "loss": 0.2025, + "step": 4994 + }, + { + "epoch": 0.3604026119268372, + "grad_norm": 0.15141281485557556, + "learning_rate": 0.00018559676721027565, + "loss": 0.1622, + "step": 4995 + }, + { + "epoch": 0.3604747646018976, + "grad_norm": 0.11014848947525024, + "learning_rate": 0.0001855938807908789, + "loss": 0.1817, + "step": 4996 + }, + { + "epoch": 0.36054691727695803, + "grad_norm": 0.13574273884296417, + "learning_rate": 0.00018559099437148217, + "loss": 0.1963, + "step": 4997 + }, + { + "epoch": 0.3606190699520185, + "grad_norm": 0.13352210819721222, + "learning_rate": 0.00018558810795208546, + "loss": 0.1491, + "step": 4998 + }, + { + "epoch": 0.3606912226270789, + "grad_norm": 0.09718064963817596, + "learning_rate": 0.00018558522153268872, + "loss": 0.1291, + "step": 4999 + }, + { + "epoch": 0.3607633753021393, + "grad_norm": 0.1510242223739624, + "learning_rate": 0.00018558233511329196, + "loss": 0.2136, + "step": 5000 + }, + { + "epoch": 0.36083552797719975, + "grad_norm": 0.13630354404449463, + "learning_rate": 0.00018557944869389522, + "loss": 0.1878, + "step": 5001 + }, + { + "epoch": 0.3609076806522602, + "grad_norm": 0.10373663902282715, + "learning_rate": 0.00018557656227449849, + "loss": 0.1361, + "step": 5002 + }, + { + "epoch": 0.3609798333273206, + "grad_norm": 0.11246156692504883, + "learning_rate": 0.00018557367585510175, + "loss": 0.2004, + "step": 5003 + }, + { + "epoch": 0.36105198600238103, + "grad_norm": 0.11527900397777557, + "learning_rate": 0.000185570789435705, + "loss": 0.154, + "step": 5004 + }, + { + "epoch": 0.3611241386774415, + "grad_norm": 0.15830467641353607, + "learning_rate": 0.00018556790301630828, + "loss": 0.1936, + "step": 5005 + }, + { + "epoch": 0.3611962913525019, + "grad_norm": 0.09405852109193802, + "learning_rate": 0.00018556501659691154, + "loss": 0.1425, + "step": 5006 + }, + { + "epoch": 0.3612684440275623, + "grad_norm": 0.13238678872585297, + "learning_rate": 0.0001855621301775148, + "loss": 0.1399, + "step": 5007 + }, + { + "epoch": 0.36134059670262275, + "grad_norm": 0.10964048653841019, + "learning_rate": 0.00018555924375811806, + "loss": 0.1785, + "step": 5008 + }, + { + "epoch": 0.3614127493776832, + "grad_norm": 0.1466868370771408, + "learning_rate": 0.00018555635733872133, + "loss": 0.1134, + "step": 5009 + }, + { + "epoch": 0.3614849020527436, + "grad_norm": 0.11806122213602066, + "learning_rate": 0.0001855534709193246, + "loss": 0.1418, + "step": 5010 + }, + { + "epoch": 0.36155705472780403, + "grad_norm": 0.13827867805957794, + "learning_rate": 0.00018555058449992783, + "loss": 0.1562, + "step": 5011 + }, + { + "epoch": 0.36162920740286447, + "grad_norm": 0.10188999772071838, + "learning_rate": 0.00018554769808053112, + "loss": 0.2064, + "step": 5012 + }, + { + "epoch": 0.3617013600779249, + "grad_norm": 0.14065970480442047, + "learning_rate": 0.00018554481166113438, + "loss": 0.1469, + "step": 5013 + }, + { + "epoch": 0.3617735127529853, + "grad_norm": 0.11012709140777588, + "learning_rate": 0.00018554192524173764, + "loss": 0.164, + "step": 5014 + }, + { + "epoch": 0.36184566542804575, + "grad_norm": 0.1569322943687439, + "learning_rate": 0.0001855390388223409, + "loss": 0.1547, + "step": 5015 + }, + { + "epoch": 0.3619178181031062, + "grad_norm": 0.1648615151643753, + "learning_rate": 0.00018553615240294414, + "loss": 0.1572, + "step": 5016 + }, + { + "epoch": 0.3619899707781666, + "grad_norm": 0.13935476541519165, + "learning_rate": 0.0001855332659835474, + "loss": 0.129, + "step": 5017 + }, + { + "epoch": 0.362062123453227, + "grad_norm": 0.1299179047346115, + "learning_rate": 0.00018553037956415067, + "loss": 0.144, + "step": 5018 + }, + { + "epoch": 0.36213427612828747, + "grad_norm": 0.14577659964561462, + "learning_rate": 0.00018552749314475396, + "loss": 0.1673, + "step": 5019 + }, + { + "epoch": 0.36220642880334786, + "grad_norm": 0.11624746024608612, + "learning_rate": 0.00018552460672535722, + "loss": 0.1483, + "step": 5020 + }, + { + "epoch": 0.3622785814784083, + "grad_norm": 0.1287081092596054, + "learning_rate": 0.00018552172030596045, + "loss": 0.1829, + "step": 5021 + }, + { + "epoch": 0.36235073415346875, + "grad_norm": 0.14396774768829346, + "learning_rate": 0.00018551883388656372, + "loss": 0.1286, + "step": 5022 + }, + { + "epoch": 0.3624228868285292, + "grad_norm": 0.1085737869143486, + "learning_rate": 0.00018551594746716698, + "loss": 0.1543, + "step": 5023 + }, + { + "epoch": 0.3624950395035896, + "grad_norm": 0.14155034720897675, + "learning_rate": 0.00018551306104777024, + "loss": 0.1076, + "step": 5024 + }, + { + "epoch": 0.36256719217865, + "grad_norm": 0.12421038001775742, + "learning_rate": 0.0001855101746283735, + "loss": 0.1596, + "step": 5025 + }, + { + "epoch": 0.36263934485371047, + "grad_norm": 0.1154010072350502, + "learning_rate": 0.00018550728820897677, + "loss": 0.167, + "step": 5026 + }, + { + "epoch": 0.36271149752877085, + "grad_norm": 0.13365091383457184, + "learning_rate": 0.00018550440178958003, + "loss": 0.1402, + "step": 5027 + }, + { + "epoch": 0.3627836502038313, + "grad_norm": 0.2421489953994751, + "learning_rate": 0.0001855015153701833, + "loss": 0.128, + "step": 5028 + }, + { + "epoch": 0.36285580287889174, + "grad_norm": 0.11368130892515182, + "learning_rate": 0.00018549862895078656, + "loss": 0.1508, + "step": 5029 + }, + { + "epoch": 0.3629279555539522, + "grad_norm": 0.10695904493331909, + "learning_rate": 0.00018549574253138982, + "loss": 0.1816, + "step": 5030 + }, + { + "epoch": 0.3630001082290126, + "grad_norm": 0.09516868740320206, + "learning_rate": 0.00018549285611199308, + "loss": 0.1551, + "step": 5031 + }, + { + "epoch": 0.363072260904073, + "grad_norm": 0.14686326682567596, + "learning_rate": 0.00018548996969259632, + "loss": 0.1383, + "step": 5032 + }, + { + "epoch": 0.36314441357913346, + "grad_norm": 0.13058914244174957, + "learning_rate": 0.0001854870832731996, + "loss": 0.1851, + "step": 5033 + }, + { + "epoch": 0.36321656625419385, + "grad_norm": 0.15616953372955322, + "learning_rate": 0.00018548419685380287, + "loss": 0.1557, + "step": 5034 + }, + { + "epoch": 0.3632887189292543, + "grad_norm": 0.09751254320144653, + "learning_rate": 0.00018548131043440614, + "loss": 0.1456, + "step": 5035 + }, + { + "epoch": 0.36336087160431474, + "grad_norm": 0.15860705077648163, + "learning_rate": 0.0001854784240150094, + "loss": 0.1678, + "step": 5036 + }, + { + "epoch": 0.3634330242793752, + "grad_norm": 0.1270139217376709, + "learning_rate": 0.00018547553759561263, + "loss": 0.1407, + "step": 5037 + }, + { + "epoch": 0.36350517695443557, + "grad_norm": 0.1260344237089157, + "learning_rate": 0.0001854726511762159, + "loss": 0.1758, + "step": 5038 + }, + { + "epoch": 0.363577329629496, + "grad_norm": 0.12336471676826477, + "learning_rate": 0.00018546976475681916, + "loss": 0.1415, + "step": 5039 + }, + { + "epoch": 0.36364948230455646, + "grad_norm": 0.12689828872680664, + "learning_rate": 0.00018546687833742245, + "loss": 0.1465, + "step": 5040 + }, + { + "epoch": 0.36372163497961685, + "grad_norm": 0.12668170034885406, + "learning_rate": 0.0001854639919180257, + "loss": 0.1252, + "step": 5041 + }, + { + "epoch": 0.3637937876546773, + "grad_norm": 0.12117216736078262, + "learning_rate": 0.00018546110549862895, + "loss": 0.1832, + "step": 5042 + }, + { + "epoch": 0.36386594032973774, + "grad_norm": 0.12453142553567886, + "learning_rate": 0.0001854582190792322, + "loss": 0.1583, + "step": 5043 + }, + { + "epoch": 0.3639380930047982, + "grad_norm": 0.10939580947160721, + "learning_rate": 0.00018545533265983547, + "loss": 0.1806, + "step": 5044 + }, + { + "epoch": 0.36401024567985857, + "grad_norm": 0.13718275725841522, + "learning_rate": 0.00018545244624043874, + "loss": 0.1396, + "step": 5045 + }, + { + "epoch": 0.364082398354919, + "grad_norm": 0.13322681188583374, + "learning_rate": 0.000185449559821042, + "loss": 0.1287, + "step": 5046 + }, + { + "epoch": 0.36415455102997946, + "grad_norm": 0.13685956597328186, + "learning_rate": 0.00018544667340164526, + "loss": 0.189, + "step": 5047 + }, + { + "epoch": 0.36422670370503984, + "grad_norm": 0.1251964420080185, + "learning_rate": 0.00018544378698224853, + "loss": 0.1731, + "step": 5048 + }, + { + "epoch": 0.3642988563801003, + "grad_norm": 0.14998053014278412, + "learning_rate": 0.0001854409005628518, + "loss": 0.1574, + "step": 5049 + }, + { + "epoch": 0.36437100905516073, + "grad_norm": 0.13014808297157288, + "learning_rate": 0.00018543801414345505, + "loss": 0.1192, + "step": 5050 + }, + { + "epoch": 0.3644431617302211, + "grad_norm": 0.10887446254491806, + "learning_rate": 0.00018543512772405832, + "loss": 0.1638, + "step": 5051 + }, + { + "epoch": 0.36451531440528157, + "grad_norm": 0.10679549723863602, + "learning_rate": 0.00018543224130466158, + "loss": 0.1455, + "step": 5052 + }, + { + "epoch": 0.364587467080342, + "grad_norm": 0.13491018116474152, + "learning_rate": 0.00018542935488526484, + "loss": 0.154, + "step": 5053 + }, + { + "epoch": 0.36465961975540245, + "grad_norm": 0.10097759962081909, + "learning_rate": 0.0001854264684658681, + "loss": 0.1674, + "step": 5054 + }, + { + "epoch": 0.36473177243046284, + "grad_norm": 0.1123056635260582, + "learning_rate": 0.00018542358204647137, + "loss": 0.1529, + "step": 5055 + }, + { + "epoch": 0.3648039251055233, + "grad_norm": 0.17683255672454834, + "learning_rate": 0.00018542069562707463, + "loss": 0.1562, + "step": 5056 + }, + { + "epoch": 0.36487607778058373, + "grad_norm": 0.1176273375749588, + "learning_rate": 0.0001854178092076779, + "loss": 0.1714, + "step": 5057 + }, + { + "epoch": 0.3649482304556441, + "grad_norm": 0.11856452375650406, + "learning_rate": 0.00018541492278828116, + "loss": 0.1786, + "step": 5058 + }, + { + "epoch": 0.36502038313070456, + "grad_norm": 0.13071638345718384, + "learning_rate": 0.0001854120363688844, + "loss": 0.1881, + "step": 5059 + }, + { + "epoch": 0.365092535805765, + "grad_norm": 0.12175871431827545, + "learning_rate": 0.00018540914994948765, + "loss": 0.2229, + "step": 5060 + }, + { + "epoch": 0.36516468848082545, + "grad_norm": 0.11493963748216629, + "learning_rate": 0.00018540626353009094, + "loss": 0.1575, + "step": 5061 + }, + { + "epoch": 0.36523684115588584, + "grad_norm": 0.13158316910266876, + "learning_rate": 0.0001854033771106942, + "loss": 0.1481, + "step": 5062 + }, + { + "epoch": 0.3653089938309463, + "grad_norm": 0.15244421362876892, + "learning_rate": 0.00018540049069129747, + "loss": 0.1597, + "step": 5063 + }, + { + "epoch": 0.3653811465060067, + "grad_norm": 0.1313554048538208, + "learning_rate": 0.0001853976042719007, + "loss": 0.1255, + "step": 5064 + }, + { + "epoch": 0.3654532991810671, + "grad_norm": 0.11577058583498001, + "learning_rate": 0.00018539471785250397, + "loss": 0.1504, + "step": 5065 + }, + { + "epoch": 0.36552545185612756, + "grad_norm": 0.11609276384115219, + "learning_rate": 0.00018539183143310723, + "loss": 0.1802, + "step": 5066 + }, + { + "epoch": 0.365597604531188, + "grad_norm": 0.1118810847401619, + "learning_rate": 0.0001853889450137105, + "loss": 0.1399, + "step": 5067 + }, + { + "epoch": 0.36566975720624845, + "grad_norm": 0.10551551729440689, + "learning_rate": 0.00018538605859431378, + "loss": 0.1921, + "step": 5068 + }, + { + "epoch": 0.36574190988130884, + "grad_norm": 0.10924666374921799, + "learning_rate": 0.00018538317217491702, + "loss": 0.1264, + "step": 5069 + }, + { + "epoch": 0.3658140625563693, + "grad_norm": 0.10584163665771484, + "learning_rate": 0.00018538028575552028, + "loss": 0.144, + "step": 5070 + }, + { + "epoch": 0.3658862152314297, + "grad_norm": 0.09888424724340439, + "learning_rate": 0.00018537739933612355, + "loss": 0.1276, + "step": 5071 + }, + { + "epoch": 0.3659583679064901, + "grad_norm": 0.11036736518144608, + "learning_rate": 0.0001853745129167268, + "loss": 0.1714, + "step": 5072 + }, + { + "epoch": 0.36603052058155056, + "grad_norm": 0.11193112283945084, + "learning_rate": 0.00018537162649733007, + "loss": 0.1732, + "step": 5073 + }, + { + "epoch": 0.366102673256611, + "grad_norm": 0.12735997140407562, + "learning_rate": 0.00018536874007793334, + "loss": 0.1406, + "step": 5074 + }, + { + "epoch": 0.36617482593167144, + "grad_norm": 0.11917732656002045, + "learning_rate": 0.0001853658536585366, + "loss": 0.1515, + "step": 5075 + }, + { + "epoch": 0.36624697860673183, + "grad_norm": 0.13747093081474304, + "learning_rate": 0.00018536296723913986, + "loss": 0.156, + "step": 5076 + }, + { + "epoch": 0.3663191312817923, + "grad_norm": 0.15208113193511963, + "learning_rate": 0.00018536008081974312, + "loss": 0.1641, + "step": 5077 + }, + { + "epoch": 0.3663912839568527, + "grad_norm": 0.1165727749466896, + "learning_rate": 0.0001853571944003464, + "loss": 0.1581, + "step": 5078 + }, + { + "epoch": 0.3664634366319131, + "grad_norm": 0.13178570568561554, + "learning_rate": 0.00018535430798094965, + "loss": 0.1601, + "step": 5079 + }, + { + "epoch": 0.36653558930697355, + "grad_norm": 0.13907936215400696, + "learning_rate": 0.00018535142156155289, + "loss": 0.1643, + "step": 5080 + }, + { + "epoch": 0.366607741982034, + "grad_norm": 0.1026151031255722, + "learning_rate": 0.00018534853514215615, + "loss": 0.1198, + "step": 5081 + }, + { + "epoch": 0.3666798946570944, + "grad_norm": 0.09718500077724457, + "learning_rate": 0.00018534564872275944, + "loss": 0.1908, + "step": 5082 + }, + { + "epoch": 0.36675204733215483, + "grad_norm": 0.10540282726287842, + "learning_rate": 0.0001853427623033627, + "loss": 0.1317, + "step": 5083 + }, + { + "epoch": 0.3668242000072153, + "grad_norm": 0.1018374040722847, + "learning_rate": 0.00018533987588396596, + "loss": 0.1423, + "step": 5084 + }, + { + "epoch": 0.3668963526822757, + "grad_norm": 0.13188058137893677, + "learning_rate": 0.0001853369894645692, + "loss": 0.1671, + "step": 5085 + }, + { + "epoch": 0.3669685053573361, + "grad_norm": 0.16619445383548737, + "learning_rate": 0.00018533410304517246, + "loss": 0.164, + "step": 5086 + }, + { + "epoch": 0.36704065803239655, + "grad_norm": 0.13738171756267548, + "learning_rate": 0.00018533121662577573, + "loss": 0.1734, + "step": 5087 + }, + { + "epoch": 0.367112810707457, + "grad_norm": 0.13084229826927185, + "learning_rate": 0.000185328330206379, + "loss": 0.2097, + "step": 5088 + }, + { + "epoch": 0.3671849633825174, + "grad_norm": 0.09078751504421234, + "learning_rate": 0.00018532544378698228, + "loss": 0.168, + "step": 5089 + }, + { + "epoch": 0.3672571160575778, + "grad_norm": 0.10640835762023926, + "learning_rate": 0.00018532255736758551, + "loss": 0.1773, + "step": 5090 + }, + { + "epoch": 0.36732926873263827, + "grad_norm": 0.12556225061416626, + "learning_rate": 0.00018531967094818878, + "loss": 0.13, + "step": 5091 + }, + { + "epoch": 0.3674014214076987, + "grad_norm": 0.11766793578863144, + "learning_rate": 0.00018531678452879204, + "loss": 0.1166, + "step": 5092 + }, + { + "epoch": 0.3674735740827591, + "grad_norm": 0.11859668046236038, + "learning_rate": 0.0001853138981093953, + "loss": 0.1438, + "step": 5093 + }, + { + "epoch": 0.36754572675781955, + "grad_norm": 0.13351677358150482, + "learning_rate": 0.00018531101168999857, + "loss": 0.157, + "step": 5094 + }, + { + "epoch": 0.36761787943288, + "grad_norm": 0.1354007124900818, + "learning_rate": 0.00018530812527060183, + "loss": 0.1691, + "step": 5095 + }, + { + "epoch": 0.3676900321079404, + "grad_norm": 0.1143498420715332, + "learning_rate": 0.0001853052388512051, + "loss": 0.1612, + "step": 5096 + }, + { + "epoch": 0.3677621847830008, + "grad_norm": 0.12536217272281647, + "learning_rate": 0.00018530235243180836, + "loss": 0.1642, + "step": 5097 + }, + { + "epoch": 0.36783433745806127, + "grad_norm": 0.13603799045085907, + "learning_rate": 0.00018529946601241162, + "loss": 0.1423, + "step": 5098 + }, + { + "epoch": 0.3679064901331217, + "grad_norm": 0.11363290250301361, + "learning_rate": 0.00018529657959301488, + "loss": 0.1752, + "step": 5099 + }, + { + "epoch": 0.3679786428081821, + "grad_norm": 0.11023396998643875, + "learning_rate": 0.00018529369317361814, + "loss": 0.1874, + "step": 5100 + }, + { + "epoch": 0.36805079548324254, + "grad_norm": 0.11664698272943497, + "learning_rate": 0.00018529080675422138, + "loss": 0.1488, + "step": 5101 + }, + { + "epoch": 0.368122948158303, + "grad_norm": 0.11291544139385223, + "learning_rate": 0.00018528792033482464, + "loss": 0.1407, + "step": 5102 + }, + { + "epoch": 0.3681951008333634, + "grad_norm": 0.1242561861872673, + "learning_rate": 0.00018528503391542793, + "loss": 0.1325, + "step": 5103 + }, + { + "epoch": 0.3682672535084238, + "grad_norm": 0.1404646337032318, + "learning_rate": 0.0001852821474960312, + "loss": 0.2256, + "step": 5104 + }, + { + "epoch": 0.36833940618348426, + "grad_norm": 0.11316098272800446, + "learning_rate": 0.00018527926107663446, + "loss": 0.1467, + "step": 5105 + }, + { + "epoch": 0.3684115588585447, + "grad_norm": 0.10041385143995285, + "learning_rate": 0.0001852763746572377, + "loss": 0.1365, + "step": 5106 + }, + { + "epoch": 0.3684837115336051, + "grad_norm": 0.13179618120193481, + "learning_rate": 0.00018527348823784096, + "loss": 0.1816, + "step": 5107 + }, + { + "epoch": 0.36855586420866554, + "grad_norm": 0.1282452642917633, + "learning_rate": 0.00018527060181844422, + "loss": 0.143, + "step": 5108 + }, + { + "epoch": 0.368628016883726, + "grad_norm": 0.17576515674591064, + "learning_rate": 0.00018526771539904748, + "loss": 0.1765, + "step": 5109 + }, + { + "epoch": 0.3687001695587864, + "grad_norm": 0.13764993846416473, + "learning_rate": 0.00018526482897965077, + "loss": 0.1637, + "step": 5110 + }, + { + "epoch": 0.3687723222338468, + "grad_norm": 0.10974890738725662, + "learning_rate": 0.000185261942560254, + "loss": 0.1639, + "step": 5111 + }, + { + "epoch": 0.36884447490890726, + "grad_norm": 0.13875915110111237, + "learning_rate": 0.00018525905614085727, + "loss": 0.1499, + "step": 5112 + }, + { + "epoch": 0.36891662758396765, + "grad_norm": 0.11761275678873062, + "learning_rate": 0.00018525616972146054, + "loss": 0.1813, + "step": 5113 + }, + { + "epoch": 0.3689887802590281, + "grad_norm": 0.12019307911396027, + "learning_rate": 0.0001852532833020638, + "loss": 0.0926, + "step": 5114 + }, + { + "epoch": 0.36906093293408854, + "grad_norm": 0.12786947190761566, + "learning_rate": 0.00018525039688266706, + "loss": 0.1878, + "step": 5115 + }, + { + "epoch": 0.369133085609149, + "grad_norm": 0.10546452552080154, + "learning_rate": 0.00018524751046327032, + "loss": 0.1632, + "step": 5116 + }, + { + "epoch": 0.36920523828420937, + "grad_norm": 0.13980886340141296, + "learning_rate": 0.0001852446240438736, + "loss": 0.197, + "step": 5117 + }, + { + "epoch": 0.3692773909592698, + "grad_norm": 0.10450896620750427, + "learning_rate": 0.00018524173762447685, + "loss": 0.2151, + "step": 5118 + }, + { + "epoch": 0.36934954363433026, + "grad_norm": 0.12501630187034607, + "learning_rate": 0.0001852388512050801, + "loss": 0.1527, + "step": 5119 + }, + { + "epoch": 0.36942169630939065, + "grad_norm": 0.1291973739862442, + "learning_rate": 0.00018523596478568338, + "loss": 0.1995, + "step": 5120 + }, + { + "epoch": 0.3694938489844511, + "grad_norm": 0.11190078407526016, + "learning_rate": 0.00018523307836628664, + "loss": 0.1156, + "step": 5121 + }, + { + "epoch": 0.36956600165951153, + "grad_norm": 0.1329687386751175, + "learning_rate": 0.00018523019194688987, + "loss": 0.1712, + "step": 5122 + }, + { + "epoch": 0.369638154334572, + "grad_norm": 0.11458615958690643, + "learning_rate": 0.00018522730552749314, + "loss": 0.1704, + "step": 5123 + }, + { + "epoch": 0.36971030700963237, + "grad_norm": 0.1093921810388565, + "learning_rate": 0.00018522441910809643, + "loss": 0.1724, + "step": 5124 + }, + { + "epoch": 0.3697824596846928, + "grad_norm": 0.12858615815639496, + "learning_rate": 0.0001852215326886997, + "loss": 0.1628, + "step": 5125 + }, + { + "epoch": 0.36985461235975325, + "grad_norm": 0.1646309196949005, + "learning_rate": 0.00018521864626930295, + "loss": 0.1371, + "step": 5126 + }, + { + "epoch": 0.36992676503481364, + "grad_norm": 0.12659627199172974, + "learning_rate": 0.0001852157598499062, + "loss": 0.1148, + "step": 5127 + }, + { + "epoch": 0.3699989177098741, + "grad_norm": 0.16643866896629333, + "learning_rate": 0.00018521287343050945, + "loss": 0.1886, + "step": 5128 + }, + { + "epoch": 0.37007107038493453, + "grad_norm": 0.11686883866786957, + "learning_rate": 0.00018520998701111271, + "loss": 0.1142, + "step": 5129 + }, + { + "epoch": 0.370143223059995, + "grad_norm": 0.12929080426692963, + "learning_rate": 0.00018520710059171598, + "loss": 0.1463, + "step": 5130 + }, + { + "epoch": 0.37021537573505536, + "grad_norm": 0.10576673597097397, + "learning_rate": 0.00018520421417231927, + "loss": 0.1083, + "step": 5131 + }, + { + "epoch": 0.3702875284101158, + "grad_norm": 0.1180238425731659, + "learning_rate": 0.0001852013277529225, + "loss": 0.1625, + "step": 5132 + }, + { + "epoch": 0.37035968108517625, + "grad_norm": 0.14105002582073212, + "learning_rate": 0.00018519844133352577, + "loss": 0.1921, + "step": 5133 + }, + { + "epoch": 0.37043183376023664, + "grad_norm": 0.10322585701942444, + "learning_rate": 0.00018519555491412903, + "loss": 0.1203, + "step": 5134 + }, + { + "epoch": 0.3705039864352971, + "grad_norm": 0.12555329501628876, + "learning_rate": 0.0001851926684947323, + "loss": 0.1714, + "step": 5135 + }, + { + "epoch": 0.3705761391103575, + "grad_norm": 0.11362385749816895, + "learning_rate": 0.00018518978207533556, + "loss": 0.1574, + "step": 5136 + }, + { + "epoch": 0.37064829178541797, + "grad_norm": 0.11772707849740982, + "learning_rate": 0.00018518689565593882, + "loss": 0.1694, + "step": 5137 + }, + { + "epoch": 0.37072044446047836, + "grad_norm": 0.1352548599243164, + "learning_rate": 0.00018518400923654208, + "loss": 0.1703, + "step": 5138 + }, + { + "epoch": 0.3707925971355388, + "grad_norm": 0.11592496931552887, + "learning_rate": 0.00018518112281714534, + "loss": 0.1873, + "step": 5139 + }, + { + "epoch": 0.37086474981059925, + "grad_norm": 0.12126876413822174, + "learning_rate": 0.0001851782363977486, + "loss": 0.1846, + "step": 5140 + }, + { + "epoch": 0.37093690248565964, + "grad_norm": 0.09664303064346313, + "learning_rate": 0.00018517534997835187, + "loss": 0.0948, + "step": 5141 + }, + { + "epoch": 0.3710090551607201, + "grad_norm": 0.15003101527690887, + "learning_rate": 0.00018517246355895513, + "loss": 0.2187, + "step": 5142 + }, + { + "epoch": 0.3710812078357805, + "grad_norm": 0.11940673738718033, + "learning_rate": 0.00018516957713955837, + "loss": 0.1276, + "step": 5143 + }, + { + "epoch": 0.3711533605108409, + "grad_norm": 0.10618552565574646, + "learning_rate": 0.00018516669072016163, + "loss": 0.1542, + "step": 5144 + }, + { + "epoch": 0.37122551318590136, + "grad_norm": 0.11178718507289886, + "learning_rate": 0.00018516380430076492, + "loss": 0.1326, + "step": 5145 + }, + { + "epoch": 0.3712976658609618, + "grad_norm": 0.12645266950130463, + "learning_rate": 0.00018516091788136818, + "loss": 0.1407, + "step": 5146 + }, + { + "epoch": 0.37136981853602224, + "grad_norm": 0.11778198182582855, + "learning_rate": 0.00018515803146197145, + "loss": 0.1773, + "step": 5147 + }, + { + "epoch": 0.37144197121108263, + "grad_norm": 0.12361133098602295, + "learning_rate": 0.00018515514504257468, + "loss": 0.1357, + "step": 5148 + }, + { + "epoch": 0.3715141238861431, + "grad_norm": 0.0917186588048935, + "learning_rate": 0.00018515225862317795, + "loss": 0.1331, + "step": 5149 + }, + { + "epoch": 0.3715862765612035, + "grad_norm": 0.11127284169197083, + "learning_rate": 0.0001851493722037812, + "loss": 0.1188, + "step": 5150 + }, + { + "epoch": 0.3716584292362639, + "grad_norm": 0.15068739652633667, + "learning_rate": 0.00018514648578438447, + "loss": 0.174, + "step": 5151 + }, + { + "epoch": 0.37173058191132435, + "grad_norm": 0.10275956988334656, + "learning_rate": 0.00018514359936498776, + "loss": 0.1613, + "step": 5152 + }, + { + "epoch": 0.3718027345863848, + "grad_norm": 0.1003510132431984, + "learning_rate": 0.000185140712945591, + "loss": 0.1288, + "step": 5153 + }, + { + "epoch": 0.37187488726144524, + "grad_norm": 0.09363386780023575, + "learning_rate": 0.00018513782652619426, + "loss": 0.1305, + "step": 5154 + }, + { + "epoch": 0.37194703993650563, + "grad_norm": 0.1276710331439972, + "learning_rate": 0.00018513494010679752, + "loss": 0.2236, + "step": 5155 + }, + { + "epoch": 0.3720191926115661, + "grad_norm": 0.12521874904632568, + "learning_rate": 0.00018513205368740079, + "loss": 0.152, + "step": 5156 + }, + { + "epoch": 0.3720913452866265, + "grad_norm": 0.12947016954421997, + "learning_rate": 0.00018512916726800405, + "loss": 0.1358, + "step": 5157 + }, + { + "epoch": 0.3721634979616869, + "grad_norm": 0.12372040748596191, + "learning_rate": 0.0001851262808486073, + "loss": 0.1574, + "step": 5158 + }, + { + "epoch": 0.37223565063674735, + "grad_norm": 0.18169283866882324, + "learning_rate": 0.00018512339442921058, + "loss": 0.1858, + "step": 5159 + }, + { + "epoch": 0.3723078033118078, + "grad_norm": 0.12034885585308075, + "learning_rate": 0.00018512050800981384, + "loss": 0.1416, + "step": 5160 + }, + { + "epoch": 0.37237995598686824, + "grad_norm": 0.09472742676734924, + "learning_rate": 0.0001851176215904171, + "loss": 0.1548, + "step": 5161 + }, + { + "epoch": 0.3724521086619286, + "grad_norm": 0.09557875245809555, + "learning_rate": 0.00018511473517102036, + "loss": 0.1493, + "step": 5162 + }, + { + "epoch": 0.37252426133698907, + "grad_norm": 0.12181645631790161, + "learning_rate": 0.00018511184875162363, + "loss": 0.1092, + "step": 5163 + }, + { + "epoch": 0.3725964140120495, + "grad_norm": 0.14272871613502502, + "learning_rate": 0.00018510896233222686, + "loss": 0.2234, + "step": 5164 + }, + { + "epoch": 0.3726685666871099, + "grad_norm": 0.11819078773260117, + "learning_rate": 0.00018510607591283013, + "loss": 0.1381, + "step": 5165 + }, + { + "epoch": 0.37274071936217035, + "grad_norm": 0.1666698306798935, + "learning_rate": 0.00018510318949343342, + "loss": 0.2006, + "step": 5166 + }, + { + "epoch": 0.3728128720372308, + "grad_norm": 0.1215706467628479, + "learning_rate": 0.00018510030307403668, + "loss": 0.1684, + "step": 5167 + }, + { + "epoch": 0.37288502471229124, + "grad_norm": 0.1380370706319809, + "learning_rate": 0.00018509741665463994, + "loss": 0.2058, + "step": 5168 + }, + { + "epoch": 0.3729571773873516, + "grad_norm": 0.10384880006313324, + "learning_rate": 0.0001850945302352432, + "loss": 0.156, + "step": 5169 + }, + { + "epoch": 0.37302933006241207, + "grad_norm": 0.11163298040628433, + "learning_rate": 0.00018509164381584644, + "loss": 0.1436, + "step": 5170 + }, + { + "epoch": 0.3731014827374725, + "grad_norm": 0.13944418728351593, + "learning_rate": 0.0001850887573964497, + "loss": 0.1954, + "step": 5171 + }, + { + "epoch": 0.3731736354125329, + "grad_norm": 0.17720527946949005, + "learning_rate": 0.00018508587097705297, + "loss": 0.1473, + "step": 5172 + }, + { + "epoch": 0.37324578808759334, + "grad_norm": 0.10967066138982773, + "learning_rate": 0.00018508298455765626, + "loss": 0.1882, + "step": 5173 + }, + { + "epoch": 0.3733179407626538, + "grad_norm": 0.12297673523426056, + "learning_rate": 0.00018508009813825952, + "loss": 0.1715, + "step": 5174 + }, + { + "epoch": 0.3733900934377142, + "grad_norm": 0.15667521953582764, + "learning_rate": 0.00018507721171886275, + "loss": 0.1389, + "step": 5175 + }, + { + "epoch": 0.3734622461127746, + "grad_norm": 0.12142255902290344, + "learning_rate": 0.00018507432529946602, + "loss": 0.1701, + "step": 5176 + }, + { + "epoch": 0.37353439878783506, + "grad_norm": 0.10766392946243286, + "learning_rate": 0.00018507143888006928, + "loss": 0.1125, + "step": 5177 + }, + { + "epoch": 0.3736065514628955, + "grad_norm": 0.10855721682310104, + "learning_rate": 0.00018506855246067254, + "loss": 0.1527, + "step": 5178 + }, + { + "epoch": 0.3736787041379559, + "grad_norm": 0.09985214471817017, + "learning_rate": 0.0001850656660412758, + "loss": 0.1422, + "step": 5179 + }, + { + "epoch": 0.37375085681301634, + "grad_norm": 0.11852970719337463, + "learning_rate": 0.00018506277962187907, + "loss": 0.1296, + "step": 5180 + }, + { + "epoch": 0.3738230094880768, + "grad_norm": 0.13748237490653992, + "learning_rate": 0.00018505989320248233, + "loss": 0.1861, + "step": 5181 + }, + { + "epoch": 0.3738951621631372, + "grad_norm": 0.10805381834506989, + "learning_rate": 0.0001850570067830856, + "loss": 0.1693, + "step": 5182 + }, + { + "epoch": 0.3739673148381976, + "grad_norm": 0.10942400246858597, + "learning_rate": 0.00018505412036368886, + "loss": 0.1575, + "step": 5183 + }, + { + "epoch": 0.37403946751325806, + "grad_norm": 0.1067863404750824, + "learning_rate": 0.00018505123394429212, + "loss": 0.1128, + "step": 5184 + }, + { + "epoch": 0.3741116201883185, + "grad_norm": 0.13118085265159607, + "learning_rate": 0.00018504834752489538, + "loss": 0.1486, + "step": 5185 + }, + { + "epoch": 0.3741837728633789, + "grad_norm": 0.158247172832489, + "learning_rate": 0.00018504546110549862, + "loss": 0.1464, + "step": 5186 + }, + { + "epoch": 0.37425592553843934, + "grad_norm": 0.12931883335113525, + "learning_rate": 0.0001850425746861019, + "loss": 0.1421, + "step": 5187 + }, + { + "epoch": 0.3743280782134998, + "grad_norm": 0.11958953738212585, + "learning_rate": 0.00018503968826670517, + "loss": 0.1432, + "step": 5188 + }, + { + "epoch": 0.37440023088856017, + "grad_norm": 0.15297405421733856, + "learning_rate": 0.00018503680184730844, + "loss": 0.1302, + "step": 5189 + }, + { + "epoch": 0.3744723835636206, + "grad_norm": 0.19646477699279785, + "learning_rate": 0.0001850339154279117, + "loss": 0.1448, + "step": 5190 + }, + { + "epoch": 0.37454453623868106, + "grad_norm": 0.15121282637119293, + "learning_rate": 0.00018503102900851493, + "loss": 0.1458, + "step": 5191 + }, + { + "epoch": 0.3746166889137415, + "grad_norm": 0.14924101531505585, + "learning_rate": 0.0001850281425891182, + "loss": 0.1541, + "step": 5192 + }, + { + "epoch": 0.3746888415888019, + "grad_norm": 0.12003999948501587, + "learning_rate": 0.00018502525616972146, + "loss": 0.1964, + "step": 5193 + }, + { + "epoch": 0.37476099426386233, + "grad_norm": 0.10451246798038483, + "learning_rate": 0.00018502236975032472, + "loss": 0.1744, + "step": 5194 + }, + { + "epoch": 0.3748331469389228, + "grad_norm": 0.14572416245937347, + "learning_rate": 0.000185019483330928, + "loss": 0.1611, + "step": 5195 + }, + { + "epoch": 0.37490529961398317, + "grad_norm": 0.12013660371303558, + "learning_rate": 0.00018501659691153125, + "loss": 0.1958, + "step": 5196 + }, + { + "epoch": 0.3749774522890436, + "grad_norm": 0.12600034475326538, + "learning_rate": 0.0001850137104921345, + "loss": 0.1563, + "step": 5197 + }, + { + "epoch": 0.37504960496410406, + "grad_norm": 0.11181921511888504, + "learning_rate": 0.00018501082407273777, + "loss": 0.1529, + "step": 5198 + }, + { + "epoch": 0.3751217576391645, + "grad_norm": 0.13128091394901276, + "learning_rate": 0.00018500793765334104, + "loss": 0.1629, + "step": 5199 + }, + { + "epoch": 0.3751939103142249, + "grad_norm": 0.10196653008460999, + "learning_rate": 0.0001850050512339443, + "loss": 0.1718, + "step": 5200 + }, + { + "epoch": 0.37526606298928533, + "grad_norm": 0.10212597995996475, + "learning_rate": 0.00018500216481454756, + "loss": 0.1602, + "step": 5201 + }, + { + "epoch": 0.3753382156643458, + "grad_norm": 0.12853200733661652, + "learning_rate": 0.00018499927839515083, + "loss": 0.153, + "step": 5202 + }, + { + "epoch": 0.37541036833940616, + "grad_norm": 0.11547980457544327, + "learning_rate": 0.0001849963919757541, + "loss": 0.165, + "step": 5203 + }, + { + "epoch": 0.3754825210144666, + "grad_norm": 0.11114273220300674, + "learning_rate": 0.00018499350555635735, + "loss": 0.1702, + "step": 5204 + }, + { + "epoch": 0.37555467368952705, + "grad_norm": 0.10827504843473434, + "learning_rate": 0.00018499061913696062, + "loss": 0.1513, + "step": 5205 + }, + { + "epoch": 0.37562682636458744, + "grad_norm": 0.12006576359272003, + "learning_rate": 0.00018498773271756388, + "loss": 0.0892, + "step": 5206 + }, + { + "epoch": 0.3756989790396479, + "grad_norm": 0.12298358976840973, + "learning_rate": 0.00018498484629816711, + "loss": 0.1007, + "step": 5207 + }, + { + "epoch": 0.37577113171470833, + "grad_norm": 0.12523004412651062, + "learning_rate": 0.00018498195987877038, + "loss": 0.157, + "step": 5208 + }, + { + "epoch": 0.3758432843897688, + "grad_norm": 0.12031126022338867, + "learning_rate": 0.00018497907345937367, + "loss": 0.1319, + "step": 5209 + }, + { + "epoch": 0.37591543706482916, + "grad_norm": 0.15156899392604828, + "learning_rate": 0.00018497618703997693, + "loss": 0.1494, + "step": 5210 + }, + { + "epoch": 0.3759875897398896, + "grad_norm": 0.12533117830753326, + "learning_rate": 0.0001849733006205802, + "loss": 0.168, + "step": 5211 + }, + { + "epoch": 0.37605974241495005, + "grad_norm": 0.11359720677137375, + "learning_rate": 0.00018497041420118343, + "loss": 0.1936, + "step": 5212 + }, + { + "epoch": 0.37613189509001044, + "grad_norm": 0.12386760860681534, + "learning_rate": 0.0001849675277817867, + "loss": 0.2077, + "step": 5213 + }, + { + "epoch": 0.3762040477650709, + "grad_norm": 0.09602401405572891, + "learning_rate": 0.00018496464136238995, + "loss": 0.138, + "step": 5214 + }, + { + "epoch": 0.3762762004401313, + "grad_norm": 0.11211474239826202, + "learning_rate": 0.00018496175494299322, + "loss": 0.1555, + "step": 5215 + }, + { + "epoch": 0.37634835311519177, + "grad_norm": 0.145424485206604, + "learning_rate": 0.0001849588685235965, + "loss": 0.1402, + "step": 5216 + }, + { + "epoch": 0.37642050579025216, + "grad_norm": 0.1361163705587387, + "learning_rate": 0.00018495598210419974, + "loss": 0.1345, + "step": 5217 + }, + { + "epoch": 0.3764926584653126, + "grad_norm": 0.12563641369342804, + "learning_rate": 0.000184953095684803, + "loss": 0.169, + "step": 5218 + }, + { + "epoch": 0.37656481114037305, + "grad_norm": 0.10223568975925446, + "learning_rate": 0.00018495020926540627, + "loss": 0.144, + "step": 5219 + }, + { + "epoch": 0.37663696381543343, + "grad_norm": 0.10732857137918472, + "learning_rate": 0.00018494732284600953, + "loss": 0.1629, + "step": 5220 + }, + { + "epoch": 0.3767091164904939, + "grad_norm": 0.15483683347702026, + "learning_rate": 0.0001849444364266128, + "loss": 0.1828, + "step": 5221 + }, + { + "epoch": 0.3767812691655543, + "grad_norm": 0.12385743111371994, + "learning_rate": 0.00018494155000721606, + "loss": 0.1336, + "step": 5222 + }, + { + "epoch": 0.37685342184061477, + "grad_norm": 0.10738009959459305, + "learning_rate": 0.00018493866358781932, + "loss": 0.1541, + "step": 5223 + }, + { + "epoch": 0.37692557451567515, + "grad_norm": 0.12624196708202362, + "learning_rate": 0.00018493577716842258, + "loss": 0.1947, + "step": 5224 + }, + { + "epoch": 0.3769977271907356, + "grad_norm": 0.1286533772945404, + "learning_rate": 0.00018493289074902585, + "loss": 0.1794, + "step": 5225 + }, + { + "epoch": 0.37706987986579604, + "grad_norm": 0.11351227760314941, + "learning_rate": 0.0001849300043296291, + "loss": 0.0987, + "step": 5226 + }, + { + "epoch": 0.37714203254085643, + "grad_norm": 0.1511540710926056, + "learning_rate": 0.00018492711791023237, + "loss": 0.139, + "step": 5227 + }, + { + "epoch": 0.3772141852159169, + "grad_norm": 0.113191619515419, + "learning_rate": 0.0001849242314908356, + "loss": 0.177, + "step": 5228 + }, + { + "epoch": 0.3772863378909773, + "grad_norm": 0.12065392732620239, + "learning_rate": 0.00018492134507143887, + "loss": 0.156, + "step": 5229 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.09924370795488358, + "learning_rate": 0.00018491845865204216, + "loss": 0.1436, + "step": 5230 + }, + { + "epoch": 0.37743064324109815, + "grad_norm": 0.10744208842515945, + "learning_rate": 0.00018491557223264542, + "loss": 0.1498, + "step": 5231 + }, + { + "epoch": 0.3775027959161586, + "grad_norm": 0.11493775993585587, + "learning_rate": 0.0001849126858132487, + "loss": 0.1281, + "step": 5232 + }, + { + "epoch": 0.37757494859121904, + "grad_norm": 0.15129004418849945, + "learning_rate": 0.00018490979939385192, + "loss": 0.1879, + "step": 5233 + }, + { + "epoch": 0.37764710126627943, + "grad_norm": 0.14441713690757751, + "learning_rate": 0.00018490691297445519, + "loss": 0.2083, + "step": 5234 + }, + { + "epoch": 0.37771925394133987, + "grad_norm": 0.10310018062591553, + "learning_rate": 0.00018490402655505845, + "loss": 0.1561, + "step": 5235 + }, + { + "epoch": 0.3777914066164003, + "grad_norm": 0.1262882649898529, + "learning_rate": 0.0001849011401356617, + "loss": 0.1585, + "step": 5236 + }, + { + "epoch": 0.3778635592914607, + "grad_norm": 0.10732994228601456, + "learning_rate": 0.000184898253716265, + "loss": 0.1657, + "step": 5237 + }, + { + "epoch": 0.37793571196652115, + "grad_norm": 0.11948318034410477, + "learning_rate": 0.00018489536729686824, + "loss": 0.1835, + "step": 5238 + }, + { + "epoch": 0.3780078646415816, + "grad_norm": 0.1175948828458786, + "learning_rate": 0.0001848924808774715, + "loss": 0.1943, + "step": 5239 + }, + { + "epoch": 0.37808001731664204, + "grad_norm": 0.10899997502565384, + "learning_rate": 0.00018488959445807476, + "loss": 0.1541, + "step": 5240 + }, + { + "epoch": 0.3781521699917024, + "grad_norm": 0.0994245633482933, + "learning_rate": 0.00018488670803867803, + "loss": 0.1894, + "step": 5241 + }, + { + "epoch": 0.37822432266676287, + "grad_norm": 0.11324607580900192, + "learning_rate": 0.0001848838216192813, + "loss": 0.175, + "step": 5242 + }, + { + "epoch": 0.3782964753418233, + "grad_norm": 0.09962138533592224, + "learning_rate": 0.00018488093519988455, + "loss": 0.2107, + "step": 5243 + }, + { + "epoch": 0.3783686280168837, + "grad_norm": 0.11725051701068878, + "learning_rate": 0.00018487804878048782, + "loss": 0.1915, + "step": 5244 + }, + { + "epoch": 0.37844078069194415, + "grad_norm": 0.11407988518476486, + "learning_rate": 0.00018487516236109108, + "loss": 0.1548, + "step": 5245 + }, + { + "epoch": 0.3785129333670046, + "grad_norm": 0.09773348271846771, + "learning_rate": 0.00018487227594169434, + "loss": 0.184, + "step": 5246 + }, + { + "epoch": 0.37858508604206503, + "grad_norm": 0.11072361469268799, + "learning_rate": 0.0001848693895222976, + "loss": 0.16, + "step": 5247 + }, + { + "epoch": 0.3786572387171254, + "grad_norm": 0.12344031035900116, + "learning_rate": 0.00018486650310290087, + "loss": 0.2174, + "step": 5248 + }, + { + "epoch": 0.37872939139218587, + "grad_norm": 0.11325129121541977, + "learning_rate": 0.0001848636166835041, + "loss": 0.1661, + "step": 5249 + }, + { + "epoch": 0.3788015440672463, + "grad_norm": 0.10998645424842834, + "learning_rate": 0.00018486073026410737, + "loss": 0.1149, + "step": 5250 + }, + { + "epoch": 0.3788736967423067, + "grad_norm": 0.11722152680158615, + "learning_rate": 0.00018485784384471066, + "loss": 0.1137, + "step": 5251 + }, + { + "epoch": 0.37894584941736714, + "grad_norm": 0.09863753616809845, + "learning_rate": 0.00018485495742531392, + "loss": 0.181, + "step": 5252 + }, + { + "epoch": 0.3790180020924276, + "grad_norm": 0.12387730181217194, + "learning_rate": 0.00018485207100591718, + "loss": 0.1864, + "step": 5253 + }, + { + "epoch": 0.37909015476748803, + "grad_norm": 0.11490000039339066, + "learning_rate": 0.00018484918458652042, + "loss": 0.1487, + "step": 5254 + }, + { + "epoch": 0.3791623074425484, + "grad_norm": 0.14609166979789734, + "learning_rate": 0.00018484629816712368, + "loss": 0.2246, + "step": 5255 + }, + { + "epoch": 0.37923446011760886, + "grad_norm": 0.11078675091266632, + "learning_rate": 0.00018484341174772694, + "loss": 0.186, + "step": 5256 + }, + { + "epoch": 0.3793066127926693, + "grad_norm": 0.09130535274744034, + "learning_rate": 0.0001848405253283302, + "loss": 0.1847, + "step": 5257 + }, + { + "epoch": 0.3793787654677297, + "grad_norm": 0.12202174961566925, + "learning_rate": 0.0001848376389089335, + "loss": 0.171, + "step": 5258 + }, + { + "epoch": 0.37945091814279014, + "grad_norm": 0.09502533078193665, + "learning_rate": 0.00018483475248953673, + "loss": 0.116, + "step": 5259 + }, + { + "epoch": 0.3795230708178506, + "grad_norm": 0.09844629466533661, + "learning_rate": 0.00018483186607014, + "loss": 0.1335, + "step": 5260 + }, + { + "epoch": 0.379595223492911, + "grad_norm": 0.10482223331928253, + "learning_rate": 0.00018482897965074326, + "loss": 0.156, + "step": 5261 + }, + { + "epoch": 0.3796673761679714, + "grad_norm": 0.14621932804584503, + "learning_rate": 0.00018482609323134652, + "loss": 0.2034, + "step": 5262 + }, + { + "epoch": 0.37973952884303186, + "grad_norm": 0.11277309060096741, + "learning_rate": 0.00018482320681194978, + "loss": 0.1122, + "step": 5263 + }, + { + "epoch": 0.3798116815180923, + "grad_norm": 0.09830714017152786, + "learning_rate": 0.00018482032039255305, + "loss": 0.1383, + "step": 5264 + }, + { + "epoch": 0.3798838341931527, + "grad_norm": 0.12473218888044357, + "learning_rate": 0.0001848174339731563, + "loss": 0.1575, + "step": 5265 + }, + { + "epoch": 0.37995598686821314, + "grad_norm": 0.11305204033851624, + "learning_rate": 0.00018481454755375957, + "loss": 0.1695, + "step": 5266 + }, + { + "epoch": 0.3800281395432736, + "grad_norm": 0.13128140568733215, + "learning_rate": 0.00018481166113436284, + "loss": 0.1735, + "step": 5267 + }, + { + "epoch": 0.38010029221833397, + "grad_norm": 0.1158086508512497, + "learning_rate": 0.0001848087747149661, + "loss": 0.1545, + "step": 5268 + }, + { + "epoch": 0.3801724448933944, + "grad_norm": 0.11829610913991928, + "learning_rate": 0.00018480588829556936, + "loss": 0.168, + "step": 5269 + }, + { + "epoch": 0.38024459756845486, + "grad_norm": 0.09383835643529892, + "learning_rate": 0.0001848030018761726, + "loss": 0.1199, + "step": 5270 + }, + { + "epoch": 0.3803167502435153, + "grad_norm": 0.12397849559783936, + "learning_rate": 0.00018480011545677586, + "loss": 0.1468, + "step": 5271 + }, + { + "epoch": 0.3803889029185757, + "grad_norm": 0.11687537282705307, + "learning_rate": 0.00018479722903737915, + "loss": 0.1096, + "step": 5272 + }, + { + "epoch": 0.38046105559363613, + "grad_norm": 0.12818457186222076, + "learning_rate": 0.0001847943426179824, + "loss": 0.1712, + "step": 5273 + }, + { + "epoch": 0.3805332082686966, + "grad_norm": 0.10428598523139954, + "learning_rate": 0.00018479145619858568, + "loss": 0.1297, + "step": 5274 + }, + { + "epoch": 0.38060536094375697, + "grad_norm": 0.08568240702152252, + "learning_rate": 0.0001847885697791889, + "loss": 0.1753, + "step": 5275 + }, + { + "epoch": 0.3806775136188174, + "grad_norm": 0.10244014859199524, + "learning_rate": 0.00018478568335979217, + "loss": 0.1478, + "step": 5276 + }, + { + "epoch": 0.38074966629387785, + "grad_norm": 0.1192367896437645, + "learning_rate": 0.00018478279694039544, + "loss": 0.1416, + "step": 5277 + }, + { + "epoch": 0.3808218189689383, + "grad_norm": 0.1428164690732956, + "learning_rate": 0.0001847799105209987, + "loss": 0.1437, + "step": 5278 + }, + { + "epoch": 0.3808939716439987, + "grad_norm": 0.11545929312705994, + "learning_rate": 0.000184777024101602, + "loss": 0.1769, + "step": 5279 + }, + { + "epoch": 0.38096612431905913, + "grad_norm": 0.12664854526519775, + "learning_rate": 0.00018477413768220523, + "loss": 0.1852, + "step": 5280 + }, + { + "epoch": 0.3810382769941196, + "grad_norm": 0.09942354261875153, + "learning_rate": 0.0001847712512628085, + "loss": 0.1493, + "step": 5281 + }, + { + "epoch": 0.38111042966917996, + "grad_norm": 0.12140817940235138, + "learning_rate": 0.00018476836484341175, + "loss": 0.1723, + "step": 5282 + }, + { + "epoch": 0.3811825823442404, + "grad_norm": 0.11576629430055618, + "learning_rate": 0.00018476547842401501, + "loss": 0.1425, + "step": 5283 + }, + { + "epoch": 0.38125473501930085, + "grad_norm": 0.1314578354358673, + "learning_rate": 0.00018476259200461828, + "loss": 0.1543, + "step": 5284 + }, + { + "epoch": 0.3813268876943613, + "grad_norm": 0.1331385374069214, + "learning_rate": 0.00018475970558522154, + "loss": 0.1852, + "step": 5285 + }, + { + "epoch": 0.3813990403694217, + "grad_norm": 0.10998408496379852, + "learning_rate": 0.0001847568191658248, + "loss": 0.1247, + "step": 5286 + }, + { + "epoch": 0.3814711930444821, + "grad_norm": 0.12362408638000488, + "learning_rate": 0.00018475393274642807, + "loss": 0.184, + "step": 5287 + }, + { + "epoch": 0.38154334571954257, + "grad_norm": 0.09371355921030045, + "learning_rate": 0.00018475104632703133, + "loss": 0.1363, + "step": 5288 + }, + { + "epoch": 0.38161549839460296, + "grad_norm": 0.12054570764303207, + "learning_rate": 0.0001847481599076346, + "loss": 0.1973, + "step": 5289 + }, + { + "epoch": 0.3816876510696634, + "grad_norm": 0.11465035378932953, + "learning_rate": 0.00018474527348823786, + "loss": 0.1481, + "step": 5290 + }, + { + "epoch": 0.38175980374472385, + "grad_norm": 0.11174909770488739, + "learning_rate": 0.00018474238706884112, + "loss": 0.1507, + "step": 5291 + }, + { + "epoch": 0.3818319564197843, + "grad_norm": 0.10161197930574417, + "learning_rate": 0.00018473950064944435, + "loss": 0.1777, + "step": 5292 + }, + { + "epoch": 0.3819041090948447, + "grad_norm": 0.11192326992750168, + "learning_rate": 0.00018473661423004764, + "loss": 0.1521, + "step": 5293 + }, + { + "epoch": 0.3819762617699051, + "grad_norm": 0.11598385125398636, + "learning_rate": 0.0001847337278106509, + "loss": 0.1865, + "step": 5294 + }, + { + "epoch": 0.38204841444496557, + "grad_norm": 0.1161946952342987, + "learning_rate": 0.00018473084139125417, + "loss": 0.1116, + "step": 5295 + }, + { + "epoch": 0.38212056712002596, + "grad_norm": 0.11318299919366837, + "learning_rate": 0.00018472795497185743, + "loss": 0.1465, + "step": 5296 + }, + { + "epoch": 0.3821927197950864, + "grad_norm": 0.12316545844078064, + "learning_rate": 0.00018472506855246067, + "loss": 0.1831, + "step": 5297 + }, + { + "epoch": 0.38226487247014684, + "grad_norm": 0.1300279051065445, + "learning_rate": 0.00018472218213306393, + "loss": 0.1342, + "step": 5298 + }, + { + "epoch": 0.38233702514520723, + "grad_norm": 0.11169512569904327, + "learning_rate": 0.0001847192957136672, + "loss": 0.1615, + "step": 5299 + }, + { + "epoch": 0.3824091778202677, + "grad_norm": 0.1198602169752121, + "learning_rate": 0.00018471640929427048, + "loss": 0.1483, + "step": 5300 + }, + { + "epoch": 0.3824813304953281, + "grad_norm": 0.12193506211042404, + "learning_rate": 0.00018471352287487375, + "loss": 0.1258, + "step": 5301 + }, + { + "epoch": 0.38255348317038856, + "grad_norm": 0.1365336775779724, + "learning_rate": 0.00018471063645547698, + "loss": 0.178, + "step": 5302 + }, + { + "epoch": 0.38262563584544895, + "grad_norm": 0.13387756049633026, + "learning_rate": 0.00018470775003608025, + "loss": 0.1403, + "step": 5303 + }, + { + "epoch": 0.3826977885205094, + "grad_norm": 0.1223507970571518, + "learning_rate": 0.0001847048636166835, + "loss": 0.1892, + "step": 5304 + }, + { + "epoch": 0.38276994119556984, + "grad_norm": 0.11407187581062317, + "learning_rate": 0.00018470197719728677, + "loss": 0.1332, + "step": 5305 + }, + { + "epoch": 0.38284209387063023, + "grad_norm": 0.10814730077981949, + "learning_rate": 0.00018469909077789003, + "loss": 0.1567, + "step": 5306 + }, + { + "epoch": 0.3829142465456907, + "grad_norm": 0.13604643940925598, + "learning_rate": 0.0001846962043584933, + "loss": 0.1558, + "step": 5307 + }, + { + "epoch": 0.3829863992207511, + "grad_norm": 0.12819768488407135, + "learning_rate": 0.00018469331793909656, + "loss": 0.1514, + "step": 5308 + }, + { + "epoch": 0.38305855189581156, + "grad_norm": 0.1057751253247261, + "learning_rate": 0.00018469043151969982, + "loss": 0.1247, + "step": 5309 + }, + { + "epoch": 0.38313070457087195, + "grad_norm": 0.10188581049442291, + "learning_rate": 0.0001846875451003031, + "loss": 0.2009, + "step": 5310 + }, + { + "epoch": 0.3832028572459324, + "grad_norm": 0.10390143096446991, + "learning_rate": 0.00018468465868090635, + "loss": 0.1475, + "step": 5311 + }, + { + "epoch": 0.38327500992099284, + "grad_norm": 0.10660845041275024, + "learning_rate": 0.0001846817722615096, + "loss": 0.1695, + "step": 5312 + }, + { + "epoch": 0.3833471625960532, + "grad_norm": 0.12353482842445374, + "learning_rate": 0.00018467888584211285, + "loss": 0.1797, + "step": 5313 + }, + { + "epoch": 0.38341931527111367, + "grad_norm": 0.12088686227798462, + "learning_rate": 0.00018467599942271614, + "loss": 0.196, + "step": 5314 + }, + { + "epoch": 0.3834914679461741, + "grad_norm": 0.09210559725761414, + "learning_rate": 0.0001846731130033194, + "loss": 0.1526, + "step": 5315 + }, + { + "epoch": 0.38356362062123456, + "grad_norm": 0.1001611053943634, + "learning_rate": 0.00018467022658392266, + "loss": 0.1851, + "step": 5316 + }, + { + "epoch": 0.38363577329629495, + "grad_norm": 0.10381637513637543, + "learning_rate": 0.00018466734016452593, + "loss": 0.13, + "step": 5317 + }, + { + "epoch": 0.3837079259713554, + "grad_norm": 0.13279971480369568, + "learning_rate": 0.00018466445374512916, + "loss": 0.1296, + "step": 5318 + }, + { + "epoch": 0.38378007864641583, + "grad_norm": 0.13534903526306152, + "learning_rate": 0.00018466156732573243, + "loss": 0.1512, + "step": 5319 + }, + { + "epoch": 0.3838522313214762, + "grad_norm": 0.11557264626026154, + "learning_rate": 0.0001846586809063357, + "loss": 0.161, + "step": 5320 + }, + { + "epoch": 0.38392438399653667, + "grad_norm": 0.10474171489477158, + "learning_rate": 0.00018465579448693898, + "loss": 0.1979, + "step": 5321 + }, + { + "epoch": 0.3839965366715971, + "grad_norm": 0.13840435445308685, + "learning_rate": 0.00018465290806754224, + "loss": 0.1288, + "step": 5322 + }, + { + "epoch": 0.38406868934665755, + "grad_norm": 0.12595689296722412, + "learning_rate": 0.00018465002164814548, + "loss": 0.1164, + "step": 5323 + }, + { + "epoch": 0.38414084202171794, + "grad_norm": 0.10718733072280884, + "learning_rate": 0.00018464713522874874, + "loss": 0.1421, + "step": 5324 + }, + { + "epoch": 0.3842129946967784, + "grad_norm": 0.12469929456710815, + "learning_rate": 0.000184644248809352, + "loss": 0.2049, + "step": 5325 + }, + { + "epoch": 0.38428514737183883, + "grad_norm": 0.10900650918483734, + "learning_rate": 0.00018464136238995527, + "loss": 0.1824, + "step": 5326 + }, + { + "epoch": 0.3843573000468992, + "grad_norm": 0.12053513526916504, + "learning_rate": 0.00018463847597055853, + "loss": 0.1582, + "step": 5327 + }, + { + "epoch": 0.38442945272195966, + "grad_norm": 0.14565838873386383, + "learning_rate": 0.0001846355895511618, + "loss": 0.1341, + "step": 5328 + }, + { + "epoch": 0.3845016053970201, + "grad_norm": 0.11942523717880249, + "learning_rate": 0.00018463270313176506, + "loss": 0.1358, + "step": 5329 + }, + { + "epoch": 0.3845737580720805, + "grad_norm": 0.16042739152908325, + "learning_rate": 0.00018462981671236832, + "loss": 0.1976, + "step": 5330 + }, + { + "epoch": 0.38464591074714094, + "grad_norm": 0.13497167825698853, + "learning_rate": 0.00018462693029297158, + "loss": 0.1564, + "step": 5331 + }, + { + "epoch": 0.3847180634222014, + "grad_norm": 0.10613252967596054, + "learning_rate": 0.00018462404387357484, + "loss": 0.1579, + "step": 5332 + }, + { + "epoch": 0.38479021609726183, + "grad_norm": 0.11304750293493271, + "learning_rate": 0.0001846211574541781, + "loss": 0.1715, + "step": 5333 + }, + { + "epoch": 0.3848623687723222, + "grad_norm": 0.12445401400327682, + "learning_rate": 0.00018461827103478134, + "loss": 0.1411, + "step": 5334 + }, + { + "epoch": 0.38493452144738266, + "grad_norm": 0.11099398136138916, + "learning_rate": 0.00018461538461538463, + "loss": 0.1169, + "step": 5335 + }, + { + "epoch": 0.3850066741224431, + "grad_norm": 0.10735882818698883, + "learning_rate": 0.0001846124981959879, + "loss": 0.132, + "step": 5336 + }, + { + "epoch": 0.3850788267975035, + "grad_norm": 0.13184022903442383, + "learning_rate": 0.00018460961177659116, + "loss": 0.1494, + "step": 5337 + }, + { + "epoch": 0.38515097947256394, + "grad_norm": 0.10633637756109238, + "learning_rate": 0.00018460672535719442, + "loss": 0.1883, + "step": 5338 + }, + { + "epoch": 0.3852231321476244, + "grad_norm": 0.10791455209255219, + "learning_rate": 0.00018460383893779766, + "loss": 0.1568, + "step": 5339 + }, + { + "epoch": 0.3852952848226848, + "grad_norm": 0.11143725365400314, + "learning_rate": 0.00018460095251840092, + "loss": 0.1623, + "step": 5340 + }, + { + "epoch": 0.3853674374977452, + "grad_norm": 0.11982785165309906, + "learning_rate": 0.00018459806609900418, + "loss": 0.1127, + "step": 5341 + }, + { + "epoch": 0.38543959017280566, + "grad_norm": 0.0953775942325592, + "learning_rate": 0.00018459517967960747, + "loss": 0.1433, + "step": 5342 + }, + { + "epoch": 0.3855117428478661, + "grad_norm": 0.09541591256856918, + "learning_rate": 0.00018459229326021074, + "loss": 0.1648, + "step": 5343 + }, + { + "epoch": 0.3855838955229265, + "grad_norm": 0.14726461470127106, + "learning_rate": 0.00018458940684081397, + "loss": 0.1893, + "step": 5344 + }, + { + "epoch": 0.38565604819798693, + "grad_norm": 0.1327129751443863, + "learning_rate": 0.00018458652042141723, + "loss": 0.1897, + "step": 5345 + }, + { + "epoch": 0.3857282008730474, + "grad_norm": 0.1486828774213791, + "learning_rate": 0.0001845836340020205, + "loss": 0.1717, + "step": 5346 + }, + { + "epoch": 0.3858003535481078, + "grad_norm": 0.1061648428440094, + "learning_rate": 0.00018458074758262376, + "loss": 0.0938, + "step": 5347 + }, + { + "epoch": 0.3858725062231682, + "grad_norm": 0.1217179223895073, + "learning_rate": 0.00018457786116322702, + "loss": 0.173, + "step": 5348 + }, + { + "epoch": 0.38594465889822865, + "grad_norm": 0.13469278812408447, + "learning_rate": 0.00018457497474383029, + "loss": 0.1417, + "step": 5349 + }, + { + "epoch": 0.3860168115732891, + "grad_norm": 0.10769320279359818, + "learning_rate": 0.00018457208832443355, + "loss": 0.1652, + "step": 5350 + }, + { + "epoch": 0.3860889642483495, + "grad_norm": 0.10926205664873123, + "learning_rate": 0.0001845692019050368, + "loss": 0.1439, + "step": 5351 + }, + { + "epoch": 0.38616111692340993, + "grad_norm": 0.09097305685281754, + "learning_rate": 0.00018456631548564008, + "loss": 0.1237, + "step": 5352 + }, + { + "epoch": 0.3862332695984704, + "grad_norm": 0.16229401528835297, + "learning_rate": 0.00018456342906624334, + "loss": 0.1723, + "step": 5353 + }, + { + "epoch": 0.3863054222735308, + "grad_norm": 0.10649501532316208, + "learning_rate": 0.0001845605426468466, + "loss": 0.155, + "step": 5354 + }, + { + "epoch": 0.3863775749485912, + "grad_norm": 0.10674185305833817, + "learning_rate": 0.00018455765622744984, + "loss": 0.1761, + "step": 5355 + }, + { + "epoch": 0.38644972762365165, + "grad_norm": 0.13681352138519287, + "learning_rate": 0.00018455476980805313, + "loss": 0.1533, + "step": 5356 + }, + { + "epoch": 0.3865218802987121, + "grad_norm": 0.1047702357172966, + "learning_rate": 0.0001845518833886564, + "loss": 0.1726, + "step": 5357 + }, + { + "epoch": 0.3865940329737725, + "grad_norm": 0.09624727070331573, + "learning_rate": 0.00018454899696925965, + "loss": 0.1286, + "step": 5358 + }, + { + "epoch": 0.3866661856488329, + "grad_norm": 0.10974021255970001, + "learning_rate": 0.00018454611054986292, + "loss": 0.1269, + "step": 5359 + }, + { + "epoch": 0.38673833832389337, + "grad_norm": 0.1494600921869278, + "learning_rate": 0.00018454322413046615, + "loss": 0.1172, + "step": 5360 + }, + { + "epoch": 0.38681049099895376, + "grad_norm": 0.11654991656541824, + "learning_rate": 0.00018454033771106941, + "loss": 0.1575, + "step": 5361 + }, + { + "epoch": 0.3868826436740142, + "grad_norm": 0.13498321175575256, + "learning_rate": 0.00018453745129167268, + "loss": 0.1384, + "step": 5362 + }, + { + "epoch": 0.38695479634907465, + "grad_norm": 0.15553444623947144, + "learning_rate": 0.00018453456487227597, + "loss": 0.1509, + "step": 5363 + }, + { + "epoch": 0.3870269490241351, + "grad_norm": 0.13026754558086395, + "learning_rate": 0.00018453167845287923, + "loss": 0.1682, + "step": 5364 + }, + { + "epoch": 0.3870991016991955, + "grad_norm": 0.1101813018321991, + "learning_rate": 0.00018452879203348247, + "loss": 0.1691, + "step": 5365 + }, + { + "epoch": 0.3871712543742559, + "grad_norm": 0.1375681310892105, + "learning_rate": 0.00018452590561408573, + "loss": 0.1199, + "step": 5366 + }, + { + "epoch": 0.38724340704931637, + "grad_norm": 0.1228758916258812, + "learning_rate": 0.000184523019194689, + "loss": 0.1025, + "step": 5367 + }, + { + "epoch": 0.38731555972437676, + "grad_norm": 0.10448736697435379, + "learning_rate": 0.00018452013277529225, + "loss": 0.1622, + "step": 5368 + }, + { + "epoch": 0.3873877123994372, + "grad_norm": 0.11223381757736206, + "learning_rate": 0.00018451724635589552, + "loss": 0.1847, + "step": 5369 + }, + { + "epoch": 0.38745986507449764, + "grad_norm": 0.13099254667758942, + "learning_rate": 0.00018451435993649878, + "loss": 0.1464, + "step": 5370 + }, + { + "epoch": 0.3875320177495581, + "grad_norm": 0.13704586029052734, + "learning_rate": 0.00018451147351710204, + "loss": 0.0955, + "step": 5371 + }, + { + "epoch": 0.3876041704246185, + "grad_norm": 0.12381067126989365, + "learning_rate": 0.0001845085870977053, + "loss": 0.1466, + "step": 5372 + }, + { + "epoch": 0.3876763230996789, + "grad_norm": 0.1215813010931015, + "learning_rate": 0.00018450570067830857, + "loss": 0.1393, + "step": 5373 + }, + { + "epoch": 0.38774847577473937, + "grad_norm": 0.1295686811208725, + "learning_rate": 0.00018450281425891183, + "loss": 0.1387, + "step": 5374 + }, + { + "epoch": 0.38782062844979975, + "grad_norm": 0.1214752271771431, + "learning_rate": 0.0001844999278395151, + "loss": 0.1621, + "step": 5375 + }, + { + "epoch": 0.3878927811248602, + "grad_norm": 0.1196463406085968, + "learning_rate": 0.00018449704142011833, + "loss": 0.1321, + "step": 5376 + }, + { + "epoch": 0.38796493379992064, + "grad_norm": 0.1484265923500061, + "learning_rate": 0.00018449415500072162, + "loss": 0.1793, + "step": 5377 + }, + { + "epoch": 0.3880370864749811, + "grad_norm": 0.12468485534191132, + "learning_rate": 0.00018449126858132488, + "loss": 0.157, + "step": 5378 + }, + { + "epoch": 0.3881092391500415, + "grad_norm": 0.0925653800368309, + "learning_rate": 0.00018448838216192815, + "loss": 0.1711, + "step": 5379 + }, + { + "epoch": 0.3881813918251019, + "grad_norm": 0.11857761442661285, + "learning_rate": 0.0001844854957425314, + "loss": 0.1254, + "step": 5380 + }, + { + "epoch": 0.38825354450016236, + "grad_norm": 0.11172490566968918, + "learning_rate": 0.00018448260932313465, + "loss": 0.1884, + "step": 5381 + }, + { + "epoch": 0.38832569717522275, + "grad_norm": 0.14004208147525787, + "learning_rate": 0.0001844797229037379, + "loss": 0.1159, + "step": 5382 + }, + { + "epoch": 0.3883978498502832, + "grad_norm": 0.11043383181095123, + "learning_rate": 0.00018447683648434117, + "loss": 0.1471, + "step": 5383 + }, + { + "epoch": 0.38847000252534364, + "grad_norm": 0.11880587786436081, + "learning_rate": 0.00018447395006494446, + "loss": 0.1561, + "step": 5384 + }, + { + "epoch": 0.3885421552004041, + "grad_norm": 0.1249169260263443, + "learning_rate": 0.00018447106364554772, + "loss": 0.1261, + "step": 5385 + }, + { + "epoch": 0.38861430787546447, + "grad_norm": 0.1326235830783844, + "learning_rate": 0.00018446817722615096, + "loss": 0.1132, + "step": 5386 + }, + { + "epoch": 0.3886864605505249, + "grad_norm": 0.1248789057135582, + "learning_rate": 0.00018446529080675422, + "loss": 0.1652, + "step": 5387 + }, + { + "epoch": 0.38875861322558536, + "grad_norm": 0.11591673642396927, + "learning_rate": 0.00018446240438735749, + "loss": 0.1938, + "step": 5388 + }, + { + "epoch": 0.38883076590064575, + "grad_norm": 0.11416880041360855, + "learning_rate": 0.00018445951796796075, + "loss": 0.1469, + "step": 5389 + }, + { + "epoch": 0.3889029185757062, + "grad_norm": 0.10494326800107956, + "learning_rate": 0.000184456631548564, + "loss": 0.1151, + "step": 5390 + }, + { + "epoch": 0.38897507125076664, + "grad_norm": 0.11870312690734863, + "learning_rate": 0.00018445374512916727, + "loss": 0.1586, + "step": 5391 + }, + { + "epoch": 0.389047223925827, + "grad_norm": 0.11810563504695892, + "learning_rate": 0.00018445085870977054, + "loss": 0.1868, + "step": 5392 + }, + { + "epoch": 0.38911937660088747, + "grad_norm": 0.10838472098112106, + "learning_rate": 0.0001844479722903738, + "loss": 0.141, + "step": 5393 + }, + { + "epoch": 0.3891915292759479, + "grad_norm": 0.1309295892715454, + "learning_rate": 0.00018444508587097706, + "loss": 0.1446, + "step": 5394 + }, + { + "epoch": 0.38926368195100836, + "grad_norm": 0.13636811077594757, + "learning_rate": 0.00018444219945158033, + "loss": 0.1854, + "step": 5395 + }, + { + "epoch": 0.38933583462606874, + "grad_norm": 0.11778973042964935, + "learning_rate": 0.0001844393130321836, + "loss": 0.1048, + "step": 5396 + }, + { + "epoch": 0.3894079873011292, + "grad_norm": 0.11265967786312103, + "learning_rate": 0.00018443642661278685, + "loss": 0.1634, + "step": 5397 + }, + { + "epoch": 0.38948013997618963, + "grad_norm": 0.12080727517604828, + "learning_rate": 0.0001844335401933901, + "loss": 0.1749, + "step": 5398 + }, + { + "epoch": 0.38955229265125, + "grad_norm": 0.10691976547241211, + "learning_rate": 0.00018443065377399338, + "loss": 0.1443, + "step": 5399 + }, + { + "epoch": 0.38962444532631046, + "grad_norm": 0.11768995970487595, + "learning_rate": 0.00018442776735459664, + "loss": 0.1771, + "step": 5400 + }, + { + "epoch": 0.3896965980013709, + "grad_norm": 0.12330100685358047, + "learning_rate": 0.0001844248809351999, + "loss": 0.1728, + "step": 5401 + }, + { + "epoch": 0.38976875067643135, + "grad_norm": 0.10355494916439056, + "learning_rate": 0.00018442199451580317, + "loss": 0.1304, + "step": 5402 + }, + { + "epoch": 0.38984090335149174, + "grad_norm": 0.11235153675079346, + "learning_rate": 0.0001844191080964064, + "loss": 0.2203, + "step": 5403 + }, + { + "epoch": 0.3899130560265522, + "grad_norm": 0.11451764404773712, + "learning_rate": 0.00018441622167700967, + "loss": 0.162, + "step": 5404 + }, + { + "epoch": 0.38998520870161263, + "grad_norm": 0.11043937504291534, + "learning_rate": 0.00018441333525761293, + "loss": 0.1765, + "step": 5405 + }, + { + "epoch": 0.390057361376673, + "grad_norm": 0.10878059267997742, + "learning_rate": 0.00018441044883821622, + "loss": 0.186, + "step": 5406 + }, + { + "epoch": 0.39012951405173346, + "grad_norm": 0.10789388418197632, + "learning_rate": 0.00018440756241881948, + "loss": 0.1377, + "step": 5407 + }, + { + "epoch": 0.3902016667267939, + "grad_norm": 0.10264996439218521, + "learning_rate": 0.00018440467599942272, + "loss": 0.1387, + "step": 5408 + }, + { + "epoch": 0.39027381940185435, + "grad_norm": 0.0962095707654953, + "learning_rate": 0.00018440178958002598, + "loss": 0.1335, + "step": 5409 + }, + { + "epoch": 0.39034597207691474, + "grad_norm": 0.16185228526592255, + "learning_rate": 0.00018439890316062924, + "loss": 0.1424, + "step": 5410 + }, + { + "epoch": 0.3904181247519752, + "grad_norm": 0.10481355339288712, + "learning_rate": 0.0001843960167412325, + "loss": 0.1497, + "step": 5411 + }, + { + "epoch": 0.3904902774270356, + "grad_norm": 0.10153397917747498, + "learning_rate": 0.00018439313032183577, + "loss": 0.1463, + "step": 5412 + }, + { + "epoch": 0.390562430102096, + "grad_norm": 0.09573136270046234, + "learning_rate": 0.00018439024390243903, + "loss": 0.1296, + "step": 5413 + }, + { + "epoch": 0.39063458277715646, + "grad_norm": 0.09889469295740128, + "learning_rate": 0.0001843873574830423, + "loss": 0.1502, + "step": 5414 + }, + { + "epoch": 0.3907067354522169, + "grad_norm": 0.10378115624189377, + "learning_rate": 0.00018438447106364556, + "loss": 0.181, + "step": 5415 + }, + { + "epoch": 0.39077888812727735, + "grad_norm": 0.12253466248512268, + "learning_rate": 0.00018438158464424882, + "loss": 0.1107, + "step": 5416 + }, + { + "epoch": 0.39085104080233773, + "grad_norm": 0.10994445532560349, + "learning_rate": 0.00018437869822485208, + "loss": 0.1217, + "step": 5417 + }, + { + "epoch": 0.3909231934773982, + "grad_norm": 0.09956198185682297, + "learning_rate": 0.00018437581180545535, + "loss": 0.1925, + "step": 5418 + }, + { + "epoch": 0.3909953461524586, + "grad_norm": 0.10294509679079056, + "learning_rate": 0.00018437292538605858, + "loss": 0.1733, + "step": 5419 + }, + { + "epoch": 0.391067498827519, + "grad_norm": 0.1036074310541153, + "learning_rate": 0.00018437003896666187, + "loss": 0.1161, + "step": 5420 + }, + { + "epoch": 0.39113965150257946, + "grad_norm": 0.14222463965415955, + "learning_rate": 0.00018436715254726514, + "loss": 0.1602, + "step": 5421 + }, + { + "epoch": 0.3912118041776399, + "grad_norm": 0.13478884100914001, + "learning_rate": 0.0001843642661278684, + "loss": 0.1516, + "step": 5422 + }, + { + "epoch": 0.3912839568527003, + "grad_norm": 0.13049808144569397, + "learning_rate": 0.00018436137970847166, + "loss": 0.1812, + "step": 5423 + }, + { + "epoch": 0.39135610952776073, + "grad_norm": 0.13071030378341675, + "learning_rate": 0.0001843584932890749, + "loss": 0.1483, + "step": 5424 + }, + { + "epoch": 0.3914282622028212, + "grad_norm": 0.10672631114721298, + "learning_rate": 0.00018435560686967816, + "loss": 0.1074, + "step": 5425 + }, + { + "epoch": 0.3915004148778816, + "grad_norm": 0.11152239143848419, + "learning_rate": 0.00018435272045028142, + "loss": 0.1203, + "step": 5426 + }, + { + "epoch": 0.391572567552942, + "grad_norm": 0.10995141416788101, + "learning_rate": 0.0001843498340308847, + "loss": 0.1235, + "step": 5427 + }, + { + "epoch": 0.39164472022800245, + "grad_norm": 0.14090102910995483, + "learning_rate": 0.00018434694761148798, + "loss": 0.1559, + "step": 5428 + }, + { + "epoch": 0.3917168729030629, + "grad_norm": 0.10918077081441879, + "learning_rate": 0.0001843440611920912, + "loss": 0.1567, + "step": 5429 + }, + { + "epoch": 0.3917890255781233, + "grad_norm": 0.12291335314512253, + "learning_rate": 0.00018434117477269447, + "loss": 0.1851, + "step": 5430 + }, + { + "epoch": 0.39186117825318373, + "grad_norm": 0.10027393698692322, + "learning_rate": 0.00018433828835329774, + "loss": 0.1488, + "step": 5431 + }, + { + "epoch": 0.3919333309282442, + "grad_norm": 0.1360558718442917, + "learning_rate": 0.000184335401933901, + "loss": 0.1473, + "step": 5432 + }, + { + "epoch": 0.3920054836033046, + "grad_norm": 0.1145196408033371, + "learning_rate": 0.00018433251551450426, + "loss": 0.1662, + "step": 5433 + }, + { + "epoch": 0.392077636278365, + "grad_norm": 0.126168355345726, + "learning_rate": 0.00018432962909510753, + "loss": 0.1747, + "step": 5434 + }, + { + "epoch": 0.39214978895342545, + "grad_norm": 0.13097237050533295, + "learning_rate": 0.0001843267426757108, + "loss": 0.1333, + "step": 5435 + }, + { + "epoch": 0.3922219416284859, + "grad_norm": 0.11811990290880203, + "learning_rate": 0.00018432385625631405, + "loss": 0.2224, + "step": 5436 + }, + { + "epoch": 0.3922940943035463, + "grad_norm": 0.1210411936044693, + "learning_rate": 0.00018432096983691732, + "loss": 0.1718, + "step": 5437 + }, + { + "epoch": 0.3923662469786067, + "grad_norm": 0.12498074769973755, + "learning_rate": 0.00018431808341752058, + "loss": 0.1645, + "step": 5438 + }, + { + "epoch": 0.39243839965366717, + "grad_norm": 0.111318439245224, + "learning_rate": 0.00018431519699812384, + "loss": 0.1946, + "step": 5439 + }, + { + "epoch": 0.3925105523287276, + "grad_norm": 0.1372980922460556, + "learning_rate": 0.00018431231057872708, + "loss": 0.1617, + "step": 5440 + }, + { + "epoch": 0.392582705003788, + "grad_norm": 0.1149701476097107, + "learning_rate": 0.00018430942415933037, + "loss": 0.1452, + "step": 5441 + }, + { + "epoch": 0.39265485767884845, + "grad_norm": 0.12510797381401062, + "learning_rate": 0.00018430653773993363, + "loss": 0.124, + "step": 5442 + }, + { + "epoch": 0.3927270103539089, + "grad_norm": 0.11453134566545486, + "learning_rate": 0.0001843036513205369, + "loss": 0.1616, + "step": 5443 + }, + { + "epoch": 0.3927991630289693, + "grad_norm": 0.11869612336158752, + "learning_rate": 0.00018430076490114016, + "loss": 0.1863, + "step": 5444 + }, + { + "epoch": 0.3928713157040297, + "grad_norm": 0.13181565701961517, + "learning_rate": 0.0001842978784817434, + "loss": 0.1824, + "step": 5445 + }, + { + "epoch": 0.39294346837909017, + "grad_norm": 0.11108796298503876, + "learning_rate": 0.00018429499206234665, + "loss": 0.1569, + "step": 5446 + }, + { + "epoch": 0.3930156210541506, + "grad_norm": 0.11389564722776413, + "learning_rate": 0.00018429210564294992, + "loss": 0.1683, + "step": 5447 + }, + { + "epoch": 0.393087773729211, + "grad_norm": 0.10978323966264725, + "learning_rate": 0.0001842892192235532, + "loss": 0.1169, + "step": 5448 + }, + { + "epoch": 0.39315992640427144, + "grad_norm": 0.1294555515050888, + "learning_rate": 0.00018428633280415647, + "loss": 0.1374, + "step": 5449 + }, + { + "epoch": 0.3932320790793319, + "grad_norm": 0.12295661866664886, + "learning_rate": 0.0001842834463847597, + "loss": 0.1673, + "step": 5450 + }, + { + "epoch": 0.3933042317543923, + "grad_norm": 0.10065015405416489, + "learning_rate": 0.00018428055996536297, + "loss": 0.158, + "step": 5451 + }, + { + "epoch": 0.3933763844294527, + "grad_norm": 0.10121423006057739, + "learning_rate": 0.00018427767354596623, + "loss": 0.1636, + "step": 5452 + }, + { + "epoch": 0.39344853710451316, + "grad_norm": 0.09941934794187546, + "learning_rate": 0.0001842747871265695, + "loss": 0.1774, + "step": 5453 + }, + { + "epoch": 0.39352068977957355, + "grad_norm": 0.17981722950935364, + "learning_rate": 0.00018427190070717276, + "loss": 0.1931, + "step": 5454 + }, + { + "epoch": 0.393592842454634, + "grad_norm": 0.13403883576393127, + "learning_rate": 0.00018426901428777602, + "loss": 0.1749, + "step": 5455 + }, + { + "epoch": 0.39366499512969444, + "grad_norm": 0.11700470745563507, + "learning_rate": 0.00018426612786837928, + "loss": 0.1459, + "step": 5456 + }, + { + "epoch": 0.3937371478047549, + "grad_norm": 0.11984625458717346, + "learning_rate": 0.00018426324144898255, + "loss": 0.2311, + "step": 5457 + }, + { + "epoch": 0.39380930047981527, + "grad_norm": 0.12416800111532211, + "learning_rate": 0.0001842603550295858, + "loss": 0.1538, + "step": 5458 + }, + { + "epoch": 0.3938814531548757, + "grad_norm": 0.10681987553834915, + "learning_rate": 0.00018425746861018907, + "loss": 0.1577, + "step": 5459 + }, + { + "epoch": 0.39395360582993616, + "grad_norm": 0.13483217358589172, + "learning_rate": 0.00018425458219079234, + "loss": 0.1452, + "step": 5460 + }, + { + "epoch": 0.39402575850499655, + "grad_norm": 0.11891290545463562, + "learning_rate": 0.00018425169577139557, + "loss": 0.1492, + "step": 5461 + }, + { + "epoch": 0.394097911180057, + "grad_norm": 0.11302975565195084, + "learning_rate": 0.00018424880935199886, + "loss": 0.1237, + "step": 5462 + }, + { + "epoch": 0.39417006385511744, + "grad_norm": 0.09831945598125458, + "learning_rate": 0.00018424592293260212, + "loss": 0.1607, + "step": 5463 + }, + { + "epoch": 0.3942422165301779, + "grad_norm": 0.10420966893434525, + "learning_rate": 0.0001842430365132054, + "loss": 0.1171, + "step": 5464 + }, + { + "epoch": 0.39431436920523827, + "grad_norm": 0.138570636510849, + "learning_rate": 0.00018424015009380865, + "loss": 0.1551, + "step": 5465 + }, + { + "epoch": 0.3943865218802987, + "grad_norm": 0.11153330653905869, + "learning_rate": 0.00018423726367441189, + "loss": 0.183, + "step": 5466 + }, + { + "epoch": 0.39445867455535916, + "grad_norm": 0.13243813812732697, + "learning_rate": 0.00018423437725501515, + "loss": 0.1178, + "step": 5467 + }, + { + "epoch": 0.39453082723041955, + "grad_norm": 0.10459816455841064, + "learning_rate": 0.0001842314908356184, + "loss": 0.1906, + "step": 5468 + }, + { + "epoch": 0.39460297990548, + "grad_norm": 0.12012185156345367, + "learning_rate": 0.0001842286044162217, + "loss": 0.1403, + "step": 5469 + }, + { + "epoch": 0.39467513258054043, + "grad_norm": 0.1246560662984848, + "learning_rate": 0.00018422571799682496, + "loss": 0.214, + "step": 5470 + }, + { + "epoch": 0.3947472852556009, + "grad_norm": 0.14620047807693481, + "learning_rate": 0.0001842228315774282, + "loss": 0.1475, + "step": 5471 + }, + { + "epoch": 0.39481943793066127, + "grad_norm": 0.13080306351184845, + "learning_rate": 0.00018421994515803146, + "loss": 0.1292, + "step": 5472 + }, + { + "epoch": 0.3948915906057217, + "grad_norm": 0.11931418627500534, + "learning_rate": 0.00018421705873863473, + "loss": 0.2049, + "step": 5473 + }, + { + "epoch": 0.39496374328078215, + "grad_norm": 0.1012091189622879, + "learning_rate": 0.000184214172319238, + "loss": 0.166, + "step": 5474 + }, + { + "epoch": 0.39503589595584254, + "grad_norm": 0.16803866624832153, + "learning_rate": 0.00018421128589984125, + "loss": 0.1538, + "step": 5475 + }, + { + "epoch": 0.395108048630903, + "grad_norm": 0.12577006220817566, + "learning_rate": 0.00018420839948044451, + "loss": 0.172, + "step": 5476 + }, + { + "epoch": 0.39518020130596343, + "grad_norm": 0.13829030096530914, + "learning_rate": 0.00018420551306104778, + "loss": 0.1718, + "step": 5477 + }, + { + "epoch": 0.3952523539810239, + "grad_norm": 0.13298436999320984, + "learning_rate": 0.00018420262664165104, + "loss": 0.1623, + "step": 5478 + }, + { + "epoch": 0.39532450665608426, + "grad_norm": 0.10100710391998291, + "learning_rate": 0.0001841997402222543, + "loss": 0.1546, + "step": 5479 + }, + { + "epoch": 0.3953966593311447, + "grad_norm": 0.12674416601657867, + "learning_rate": 0.00018419685380285757, + "loss": 0.1538, + "step": 5480 + }, + { + "epoch": 0.39546881200620515, + "grad_norm": 0.10350482910871506, + "learning_rate": 0.00018419396738346083, + "loss": 0.1574, + "step": 5481 + }, + { + "epoch": 0.39554096468126554, + "grad_norm": 0.12238260358572006, + "learning_rate": 0.00018419108096406407, + "loss": 0.178, + "step": 5482 + }, + { + "epoch": 0.395613117356326, + "grad_norm": 0.09578686207532883, + "learning_rate": 0.00018418819454466736, + "loss": 0.1866, + "step": 5483 + }, + { + "epoch": 0.3956852700313864, + "grad_norm": 0.1295008659362793, + "learning_rate": 0.00018418530812527062, + "loss": 0.1224, + "step": 5484 + }, + { + "epoch": 0.3957574227064468, + "grad_norm": 0.1356228142976761, + "learning_rate": 0.00018418242170587388, + "loss": 0.156, + "step": 5485 + }, + { + "epoch": 0.39582957538150726, + "grad_norm": 0.13476917147636414, + "learning_rate": 0.00018417953528647714, + "loss": 0.1875, + "step": 5486 + }, + { + "epoch": 0.3959017280565677, + "grad_norm": 0.12495545297861099, + "learning_rate": 0.00018417664886708038, + "loss": 0.1427, + "step": 5487 + }, + { + "epoch": 0.39597388073162815, + "grad_norm": 0.11760874092578888, + "learning_rate": 0.00018417376244768364, + "loss": 0.1597, + "step": 5488 + }, + { + "epoch": 0.39604603340668854, + "grad_norm": 0.1176750659942627, + "learning_rate": 0.0001841708760282869, + "loss": 0.1463, + "step": 5489 + }, + { + "epoch": 0.396118186081749, + "grad_norm": 0.10409785062074661, + "learning_rate": 0.0001841679896088902, + "loss": 0.1786, + "step": 5490 + }, + { + "epoch": 0.3961903387568094, + "grad_norm": 0.11557011306285858, + "learning_rate": 0.00018416510318949346, + "loss": 0.1325, + "step": 5491 + }, + { + "epoch": 0.3962624914318698, + "grad_norm": 0.14132000505924225, + "learning_rate": 0.0001841622167700967, + "loss": 0.1814, + "step": 5492 + }, + { + "epoch": 0.39633464410693026, + "grad_norm": 0.09684260189533234, + "learning_rate": 0.00018415933035069996, + "loss": 0.224, + "step": 5493 + }, + { + "epoch": 0.3964067967819907, + "grad_norm": 0.10463616997003555, + "learning_rate": 0.00018415644393130322, + "loss": 0.1462, + "step": 5494 + }, + { + "epoch": 0.39647894945705114, + "grad_norm": 0.11369778960943222, + "learning_rate": 0.00018415355751190648, + "loss": 0.1953, + "step": 5495 + }, + { + "epoch": 0.39655110213211153, + "grad_norm": 0.11679672449827194, + "learning_rate": 0.00018415067109250975, + "loss": 0.1567, + "step": 5496 + }, + { + "epoch": 0.396623254807172, + "grad_norm": 0.11416541039943695, + "learning_rate": 0.000184147784673113, + "loss": 0.1684, + "step": 5497 + }, + { + "epoch": 0.3966954074822324, + "grad_norm": 0.1330970972776413, + "learning_rate": 0.00018414489825371627, + "loss": 0.1633, + "step": 5498 + }, + { + "epoch": 0.3967675601572928, + "grad_norm": 0.13111475110054016, + "learning_rate": 0.00018414201183431953, + "loss": 0.1709, + "step": 5499 + }, + { + "epoch": 0.39683971283235325, + "grad_norm": 0.1295691430568695, + "learning_rate": 0.0001841391254149228, + "loss": 0.1488, + "step": 5500 + }, + { + "epoch": 0.3969118655074137, + "grad_norm": 0.10770321637392044, + "learning_rate": 0.00018413623899552606, + "loss": 0.1953, + "step": 5501 + }, + { + "epoch": 0.39698401818247414, + "grad_norm": 0.15028440952301025, + "learning_rate": 0.00018413335257612932, + "loss": 0.1781, + "step": 5502 + }, + { + "epoch": 0.39705617085753453, + "grad_norm": 0.09639762341976166, + "learning_rate": 0.00018413046615673256, + "loss": 0.1141, + "step": 5503 + }, + { + "epoch": 0.397128323532595, + "grad_norm": 0.14751236140727997, + "learning_rate": 0.00018412757973733585, + "loss": 0.1746, + "step": 5504 + }, + { + "epoch": 0.3972004762076554, + "grad_norm": 0.13057030737400055, + "learning_rate": 0.0001841246933179391, + "loss": 0.1534, + "step": 5505 + }, + { + "epoch": 0.3972726288827158, + "grad_norm": 0.11408231407403946, + "learning_rate": 0.00018412180689854238, + "loss": 0.2179, + "step": 5506 + }, + { + "epoch": 0.39734478155777625, + "grad_norm": 0.18876004219055176, + "learning_rate": 0.00018411892047914564, + "loss": 0.1353, + "step": 5507 + }, + { + "epoch": 0.3974169342328367, + "grad_norm": 0.11818505823612213, + "learning_rate": 0.00018411603405974887, + "loss": 0.1653, + "step": 5508 + }, + { + "epoch": 0.39748908690789714, + "grad_norm": 0.1076313853263855, + "learning_rate": 0.00018411314764035214, + "loss": 0.1502, + "step": 5509 + }, + { + "epoch": 0.3975612395829575, + "grad_norm": 0.09401322156190872, + "learning_rate": 0.0001841102612209554, + "loss": 0.197, + "step": 5510 + }, + { + "epoch": 0.39763339225801797, + "grad_norm": 0.10382793098688126, + "learning_rate": 0.0001841073748015587, + "loss": 0.1718, + "step": 5511 + }, + { + "epoch": 0.3977055449330784, + "grad_norm": 0.13243991136550903, + "learning_rate": 0.00018410448838216195, + "loss": 0.1712, + "step": 5512 + }, + { + "epoch": 0.3977776976081388, + "grad_norm": 0.13869154453277588, + "learning_rate": 0.0001841016019627652, + "loss": 0.2437, + "step": 5513 + }, + { + "epoch": 0.39784985028319925, + "grad_norm": 0.13590875267982483, + "learning_rate": 0.00018409871554336845, + "loss": 0.1687, + "step": 5514 + }, + { + "epoch": 0.3979220029582597, + "grad_norm": 0.27010488510131836, + "learning_rate": 0.00018409582912397171, + "loss": 0.1801, + "step": 5515 + }, + { + "epoch": 0.3979941556333201, + "grad_norm": 0.11247264593839645, + "learning_rate": 0.00018409294270457498, + "loss": 0.1467, + "step": 5516 + }, + { + "epoch": 0.3980663083083805, + "grad_norm": 0.1487649828195572, + "learning_rate": 0.00018409005628517824, + "loss": 0.177, + "step": 5517 + }, + { + "epoch": 0.39813846098344097, + "grad_norm": 0.13606010377407074, + "learning_rate": 0.00018408716986578153, + "loss": 0.1343, + "step": 5518 + }, + { + "epoch": 0.3982106136585014, + "grad_norm": 0.11267613619565964, + "learning_rate": 0.00018408428344638477, + "loss": 0.1745, + "step": 5519 + }, + { + "epoch": 0.3982827663335618, + "grad_norm": 0.15224646031856537, + "learning_rate": 0.00018408139702698803, + "loss": 0.1924, + "step": 5520 + }, + { + "epoch": 0.39835491900862224, + "grad_norm": 0.13173869252204895, + "learning_rate": 0.0001840785106075913, + "loss": 0.1866, + "step": 5521 + }, + { + "epoch": 0.3984270716836827, + "grad_norm": 0.127375528216362, + "learning_rate": 0.00018407562418819455, + "loss": 0.147, + "step": 5522 + }, + { + "epoch": 0.3984992243587431, + "grad_norm": 0.10252101719379425, + "learning_rate": 0.00018407273776879782, + "loss": 0.1101, + "step": 5523 + }, + { + "epoch": 0.3985713770338035, + "grad_norm": 0.09786207973957062, + "learning_rate": 0.00018406985134940108, + "loss": 0.1511, + "step": 5524 + }, + { + "epoch": 0.39864352970886396, + "grad_norm": 0.15317873656749725, + "learning_rate": 0.00018406696493000434, + "loss": 0.1469, + "step": 5525 + }, + { + "epoch": 0.3987156823839244, + "grad_norm": 0.12719370424747467, + "learning_rate": 0.0001840640785106076, + "loss": 0.1466, + "step": 5526 + }, + { + "epoch": 0.3987878350589848, + "grad_norm": 0.12291129678487778, + "learning_rate": 0.00018406119209121087, + "loss": 0.1089, + "step": 5527 + }, + { + "epoch": 0.39885998773404524, + "grad_norm": 0.13241510093212128, + "learning_rate": 0.00018405830567181413, + "loss": 0.1651, + "step": 5528 + }, + { + "epoch": 0.3989321404091057, + "grad_norm": 0.14325828850269318, + "learning_rate": 0.0001840554192524174, + "loss": 0.2134, + "step": 5529 + }, + { + "epoch": 0.3990042930841661, + "grad_norm": 0.11104754358530045, + "learning_rate": 0.00018405253283302063, + "loss": 0.1667, + "step": 5530 + }, + { + "epoch": 0.3990764457592265, + "grad_norm": 0.13289597630500793, + "learning_rate": 0.0001840496464136239, + "loss": 0.1048, + "step": 5531 + }, + { + "epoch": 0.39914859843428696, + "grad_norm": 0.14496006071567535, + "learning_rate": 0.00018404675999422718, + "loss": 0.154, + "step": 5532 + }, + { + "epoch": 0.3992207511093474, + "grad_norm": 0.14237834513187408, + "learning_rate": 0.00018404387357483045, + "loss": 0.1515, + "step": 5533 + }, + { + "epoch": 0.3992929037844078, + "grad_norm": 0.11830680817365646, + "learning_rate": 0.0001840409871554337, + "loss": 0.185, + "step": 5534 + }, + { + "epoch": 0.39936505645946824, + "grad_norm": 0.11759422719478607, + "learning_rate": 0.00018403810073603695, + "loss": 0.1492, + "step": 5535 + }, + { + "epoch": 0.3994372091345287, + "grad_norm": 0.11616528779268265, + "learning_rate": 0.0001840352143166402, + "loss": 0.1328, + "step": 5536 + }, + { + "epoch": 0.39950936180958907, + "grad_norm": 0.12721532583236694, + "learning_rate": 0.00018403232789724347, + "loss": 0.1896, + "step": 5537 + }, + { + "epoch": 0.3995815144846495, + "grad_norm": 0.12530900537967682, + "learning_rate": 0.00018402944147784673, + "loss": 0.1531, + "step": 5538 + }, + { + "epoch": 0.39965366715970996, + "grad_norm": 0.12472952157258987, + "learning_rate": 0.00018402655505845002, + "loss": 0.1626, + "step": 5539 + }, + { + "epoch": 0.3997258198347704, + "grad_norm": 0.09787862002849579, + "learning_rate": 0.00018402366863905326, + "loss": 0.1518, + "step": 5540 + }, + { + "epoch": 0.3997979725098308, + "grad_norm": 0.10951712727546692, + "learning_rate": 0.00018402078221965652, + "loss": 0.1368, + "step": 5541 + }, + { + "epoch": 0.39987012518489123, + "grad_norm": 0.15396511554718018, + "learning_rate": 0.00018401789580025979, + "loss": 0.1458, + "step": 5542 + }, + { + "epoch": 0.3999422778599517, + "grad_norm": 0.12943795323371887, + "learning_rate": 0.00018401500938086305, + "loss": 0.1693, + "step": 5543 + }, + { + "epoch": 0.40001443053501207, + "grad_norm": 0.1799429953098297, + "learning_rate": 0.0001840121229614663, + "loss": 0.1878, + "step": 5544 + }, + { + "epoch": 0.4000865832100725, + "grad_norm": 0.17684325575828552, + "learning_rate": 0.00018400923654206958, + "loss": 0.2223, + "step": 5545 + }, + { + "epoch": 0.40015873588513295, + "grad_norm": 0.11476756632328033, + "learning_rate": 0.00018400635012267284, + "loss": 0.1512, + "step": 5546 + }, + { + "epoch": 0.40023088856019334, + "grad_norm": 0.18006323277950287, + "learning_rate": 0.0001840034637032761, + "loss": 0.1732, + "step": 5547 + }, + { + "epoch": 0.4003030412352538, + "grad_norm": 0.13941843807697296, + "learning_rate": 0.00018400057728387936, + "loss": 0.1466, + "step": 5548 + }, + { + "epoch": 0.40037519391031423, + "grad_norm": 0.11408531665802002, + "learning_rate": 0.00018399769086448263, + "loss": 0.1456, + "step": 5549 + }, + { + "epoch": 0.4004473465853747, + "grad_norm": 0.12096792459487915, + "learning_rate": 0.0001839948044450859, + "loss": 0.171, + "step": 5550 + }, + { + "epoch": 0.40051949926043506, + "grad_norm": 0.11903735250234604, + "learning_rate": 0.00018399191802568913, + "loss": 0.1409, + "step": 5551 + }, + { + "epoch": 0.4005916519354955, + "grad_norm": 0.12109566479921341, + "learning_rate": 0.0001839890316062924, + "loss": 0.189, + "step": 5552 + }, + { + "epoch": 0.40066380461055595, + "grad_norm": 0.13302576541900635, + "learning_rate": 0.00018398614518689568, + "loss": 0.1552, + "step": 5553 + }, + { + "epoch": 0.40073595728561634, + "grad_norm": 0.10768107324838638, + "learning_rate": 0.00018398325876749894, + "loss": 0.1365, + "step": 5554 + }, + { + "epoch": 0.4008081099606768, + "grad_norm": 0.10121867805719376, + "learning_rate": 0.0001839803723481022, + "loss": 0.1893, + "step": 5555 + }, + { + "epoch": 0.40088026263573723, + "grad_norm": 0.09464319050312042, + "learning_rate": 0.00018397748592870544, + "loss": 0.1361, + "step": 5556 + }, + { + "epoch": 0.40095241531079767, + "grad_norm": 0.14916472136974335, + "learning_rate": 0.0001839745995093087, + "loss": 0.1604, + "step": 5557 + }, + { + "epoch": 0.40102456798585806, + "grad_norm": 0.11772575974464417, + "learning_rate": 0.00018397171308991197, + "loss": 0.1463, + "step": 5558 + }, + { + "epoch": 0.4010967206609185, + "grad_norm": 0.11101650446653366, + "learning_rate": 0.00018396882667051523, + "loss": 0.163, + "step": 5559 + }, + { + "epoch": 0.40116887333597895, + "grad_norm": 0.12537023425102234, + "learning_rate": 0.00018396594025111852, + "loss": 0.1489, + "step": 5560 + }, + { + "epoch": 0.40124102601103934, + "grad_norm": 0.09984667599201202, + "learning_rate": 0.00018396305383172175, + "loss": 0.1013, + "step": 5561 + }, + { + "epoch": 0.4013131786860998, + "grad_norm": 0.09811095148324966, + "learning_rate": 0.00018396016741232502, + "loss": 0.1849, + "step": 5562 + }, + { + "epoch": 0.4013853313611602, + "grad_norm": 0.14147751033306122, + "learning_rate": 0.00018395728099292828, + "loss": 0.0963, + "step": 5563 + }, + { + "epoch": 0.40145748403622067, + "grad_norm": 0.12936724722385406, + "learning_rate": 0.00018395439457353154, + "loss": 0.1845, + "step": 5564 + }, + { + "epoch": 0.40152963671128106, + "grad_norm": 0.10370690375566483, + "learning_rate": 0.0001839515081541348, + "loss": 0.1504, + "step": 5565 + }, + { + "epoch": 0.4016017893863415, + "grad_norm": 0.14071407914161682, + "learning_rate": 0.00018394862173473807, + "loss": 0.1893, + "step": 5566 + }, + { + "epoch": 0.40167394206140195, + "grad_norm": 0.1003267839550972, + "learning_rate": 0.00018394573531534133, + "loss": 0.1858, + "step": 5567 + }, + { + "epoch": 0.40174609473646233, + "grad_norm": 0.16829344630241394, + "learning_rate": 0.0001839428488959446, + "loss": 0.16, + "step": 5568 + }, + { + "epoch": 0.4018182474115228, + "grad_norm": 0.10380838811397552, + "learning_rate": 0.00018393996247654786, + "loss": 0.1476, + "step": 5569 + }, + { + "epoch": 0.4018904000865832, + "grad_norm": 0.11744909733533859, + "learning_rate": 0.00018393707605715112, + "loss": 0.1264, + "step": 5570 + }, + { + "epoch": 0.40196255276164367, + "grad_norm": 0.11899411678314209, + "learning_rate": 0.00018393418963775438, + "loss": 0.2101, + "step": 5571 + }, + { + "epoch": 0.40203470543670405, + "grad_norm": 0.10737913846969604, + "learning_rate": 0.00018393130321835762, + "loss": 0.2118, + "step": 5572 + }, + { + "epoch": 0.4021068581117645, + "grad_norm": 0.11953913420438766, + "learning_rate": 0.00018392841679896088, + "loss": 0.1424, + "step": 5573 + }, + { + "epoch": 0.40217901078682494, + "grad_norm": 0.1752198338508606, + "learning_rate": 0.00018392553037956417, + "loss": 0.1891, + "step": 5574 + }, + { + "epoch": 0.40225116346188533, + "grad_norm": 0.14559204876422882, + "learning_rate": 0.00018392264396016744, + "loss": 0.1421, + "step": 5575 + }, + { + "epoch": 0.4023233161369458, + "grad_norm": 0.0974157378077507, + "learning_rate": 0.0001839197575407707, + "loss": 0.1336, + "step": 5576 + }, + { + "epoch": 0.4023954688120062, + "grad_norm": 0.1417972892522812, + "learning_rate": 0.00018391687112137393, + "loss": 0.1208, + "step": 5577 + }, + { + "epoch": 0.4024676214870666, + "grad_norm": 0.10055826604366302, + "learning_rate": 0.0001839139847019772, + "loss": 0.1519, + "step": 5578 + }, + { + "epoch": 0.40253977416212705, + "grad_norm": 0.09204552322626114, + "learning_rate": 0.00018391109828258046, + "loss": 0.1635, + "step": 5579 + }, + { + "epoch": 0.4026119268371875, + "grad_norm": 0.10245250165462494, + "learning_rate": 0.00018390821186318372, + "loss": 0.1937, + "step": 5580 + }, + { + "epoch": 0.40268407951224794, + "grad_norm": 0.1685457080602646, + "learning_rate": 0.000183905325443787, + "loss": 0.2262, + "step": 5581 + }, + { + "epoch": 0.4027562321873083, + "grad_norm": 0.167040154337883, + "learning_rate": 0.00018390243902439025, + "loss": 0.2172, + "step": 5582 + }, + { + "epoch": 0.40282838486236877, + "grad_norm": 0.21358591318130493, + "learning_rate": 0.0001838995526049935, + "loss": 0.1522, + "step": 5583 + }, + { + "epoch": 0.4029005375374292, + "grad_norm": 0.10421575605869293, + "learning_rate": 0.00018389666618559677, + "loss": 0.1249, + "step": 5584 + }, + { + "epoch": 0.4029726902124896, + "grad_norm": 0.1081615537405014, + "learning_rate": 0.00018389377976620004, + "loss": 0.1344, + "step": 5585 + }, + { + "epoch": 0.40304484288755005, + "grad_norm": 0.1218922883272171, + "learning_rate": 0.0001838908933468033, + "loss": 0.1531, + "step": 5586 + }, + { + "epoch": 0.4031169955626105, + "grad_norm": 0.14666524529457092, + "learning_rate": 0.00018388800692740656, + "loss": 0.126, + "step": 5587 + }, + { + "epoch": 0.40318914823767094, + "grad_norm": 0.08872871100902557, + "learning_rate": 0.0001838851205080098, + "loss": 0.1376, + "step": 5588 + }, + { + "epoch": 0.4032613009127313, + "grad_norm": 0.10605582594871521, + "learning_rate": 0.0001838822340886131, + "loss": 0.1679, + "step": 5589 + }, + { + "epoch": 0.40333345358779177, + "grad_norm": 0.09796331822872162, + "learning_rate": 0.00018387934766921635, + "loss": 0.1382, + "step": 5590 + }, + { + "epoch": 0.4034056062628522, + "grad_norm": 0.11313818395137787, + "learning_rate": 0.00018387646124981962, + "loss": 0.1829, + "step": 5591 + }, + { + "epoch": 0.4034777589379126, + "grad_norm": 0.1263338327407837, + "learning_rate": 0.00018387357483042288, + "loss": 0.1642, + "step": 5592 + }, + { + "epoch": 0.40354991161297304, + "grad_norm": 0.14448325335979462, + "learning_rate": 0.00018387068841102611, + "loss": 0.1713, + "step": 5593 + }, + { + "epoch": 0.4036220642880335, + "grad_norm": 0.11438018083572388, + "learning_rate": 0.00018386780199162938, + "loss": 0.147, + "step": 5594 + }, + { + "epoch": 0.40369421696309393, + "grad_norm": 0.13415935635566711, + "learning_rate": 0.00018386491557223264, + "loss": 0.1614, + "step": 5595 + }, + { + "epoch": 0.4037663696381543, + "grad_norm": 0.14307013154029846, + "learning_rate": 0.00018386202915283593, + "loss": 0.1311, + "step": 5596 + }, + { + "epoch": 0.40383852231321477, + "grad_norm": 0.10783229023218155, + "learning_rate": 0.0001838591427334392, + "loss": 0.148, + "step": 5597 + }, + { + "epoch": 0.4039106749882752, + "grad_norm": 0.11876362562179565, + "learning_rate": 0.00018385625631404243, + "loss": 0.1107, + "step": 5598 + }, + { + "epoch": 0.4039828276633356, + "grad_norm": 0.12312009185552597, + "learning_rate": 0.0001838533698946457, + "loss": 0.1633, + "step": 5599 + }, + { + "epoch": 0.40405498033839604, + "grad_norm": 0.10900390893220901, + "learning_rate": 0.00018385048347524895, + "loss": 0.136, + "step": 5600 + }, + { + "epoch": 0.4041271330134565, + "grad_norm": 0.13774895668029785, + "learning_rate": 0.00018384759705585222, + "loss": 0.1689, + "step": 5601 + }, + { + "epoch": 0.40419928568851693, + "grad_norm": 0.11681775748729706, + "learning_rate": 0.00018384471063645548, + "loss": 0.1376, + "step": 5602 + }, + { + "epoch": 0.4042714383635773, + "grad_norm": 0.13280463218688965, + "learning_rate": 0.00018384182421705874, + "loss": 0.1585, + "step": 5603 + }, + { + "epoch": 0.40434359103863776, + "grad_norm": 0.12719693779945374, + "learning_rate": 0.000183838937797662, + "loss": 0.1547, + "step": 5604 + }, + { + "epoch": 0.4044157437136982, + "grad_norm": 0.12950772047042847, + "learning_rate": 0.00018383605137826527, + "loss": 0.1568, + "step": 5605 + }, + { + "epoch": 0.4044878963887586, + "grad_norm": 0.1172901839017868, + "learning_rate": 0.00018383316495886853, + "loss": 0.156, + "step": 5606 + }, + { + "epoch": 0.40456004906381904, + "grad_norm": 0.12445659935474396, + "learning_rate": 0.0001838302785394718, + "loss": 0.1376, + "step": 5607 + }, + { + "epoch": 0.4046322017388795, + "grad_norm": 0.09501469880342484, + "learning_rate": 0.00018382739212007506, + "loss": 0.1283, + "step": 5608 + }, + { + "epoch": 0.40470435441393987, + "grad_norm": 0.09757381677627563, + "learning_rate": 0.0001838245057006783, + "loss": 0.1238, + "step": 5609 + }, + { + "epoch": 0.4047765070890003, + "grad_norm": 0.11157125234603882, + "learning_rate": 0.00018382161928128158, + "loss": 0.1144, + "step": 5610 + }, + { + "epoch": 0.40484865976406076, + "grad_norm": 0.130250945687294, + "learning_rate": 0.00018381873286188485, + "loss": 0.2204, + "step": 5611 + }, + { + "epoch": 0.4049208124391212, + "grad_norm": 0.12943124771118164, + "learning_rate": 0.0001838158464424881, + "loss": 0.1603, + "step": 5612 + }, + { + "epoch": 0.4049929651141816, + "grad_norm": 0.13775520026683807, + "learning_rate": 0.00018381296002309137, + "loss": 0.1831, + "step": 5613 + }, + { + "epoch": 0.40506511778924204, + "grad_norm": 0.12541238963603973, + "learning_rate": 0.0001838100736036946, + "loss": 0.1569, + "step": 5614 + }, + { + "epoch": 0.4051372704643025, + "grad_norm": 0.13825315237045288, + "learning_rate": 0.00018380718718429787, + "loss": 0.1495, + "step": 5615 + }, + { + "epoch": 0.40520942313936287, + "grad_norm": 0.10173406451940536, + "learning_rate": 0.00018380430076490113, + "loss": 0.1119, + "step": 5616 + }, + { + "epoch": 0.4052815758144233, + "grad_norm": 0.1514287143945694, + "learning_rate": 0.00018380141434550442, + "loss": 0.1496, + "step": 5617 + }, + { + "epoch": 0.40535372848948376, + "grad_norm": 0.1010914295911789, + "learning_rate": 0.0001837985279261077, + "loss": 0.0856, + "step": 5618 + }, + { + "epoch": 0.4054258811645442, + "grad_norm": 0.14536389708518982, + "learning_rate": 0.00018379564150671092, + "loss": 0.2136, + "step": 5619 + }, + { + "epoch": 0.4054980338396046, + "grad_norm": 0.12175793945789337, + "learning_rate": 0.00018379275508731419, + "loss": 0.1596, + "step": 5620 + }, + { + "epoch": 0.40557018651466503, + "grad_norm": 0.1434437483549118, + "learning_rate": 0.00018378986866791745, + "loss": 0.1478, + "step": 5621 + }, + { + "epoch": 0.4056423391897255, + "grad_norm": 0.1122213676571846, + "learning_rate": 0.0001837869822485207, + "loss": 0.1568, + "step": 5622 + }, + { + "epoch": 0.40571449186478586, + "grad_norm": 0.10404715687036514, + "learning_rate": 0.00018378409582912397, + "loss": 0.1678, + "step": 5623 + }, + { + "epoch": 0.4057866445398463, + "grad_norm": 0.11233187466859818, + "learning_rate": 0.00018378120940972724, + "loss": 0.1707, + "step": 5624 + }, + { + "epoch": 0.40585879721490675, + "grad_norm": 0.10880354046821594, + "learning_rate": 0.0001837783229903305, + "loss": 0.1443, + "step": 5625 + }, + { + "epoch": 0.4059309498899672, + "grad_norm": 0.1127571314573288, + "learning_rate": 0.00018377543657093376, + "loss": 0.2127, + "step": 5626 + }, + { + "epoch": 0.4060031025650276, + "grad_norm": 0.11678310483694077, + "learning_rate": 0.00018377255015153703, + "loss": 0.1568, + "step": 5627 + }, + { + "epoch": 0.40607525524008803, + "grad_norm": 0.11706399917602539, + "learning_rate": 0.0001837696637321403, + "loss": 0.1637, + "step": 5628 + }, + { + "epoch": 0.4061474079151485, + "grad_norm": 0.10593684762716293, + "learning_rate": 0.00018376677731274355, + "loss": 0.1642, + "step": 5629 + }, + { + "epoch": 0.40621956059020886, + "grad_norm": 0.14842522144317627, + "learning_rate": 0.00018376389089334681, + "loss": 0.1577, + "step": 5630 + }, + { + "epoch": 0.4062917132652693, + "grad_norm": 0.11014879494905472, + "learning_rate": 0.00018376100447395008, + "loss": 0.1497, + "step": 5631 + }, + { + "epoch": 0.40636386594032975, + "grad_norm": 0.10869240015745163, + "learning_rate": 0.00018375811805455334, + "loss": 0.1774, + "step": 5632 + }, + { + "epoch": 0.4064360186153902, + "grad_norm": 0.12231890857219696, + "learning_rate": 0.0001837552316351566, + "loss": 0.14, + "step": 5633 + }, + { + "epoch": 0.4065081712904506, + "grad_norm": 0.11752889305353165, + "learning_rate": 0.00018375234521575987, + "loss": 0.1522, + "step": 5634 + }, + { + "epoch": 0.406580323965511, + "grad_norm": 0.12622301280498505, + "learning_rate": 0.00018374945879636313, + "loss": 0.1349, + "step": 5635 + }, + { + "epoch": 0.40665247664057147, + "grad_norm": 0.12093006074428558, + "learning_rate": 0.00018374657237696637, + "loss": 0.1645, + "step": 5636 + }, + { + "epoch": 0.40672462931563186, + "grad_norm": 0.10946014523506165, + "learning_rate": 0.00018374368595756963, + "loss": 0.179, + "step": 5637 + }, + { + "epoch": 0.4067967819906923, + "grad_norm": 0.105443075299263, + "learning_rate": 0.00018374079953817292, + "loss": 0.1336, + "step": 5638 + }, + { + "epoch": 0.40686893466575275, + "grad_norm": 0.1109885573387146, + "learning_rate": 0.00018373791311877618, + "loss": 0.1291, + "step": 5639 + }, + { + "epoch": 0.40694108734081313, + "grad_norm": 0.10615464299917221, + "learning_rate": 0.00018373502669937944, + "loss": 0.1363, + "step": 5640 + }, + { + "epoch": 0.4070132400158736, + "grad_norm": 0.12033508718013763, + "learning_rate": 0.00018373214027998268, + "loss": 0.1899, + "step": 5641 + }, + { + "epoch": 0.407085392690934, + "grad_norm": 0.1122654527425766, + "learning_rate": 0.00018372925386058594, + "loss": 0.1282, + "step": 5642 + }, + { + "epoch": 0.40715754536599447, + "grad_norm": 0.1292010247707367, + "learning_rate": 0.0001837263674411892, + "loss": 0.1536, + "step": 5643 + }, + { + "epoch": 0.40722969804105486, + "grad_norm": 0.11770255118608475, + "learning_rate": 0.00018372348102179247, + "loss": 0.1315, + "step": 5644 + }, + { + "epoch": 0.4073018507161153, + "grad_norm": 0.13030600547790527, + "learning_rate": 0.00018372059460239576, + "loss": 0.1585, + "step": 5645 + }, + { + "epoch": 0.40737400339117574, + "grad_norm": 0.10636156797409058, + "learning_rate": 0.000183717708182999, + "loss": 0.1644, + "step": 5646 + }, + { + "epoch": 0.40744615606623613, + "grad_norm": 0.10021941363811493, + "learning_rate": 0.00018371482176360226, + "loss": 0.1305, + "step": 5647 + }, + { + "epoch": 0.4075183087412966, + "grad_norm": 0.10358332842588425, + "learning_rate": 0.00018371193534420552, + "loss": 0.1459, + "step": 5648 + }, + { + "epoch": 0.407590461416357, + "grad_norm": 0.13000337779521942, + "learning_rate": 0.00018370904892480878, + "loss": 0.1457, + "step": 5649 + }, + { + "epoch": 0.40766261409141746, + "grad_norm": 0.15379568934440613, + "learning_rate": 0.00018370616250541205, + "loss": 0.1913, + "step": 5650 + }, + { + "epoch": 0.40773476676647785, + "grad_norm": 0.12629206478595734, + "learning_rate": 0.0001837032760860153, + "loss": 0.1264, + "step": 5651 + }, + { + "epoch": 0.4078069194415383, + "grad_norm": 0.1013764888048172, + "learning_rate": 0.00018370038966661857, + "loss": 0.1241, + "step": 5652 + }, + { + "epoch": 0.40787907211659874, + "grad_norm": 0.1186005175113678, + "learning_rate": 0.00018369750324722184, + "loss": 0.1681, + "step": 5653 + }, + { + "epoch": 0.40795122479165913, + "grad_norm": 0.12923049926757812, + "learning_rate": 0.0001836946168278251, + "loss": 0.1826, + "step": 5654 + }, + { + "epoch": 0.4080233774667196, + "grad_norm": 0.11416444182395935, + "learning_rate": 0.00018369173040842836, + "loss": 0.1738, + "step": 5655 + }, + { + "epoch": 0.40809553014178, + "grad_norm": 0.09962332248687744, + "learning_rate": 0.00018368884398903162, + "loss": 0.1549, + "step": 5656 + }, + { + "epoch": 0.40816768281684046, + "grad_norm": 0.1386914998292923, + "learning_rate": 0.00018368595756963486, + "loss": 0.1769, + "step": 5657 + }, + { + "epoch": 0.40823983549190085, + "grad_norm": 0.12415996193885803, + "learning_rate": 0.00018368307115023812, + "loss": 0.1594, + "step": 5658 + }, + { + "epoch": 0.4083119881669613, + "grad_norm": 0.1531461775302887, + "learning_rate": 0.0001836801847308414, + "loss": 0.1311, + "step": 5659 + }, + { + "epoch": 0.40838414084202174, + "grad_norm": 0.11212441325187683, + "learning_rate": 0.00018367729831144468, + "loss": 0.1853, + "step": 5660 + }, + { + "epoch": 0.4084562935170821, + "grad_norm": 0.12954039871692657, + "learning_rate": 0.00018367441189204794, + "loss": 0.194, + "step": 5661 + }, + { + "epoch": 0.40852844619214257, + "grad_norm": 0.13729546964168549, + "learning_rate": 0.00018367152547265117, + "loss": 0.1578, + "step": 5662 + }, + { + "epoch": 0.408600598867203, + "grad_norm": 0.1417706310749054, + "learning_rate": 0.00018366863905325444, + "loss": 0.1596, + "step": 5663 + }, + { + "epoch": 0.40867275154226346, + "grad_norm": 0.12691785395145416, + "learning_rate": 0.0001836657526338577, + "loss": 0.1442, + "step": 5664 + }, + { + "epoch": 0.40874490421732385, + "grad_norm": 0.11149875819683075, + "learning_rate": 0.00018366286621446096, + "loss": 0.2039, + "step": 5665 + }, + { + "epoch": 0.4088170568923843, + "grad_norm": 0.09372027218341827, + "learning_rate": 0.00018365997979506425, + "loss": 0.122, + "step": 5666 + }, + { + "epoch": 0.40888920956744473, + "grad_norm": 0.13826479017734528, + "learning_rate": 0.0001836570933756675, + "loss": 0.1548, + "step": 5667 + }, + { + "epoch": 0.4089613622425051, + "grad_norm": 0.154812291264534, + "learning_rate": 0.00018365420695627075, + "loss": 0.2081, + "step": 5668 + }, + { + "epoch": 0.40903351491756557, + "grad_norm": 0.10125817358493805, + "learning_rate": 0.00018365132053687401, + "loss": 0.1397, + "step": 5669 + }, + { + "epoch": 0.409105667592626, + "grad_norm": 0.13650555908679962, + "learning_rate": 0.00018364843411747728, + "loss": 0.1591, + "step": 5670 + }, + { + "epoch": 0.4091778202676864, + "grad_norm": 0.1282268911600113, + "learning_rate": 0.00018364554769808054, + "loss": 0.1726, + "step": 5671 + }, + { + "epoch": 0.40924997294274684, + "grad_norm": 0.13484404981136322, + "learning_rate": 0.0001836426612786838, + "loss": 0.1917, + "step": 5672 + }, + { + "epoch": 0.4093221256178073, + "grad_norm": 0.11429597437381744, + "learning_rate": 0.00018363977485928707, + "loss": 0.1403, + "step": 5673 + }, + { + "epoch": 0.40939427829286773, + "grad_norm": 0.15964668989181519, + "learning_rate": 0.00018363688843989033, + "loss": 0.1883, + "step": 5674 + }, + { + "epoch": 0.4094664309679281, + "grad_norm": 0.09609831869602203, + "learning_rate": 0.0001836340020204936, + "loss": 0.167, + "step": 5675 + }, + { + "epoch": 0.40953858364298856, + "grad_norm": 0.12426905333995819, + "learning_rate": 0.00018363111560109686, + "loss": 0.1825, + "step": 5676 + }, + { + "epoch": 0.409610736318049, + "grad_norm": 0.11844122409820557, + "learning_rate": 0.00018362822918170012, + "loss": 0.1929, + "step": 5677 + }, + { + "epoch": 0.4096828889931094, + "grad_norm": 0.1360781341791153, + "learning_rate": 0.00018362534276230335, + "loss": 0.2015, + "step": 5678 + }, + { + "epoch": 0.40975504166816984, + "grad_norm": 0.12205765396356583, + "learning_rate": 0.00018362245634290662, + "loss": 0.1006, + "step": 5679 + }, + { + "epoch": 0.4098271943432303, + "grad_norm": 0.12862655520439148, + "learning_rate": 0.0001836195699235099, + "loss": 0.1534, + "step": 5680 + }, + { + "epoch": 0.4098993470182907, + "grad_norm": 0.10743330419063568, + "learning_rate": 0.00018361668350411317, + "loss": 0.1926, + "step": 5681 + }, + { + "epoch": 0.4099714996933511, + "grad_norm": 0.14362482726573944, + "learning_rate": 0.00018361379708471643, + "loss": 0.1729, + "step": 5682 + }, + { + "epoch": 0.41004365236841156, + "grad_norm": 0.1336904764175415, + "learning_rate": 0.00018361091066531967, + "loss": 0.172, + "step": 5683 + }, + { + "epoch": 0.410115805043472, + "grad_norm": 0.14109814167022705, + "learning_rate": 0.00018360802424592293, + "loss": 0.1686, + "step": 5684 + }, + { + "epoch": 0.4101879577185324, + "grad_norm": 0.12172040343284607, + "learning_rate": 0.0001836051378265262, + "loss": 0.1594, + "step": 5685 + }, + { + "epoch": 0.41026011039359284, + "grad_norm": 0.12343661487102509, + "learning_rate": 0.00018360225140712946, + "loss": 0.1437, + "step": 5686 + }, + { + "epoch": 0.4103322630686533, + "grad_norm": 0.13678564131259918, + "learning_rate": 0.00018359936498773275, + "loss": 0.2118, + "step": 5687 + }, + { + "epoch": 0.4104044157437137, + "grad_norm": 0.1265508383512497, + "learning_rate": 0.00018359647856833598, + "loss": 0.1364, + "step": 5688 + }, + { + "epoch": 0.4104765684187741, + "grad_norm": 0.11329175531864166, + "learning_rate": 0.00018359359214893925, + "loss": 0.1411, + "step": 5689 + }, + { + "epoch": 0.41054872109383456, + "grad_norm": 0.111592598259449, + "learning_rate": 0.0001835907057295425, + "loss": 0.1482, + "step": 5690 + }, + { + "epoch": 0.410620873768895, + "grad_norm": 0.11039689183235168, + "learning_rate": 0.00018358781931014577, + "loss": 0.2005, + "step": 5691 + }, + { + "epoch": 0.4106930264439554, + "grad_norm": 0.11525961756706238, + "learning_rate": 0.00018358493289074903, + "loss": 0.1999, + "step": 5692 + }, + { + "epoch": 0.41076517911901583, + "grad_norm": 0.11791030317544937, + "learning_rate": 0.0001835820464713523, + "loss": 0.1936, + "step": 5693 + }, + { + "epoch": 0.4108373317940763, + "grad_norm": 0.11614886671304703, + "learning_rate": 0.00018357916005195556, + "loss": 0.1649, + "step": 5694 + }, + { + "epoch": 0.4109094844691367, + "grad_norm": 0.12133285403251648, + "learning_rate": 0.00018357627363255882, + "loss": 0.1373, + "step": 5695 + }, + { + "epoch": 0.4109816371441971, + "grad_norm": 0.11625772714614868, + "learning_rate": 0.0001835733872131621, + "loss": 0.1641, + "step": 5696 + }, + { + "epoch": 0.41105378981925755, + "grad_norm": 0.10323289036750793, + "learning_rate": 0.00018357050079376535, + "loss": 0.1068, + "step": 5697 + }, + { + "epoch": 0.411125942494318, + "grad_norm": 0.10496301203966141, + "learning_rate": 0.0001835676143743686, + "loss": 0.1126, + "step": 5698 + }, + { + "epoch": 0.4111980951693784, + "grad_norm": 0.11890524625778198, + "learning_rate": 0.00018356472795497185, + "loss": 0.1398, + "step": 5699 + }, + { + "epoch": 0.41127024784443883, + "grad_norm": 0.12184417992830276, + "learning_rate": 0.0001835618415355751, + "loss": 0.2108, + "step": 5700 + }, + { + "epoch": 0.4113424005194993, + "grad_norm": 0.12418466061353683, + "learning_rate": 0.0001835589551161784, + "loss": 0.1581, + "step": 5701 + }, + { + "epoch": 0.41141455319455966, + "grad_norm": 0.15010544657707214, + "learning_rate": 0.00018355606869678166, + "loss": 0.1631, + "step": 5702 + }, + { + "epoch": 0.4114867058696201, + "grad_norm": 0.22763435542583466, + "learning_rate": 0.00018355318227738493, + "loss": 0.1373, + "step": 5703 + }, + { + "epoch": 0.41155885854468055, + "grad_norm": 0.13446487486362457, + "learning_rate": 0.00018355029585798816, + "loss": 0.1767, + "step": 5704 + }, + { + "epoch": 0.411631011219741, + "grad_norm": 0.12202102690935135, + "learning_rate": 0.00018354740943859143, + "loss": 0.101, + "step": 5705 + }, + { + "epoch": 0.4117031638948014, + "grad_norm": 0.10646074265241623, + "learning_rate": 0.0001835445230191947, + "loss": 0.1843, + "step": 5706 + }, + { + "epoch": 0.4117753165698618, + "grad_norm": 0.1445694863796234, + "learning_rate": 0.00018354163659979795, + "loss": 0.2066, + "step": 5707 + }, + { + "epoch": 0.41184746924492227, + "grad_norm": 0.14221738278865814, + "learning_rate": 0.00018353875018040124, + "loss": 0.1619, + "step": 5708 + }, + { + "epoch": 0.41191962191998266, + "grad_norm": 0.1329052895307541, + "learning_rate": 0.00018353586376100448, + "loss": 0.1656, + "step": 5709 + }, + { + "epoch": 0.4119917745950431, + "grad_norm": 0.1614803969860077, + "learning_rate": 0.00018353297734160774, + "loss": 0.1685, + "step": 5710 + }, + { + "epoch": 0.41206392727010355, + "grad_norm": 0.14487746357917786, + "learning_rate": 0.000183530090922211, + "loss": 0.1753, + "step": 5711 + }, + { + "epoch": 0.412136079945164, + "grad_norm": 0.1476510763168335, + "learning_rate": 0.00018352720450281427, + "loss": 0.1733, + "step": 5712 + }, + { + "epoch": 0.4122082326202244, + "grad_norm": 0.12578529119491577, + "learning_rate": 0.00018352431808341753, + "loss": 0.1628, + "step": 5713 + }, + { + "epoch": 0.4122803852952848, + "grad_norm": 0.13598518073558807, + "learning_rate": 0.0001835214316640208, + "loss": 0.1586, + "step": 5714 + }, + { + "epoch": 0.41235253797034527, + "grad_norm": 0.12721259891986847, + "learning_rate": 0.00018351854524462405, + "loss": 0.1611, + "step": 5715 + }, + { + "epoch": 0.41242469064540566, + "grad_norm": 0.16968463361263275, + "learning_rate": 0.00018351565882522732, + "loss": 0.1996, + "step": 5716 + }, + { + "epoch": 0.4124968433204661, + "grad_norm": 0.10791303962469101, + "learning_rate": 0.00018351277240583058, + "loss": 0.1612, + "step": 5717 + }, + { + "epoch": 0.41256899599552654, + "grad_norm": 0.12305327504873276, + "learning_rate": 0.00018350988598643384, + "loss": 0.1849, + "step": 5718 + }, + { + "epoch": 0.412641148670587, + "grad_norm": 0.10294891148805618, + "learning_rate": 0.0001835069995670371, + "loss": 0.1834, + "step": 5719 + }, + { + "epoch": 0.4127133013456474, + "grad_norm": 0.11937738209962845, + "learning_rate": 0.00018350411314764034, + "loss": 0.1454, + "step": 5720 + }, + { + "epoch": 0.4127854540207078, + "grad_norm": 0.13009479641914368, + "learning_rate": 0.0001835012267282436, + "loss": 0.1847, + "step": 5721 + }, + { + "epoch": 0.41285760669576826, + "grad_norm": 0.1491885632276535, + "learning_rate": 0.0001834983403088469, + "loss": 0.2028, + "step": 5722 + }, + { + "epoch": 0.41292975937082865, + "grad_norm": 0.11253825575113297, + "learning_rate": 0.00018349545388945016, + "loss": 0.1265, + "step": 5723 + }, + { + "epoch": 0.4130019120458891, + "grad_norm": 0.13709759712219238, + "learning_rate": 0.00018349256747005342, + "loss": 0.1504, + "step": 5724 + }, + { + "epoch": 0.41307406472094954, + "grad_norm": 0.11551269143819809, + "learning_rate": 0.00018348968105065666, + "loss": 0.1652, + "step": 5725 + }, + { + "epoch": 0.41314621739600993, + "grad_norm": 0.10509738326072693, + "learning_rate": 0.00018348679463125992, + "loss": 0.1564, + "step": 5726 + }, + { + "epoch": 0.4132183700710704, + "grad_norm": 0.12881092727184296, + "learning_rate": 0.00018348390821186318, + "loss": 0.1537, + "step": 5727 + }, + { + "epoch": 0.4132905227461308, + "grad_norm": 0.11585894227027893, + "learning_rate": 0.00018348102179246645, + "loss": 0.1681, + "step": 5728 + }, + { + "epoch": 0.41336267542119126, + "grad_norm": 0.10582401603460312, + "learning_rate": 0.00018347813537306974, + "loss": 0.1401, + "step": 5729 + }, + { + "epoch": 0.41343482809625165, + "grad_norm": 0.11501099169254303, + "learning_rate": 0.00018347524895367297, + "loss": 0.1416, + "step": 5730 + }, + { + "epoch": 0.4135069807713121, + "grad_norm": 0.11508175730705261, + "learning_rate": 0.00018347236253427623, + "loss": 0.1236, + "step": 5731 + }, + { + "epoch": 0.41357913344637254, + "grad_norm": 0.13451208174228668, + "learning_rate": 0.0001834694761148795, + "loss": 0.1284, + "step": 5732 + }, + { + "epoch": 0.4136512861214329, + "grad_norm": 0.15134069323539734, + "learning_rate": 0.00018346658969548276, + "loss": 0.1498, + "step": 5733 + }, + { + "epoch": 0.41372343879649337, + "grad_norm": 0.12855762243270874, + "learning_rate": 0.00018346370327608602, + "loss": 0.1303, + "step": 5734 + }, + { + "epoch": 0.4137955914715538, + "grad_norm": 0.09979292750358582, + "learning_rate": 0.00018346081685668929, + "loss": 0.1402, + "step": 5735 + }, + { + "epoch": 0.41386774414661426, + "grad_norm": 0.13594497740268707, + "learning_rate": 0.00018345793043729255, + "loss": 0.2275, + "step": 5736 + }, + { + "epoch": 0.41393989682167465, + "grad_norm": 0.1293654441833496, + "learning_rate": 0.0001834550440178958, + "loss": 0.1622, + "step": 5737 + }, + { + "epoch": 0.4140120494967351, + "grad_norm": 0.21570616960525513, + "learning_rate": 0.00018345215759849907, + "loss": 0.1946, + "step": 5738 + }, + { + "epoch": 0.41408420217179553, + "grad_norm": 0.13746745884418488, + "learning_rate": 0.00018344927117910234, + "loss": 0.1473, + "step": 5739 + }, + { + "epoch": 0.4141563548468559, + "grad_norm": 0.12070949375629425, + "learning_rate": 0.0001834463847597056, + "loss": 0.1335, + "step": 5740 + }, + { + "epoch": 0.41422850752191637, + "grad_norm": 0.19424080848693848, + "learning_rate": 0.00018344349834030886, + "loss": 0.166, + "step": 5741 + }, + { + "epoch": 0.4143006601969768, + "grad_norm": 0.12598204612731934, + "learning_rate": 0.0001834406119209121, + "loss": 0.1249, + "step": 5742 + }, + { + "epoch": 0.41437281287203725, + "grad_norm": 0.11859706044197083, + "learning_rate": 0.0001834377255015154, + "loss": 0.2106, + "step": 5743 + }, + { + "epoch": 0.41444496554709764, + "grad_norm": 0.1263197511434555, + "learning_rate": 0.00018343483908211865, + "loss": 0.163, + "step": 5744 + }, + { + "epoch": 0.4145171182221581, + "grad_norm": 0.15331189334392548, + "learning_rate": 0.00018343195266272192, + "loss": 0.1288, + "step": 5745 + }, + { + "epoch": 0.41458927089721853, + "grad_norm": 0.13554920256137848, + "learning_rate": 0.00018342906624332518, + "loss": 0.1355, + "step": 5746 + }, + { + "epoch": 0.4146614235722789, + "grad_norm": 0.12715382874011993, + "learning_rate": 0.00018342617982392841, + "loss": 0.1382, + "step": 5747 + }, + { + "epoch": 0.41473357624733936, + "grad_norm": 0.12478697299957275, + "learning_rate": 0.00018342329340453168, + "loss": 0.171, + "step": 5748 + }, + { + "epoch": 0.4148057289223998, + "grad_norm": 0.09921965003013611, + "learning_rate": 0.00018342040698513494, + "loss": 0.0962, + "step": 5749 + }, + { + "epoch": 0.41487788159746025, + "grad_norm": 0.09717674553394318, + "learning_rate": 0.00018341752056573823, + "loss": 0.1345, + "step": 5750 + }, + { + "epoch": 0.41495003427252064, + "grad_norm": 0.11751259863376617, + "learning_rate": 0.0001834146341463415, + "loss": 0.1562, + "step": 5751 + }, + { + "epoch": 0.4150221869475811, + "grad_norm": 0.11764825880527496, + "learning_rate": 0.00018341174772694473, + "loss": 0.1871, + "step": 5752 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.10763514786958694, + "learning_rate": 0.000183408861307548, + "loss": 0.157, + "step": 5753 + }, + { + "epoch": 0.4151664922977019, + "grad_norm": 0.10569040477275848, + "learning_rate": 0.00018340597488815125, + "loss": 0.1303, + "step": 5754 + }, + { + "epoch": 0.41523864497276236, + "grad_norm": 0.11948921531438828, + "learning_rate": 0.00018340308846875452, + "loss": 0.1661, + "step": 5755 + }, + { + "epoch": 0.4153107976478228, + "grad_norm": 0.13358844816684723, + "learning_rate": 0.00018340020204935778, + "loss": 0.1455, + "step": 5756 + }, + { + "epoch": 0.4153829503228832, + "grad_norm": 0.12435338646173477, + "learning_rate": 0.00018339731562996104, + "loss": 0.146, + "step": 5757 + }, + { + "epoch": 0.41545510299794364, + "grad_norm": 0.13670466840267181, + "learning_rate": 0.0001833944292105643, + "loss": 0.1405, + "step": 5758 + }, + { + "epoch": 0.4155272556730041, + "grad_norm": 0.1171383336186409, + "learning_rate": 0.00018339154279116757, + "loss": 0.1349, + "step": 5759 + }, + { + "epoch": 0.4155994083480645, + "grad_norm": 0.11977862566709518, + "learning_rate": 0.00018338865637177083, + "loss": 0.1482, + "step": 5760 + }, + { + "epoch": 0.4156715610231249, + "grad_norm": 0.11456245929002762, + "learning_rate": 0.0001833857699523741, + "loss": 0.1671, + "step": 5761 + }, + { + "epoch": 0.41574371369818536, + "grad_norm": 0.13899686932563782, + "learning_rate": 0.00018338288353297736, + "loss": 0.1764, + "step": 5762 + }, + { + "epoch": 0.4158158663732458, + "grad_norm": 0.11373434215784073, + "learning_rate": 0.0001833799971135806, + "loss": 0.1239, + "step": 5763 + }, + { + "epoch": 0.4158880190483062, + "grad_norm": 0.14908647537231445, + "learning_rate": 0.00018337711069418388, + "loss": 0.1531, + "step": 5764 + }, + { + "epoch": 0.41596017172336663, + "grad_norm": 0.17516694962978363, + "learning_rate": 0.00018337422427478715, + "loss": 0.1924, + "step": 5765 + }, + { + "epoch": 0.4160323243984271, + "grad_norm": 0.13450179994106293, + "learning_rate": 0.0001833713378553904, + "loss": 0.1606, + "step": 5766 + }, + { + "epoch": 0.4161044770734875, + "grad_norm": 0.12532760202884674, + "learning_rate": 0.00018336845143599367, + "loss": 0.1399, + "step": 5767 + }, + { + "epoch": 0.4161766297485479, + "grad_norm": 0.13363435864448547, + "learning_rate": 0.0001833655650165969, + "loss": 0.1554, + "step": 5768 + }, + { + "epoch": 0.41624878242360835, + "grad_norm": 0.1193813607096672, + "learning_rate": 0.00018336267859720017, + "loss": 0.1727, + "step": 5769 + }, + { + "epoch": 0.4163209350986688, + "grad_norm": 0.10275016725063324, + "learning_rate": 0.00018335979217780343, + "loss": 0.1379, + "step": 5770 + }, + { + "epoch": 0.4163930877737292, + "grad_norm": 0.12169700860977173, + "learning_rate": 0.00018335690575840672, + "loss": 0.1478, + "step": 5771 + }, + { + "epoch": 0.41646524044878963, + "grad_norm": 0.16734255850315094, + "learning_rate": 0.00018335401933901, + "loss": 0.224, + "step": 5772 + }, + { + "epoch": 0.4165373931238501, + "grad_norm": 0.1350734382867813, + "learning_rate": 0.00018335113291961322, + "loss": 0.1518, + "step": 5773 + }, + { + "epoch": 0.4166095457989105, + "grad_norm": 0.12713783979415894, + "learning_rate": 0.00018334824650021649, + "loss": 0.1748, + "step": 5774 + }, + { + "epoch": 0.4166816984739709, + "grad_norm": 0.12002015858888626, + "learning_rate": 0.00018334536008081975, + "loss": 0.1708, + "step": 5775 + }, + { + "epoch": 0.41675385114903135, + "grad_norm": 0.12447904050350189, + "learning_rate": 0.000183342473661423, + "loss": 0.1616, + "step": 5776 + }, + { + "epoch": 0.4168260038240918, + "grad_norm": 0.12837150692939758, + "learning_rate": 0.00018333958724202627, + "loss": 0.1498, + "step": 5777 + }, + { + "epoch": 0.4168981564991522, + "grad_norm": 0.14380308985710144, + "learning_rate": 0.00018333670082262954, + "loss": 0.1424, + "step": 5778 + }, + { + "epoch": 0.4169703091742126, + "grad_norm": 0.11396576464176178, + "learning_rate": 0.0001833338144032328, + "loss": 0.1573, + "step": 5779 + }, + { + "epoch": 0.41704246184927307, + "grad_norm": 0.11688689142465591, + "learning_rate": 0.00018333092798383606, + "loss": 0.178, + "step": 5780 + }, + { + "epoch": 0.4171146145243335, + "grad_norm": 0.13411317765712738, + "learning_rate": 0.00018332804156443933, + "loss": 0.1631, + "step": 5781 + }, + { + "epoch": 0.4171867671993939, + "grad_norm": 0.15105921030044556, + "learning_rate": 0.0001833251551450426, + "loss": 0.1903, + "step": 5782 + }, + { + "epoch": 0.41725891987445435, + "grad_norm": 0.12753167748451233, + "learning_rate": 0.00018332226872564585, + "loss": 0.1432, + "step": 5783 + }, + { + "epoch": 0.4173310725495148, + "grad_norm": 0.11953182518482208, + "learning_rate": 0.0001833193823062491, + "loss": 0.1778, + "step": 5784 + }, + { + "epoch": 0.4174032252245752, + "grad_norm": 0.11744635552167892, + "learning_rate": 0.00018331649588685235, + "loss": 0.2007, + "step": 5785 + }, + { + "epoch": 0.4174753778996356, + "grad_norm": 0.12863267958164215, + "learning_rate": 0.00018331360946745564, + "loss": 0.1041, + "step": 5786 + }, + { + "epoch": 0.41754753057469607, + "grad_norm": 0.11282802373170853, + "learning_rate": 0.0001833107230480589, + "loss": 0.2177, + "step": 5787 + }, + { + "epoch": 0.41761968324975646, + "grad_norm": 0.15572760999202728, + "learning_rate": 0.00018330783662866217, + "loss": 0.1447, + "step": 5788 + }, + { + "epoch": 0.4176918359248169, + "grad_norm": 0.09966544061899185, + "learning_rate": 0.0001833049502092654, + "loss": 0.105, + "step": 5789 + }, + { + "epoch": 0.41776398859987735, + "grad_norm": 0.13301274180412292, + "learning_rate": 0.00018330206378986867, + "loss": 0.1398, + "step": 5790 + }, + { + "epoch": 0.4178361412749378, + "grad_norm": 0.1342577189207077, + "learning_rate": 0.00018329917737047193, + "loss": 0.1375, + "step": 5791 + }, + { + "epoch": 0.4179082939499982, + "grad_norm": 0.10327088832855225, + "learning_rate": 0.0001832962909510752, + "loss": 0.1565, + "step": 5792 + }, + { + "epoch": 0.4179804466250586, + "grad_norm": 0.12060295790433884, + "learning_rate": 0.00018329340453167848, + "loss": 0.1418, + "step": 5793 + }, + { + "epoch": 0.41805259930011907, + "grad_norm": 0.10361672937870026, + "learning_rate": 0.00018329051811228172, + "loss": 0.1699, + "step": 5794 + }, + { + "epoch": 0.41812475197517945, + "grad_norm": 0.1323527842760086, + "learning_rate": 0.00018328763169288498, + "loss": 0.1728, + "step": 5795 + }, + { + "epoch": 0.4181969046502399, + "grad_norm": 0.14689281582832336, + "learning_rate": 0.00018328474527348824, + "loss": 0.1719, + "step": 5796 + }, + { + "epoch": 0.41826905732530034, + "grad_norm": 0.12184374034404755, + "learning_rate": 0.0001832818588540915, + "loss": 0.1421, + "step": 5797 + }, + { + "epoch": 0.4183412100003608, + "grad_norm": 0.13505211472511292, + "learning_rate": 0.00018327897243469477, + "loss": 0.212, + "step": 5798 + }, + { + "epoch": 0.4184133626754212, + "grad_norm": 0.11547620594501495, + "learning_rate": 0.00018327608601529803, + "loss": 0.1905, + "step": 5799 + }, + { + "epoch": 0.4184855153504816, + "grad_norm": 0.12469831854104996, + "learning_rate": 0.0001832731995959013, + "loss": 0.1484, + "step": 5800 + }, + { + "epoch": 0.41855766802554206, + "grad_norm": 0.1516280174255371, + "learning_rate": 0.00018327031317650456, + "loss": 0.1436, + "step": 5801 + }, + { + "epoch": 0.41862982070060245, + "grad_norm": 0.12161953002214432, + "learning_rate": 0.00018326742675710782, + "loss": 0.1521, + "step": 5802 + }, + { + "epoch": 0.4187019733756629, + "grad_norm": 0.13412874937057495, + "learning_rate": 0.00018326454033771108, + "loss": 0.1139, + "step": 5803 + }, + { + "epoch": 0.41877412605072334, + "grad_norm": 0.11358436942100525, + "learning_rate": 0.00018326165391831435, + "loss": 0.1606, + "step": 5804 + }, + { + "epoch": 0.4188462787257838, + "grad_norm": 0.10829130560159683, + "learning_rate": 0.00018325876749891758, + "loss": 0.1828, + "step": 5805 + }, + { + "epoch": 0.41891843140084417, + "grad_norm": 0.12857361137866974, + "learning_rate": 0.00018325588107952085, + "loss": 0.1265, + "step": 5806 + }, + { + "epoch": 0.4189905840759046, + "grad_norm": 0.13641512393951416, + "learning_rate": 0.00018325299466012414, + "loss": 0.1818, + "step": 5807 + }, + { + "epoch": 0.41906273675096506, + "grad_norm": 0.11245698481798172, + "learning_rate": 0.0001832501082407274, + "loss": 0.1082, + "step": 5808 + }, + { + "epoch": 0.41913488942602545, + "grad_norm": 0.11375346779823303, + "learning_rate": 0.00018324722182133066, + "loss": 0.135, + "step": 5809 + }, + { + "epoch": 0.4192070421010859, + "grad_norm": 0.14834176003932953, + "learning_rate": 0.0001832443354019339, + "loss": 0.1715, + "step": 5810 + }, + { + "epoch": 0.41927919477614634, + "grad_norm": 0.12736055254936218, + "learning_rate": 0.00018324144898253716, + "loss": 0.1759, + "step": 5811 + }, + { + "epoch": 0.4193513474512068, + "grad_norm": 0.15465882420539856, + "learning_rate": 0.00018323856256314042, + "loss": 0.1679, + "step": 5812 + }, + { + "epoch": 0.41942350012626717, + "grad_norm": 0.11425981670618057, + "learning_rate": 0.00018323567614374369, + "loss": 0.1426, + "step": 5813 + }, + { + "epoch": 0.4194956528013276, + "grad_norm": 0.11629367619752884, + "learning_rate": 0.00018323278972434698, + "loss": 0.1339, + "step": 5814 + }, + { + "epoch": 0.41956780547638806, + "grad_norm": 0.12361908704042435, + "learning_rate": 0.0001832299033049502, + "loss": 0.1736, + "step": 5815 + }, + { + "epoch": 0.41963995815144844, + "grad_norm": 0.1226155161857605, + "learning_rate": 0.00018322701688555347, + "loss": 0.1865, + "step": 5816 + }, + { + "epoch": 0.4197121108265089, + "grad_norm": 0.11376158148050308, + "learning_rate": 0.00018322413046615674, + "loss": 0.1796, + "step": 5817 + }, + { + "epoch": 0.41978426350156933, + "grad_norm": 0.11067010462284088, + "learning_rate": 0.00018322124404676, + "loss": 0.0954, + "step": 5818 + }, + { + "epoch": 0.4198564161766297, + "grad_norm": 0.128396674990654, + "learning_rate": 0.00018321835762736326, + "loss": 0.1813, + "step": 5819 + }, + { + "epoch": 0.41992856885169016, + "grad_norm": 0.11627837270498276, + "learning_rate": 0.00018321547120796653, + "loss": 0.1594, + "step": 5820 + }, + { + "epoch": 0.4200007215267506, + "grad_norm": 0.09679029881954193, + "learning_rate": 0.0001832125847885698, + "loss": 0.1316, + "step": 5821 + }, + { + "epoch": 0.42007287420181105, + "grad_norm": 0.1351933628320694, + "learning_rate": 0.00018320969836917305, + "loss": 0.1418, + "step": 5822 + }, + { + "epoch": 0.42014502687687144, + "grad_norm": 0.13122516870498657, + "learning_rate": 0.00018320681194977631, + "loss": 0.1817, + "step": 5823 + }, + { + "epoch": 0.4202171795519319, + "grad_norm": 0.11837951838970184, + "learning_rate": 0.00018320392553037958, + "loss": 0.1491, + "step": 5824 + }, + { + "epoch": 0.42028933222699233, + "grad_norm": 0.13323001563549042, + "learning_rate": 0.00018320103911098284, + "loss": 0.184, + "step": 5825 + }, + { + "epoch": 0.4203614849020527, + "grad_norm": 0.11082823574542999, + "learning_rate": 0.00018319815269158608, + "loss": 0.111, + "step": 5826 + }, + { + "epoch": 0.42043363757711316, + "grad_norm": 0.15011513233184814, + "learning_rate": 0.00018319526627218934, + "loss": 0.1439, + "step": 5827 + }, + { + "epoch": 0.4205057902521736, + "grad_norm": 0.1101953387260437, + "learning_rate": 0.00018319237985279263, + "loss": 0.1607, + "step": 5828 + }, + { + "epoch": 0.42057794292723405, + "grad_norm": 0.11777789145708084, + "learning_rate": 0.0001831894934333959, + "loss": 0.1451, + "step": 5829 + }, + { + "epoch": 0.42065009560229444, + "grad_norm": 0.10828852653503418, + "learning_rate": 0.00018318660701399916, + "loss": 0.1446, + "step": 5830 + }, + { + "epoch": 0.4207222482773549, + "grad_norm": 0.1039978563785553, + "learning_rate": 0.0001831837205946024, + "loss": 0.1701, + "step": 5831 + }, + { + "epoch": 0.4207944009524153, + "grad_norm": 0.14535623788833618, + "learning_rate": 0.00018318083417520565, + "loss": 0.1969, + "step": 5832 + }, + { + "epoch": 0.4208665536274757, + "grad_norm": 0.12345319986343384, + "learning_rate": 0.00018317794775580892, + "loss": 0.1878, + "step": 5833 + }, + { + "epoch": 0.42093870630253616, + "grad_norm": 0.12048795819282532, + "learning_rate": 0.00018317506133641218, + "loss": 0.1764, + "step": 5834 + }, + { + "epoch": 0.4210108589775966, + "grad_norm": 0.11864970624446869, + "learning_rate": 0.00018317217491701547, + "loss": 0.1592, + "step": 5835 + }, + { + "epoch": 0.42108301165265705, + "grad_norm": 0.09020841121673584, + "learning_rate": 0.0001831692884976187, + "loss": 0.1413, + "step": 5836 + }, + { + "epoch": 0.42115516432771744, + "grad_norm": 0.10649677366018295, + "learning_rate": 0.00018316640207822197, + "loss": 0.139, + "step": 5837 + }, + { + "epoch": 0.4212273170027779, + "grad_norm": 0.1171664223074913, + "learning_rate": 0.00018316351565882523, + "loss": 0.152, + "step": 5838 + }, + { + "epoch": 0.4212994696778383, + "grad_norm": 0.11515042930841446, + "learning_rate": 0.0001831606292394285, + "loss": 0.1712, + "step": 5839 + }, + { + "epoch": 0.4213716223528987, + "grad_norm": 0.133058562874794, + "learning_rate": 0.00018315774282003176, + "loss": 0.1491, + "step": 5840 + }, + { + "epoch": 0.42144377502795916, + "grad_norm": 0.10233544558286667, + "learning_rate": 0.00018315485640063502, + "loss": 0.1469, + "step": 5841 + }, + { + "epoch": 0.4215159277030196, + "grad_norm": 0.1627454161643982, + "learning_rate": 0.00018315196998123828, + "loss": 0.1574, + "step": 5842 + }, + { + "epoch": 0.42158808037808004, + "grad_norm": 0.15958084166049957, + "learning_rate": 0.00018314908356184155, + "loss": 0.153, + "step": 5843 + }, + { + "epoch": 0.42166023305314043, + "grad_norm": 0.12667101621627808, + "learning_rate": 0.0001831461971424448, + "loss": 0.1208, + "step": 5844 + }, + { + "epoch": 0.4217323857282009, + "grad_norm": 0.11573609709739685, + "learning_rate": 0.00018314331072304807, + "loss": 0.1218, + "step": 5845 + }, + { + "epoch": 0.4218045384032613, + "grad_norm": 0.14142170548439026, + "learning_rate": 0.00018314042430365133, + "loss": 0.1279, + "step": 5846 + }, + { + "epoch": 0.4218766910783217, + "grad_norm": 0.12910237908363342, + "learning_rate": 0.00018313753788425457, + "loss": 0.1554, + "step": 5847 + }, + { + "epoch": 0.42194884375338215, + "grad_norm": 0.11738871037960052, + "learning_rate": 0.00018313465146485783, + "loss": 0.0915, + "step": 5848 + }, + { + "epoch": 0.4220209964284426, + "grad_norm": 0.13453556597232819, + "learning_rate": 0.00018313176504546112, + "loss": 0.1591, + "step": 5849 + }, + { + "epoch": 0.422093149103503, + "grad_norm": 0.10725102573633194, + "learning_rate": 0.0001831288786260644, + "loss": 0.1826, + "step": 5850 + }, + { + "epoch": 0.42216530177856343, + "grad_norm": 0.15016676485538483, + "learning_rate": 0.00018312599220666765, + "loss": 0.2079, + "step": 5851 + }, + { + "epoch": 0.4222374544536239, + "grad_norm": 0.12500222027301788, + "learning_rate": 0.00018312310578727089, + "loss": 0.1417, + "step": 5852 + }, + { + "epoch": 0.4223096071286843, + "grad_norm": 0.1071745902299881, + "learning_rate": 0.00018312021936787415, + "loss": 0.1672, + "step": 5853 + }, + { + "epoch": 0.4223817598037447, + "grad_norm": 0.11144714802503586, + "learning_rate": 0.0001831173329484774, + "loss": 0.1354, + "step": 5854 + }, + { + "epoch": 0.42245391247880515, + "grad_norm": 0.11990606784820557, + "learning_rate": 0.00018311444652908067, + "loss": 0.1728, + "step": 5855 + }, + { + "epoch": 0.4225260651538656, + "grad_norm": 0.12286270409822464, + "learning_rate": 0.00018311156010968396, + "loss": 0.1637, + "step": 5856 + }, + { + "epoch": 0.422598217828926, + "grad_norm": 0.13923144340515137, + "learning_rate": 0.0001831086736902872, + "loss": 0.223, + "step": 5857 + }, + { + "epoch": 0.4226703705039864, + "grad_norm": 0.11241895705461502, + "learning_rate": 0.00018310578727089046, + "loss": 0.1688, + "step": 5858 + }, + { + "epoch": 0.42274252317904687, + "grad_norm": 0.1049077957868576, + "learning_rate": 0.00018310290085149373, + "loss": 0.1654, + "step": 5859 + }, + { + "epoch": 0.4228146758541073, + "grad_norm": 0.12452409416437149, + "learning_rate": 0.000183100014432097, + "loss": 0.1499, + "step": 5860 + }, + { + "epoch": 0.4228868285291677, + "grad_norm": 0.12135022133588791, + "learning_rate": 0.00018309712801270025, + "loss": 0.1608, + "step": 5861 + }, + { + "epoch": 0.42295898120422815, + "grad_norm": 0.09587796032428741, + "learning_rate": 0.00018309424159330351, + "loss": 0.2024, + "step": 5862 + }, + { + "epoch": 0.4230311338792886, + "grad_norm": 0.1125534176826477, + "learning_rate": 0.00018309135517390678, + "loss": 0.1313, + "step": 5863 + }, + { + "epoch": 0.423103286554349, + "grad_norm": 0.12060602754354477, + "learning_rate": 0.00018308846875451004, + "loss": 0.1382, + "step": 5864 + }, + { + "epoch": 0.4231754392294094, + "grad_norm": 0.09139573574066162, + "learning_rate": 0.0001830855823351133, + "loss": 0.1363, + "step": 5865 + }, + { + "epoch": 0.42324759190446987, + "grad_norm": 0.11417364329099655, + "learning_rate": 0.00018308269591571657, + "loss": 0.1556, + "step": 5866 + }, + { + "epoch": 0.4233197445795303, + "grad_norm": 0.09632206708192825, + "learning_rate": 0.00018307980949631983, + "loss": 0.1635, + "step": 5867 + }, + { + "epoch": 0.4233918972545907, + "grad_norm": 0.1438894122838974, + "learning_rate": 0.0001830769230769231, + "loss": 0.1208, + "step": 5868 + }, + { + "epoch": 0.42346404992965114, + "grad_norm": 0.14925217628479004, + "learning_rate": 0.00018307403665752633, + "loss": 0.1523, + "step": 5869 + }, + { + "epoch": 0.4235362026047116, + "grad_norm": 0.13312889635562897, + "learning_rate": 0.00018307115023812962, + "loss": 0.1306, + "step": 5870 + }, + { + "epoch": 0.423608355279772, + "grad_norm": 0.10850357264280319, + "learning_rate": 0.00018306826381873288, + "loss": 0.1768, + "step": 5871 + }, + { + "epoch": 0.4236805079548324, + "grad_norm": 0.10067091882228851, + "learning_rate": 0.00018306537739933614, + "loss": 0.1211, + "step": 5872 + }, + { + "epoch": 0.42375266062989286, + "grad_norm": 0.11367423087358475, + "learning_rate": 0.0001830624909799394, + "loss": 0.1652, + "step": 5873 + }, + { + "epoch": 0.4238248133049533, + "grad_norm": 0.18777737021446228, + "learning_rate": 0.00018305960456054264, + "loss": 0.12, + "step": 5874 + }, + { + "epoch": 0.4238969659800137, + "grad_norm": 0.128363236784935, + "learning_rate": 0.0001830567181411459, + "loss": 0.1622, + "step": 5875 + }, + { + "epoch": 0.42396911865507414, + "grad_norm": 0.12165389209985733, + "learning_rate": 0.00018305383172174917, + "loss": 0.0883, + "step": 5876 + }, + { + "epoch": 0.4240412713301346, + "grad_norm": 0.09363032877445221, + "learning_rate": 0.00018305094530235246, + "loss": 0.1329, + "step": 5877 + }, + { + "epoch": 0.42411342400519497, + "grad_norm": 0.11027435213327408, + "learning_rate": 0.00018304805888295572, + "loss": 0.1508, + "step": 5878 + }, + { + "epoch": 0.4241855766802554, + "grad_norm": 0.10954953730106354, + "learning_rate": 0.00018304517246355896, + "loss": 0.1681, + "step": 5879 + }, + { + "epoch": 0.42425772935531586, + "grad_norm": 0.11906708776950836, + "learning_rate": 0.00018304228604416222, + "loss": 0.0962, + "step": 5880 + }, + { + "epoch": 0.42432988203037625, + "grad_norm": 0.13661254942417145, + "learning_rate": 0.00018303939962476548, + "loss": 0.1372, + "step": 5881 + }, + { + "epoch": 0.4244020347054367, + "grad_norm": 0.12327921390533447, + "learning_rate": 0.00018303651320536875, + "loss": 0.1481, + "step": 5882 + }, + { + "epoch": 0.42447418738049714, + "grad_norm": 0.123286172747612, + "learning_rate": 0.000183033626785972, + "loss": 0.0811, + "step": 5883 + }, + { + "epoch": 0.4245463400555576, + "grad_norm": 0.12807874381542206, + "learning_rate": 0.00018303074036657527, + "loss": 0.1677, + "step": 5884 + }, + { + "epoch": 0.42461849273061797, + "grad_norm": 0.13452769815921783, + "learning_rate": 0.00018302785394717853, + "loss": 0.0953, + "step": 5885 + }, + { + "epoch": 0.4246906454056784, + "grad_norm": 0.13784530758857727, + "learning_rate": 0.0001830249675277818, + "loss": 0.1868, + "step": 5886 + }, + { + "epoch": 0.42476279808073886, + "grad_norm": 0.13271364569664001, + "learning_rate": 0.00018302208110838506, + "loss": 0.1832, + "step": 5887 + }, + { + "epoch": 0.42483495075579925, + "grad_norm": 0.13027092814445496, + "learning_rate": 0.00018301919468898832, + "loss": 0.1451, + "step": 5888 + }, + { + "epoch": 0.4249071034308597, + "grad_norm": 0.13652382791042328, + "learning_rate": 0.00018301630826959159, + "loss": 0.1992, + "step": 5889 + }, + { + "epoch": 0.42497925610592013, + "grad_norm": 0.1736457645893097, + "learning_rate": 0.00018301342185019482, + "loss": 0.1589, + "step": 5890 + }, + { + "epoch": 0.4250514087809806, + "grad_norm": 0.12425713241100311, + "learning_rate": 0.0001830105354307981, + "loss": 0.1122, + "step": 5891 + }, + { + "epoch": 0.42512356145604097, + "grad_norm": 0.13524624705314636, + "learning_rate": 0.00018300764901140138, + "loss": 0.1785, + "step": 5892 + }, + { + "epoch": 0.4251957141311014, + "grad_norm": 0.10107419639825821, + "learning_rate": 0.00018300476259200464, + "loss": 0.148, + "step": 5893 + }, + { + "epoch": 0.42526786680616185, + "grad_norm": 0.1114598885178566, + "learning_rate": 0.0001830018761726079, + "loss": 0.2023, + "step": 5894 + }, + { + "epoch": 0.42534001948122224, + "grad_norm": 0.13687801361083984, + "learning_rate": 0.00018299898975321114, + "loss": 0.164, + "step": 5895 + }, + { + "epoch": 0.4254121721562827, + "grad_norm": 0.15442319214344025, + "learning_rate": 0.0001829961033338144, + "loss": 0.1804, + "step": 5896 + }, + { + "epoch": 0.42548432483134313, + "grad_norm": 0.08131521195173264, + "learning_rate": 0.00018299321691441766, + "loss": 0.1752, + "step": 5897 + }, + { + "epoch": 0.4255564775064036, + "grad_norm": 0.10607467591762543, + "learning_rate": 0.00018299033049502095, + "loss": 0.177, + "step": 5898 + }, + { + "epoch": 0.42562863018146396, + "grad_norm": 0.11774367839097977, + "learning_rate": 0.00018298744407562422, + "loss": 0.1205, + "step": 5899 + }, + { + "epoch": 0.4257007828565244, + "grad_norm": 0.09929922968149185, + "learning_rate": 0.00018298455765622745, + "loss": 0.1396, + "step": 5900 + }, + { + "epoch": 0.42577293553158485, + "grad_norm": 0.13096508383750916, + "learning_rate": 0.00018298167123683071, + "loss": 0.1574, + "step": 5901 + }, + { + "epoch": 0.42584508820664524, + "grad_norm": 0.1007523238658905, + "learning_rate": 0.00018297878481743398, + "loss": 0.1346, + "step": 5902 + }, + { + "epoch": 0.4259172408817057, + "grad_norm": 0.09653882682323456, + "learning_rate": 0.00018297589839803724, + "loss": 0.1508, + "step": 5903 + }, + { + "epoch": 0.4259893935567661, + "grad_norm": 0.1219748705625534, + "learning_rate": 0.0001829730119786405, + "loss": 0.131, + "step": 5904 + }, + { + "epoch": 0.42606154623182657, + "grad_norm": 0.1340920776128769, + "learning_rate": 0.00018297012555924377, + "loss": 0.1711, + "step": 5905 + }, + { + "epoch": 0.42613369890688696, + "grad_norm": 0.11681176722049713, + "learning_rate": 0.00018296723913984703, + "loss": 0.1012, + "step": 5906 + }, + { + "epoch": 0.4262058515819474, + "grad_norm": 0.11355821043252945, + "learning_rate": 0.0001829643527204503, + "loss": 0.141, + "step": 5907 + }, + { + "epoch": 0.42627800425700785, + "grad_norm": 0.10950490087270737, + "learning_rate": 0.00018296146630105355, + "loss": 0.1598, + "step": 5908 + }, + { + "epoch": 0.42635015693206824, + "grad_norm": 0.12313297390937805, + "learning_rate": 0.00018295857988165682, + "loss": 0.1669, + "step": 5909 + }, + { + "epoch": 0.4264223096071287, + "grad_norm": 0.12461540102958679, + "learning_rate": 0.00018295569346226008, + "loss": 0.1192, + "step": 5910 + }, + { + "epoch": 0.4264944622821891, + "grad_norm": 0.10815031081438065, + "learning_rate": 0.00018295280704286332, + "loss": 0.1571, + "step": 5911 + }, + { + "epoch": 0.4265666149572495, + "grad_norm": 0.16719099879264832, + "learning_rate": 0.0001829499206234666, + "loss": 0.1661, + "step": 5912 + }, + { + "epoch": 0.42663876763230996, + "grad_norm": 0.11606849730014801, + "learning_rate": 0.00018294703420406987, + "loss": 0.1781, + "step": 5913 + }, + { + "epoch": 0.4267109203073704, + "grad_norm": 0.12098933756351471, + "learning_rate": 0.00018294414778467313, + "loss": 0.1538, + "step": 5914 + }, + { + "epoch": 0.42678307298243084, + "grad_norm": 0.12739412486553192, + "learning_rate": 0.0001829412613652764, + "loss": 0.1704, + "step": 5915 + }, + { + "epoch": 0.42685522565749123, + "grad_norm": 0.11682642251253128, + "learning_rate": 0.00018293837494587963, + "loss": 0.1249, + "step": 5916 + }, + { + "epoch": 0.4269273783325517, + "grad_norm": 0.13074259459972382, + "learning_rate": 0.0001829354885264829, + "loss": 0.1812, + "step": 5917 + }, + { + "epoch": 0.4269995310076121, + "grad_norm": 0.11174934357404709, + "learning_rate": 0.00018293260210708616, + "loss": 0.1528, + "step": 5918 + }, + { + "epoch": 0.4270716836826725, + "grad_norm": 0.11838796734809875, + "learning_rate": 0.00018292971568768945, + "loss": 0.1251, + "step": 5919 + }, + { + "epoch": 0.42714383635773295, + "grad_norm": 0.16064082086086273, + "learning_rate": 0.0001829268292682927, + "loss": 0.167, + "step": 5920 + }, + { + "epoch": 0.4272159890327934, + "grad_norm": 0.12237231433391571, + "learning_rate": 0.00018292394284889595, + "loss": 0.1683, + "step": 5921 + }, + { + "epoch": 0.42728814170785384, + "grad_norm": 0.10439042747020721, + "learning_rate": 0.0001829210564294992, + "loss": 0.1037, + "step": 5922 + }, + { + "epoch": 0.42736029438291423, + "grad_norm": 0.1152818351984024, + "learning_rate": 0.00018291817001010247, + "loss": 0.138, + "step": 5923 + }, + { + "epoch": 0.4274324470579747, + "grad_norm": 0.13494327664375305, + "learning_rate": 0.00018291528359070573, + "loss": 0.1809, + "step": 5924 + }, + { + "epoch": 0.4275045997330351, + "grad_norm": 0.10945824533700943, + "learning_rate": 0.000182912397171309, + "loss": 0.1499, + "step": 5925 + }, + { + "epoch": 0.4275767524080955, + "grad_norm": 0.11632286757230759, + "learning_rate": 0.00018290951075191226, + "loss": 0.1306, + "step": 5926 + }, + { + "epoch": 0.42764890508315595, + "grad_norm": 0.13610531389713287, + "learning_rate": 0.00018290662433251552, + "loss": 0.1616, + "step": 5927 + }, + { + "epoch": 0.4277210577582164, + "grad_norm": 0.18281017243862152, + "learning_rate": 0.00018290373791311879, + "loss": 0.1488, + "step": 5928 + }, + { + "epoch": 0.42779321043327684, + "grad_norm": 0.13510634005069733, + "learning_rate": 0.00018290085149372205, + "loss": 0.1588, + "step": 5929 + }, + { + "epoch": 0.4278653631083372, + "grad_norm": 0.12557633221149445, + "learning_rate": 0.0001828979650743253, + "loss": 0.1846, + "step": 5930 + }, + { + "epoch": 0.42793751578339767, + "grad_norm": 0.11874547600746155, + "learning_rate": 0.00018289507865492857, + "loss": 0.1583, + "step": 5931 + }, + { + "epoch": 0.4280096684584581, + "grad_norm": 0.10396189242601395, + "learning_rate": 0.0001828921922355318, + "loss": 0.1409, + "step": 5932 + }, + { + "epoch": 0.4280818211335185, + "grad_norm": 0.10849086195230484, + "learning_rate": 0.0001828893058161351, + "loss": 0.1719, + "step": 5933 + }, + { + "epoch": 0.42815397380857895, + "grad_norm": 0.12240342795848846, + "learning_rate": 0.00018288641939673836, + "loss": 0.1992, + "step": 5934 + }, + { + "epoch": 0.4282261264836394, + "grad_norm": 0.10966840386390686, + "learning_rate": 0.00018288353297734163, + "loss": 0.1009, + "step": 5935 + }, + { + "epoch": 0.42829827915869984, + "grad_norm": 0.09783158451318741, + "learning_rate": 0.0001828806465579449, + "loss": 0.1323, + "step": 5936 + }, + { + "epoch": 0.4283704318337602, + "grad_norm": 0.0975455790758133, + "learning_rate": 0.00018287776013854813, + "loss": 0.202, + "step": 5937 + }, + { + "epoch": 0.42844258450882067, + "grad_norm": 0.11488337814807892, + "learning_rate": 0.0001828748737191514, + "loss": 0.1367, + "step": 5938 + }, + { + "epoch": 0.4285147371838811, + "grad_norm": 0.11133381724357605, + "learning_rate": 0.00018287198729975465, + "loss": 0.1793, + "step": 5939 + }, + { + "epoch": 0.4285868898589415, + "grad_norm": 0.10274504870176315, + "learning_rate": 0.00018286910088035794, + "loss": 0.1471, + "step": 5940 + }, + { + "epoch": 0.42865904253400194, + "grad_norm": 0.12160360813140869, + "learning_rate": 0.0001828662144609612, + "loss": 0.166, + "step": 5941 + }, + { + "epoch": 0.4287311952090624, + "grad_norm": 0.10171741247177124, + "learning_rate": 0.00018286332804156444, + "loss": 0.123, + "step": 5942 + }, + { + "epoch": 0.4288033478841228, + "grad_norm": 0.13317373394966125, + "learning_rate": 0.0001828604416221677, + "loss": 0.1214, + "step": 5943 + }, + { + "epoch": 0.4288755005591832, + "grad_norm": 0.10292576253414154, + "learning_rate": 0.00018285755520277097, + "loss": 0.1328, + "step": 5944 + }, + { + "epoch": 0.42894765323424366, + "grad_norm": 0.15207763016223907, + "learning_rate": 0.00018285466878337423, + "loss": 0.1883, + "step": 5945 + }, + { + "epoch": 0.4290198059093041, + "grad_norm": 0.10962386429309845, + "learning_rate": 0.0001828517823639775, + "loss": 0.1393, + "step": 5946 + }, + { + "epoch": 0.4290919585843645, + "grad_norm": 0.12100953608751297, + "learning_rate": 0.00018284889594458075, + "loss": 0.142, + "step": 5947 + }, + { + "epoch": 0.42916411125942494, + "grad_norm": 0.11007843911647797, + "learning_rate": 0.00018284600952518402, + "loss": 0.1595, + "step": 5948 + }, + { + "epoch": 0.4292362639344854, + "grad_norm": 0.15275146067142487, + "learning_rate": 0.00018284312310578728, + "loss": 0.117, + "step": 5949 + }, + { + "epoch": 0.4293084166095458, + "grad_norm": 0.13427838683128357, + "learning_rate": 0.00018284023668639054, + "loss": 0.1629, + "step": 5950 + }, + { + "epoch": 0.4293805692846062, + "grad_norm": 0.1632755845785141, + "learning_rate": 0.0001828373502669938, + "loss": 0.1591, + "step": 5951 + }, + { + "epoch": 0.42945272195966666, + "grad_norm": 0.1164804995059967, + "learning_rate": 0.00018283446384759707, + "loss": 0.2073, + "step": 5952 + }, + { + "epoch": 0.4295248746347271, + "grad_norm": 0.12820562720298767, + "learning_rate": 0.0001828315774282003, + "loss": 0.1669, + "step": 5953 + }, + { + "epoch": 0.4295970273097875, + "grad_norm": 0.13334399461746216, + "learning_rate": 0.0001828286910088036, + "loss": 0.1461, + "step": 5954 + }, + { + "epoch": 0.42966917998484794, + "grad_norm": 0.12655992805957794, + "learning_rate": 0.00018282580458940686, + "loss": 0.1332, + "step": 5955 + }, + { + "epoch": 0.4297413326599084, + "grad_norm": 0.12284820526838303, + "learning_rate": 0.00018282291817001012, + "loss": 0.154, + "step": 5956 + }, + { + "epoch": 0.42981348533496877, + "grad_norm": 0.10828308761119843, + "learning_rate": 0.00018282003175061338, + "loss": 0.1576, + "step": 5957 + }, + { + "epoch": 0.4298856380100292, + "grad_norm": 0.12627825140953064, + "learning_rate": 0.00018281714533121662, + "loss": 0.1195, + "step": 5958 + }, + { + "epoch": 0.42995779068508966, + "grad_norm": 0.13220927119255066, + "learning_rate": 0.00018281425891181988, + "loss": 0.1654, + "step": 5959 + }, + { + "epoch": 0.4300299433601501, + "grad_norm": 0.10761047154664993, + "learning_rate": 0.00018281137249242315, + "loss": 0.1684, + "step": 5960 + }, + { + "epoch": 0.4301020960352105, + "grad_norm": 0.1338089257478714, + "learning_rate": 0.00018280848607302644, + "loss": 0.163, + "step": 5961 + }, + { + "epoch": 0.43017424871027093, + "grad_norm": 0.13918466866016388, + "learning_rate": 0.0001828055996536297, + "loss": 0.182, + "step": 5962 + }, + { + "epoch": 0.4302464013853314, + "grad_norm": 0.11007937788963318, + "learning_rate": 0.00018280271323423293, + "loss": 0.1725, + "step": 5963 + }, + { + "epoch": 0.43031855406039177, + "grad_norm": 0.11286143213510513, + "learning_rate": 0.0001827998268148362, + "loss": 0.1548, + "step": 5964 + }, + { + "epoch": 0.4303907067354522, + "grad_norm": 0.10858482122421265, + "learning_rate": 0.00018279694039543946, + "loss": 0.1773, + "step": 5965 + }, + { + "epoch": 0.43046285941051265, + "grad_norm": 0.13632416725158691, + "learning_rate": 0.00018279405397604272, + "loss": 0.1765, + "step": 5966 + }, + { + "epoch": 0.4305350120855731, + "grad_norm": 0.11998841166496277, + "learning_rate": 0.00018279116755664599, + "loss": 0.1726, + "step": 5967 + }, + { + "epoch": 0.4306071647606335, + "grad_norm": 0.09694402664899826, + "learning_rate": 0.00018278828113724925, + "loss": 0.1676, + "step": 5968 + }, + { + "epoch": 0.43067931743569393, + "grad_norm": 0.11320126056671143, + "learning_rate": 0.0001827853947178525, + "loss": 0.1574, + "step": 5969 + }, + { + "epoch": 0.4307514701107544, + "grad_norm": 0.11799290776252747, + "learning_rate": 0.00018278250829845577, + "loss": 0.1672, + "step": 5970 + }, + { + "epoch": 0.43082362278581476, + "grad_norm": 0.11785417050123215, + "learning_rate": 0.00018277962187905904, + "loss": 0.1412, + "step": 5971 + }, + { + "epoch": 0.4308957754608752, + "grad_norm": 0.11125194281339645, + "learning_rate": 0.0001827767354596623, + "loss": 0.1512, + "step": 5972 + }, + { + "epoch": 0.43096792813593565, + "grad_norm": 0.12699389457702637, + "learning_rate": 0.00018277384904026556, + "loss": 0.137, + "step": 5973 + }, + { + "epoch": 0.43104008081099604, + "grad_norm": 0.11820469051599503, + "learning_rate": 0.00018277096262086883, + "loss": 0.1464, + "step": 5974 + }, + { + "epoch": 0.4311122334860565, + "grad_norm": 0.10742323845624924, + "learning_rate": 0.00018276807620147206, + "loss": 0.1288, + "step": 5975 + }, + { + "epoch": 0.43118438616111693, + "grad_norm": 0.11058198660612106, + "learning_rate": 0.00018276518978207535, + "loss": 0.218, + "step": 5976 + }, + { + "epoch": 0.43125653883617737, + "grad_norm": 0.1054624542593956, + "learning_rate": 0.00018276230336267862, + "loss": 0.1621, + "step": 5977 + }, + { + "epoch": 0.43132869151123776, + "grad_norm": 0.10926330089569092, + "learning_rate": 0.00018275941694328188, + "loss": 0.192, + "step": 5978 + }, + { + "epoch": 0.4314008441862982, + "grad_norm": 0.12095009535551071, + "learning_rate": 0.00018275653052388514, + "loss": 0.1717, + "step": 5979 + }, + { + "epoch": 0.43147299686135865, + "grad_norm": 0.14626653492450714, + "learning_rate": 0.00018275364410448838, + "loss": 0.1831, + "step": 5980 + }, + { + "epoch": 0.43154514953641904, + "grad_norm": 0.11830244213342667, + "learning_rate": 0.00018275075768509164, + "loss": 0.1632, + "step": 5981 + }, + { + "epoch": 0.4316173022114795, + "grad_norm": 0.105450838804245, + "learning_rate": 0.0001827478712656949, + "loss": 0.1451, + "step": 5982 + }, + { + "epoch": 0.4316894548865399, + "grad_norm": 0.12555789947509766, + "learning_rate": 0.0001827449848462982, + "loss": 0.2005, + "step": 5983 + }, + { + "epoch": 0.43176160756160037, + "grad_norm": 0.11613515764474869, + "learning_rate": 0.00018274209842690146, + "loss": 0.1313, + "step": 5984 + }, + { + "epoch": 0.43183376023666076, + "grad_norm": 0.11850589513778687, + "learning_rate": 0.0001827392120075047, + "loss": 0.172, + "step": 5985 + }, + { + "epoch": 0.4319059129117212, + "grad_norm": 0.09376665949821472, + "learning_rate": 0.00018273632558810795, + "loss": 0.1635, + "step": 5986 + }, + { + "epoch": 0.43197806558678165, + "grad_norm": 0.12785853445529938, + "learning_rate": 0.00018273343916871122, + "loss": 0.1612, + "step": 5987 + }, + { + "epoch": 0.43205021826184203, + "grad_norm": 0.16476839780807495, + "learning_rate": 0.00018273055274931448, + "loss": 0.1906, + "step": 5988 + }, + { + "epoch": 0.4321223709369025, + "grad_norm": 0.15365070104599, + "learning_rate": 0.00018272766632991774, + "loss": 0.1283, + "step": 5989 + }, + { + "epoch": 0.4321945236119629, + "grad_norm": 0.10299073159694672, + "learning_rate": 0.000182724779910521, + "loss": 0.1787, + "step": 5990 + }, + { + "epoch": 0.43226667628702337, + "grad_norm": 0.13909496366977692, + "learning_rate": 0.00018272189349112427, + "loss": 0.1656, + "step": 5991 + }, + { + "epoch": 0.43233882896208375, + "grad_norm": 0.10549302399158478, + "learning_rate": 0.00018271900707172753, + "loss": 0.1566, + "step": 5992 + }, + { + "epoch": 0.4324109816371442, + "grad_norm": 0.15905825793743134, + "learning_rate": 0.0001827161206523308, + "loss": 0.1702, + "step": 5993 + }, + { + "epoch": 0.43248313431220464, + "grad_norm": 0.13359205424785614, + "learning_rate": 0.00018271323423293406, + "loss": 0.1221, + "step": 5994 + }, + { + "epoch": 0.43255528698726503, + "grad_norm": 0.14784115552902222, + "learning_rate": 0.00018271034781353732, + "loss": 0.1176, + "step": 5995 + }, + { + "epoch": 0.4326274396623255, + "grad_norm": 0.1282242238521576, + "learning_rate": 0.00018270746139414056, + "loss": 0.137, + "step": 5996 + }, + { + "epoch": 0.4326995923373859, + "grad_norm": 0.11883100867271423, + "learning_rate": 0.00018270457497474385, + "loss": 0.1569, + "step": 5997 + }, + { + "epoch": 0.43277174501244636, + "grad_norm": 0.1250883787870407, + "learning_rate": 0.0001827016885553471, + "loss": 0.1626, + "step": 5998 + }, + { + "epoch": 0.43284389768750675, + "grad_norm": 0.11436107754707336, + "learning_rate": 0.00018269880213595037, + "loss": 0.1389, + "step": 5999 + }, + { + "epoch": 0.4329160503625672, + "grad_norm": 0.1631336808204651, + "learning_rate": 0.00018269591571655364, + "loss": 0.1491, + "step": 6000 + }, + { + "epoch": 0.43298820303762764, + "grad_norm": 0.1344766467809677, + "learning_rate": 0.00018269302929715687, + "loss": 0.1315, + "step": 6001 + }, + { + "epoch": 0.433060355712688, + "grad_norm": 0.10305652767419815, + "learning_rate": 0.00018269014287776013, + "loss": 0.1788, + "step": 6002 + }, + { + "epoch": 0.43313250838774847, + "grad_norm": 0.11222297698259354, + "learning_rate": 0.0001826872564583634, + "loss": 0.1766, + "step": 6003 + }, + { + "epoch": 0.4332046610628089, + "grad_norm": 0.12760218977928162, + "learning_rate": 0.0001826843700389667, + "loss": 0.1388, + "step": 6004 + }, + { + "epoch": 0.4332768137378693, + "grad_norm": 0.1565997153520584, + "learning_rate": 0.00018268148361956995, + "loss": 0.1935, + "step": 6005 + }, + { + "epoch": 0.43334896641292975, + "grad_norm": 0.10120268166065216, + "learning_rate": 0.00018267859720017319, + "loss": 0.1646, + "step": 6006 + }, + { + "epoch": 0.4334211190879902, + "grad_norm": 0.09831411391496658, + "learning_rate": 0.00018267571078077645, + "loss": 0.1495, + "step": 6007 + }, + { + "epoch": 0.43349327176305064, + "grad_norm": 0.0985429584980011, + "learning_rate": 0.0001826728243613797, + "loss": 0.1229, + "step": 6008 + }, + { + "epoch": 0.433565424438111, + "grad_norm": 0.13200056552886963, + "learning_rate": 0.00018266993794198297, + "loss": 0.1641, + "step": 6009 + }, + { + "epoch": 0.43363757711317147, + "grad_norm": 0.1078602522611618, + "learning_rate": 0.00018266705152258624, + "loss": 0.1545, + "step": 6010 + }, + { + "epoch": 0.4337097297882319, + "grad_norm": 0.1463153064250946, + "learning_rate": 0.0001826641651031895, + "loss": 0.1629, + "step": 6011 + }, + { + "epoch": 0.4337818824632923, + "grad_norm": 0.12757614254951477, + "learning_rate": 0.00018266127868379276, + "loss": 0.1285, + "step": 6012 + }, + { + "epoch": 0.43385403513835274, + "grad_norm": 0.10534749180078506, + "learning_rate": 0.00018265839226439603, + "loss": 0.1294, + "step": 6013 + }, + { + "epoch": 0.4339261878134132, + "grad_norm": 0.11838100850582123, + "learning_rate": 0.0001826555058449993, + "loss": 0.2004, + "step": 6014 + }, + { + "epoch": 0.43399834048847363, + "grad_norm": 0.1322600394487381, + "learning_rate": 0.00018265261942560255, + "loss": 0.15, + "step": 6015 + }, + { + "epoch": 0.434070493163534, + "grad_norm": 0.10637901723384857, + "learning_rate": 0.00018264973300620581, + "loss": 0.1582, + "step": 6016 + }, + { + "epoch": 0.43414264583859447, + "grad_norm": 0.1292022168636322, + "learning_rate": 0.00018264684658680905, + "loss": 0.1546, + "step": 6017 + }, + { + "epoch": 0.4342147985136549, + "grad_norm": 0.11601456254720688, + "learning_rate": 0.00018264396016741234, + "loss": 0.2267, + "step": 6018 + }, + { + "epoch": 0.4342869511887153, + "grad_norm": 0.12072426825761795, + "learning_rate": 0.0001826410737480156, + "loss": 0.197, + "step": 6019 + }, + { + "epoch": 0.43435910386377574, + "grad_norm": 0.10457204282283783, + "learning_rate": 0.00018263818732861887, + "loss": 0.1452, + "step": 6020 + }, + { + "epoch": 0.4344312565388362, + "grad_norm": 0.11974366754293442, + "learning_rate": 0.00018263530090922213, + "loss": 0.1394, + "step": 6021 + }, + { + "epoch": 0.43450340921389663, + "grad_norm": 0.12165110558271408, + "learning_rate": 0.00018263241448982537, + "loss": 0.1692, + "step": 6022 + }, + { + "epoch": 0.434575561888957, + "grad_norm": 0.13369393348693848, + "learning_rate": 0.00018262952807042863, + "loss": 0.1567, + "step": 6023 + }, + { + "epoch": 0.43464771456401746, + "grad_norm": 0.1187087744474411, + "learning_rate": 0.0001826266416510319, + "loss": 0.141, + "step": 6024 + }, + { + "epoch": 0.4347198672390779, + "grad_norm": 0.10499470680952072, + "learning_rate": 0.00018262375523163518, + "loss": 0.1764, + "step": 6025 + }, + { + "epoch": 0.4347920199141383, + "grad_norm": 0.10115773975849152, + "learning_rate": 0.00018262086881223844, + "loss": 0.1727, + "step": 6026 + }, + { + "epoch": 0.43486417258919874, + "grad_norm": 0.10905278474092484, + "learning_rate": 0.00018261798239284168, + "loss": 0.12, + "step": 6027 + }, + { + "epoch": 0.4349363252642592, + "grad_norm": 0.1212361603975296, + "learning_rate": 0.00018261509597344494, + "loss": 0.1583, + "step": 6028 + }, + { + "epoch": 0.4350084779393196, + "grad_norm": 0.10275877267122269, + "learning_rate": 0.0001826122095540482, + "loss": 0.1231, + "step": 6029 + }, + { + "epoch": 0.43508063061438, + "grad_norm": 0.120490163564682, + "learning_rate": 0.00018260932313465147, + "loss": 0.1782, + "step": 6030 + }, + { + "epoch": 0.43515278328944046, + "grad_norm": 0.12808160483837128, + "learning_rate": 0.00018260643671525473, + "loss": 0.1667, + "step": 6031 + }, + { + "epoch": 0.4352249359645009, + "grad_norm": 0.13066217303276062, + "learning_rate": 0.000182603550295858, + "loss": 0.1496, + "step": 6032 + }, + { + "epoch": 0.4352970886395613, + "grad_norm": 0.13443118333816528, + "learning_rate": 0.00018260066387646126, + "loss": 0.1789, + "step": 6033 + }, + { + "epoch": 0.43536924131462174, + "grad_norm": 0.10961943864822388, + "learning_rate": 0.00018259777745706452, + "loss": 0.1879, + "step": 6034 + }, + { + "epoch": 0.4354413939896822, + "grad_norm": 0.12362266331911087, + "learning_rate": 0.00018259489103766778, + "loss": 0.1758, + "step": 6035 + }, + { + "epoch": 0.43551354666474257, + "grad_norm": 0.11460345983505249, + "learning_rate": 0.00018259200461827105, + "loss": 0.1837, + "step": 6036 + }, + { + "epoch": 0.435585699339803, + "grad_norm": 0.11664992570877075, + "learning_rate": 0.0001825891181988743, + "loss": 0.153, + "step": 6037 + }, + { + "epoch": 0.43565785201486346, + "grad_norm": 0.11473949253559113, + "learning_rate": 0.00018258623177947754, + "loss": 0.1829, + "step": 6038 + }, + { + "epoch": 0.4357300046899239, + "grad_norm": 0.13825523853302002, + "learning_rate": 0.00018258334536008083, + "loss": 0.2015, + "step": 6039 + }, + { + "epoch": 0.4358021573649843, + "grad_norm": 0.13293814659118652, + "learning_rate": 0.0001825804589406841, + "loss": 0.145, + "step": 6040 + }, + { + "epoch": 0.43587431004004473, + "grad_norm": 0.15453194081783295, + "learning_rate": 0.00018257757252128736, + "loss": 0.1654, + "step": 6041 + }, + { + "epoch": 0.4359464627151052, + "grad_norm": 0.10982448607683182, + "learning_rate": 0.00018257468610189062, + "loss": 0.1681, + "step": 6042 + }, + { + "epoch": 0.43601861539016556, + "grad_norm": 0.12576572597026825, + "learning_rate": 0.00018257179968249386, + "loss": 0.1335, + "step": 6043 + }, + { + "epoch": 0.436090768065226, + "grad_norm": 0.10711273550987244, + "learning_rate": 0.00018256891326309712, + "loss": 0.1419, + "step": 6044 + }, + { + "epoch": 0.43616292074028645, + "grad_norm": 0.13865168392658234, + "learning_rate": 0.00018256602684370039, + "loss": 0.1557, + "step": 6045 + }, + { + "epoch": 0.4362350734153469, + "grad_norm": 0.11145945638418198, + "learning_rate": 0.00018256314042430368, + "loss": 0.1484, + "step": 6046 + }, + { + "epoch": 0.4363072260904073, + "grad_norm": 0.10596998035907745, + "learning_rate": 0.00018256025400490694, + "loss": 0.1762, + "step": 6047 + }, + { + "epoch": 0.43637937876546773, + "grad_norm": 0.13695760071277618, + "learning_rate": 0.00018255736758551017, + "loss": 0.1452, + "step": 6048 + }, + { + "epoch": 0.4364515314405282, + "grad_norm": 0.10145651549100876, + "learning_rate": 0.00018255448116611344, + "loss": 0.1108, + "step": 6049 + }, + { + "epoch": 0.43652368411558856, + "grad_norm": 0.11442878842353821, + "learning_rate": 0.0001825515947467167, + "loss": 0.1243, + "step": 6050 + }, + { + "epoch": 0.436595836790649, + "grad_norm": 0.11627763509750366, + "learning_rate": 0.00018254870832731996, + "loss": 0.1899, + "step": 6051 + }, + { + "epoch": 0.43666798946570945, + "grad_norm": 0.10713426768779755, + "learning_rate": 0.00018254582190792323, + "loss": 0.1579, + "step": 6052 + }, + { + "epoch": 0.4367401421407699, + "grad_norm": 0.13007032871246338, + "learning_rate": 0.0001825429354885265, + "loss": 0.1612, + "step": 6053 + }, + { + "epoch": 0.4368122948158303, + "grad_norm": 0.11363731324672699, + "learning_rate": 0.00018254004906912975, + "loss": 0.151, + "step": 6054 + }, + { + "epoch": 0.4368844474908907, + "grad_norm": 0.13862450420856476, + "learning_rate": 0.00018253716264973301, + "loss": 0.1464, + "step": 6055 + }, + { + "epoch": 0.43695660016595117, + "grad_norm": 0.10363277792930603, + "learning_rate": 0.00018253427623033628, + "loss": 0.1355, + "step": 6056 + }, + { + "epoch": 0.43702875284101156, + "grad_norm": 0.11451687663793564, + "learning_rate": 0.00018253138981093954, + "loss": 0.1645, + "step": 6057 + }, + { + "epoch": 0.437100905516072, + "grad_norm": 0.15152929723262787, + "learning_rate": 0.0001825285033915428, + "loss": 0.2073, + "step": 6058 + }, + { + "epoch": 0.43717305819113245, + "grad_norm": 0.1593613475561142, + "learning_rate": 0.00018252561697214604, + "loss": 0.1389, + "step": 6059 + }, + { + "epoch": 0.4372452108661929, + "grad_norm": 0.17250584065914154, + "learning_rate": 0.00018252273055274933, + "loss": 0.1642, + "step": 6060 + }, + { + "epoch": 0.4373173635412533, + "grad_norm": 0.1319507211446762, + "learning_rate": 0.0001825198441333526, + "loss": 0.1548, + "step": 6061 + }, + { + "epoch": 0.4373895162163137, + "grad_norm": 0.12388844043016434, + "learning_rate": 0.00018251695771395586, + "loss": 0.1348, + "step": 6062 + }, + { + "epoch": 0.43746166889137417, + "grad_norm": 0.1052316427230835, + "learning_rate": 0.00018251407129455912, + "loss": 0.1584, + "step": 6063 + }, + { + "epoch": 0.43753382156643456, + "grad_norm": 0.11350063234567642, + "learning_rate": 0.00018251118487516235, + "loss": 0.1775, + "step": 6064 + }, + { + "epoch": 0.437605974241495, + "grad_norm": 0.10035805404186249, + "learning_rate": 0.00018250829845576562, + "loss": 0.1664, + "step": 6065 + }, + { + "epoch": 0.43767812691655544, + "grad_norm": 0.10310214757919312, + "learning_rate": 0.00018250541203636888, + "loss": 0.1095, + "step": 6066 + }, + { + "epoch": 0.43775027959161583, + "grad_norm": 0.12834404408931732, + "learning_rate": 0.00018250252561697217, + "loss": 0.1463, + "step": 6067 + }, + { + "epoch": 0.4378224322666763, + "grad_norm": 0.09869488328695297, + "learning_rate": 0.00018249963919757543, + "loss": 0.1348, + "step": 6068 + }, + { + "epoch": 0.4378945849417367, + "grad_norm": 0.18246905505657196, + "learning_rate": 0.00018249675277817867, + "loss": 0.1834, + "step": 6069 + }, + { + "epoch": 0.43796673761679716, + "grad_norm": 0.11811268329620361, + "learning_rate": 0.00018249386635878193, + "loss": 0.1358, + "step": 6070 + }, + { + "epoch": 0.43803889029185755, + "grad_norm": 0.1267288476228714, + "learning_rate": 0.0001824909799393852, + "loss": 0.1811, + "step": 6071 + }, + { + "epoch": 0.438111042966918, + "grad_norm": 0.10749071836471558, + "learning_rate": 0.00018248809351998846, + "loss": 0.1516, + "step": 6072 + }, + { + "epoch": 0.43818319564197844, + "grad_norm": 0.10894975066184998, + "learning_rate": 0.00018248520710059172, + "loss": 0.1908, + "step": 6073 + }, + { + "epoch": 0.43825534831703883, + "grad_norm": 0.11814959347248077, + "learning_rate": 0.00018248232068119498, + "loss": 0.1482, + "step": 6074 + }, + { + "epoch": 0.4383275009920993, + "grad_norm": 0.13490252196788788, + "learning_rate": 0.00018247943426179825, + "loss": 0.1527, + "step": 6075 + }, + { + "epoch": 0.4383996536671597, + "grad_norm": 0.12666769325733185, + "learning_rate": 0.0001824765478424015, + "loss": 0.1583, + "step": 6076 + }, + { + "epoch": 0.43847180634222016, + "grad_norm": 0.11454398930072784, + "learning_rate": 0.00018247366142300477, + "loss": 0.1389, + "step": 6077 + }, + { + "epoch": 0.43854395901728055, + "grad_norm": 0.1308351755142212, + "learning_rate": 0.00018247077500360803, + "loss": 0.1821, + "step": 6078 + }, + { + "epoch": 0.438616111692341, + "grad_norm": 0.11633557081222534, + "learning_rate": 0.0001824678885842113, + "loss": 0.1346, + "step": 6079 + }, + { + "epoch": 0.43868826436740144, + "grad_norm": 0.10918557643890381, + "learning_rate": 0.00018246500216481453, + "loss": 0.1244, + "step": 6080 + }, + { + "epoch": 0.4387604170424618, + "grad_norm": 0.10061082988977432, + "learning_rate": 0.00018246211574541782, + "loss": 0.124, + "step": 6081 + }, + { + "epoch": 0.43883256971752227, + "grad_norm": 0.10848748683929443, + "learning_rate": 0.00018245922932602109, + "loss": 0.16, + "step": 6082 + }, + { + "epoch": 0.4389047223925827, + "grad_norm": 0.10849925130605698, + "learning_rate": 0.00018245634290662435, + "loss": 0.1529, + "step": 6083 + }, + { + "epoch": 0.43897687506764316, + "grad_norm": 0.10635923594236374, + "learning_rate": 0.0001824534564872276, + "loss": 0.1929, + "step": 6084 + }, + { + "epoch": 0.43904902774270355, + "grad_norm": 0.11233007907867432, + "learning_rate": 0.00018245057006783088, + "loss": 0.1928, + "step": 6085 + }, + { + "epoch": 0.439121180417764, + "grad_norm": 0.11263249814510345, + "learning_rate": 0.0001824476836484341, + "loss": 0.1631, + "step": 6086 + }, + { + "epoch": 0.43919333309282443, + "grad_norm": 0.09963755309581757, + "learning_rate": 0.00018244479722903737, + "loss": 0.1246, + "step": 6087 + }, + { + "epoch": 0.4392654857678848, + "grad_norm": 0.11192876845598221, + "learning_rate": 0.00018244191080964066, + "loss": 0.1498, + "step": 6088 + }, + { + "epoch": 0.43933763844294527, + "grad_norm": 0.12412458658218384, + "learning_rate": 0.00018243902439024393, + "loss": 0.1314, + "step": 6089 + }, + { + "epoch": 0.4394097911180057, + "grad_norm": 0.14059120416641235, + "learning_rate": 0.0001824361379708472, + "loss": 0.186, + "step": 6090 + }, + { + "epoch": 0.43948194379306615, + "grad_norm": 0.115476593375206, + "learning_rate": 0.00018243325155145043, + "loss": 0.178, + "step": 6091 + }, + { + "epoch": 0.43955409646812654, + "grad_norm": 0.11097398400306702, + "learning_rate": 0.0001824303651320537, + "loss": 0.1784, + "step": 6092 + }, + { + "epoch": 0.439626249143187, + "grad_norm": 0.11092938482761383, + "learning_rate": 0.00018242747871265695, + "loss": 0.1745, + "step": 6093 + }, + { + "epoch": 0.43969840181824743, + "grad_norm": 0.11939523369073868, + "learning_rate": 0.00018242459229326021, + "loss": 0.1534, + "step": 6094 + }, + { + "epoch": 0.4397705544933078, + "grad_norm": 0.12466622143983841, + "learning_rate": 0.0001824217058738635, + "loss": 0.1458, + "step": 6095 + }, + { + "epoch": 0.43984270716836826, + "grad_norm": 0.12432552129030228, + "learning_rate": 0.00018241881945446674, + "loss": 0.2048, + "step": 6096 + }, + { + "epoch": 0.4399148598434287, + "grad_norm": 0.10610135644674301, + "learning_rate": 0.00018241593303507, + "loss": 0.1668, + "step": 6097 + }, + { + "epoch": 0.4399870125184891, + "grad_norm": 0.12429031729698181, + "learning_rate": 0.00018241304661567327, + "loss": 0.1618, + "step": 6098 + }, + { + "epoch": 0.44005916519354954, + "grad_norm": 0.11785822361707687, + "learning_rate": 0.00018241016019627653, + "loss": 0.1588, + "step": 6099 + }, + { + "epoch": 0.44013131786861, + "grad_norm": 0.12341566383838654, + "learning_rate": 0.0001824072737768798, + "loss": 0.2115, + "step": 6100 + }, + { + "epoch": 0.4402034705436704, + "grad_norm": 0.12034764885902405, + "learning_rate": 0.00018240438735748305, + "loss": 0.1269, + "step": 6101 + }, + { + "epoch": 0.4402756232187308, + "grad_norm": 0.14521610736846924, + "learning_rate": 0.00018240150093808632, + "loss": 0.1682, + "step": 6102 + }, + { + "epoch": 0.44034777589379126, + "grad_norm": 0.1044413149356842, + "learning_rate": 0.00018239861451868958, + "loss": 0.1418, + "step": 6103 + }, + { + "epoch": 0.4404199285688517, + "grad_norm": 0.10452799499034882, + "learning_rate": 0.00018239572809929284, + "loss": 0.1366, + "step": 6104 + }, + { + "epoch": 0.4404920812439121, + "grad_norm": 0.08668769896030426, + "learning_rate": 0.0001823928416798961, + "loss": 0.1589, + "step": 6105 + }, + { + "epoch": 0.44056423391897254, + "grad_norm": 0.11417484283447266, + "learning_rate": 0.00018238995526049937, + "loss": 0.1404, + "step": 6106 + }, + { + "epoch": 0.440636386594033, + "grad_norm": 0.10670206695795059, + "learning_rate": 0.0001823870688411026, + "loss": 0.1395, + "step": 6107 + }, + { + "epoch": 0.4407085392690934, + "grad_norm": 0.12471598386764526, + "learning_rate": 0.00018238418242170587, + "loss": 0.1505, + "step": 6108 + }, + { + "epoch": 0.4407806919441538, + "grad_norm": 0.1277986317873001, + "learning_rate": 0.00018238129600230916, + "loss": 0.1165, + "step": 6109 + }, + { + "epoch": 0.44085284461921426, + "grad_norm": 0.1507466435432434, + "learning_rate": 0.00018237840958291242, + "loss": 0.202, + "step": 6110 + }, + { + "epoch": 0.4409249972942747, + "grad_norm": 0.10197729617357254, + "learning_rate": 0.00018237552316351568, + "loss": 0.1808, + "step": 6111 + }, + { + "epoch": 0.4409971499693351, + "grad_norm": 0.1200295016169548, + "learning_rate": 0.00018237263674411892, + "loss": 0.1685, + "step": 6112 + }, + { + "epoch": 0.44106930264439553, + "grad_norm": 0.12193494290113449, + "learning_rate": 0.00018236975032472218, + "loss": 0.1966, + "step": 6113 + }, + { + "epoch": 0.441141455319456, + "grad_norm": 0.11754129081964493, + "learning_rate": 0.00018236686390532545, + "loss": 0.1081, + "step": 6114 + }, + { + "epoch": 0.4412136079945164, + "grad_norm": 0.1152927577495575, + "learning_rate": 0.0001823639774859287, + "loss": 0.1402, + "step": 6115 + }, + { + "epoch": 0.4412857606695768, + "grad_norm": 0.12563861906528473, + "learning_rate": 0.000182361091066532, + "loss": 0.1531, + "step": 6116 + }, + { + "epoch": 0.44135791334463725, + "grad_norm": 0.1496850550174713, + "learning_rate": 0.00018235820464713523, + "loss": 0.1327, + "step": 6117 + }, + { + "epoch": 0.4414300660196977, + "grad_norm": 0.11527600139379501, + "learning_rate": 0.0001823553182277385, + "loss": 0.1886, + "step": 6118 + }, + { + "epoch": 0.4415022186947581, + "grad_norm": 0.12287785112857819, + "learning_rate": 0.00018235243180834176, + "loss": 0.1504, + "step": 6119 + }, + { + "epoch": 0.44157437136981853, + "grad_norm": 0.11043699085712433, + "learning_rate": 0.00018234954538894502, + "loss": 0.177, + "step": 6120 + }, + { + "epoch": 0.441646524044879, + "grad_norm": 0.10308093577623367, + "learning_rate": 0.00018234665896954829, + "loss": 0.1435, + "step": 6121 + }, + { + "epoch": 0.4417186767199394, + "grad_norm": 0.13625788688659668, + "learning_rate": 0.00018234377255015155, + "loss": 0.1695, + "step": 6122 + }, + { + "epoch": 0.4417908293949998, + "grad_norm": 0.17083428800106049, + "learning_rate": 0.0001823408861307548, + "loss": 0.117, + "step": 6123 + }, + { + "epoch": 0.44186298207006025, + "grad_norm": 0.10584522783756256, + "learning_rate": 0.00018233799971135807, + "loss": 0.1568, + "step": 6124 + }, + { + "epoch": 0.4419351347451207, + "grad_norm": 0.11779288947582245, + "learning_rate": 0.00018233511329196134, + "loss": 0.1604, + "step": 6125 + }, + { + "epoch": 0.4420072874201811, + "grad_norm": 0.12526173889636993, + "learning_rate": 0.0001823322268725646, + "loss": 0.1249, + "step": 6126 + }, + { + "epoch": 0.4420794400952415, + "grad_norm": 0.1471685767173767, + "learning_rate": 0.00018232934045316786, + "loss": 0.1559, + "step": 6127 + }, + { + "epoch": 0.44215159277030197, + "grad_norm": 0.14475755393505096, + "learning_rate": 0.0001823264540337711, + "loss": 0.1228, + "step": 6128 + }, + { + "epoch": 0.44222374544536236, + "grad_norm": 0.1287613958120346, + "learning_rate": 0.00018232356761437436, + "loss": 0.1143, + "step": 6129 + }, + { + "epoch": 0.4422958981204228, + "grad_norm": 0.11104518175125122, + "learning_rate": 0.00018232068119497765, + "loss": 0.152, + "step": 6130 + }, + { + "epoch": 0.44236805079548325, + "grad_norm": 0.12088888138532639, + "learning_rate": 0.00018231779477558092, + "loss": 0.1241, + "step": 6131 + }, + { + "epoch": 0.4424402034705437, + "grad_norm": 0.1110999584197998, + "learning_rate": 0.00018231490835618418, + "loss": 0.1359, + "step": 6132 + }, + { + "epoch": 0.4425123561456041, + "grad_norm": 0.10359269380569458, + "learning_rate": 0.00018231202193678741, + "loss": 0.1584, + "step": 6133 + }, + { + "epoch": 0.4425845088206645, + "grad_norm": 0.10348071157932281, + "learning_rate": 0.00018230913551739068, + "loss": 0.1518, + "step": 6134 + }, + { + "epoch": 0.44265666149572497, + "grad_norm": 0.11287201941013336, + "learning_rate": 0.00018230624909799394, + "loss": 0.118, + "step": 6135 + }, + { + "epoch": 0.44272881417078536, + "grad_norm": 0.0962667316198349, + "learning_rate": 0.0001823033626785972, + "loss": 0.1416, + "step": 6136 + }, + { + "epoch": 0.4428009668458458, + "grad_norm": 0.1272866427898407, + "learning_rate": 0.0001823004762592005, + "loss": 0.1781, + "step": 6137 + }, + { + "epoch": 0.44287311952090624, + "grad_norm": 0.10131068527698517, + "learning_rate": 0.00018229758983980373, + "loss": 0.1632, + "step": 6138 + }, + { + "epoch": 0.4429452721959667, + "grad_norm": 0.12730516493320465, + "learning_rate": 0.000182294703420407, + "loss": 0.1483, + "step": 6139 + }, + { + "epoch": 0.4430174248710271, + "grad_norm": 0.14410527050495148, + "learning_rate": 0.00018229181700101025, + "loss": 0.2395, + "step": 6140 + }, + { + "epoch": 0.4430895775460875, + "grad_norm": 0.11683524399995804, + "learning_rate": 0.00018228893058161352, + "loss": 0.1283, + "step": 6141 + }, + { + "epoch": 0.44316173022114796, + "grad_norm": 0.11929294466972351, + "learning_rate": 0.00018228604416221678, + "loss": 0.1416, + "step": 6142 + }, + { + "epoch": 0.44323388289620835, + "grad_norm": 0.10018768161535263, + "learning_rate": 0.00018228315774282004, + "loss": 0.193, + "step": 6143 + }, + { + "epoch": 0.4433060355712688, + "grad_norm": 0.12105677276849747, + "learning_rate": 0.0001822802713234233, + "loss": 0.1476, + "step": 6144 + }, + { + "epoch": 0.44337818824632924, + "grad_norm": 0.11790598183870316, + "learning_rate": 0.00018227738490402657, + "loss": 0.1682, + "step": 6145 + }, + { + "epoch": 0.4434503409213897, + "grad_norm": 0.1322735697031021, + "learning_rate": 0.00018227449848462983, + "loss": 0.1232, + "step": 6146 + }, + { + "epoch": 0.4435224935964501, + "grad_norm": 0.10716860741376877, + "learning_rate": 0.0001822716120652331, + "loss": 0.1455, + "step": 6147 + }, + { + "epoch": 0.4435946462715105, + "grad_norm": 0.10606841742992401, + "learning_rate": 0.00018226872564583636, + "loss": 0.1664, + "step": 6148 + }, + { + "epoch": 0.44366679894657096, + "grad_norm": 0.11973252892494202, + "learning_rate": 0.0001822658392264396, + "loss": 0.1208, + "step": 6149 + }, + { + "epoch": 0.44373895162163135, + "grad_norm": 0.11278528720140457, + "learning_rate": 0.00018226295280704286, + "loss": 0.1624, + "step": 6150 + }, + { + "epoch": 0.4438111042966918, + "grad_norm": 0.11765396595001221, + "learning_rate": 0.00018226006638764615, + "loss": 0.1465, + "step": 6151 + }, + { + "epoch": 0.44388325697175224, + "grad_norm": 0.11731109768152237, + "learning_rate": 0.0001822571799682494, + "loss": 0.1774, + "step": 6152 + }, + { + "epoch": 0.4439554096468127, + "grad_norm": 0.09984837472438812, + "learning_rate": 0.00018225429354885267, + "loss": 0.1207, + "step": 6153 + }, + { + "epoch": 0.44402756232187307, + "grad_norm": 0.13035528361797333, + "learning_rate": 0.0001822514071294559, + "loss": 0.1497, + "step": 6154 + }, + { + "epoch": 0.4440997149969335, + "grad_norm": 0.14929358661174774, + "learning_rate": 0.00018224852071005917, + "loss": 0.2084, + "step": 6155 + }, + { + "epoch": 0.44417186767199396, + "grad_norm": 0.11625222861766815, + "learning_rate": 0.00018224563429066243, + "loss": 0.1745, + "step": 6156 + }, + { + "epoch": 0.44424402034705435, + "grad_norm": 0.1217883750796318, + "learning_rate": 0.0001822427478712657, + "loss": 0.1987, + "step": 6157 + }, + { + "epoch": 0.4443161730221148, + "grad_norm": 0.10432552546262741, + "learning_rate": 0.000182239861451869, + "loss": 0.1421, + "step": 6158 + }, + { + "epoch": 0.44438832569717523, + "grad_norm": 0.12206144630908966, + "learning_rate": 0.00018223697503247222, + "loss": 0.1148, + "step": 6159 + }, + { + "epoch": 0.4444604783722356, + "grad_norm": 0.0915931984782219, + "learning_rate": 0.00018223408861307549, + "loss": 0.1694, + "step": 6160 + }, + { + "epoch": 0.44453263104729607, + "grad_norm": 0.11894541233778, + "learning_rate": 0.00018223120219367875, + "loss": 0.1654, + "step": 6161 + }, + { + "epoch": 0.4446047837223565, + "grad_norm": 0.10530544072389603, + "learning_rate": 0.000182228315774282, + "loss": 0.0963, + "step": 6162 + }, + { + "epoch": 0.44467693639741696, + "grad_norm": 0.14129701256752014, + "learning_rate": 0.00018222542935488527, + "loss": 0.1353, + "step": 6163 + }, + { + "epoch": 0.44474908907247734, + "grad_norm": 0.11482277512550354, + "learning_rate": 0.00018222254293548854, + "loss": 0.1608, + "step": 6164 + }, + { + "epoch": 0.4448212417475378, + "grad_norm": 0.13121692836284637, + "learning_rate": 0.0001822196565160918, + "loss": 0.1484, + "step": 6165 + }, + { + "epoch": 0.44489339442259823, + "grad_norm": 0.13311266899108887, + "learning_rate": 0.00018221677009669506, + "loss": 0.1761, + "step": 6166 + }, + { + "epoch": 0.4449655470976586, + "grad_norm": 0.10540780425071716, + "learning_rate": 0.00018221388367729833, + "loss": 0.1455, + "step": 6167 + }, + { + "epoch": 0.44503769977271906, + "grad_norm": 0.11652028560638428, + "learning_rate": 0.0001822109972579016, + "loss": 0.1911, + "step": 6168 + }, + { + "epoch": 0.4451098524477795, + "grad_norm": 0.11527334898710251, + "learning_rate": 0.00018220811083850485, + "loss": 0.1621, + "step": 6169 + }, + { + "epoch": 0.44518200512283995, + "grad_norm": 0.11917366087436676, + "learning_rate": 0.0001822052244191081, + "loss": 0.1535, + "step": 6170 + }, + { + "epoch": 0.44525415779790034, + "grad_norm": 0.10209325700998306, + "learning_rate": 0.00018220233799971135, + "loss": 0.1344, + "step": 6171 + }, + { + "epoch": 0.4453263104729608, + "grad_norm": 0.10349535197019577, + "learning_rate": 0.00018219945158031461, + "loss": 0.1509, + "step": 6172 + }, + { + "epoch": 0.44539846314802123, + "grad_norm": 0.12221920490264893, + "learning_rate": 0.0001821965651609179, + "loss": 0.1399, + "step": 6173 + }, + { + "epoch": 0.4454706158230816, + "grad_norm": 0.1287602335214615, + "learning_rate": 0.00018219367874152117, + "loss": 0.1538, + "step": 6174 + }, + { + "epoch": 0.44554276849814206, + "grad_norm": 0.13449262082576752, + "learning_rate": 0.0001821907923221244, + "loss": 0.1422, + "step": 6175 + }, + { + "epoch": 0.4456149211732025, + "grad_norm": 0.10923946648836136, + "learning_rate": 0.00018218790590272767, + "loss": 0.1438, + "step": 6176 + }, + { + "epoch": 0.44568707384826295, + "grad_norm": 0.10191056877374649, + "learning_rate": 0.00018218501948333093, + "loss": 0.1424, + "step": 6177 + }, + { + "epoch": 0.44575922652332334, + "grad_norm": 0.11073064804077148, + "learning_rate": 0.0001821821330639342, + "loss": 0.1396, + "step": 6178 + }, + { + "epoch": 0.4458313791983838, + "grad_norm": 0.14307044446468353, + "learning_rate": 0.00018217924664453745, + "loss": 0.164, + "step": 6179 + }, + { + "epoch": 0.4459035318734442, + "grad_norm": 0.1324271410703659, + "learning_rate": 0.00018217636022514072, + "loss": 0.1488, + "step": 6180 + }, + { + "epoch": 0.4459756845485046, + "grad_norm": 0.15620259940624237, + "learning_rate": 0.00018217347380574398, + "loss": 0.2011, + "step": 6181 + }, + { + "epoch": 0.44604783722356506, + "grad_norm": 0.1245202124118805, + "learning_rate": 0.00018217058738634724, + "loss": 0.1461, + "step": 6182 + }, + { + "epoch": 0.4461199898986255, + "grad_norm": 0.12234611064195633, + "learning_rate": 0.0001821677009669505, + "loss": 0.1931, + "step": 6183 + }, + { + "epoch": 0.44619214257368595, + "grad_norm": 0.10709469765424728, + "learning_rate": 0.00018216481454755377, + "loss": 0.1523, + "step": 6184 + }, + { + "epoch": 0.44626429524874633, + "grad_norm": 0.10262808948755264, + "learning_rate": 0.00018216192812815703, + "loss": 0.1532, + "step": 6185 + }, + { + "epoch": 0.4463364479238068, + "grad_norm": 0.12416316568851471, + "learning_rate": 0.00018215904170876027, + "loss": 0.1458, + "step": 6186 + }, + { + "epoch": 0.4464086005988672, + "grad_norm": 0.12517490983009338, + "learning_rate": 0.00018215615528936356, + "loss": 0.1104, + "step": 6187 + }, + { + "epoch": 0.4464807532739276, + "grad_norm": 0.12051703035831451, + "learning_rate": 0.00018215326886996682, + "loss": 0.1738, + "step": 6188 + }, + { + "epoch": 0.44655290594898805, + "grad_norm": 0.1233791932463646, + "learning_rate": 0.00018215038245057008, + "loss": 0.1313, + "step": 6189 + }, + { + "epoch": 0.4466250586240485, + "grad_norm": 0.1406187266111374, + "learning_rate": 0.00018214749603117335, + "loss": 0.1521, + "step": 6190 + }, + { + "epoch": 0.4466972112991089, + "grad_norm": 0.16725574433803558, + "learning_rate": 0.00018214460961177658, + "loss": 0.1623, + "step": 6191 + }, + { + "epoch": 0.44676936397416933, + "grad_norm": 0.11151408404111862, + "learning_rate": 0.00018214172319237985, + "loss": 0.1811, + "step": 6192 + }, + { + "epoch": 0.4468415166492298, + "grad_norm": 0.10352703928947449, + "learning_rate": 0.0001821388367729831, + "loss": 0.1245, + "step": 6193 + }, + { + "epoch": 0.4469136693242902, + "grad_norm": 0.11607158184051514, + "learning_rate": 0.0001821359503535864, + "loss": 0.1305, + "step": 6194 + }, + { + "epoch": 0.4469858219993506, + "grad_norm": 0.13465505838394165, + "learning_rate": 0.00018213306393418966, + "loss": 0.1492, + "step": 6195 + }, + { + "epoch": 0.44705797467441105, + "grad_norm": 0.1132718026638031, + "learning_rate": 0.0001821301775147929, + "loss": 0.2283, + "step": 6196 + }, + { + "epoch": 0.4471301273494715, + "grad_norm": 0.1039915382862091, + "learning_rate": 0.00018212729109539616, + "loss": 0.1543, + "step": 6197 + }, + { + "epoch": 0.4472022800245319, + "grad_norm": 0.18599781394004822, + "learning_rate": 0.00018212440467599942, + "loss": 0.166, + "step": 6198 + }, + { + "epoch": 0.44727443269959233, + "grad_norm": 0.130781888961792, + "learning_rate": 0.00018212151825660269, + "loss": 0.1512, + "step": 6199 + }, + { + "epoch": 0.44734658537465277, + "grad_norm": 0.12227053940296173, + "learning_rate": 0.00018211863183720595, + "loss": 0.1769, + "step": 6200 + }, + { + "epoch": 0.4474187380497132, + "grad_norm": 0.13931097090244293, + "learning_rate": 0.0001821157454178092, + "loss": 0.143, + "step": 6201 + }, + { + "epoch": 0.4474908907247736, + "grad_norm": 0.10793418437242508, + "learning_rate": 0.00018211285899841247, + "loss": 0.1522, + "step": 6202 + }, + { + "epoch": 0.44756304339983405, + "grad_norm": 0.12433990836143494, + "learning_rate": 0.00018210997257901574, + "loss": 0.1403, + "step": 6203 + }, + { + "epoch": 0.4476351960748945, + "grad_norm": 0.10359567403793335, + "learning_rate": 0.000182107086159619, + "loss": 0.1841, + "step": 6204 + }, + { + "epoch": 0.4477073487499549, + "grad_norm": 0.10066763311624527, + "learning_rate": 0.00018210419974022226, + "loss": 0.1791, + "step": 6205 + }, + { + "epoch": 0.4477795014250153, + "grad_norm": 0.10185603052377701, + "learning_rate": 0.00018210131332082553, + "loss": 0.1273, + "step": 6206 + }, + { + "epoch": 0.44785165410007577, + "grad_norm": 0.12718892097473145, + "learning_rate": 0.0001820984269014288, + "loss": 0.1295, + "step": 6207 + }, + { + "epoch": 0.4479238067751362, + "grad_norm": 0.09801424294710159, + "learning_rate": 0.00018209554048203205, + "loss": 0.102, + "step": 6208 + }, + { + "epoch": 0.4479959594501966, + "grad_norm": 0.1023603305220604, + "learning_rate": 0.00018209265406263531, + "loss": 0.162, + "step": 6209 + }, + { + "epoch": 0.44806811212525705, + "grad_norm": 0.11572487652301788, + "learning_rate": 0.00018208976764323858, + "loss": 0.1378, + "step": 6210 + }, + { + "epoch": 0.4481402648003175, + "grad_norm": 0.11289431154727936, + "learning_rate": 0.00018208688122384184, + "loss": 0.179, + "step": 6211 + }, + { + "epoch": 0.4482124174753779, + "grad_norm": 0.10198424011468887, + "learning_rate": 0.0001820839948044451, + "loss": 0.1729, + "step": 6212 + }, + { + "epoch": 0.4482845701504383, + "grad_norm": 0.09329713135957718, + "learning_rate": 0.00018208110838504834, + "loss": 0.1604, + "step": 6213 + }, + { + "epoch": 0.44835672282549877, + "grad_norm": 0.15892958641052246, + "learning_rate": 0.0001820782219656516, + "loss": 0.1585, + "step": 6214 + }, + { + "epoch": 0.4484288755005592, + "grad_norm": 0.11771567165851593, + "learning_rate": 0.0001820753355462549, + "loss": 0.0995, + "step": 6215 + }, + { + "epoch": 0.4485010281756196, + "grad_norm": 0.1459246426820755, + "learning_rate": 0.00018207244912685816, + "loss": 0.1453, + "step": 6216 + }, + { + "epoch": 0.44857318085068004, + "grad_norm": 0.14376023411750793, + "learning_rate": 0.00018206956270746142, + "loss": 0.2116, + "step": 6217 + }, + { + "epoch": 0.4486453335257405, + "grad_norm": 0.12150080502033234, + "learning_rate": 0.00018206667628806465, + "loss": 0.199, + "step": 6218 + }, + { + "epoch": 0.4487174862008009, + "grad_norm": 0.1031125858426094, + "learning_rate": 0.00018206378986866792, + "loss": 0.17, + "step": 6219 + }, + { + "epoch": 0.4487896388758613, + "grad_norm": 0.10577013343572617, + "learning_rate": 0.00018206090344927118, + "loss": 0.123, + "step": 6220 + }, + { + "epoch": 0.44886179155092176, + "grad_norm": 0.11361294984817505, + "learning_rate": 0.00018205801702987444, + "loss": 0.1037, + "step": 6221 + }, + { + "epoch": 0.44893394422598215, + "grad_norm": 0.12185689061880112, + "learning_rate": 0.00018205513061047773, + "loss": 0.1318, + "step": 6222 + }, + { + "epoch": 0.4490060969010426, + "grad_norm": 0.11607586592435837, + "learning_rate": 0.00018205224419108097, + "loss": 0.1489, + "step": 6223 + }, + { + "epoch": 0.44907824957610304, + "grad_norm": 0.10012314468622208, + "learning_rate": 0.00018204935777168423, + "loss": 0.1555, + "step": 6224 + }, + { + "epoch": 0.4491504022511635, + "grad_norm": 0.11083272099494934, + "learning_rate": 0.0001820464713522875, + "loss": 0.194, + "step": 6225 + }, + { + "epoch": 0.44922255492622387, + "grad_norm": 0.15222343802452087, + "learning_rate": 0.00018204358493289076, + "loss": 0.178, + "step": 6226 + }, + { + "epoch": 0.4492947076012843, + "grad_norm": 0.1352849304676056, + "learning_rate": 0.00018204069851349402, + "loss": 0.1353, + "step": 6227 + }, + { + "epoch": 0.44936686027634476, + "grad_norm": 0.13668890297412872, + "learning_rate": 0.00018203781209409728, + "loss": 0.1146, + "step": 6228 + }, + { + "epoch": 0.44943901295140515, + "grad_norm": 0.11314396560192108, + "learning_rate": 0.00018203492567470055, + "loss": 0.1265, + "step": 6229 + }, + { + "epoch": 0.4495111656264656, + "grad_norm": 0.1572897881269455, + "learning_rate": 0.0001820320392553038, + "loss": 0.1363, + "step": 6230 + }, + { + "epoch": 0.44958331830152604, + "grad_norm": 0.10678167641162872, + "learning_rate": 0.00018202915283590707, + "loss": 0.1649, + "step": 6231 + }, + { + "epoch": 0.4496554709765865, + "grad_norm": 0.15983405709266663, + "learning_rate": 0.00018202626641651033, + "loss": 0.1636, + "step": 6232 + }, + { + "epoch": 0.44972762365164687, + "grad_norm": 0.14953090250492096, + "learning_rate": 0.0001820233799971136, + "loss": 0.1702, + "step": 6233 + }, + { + "epoch": 0.4497997763267073, + "grad_norm": 0.12968796491622925, + "learning_rate": 0.00018202049357771683, + "loss": 0.1762, + "step": 6234 + }, + { + "epoch": 0.44987192900176776, + "grad_norm": 0.12254311889410019, + "learning_rate": 0.0001820176071583201, + "loss": 0.1765, + "step": 6235 + }, + { + "epoch": 0.44994408167682814, + "grad_norm": 0.13646142184734344, + "learning_rate": 0.0001820147207389234, + "loss": 0.1452, + "step": 6236 + }, + { + "epoch": 0.4500162343518886, + "grad_norm": 0.10840586572885513, + "learning_rate": 0.00018201183431952665, + "loss": 0.1542, + "step": 6237 + }, + { + "epoch": 0.45008838702694903, + "grad_norm": 0.12204860895872116, + "learning_rate": 0.0001820089479001299, + "loss": 0.1651, + "step": 6238 + }, + { + "epoch": 0.4501605397020095, + "grad_norm": 0.10802742093801498, + "learning_rate": 0.00018200606148073315, + "loss": 0.1148, + "step": 6239 + }, + { + "epoch": 0.45023269237706987, + "grad_norm": 0.11421792954206467, + "learning_rate": 0.0001820031750613364, + "loss": 0.1786, + "step": 6240 + }, + { + "epoch": 0.4503048450521303, + "grad_norm": 0.10538505762815475, + "learning_rate": 0.00018200028864193967, + "loss": 0.162, + "step": 6241 + }, + { + "epoch": 0.45037699772719075, + "grad_norm": 0.1272907853126526, + "learning_rate": 0.00018199740222254294, + "loss": 0.1742, + "step": 6242 + }, + { + "epoch": 0.45044915040225114, + "grad_norm": 0.12079206854104996, + "learning_rate": 0.00018199451580314623, + "loss": 0.147, + "step": 6243 + }, + { + "epoch": 0.4505213030773116, + "grad_norm": 0.11582466959953308, + "learning_rate": 0.00018199162938374946, + "loss": 0.1636, + "step": 6244 + }, + { + "epoch": 0.45059345575237203, + "grad_norm": 0.1340245008468628, + "learning_rate": 0.00018198874296435273, + "loss": 0.1894, + "step": 6245 + }, + { + "epoch": 0.4506656084274325, + "grad_norm": 0.11480450630187988, + "learning_rate": 0.000181985856544956, + "loss": 0.1789, + "step": 6246 + }, + { + "epoch": 0.45073776110249286, + "grad_norm": 0.14045003056526184, + "learning_rate": 0.00018198297012555925, + "loss": 0.1233, + "step": 6247 + }, + { + "epoch": 0.4508099137775533, + "grad_norm": 0.12720069289207458, + "learning_rate": 0.00018198008370616251, + "loss": 0.1262, + "step": 6248 + }, + { + "epoch": 0.45088206645261375, + "grad_norm": 0.1322208046913147, + "learning_rate": 0.00018197719728676578, + "loss": 0.1568, + "step": 6249 + }, + { + "epoch": 0.45095421912767414, + "grad_norm": 0.1560821682214737, + "learning_rate": 0.00018197431086736904, + "loss": 0.1584, + "step": 6250 + }, + { + "epoch": 0.4510263718027346, + "grad_norm": 0.131991907954216, + "learning_rate": 0.0001819714244479723, + "loss": 0.1221, + "step": 6251 + }, + { + "epoch": 0.451098524477795, + "grad_norm": 0.14872147142887115, + "learning_rate": 0.00018196853802857557, + "loss": 0.1248, + "step": 6252 + }, + { + "epoch": 0.4511706771528554, + "grad_norm": 0.14252154529094696, + "learning_rate": 0.00018196565160917883, + "loss": 0.1378, + "step": 6253 + }, + { + "epoch": 0.45124282982791586, + "grad_norm": 0.14059919118881226, + "learning_rate": 0.0001819627651897821, + "loss": 0.1513, + "step": 6254 + }, + { + "epoch": 0.4513149825029763, + "grad_norm": 0.12438001483678818, + "learning_rate": 0.00018195987877038533, + "loss": 0.1868, + "step": 6255 + }, + { + "epoch": 0.45138713517803675, + "grad_norm": 0.12125087529420853, + "learning_rate": 0.0001819569923509886, + "loss": 0.1459, + "step": 6256 + }, + { + "epoch": 0.45145928785309714, + "grad_norm": 0.12559860944747925, + "learning_rate": 0.00018195410593159188, + "loss": 0.1742, + "step": 6257 + }, + { + "epoch": 0.4515314405281576, + "grad_norm": 0.11661777645349503, + "learning_rate": 0.00018195121951219514, + "loss": 0.1382, + "step": 6258 + }, + { + "epoch": 0.451603593203218, + "grad_norm": 0.14414922893047333, + "learning_rate": 0.0001819483330927984, + "loss": 0.1732, + "step": 6259 + }, + { + "epoch": 0.4516757458782784, + "grad_norm": 0.12364254891872406, + "learning_rate": 0.00018194544667340164, + "loss": 0.2019, + "step": 6260 + }, + { + "epoch": 0.45174789855333886, + "grad_norm": 0.10109005868434906, + "learning_rate": 0.0001819425602540049, + "loss": 0.1286, + "step": 6261 + }, + { + "epoch": 0.4518200512283993, + "grad_norm": 0.12184163182973862, + "learning_rate": 0.00018193967383460817, + "loss": 0.2007, + "step": 6262 + }, + { + "epoch": 0.45189220390345974, + "grad_norm": 0.14499284327030182, + "learning_rate": 0.00018193678741521143, + "loss": 0.1365, + "step": 6263 + }, + { + "epoch": 0.45196435657852013, + "grad_norm": 0.12527626752853394, + "learning_rate": 0.00018193390099581472, + "loss": 0.1302, + "step": 6264 + }, + { + "epoch": 0.4520365092535806, + "grad_norm": 0.08568686246871948, + "learning_rate": 0.00018193101457641796, + "loss": 0.1685, + "step": 6265 + }, + { + "epoch": 0.452108661928641, + "grad_norm": 0.14542171359062195, + "learning_rate": 0.00018192812815702122, + "loss": 0.12, + "step": 6266 + }, + { + "epoch": 0.4521808146037014, + "grad_norm": 0.09898033738136292, + "learning_rate": 0.00018192524173762448, + "loss": 0.1337, + "step": 6267 + }, + { + "epoch": 0.45225296727876185, + "grad_norm": 0.09792659431695938, + "learning_rate": 0.00018192235531822775, + "loss": 0.1224, + "step": 6268 + }, + { + "epoch": 0.4523251199538223, + "grad_norm": 0.08936212956905365, + "learning_rate": 0.000181919468898831, + "loss": 0.1105, + "step": 6269 + }, + { + "epoch": 0.45239727262888274, + "grad_norm": 0.10980160534381866, + "learning_rate": 0.00018191658247943427, + "loss": 0.1328, + "step": 6270 + }, + { + "epoch": 0.45246942530394313, + "grad_norm": 0.10068574547767639, + "learning_rate": 0.00018191369606003753, + "loss": 0.1788, + "step": 6271 + }, + { + "epoch": 0.4525415779790036, + "grad_norm": 0.11020131409168243, + "learning_rate": 0.0001819108096406408, + "loss": 0.1622, + "step": 6272 + }, + { + "epoch": 0.452613730654064, + "grad_norm": 0.11048059165477753, + "learning_rate": 0.00018190792322124406, + "loss": 0.1327, + "step": 6273 + }, + { + "epoch": 0.4526858833291244, + "grad_norm": 0.13467931747436523, + "learning_rate": 0.00018190503680184732, + "loss": 0.1455, + "step": 6274 + }, + { + "epoch": 0.45275803600418485, + "grad_norm": 0.12309185415506363, + "learning_rate": 0.00018190215038245059, + "loss": 0.1491, + "step": 6275 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.09846778959035873, + "learning_rate": 0.00018189926396305382, + "loss": 0.145, + "step": 6276 + }, + { + "epoch": 0.45290234135430574, + "grad_norm": 0.12652455270290375, + "learning_rate": 0.00018189637754365708, + "loss": 0.1435, + "step": 6277 + }, + { + "epoch": 0.4529744940293661, + "grad_norm": 0.12107978761196136, + "learning_rate": 0.00018189349112426038, + "loss": 0.1544, + "step": 6278 + }, + { + "epoch": 0.45304664670442657, + "grad_norm": 0.13256511092185974, + "learning_rate": 0.00018189060470486364, + "loss": 0.1434, + "step": 6279 + }, + { + "epoch": 0.453118799379487, + "grad_norm": 0.11256644129753113, + "learning_rate": 0.0001818877182854669, + "loss": 0.1276, + "step": 6280 + }, + { + "epoch": 0.4531909520545474, + "grad_norm": 0.1764122098684311, + "learning_rate": 0.00018188483186607014, + "loss": 0.1369, + "step": 6281 + }, + { + "epoch": 0.45326310472960785, + "grad_norm": 0.1378893405199051, + "learning_rate": 0.0001818819454466734, + "loss": 0.1603, + "step": 6282 + }, + { + "epoch": 0.4533352574046683, + "grad_norm": 0.10006693005561829, + "learning_rate": 0.00018187905902727666, + "loss": 0.1722, + "step": 6283 + }, + { + "epoch": 0.4534074100797287, + "grad_norm": 0.09453818202018738, + "learning_rate": 0.00018187617260787993, + "loss": 0.1612, + "step": 6284 + }, + { + "epoch": 0.4534795627547891, + "grad_norm": 0.11707943677902222, + "learning_rate": 0.00018187328618848322, + "loss": 0.1738, + "step": 6285 + }, + { + "epoch": 0.45355171542984957, + "grad_norm": 0.1276026964187622, + "learning_rate": 0.00018187039976908645, + "loss": 0.1525, + "step": 6286 + }, + { + "epoch": 0.45362386810491, + "grad_norm": 0.1502910554409027, + "learning_rate": 0.00018186751334968971, + "loss": 0.1586, + "step": 6287 + }, + { + "epoch": 0.4536960207799704, + "grad_norm": 0.12908358871936798, + "learning_rate": 0.00018186462693029298, + "loss": 0.1563, + "step": 6288 + }, + { + "epoch": 0.45376817345503084, + "grad_norm": 0.09012867510318756, + "learning_rate": 0.00018186174051089624, + "loss": 0.1726, + "step": 6289 + }, + { + "epoch": 0.4538403261300913, + "grad_norm": 0.15328793227672577, + "learning_rate": 0.0001818588540914995, + "loss": 0.2016, + "step": 6290 + }, + { + "epoch": 0.4539124788051517, + "grad_norm": 0.14295051991939545, + "learning_rate": 0.00018185596767210277, + "loss": 0.156, + "step": 6291 + }, + { + "epoch": 0.4539846314802121, + "grad_norm": 0.11826188117265701, + "learning_rate": 0.00018185308125270603, + "loss": 0.246, + "step": 6292 + }, + { + "epoch": 0.45405678415527256, + "grad_norm": 0.12954123318195343, + "learning_rate": 0.0001818501948333093, + "loss": 0.1262, + "step": 6293 + }, + { + "epoch": 0.454128936830333, + "grad_norm": 0.10843376815319061, + "learning_rate": 0.00018184730841391255, + "loss": 0.1256, + "step": 6294 + }, + { + "epoch": 0.4542010895053934, + "grad_norm": 0.1581542044878006, + "learning_rate": 0.00018184442199451582, + "loss": 0.178, + "step": 6295 + }, + { + "epoch": 0.45427324218045384, + "grad_norm": 0.11956808716058731, + "learning_rate": 0.00018184153557511908, + "loss": 0.1303, + "step": 6296 + }, + { + "epoch": 0.4543453948555143, + "grad_norm": 0.12054760009050369, + "learning_rate": 0.00018183864915572232, + "loss": 0.1808, + "step": 6297 + }, + { + "epoch": 0.4544175475305747, + "grad_norm": 0.13112328946590424, + "learning_rate": 0.00018183576273632558, + "loss": 0.133, + "step": 6298 + }, + { + "epoch": 0.4544897002056351, + "grad_norm": 0.1242629885673523, + "learning_rate": 0.00018183287631692887, + "loss": 0.1619, + "step": 6299 + }, + { + "epoch": 0.45456185288069556, + "grad_norm": 0.1355835199356079, + "learning_rate": 0.00018182998989753213, + "loss": 0.177, + "step": 6300 + }, + { + "epoch": 0.454634005555756, + "grad_norm": 0.11525852233171463, + "learning_rate": 0.0001818271034781354, + "loss": 0.1728, + "step": 6301 + }, + { + "epoch": 0.4547061582308164, + "grad_norm": 0.14136461913585663, + "learning_rate": 0.00018182421705873863, + "loss": 0.2022, + "step": 6302 + }, + { + "epoch": 0.45477831090587684, + "grad_norm": 0.11866612732410431, + "learning_rate": 0.0001818213306393419, + "loss": 0.1662, + "step": 6303 + }, + { + "epoch": 0.4548504635809373, + "grad_norm": 0.12703737616539001, + "learning_rate": 0.00018181844421994516, + "loss": 0.1795, + "step": 6304 + }, + { + "epoch": 0.45492261625599767, + "grad_norm": 0.13970893621444702, + "learning_rate": 0.00018181555780054842, + "loss": 0.1836, + "step": 6305 + }, + { + "epoch": 0.4549947689310581, + "grad_norm": 0.12486319988965988, + "learning_rate": 0.0001818126713811517, + "loss": 0.193, + "step": 6306 + }, + { + "epoch": 0.45506692160611856, + "grad_norm": 0.12619183957576752, + "learning_rate": 0.00018180978496175495, + "loss": 0.259, + "step": 6307 + }, + { + "epoch": 0.455139074281179, + "grad_norm": 0.12064392119646072, + "learning_rate": 0.0001818068985423582, + "loss": 0.1446, + "step": 6308 + }, + { + "epoch": 0.4552112269562394, + "grad_norm": 0.13968324661254883, + "learning_rate": 0.00018180401212296147, + "loss": 0.1611, + "step": 6309 + }, + { + "epoch": 0.45528337963129983, + "grad_norm": 0.11358285695314407, + "learning_rate": 0.00018180112570356473, + "loss": 0.1626, + "step": 6310 + }, + { + "epoch": 0.4553555323063603, + "grad_norm": 0.10314571857452393, + "learning_rate": 0.000181798239284168, + "loss": 0.1977, + "step": 6311 + }, + { + "epoch": 0.45542768498142067, + "grad_norm": 0.11178232729434967, + "learning_rate": 0.00018179535286477126, + "loss": 0.1358, + "step": 6312 + }, + { + "epoch": 0.4554998376564811, + "grad_norm": 0.1190803200006485, + "learning_rate": 0.00018179246644537452, + "loss": 0.1085, + "step": 6313 + }, + { + "epoch": 0.45557199033154155, + "grad_norm": 0.12554095685482025, + "learning_rate": 0.00018178958002597779, + "loss": 0.1622, + "step": 6314 + }, + { + "epoch": 0.45564414300660194, + "grad_norm": 0.12150195240974426, + "learning_rate": 0.00018178669360658105, + "loss": 0.1676, + "step": 6315 + }, + { + "epoch": 0.4557162956816624, + "grad_norm": 0.1563529372215271, + "learning_rate": 0.0001817838071871843, + "loss": 0.1664, + "step": 6316 + }, + { + "epoch": 0.45578844835672283, + "grad_norm": 0.1873168647289276, + "learning_rate": 0.00018178092076778757, + "loss": 0.185, + "step": 6317 + }, + { + "epoch": 0.4558606010317833, + "grad_norm": 0.12532073259353638, + "learning_rate": 0.00018177803434839084, + "loss": 0.1596, + "step": 6318 + }, + { + "epoch": 0.45593275370684366, + "grad_norm": 0.1312006264925003, + "learning_rate": 0.00018177514792899407, + "loss": 0.1443, + "step": 6319 + }, + { + "epoch": 0.4560049063819041, + "grad_norm": 0.10332578420639038, + "learning_rate": 0.00018177226150959736, + "loss": 0.1757, + "step": 6320 + }, + { + "epoch": 0.45607705905696455, + "grad_norm": 0.1040138453245163, + "learning_rate": 0.00018176937509020063, + "loss": 0.1226, + "step": 6321 + }, + { + "epoch": 0.45614921173202494, + "grad_norm": 0.13129954040050507, + "learning_rate": 0.0001817664886708039, + "loss": 0.1652, + "step": 6322 + }, + { + "epoch": 0.4562213644070854, + "grad_norm": 0.11622961610555649, + "learning_rate": 0.00018176360225140715, + "loss": 0.1849, + "step": 6323 + }, + { + "epoch": 0.4562935170821458, + "grad_norm": 0.09446028620004654, + "learning_rate": 0.0001817607158320104, + "loss": 0.1734, + "step": 6324 + }, + { + "epoch": 0.45636566975720627, + "grad_norm": 0.10070005059242249, + "learning_rate": 0.00018175782941261365, + "loss": 0.1853, + "step": 6325 + }, + { + "epoch": 0.45643782243226666, + "grad_norm": 0.14620892703533173, + "learning_rate": 0.00018175494299321691, + "loss": 0.1377, + "step": 6326 + }, + { + "epoch": 0.4565099751073271, + "grad_norm": 0.1088029146194458, + "learning_rate": 0.0001817520565738202, + "loss": 0.1486, + "step": 6327 + }, + { + "epoch": 0.45658212778238755, + "grad_norm": 0.14180849492549896, + "learning_rate": 0.00018174917015442347, + "loss": 0.1597, + "step": 6328 + }, + { + "epoch": 0.45665428045744794, + "grad_norm": 0.12755030393600464, + "learning_rate": 0.0001817462837350267, + "loss": 0.1483, + "step": 6329 + }, + { + "epoch": 0.4567264331325084, + "grad_norm": 0.11482290923595428, + "learning_rate": 0.00018174339731562997, + "loss": 0.1008, + "step": 6330 + }, + { + "epoch": 0.4567985858075688, + "grad_norm": 0.1262357085943222, + "learning_rate": 0.00018174051089623323, + "loss": 0.1243, + "step": 6331 + }, + { + "epoch": 0.45687073848262927, + "grad_norm": 0.14928500354290009, + "learning_rate": 0.0001817376244768365, + "loss": 0.1893, + "step": 6332 + }, + { + "epoch": 0.45694289115768966, + "grad_norm": 0.12983083724975586, + "learning_rate": 0.00018173473805743975, + "loss": 0.1272, + "step": 6333 + }, + { + "epoch": 0.4570150438327501, + "grad_norm": 0.13185936212539673, + "learning_rate": 0.00018173185163804302, + "loss": 0.1277, + "step": 6334 + }, + { + "epoch": 0.45708719650781054, + "grad_norm": 0.10516639798879623, + "learning_rate": 0.00018172896521864628, + "loss": 0.1488, + "step": 6335 + }, + { + "epoch": 0.45715934918287093, + "grad_norm": 0.11697663366794586, + "learning_rate": 0.00018172607879924954, + "loss": 0.1771, + "step": 6336 + }, + { + "epoch": 0.4572315018579314, + "grad_norm": 0.176951602101326, + "learning_rate": 0.0001817231923798528, + "loss": 0.1402, + "step": 6337 + }, + { + "epoch": 0.4573036545329918, + "grad_norm": 0.11368632316589355, + "learning_rate": 0.00018172030596045607, + "loss": 0.1855, + "step": 6338 + }, + { + "epoch": 0.45737580720805227, + "grad_norm": 0.12310828268527985, + "learning_rate": 0.00018171741954105933, + "loss": 0.1033, + "step": 6339 + }, + { + "epoch": 0.45744795988311265, + "grad_norm": 0.10735306143760681, + "learning_rate": 0.00018171453312166257, + "loss": 0.1652, + "step": 6340 + }, + { + "epoch": 0.4575201125581731, + "grad_norm": 0.11295438557863235, + "learning_rate": 0.00018171164670226586, + "loss": 0.1622, + "step": 6341 + }, + { + "epoch": 0.45759226523323354, + "grad_norm": 0.10728729516267776, + "learning_rate": 0.00018170876028286912, + "loss": 0.1584, + "step": 6342 + }, + { + "epoch": 0.45766441790829393, + "grad_norm": 0.14049620926380157, + "learning_rate": 0.00018170587386347238, + "loss": 0.1725, + "step": 6343 + }, + { + "epoch": 0.4577365705833544, + "grad_norm": 0.10544250160455704, + "learning_rate": 0.00018170298744407565, + "loss": 0.1266, + "step": 6344 + }, + { + "epoch": 0.4578087232584148, + "grad_norm": 0.10104487091302872, + "learning_rate": 0.00018170010102467888, + "loss": 0.1539, + "step": 6345 + }, + { + "epoch": 0.4578808759334752, + "grad_norm": 0.12155293673276901, + "learning_rate": 0.00018169721460528215, + "loss": 0.1659, + "step": 6346 + }, + { + "epoch": 0.45795302860853565, + "grad_norm": 0.10418069362640381, + "learning_rate": 0.0001816943281858854, + "loss": 0.157, + "step": 6347 + }, + { + "epoch": 0.4580251812835961, + "grad_norm": 0.12691755592823029, + "learning_rate": 0.0001816914417664887, + "loss": 0.1556, + "step": 6348 + }, + { + "epoch": 0.45809733395865654, + "grad_norm": 0.11559534072875977, + "learning_rate": 0.00018168855534709196, + "loss": 0.1821, + "step": 6349 + }, + { + "epoch": 0.4581694866337169, + "grad_norm": 0.10430330783128738, + "learning_rate": 0.0001816856689276952, + "loss": 0.1321, + "step": 6350 + }, + { + "epoch": 0.45824163930877737, + "grad_norm": 0.11644110828638077, + "learning_rate": 0.00018168278250829846, + "loss": 0.1651, + "step": 6351 + }, + { + "epoch": 0.4583137919838378, + "grad_norm": 0.11053968220949173, + "learning_rate": 0.00018167989608890172, + "loss": 0.1446, + "step": 6352 + }, + { + "epoch": 0.4583859446588982, + "grad_norm": 0.12273887544870377, + "learning_rate": 0.00018167700966950499, + "loss": 0.1963, + "step": 6353 + }, + { + "epoch": 0.45845809733395865, + "grad_norm": 0.1266319304704666, + "learning_rate": 0.00018167412325010825, + "loss": 0.1561, + "step": 6354 + }, + { + "epoch": 0.4585302500090191, + "grad_norm": 0.11079327762126923, + "learning_rate": 0.0001816712368307115, + "loss": 0.1325, + "step": 6355 + }, + { + "epoch": 0.45860240268407954, + "grad_norm": 0.11335281282663345, + "learning_rate": 0.00018166835041131477, + "loss": 0.1843, + "step": 6356 + }, + { + "epoch": 0.4586745553591399, + "grad_norm": 0.11442596465349197, + "learning_rate": 0.00018166546399191804, + "loss": 0.1203, + "step": 6357 + }, + { + "epoch": 0.45874670803420037, + "grad_norm": 0.12477446347475052, + "learning_rate": 0.0001816625775725213, + "loss": 0.1746, + "step": 6358 + }, + { + "epoch": 0.4588188607092608, + "grad_norm": 0.09427320212125778, + "learning_rate": 0.00018165969115312456, + "loss": 0.1079, + "step": 6359 + }, + { + "epoch": 0.4588910133843212, + "grad_norm": 0.11606919765472412, + "learning_rate": 0.00018165680473372783, + "loss": 0.1729, + "step": 6360 + }, + { + "epoch": 0.45896316605938164, + "grad_norm": 0.1418781727552414, + "learning_rate": 0.00018165391831433106, + "loss": 0.1932, + "step": 6361 + }, + { + "epoch": 0.4590353187344421, + "grad_norm": 0.11492908746004105, + "learning_rate": 0.00018165103189493435, + "loss": 0.1547, + "step": 6362 + }, + { + "epoch": 0.45910747140950253, + "grad_norm": 0.10958899557590485, + "learning_rate": 0.00018164814547553761, + "loss": 0.1327, + "step": 6363 + }, + { + "epoch": 0.4591796240845629, + "grad_norm": 0.1356894075870514, + "learning_rate": 0.00018164525905614088, + "loss": 0.1571, + "step": 6364 + }, + { + "epoch": 0.45925177675962336, + "grad_norm": 0.10377617180347443, + "learning_rate": 0.00018164237263674414, + "loss": 0.2138, + "step": 6365 + }, + { + "epoch": 0.4593239294346838, + "grad_norm": 0.13075923919677734, + "learning_rate": 0.00018163948621734738, + "loss": 0.1748, + "step": 6366 + }, + { + "epoch": 0.4593960821097442, + "grad_norm": 0.1446242481470108, + "learning_rate": 0.00018163659979795064, + "loss": 0.1922, + "step": 6367 + }, + { + "epoch": 0.45946823478480464, + "grad_norm": 0.1177244782447815, + "learning_rate": 0.0001816337133785539, + "loss": 0.139, + "step": 6368 + }, + { + "epoch": 0.4595403874598651, + "grad_norm": 0.11971200257539749, + "learning_rate": 0.00018163082695915717, + "loss": 0.1529, + "step": 6369 + }, + { + "epoch": 0.45961254013492553, + "grad_norm": 0.15256580710411072, + "learning_rate": 0.00018162794053976046, + "loss": 0.1496, + "step": 6370 + }, + { + "epoch": 0.4596846928099859, + "grad_norm": 0.11101801693439484, + "learning_rate": 0.0001816250541203637, + "loss": 0.1439, + "step": 6371 + }, + { + "epoch": 0.45975684548504636, + "grad_norm": 0.11491654068231583, + "learning_rate": 0.00018162216770096695, + "loss": 0.2289, + "step": 6372 + }, + { + "epoch": 0.4598289981601068, + "grad_norm": 0.1170630007982254, + "learning_rate": 0.00018161928128157022, + "loss": 0.1543, + "step": 6373 + }, + { + "epoch": 0.4599011508351672, + "grad_norm": 0.11507470905780792, + "learning_rate": 0.00018161639486217348, + "loss": 0.1691, + "step": 6374 + }, + { + "epoch": 0.45997330351022764, + "grad_norm": 0.14252659678459167, + "learning_rate": 0.00018161350844277674, + "loss": 0.14, + "step": 6375 + }, + { + "epoch": 0.4600454561852881, + "grad_norm": 0.11022763699293137, + "learning_rate": 0.00018161062202338, + "loss": 0.166, + "step": 6376 + }, + { + "epoch": 0.46011760886034847, + "grad_norm": 0.150185227394104, + "learning_rate": 0.00018160773560398327, + "loss": 0.1591, + "step": 6377 + }, + { + "epoch": 0.4601897615354089, + "grad_norm": 0.1444399207830429, + "learning_rate": 0.00018160484918458653, + "loss": 0.1927, + "step": 6378 + }, + { + "epoch": 0.46026191421046936, + "grad_norm": 0.11369339376688004, + "learning_rate": 0.0001816019627651898, + "loss": 0.1781, + "step": 6379 + }, + { + "epoch": 0.4603340668855298, + "grad_norm": 0.1188020408153534, + "learning_rate": 0.00018159907634579306, + "loss": 0.1252, + "step": 6380 + }, + { + "epoch": 0.4604062195605902, + "grad_norm": 0.11750219017267227, + "learning_rate": 0.00018159618992639632, + "loss": 0.1637, + "step": 6381 + }, + { + "epoch": 0.46047837223565063, + "grad_norm": 0.1212676465511322, + "learning_rate": 0.00018159330350699956, + "loss": 0.1853, + "step": 6382 + }, + { + "epoch": 0.4605505249107111, + "grad_norm": 0.08860348910093307, + "learning_rate": 0.00018159041708760282, + "loss": 0.128, + "step": 6383 + }, + { + "epoch": 0.46062267758577147, + "grad_norm": 0.10735640674829483, + "learning_rate": 0.0001815875306682061, + "loss": 0.1735, + "step": 6384 + }, + { + "epoch": 0.4606948302608319, + "grad_norm": 0.13036073744297028, + "learning_rate": 0.00018158464424880937, + "loss": 0.1319, + "step": 6385 + }, + { + "epoch": 0.46076698293589236, + "grad_norm": 0.10206615924835205, + "learning_rate": 0.00018158175782941264, + "loss": 0.1723, + "step": 6386 + }, + { + "epoch": 0.4608391356109528, + "grad_norm": 0.10884787887334824, + "learning_rate": 0.00018157887141001587, + "loss": 0.1322, + "step": 6387 + }, + { + "epoch": 0.4609112882860132, + "grad_norm": 0.09827440232038498, + "learning_rate": 0.00018157598499061913, + "loss": 0.1934, + "step": 6388 + }, + { + "epoch": 0.46098344096107363, + "grad_norm": 0.10427073389291763, + "learning_rate": 0.0001815730985712224, + "loss": 0.1358, + "step": 6389 + }, + { + "epoch": 0.4610555936361341, + "grad_norm": 0.11135826259851456, + "learning_rate": 0.00018157021215182566, + "loss": 0.157, + "step": 6390 + }, + { + "epoch": 0.46112774631119446, + "grad_norm": 0.11995387077331543, + "learning_rate": 0.00018156732573242895, + "loss": 0.1854, + "step": 6391 + }, + { + "epoch": 0.4611998989862549, + "grad_norm": 0.12274812161922455, + "learning_rate": 0.00018156443931303219, + "loss": 0.142, + "step": 6392 + }, + { + "epoch": 0.46127205166131535, + "grad_norm": 0.12474401295185089, + "learning_rate": 0.00018156155289363545, + "loss": 0.1636, + "step": 6393 + }, + { + "epoch": 0.4613442043363758, + "grad_norm": 0.1390080600976944, + "learning_rate": 0.0001815586664742387, + "loss": 0.1685, + "step": 6394 + }, + { + "epoch": 0.4614163570114362, + "grad_norm": 0.10171744227409363, + "learning_rate": 0.00018155578005484197, + "loss": 0.1444, + "step": 6395 + }, + { + "epoch": 0.46148850968649663, + "grad_norm": 0.10669015347957611, + "learning_rate": 0.00018155289363544524, + "loss": 0.1095, + "step": 6396 + }, + { + "epoch": 0.4615606623615571, + "grad_norm": 0.1214829832315445, + "learning_rate": 0.0001815500072160485, + "loss": 0.1458, + "step": 6397 + }, + { + "epoch": 0.46163281503661746, + "grad_norm": 0.1343984454870224, + "learning_rate": 0.00018154712079665176, + "loss": 0.1811, + "step": 6398 + }, + { + "epoch": 0.4617049677116779, + "grad_norm": 0.11867804080247879, + "learning_rate": 0.00018154423437725503, + "loss": 0.1828, + "step": 6399 + }, + { + "epoch": 0.46177712038673835, + "grad_norm": 0.149135023355484, + "learning_rate": 0.0001815413479578583, + "loss": 0.1708, + "step": 6400 + }, + { + "epoch": 0.4618492730617988, + "grad_norm": 0.12415354698896408, + "learning_rate": 0.00018153846153846155, + "loss": 0.1478, + "step": 6401 + }, + { + "epoch": 0.4619214257368592, + "grad_norm": 0.147150918841362, + "learning_rate": 0.00018153557511906481, + "loss": 0.1718, + "step": 6402 + }, + { + "epoch": 0.4619935784119196, + "grad_norm": 0.11363677680492401, + "learning_rate": 0.00018153268869966805, + "loss": 0.1306, + "step": 6403 + }, + { + "epoch": 0.46206573108698007, + "grad_norm": 0.11995453387498856, + "learning_rate": 0.0001815298022802713, + "loss": 0.2005, + "step": 6404 + }, + { + "epoch": 0.46213788376204046, + "grad_norm": 0.1330481767654419, + "learning_rate": 0.0001815269158608746, + "loss": 0.1651, + "step": 6405 + }, + { + "epoch": 0.4622100364371009, + "grad_norm": 0.11070237308740616, + "learning_rate": 0.00018152402944147787, + "loss": 0.1586, + "step": 6406 + }, + { + "epoch": 0.46228218911216135, + "grad_norm": 0.12627100944519043, + "learning_rate": 0.00018152114302208113, + "loss": 0.171, + "step": 6407 + }, + { + "epoch": 0.46235434178722173, + "grad_norm": 0.12145937979221344, + "learning_rate": 0.00018151825660268437, + "loss": 0.1288, + "step": 6408 + }, + { + "epoch": 0.4624264944622822, + "grad_norm": 0.11226236820220947, + "learning_rate": 0.00018151537018328763, + "loss": 0.1718, + "step": 6409 + }, + { + "epoch": 0.4624986471373426, + "grad_norm": 0.16444960236549377, + "learning_rate": 0.0001815124837638909, + "loss": 0.1636, + "step": 6410 + }, + { + "epoch": 0.46257079981240307, + "grad_norm": 0.10897719115018845, + "learning_rate": 0.00018150959734449415, + "loss": 0.178, + "step": 6411 + }, + { + "epoch": 0.46264295248746345, + "grad_norm": 0.14687609672546387, + "learning_rate": 0.00018150671092509744, + "loss": 0.1847, + "step": 6412 + }, + { + "epoch": 0.4627151051625239, + "grad_norm": 0.10744399577379227, + "learning_rate": 0.00018150382450570068, + "loss": 0.1543, + "step": 6413 + }, + { + "epoch": 0.46278725783758434, + "grad_norm": 0.10385393351316452, + "learning_rate": 0.00018150093808630394, + "loss": 0.149, + "step": 6414 + }, + { + "epoch": 0.46285941051264473, + "grad_norm": 0.12182513624429703, + "learning_rate": 0.0001814980516669072, + "loss": 0.1421, + "step": 6415 + }, + { + "epoch": 0.4629315631877052, + "grad_norm": 0.14916294813156128, + "learning_rate": 0.00018149516524751047, + "loss": 0.1855, + "step": 6416 + }, + { + "epoch": 0.4630037158627656, + "grad_norm": 0.10458331555128098, + "learning_rate": 0.00018149227882811373, + "loss": 0.1786, + "step": 6417 + }, + { + "epoch": 0.46307586853782606, + "grad_norm": 0.14848539233207703, + "learning_rate": 0.000181489392408717, + "loss": 0.1478, + "step": 6418 + }, + { + "epoch": 0.46314802121288645, + "grad_norm": 0.1410219371318817, + "learning_rate": 0.00018148650598932026, + "loss": 0.1779, + "step": 6419 + }, + { + "epoch": 0.4632201738879469, + "grad_norm": 0.09285683929920197, + "learning_rate": 0.00018148361956992352, + "loss": 0.1702, + "step": 6420 + }, + { + "epoch": 0.46329232656300734, + "grad_norm": 0.1484486162662506, + "learning_rate": 0.00018148073315052678, + "loss": 0.2162, + "step": 6421 + }, + { + "epoch": 0.46336447923806773, + "grad_norm": 0.14052875339984894, + "learning_rate": 0.00018147784673113005, + "loss": 0.1639, + "step": 6422 + }, + { + "epoch": 0.46343663191312817, + "grad_norm": 0.11463950574398041, + "learning_rate": 0.0001814749603117333, + "loss": 0.169, + "step": 6423 + }, + { + "epoch": 0.4635087845881886, + "grad_norm": 0.10958924889564514, + "learning_rate": 0.00018147207389233654, + "loss": 0.1124, + "step": 6424 + }, + { + "epoch": 0.46358093726324906, + "grad_norm": 0.15305006504058838, + "learning_rate": 0.0001814691874729398, + "loss": 0.1749, + "step": 6425 + }, + { + "epoch": 0.46365308993830945, + "grad_norm": 0.10930691659450531, + "learning_rate": 0.0001814663010535431, + "loss": 0.2099, + "step": 6426 + }, + { + "epoch": 0.4637252426133699, + "grad_norm": 0.14171509444713593, + "learning_rate": 0.00018146341463414636, + "loss": 0.1764, + "step": 6427 + }, + { + "epoch": 0.46379739528843034, + "grad_norm": 0.1683446764945984, + "learning_rate": 0.00018146052821474962, + "loss": 0.1656, + "step": 6428 + }, + { + "epoch": 0.4638695479634907, + "grad_norm": 0.12286924570798874, + "learning_rate": 0.0001814576417953529, + "loss": 0.171, + "step": 6429 + }, + { + "epoch": 0.46394170063855117, + "grad_norm": 0.1306712031364441, + "learning_rate": 0.00018145475537595612, + "loss": 0.1532, + "step": 6430 + }, + { + "epoch": 0.4640138533136116, + "grad_norm": 0.13364550471305847, + "learning_rate": 0.00018145186895655939, + "loss": 0.1449, + "step": 6431 + }, + { + "epoch": 0.46408600598867206, + "grad_norm": 0.1290210783481598, + "learning_rate": 0.00018144898253716265, + "loss": 0.1507, + "step": 6432 + }, + { + "epoch": 0.46415815866373245, + "grad_norm": 0.15846581757068634, + "learning_rate": 0.00018144609611776594, + "loss": 0.1263, + "step": 6433 + }, + { + "epoch": 0.4642303113387929, + "grad_norm": 0.11158798635005951, + "learning_rate": 0.0001814432096983692, + "loss": 0.1785, + "step": 6434 + }, + { + "epoch": 0.46430246401385333, + "grad_norm": 0.09691043198108673, + "learning_rate": 0.00018144032327897244, + "loss": 0.1429, + "step": 6435 + }, + { + "epoch": 0.4643746166889137, + "grad_norm": 0.1297903060913086, + "learning_rate": 0.0001814374368595757, + "loss": 0.1737, + "step": 6436 + }, + { + "epoch": 0.46444676936397417, + "grad_norm": 0.10582982748746872, + "learning_rate": 0.00018143455044017896, + "loss": 0.2156, + "step": 6437 + }, + { + "epoch": 0.4645189220390346, + "grad_norm": 0.13353532552719116, + "learning_rate": 0.00018143166402078223, + "loss": 0.1536, + "step": 6438 + }, + { + "epoch": 0.464591074714095, + "grad_norm": 0.13009218871593475, + "learning_rate": 0.0001814287776013855, + "loss": 0.1437, + "step": 6439 + }, + { + "epoch": 0.46466322738915544, + "grad_norm": 0.11011122912168503, + "learning_rate": 0.00018142589118198875, + "loss": 0.1102, + "step": 6440 + }, + { + "epoch": 0.4647353800642159, + "grad_norm": 0.12354083359241486, + "learning_rate": 0.00018142300476259201, + "loss": 0.1938, + "step": 6441 + }, + { + "epoch": 0.46480753273927633, + "grad_norm": 0.09825614094734192, + "learning_rate": 0.00018142011834319528, + "loss": 0.1406, + "step": 6442 + }, + { + "epoch": 0.4648796854143367, + "grad_norm": 0.11126351356506348, + "learning_rate": 0.00018141723192379854, + "loss": 0.1242, + "step": 6443 + }, + { + "epoch": 0.46495183808939716, + "grad_norm": 0.1263757348060608, + "learning_rate": 0.0001814143455044018, + "loss": 0.1867, + "step": 6444 + }, + { + "epoch": 0.4650239907644576, + "grad_norm": 0.10534121096134186, + "learning_rate": 0.00018141145908500507, + "loss": 0.1539, + "step": 6445 + }, + { + "epoch": 0.465096143439518, + "grad_norm": 0.1159055158495903, + "learning_rate": 0.0001814085726656083, + "loss": 0.1574, + "step": 6446 + }, + { + "epoch": 0.46516829611457844, + "grad_norm": 0.13766884803771973, + "learning_rate": 0.0001814056862462116, + "loss": 0.1988, + "step": 6447 + }, + { + "epoch": 0.4652404487896389, + "grad_norm": 0.11503700911998749, + "learning_rate": 0.00018140279982681485, + "loss": 0.1327, + "step": 6448 + }, + { + "epoch": 0.4653126014646993, + "grad_norm": 0.15995432436466217, + "learning_rate": 0.00018139991340741812, + "loss": 0.144, + "step": 6449 + }, + { + "epoch": 0.4653847541397597, + "grad_norm": 0.10858393460512161, + "learning_rate": 0.00018139702698802138, + "loss": 0.1895, + "step": 6450 + }, + { + "epoch": 0.46545690681482016, + "grad_norm": 0.16355015337467194, + "learning_rate": 0.00018139414056862462, + "loss": 0.171, + "step": 6451 + }, + { + "epoch": 0.4655290594898806, + "grad_norm": 0.12066861987113953, + "learning_rate": 0.00018139125414922788, + "loss": 0.1786, + "step": 6452 + }, + { + "epoch": 0.465601212164941, + "grad_norm": 0.13982541859149933, + "learning_rate": 0.00018138836772983114, + "loss": 0.1929, + "step": 6453 + }, + { + "epoch": 0.46567336484000144, + "grad_norm": 0.1383274346590042, + "learning_rate": 0.00018138548131043443, + "loss": 0.1467, + "step": 6454 + }, + { + "epoch": 0.4657455175150619, + "grad_norm": 0.11604558676481247, + "learning_rate": 0.0001813825948910377, + "loss": 0.1761, + "step": 6455 + }, + { + "epoch": 0.4658176701901223, + "grad_norm": 0.13544894754886627, + "learning_rate": 0.00018137970847164093, + "loss": 0.1297, + "step": 6456 + }, + { + "epoch": 0.4658898228651827, + "grad_norm": 0.09898851811885834, + "learning_rate": 0.0001813768220522442, + "loss": 0.1838, + "step": 6457 + }, + { + "epoch": 0.46596197554024316, + "grad_norm": 0.11426880955696106, + "learning_rate": 0.00018137393563284746, + "loss": 0.1579, + "step": 6458 + }, + { + "epoch": 0.4660341282153036, + "grad_norm": 0.09921670705080032, + "learning_rate": 0.00018137104921345072, + "loss": 0.1243, + "step": 6459 + }, + { + "epoch": 0.466106280890364, + "grad_norm": 0.11920057237148285, + "learning_rate": 0.00018136816279405398, + "loss": 0.1541, + "step": 6460 + }, + { + "epoch": 0.46617843356542443, + "grad_norm": 0.12290757894515991, + "learning_rate": 0.00018136527637465725, + "loss": 0.1505, + "step": 6461 + }, + { + "epoch": 0.4662505862404849, + "grad_norm": 0.12962016463279724, + "learning_rate": 0.0001813623899552605, + "loss": 0.1735, + "step": 6462 + }, + { + "epoch": 0.4663227389155453, + "grad_norm": 0.15535619854927063, + "learning_rate": 0.00018135950353586377, + "loss": 0.1385, + "step": 6463 + }, + { + "epoch": 0.4663948915906057, + "grad_norm": 0.12486453354358673, + "learning_rate": 0.00018135661711646703, + "loss": 0.1254, + "step": 6464 + }, + { + "epoch": 0.46646704426566615, + "grad_norm": 0.14467908442020416, + "learning_rate": 0.0001813537306970703, + "loss": 0.1841, + "step": 6465 + }, + { + "epoch": 0.4665391969407266, + "grad_norm": 0.13134662806987762, + "learning_rate": 0.00018135084427767356, + "loss": 0.1249, + "step": 6466 + }, + { + "epoch": 0.466611349615787, + "grad_norm": 0.11720168590545654, + "learning_rate": 0.0001813479578582768, + "loss": 0.1126, + "step": 6467 + }, + { + "epoch": 0.46668350229084743, + "grad_norm": 0.13877685368061066, + "learning_rate": 0.00018134507143888009, + "loss": 0.1735, + "step": 6468 + }, + { + "epoch": 0.4667556549659079, + "grad_norm": 0.11466140300035477, + "learning_rate": 0.00018134218501948335, + "loss": 0.164, + "step": 6469 + }, + { + "epoch": 0.46682780764096826, + "grad_norm": 0.13385191559791565, + "learning_rate": 0.0001813392986000866, + "loss": 0.1438, + "step": 6470 + }, + { + "epoch": 0.4668999603160287, + "grad_norm": 0.1479582041501999, + "learning_rate": 0.00018133641218068987, + "loss": 0.1393, + "step": 6471 + }, + { + "epoch": 0.46697211299108915, + "grad_norm": 0.16987402737140656, + "learning_rate": 0.0001813335257612931, + "loss": 0.1922, + "step": 6472 + }, + { + "epoch": 0.4670442656661496, + "grad_norm": 0.12824693322181702, + "learning_rate": 0.00018133063934189637, + "loss": 0.1507, + "step": 6473 + }, + { + "epoch": 0.46711641834121, + "grad_norm": 0.13783419132232666, + "learning_rate": 0.00018132775292249964, + "loss": 0.1303, + "step": 6474 + }, + { + "epoch": 0.4671885710162704, + "grad_norm": 0.13383013010025024, + "learning_rate": 0.00018132486650310293, + "loss": 0.1516, + "step": 6475 + }, + { + "epoch": 0.46726072369133087, + "grad_norm": 0.11658954620361328, + "learning_rate": 0.0001813219800837062, + "loss": 0.1503, + "step": 6476 + }, + { + "epoch": 0.46733287636639126, + "grad_norm": 0.13576288521289825, + "learning_rate": 0.00018131909366430943, + "loss": 0.1386, + "step": 6477 + }, + { + "epoch": 0.4674050290414517, + "grad_norm": 0.10309716314077377, + "learning_rate": 0.0001813162072449127, + "loss": 0.1811, + "step": 6478 + }, + { + "epoch": 0.46747718171651215, + "grad_norm": 0.09541424363851547, + "learning_rate": 0.00018131332082551595, + "loss": 0.1468, + "step": 6479 + }, + { + "epoch": 0.4675493343915726, + "grad_norm": 0.10171490162611008, + "learning_rate": 0.00018131043440611921, + "loss": 0.116, + "step": 6480 + }, + { + "epoch": 0.467621487066633, + "grad_norm": 0.10852886736392975, + "learning_rate": 0.00018130754798672248, + "loss": 0.1362, + "step": 6481 + }, + { + "epoch": 0.4676936397416934, + "grad_norm": 0.12629874050617218, + "learning_rate": 0.00018130466156732574, + "loss": 0.1944, + "step": 6482 + }, + { + "epoch": 0.46776579241675387, + "grad_norm": 0.11615622043609619, + "learning_rate": 0.000181301775147929, + "loss": 0.2132, + "step": 6483 + }, + { + "epoch": 0.46783794509181426, + "grad_norm": 0.15097269415855408, + "learning_rate": 0.00018129888872853227, + "loss": 0.1369, + "step": 6484 + }, + { + "epoch": 0.4679100977668747, + "grad_norm": 0.11667463183403015, + "learning_rate": 0.00018129600230913553, + "loss": 0.1446, + "step": 6485 + }, + { + "epoch": 0.46798225044193514, + "grad_norm": 0.11408985406160355, + "learning_rate": 0.0001812931158897388, + "loss": 0.1198, + "step": 6486 + }, + { + "epoch": 0.4680544031169956, + "grad_norm": 0.1181301549077034, + "learning_rate": 0.00018129022947034205, + "loss": 0.1554, + "step": 6487 + }, + { + "epoch": 0.468126555792056, + "grad_norm": 0.1423175036907196, + "learning_rate": 0.0001812873430509453, + "loss": 0.1545, + "step": 6488 + }, + { + "epoch": 0.4681987084671164, + "grad_norm": 0.1127098798751831, + "learning_rate": 0.00018128445663154858, + "loss": 0.1267, + "step": 6489 + }, + { + "epoch": 0.46827086114217686, + "grad_norm": 0.15628333389759064, + "learning_rate": 0.00018128157021215184, + "loss": 0.193, + "step": 6490 + }, + { + "epoch": 0.46834301381723725, + "grad_norm": 0.115764319896698, + "learning_rate": 0.0001812786837927551, + "loss": 0.12, + "step": 6491 + }, + { + "epoch": 0.4684151664922977, + "grad_norm": 0.15240737795829773, + "learning_rate": 0.00018127579737335837, + "loss": 0.1843, + "step": 6492 + }, + { + "epoch": 0.46848731916735814, + "grad_norm": 0.13437342643737793, + "learning_rate": 0.0001812729109539616, + "loss": 0.1126, + "step": 6493 + }, + { + "epoch": 0.4685594718424186, + "grad_norm": 0.13212938606739044, + "learning_rate": 0.00018127002453456487, + "loss": 0.167, + "step": 6494 + }, + { + "epoch": 0.468631624517479, + "grad_norm": 0.13624313473701477, + "learning_rate": 0.00018126713811516813, + "loss": 0.1344, + "step": 6495 + }, + { + "epoch": 0.4687037771925394, + "grad_norm": 0.1937403380870819, + "learning_rate": 0.00018126425169577142, + "loss": 0.1865, + "step": 6496 + }, + { + "epoch": 0.46877592986759986, + "grad_norm": 0.1276809275150299, + "learning_rate": 0.00018126136527637468, + "loss": 0.1516, + "step": 6497 + }, + { + "epoch": 0.46884808254266025, + "grad_norm": 0.09707915782928467, + "learning_rate": 0.00018125847885697792, + "loss": 0.1733, + "step": 6498 + }, + { + "epoch": 0.4689202352177207, + "grad_norm": 0.13547371327877045, + "learning_rate": 0.00018125559243758118, + "loss": 0.145, + "step": 6499 + }, + { + "epoch": 0.46899238789278114, + "grad_norm": 0.11191370338201523, + "learning_rate": 0.00018125270601818445, + "loss": 0.1484, + "step": 6500 + }, + { + "epoch": 0.4690645405678415, + "grad_norm": 0.11115206032991409, + "learning_rate": 0.0001812498195987877, + "loss": 0.1522, + "step": 6501 + }, + { + "epoch": 0.46913669324290197, + "grad_norm": 0.11308522522449493, + "learning_rate": 0.00018124693317939097, + "loss": 0.1649, + "step": 6502 + }, + { + "epoch": 0.4692088459179624, + "grad_norm": 0.11458290368318558, + "learning_rate": 0.00018124404675999423, + "loss": 0.1652, + "step": 6503 + }, + { + "epoch": 0.46928099859302286, + "grad_norm": 0.1331714242696762, + "learning_rate": 0.0001812411603405975, + "loss": 0.1747, + "step": 6504 + }, + { + "epoch": 0.46935315126808325, + "grad_norm": 0.11369192600250244, + "learning_rate": 0.00018123827392120076, + "loss": 0.195, + "step": 6505 + }, + { + "epoch": 0.4694253039431437, + "grad_norm": 0.1556098610162735, + "learning_rate": 0.00018123538750180402, + "loss": 0.1732, + "step": 6506 + }, + { + "epoch": 0.46949745661820413, + "grad_norm": 0.11743935942649841, + "learning_rate": 0.00018123250108240729, + "loss": 0.1762, + "step": 6507 + }, + { + "epoch": 0.4695696092932645, + "grad_norm": 0.09582743793725967, + "learning_rate": 0.00018122961466301055, + "loss": 0.1053, + "step": 6508 + }, + { + "epoch": 0.46964176196832497, + "grad_norm": 0.1395568996667862, + "learning_rate": 0.00018122672824361378, + "loss": 0.1723, + "step": 6509 + }, + { + "epoch": 0.4697139146433854, + "grad_norm": 0.1317128688097, + "learning_rate": 0.00018122384182421707, + "loss": 0.1207, + "step": 6510 + }, + { + "epoch": 0.46978606731844585, + "grad_norm": 0.11051097512245178, + "learning_rate": 0.00018122095540482034, + "loss": 0.1774, + "step": 6511 + }, + { + "epoch": 0.46985821999350624, + "grad_norm": 0.14036059379577637, + "learning_rate": 0.0001812180689854236, + "loss": 0.1344, + "step": 6512 + }, + { + "epoch": 0.4699303726685667, + "grad_norm": 0.11173304915428162, + "learning_rate": 0.00018121518256602686, + "loss": 0.163, + "step": 6513 + }, + { + "epoch": 0.47000252534362713, + "grad_norm": 0.12173853814601898, + "learning_rate": 0.0001812122961466301, + "loss": 0.1832, + "step": 6514 + }, + { + "epoch": 0.4700746780186875, + "grad_norm": 0.0984521210193634, + "learning_rate": 0.00018120940972723336, + "loss": 0.1437, + "step": 6515 + }, + { + "epoch": 0.47014683069374796, + "grad_norm": 0.09181611239910126, + "learning_rate": 0.00018120652330783663, + "loss": 0.1452, + "step": 6516 + }, + { + "epoch": 0.4702189833688084, + "grad_norm": 0.1238136738538742, + "learning_rate": 0.00018120363688843992, + "loss": 0.1546, + "step": 6517 + }, + { + "epoch": 0.47029113604386885, + "grad_norm": 0.1385972499847412, + "learning_rate": 0.00018120075046904318, + "loss": 0.1755, + "step": 6518 + }, + { + "epoch": 0.47036328871892924, + "grad_norm": 0.1276998668909073, + "learning_rate": 0.00018119786404964641, + "loss": 0.2061, + "step": 6519 + }, + { + "epoch": 0.4704354413939897, + "grad_norm": 0.14419038593769073, + "learning_rate": 0.00018119497763024968, + "loss": 0.1492, + "step": 6520 + }, + { + "epoch": 0.47050759406905013, + "grad_norm": 0.11534445732831955, + "learning_rate": 0.00018119209121085294, + "loss": 0.1521, + "step": 6521 + }, + { + "epoch": 0.4705797467441105, + "grad_norm": 0.12136315554380417, + "learning_rate": 0.0001811892047914562, + "loss": 0.1367, + "step": 6522 + }, + { + "epoch": 0.47065189941917096, + "grad_norm": 0.12664173543453217, + "learning_rate": 0.00018118631837205947, + "loss": 0.1368, + "step": 6523 + }, + { + "epoch": 0.4707240520942314, + "grad_norm": 0.1297597885131836, + "learning_rate": 0.00018118343195266273, + "loss": 0.2212, + "step": 6524 + }, + { + "epoch": 0.47079620476929185, + "grad_norm": 0.11439337581396103, + "learning_rate": 0.000181180545533266, + "loss": 0.1603, + "step": 6525 + }, + { + "epoch": 0.47086835744435224, + "grad_norm": 0.1221558228135109, + "learning_rate": 0.00018117765911386925, + "loss": 0.1582, + "step": 6526 + }, + { + "epoch": 0.4709405101194127, + "grad_norm": 0.1027345359325409, + "learning_rate": 0.00018117477269447252, + "loss": 0.1799, + "step": 6527 + }, + { + "epoch": 0.4710126627944731, + "grad_norm": 0.11186765134334564, + "learning_rate": 0.00018117188627507578, + "loss": 0.1518, + "step": 6528 + }, + { + "epoch": 0.4710848154695335, + "grad_norm": 0.1288941502571106, + "learning_rate": 0.00018116899985567904, + "loss": 0.1344, + "step": 6529 + }, + { + "epoch": 0.47115696814459396, + "grad_norm": 0.10255924612283707, + "learning_rate": 0.00018116611343628228, + "loss": 0.107, + "step": 6530 + }, + { + "epoch": 0.4712291208196544, + "grad_norm": 0.11037500202655792, + "learning_rate": 0.00018116322701688557, + "loss": 0.1445, + "step": 6531 + }, + { + "epoch": 0.4713012734947148, + "grad_norm": 0.09189187735319138, + "learning_rate": 0.00018116034059748883, + "loss": 0.1748, + "step": 6532 + }, + { + "epoch": 0.47137342616977523, + "grad_norm": 0.13333386182785034, + "learning_rate": 0.0001811574541780921, + "loss": 0.1718, + "step": 6533 + }, + { + "epoch": 0.4714455788448357, + "grad_norm": 0.14279568195343018, + "learning_rate": 0.00018115456775869536, + "loss": 0.1587, + "step": 6534 + }, + { + "epoch": 0.4715177315198961, + "grad_norm": 0.11661846935749054, + "learning_rate": 0.0001811516813392986, + "loss": 0.1619, + "step": 6535 + }, + { + "epoch": 0.4715898841949565, + "grad_norm": 0.1385808289051056, + "learning_rate": 0.00018114879491990186, + "loss": 0.2019, + "step": 6536 + }, + { + "epoch": 0.47166203687001695, + "grad_norm": 0.11933812499046326, + "learning_rate": 0.00018114590850050512, + "loss": 0.1153, + "step": 6537 + }, + { + "epoch": 0.4717341895450774, + "grad_norm": 0.1256553828716278, + "learning_rate": 0.0001811430220811084, + "loss": 0.1407, + "step": 6538 + }, + { + "epoch": 0.4718063422201378, + "grad_norm": 0.14740781486034393, + "learning_rate": 0.00018114013566171167, + "loss": 0.167, + "step": 6539 + }, + { + "epoch": 0.47187849489519823, + "grad_norm": 0.12701353430747986, + "learning_rate": 0.0001811372492423149, + "loss": 0.1264, + "step": 6540 + }, + { + "epoch": 0.4719506475702587, + "grad_norm": 0.1495954990386963, + "learning_rate": 0.00018113436282291817, + "loss": 0.1844, + "step": 6541 + }, + { + "epoch": 0.4720228002453191, + "grad_norm": 0.11055200546979904, + "learning_rate": 0.00018113147640352143, + "loss": 0.1683, + "step": 6542 + }, + { + "epoch": 0.4720949529203795, + "grad_norm": 0.18256966769695282, + "learning_rate": 0.0001811285899841247, + "loss": 0.1395, + "step": 6543 + }, + { + "epoch": 0.47216710559543995, + "grad_norm": 0.1399894505739212, + "learning_rate": 0.00018112570356472796, + "loss": 0.1742, + "step": 6544 + }, + { + "epoch": 0.4722392582705004, + "grad_norm": 0.11353211849927902, + "learning_rate": 0.00018112281714533122, + "loss": 0.1333, + "step": 6545 + }, + { + "epoch": 0.4723114109455608, + "grad_norm": 0.14108607172966003, + "learning_rate": 0.00018111993072593449, + "loss": 0.2102, + "step": 6546 + }, + { + "epoch": 0.4723835636206212, + "grad_norm": 0.12063361704349518, + "learning_rate": 0.00018111704430653775, + "loss": 0.1504, + "step": 6547 + }, + { + "epoch": 0.47245571629568167, + "grad_norm": 0.1718057245016098, + "learning_rate": 0.000181114157887141, + "loss": 0.19, + "step": 6548 + }, + { + "epoch": 0.4725278689707421, + "grad_norm": 0.10668683797121048, + "learning_rate": 0.00018111127146774427, + "loss": 0.1488, + "step": 6549 + }, + { + "epoch": 0.4726000216458025, + "grad_norm": 0.09043119847774506, + "learning_rate": 0.00018110838504834754, + "loss": 0.1434, + "step": 6550 + }, + { + "epoch": 0.47267217432086295, + "grad_norm": 0.10747907310724258, + "learning_rate": 0.0001811054986289508, + "loss": 0.1596, + "step": 6551 + }, + { + "epoch": 0.4727443269959234, + "grad_norm": 0.11078695952892303, + "learning_rate": 0.00018110261220955406, + "loss": 0.1629, + "step": 6552 + }, + { + "epoch": 0.4728164796709838, + "grad_norm": 0.1163090318441391, + "learning_rate": 0.00018109972579015733, + "loss": 0.178, + "step": 6553 + }, + { + "epoch": 0.4728886323460442, + "grad_norm": 0.11894099414348602, + "learning_rate": 0.0001810968393707606, + "loss": 0.1557, + "step": 6554 + }, + { + "epoch": 0.47296078502110467, + "grad_norm": 0.10437697172164917, + "learning_rate": 0.00018109395295136385, + "loss": 0.1197, + "step": 6555 + }, + { + "epoch": 0.4730329376961651, + "grad_norm": 0.10267592966556549, + "learning_rate": 0.00018109106653196711, + "loss": 0.1778, + "step": 6556 + }, + { + "epoch": 0.4731050903712255, + "grad_norm": 0.11590174585580826, + "learning_rate": 0.00018108818011257035, + "loss": 0.1882, + "step": 6557 + }, + { + "epoch": 0.47317724304628594, + "grad_norm": 0.11858008056879044, + "learning_rate": 0.00018108529369317361, + "loss": 0.1707, + "step": 6558 + }, + { + "epoch": 0.4732493957213464, + "grad_norm": 0.11694855242967606, + "learning_rate": 0.0001810824072737769, + "loss": 0.1728, + "step": 6559 + }, + { + "epoch": 0.4733215483964068, + "grad_norm": 0.12493687868118286, + "learning_rate": 0.00018107952085438017, + "loss": 0.1962, + "step": 6560 + }, + { + "epoch": 0.4733937010714672, + "grad_norm": 0.12782999873161316, + "learning_rate": 0.00018107663443498343, + "loss": 0.1692, + "step": 6561 + }, + { + "epoch": 0.47346585374652767, + "grad_norm": 0.12364133447408676, + "learning_rate": 0.00018107374801558667, + "loss": 0.1231, + "step": 6562 + }, + { + "epoch": 0.47353800642158805, + "grad_norm": 0.12436700612306595, + "learning_rate": 0.00018107086159618993, + "loss": 0.1225, + "step": 6563 + }, + { + "epoch": 0.4736101590966485, + "grad_norm": 0.1342078149318695, + "learning_rate": 0.0001810679751767932, + "loss": 0.1363, + "step": 6564 + }, + { + "epoch": 0.47368231177170894, + "grad_norm": 0.15493200719356537, + "learning_rate": 0.00018106508875739645, + "loss": 0.1801, + "step": 6565 + }, + { + "epoch": 0.4737544644467694, + "grad_norm": 0.13285429775714874, + "learning_rate": 0.00018106220233799972, + "loss": 0.1501, + "step": 6566 + }, + { + "epoch": 0.4738266171218298, + "grad_norm": 0.15981805324554443, + "learning_rate": 0.00018105931591860298, + "loss": 0.2127, + "step": 6567 + }, + { + "epoch": 0.4738987697968902, + "grad_norm": 0.12443274259567261, + "learning_rate": 0.00018105642949920624, + "loss": 0.1572, + "step": 6568 + }, + { + "epoch": 0.47397092247195066, + "grad_norm": 0.16868320107460022, + "learning_rate": 0.0001810535430798095, + "loss": 0.1776, + "step": 6569 + }, + { + "epoch": 0.47404307514701105, + "grad_norm": 0.11300229281187057, + "learning_rate": 0.00018105065666041277, + "loss": 0.1331, + "step": 6570 + }, + { + "epoch": 0.4741152278220715, + "grad_norm": 0.1066485121846199, + "learning_rate": 0.00018104777024101603, + "loss": 0.1507, + "step": 6571 + }, + { + "epoch": 0.47418738049713194, + "grad_norm": 0.11548085510730743, + "learning_rate": 0.0001810448838216193, + "loss": 0.1868, + "step": 6572 + }, + { + "epoch": 0.4742595331721924, + "grad_norm": 0.11486897617578506, + "learning_rate": 0.00018104199740222253, + "loss": 0.152, + "step": 6573 + }, + { + "epoch": 0.47433168584725277, + "grad_norm": 0.1293863207101822, + "learning_rate": 0.00018103911098282582, + "loss": 0.1139, + "step": 6574 + }, + { + "epoch": 0.4744038385223132, + "grad_norm": 0.09505106508731842, + "learning_rate": 0.00018103622456342908, + "loss": 0.1714, + "step": 6575 + }, + { + "epoch": 0.47447599119737366, + "grad_norm": 0.11848371475934982, + "learning_rate": 0.00018103333814403235, + "loss": 0.1933, + "step": 6576 + }, + { + "epoch": 0.47454814387243405, + "grad_norm": 0.11045391112565994, + "learning_rate": 0.0001810304517246356, + "loss": 0.1315, + "step": 6577 + }, + { + "epoch": 0.4746202965474945, + "grad_norm": 0.09907637536525726, + "learning_rate": 0.00018102756530523884, + "loss": 0.134, + "step": 6578 + }, + { + "epoch": 0.47469244922255494, + "grad_norm": 0.11193614453077316, + "learning_rate": 0.0001810246788858421, + "loss": 0.1517, + "step": 6579 + }, + { + "epoch": 0.4747646018976154, + "grad_norm": 0.10982697457075119, + "learning_rate": 0.00018102179246644537, + "loss": 0.1708, + "step": 6580 + }, + { + "epoch": 0.47483675457267577, + "grad_norm": 0.12854880094528198, + "learning_rate": 0.00018101890604704866, + "loss": 0.1961, + "step": 6581 + }, + { + "epoch": 0.4749089072477362, + "grad_norm": 0.1719358265399933, + "learning_rate": 0.00018101601962765192, + "loss": 0.1382, + "step": 6582 + }, + { + "epoch": 0.47498105992279666, + "grad_norm": 0.12974786758422852, + "learning_rate": 0.00018101313320825516, + "loss": 0.137, + "step": 6583 + }, + { + "epoch": 0.47505321259785704, + "grad_norm": 0.11269278079271317, + "learning_rate": 0.00018101024678885842, + "loss": 0.1494, + "step": 6584 + }, + { + "epoch": 0.4751253652729175, + "grad_norm": 0.11579824239015579, + "learning_rate": 0.00018100736036946169, + "loss": 0.1704, + "step": 6585 + }, + { + "epoch": 0.47519751794797793, + "grad_norm": 0.11576331406831741, + "learning_rate": 0.00018100447395006495, + "loss": 0.1707, + "step": 6586 + }, + { + "epoch": 0.4752696706230384, + "grad_norm": 0.10914190113544464, + "learning_rate": 0.0001810015875306682, + "loss": 0.1695, + "step": 6587 + }, + { + "epoch": 0.47534182329809876, + "grad_norm": 0.19466876983642578, + "learning_rate": 0.00018099870111127147, + "loss": 0.1266, + "step": 6588 + }, + { + "epoch": 0.4754139759731592, + "grad_norm": 0.1322767287492752, + "learning_rate": 0.00018099581469187474, + "loss": 0.1713, + "step": 6589 + }, + { + "epoch": 0.47548612864821965, + "grad_norm": 0.13432395458221436, + "learning_rate": 0.000180992928272478, + "loss": 0.1327, + "step": 6590 + }, + { + "epoch": 0.47555828132328004, + "grad_norm": 0.11618776619434357, + "learning_rate": 0.00018099004185308126, + "loss": 0.1659, + "step": 6591 + }, + { + "epoch": 0.4756304339983405, + "grad_norm": 0.13345777988433838, + "learning_rate": 0.00018098715543368453, + "loss": 0.1703, + "step": 6592 + }, + { + "epoch": 0.47570258667340093, + "grad_norm": 0.19079522788524628, + "learning_rate": 0.0001809842690142878, + "loss": 0.1992, + "step": 6593 + }, + { + "epoch": 0.4757747393484613, + "grad_norm": 0.16293351352214813, + "learning_rate": 0.00018098138259489102, + "loss": 0.1404, + "step": 6594 + }, + { + "epoch": 0.47584689202352176, + "grad_norm": 0.1540932059288025, + "learning_rate": 0.00018097849617549431, + "loss": 0.2358, + "step": 6595 + }, + { + "epoch": 0.4759190446985822, + "grad_norm": 0.12129596620798111, + "learning_rate": 0.00018097560975609758, + "loss": 0.1355, + "step": 6596 + }, + { + "epoch": 0.47599119737364265, + "grad_norm": 0.12973016500473022, + "learning_rate": 0.00018097272333670084, + "loss": 0.1235, + "step": 6597 + }, + { + "epoch": 0.47606335004870304, + "grad_norm": 0.12584935128688812, + "learning_rate": 0.0001809698369173041, + "loss": 0.1781, + "step": 6598 + }, + { + "epoch": 0.4761355027237635, + "grad_norm": 0.14946849644184113, + "learning_rate": 0.00018096695049790734, + "loss": 0.1754, + "step": 6599 + }, + { + "epoch": 0.4762076553988239, + "grad_norm": 0.11293647438287735, + "learning_rate": 0.0001809640640785106, + "loss": 0.142, + "step": 6600 + }, + { + "epoch": 0.4762798080738843, + "grad_norm": 0.09187892824411392, + "learning_rate": 0.00018096117765911387, + "loss": 0.1148, + "step": 6601 + }, + { + "epoch": 0.47635196074894476, + "grad_norm": 0.14158938825130463, + "learning_rate": 0.00018095829123971716, + "loss": 0.1556, + "step": 6602 + }, + { + "epoch": 0.4764241134240052, + "grad_norm": 0.14010418951511383, + "learning_rate": 0.00018095540482032042, + "loss": 0.1418, + "step": 6603 + }, + { + "epoch": 0.47649626609906565, + "grad_norm": 0.11662571877241135, + "learning_rate": 0.00018095251840092365, + "loss": 0.1528, + "step": 6604 + }, + { + "epoch": 0.47656841877412603, + "grad_norm": 0.18299691379070282, + "learning_rate": 0.00018094963198152692, + "loss": 0.1936, + "step": 6605 + }, + { + "epoch": 0.4766405714491865, + "grad_norm": 0.11245797574520111, + "learning_rate": 0.00018094674556213018, + "loss": 0.1748, + "step": 6606 + }, + { + "epoch": 0.4767127241242469, + "grad_norm": 0.1389348804950714, + "learning_rate": 0.00018094385914273344, + "loss": 0.1184, + "step": 6607 + }, + { + "epoch": 0.4767848767993073, + "grad_norm": 0.12851954996585846, + "learning_rate": 0.0001809409727233367, + "loss": 0.1642, + "step": 6608 + }, + { + "epoch": 0.47685702947436776, + "grad_norm": 0.16452080011367798, + "learning_rate": 0.00018093808630393997, + "loss": 0.1655, + "step": 6609 + }, + { + "epoch": 0.4769291821494282, + "grad_norm": 0.14606648683547974, + "learning_rate": 0.00018093519988454323, + "loss": 0.1588, + "step": 6610 + }, + { + "epoch": 0.47700133482448864, + "grad_norm": 0.13718731701374054, + "learning_rate": 0.0001809323134651465, + "loss": 0.1402, + "step": 6611 + }, + { + "epoch": 0.47707348749954903, + "grad_norm": 0.11560270190238953, + "learning_rate": 0.00018092942704574976, + "loss": 0.1626, + "step": 6612 + }, + { + "epoch": 0.4771456401746095, + "grad_norm": 0.1626240760087967, + "learning_rate": 0.00018092654062635302, + "loss": 0.1752, + "step": 6613 + }, + { + "epoch": 0.4772177928496699, + "grad_norm": 0.15158993005752563, + "learning_rate": 0.00018092365420695628, + "loss": 0.1806, + "step": 6614 + }, + { + "epoch": 0.4772899455247303, + "grad_norm": 0.1264626383781433, + "learning_rate": 0.00018092076778755952, + "loss": 0.1171, + "step": 6615 + }, + { + "epoch": 0.47736209819979075, + "grad_norm": 0.1221628487110138, + "learning_rate": 0.0001809178813681628, + "loss": 0.1555, + "step": 6616 + }, + { + "epoch": 0.4774342508748512, + "grad_norm": 0.12342524528503418, + "learning_rate": 0.00018091499494876607, + "loss": 0.1459, + "step": 6617 + }, + { + "epoch": 0.47750640354991164, + "grad_norm": 0.1038442999124527, + "learning_rate": 0.00018091210852936933, + "loss": 0.1378, + "step": 6618 + }, + { + "epoch": 0.47757855622497203, + "grad_norm": 0.1145162507891655, + "learning_rate": 0.0001809092221099726, + "loss": 0.1901, + "step": 6619 + }, + { + "epoch": 0.4776507089000325, + "grad_norm": 0.12268324196338654, + "learning_rate": 0.00018090633569057583, + "loss": 0.1643, + "step": 6620 + }, + { + "epoch": 0.4777228615750929, + "grad_norm": 0.12261956185102463, + "learning_rate": 0.0001809034492711791, + "loss": 0.1541, + "step": 6621 + }, + { + "epoch": 0.4777950142501533, + "grad_norm": 0.12539663910865784, + "learning_rate": 0.00018090056285178236, + "loss": 0.1705, + "step": 6622 + }, + { + "epoch": 0.47786716692521375, + "grad_norm": 0.14309175312519073, + "learning_rate": 0.00018089767643238565, + "loss": 0.1829, + "step": 6623 + }, + { + "epoch": 0.4779393196002742, + "grad_norm": 0.11351905018091202, + "learning_rate": 0.0001808947900129889, + "loss": 0.1625, + "step": 6624 + }, + { + "epoch": 0.4780114722753346, + "grad_norm": 0.1390398144721985, + "learning_rate": 0.00018089190359359215, + "loss": 0.167, + "step": 6625 + }, + { + "epoch": 0.478083624950395, + "grad_norm": 0.12186425924301147, + "learning_rate": 0.0001808890171741954, + "loss": 0.1163, + "step": 6626 + }, + { + "epoch": 0.47815577762545547, + "grad_norm": 0.11824537813663483, + "learning_rate": 0.00018088613075479867, + "loss": 0.1289, + "step": 6627 + }, + { + "epoch": 0.4782279303005159, + "grad_norm": 0.11230827867984772, + "learning_rate": 0.00018088324433540194, + "loss": 0.1711, + "step": 6628 + }, + { + "epoch": 0.4783000829755763, + "grad_norm": 0.14109791815280914, + "learning_rate": 0.0001808803579160052, + "loss": 0.1551, + "step": 6629 + }, + { + "epoch": 0.47837223565063675, + "grad_norm": 0.12549245357513428, + "learning_rate": 0.00018087747149660846, + "loss": 0.1696, + "step": 6630 + }, + { + "epoch": 0.4784443883256972, + "grad_norm": 0.11947820335626602, + "learning_rate": 0.00018087458507721173, + "loss": 0.1483, + "step": 6631 + }, + { + "epoch": 0.4785165410007576, + "grad_norm": 0.14599832892417908, + "learning_rate": 0.000180871698657815, + "loss": 0.1937, + "step": 6632 + }, + { + "epoch": 0.478588693675818, + "grad_norm": 0.15094219148159027, + "learning_rate": 0.00018086881223841825, + "loss": 0.1848, + "step": 6633 + }, + { + "epoch": 0.47866084635087847, + "grad_norm": 0.10668916255235672, + "learning_rate": 0.00018086592581902151, + "loss": 0.1225, + "step": 6634 + }, + { + "epoch": 0.4787329990259389, + "grad_norm": 0.13218899071216583, + "learning_rate": 0.00018086303939962478, + "loss": 0.1547, + "step": 6635 + }, + { + "epoch": 0.4788051517009993, + "grad_norm": 0.10088010877370834, + "learning_rate": 0.000180860152980228, + "loss": 0.2136, + "step": 6636 + }, + { + "epoch": 0.47887730437605974, + "grad_norm": 0.13138513267040253, + "learning_rate": 0.0001808572665608313, + "loss": 0.1611, + "step": 6637 + }, + { + "epoch": 0.4789494570511202, + "grad_norm": 0.11981962621212006, + "learning_rate": 0.00018085438014143457, + "loss": 0.1709, + "step": 6638 + }, + { + "epoch": 0.4790216097261806, + "grad_norm": 0.10274869203567505, + "learning_rate": 0.00018085149372203783, + "loss": 0.1538, + "step": 6639 + }, + { + "epoch": 0.479093762401241, + "grad_norm": 0.12627063691616058, + "learning_rate": 0.0001808486073026411, + "loss": 0.1435, + "step": 6640 + }, + { + "epoch": 0.47916591507630146, + "grad_norm": 0.1629066914319992, + "learning_rate": 0.00018084572088324433, + "loss": 0.1687, + "step": 6641 + }, + { + "epoch": 0.4792380677513619, + "grad_norm": 0.11205365508794785, + "learning_rate": 0.0001808428344638476, + "loss": 0.1415, + "step": 6642 + }, + { + "epoch": 0.4793102204264223, + "grad_norm": 0.11279461532831192, + "learning_rate": 0.00018083994804445085, + "loss": 0.1365, + "step": 6643 + }, + { + "epoch": 0.47938237310148274, + "grad_norm": 0.14440464973449707, + "learning_rate": 0.00018083706162505414, + "loss": 0.1621, + "step": 6644 + }, + { + "epoch": 0.4794545257765432, + "grad_norm": 0.10292355716228485, + "learning_rate": 0.0001808341752056574, + "loss": 0.1742, + "step": 6645 + }, + { + "epoch": 0.47952667845160357, + "grad_norm": 0.16559121012687683, + "learning_rate": 0.00018083128878626064, + "loss": 0.1389, + "step": 6646 + }, + { + "epoch": 0.479598831126664, + "grad_norm": 0.1599855273962021, + "learning_rate": 0.0001808284023668639, + "loss": 0.1418, + "step": 6647 + }, + { + "epoch": 0.47967098380172446, + "grad_norm": 0.12400344759225845, + "learning_rate": 0.00018082551594746717, + "loss": 0.1426, + "step": 6648 + }, + { + "epoch": 0.4797431364767849, + "grad_norm": 0.20110811293125153, + "learning_rate": 0.00018082262952807043, + "loss": 0.1622, + "step": 6649 + }, + { + "epoch": 0.4798152891518453, + "grad_norm": 0.1190720945596695, + "learning_rate": 0.0001808197431086737, + "loss": 0.1328, + "step": 6650 + }, + { + "epoch": 0.47988744182690574, + "grad_norm": 0.1138315349817276, + "learning_rate": 0.00018081685668927696, + "loss": 0.136, + "step": 6651 + }, + { + "epoch": 0.4799595945019662, + "grad_norm": 0.10997669398784637, + "learning_rate": 0.00018081397026988022, + "loss": 0.1949, + "step": 6652 + }, + { + "epoch": 0.48003174717702657, + "grad_norm": 0.14533071219921112, + "learning_rate": 0.00018081108385048348, + "loss": 0.2486, + "step": 6653 + }, + { + "epoch": 0.480103899852087, + "grad_norm": 0.10433954745531082, + "learning_rate": 0.00018080819743108675, + "loss": 0.1373, + "step": 6654 + }, + { + "epoch": 0.48017605252714746, + "grad_norm": 0.11102496832609177, + "learning_rate": 0.00018080531101169, + "loss": 0.1293, + "step": 6655 + }, + { + "epoch": 0.48024820520220785, + "grad_norm": 0.11135412007570267, + "learning_rate": 0.00018080242459229327, + "loss": 0.1655, + "step": 6656 + }, + { + "epoch": 0.4803203578772683, + "grad_norm": 0.12301556020975113, + "learning_rate": 0.00018079953817289653, + "loss": 0.1613, + "step": 6657 + }, + { + "epoch": 0.48039251055232873, + "grad_norm": 0.11578719317913055, + "learning_rate": 0.0001807966517534998, + "loss": 0.1921, + "step": 6658 + }, + { + "epoch": 0.4804646632273892, + "grad_norm": 0.137067511677742, + "learning_rate": 0.00018079376533410306, + "loss": 0.1792, + "step": 6659 + }, + { + "epoch": 0.48053681590244957, + "grad_norm": 0.1312975287437439, + "learning_rate": 0.00018079087891470632, + "loss": 0.1593, + "step": 6660 + }, + { + "epoch": 0.48060896857751, + "grad_norm": 0.19008010625839233, + "learning_rate": 0.00018078799249530959, + "loss": 0.1669, + "step": 6661 + }, + { + "epoch": 0.48068112125257045, + "grad_norm": 0.10435241460800171, + "learning_rate": 0.00018078510607591285, + "loss": 0.1198, + "step": 6662 + }, + { + "epoch": 0.48075327392763084, + "grad_norm": 0.11011923849582672, + "learning_rate": 0.00018078221965651608, + "loss": 0.1744, + "step": 6663 + }, + { + "epoch": 0.4808254266026913, + "grad_norm": 0.12781675159931183, + "learning_rate": 0.00018077933323711935, + "loss": 0.1985, + "step": 6664 + }, + { + "epoch": 0.48089757927775173, + "grad_norm": 0.13984736800193787, + "learning_rate": 0.00018077644681772264, + "loss": 0.192, + "step": 6665 + }, + { + "epoch": 0.4809697319528122, + "grad_norm": 0.12048052996397018, + "learning_rate": 0.0001807735603983259, + "loss": 0.1622, + "step": 6666 + }, + { + "epoch": 0.48104188462787256, + "grad_norm": 0.10933246463537216, + "learning_rate": 0.00018077067397892916, + "loss": 0.0976, + "step": 6667 + }, + { + "epoch": 0.481114037302933, + "grad_norm": 0.11265300214290619, + "learning_rate": 0.0001807677875595324, + "loss": 0.1056, + "step": 6668 + }, + { + "epoch": 0.48118618997799345, + "grad_norm": 0.12599968910217285, + "learning_rate": 0.00018076490114013566, + "loss": 0.1385, + "step": 6669 + }, + { + "epoch": 0.48125834265305384, + "grad_norm": 0.15781208872795105, + "learning_rate": 0.00018076201472073893, + "loss": 0.1562, + "step": 6670 + }, + { + "epoch": 0.4813304953281143, + "grad_norm": 0.14753875136375427, + "learning_rate": 0.0001807591283013422, + "loss": 0.1207, + "step": 6671 + }, + { + "epoch": 0.4814026480031747, + "grad_norm": 0.1585770696401596, + "learning_rate": 0.00018075624188194548, + "loss": 0.1114, + "step": 6672 + }, + { + "epoch": 0.48147480067823517, + "grad_norm": 0.13444659113883972, + "learning_rate": 0.00018075335546254871, + "loss": 0.1646, + "step": 6673 + }, + { + "epoch": 0.48154695335329556, + "grad_norm": 0.1050247773528099, + "learning_rate": 0.00018075046904315198, + "loss": 0.1646, + "step": 6674 + }, + { + "epoch": 0.481619106028356, + "grad_norm": 0.12049075961112976, + "learning_rate": 0.00018074758262375524, + "loss": 0.1866, + "step": 6675 + }, + { + "epoch": 0.48169125870341645, + "grad_norm": 0.16078917682170868, + "learning_rate": 0.0001807446962043585, + "loss": 0.1709, + "step": 6676 + }, + { + "epoch": 0.48176341137847684, + "grad_norm": 0.1396174281835556, + "learning_rate": 0.00018074180978496177, + "loss": 0.1639, + "step": 6677 + }, + { + "epoch": 0.4818355640535373, + "grad_norm": 0.13774073123931885, + "learning_rate": 0.00018073892336556503, + "loss": 0.1683, + "step": 6678 + }, + { + "epoch": 0.4819077167285977, + "grad_norm": 0.1184118315577507, + "learning_rate": 0.0001807360369461683, + "loss": 0.1978, + "step": 6679 + }, + { + "epoch": 0.48197986940365817, + "grad_norm": 0.11098407953977585, + "learning_rate": 0.00018073315052677155, + "loss": 0.1413, + "step": 6680 + }, + { + "epoch": 0.48205202207871856, + "grad_norm": 0.10108431428670883, + "learning_rate": 0.00018073026410737482, + "loss": 0.1093, + "step": 6681 + }, + { + "epoch": 0.482124174753779, + "grad_norm": 0.1013457179069519, + "learning_rate": 0.00018072737768797808, + "loss": 0.118, + "step": 6682 + }, + { + "epoch": 0.48219632742883944, + "grad_norm": 0.1206645593047142, + "learning_rate": 0.00018072449126858134, + "loss": 0.1623, + "step": 6683 + }, + { + "epoch": 0.48226848010389983, + "grad_norm": 0.09064919501543045, + "learning_rate": 0.00018072160484918458, + "loss": 0.1879, + "step": 6684 + }, + { + "epoch": 0.4823406327789603, + "grad_norm": 0.14269712567329407, + "learning_rate": 0.00018071871842978784, + "loss": 0.1547, + "step": 6685 + }, + { + "epoch": 0.4824127854540207, + "grad_norm": 0.16197986900806427, + "learning_rate": 0.00018071583201039113, + "loss": 0.1873, + "step": 6686 + }, + { + "epoch": 0.4824849381290811, + "grad_norm": 0.1269264966249466, + "learning_rate": 0.0001807129455909944, + "loss": 0.1053, + "step": 6687 + }, + { + "epoch": 0.48255709080414155, + "grad_norm": 0.15127058327198029, + "learning_rate": 0.00018071005917159766, + "loss": 0.2106, + "step": 6688 + }, + { + "epoch": 0.482629243479202, + "grad_norm": 0.13187997043132782, + "learning_rate": 0.0001807071727522009, + "loss": 0.1533, + "step": 6689 + }, + { + "epoch": 0.48270139615426244, + "grad_norm": 0.12021167576313019, + "learning_rate": 0.00018070428633280416, + "loss": 0.1604, + "step": 6690 + }, + { + "epoch": 0.48277354882932283, + "grad_norm": 0.13101136684417725, + "learning_rate": 0.00018070139991340742, + "loss": 0.1488, + "step": 6691 + }, + { + "epoch": 0.4828457015043833, + "grad_norm": 0.09323979914188385, + "learning_rate": 0.00018069851349401068, + "loss": 0.1079, + "step": 6692 + }, + { + "epoch": 0.4829178541794437, + "grad_norm": 0.11968117207288742, + "learning_rate": 0.00018069562707461397, + "loss": 0.1316, + "step": 6693 + }, + { + "epoch": 0.4829900068545041, + "grad_norm": 0.12057603150606155, + "learning_rate": 0.0001806927406552172, + "loss": 0.135, + "step": 6694 + }, + { + "epoch": 0.48306215952956455, + "grad_norm": 0.10026104003190994, + "learning_rate": 0.00018068985423582047, + "loss": 0.1274, + "step": 6695 + }, + { + "epoch": 0.483134312204625, + "grad_norm": 0.08938336372375488, + "learning_rate": 0.00018068696781642373, + "loss": 0.1454, + "step": 6696 + }, + { + "epoch": 0.48320646487968544, + "grad_norm": 0.10528580099344254, + "learning_rate": 0.000180684081397027, + "loss": 0.163, + "step": 6697 + }, + { + "epoch": 0.4832786175547458, + "grad_norm": 0.10674149543046951, + "learning_rate": 0.00018068119497763026, + "loss": 0.1223, + "step": 6698 + }, + { + "epoch": 0.48335077022980627, + "grad_norm": 0.12780793011188507, + "learning_rate": 0.00018067830855823352, + "loss": 0.1572, + "step": 6699 + }, + { + "epoch": 0.4834229229048667, + "grad_norm": 0.11604584753513336, + "learning_rate": 0.00018067542213883679, + "loss": 0.1203, + "step": 6700 + }, + { + "epoch": 0.4834950755799271, + "grad_norm": 0.1347905695438385, + "learning_rate": 0.00018067253571944005, + "loss": 0.2002, + "step": 6701 + }, + { + "epoch": 0.48356722825498755, + "grad_norm": 0.1396912932395935, + "learning_rate": 0.0001806696493000433, + "loss": 0.171, + "step": 6702 + }, + { + "epoch": 0.483639380930048, + "grad_norm": 0.11245424300432205, + "learning_rate": 0.00018066676288064657, + "loss": 0.1436, + "step": 6703 + }, + { + "epoch": 0.48371153360510843, + "grad_norm": 0.12426016479730606, + "learning_rate": 0.00018066387646124984, + "loss": 0.1466, + "step": 6704 + }, + { + "epoch": 0.4837836862801688, + "grad_norm": 0.11430709064006805, + "learning_rate": 0.00018066099004185307, + "loss": 0.1435, + "step": 6705 + }, + { + "epoch": 0.48385583895522927, + "grad_norm": 0.09857946634292603, + "learning_rate": 0.00018065810362245634, + "loss": 0.1142, + "step": 6706 + }, + { + "epoch": 0.4839279916302897, + "grad_norm": 0.111849345266819, + "learning_rate": 0.00018065521720305963, + "loss": 0.1343, + "step": 6707 + }, + { + "epoch": 0.4840001443053501, + "grad_norm": 0.13994166254997253, + "learning_rate": 0.0001806523307836629, + "loss": 0.1503, + "step": 6708 + }, + { + "epoch": 0.48407229698041054, + "grad_norm": 0.14901243150234222, + "learning_rate": 0.00018064944436426615, + "loss": 0.1474, + "step": 6709 + }, + { + "epoch": 0.484144449655471, + "grad_norm": 0.1526750922203064, + "learning_rate": 0.0001806465579448694, + "loss": 0.1894, + "step": 6710 + }, + { + "epoch": 0.48421660233053143, + "grad_norm": 0.19044718146324158, + "learning_rate": 0.00018064367152547265, + "loss": 0.1503, + "step": 6711 + }, + { + "epoch": 0.4842887550055918, + "grad_norm": 0.12334035336971283, + "learning_rate": 0.00018064078510607591, + "loss": 0.202, + "step": 6712 + }, + { + "epoch": 0.48436090768065226, + "grad_norm": 0.11487280577421188, + "learning_rate": 0.00018063789868667918, + "loss": 0.1566, + "step": 6713 + }, + { + "epoch": 0.4844330603557127, + "grad_norm": 0.10991586744785309, + "learning_rate": 0.00018063501226728247, + "loss": 0.1498, + "step": 6714 + }, + { + "epoch": 0.4845052130307731, + "grad_norm": 0.12430540472269058, + "learning_rate": 0.0001806321258478857, + "loss": 0.1515, + "step": 6715 + }, + { + "epoch": 0.48457736570583354, + "grad_norm": 0.14053206145763397, + "learning_rate": 0.00018062923942848897, + "loss": 0.1809, + "step": 6716 + }, + { + "epoch": 0.484649518380894, + "grad_norm": 0.10123886913061142, + "learning_rate": 0.00018062635300909223, + "loss": 0.1721, + "step": 6717 + }, + { + "epoch": 0.4847216710559544, + "grad_norm": 0.18325616419315338, + "learning_rate": 0.0001806234665896955, + "loss": 0.1395, + "step": 6718 + }, + { + "epoch": 0.4847938237310148, + "grad_norm": 0.10586833953857422, + "learning_rate": 0.00018062058017029875, + "loss": 0.1674, + "step": 6719 + }, + { + "epoch": 0.48486597640607526, + "grad_norm": 0.11999451369047165, + "learning_rate": 0.00018061769375090202, + "loss": 0.2388, + "step": 6720 + }, + { + "epoch": 0.4849381290811357, + "grad_norm": 0.12448670715093613, + "learning_rate": 0.00018061480733150528, + "loss": 0.1352, + "step": 6721 + }, + { + "epoch": 0.4850102817561961, + "grad_norm": 0.11156656593084335, + "learning_rate": 0.00018061192091210854, + "loss": 0.1393, + "step": 6722 + }, + { + "epoch": 0.48508243443125654, + "grad_norm": 0.1126633808016777, + "learning_rate": 0.0001806090344927118, + "loss": 0.1515, + "step": 6723 + }, + { + "epoch": 0.485154587106317, + "grad_norm": 0.1979413479566574, + "learning_rate": 0.00018060614807331507, + "loss": 0.1535, + "step": 6724 + }, + { + "epoch": 0.48522673978137737, + "grad_norm": 0.12131483107805252, + "learning_rate": 0.00018060326165391833, + "loss": 0.1885, + "step": 6725 + }, + { + "epoch": 0.4852988924564378, + "grad_norm": 0.1307258903980255, + "learning_rate": 0.00018060037523452157, + "loss": 0.1509, + "step": 6726 + }, + { + "epoch": 0.48537104513149826, + "grad_norm": 0.12998425960540771, + "learning_rate": 0.00018059748881512483, + "loss": 0.1277, + "step": 6727 + }, + { + "epoch": 0.4854431978065587, + "grad_norm": 0.13239721953868866, + "learning_rate": 0.00018059460239572812, + "loss": 0.2174, + "step": 6728 + }, + { + "epoch": 0.4855153504816191, + "grad_norm": 0.16458624601364136, + "learning_rate": 0.00018059171597633138, + "loss": 0.1372, + "step": 6729 + }, + { + "epoch": 0.48558750315667953, + "grad_norm": 0.18854382634162903, + "learning_rate": 0.00018058882955693465, + "loss": 0.1805, + "step": 6730 + }, + { + "epoch": 0.48565965583174, + "grad_norm": 0.1215885728597641, + "learning_rate": 0.00018058594313753788, + "loss": 0.1563, + "step": 6731 + }, + { + "epoch": 0.48573180850680037, + "grad_norm": 0.1417505294084549, + "learning_rate": 0.00018058305671814115, + "loss": 0.2168, + "step": 6732 + }, + { + "epoch": 0.4858039611818608, + "grad_norm": 0.138729989528656, + "learning_rate": 0.0001805801702987444, + "loss": 0.1613, + "step": 6733 + }, + { + "epoch": 0.48587611385692125, + "grad_norm": 0.12912528216838837, + "learning_rate": 0.00018057728387934767, + "loss": 0.1352, + "step": 6734 + }, + { + "epoch": 0.4859482665319817, + "grad_norm": 0.13241249322891235, + "learning_rate": 0.00018057439745995096, + "loss": 0.1539, + "step": 6735 + }, + { + "epoch": 0.4860204192070421, + "grad_norm": 0.09461740404367447, + "learning_rate": 0.0001805715110405542, + "loss": 0.1591, + "step": 6736 + }, + { + "epoch": 0.48609257188210253, + "grad_norm": 0.11101723462343216, + "learning_rate": 0.00018056862462115746, + "loss": 0.1714, + "step": 6737 + }, + { + "epoch": 0.486164724557163, + "grad_norm": 0.11402534693479538, + "learning_rate": 0.00018056573820176072, + "loss": 0.133, + "step": 6738 + }, + { + "epoch": 0.48623687723222336, + "grad_norm": 0.12268409132957458, + "learning_rate": 0.00018056285178236399, + "loss": 0.171, + "step": 6739 + }, + { + "epoch": 0.4863090299072838, + "grad_norm": 0.11872043460607529, + "learning_rate": 0.00018055996536296725, + "loss": 0.1358, + "step": 6740 + }, + { + "epoch": 0.48638118258234425, + "grad_norm": 0.18621380627155304, + "learning_rate": 0.0001805570789435705, + "loss": 0.1265, + "step": 6741 + }, + { + "epoch": 0.4864533352574047, + "grad_norm": 0.14223319292068481, + "learning_rate": 0.00018055419252417377, + "loss": 0.1406, + "step": 6742 + }, + { + "epoch": 0.4865254879324651, + "grad_norm": 0.1255837380886078, + "learning_rate": 0.00018055130610477704, + "loss": 0.1574, + "step": 6743 + }, + { + "epoch": 0.48659764060752553, + "grad_norm": 0.11430840194225311, + "learning_rate": 0.0001805484196853803, + "loss": 0.1556, + "step": 6744 + }, + { + "epoch": 0.48666979328258597, + "grad_norm": 0.14644275605678558, + "learning_rate": 0.00018054553326598356, + "loss": 0.1527, + "step": 6745 + }, + { + "epoch": 0.48674194595764636, + "grad_norm": 0.12583127617835999, + "learning_rate": 0.00018054264684658683, + "loss": 0.1432, + "step": 6746 + }, + { + "epoch": 0.4868140986327068, + "grad_norm": 0.10729315131902695, + "learning_rate": 0.00018053976042719006, + "loss": 0.0996, + "step": 6747 + }, + { + "epoch": 0.48688625130776725, + "grad_norm": 0.10388250648975372, + "learning_rate": 0.00018053687400779332, + "loss": 0.1448, + "step": 6748 + }, + { + "epoch": 0.48695840398282764, + "grad_norm": 0.1295686513185501, + "learning_rate": 0.00018053398758839661, + "loss": 0.1811, + "step": 6749 + }, + { + "epoch": 0.4870305566578881, + "grad_norm": 0.12884283065795898, + "learning_rate": 0.00018053110116899988, + "loss": 0.1848, + "step": 6750 + }, + { + "epoch": 0.4871027093329485, + "grad_norm": 0.16300494968891144, + "learning_rate": 0.00018052821474960314, + "loss": 0.1611, + "step": 6751 + }, + { + "epoch": 0.48717486200800897, + "grad_norm": 0.10789994150400162, + "learning_rate": 0.00018052532833020638, + "loss": 0.1218, + "step": 6752 + }, + { + "epoch": 0.48724701468306936, + "grad_norm": 0.15742021799087524, + "learning_rate": 0.00018052244191080964, + "loss": 0.165, + "step": 6753 + }, + { + "epoch": 0.4873191673581298, + "grad_norm": 0.11617636680603027, + "learning_rate": 0.0001805195554914129, + "loss": 0.1744, + "step": 6754 + }, + { + "epoch": 0.48739132003319025, + "grad_norm": 0.13167405128479004, + "learning_rate": 0.00018051666907201617, + "loss": 0.1724, + "step": 6755 + }, + { + "epoch": 0.48746347270825063, + "grad_norm": 0.12322933971881866, + "learning_rate": 0.00018051378265261946, + "loss": 0.1461, + "step": 6756 + }, + { + "epoch": 0.4875356253833111, + "grad_norm": 0.14227062463760376, + "learning_rate": 0.0001805108962332227, + "loss": 0.1319, + "step": 6757 + }, + { + "epoch": 0.4876077780583715, + "grad_norm": 0.11945953965187073, + "learning_rate": 0.00018050800981382595, + "loss": 0.1424, + "step": 6758 + }, + { + "epoch": 0.48767993073343197, + "grad_norm": 0.11324942857027054, + "learning_rate": 0.00018050512339442922, + "loss": 0.1273, + "step": 6759 + }, + { + "epoch": 0.48775208340849235, + "grad_norm": 0.1219736859202385, + "learning_rate": 0.00018050223697503248, + "loss": 0.182, + "step": 6760 + }, + { + "epoch": 0.4878242360835528, + "grad_norm": 0.15173614025115967, + "learning_rate": 0.00018049935055563574, + "loss": 0.1324, + "step": 6761 + }, + { + "epoch": 0.48789638875861324, + "grad_norm": 0.11266780644655228, + "learning_rate": 0.000180496464136239, + "loss": 0.1393, + "step": 6762 + }, + { + "epoch": 0.48796854143367363, + "grad_norm": 0.12031975388526917, + "learning_rate": 0.00018049357771684224, + "loss": 0.1619, + "step": 6763 + }, + { + "epoch": 0.4880406941087341, + "grad_norm": 0.13273458182811737, + "learning_rate": 0.00018049069129744553, + "loss": 0.1478, + "step": 6764 + }, + { + "epoch": 0.4881128467837945, + "grad_norm": 0.15466207265853882, + "learning_rate": 0.0001804878048780488, + "loss": 0.1648, + "step": 6765 + }, + { + "epoch": 0.48818499945885496, + "grad_norm": 0.12362847477197647, + "learning_rate": 0.00018048491845865206, + "loss": 0.1551, + "step": 6766 + }, + { + "epoch": 0.48825715213391535, + "grad_norm": 0.11828542500734329, + "learning_rate": 0.00018048203203925532, + "loss": 0.2099, + "step": 6767 + }, + { + "epoch": 0.4883293048089758, + "grad_norm": 0.1162853091955185, + "learning_rate": 0.00018047914561985856, + "loss": 0.1516, + "step": 6768 + }, + { + "epoch": 0.48840145748403624, + "grad_norm": 0.1452043503522873, + "learning_rate": 0.00018047625920046182, + "loss": 0.2206, + "step": 6769 + }, + { + "epoch": 0.4884736101590966, + "grad_norm": 0.13743065297603607, + "learning_rate": 0.00018047337278106508, + "loss": 0.2595, + "step": 6770 + }, + { + "epoch": 0.48854576283415707, + "grad_norm": 0.12419189512729645, + "learning_rate": 0.00018047048636166837, + "loss": 0.1532, + "step": 6771 + }, + { + "epoch": 0.4886179155092175, + "grad_norm": 0.10468582808971405, + "learning_rate": 0.00018046759994227163, + "loss": 0.235, + "step": 6772 + }, + { + "epoch": 0.48869006818427796, + "grad_norm": 0.16387949883937836, + "learning_rate": 0.00018046471352287487, + "loss": 0.1454, + "step": 6773 + }, + { + "epoch": 0.48876222085933835, + "grad_norm": 0.11070086807012558, + "learning_rate": 0.00018046182710347813, + "loss": 0.1507, + "step": 6774 + }, + { + "epoch": 0.4888343735343988, + "grad_norm": 0.12430056184530258, + "learning_rate": 0.0001804589406840814, + "loss": 0.1779, + "step": 6775 + }, + { + "epoch": 0.48890652620945924, + "grad_norm": 0.121668241918087, + "learning_rate": 0.00018045605426468466, + "loss": 0.153, + "step": 6776 + }, + { + "epoch": 0.4889786788845196, + "grad_norm": 0.12041906267404556, + "learning_rate": 0.00018045316784528792, + "loss": 0.126, + "step": 6777 + }, + { + "epoch": 0.48905083155958007, + "grad_norm": 0.11366225779056549, + "learning_rate": 0.0001804502814258912, + "loss": 0.1017, + "step": 6778 + }, + { + "epoch": 0.4891229842346405, + "grad_norm": 0.16463147103786469, + "learning_rate": 0.00018044739500649445, + "loss": 0.1559, + "step": 6779 + }, + { + "epoch": 0.4891951369097009, + "grad_norm": 0.13968990743160248, + "learning_rate": 0.0001804445085870977, + "loss": 0.1147, + "step": 6780 + }, + { + "epoch": 0.48926728958476134, + "grad_norm": 0.1277603954076767, + "learning_rate": 0.00018044162216770097, + "loss": 0.1485, + "step": 6781 + }, + { + "epoch": 0.4893394422598218, + "grad_norm": 0.1520412266254425, + "learning_rate": 0.00018043873574830424, + "loss": 0.1495, + "step": 6782 + }, + { + "epoch": 0.48941159493488223, + "grad_norm": 0.11583957821130753, + "learning_rate": 0.0001804358493289075, + "loss": 0.1676, + "step": 6783 + }, + { + "epoch": 0.4894837476099426, + "grad_norm": 0.10783440619707108, + "learning_rate": 0.00018043296290951076, + "loss": 0.1188, + "step": 6784 + }, + { + "epoch": 0.48955590028500306, + "grad_norm": 0.10469064116477966, + "learning_rate": 0.00018043007649011403, + "loss": 0.1177, + "step": 6785 + }, + { + "epoch": 0.4896280529600635, + "grad_norm": 0.11987747251987457, + "learning_rate": 0.0001804271900707173, + "loss": 0.1044, + "step": 6786 + }, + { + "epoch": 0.4897002056351239, + "grad_norm": 0.12509019672870636, + "learning_rate": 0.00018042430365132055, + "loss": 0.1357, + "step": 6787 + }, + { + "epoch": 0.48977235831018434, + "grad_norm": 0.13454003632068634, + "learning_rate": 0.00018042141723192381, + "loss": 0.1973, + "step": 6788 + }, + { + "epoch": 0.4898445109852448, + "grad_norm": 0.1286957859992981, + "learning_rate": 0.00018041853081252708, + "loss": 0.1868, + "step": 6789 + }, + { + "epoch": 0.48991666366030523, + "grad_norm": 0.09743602573871613, + "learning_rate": 0.0001804156443931303, + "loss": 0.2109, + "step": 6790 + }, + { + "epoch": 0.4899888163353656, + "grad_norm": 0.10004373639822006, + "learning_rate": 0.00018041275797373358, + "loss": 0.1382, + "step": 6791 + }, + { + "epoch": 0.49006096901042606, + "grad_norm": 0.09505674242973328, + "learning_rate": 0.00018040987155433687, + "loss": 0.1378, + "step": 6792 + }, + { + "epoch": 0.4901331216854865, + "grad_norm": 0.0980699211359024, + "learning_rate": 0.00018040698513494013, + "loss": 0.1446, + "step": 6793 + }, + { + "epoch": 0.4902052743605469, + "grad_norm": 0.1165425106883049, + "learning_rate": 0.0001804040987155434, + "loss": 0.1441, + "step": 6794 + }, + { + "epoch": 0.49027742703560734, + "grad_norm": 0.12139949202537537, + "learning_rate": 0.00018040121229614663, + "loss": 0.1555, + "step": 6795 + }, + { + "epoch": 0.4903495797106678, + "grad_norm": 0.10587059706449509, + "learning_rate": 0.0001803983258767499, + "loss": 0.1527, + "step": 6796 + }, + { + "epoch": 0.4904217323857282, + "grad_norm": 0.11615040153265, + "learning_rate": 0.00018039543945735315, + "loss": 0.1349, + "step": 6797 + }, + { + "epoch": 0.4904938850607886, + "grad_norm": 0.1317576915025711, + "learning_rate": 0.00018039255303795642, + "loss": 0.1482, + "step": 6798 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.10900890082120895, + "learning_rate": 0.0001803896666185597, + "loss": 0.1226, + "step": 6799 + }, + { + "epoch": 0.4906381904109095, + "grad_norm": 0.12659330666065216, + "learning_rate": 0.00018038678019916294, + "loss": 0.1639, + "step": 6800 + }, + { + "epoch": 0.4907103430859699, + "grad_norm": 0.1328059285879135, + "learning_rate": 0.0001803838937797662, + "loss": 0.1755, + "step": 6801 + }, + { + "epoch": 0.49078249576103034, + "grad_norm": 0.12249781936407089, + "learning_rate": 0.00018038100736036947, + "loss": 0.1605, + "step": 6802 + }, + { + "epoch": 0.4908546484360908, + "grad_norm": 0.11942338198423386, + "learning_rate": 0.00018037812094097273, + "loss": 0.2028, + "step": 6803 + }, + { + "epoch": 0.4909268011111512, + "grad_norm": 0.1126035749912262, + "learning_rate": 0.000180375234521576, + "loss": 0.1113, + "step": 6804 + }, + { + "epoch": 0.4909989537862116, + "grad_norm": 0.11068452149629593, + "learning_rate": 0.00018037234810217926, + "loss": 0.1557, + "step": 6805 + }, + { + "epoch": 0.49107110646127206, + "grad_norm": 0.10908321291208267, + "learning_rate": 0.00018036946168278252, + "loss": 0.1912, + "step": 6806 + }, + { + "epoch": 0.4911432591363325, + "grad_norm": 0.10851013660430908, + "learning_rate": 0.00018036657526338578, + "loss": 0.1393, + "step": 6807 + }, + { + "epoch": 0.4912154118113929, + "grad_norm": 0.11371336877346039, + "learning_rate": 0.00018036368884398905, + "loss": 0.1267, + "step": 6808 + }, + { + "epoch": 0.49128756448645333, + "grad_norm": 0.1305808126926422, + "learning_rate": 0.0001803608024245923, + "loss": 0.1854, + "step": 6809 + }, + { + "epoch": 0.4913597171615138, + "grad_norm": 0.13042595982551575, + "learning_rate": 0.00018035791600519557, + "loss": 0.1337, + "step": 6810 + }, + { + "epoch": 0.49143186983657416, + "grad_norm": 0.11381411552429199, + "learning_rate": 0.0001803550295857988, + "loss": 0.1338, + "step": 6811 + }, + { + "epoch": 0.4915040225116346, + "grad_norm": 0.24770130217075348, + "learning_rate": 0.00018035214316640207, + "loss": 0.1505, + "step": 6812 + }, + { + "epoch": 0.49157617518669505, + "grad_norm": 0.11011113971471786, + "learning_rate": 0.00018034925674700536, + "loss": 0.1929, + "step": 6813 + }, + { + "epoch": 0.4916483278617555, + "grad_norm": 0.09932421892881393, + "learning_rate": 0.00018034637032760862, + "loss": 0.1531, + "step": 6814 + }, + { + "epoch": 0.4917204805368159, + "grad_norm": 0.1148666962981224, + "learning_rate": 0.00018034348390821189, + "loss": 0.1751, + "step": 6815 + }, + { + "epoch": 0.49179263321187633, + "grad_norm": 0.13124980032444, + "learning_rate": 0.00018034059748881512, + "loss": 0.1797, + "step": 6816 + }, + { + "epoch": 0.4918647858869368, + "grad_norm": 0.1289656162261963, + "learning_rate": 0.00018033771106941839, + "loss": 0.1274, + "step": 6817 + }, + { + "epoch": 0.49193693856199716, + "grad_norm": 0.1168791651725769, + "learning_rate": 0.00018033482465002165, + "loss": 0.163, + "step": 6818 + }, + { + "epoch": 0.4920090912370576, + "grad_norm": 0.1393895000219345, + "learning_rate": 0.0001803319382306249, + "loss": 0.1439, + "step": 6819 + }, + { + "epoch": 0.49208124391211805, + "grad_norm": 0.12687747180461884, + "learning_rate": 0.0001803290518112282, + "loss": 0.1318, + "step": 6820 + }, + { + "epoch": 0.4921533965871785, + "grad_norm": 0.13864044845104218, + "learning_rate": 0.00018032616539183144, + "loss": 0.1782, + "step": 6821 + }, + { + "epoch": 0.4922255492622389, + "grad_norm": 0.15509755909442902, + "learning_rate": 0.0001803232789724347, + "loss": 0.1428, + "step": 6822 + }, + { + "epoch": 0.4922977019372993, + "grad_norm": 0.1268070787191391, + "learning_rate": 0.00018032039255303796, + "loss": 0.1352, + "step": 6823 + }, + { + "epoch": 0.49236985461235977, + "grad_norm": 0.14477965235710144, + "learning_rate": 0.00018031750613364123, + "loss": 0.1877, + "step": 6824 + }, + { + "epoch": 0.49244200728742016, + "grad_norm": 0.11024410277605057, + "learning_rate": 0.0001803146197142445, + "loss": 0.157, + "step": 6825 + }, + { + "epoch": 0.4925141599624806, + "grad_norm": 0.1326301097869873, + "learning_rate": 0.00018031173329484775, + "loss": 0.1775, + "step": 6826 + }, + { + "epoch": 0.49258631263754105, + "grad_norm": 0.11130791902542114, + "learning_rate": 0.00018030884687545101, + "loss": 0.1153, + "step": 6827 + }, + { + "epoch": 0.4926584653126015, + "grad_norm": 0.14091289043426514, + "learning_rate": 0.00018030596045605428, + "loss": 0.1392, + "step": 6828 + }, + { + "epoch": 0.4927306179876619, + "grad_norm": 0.1333671510219574, + "learning_rate": 0.00018030307403665754, + "loss": 0.1282, + "step": 6829 + }, + { + "epoch": 0.4928027706627223, + "grad_norm": 0.12756669521331787, + "learning_rate": 0.0001803001876172608, + "loss": 0.1384, + "step": 6830 + }, + { + "epoch": 0.49287492333778277, + "grad_norm": 0.13180451095104218, + "learning_rate": 0.00018029730119786407, + "loss": 0.1725, + "step": 6831 + }, + { + "epoch": 0.49294707601284316, + "grad_norm": 0.12362208217382431, + "learning_rate": 0.0001802944147784673, + "loss": 0.2103, + "step": 6832 + }, + { + "epoch": 0.4930192286879036, + "grad_norm": 0.11759877949953079, + "learning_rate": 0.00018029152835907056, + "loss": 0.1783, + "step": 6833 + }, + { + "epoch": 0.49309138136296404, + "grad_norm": 0.13143250346183777, + "learning_rate": 0.00018028864193967385, + "loss": 0.2168, + "step": 6834 + }, + { + "epoch": 0.4931635340380245, + "grad_norm": 0.20306192338466644, + "learning_rate": 0.00018028575552027712, + "loss": 0.1445, + "step": 6835 + }, + { + "epoch": 0.4932356867130849, + "grad_norm": 0.12542825937271118, + "learning_rate": 0.00018028286910088038, + "loss": 0.1667, + "step": 6836 + }, + { + "epoch": 0.4933078393881453, + "grad_norm": 0.16882474720478058, + "learning_rate": 0.00018027998268148362, + "loss": 0.1411, + "step": 6837 + }, + { + "epoch": 0.49337999206320576, + "grad_norm": 0.12080718576908112, + "learning_rate": 0.00018027709626208688, + "loss": 0.1399, + "step": 6838 + }, + { + "epoch": 0.49345214473826615, + "grad_norm": 0.13088224828243256, + "learning_rate": 0.00018027420984269014, + "loss": 0.1605, + "step": 6839 + }, + { + "epoch": 0.4935242974133266, + "grad_norm": 0.11381285637617111, + "learning_rate": 0.0001802713234232934, + "loss": 0.1636, + "step": 6840 + }, + { + "epoch": 0.49359645008838704, + "grad_norm": 0.13267774879932404, + "learning_rate": 0.0001802684370038967, + "loss": 0.1716, + "step": 6841 + }, + { + "epoch": 0.49366860276344743, + "grad_norm": 0.12944793701171875, + "learning_rate": 0.00018026555058449993, + "loss": 0.1396, + "step": 6842 + }, + { + "epoch": 0.4937407554385079, + "grad_norm": 0.12048157304525375, + "learning_rate": 0.0001802626641651032, + "loss": 0.1656, + "step": 6843 + }, + { + "epoch": 0.4938129081135683, + "grad_norm": 0.14227688312530518, + "learning_rate": 0.00018025977774570646, + "loss": 0.1323, + "step": 6844 + }, + { + "epoch": 0.49388506078862876, + "grad_norm": 0.11630921810865402, + "learning_rate": 0.00018025689132630972, + "loss": 0.1066, + "step": 6845 + }, + { + "epoch": 0.49395721346368915, + "grad_norm": 0.12264031171798706, + "learning_rate": 0.00018025400490691298, + "loss": 0.1489, + "step": 6846 + }, + { + "epoch": 0.4940293661387496, + "grad_norm": 0.0999833270907402, + "learning_rate": 0.00018025111848751625, + "loss": 0.1859, + "step": 6847 + }, + { + "epoch": 0.49410151881381004, + "grad_norm": 0.10908046364784241, + "learning_rate": 0.0001802482320681195, + "loss": 0.1322, + "step": 6848 + }, + { + "epoch": 0.4941736714888704, + "grad_norm": 0.115277960896492, + "learning_rate": 0.00018024534564872277, + "loss": 0.1943, + "step": 6849 + }, + { + "epoch": 0.49424582416393087, + "grad_norm": 0.17231899499893188, + "learning_rate": 0.00018024245922932603, + "loss": 0.2096, + "step": 6850 + }, + { + "epoch": 0.4943179768389913, + "grad_norm": 0.11707253009080887, + "learning_rate": 0.0001802395728099293, + "loss": 0.2134, + "step": 6851 + }, + { + "epoch": 0.49439012951405176, + "grad_norm": 0.10661081969738007, + "learning_rate": 0.00018023668639053256, + "loss": 0.166, + "step": 6852 + }, + { + "epoch": 0.49446228218911215, + "grad_norm": 0.11799009889364243, + "learning_rate": 0.0001802337999711358, + "loss": 0.1633, + "step": 6853 + }, + { + "epoch": 0.4945344348641726, + "grad_norm": 0.11546391248703003, + "learning_rate": 0.00018023091355173906, + "loss": 0.1618, + "step": 6854 + }, + { + "epoch": 0.49460658753923303, + "grad_norm": 0.14451655745506287, + "learning_rate": 0.00018022802713234235, + "loss": 0.2062, + "step": 6855 + }, + { + "epoch": 0.4946787402142934, + "grad_norm": 0.12024658918380737, + "learning_rate": 0.0001802251407129456, + "loss": 0.1083, + "step": 6856 + }, + { + "epoch": 0.49475089288935387, + "grad_norm": 0.10264474898576736, + "learning_rate": 0.00018022225429354887, + "loss": 0.1322, + "step": 6857 + }, + { + "epoch": 0.4948230455644143, + "grad_norm": 0.1129191517829895, + "learning_rate": 0.0001802193678741521, + "loss": 0.198, + "step": 6858 + }, + { + "epoch": 0.49489519823947475, + "grad_norm": 0.14828601479530334, + "learning_rate": 0.00018021648145475537, + "loss": 0.1402, + "step": 6859 + }, + { + "epoch": 0.49496735091453514, + "grad_norm": 0.10332966595888138, + "learning_rate": 0.00018021359503535864, + "loss": 0.1265, + "step": 6860 + }, + { + "epoch": 0.4950395035895956, + "grad_norm": 0.13966286182403564, + "learning_rate": 0.0001802107086159619, + "loss": 0.1923, + "step": 6861 + }, + { + "epoch": 0.49511165626465603, + "grad_norm": 0.13843166828155518, + "learning_rate": 0.0001802078221965652, + "loss": 0.1386, + "step": 6862 + }, + { + "epoch": 0.4951838089397164, + "grad_norm": 0.11983319371938705, + "learning_rate": 0.00018020493577716843, + "loss": 0.1604, + "step": 6863 + }, + { + "epoch": 0.49525596161477686, + "grad_norm": 0.14071129262447357, + "learning_rate": 0.0001802020493577717, + "loss": 0.1664, + "step": 6864 + }, + { + "epoch": 0.4953281142898373, + "grad_norm": 0.10576890408992767, + "learning_rate": 0.00018019916293837495, + "loss": 0.1891, + "step": 6865 + }, + { + "epoch": 0.49540026696489775, + "grad_norm": 0.11082214117050171, + "learning_rate": 0.00018019627651897821, + "loss": 0.1443, + "step": 6866 + }, + { + "epoch": 0.49547241963995814, + "grad_norm": 0.11030527949333191, + "learning_rate": 0.00018019339009958148, + "loss": 0.1156, + "step": 6867 + }, + { + "epoch": 0.4955445723150186, + "grad_norm": 0.17411255836486816, + "learning_rate": 0.00018019050368018474, + "loss": 0.1127, + "step": 6868 + }, + { + "epoch": 0.495616724990079, + "grad_norm": 0.11475282907485962, + "learning_rate": 0.000180187617260788, + "loss": 0.1339, + "step": 6869 + }, + { + "epoch": 0.4956888776651394, + "grad_norm": 0.12999731302261353, + "learning_rate": 0.00018018473084139127, + "loss": 0.1332, + "step": 6870 + }, + { + "epoch": 0.49576103034019986, + "grad_norm": 0.13385990262031555, + "learning_rate": 0.00018018184442199453, + "loss": 0.1379, + "step": 6871 + }, + { + "epoch": 0.4958331830152603, + "grad_norm": 0.13611887395381927, + "learning_rate": 0.0001801789580025978, + "loss": 0.1593, + "step": 6872 + }, + { + "epoch": 0.4959053356903207, + "grad_norm": 0.11021700501441956, + "learning_rate": 0.00018017607158320105, + "loss": 0.1198, + "step": 6873 + }, + { + "epoch": 0.49597748836538114, + "grad_norm": 0.12136051058769226, + "learning_rate": 0.0001801731851638043, + "loss": 0.1448, + "step": 6874 + }, + { + "epoch": 0.4960496410404416, + "grad_norm": 0.11286564916372299, + "learning_rate": 0.00018017029874440755, + "loss": 0.16, + "step": 6875 + }, + { + "epoch": 0.496121793715502, + "grad_norm": 0.11558818817138672, + "learning_rate": 0.00018016741232501084, + "loss": 0.1311, + "step": 6876 + }, + { + "epoch": 0.4961939463905624, + "grad_norm": 0.1303420513868332, + "learning_rate": 0.0001801645259056141, + "loss": 0.1402, + "step": 6877 + }, + { + "epoch": 0.49626609906562286, + "grad_norm": 0.11762979626655579, + "learning_rate": 0.00018016163948621737, + "loss": 0.2148, + "step": 6878 + }, + { + "epoch": 0.4963382517406833, + "grad_norm": 0.1510939747095108, + "learning_rate": 0.0001801587530668206, + "loss": 0.1713, + "step": 6879 + }, + { + "epoch": 0.4964104044157437, + "grad_norm": 0.10852061212062836, + "learning_rate": 0.00018015586664742387, + "loss": 0.1599, + "step": 6880 + }, + { + "epoch": 0.49648255709080413, + "grad_norm": 0.1138058751821518, + "learning_rate": 0.00018015298022802713, + "loss": 0.1685, + "step": 6881 + }, + { + "epoch": 0.4965547097658646, + "grad_norm": 0.11576670408248901, + "learning_rate": 0.0001801500938086304, + "loss": 0.2199, + "step": 6882 + }, + { + "epoch": 0.496626862440925, + "grad_norm": 0.10762116312980652, + "learning_rate": 0.00018014720738923368, + "loss": 0.1245, + "step": 6883 + }, + { + "epoch": 0.4966990151159854, + "grad_norm": 0.10610596090555191, + "learning_rate": 0.00018014432096983692, + "loss": 0.1297, + "step": 6884 + }, + { + "epoch": 0.49677116779104585, + "grad_norm": 0.13391663134098053, + "learning_rate": 0.00018014143455044018, + "loss": 0.2265, + "step": 6885 + }, + { + "epoch": 0.4968433204661063, + "grad_norm": 0.15051336586475372, + "learning_rate": 0.00018013854813104345, + "loss": 0.1682, + "step": 6886 + }, + { + "epoch": 0.4969154731411667, + "grad_norm": 0.12215742468833923, + "learning_rate": 0.0001801356617116467, + "loss": 0.1466, + "step": 6887 + }, + { + "epoch": 0.49698762581622713, + "grad_norm": 0.10626982152462006, + "learning_rate": 0.00018013277529224997, + "loss": 0.131, + "step": 6888 + }, + { + "epoch": 0.4970597784912876, + "grad_norm": 0.10880763083696365, + "learning_rate": 0.00018012988887285323, + "loss": 0.1352, + "step": 6889 + }, + { + "epoch": 0.497131931166348, + "grad_norm": 0.10323496162891388, + "learning_rate": 0.0001801270024534565, + "loss": 0.1245, + "step": 6890 + }, + { + "epoch": 0.4972040838414084, + "grad_norm": 0.1361677497625351, + "learning_rate": 0.00018012411603405976, + "loss": 0.2063, + "step": 6891 + }, + { + "epoch": 0.49727623651646885, + "grad_norm": 0.1083250641822815, + "learning_rate": 0.00018012122961466302, + "loss": 0.1222, + "step": 6892 + }, + { + "epoch": 0.4973483891915293, + "grad_norm": 0.11975391954183578, + "learning_rate": 0.00018011834319526629, + "loss": 0.1638, + "step": 6893 + }, + { + "epoch": 0.4974205418665897, + "grad_norm": 0.09761971235275269, + "learning_rate": 0.00018011545677586955, + "loss": 0.1286, + "step": 6894 + }, + { + "epoch": 0.4974926945416501, + "grad_norm": 0.11344848573207855, + "learning_rate": 0.0001801125703564728, + "loss": 0.1573, + "step": 6895 + }, + { + "epoch": 0.49756484721671057, + "grad_norm": 0.11941707134246826, + "learning_rate": 0.00018010968393707605, + "loss": 0.1608, + "step": 6896 + }, + { + "epoch": 0.497636999891771, + "grad_norm": 0.10896875709295273, + "learning_rate": 0.00018010679751767934, + "loss": 0.136, + "step": 6897 + }, + { + "epoch": 0.4977091525668314, + "grad_norm": 0.1440306007862091, + "learning_rate": 0.0001801039110982826, + "loss": 0.1656, + "step": 6898 + }, + { + "epoch": 0.49778130524189185, + "grad_norm": 0.11497805267572403, + "learning_rate": 0.00018010102467888586, + "loss": 0.1135, + "step": 6899 + }, + { + "epoch": 0.4978534579169523, + "grad_norm": 0.10532939434051514, + "learning_rate": 0.00018009813825948913, + "loss": 0.1423, + "step": 6900 + }, + { + "epoch": 0.4979256105920127, + "grad_norm": 0.11883176863193512, + "learning_rate": 0.00018009525184009236, + "loss": 0.1556, + "step": 6901 + }, + { + "epoch": 0.4979977632670731, + "grad_norm": 0.1338721215724945, + "learning_rate": 0.00018009236542069562, + "loss": 0.1549, + "step": 6902 + }, + { + "epoch": 0.49806991594213357, + "grad_norm": 0.11123886704444885, + "learning_rate": 0.0001800894790012989, + "loss": 0.1702, + "step": 6903 + }, + { + "epoch": 0.49814206861719396, + "grad_norm": 0.12014085054397583, + "learning_rate": 0.00018008659258190218, + "loss": 0.1508, + "step": 6904 + }, + { + "epoch": 0.4982142212922544, + "grad_norm": 0.13090504705905914, + "learning_rate": 0.00018008370616250544, + "loss": 0.136, + "step": 6905 + }, + { + "epoch": 0.49828637396731484, + "grad_norm": 0.11417842656373978, + "learning_rate": 0.00018008081974310868, + "loss": 0.1126, + "step": 6906 + }, + { + "epoch": 0.4983585266423753, + "grad_norm": 0.12810492515563965, + "learning_rate": 0.00018007793332371194, + "loss": 0.1879, + "step": 6907 + }, + { + "epoch": 0.4984306793174357, + "grad_norm": 0.14297088980674744, + "learning_rate": 0.0001800750469043152, + "loss": 0.1806, + "step": 6908 + }, + { + "epoch": 0.4985028319924961, + "grad_norm": 0.10888095945119858, + "learning_rate": 0.00018007216048491847, + "loss": 0.1546, + "step": 6909 + }, + { + "epoch": 0.49857498466755656, + "grad_norm": 0.11205391585826874, + "learning_rate": 0.00018006927406552173, + "loss": 0.1656, + "step": 6910 + }, + { + "epoch": 0.49864713734261695, + "grad_norm": 0.1569240689277649, + "learning_rate": 0.000180066387646125, + "loss": 0.1952, + "step": 6911 + }, + { + "epoch": 0.4987192900176774, + "grad_norm": 0.11143320053815842, + "learning_rate": 0.00018006350122672825, + "loss": 0.1617, + "step": 6912 + }, + { + "epoch": 0.49879144269273784, + "grad_norm": 0.15045265853405, + "learning_rate": 0.00018006061480733152, + "loss": 0.1575, + "step": 6913 + }, + { + "epoch": 0.4988635953677983, + "grad_norm": 0.13224567472934723, + "learning_rate": 0.00018005772838793478, + "loss": 0.1554, + "step": 6914 + }, + { + "epoch": 0.4989357480428587, + "grad_norm": 0.1473943293094635, + "learning_rate": 0.00018005484196853804, + "loss": 0.1615, + "step": 6915 + }, + { + "epoch": 0.4990079007179191, + "grad_norm": 0.13103948533535004, + "learning_rate": 0.0001800519555491413, + "loss": 0.2071, + "step": 6916 + }, + { + "epoch": 0.49908005339297956, + "grad_norm": 0.13195586204528809, + "learning_rate": 0.00018004906912974454, + "loss": 0.1665, + "step": 6917 + }, + { + "epoch": 0.49915220606803995, + "grad_norm": 0.12213563174009323, + "learning_rate": 0.00018004618271034783, + "loss": 0.1648, + "step": 6918 + }, + { + "epoch": 0.4992243587431004, + "grad_norm": 0.1488204449415207, + "learning_rate": 0.0001800432962909511, + "loss": 0.155, + "step": 6919 + }, + { + "epoch": 0.49929651141816084, + "grad_norm": 0.1254352629184723, + "learning_rate": 0.00018004040987155436, + "loss": 0.1236, + "step": 6920 + }, + { + "epoch": 0.4993686640932213, + "grad_norm": 0.12986153364181519, + "learning_rate": 0.00018003752345215762, + "loss": 0.1576, + "step": 6921 + }, + { + "epoch": 0.49944081676828167, + "grad_norm": 0.1283103972673416, + "learning_rate": 0.00018003463703276086, + "loss": 0.1107, + "step": 6922 + }, + { + "epoch": 0.4995129694433421, + "grad_norm": 0.11497905850410461, + "learning_rate": 0.00018003175061336412, + "loss": 0.1333, + "step": 6923 + }, + { + "epoch": 0.49958512211840256, + "grad_norm": 0.1354951113462448, + "learning_rate": 0.00018002886419396738, + "loss": 0.1466, + "step": 6924 + }, + { + "epoch": 0.49965727479346295, + "grad_norm": 0.12747453153133392, + "learning_rate": 0.00018002597777457067, + "loss": 0.1574, + "step": 6925 + }, + { + "epoch": 0.4997294274685234, + "grad_norm": 0.11834773421287537, + "learning_rate": 0.00018002309135517394, + "loss": 0.1301, + "step": 6926 + }, + { + "epoch": 0.49980158014358383, + "grad_norm": 0.13238999247550964, + "learning_rate": 0.00018002020493577717, + "loss": 0.1679, + "step": 6927 + }, + { + "epoch": 0.4998737328186443, + "grad_norm": 0.1121571809053421, + "learning_rate": 0.00018001731851638043, + "loss": 0.0951, + "step": 6928 + }, + { + "epoch": 0.49994588549370467, + "grad_norm": 0.1254463642835617, + "learning_rate": 0.0001800144320969837, + "loss": 0.1794, + "step": 6929 + }, + { + "epoch": 0.5000180381687651, + "grad_norm": 0.13795477151870728, + "learning_rate": 0.00018001154567758696, + "loss": 0.1449, + "step": 6930 + }, + { + "epoch": 0.5000901908438256, + "grad_norm": 0.14214231073856354, + "learning_rate": 0.00018000865925819022, + "loss": 0.169, + "step": 6931 + }, + { + "epoch": 0.500162343518886, + "grad_norm": 0.1185181513428688, + "learning_rate": 0.00018000577283879349, + "loss": 0.1569, + "step": 6932 + }, + { + "epoch": 0.5002344961939464, + "grad_norm": 0.12954311072826385, + "learning_rate": 0.00018000288641939675, + "loss": 0.1504, + "step": 6933 + }, + { + "epoch": 0.5003066488690068, + "grad_norm": 0.11512862145900726, + "learning_rate": 0.00018, + "loss": 0.1353, + "step": 6934 + }, + { + "epoch": 0.5003788015440672, + "grad_norm": 0.11813472956418991, + "learning_rate": 0.00017999711358060327, + "loss": 0.12, + "step": 6935 + }, + { + "epoch": 0.5004509542191277, + "grad_norm": 0.11345507949590683, + "learning_rate": 0.00017999422716120654, + "loss": 0.1778, + "step": 6936 + }, + { + "epoch": 0.5005231068941881, + "grad_norm": 0.116734080016613, + "learning_rate": 0.0001799913407418098, + "loss": 0.1353, + "step": 6937 + }, + { + "epoch": 0.5005952595692486, + "grad_norm": 0.10393017530441284, + "learning_rate": 0.00017998845432241304, + "loss": 0.1672, + "step": 6938 + }, + { + "epoch": 0.500667412244309, + "grad_norm": 0.16826027631759644, + "learning_rate": 0.00017998556790301633, + "loss": 0.1595, + "step": 6939 + }, + { + "epoch": 0.5007395649193694, + "grad_norm": 0.10345373302698135, + "learning_rate": 0.0001799826814836196, + "loss": 0.188, + "step": 6940 + }, + { + "epoch": 0.5008117175944298, + "grad_norm": 0.12687575817108154, + "learning_rate": 0.00017997979506422285, + "loss": 0.1285, + "step": 6941 + }, + { + "epoch": 0.5008838702694902, + "grad_norm": 0.10807101428508759, + "learning_rate": 0.00017997690864482611, + "loss": 0.1236, + "step": 6942 + }, + { + "epoch": 0.5009560229445507, + "grad_norm": 0.1182204857468605, + "learning_rate": 0.00017997402222542935, + "loss": 0.1638, + "step": 6943 + }, + { + "epoch": 0.5010281756196111, + "grad_norm": 0.12244538217782974, + "learning_rate": 0.0001799711358060326, + "loss": 0.1899, + "step": 6944 + }, + { + "epoch": 0.5011003282946715, + "grad_norm": 0.12142857164144516, + "learning_rate": 0.00017996824938663588, + "loss": 0.1343, + "step": 6945 + }, + { + "epoch": 0.501172480969732, + "grad_norm": 0.1048036441206932, + "learning_rate": 0.00017996536296723917, + "loss": 0.1174, + "step": 6946 + }, + { + "epoch": 0.5012446336447923, + "grad_norm": 0.14296436309814453, + "learning_rate": 0.00017996247654784243, + "loss": 0.1805, + "step": 6947 + }, + { + "epoch": 0.5013167863198528, + "grad_norm": 0.1301654428243637, + "learning_rate": 0.00017995959012844567, + "loss": 0.166, + "step": 6948 + }, + { + "epoch": 0.5013889389949132, + "grad_norm": 0.12834221124649048, + "learning_rate": 0.00017995670370904893, + "loss": 0.1018, + "step": 6949 + }, + { + "epoch": 0.5014610916699737, + "grad_norm": 0.11551238596439362, + "learning_rate": 0.0001799538172896522, + "loss": 0.1263, + "step": 6950 + }, + { + "epoch": 0.5015332443450341, + "grad_norm": 0.12239497154951096, + "learning_rate": 0.00017995093087025545, + "loss": 0.1839, + "step": 6951 + }, + { + "epoch": 0.5016053970200945, + "grad_norm": 0.10061843693256378, + "learning_rate": 0.00017994804445085872, + "loss": 0.1273, + "step": 6952 + }, + { + "epoch": 0.501677549695155, + "grad_norm": 0.11200693994760513, + "learning_rate": 0.00017994515803146198, + "loss": 0.1774, + "step": 6953 + }, + { + "epoch": 0.5017497023702153, + "grad_norm": 0.12200941145420074, + "learning_rate": 0.00017994227161206524, + "loss": 0.1455, + "step": 6954 + }, + { + "epoch": 0.5018218550452758, + "grad_norm": 0.11386123299598694, + "learning_rate": 0.0001799393851926685, + "loss": 0.1651, + "step": 6955 + }, + { + "epoch": 0.5018940077203362, + "grad_norm": 0.11359906196594238, + "learning_rate": 0.00017993649877327177, + "loss": 0.109, + "step": 6956 + }, + { + "epoch": 0.5019661603953967, + "grad_norm": 0.12544330954551697, + "learning_rate": 0.00017993361235387503, + "loss": 0.1447, + "step": 6957 + }, + { + "epoch": 0.5020383130704571, + "grad_norm": 0.11329219490289688, + "learning_rate": 0.0001799307259344783, + "loss": 0.2536, + "step": 6958 + }, + { + "epoch": 0.5021104657455175, + "grad_norm": 0.12163042277097702, + "learning_rate": 0.00017992783951508153, + "loss": 0.1926, + "step": 6959 + }, + { + "epoch": 0.502182618420578, + "grad_norm": 0.10154034942388535, + "learning_rate": 0.0001799249530956848, + "loss": 0.1485, + "step": 6960 + }, + { + "epoch": 0.5022547710956383, + "grad_norm": 0.12191421538591385, + "learning_rate": 0.00017992206667628808, + "loss": 0.1388, + "step": 6961 + }, + { + "epoch": 0.5023269237706988, + "grad_norm": 0.12472362816333771, + "learning_rate": 0.00017991918025689135, + "loss": 0.1511, + "step": 6962 + }, + { + "epoch": 0.5023990764457592, + "grad_norm": 0.11926441639661789, + "learning_rate": 0.0001799162938374946, + "loss": 0.203, + "step": 6963 + }, + { + "epoch": 0.5024712291208197, + "grad_norm": 0.09784934669733047, + "learning_rate": 0.00017991340741809784, + "loss": 0.1425, + "step": 6964 + }, + { + "epoch": 0.5025433817958801, + "grad_norm": 0.12471173703670502, + "learning_rate": 0.0001799105209987011, + "loss": 0.1314, + "step": 6965 + }, + { + "epoch": 0.5026155344709405, + "grad_norm": 0.112209253013134, + "learning_rate": 0.00017990763457930437, + "loss": 0.1505, + "step": 6966 + }, + { + "epoch": 0.502687687146001, + "grad_norm": 0.13939471542835236, + "learning_rate": 0.00017990474815990763, + "loss": 0.146, + "step": 6967 + }, + { + "epoch": 0.5027598398210613, + "grad_norm": 0.13189548254013062, + "learning_rate": 0.00017990186174051092, + "loss": 0.1621, + "step": 6968 + }, + { + "epoch": 0.5028319924961218, + "grad_norm": 0.15791858732700348, + "learning_rate": 0.00017989897532111416, + "loss": 0.1749, + "step": 6969 + }, + { + "epoch": 0.5029041451711822, + "grad_norm": 0.1081838309764862, + "learning_rate": 0.00017989608890171742, + "loss": 0.1829, + "step": 6970 + }, + { + "epoch": 0.5029762978462426, + "grad_norm": 0.12584063410758972, + "learning_rate": 0.00017989320248232069, + "loss": 0.1542, + "step": 6971 + }, + { + "epoch": 0.5030484505213031, + "grad_norm": 0.12362273037433624, + "learning_rate": 0.00017989031606292395, + "loss": 0.1612, + "step": 6972 + }, + { + "epoch": 0.5031206031963635, + "grad_norm": 0.10868021845817566, + "learning_rate": 0.0001798874296435272, + "loss": 0.1536, + "step": 6973 + }, + { + "epoch": 0.503192755871424, + "grad_norm": 0.13362200558185577, + "learning_rate": 0.00017988454322413047, + "loss": 0.1671, + "step": 6974 + }, + { + "epoch": 0.5032649085464843, + "grad_norm": 0.12995874881744385, + "learning_rate": 0.00017988165680473374, + "loss": 0.1807, + "step": 6975 + }, + { + "epoch": 0.5033370612215448, + "grad_norm": 0.10249987989664078, + "learning_rate": 0.000179878770385337, + "loss": 0.174, + "step": 6976 + }, + { + "epoch": 0.5034092138966052, + "grad_norm": 0.13581307232379913, + "learning_rate": 0.00017987588396594026, + "loss": 0.1428, + "step": 6977 + }, + { + "epoch": 0.5034813665716656, + "grad_norm": 0.132228821516037, + "learning_rate": 0.00017987299754654353, + "loss": 0.1549, + "step": 6978 + }, + { + "epoch": 0.5035535192467261, + "grad_norm": 0.11819226294755936, + "learning_rate": 0.0001798701111271468, + "loss": 0.1437, + "step": 6979 + }, + { + "epoch": 0.5036256719217865, + "grad_norm": 0.13969078660011292, + "learning_rate": 0.00017986722470775002, + "loss": 0.192, + "step": 6980 + }, + { + "epoch": 0.503697824596847, + "grad_norm": 0.11371377855539322, + "learning_rate": 0.0001798643382883533, + "loss": 0.1306, + "step": 6981 + }, + { + "epoch": 0.5037699772719073, + "grad_norm": 0.11589173227548599, + "learning_rate": 0.00017986145186895658, + "loss": 0.1906, + "step": 6982 + }, + { + "epoch": 0.5038421299469678, + "grad_norm": 0.12938082218170166, + "learning_rate": 0.00017985856544955984, + "loss": 0.131, + "step": 6983 + }, + { + "epoch": 0.5039142826220282, + "grad_norm": 0.13039258122444153, + "learning_rate": 0.0001798556790301631, + "loss": 0.1316, + "step": 6984 + }, + { + "epoch": 0.5039864352970886, + "grad_norm": 0.12052374333143234, + "learning_rate": 0.00017985279261076634, + "loss": 0.1307, + "step": 6985 + }, + { + "epoch": 0.5040585879721491, + "grad_norm": 0.1361314356327057, + "learning_rate": 0.0001798499061913696, + "loss": 0.1431, + "step": 6986 + }, + { + "epoch": 0.5041307406472095, + "grad_norm": 0.1491934359073639, + "learning_rate": 0.00017984701977197286, + "loss": 0.1424, + "step": 6987 + }, + { + "epoch": 0.50420289332227, + "grad_norm": 0.10724689811468124, + "learning_rate": 0.00017984413335257613, + "loss": 0.131, + "step": 6988 + }, + { + "epoch": 0.5042750459973303, + "grad_norm": 0.129995197057724, + "learning_rate": 0.00017984124693317942, + "loss": 0.1716, + "step": 6989 + }, + { + "epoch": 0.5043471986723908, + "grad_norm": 0.1048898696899414, + "learning_rate": 0.00017983836051378265, + "loss": 0.1915, + "step": 6990 + }, + { + "epoch": 0.5044193513474512, + "grad_norm": 0.1751856803894043, + "learning_rate": 0.00017983547409438592, + "loss": 0.1597, + "step": 6991 + }, + { + "epoch": 0.5044915040225116, + "grad_norm": 0.12383807450532913, + "learning_rate": 0.00017983258767498918, + "loss": 0.1263, + "step": 6992 + }, + { + "epoch": 0.5045636566975721, + "grad_norm": 0.13870978355407715, + "learning_rate": 0.00017982970125559244, + "loss": 0.1651, + "step": 6993 + }, + { + "epoch": 0.5046358093726325, + "grad_norm": 0.12926892936229706, + "learning_rate": 0.0001798268148361957, + "loss": 0.1254, + "step": 6994 + }, + { + "epoch": 0.504707962047693, + "grad_norm": 0.11067026853561401, + "learning_rate": 0.00017982392841679897, + "loss": 0.1536, + "step": 6995 + }, + { + "epoch": 0.5047801147227533, + "grad_norm": 0.2897682189941406, + "learning_rate": 0.00017982104199740223, + "loss": 0.1842, + "step": 6996 + }, + { + "epoch": 0.5048522673978137, + "grad_norm": 0.15760228037834167, + "learning_rate": 0.0001798181555780055, + "loss": 0.1897, + "step": 6997 + }, + { + "epoch": 0.5049244200728742, + "grad_norm": 0.10852501541376114, + "learning_rate": 0.00017981526915860876, + "loss": 0.1853, + "step": 6998 + }, + { + "epoch": 0.5049965727479346, + "grad_norm": 0.1096150130033493, + "learning_rate": 0.00017981238273921202, + "loss": 0.188, + "step": 6999 + }, + { + "epoch": 0.5050687254229951, + "grad_norm": 0.15682603418827057, + "learning_rate": 0.00017980949631981528, + "loss": 0.1283, + "step": 7000 + }, + { + "epoch": 0.5051408780980555, + "grad_norm": 0.14422990381717682, + "learning_rate": 0.00017980660990041855, + "loss": 0.1779, + "step": 7001 + }, + { + "epoch": 0.505213030773116, + "grad_norm": 0.11944327503442764, + "learning_rate": 0.00017980372348102178, + "loss": 0.1191, + "step": 7002 + }, + { + "epoch": 0.5052851834481763, + "grad_norm": 0.11260118335485458, + "learning_rate": 0.00017980083706162507, + "loss": 0.1435, + "step": 7003 + }, + { + "epoch": 0.5053573361232367, + "grad_norm": 0.1541164666414261, + "learning_rate": 0.00017979795064222833, + "loss": 0.1337, + "step": 7004 + }, + { + "epoch": 0.5054294887982972, + "grad_norm": 0.12973268330097198, + "learning_rate": 0.0001797950642228316, + "loss": 0.1456, + "step": 7005 + }, + { + "epoch": 0.5055016414733576, + "grad_norm": 0.1538536548614502, + "learning_rate": 0.00017979217780343486, + "loss": 0.1787, + "step": 7006 + }, + { + "epoch": 0.5055737941484181, + "grad_norm": 0.1301107406616211, + "learning_rate": 0.0001797892913840381, + "loss": 0.1498, + "step": 7007 + }, + { + "epoch": 0.5056459468234785, + "grad_norm": 0.12300629168748856, + "learning_rate": 0.00017978640496464136, + "loss": 0.1831, + "step": 7008 + }, + { + "epoch": 0.5057180994985389, + "grad_norm": 0.13967126607894897, + "learning_rate": 0.00017978351854524462, + "loss": 0.1445, + "step": 7009 + }, + { + "epoch": 0.5057902521735993, + "grad_norm": 0.11427458375692368, + "learning_rate": 0.0001797806321258479, + "loss": 0.1792, + "step": 7010 + }, + { + "epoch": 0.5058624048486597, + "grad_norm": 0.10966087132692337, + "learning_rate": 0.00017977774570645117, + "loss": 0.1518, + "step": 7011 + }, + { + "epoch": 0.5059345575237202, + "grad_norm": 0.11489519476890564, + "learning_rate": 0.0001797748592870544, + "loss": 0.1191, + "step": 7012 + }, + { + "epoch": 0.5060067101987806, + "grad_norm": 0.10946142673492432, + "learning_rate": 0.00017977197286765767, + "loss": 0.1454, + "step": 7013 + }, + { + "epoch": 0.5060788628738411, + "grad_norm": 0.14782243967056274, + "learning_rate": 0.00017976908644826094, + "loss": 0.1324, + "step": 7014 + }, + { + "epoch": 0.5061510155489015, + "grad_norm": 0.13172070682048798, + "learning_rate": 0.0001797662000288642, + "loss": 0.1634, + "step": 7015 + }, + { + "epoch": 0.5062231682239619, + "grad_norm": 0.1212749034166336, + "learning_rate": 0.00017976331360946746, + "loss": 0.1197, + "step": 7016 + }, + { + "epoch": 0.5062953208990223, + "grad_norm": 0.1257588416337967, + "learning_rate": 0.00017976042719007073, + "loss": 0.1632, + "step": 7017 + }, + { + "epoch": 0.5063674735740827, + "grad_norm": 0.11557316780090332, + "learning_rate": 0.000179757540770674, + "loss": 0.122, + "step": 7018 + }, + { + "epoch": 0.5064396262491432, + "grad_norm": 0.09845668822526932, + "learning_rate": 0.00017975465435127725, + "loss": 0.1862, + "step": 7019 + }, + { + "epoch": 0.5065117789242036, + "grad_norm": 0.11798904091119766, + "learning_rate": 0.00017975176793188051, + "loss": 0.1551, + "step": 7020 + }, + { + "epoch": 0.5065839315992641, + "grad_norm": 0.16191120445728302, + "learning_rate": 0.00017974888151248378, + "loss": 0.1648, + "step": 7021 + }, + { + "epoch": 0.5066560842743245, + "grad_norm": 0.1420876383781433, + "learning_rate": 0.00017974599509308704, + "loss": 0.2152, + "step": 7022 + }, + { + "epoch": 0.5067282369493848, + "grad_norm": 0.13263604044914246, + "learning_rate": 0.00017974310867369028, + "loss": 0.179, + "step": 7023 + }, + { + "epoch": 0.5068003896244453, + "grad_norm": 0.12817439436912537, + "learning_rate": 0.00017974022225429357, + "loss": 0.1644, + "step": 7024 + }, + { + "epoch": 0.5068725422995057, + "grad_norm": 0.1356990933418274, + "learning_rate": 0.00017973733583489683, + "loss": 0.2159, + "step": 7025 + }, + { + "epoch": 0.5069446949745662, + "grad_norm": 0.15865840017795563, + "learning_rate": 0.0001797344494155001, + "loss": 0.1347, + "step": 7026 + }, + { + "epoch": 0.5070168476496266, + "grad_norm": 0.12952634692192078, + "learning_rate": 0.00017973156299610335, + "loss": 0.1918, + "step": 7027 + }, + { + "epoch": 0.5070890003246871, + "grad_norm": 0.12019264698028564, + "learning_rate": 0.0001797286765767066, + "loss": 0.1209, + "step": 7028 + }, + { + "epoch": 0.5071611529997475, + "grad_norm": 0.13946324586868286, + "learning_rate": 0.00017972579015730985, + "loss": 0.1278, + "step": 7029 + }, + { + "epoch": 0.5072333056748078, + "grad_norm": 0.11112209409475327, + "learning_rate": 0.00017972290373791312, + "loss": 0.0934, + "step": 7030 + }, + { + "epoch": 0.5073054583498683, + "grad_norm": 0.11413145065307617, + "learning_rate": 0.0001797200173185164, + "loss": 0.1378, + "step": 7031 + }, + { + "epoch": 0.5073776110249287, + "grad_norm": 0.12074559926986694, + "learning_rate": 0.00017971713089911967, + "loss": 0.1683, + "step": 7032 + }, + { + "epoch": 0.5074497636999892, + "grad_norm": 0.12333431094884872, + "learning_rate": 0.0001797142444797229, + "loss": 0.128, + "step": 7033 + }, + { + "epoch": 0.5075219163750496, + "grad_norm": 0.10644882917404175, + "learning_rate": 0.00017971135806032617, + "loss": 0.1364, + "step": 7034 + }, + { + "epoch": 0.5075940690501101, + "grad_norm": 0.10675278306007385, + "learning_rate": 0.00017970847164092943, + "loss": 0.1467, + "step": 7035 + }, + { + "epoch": 0.5076662217251705, + "grad_norm": 0.1398714929819107, + "learning_rate": 0.0001797055852215327, + "loss": 0.1811, + "step": 7036 + }, + { + "epoch": 0.5077383744002308, + "grad_norm": 0.11652438342571259, + "learning_rate": 0.00017970269880213596, + "loss": 0.1778, + "step": 7037 + }, + { + "epoch": 0.5078105270752913, + "grad_norm": 0.11117210984230042, + "learning_rate": 0.00017969981238273922, + "loss": 0.1226, + "step": 7038 + }, + { + "epoch": 0.5078826797503517, + "grad_norm": 0.12034925073385239, + "learning_rate": 0.00017969692596334248, + "loss": 0.2226, + "step": 7039 + }, + { + "epoch": 0.5079548324254122, + "grad_norm": 0.17814908921718597, + "learning_rate": 0.00017969403954394575, + "loss": 0.1634, + "step": 7040 + }, + { + "epoch": 0.5080269851004726, + "grad_norm": 0.12198749929666519, + "learning_rate": 0.000179691153124549, + "loss": 0.1614, + "step": 7041 + }, + { + "epoch": 0.5080991377755331, + "grad_norm": 0.10999343544244766, + "learning_rate": 0.00017968826670515227, + "loss": 0.1414, + "step": 7042 + }, + { + "epoch": 0.5081712904505935, + "grad_norm": 0.11487124860286713, + "learning_rate": 0.00017968538028575553, + "loss": 0.1703, + "step": 7043 + }, + { + "epoch": 0.5082434431256538, + "grad_norm": 0.10328925400972366, + "learning_rate": 0.00017968249386635877, + "loss": 0.147, + "step": 7044 + }, + { + "epoch": 0.5083155958007143, + "grad_norm": 0.0999947264790535, + "learning_rate": 0.00017967960744696206, + "loss": 0.1905, + "step": 7045 + }, + { + "epoch": 0.5083877484757747, + "grad_norm": 0.10743480175733566, + "learning_rate": 0.00017967672102756532, + "loss": 0.1452, + "step": 7046 + }, + { + "epoch": 0.5084599011508352, + "grad_norm": 0.11257276684045792, + "learning_rate": 0.00017967383460816859, + "loss": 0.1336, + "step": 7047 + }, + { + "epoch": 0.5085320538258956, + "grad_norm": 0.12533427774906158, + "learning_rate": 0.00017967094818877185, + "loss": 0.1068, + "step": 7048 + }, + { + "epoch": 0.5086042065009561, + "grad_norm": 0.11079025268554688, + "learning_rate": 0.00017966806176937508, + "loss": 0.1423, + "step": 7049 + }, + { + "epoch": 0.5086763591760165, + "grad_norm": 0.12263496220111847, + "learning_rate": 0.00017966517534997835, + "loss": 0.1511, + "step": 7050 + }, + { + "epoch": 0.5087485118510768, + "grad_norm": 0.1146349087357521, + "learning_rate": 0.0001796622889305816, + "loss": 0.1482, + "step": 7051 + }, + { + "epoch": 0.5088206645261373, + "grad_norm": 0.11123580485582352, + "learning_rate": 0.0001796594025111849, + "loss": 0.1826, + "step": 7052 + }, + { + "epoch": 0.5088928172011977, + "grad_norm": 0.13652732968330383, + "learning_rate": 0.00017965651609178816, + "loss": 0.1803, + "step": 7053 + }, + { + "epoch": 0.5089649698762582, + "grad_norm": 0.16410355269908905, + "learning_rate": 0.0001796536296723914, + "loss": 0.1844, + "step": 7054 + }, + { + "epoch": 0.5090371225513186, + "grad_norm": 0.1269945502281189, + "learning_rate": 0.00017965074325299466, + "loss": 0.1875, + "step": 7055 + }, + { + "epoch": 0.509109275226379, + "grad_norm": 0.1545627862215042, + "learning_rate": 0.00017964785683359793, + "loss": 0.1762, + "step": 7056 + }, + { + "epoch": 0.5091814279014395, + "grad_norm": 0.12612615525722504, + "learning_rate": 0.0001796449704142012, + "loss": 0.1691, + "step": 7057 + }, + { + "epoch": 0.5092535805764998, + "grad_norm": 0.14168410003185272, + "learning_rate": 0.00017964208399480445, + "loss": 0.1414, + "step": 7058 + }, + { + "epoch": 0.5093257332515603, + "grad_norm": 0.1303904950618744, + "learning_rate": 0.00017963919757540771, + "loss": 0.2057, + "step": 7059 + }, + { + "epoch": 0.5093978859266207, + "grad_norm": 0.11661367118358612, + "learning_rate": 0.00017963631115601098, + "loss": 0.1503, + "step": 7060 + }, + { + "epoch": 0.5094700386016812, + "grad_norm": 0.13968224823474884, + "learning_rate": 0.00017963342473661424, + "loss": 0.1054, + "step": 7061 + }, + { + "epoch": 0.5095421912767416, + "grad_norm": 0.12871862947940826, + "learning_rate": 0.0001796305383172175, + "loss": 0.1559, + "step": 7062 + }, + { + "epoch": 0.509614343951802, + "grad_norm": 0.16265441477298737, + "learning_rate": 0.00017962765189782077, + "loss": 0.1363, + "step": 7063 + }, + { + "epoch": 0.5096864966268625, + "grad_norm": 0.19330908358097076, + "learning_rate": 0.00017962476547842403, + "loss": 0.1145, + "step": 7064 + }, + { + "epoch": 0.5097586493019228, + "grad_norm": 0.1145443320274353, + "learning_rate": 0.00017962187905902726, + "loss": 0.1688, + "step": 7065 + }, + { + "epoch": 0.5098308019769833, + "grad_norm": 0.16316966712474823, + "learning_rate": 0.00017961899263963055, + "loss": 0.1873, + "step": 7066 + }, + { + "epoch": 0.5099029546520437, + "grad_norm": 0.10494080930948257, + "learning_rate": 0.00017961610622023382, + "loss": 0.1404, + "step": 7067 + }, + { + "epoch": 0.5099751073271042, + "grad_norm": 0.11450763791799545, + "learning_rate": 0.00017961321980083708, + "loss": 0.1579, + "step": 7068 + }, + { + "epoch": 0.5100472600021646, + "grad_norm": 0.1169004738330841, + "learning_rate": 0.00017961033338144034, + "loss": 0.1023, + "step": 7069 + }, + { + "epoch": 0.510119412677225, + "grad_norm": 0.1317950189113617, + "learning_rate": 0.00017960744696204358, + "loss": 0.2358, + "step": 7070 + }, + { + "epoch": 0.5101915653522854, + "grad_norm": 0.14101877808570862, + "learning_rate": 0.00017960456054264684, + "loss": 0.143, + "step": 7071 + }, + { + "epoch": 0.5102637180273458, + "grad_norm": 0.18086402118206024, + "learning_rate": 0.0001796016741232501, + "loss": 0.1629, + "step": 7072 + }, + { + "epoch": 0.5103358707024063, + "grad_norm": 0.1619880646467209, + "learning_rate": 0.0001795987877038534, + "loss": 0.144, + "step": 7073 + }, + { + "epoch": 0.5104080233774667, + "grad_norm": 0.14426441490650177, + "learning_rate": 0.00017959590128445666, + "loss": 0.1667, + "step": 7074 + }, + { + "epoch": 0.5104801760525272, + "grad_norm": 0.12022842466831207, + "learning_rate": 0.0001795930148650599, + "loss": 0.173, + "step": 7075 + }, + { + "epoch": 0.5105523287275876, + "grad_norm": 0.1245194524526596, + "learning_rate": 0.00017959012844566316, + "loss": 0.1601, + "step": 7076 + }, + { + "epoch": 0.510624481402648, + "grad_norm": 0.1720270961523056, + "learning_rate": 0.00017958724202626642, + "loss": 0.2207, + "step": 7077 + }, + { + "epoch": 0.5106966340777084, + "grad_norm": 0.13784779608249664, + "learning_rate": 0.00017958435560686968, + "loss": 0.1676, + "step": 7078 + }, + { + "epoch": 0.5107687867527688, + "grad_norm": 0.10445517301559448, + "learning_rate": 0.00017958146918747295, + "loss": 0.1448, + "step": 7079 + }, + { + "epoch": 0.5108409394278293, + "grad_norm": 0.1254737228155136, + "learning_rate": 0.0001795785827680762, + "loss": 0.1447, + "step": 7080 + }, + { + "epoch": 0.5109130921028897, + "grad_norm": 0.11472035944461823, + "learning_rate": 0.00017957569634867947, + "loss": 0.1331, + "step": 7081 + }, + { + "epoch": 0.5109852447779502, + "grad_norm": 0.11648841202259064, + "learning_rate": 0.00017957280992928273, + "loss": 0.1631, + "step": 7082 + }, + { + "epoch": 0.5110573974530106, + "grad_norm": 0.12026941031217575, + "learning_rate": 0.000179569923509886, + "loss": 0.1392, + "step": 7083 + }, + { + "epoch": 0.511129550128071, + "grad_norm": 0.11940126866102219, + "learning_rate": 0.00017956703709048926, + "loss": 0.1431, + "step": 7084 + }, + { + "epoch": 0.5112017028031314, + "grad_norm": 0.12190019339323044, + "learning_rate": 0.00017956415067109252, + "loss": 0.1748, + "step": 7085 + }, + { + "epoch": 0.5112738554781918, + "grad_norm": 0.1303064376115799, + "learning_rate": 0.00017956126425169576, + "loss": 0.1483, + "step": 7086 + }, + { + "epoch": 0.5113460081532523, + "grad_norm": 0.11201722174882889, + "learning_rate": 0.00017955837783229905, + "loss": 0.1709, + "step": 7087 + }, + { + "epoch": 0.5114181608283127, + "grad_norm": 0.132659912109375, + "learning_rate": 0.0001795554914129023, + "loss": 0.148, + "step": 7088 + }, + { + "epoch": 0.5114903135033732, + "grad_norm": 0.1099841296672821, + "learning_rate": 0.00017955260499350557, + "loss": 0.2, + "step": 7089 + }, + { + "epoch": 0.5115624661784336, + "grad_norm": 0.11605177074670792, + "learning_rate": 0.00017954971857410884, + "loss": 0.1367, + "step": 7090 + }, + { + "epoch": 0.511634618853494, + "grad_norm": 0.13425514101982117, + "learning_rate": 0.00017954683215471207, + "loss": 0.1885, + "step": 7091 + }, + { + "epoch": 0.5117067715285544, + "grad_norm": 0.09016387164592743, + "learning_rate": 0.00017954394573531534, + "loss": 0.129, + "step": 7092 + }, + { + "epoch": 0.5117789242036148, + "grad_norm": 0.11584919691085815, + "learning_rate": 0.0001795410593159186, + "loss": 0.1891, + "step": 7093 + }, + { + "epoch": 0.5118510768786753, + "grad_norm": 0.13602469861507416, + "learning_rate": 0.0001795381728965219, + "loss": 0.1442, + "step": 7094 + }, + { + "epoch": 0.5119232295537357, + "grad_norm": 0.11831388622522354, + "learning_rate": 0.00017953528647712515, + "loss": 0.1617, + "step": 7095 + }, + { + "epoch": 0.5119953822287961, + "grad_norm": 0.12001494318246841, + "learning_rate": 0.0001795324000577284, + "loss": 0.1257, + "step": 7096 + }, + { + "epoch": 0.5120675349038566, + "grad_norm": 0.1519460827112198, + "learning_rate": 0.00017952951363833165, + "loss": 0.1627, + "step": 7097 + }, + { + "epoch": 0.512139687578917, + "grad_norm": 0.12677805125713348, + "learning_rate": 0.00017952662721893491, + "loss": 0.1374, + "step": 7098 + }, + { + "epoch": 0.5122118402539774, + "grad_norm": 0.1298854649066925, + "learning_rate": 0.00017952374079953818, + "loss": 0.1082, + "step": 7099 + }, + { + "epoch": 0.5122839929290378, + "grad_norm": 0.16897009313106537, + "learning_rate": 0.00017952085438014144, + "loss": 0.2109, + "step": 7100 + }, + { + "epoch": 0.5123561456040983, + "grad_norm": 0.13847383856773376, + "learning_rate": 0.0001795179679607447, + "loss": 0.2166, + "step": 7101 + }, + { + "epoch": 0.5124282982791587, + "grad_norm": 0.13180577754974365, + "learning_rate": 0.00017951508154134797, + "loss": 0.1774, + "step": 7102 + }, + { + "epoch": 0.5125004509542191, + "grad_norm": 0.1354549080133438, + "learning_rate": 0.00017951219512195123, + "loss": 0.1629, + "step": 7103 + }, + { + "epoch": 0.5125726036292796, + "grad_norm": 0.10785700380802155, + "learning_rate": 0.0001795093087025545, + "loss": 0.1226, + "step": 7104 + }, + { + "epoch": 0.51264475630434, + "grad_norm": 0.11956942081451416, + "learning_rate": 0.00017950642228315775, + "loss": 0.1717, + "step": 7105 + }, + { + "epoch": 0.5127169089794004, + "grad_norm": 0.11497174948453903, + "learning_rate": 0.00017950353586376102, + "loss": 0.1623, + "step": 7106 + }, + { + "epoch": 0.5127890616544608, + "grad_norm": 0.15201187133789062, + "learning_rate": 0.00017950064944436425, + "loss": 0.1592, + "step": 7107 + }, + { + "epoch": 0.5128612143295213, + "grad_norm": 0.12435279041528702, + "learning_rate": 0.00017949776302496754, + "loss": 0.1148, + "step": 7108 + }, + { + "epoch": 0.5129333670045817, + "grad_norm": 0.12462184578180313, + "learning_rate": 0.0001794948766055708, + "loss": 0.1231, + "step": 7109 + }, + { + "epoch": 0.5130055196796421, + "grad_norm": 0.12461728602647781, + "learning_rate": 0.00017949199018617407, + "loss": 0.1636, + "step": 7110 + }, + { + "epoch": 0.5130776723547026, + "grad_norm": 0.11966323107481003, + "learning_rate": 0.00017948910376677733, + "loss": 0.1824, + "step": 7111 + }, + { + "epoch": 0.513149825029763, + "grad_norm": 0.10968772321939468, + "learning_rate": 0.00017948621734738057, + "loss": 0.1257, + "step": 7112 + }, + { + "epoch": 0.5132219777048234, + "grad_norm": 0.13588903844356537, + "learning_rate": 0.00017948333092798383, + "loss": 0.187, + "step": 7113 + }, + { + "epoch": 0.5132941303798838, + "grad_norm": 0.10114775598049164, + "learning_rate": 0.0001794804445085871, + "loss": 0.1311, + "step": 7114 + }, + { + "epoch": 0.5133662830549443, + "grad_norm": 0.1150478795170784, + "learning_rate": 0.00017947755808919038, + "loss": 0.1423, + "step": 7115 + }, + { + "epoch": 0.5134384357300047, + "grad_norm": 0.13415440917015076, + "learning_rate": 0.00017947467166979365, + "loss": 0.1541, + "step": 7116 + }, + { + "epoch": 0.5135105884050651, + "grad_norm": 0.12662853300571442, + "learning_rate": 0.00017947178525039688, + "loss": 0.1371, + "step": 7117 + }, + { + "epoch": 0.5135827410801256, + "grad_norm": 0.10206128656864166, + "learning_rate": 0.00017946889883100014, + "loss": 0.1565, + "step": 7118 + }, + { + "epoch": 0.513654893755186, + "grad_norm": 0.14350473880767822, + "learning_rate": 0.0001794660124116034, + "loss": 0.1764, + "step": 7119 + }, + { + "epoch": 0.5137270464302464, + "grad_norm": 0.07911955565214157, + "learning_rate": 0.00017946312599220667, + "loss": 0.1468, + "step": 7120 + }, + { + "epoch": 0.5137991991053068, + "grad_norm": 0.12495509535074234, + "learning_rate": 0.00017946023957280993, + "loss": 0.1332, + "step": 7121 + }, + { + "epoch": 0.5138713517803672, + "grad_norm": 0.11479618400335312, + "learning_rate": 0.0001794573531534132, + "loss": 0.1465, + "step": 7122 + }, + { + "epoch": 0.5139435044554277, + "grad_norm": 0.14247792959213257, + "learning_rate": 0.00017945446673401646, + "loss": 0.1726, + "step": 7123 + }, + { + "epoch": 0.5140156571304881, + "grad_norm": 0.1663128286600113, + "learning_rate": 0.00017945158031461972, + "loss": 0.2307, + "step": 7124 + }, + { + "epoch": 0.5140878098055486, + "grad_norm": 0.12039539217948914, + "learning_rate": 0.00017944869389522299, + "loss": 0.1669, + "step": 7125 + }, + { + "epoch": 0.514159962480609, + "grad_norm": 0.11929290741682053, + "learning_rate": 0.00017944580747582625, + "loss": 0.1681, + "step": 7126 + }, + { + "epoch": 0.5142321151556694, + "grad_norm": 0.1053629219532013, + "learning_rate": 0.0001794429210564295, + "loss": 0.1712, + "step": 7127 + }, + { + "epoch": 0.5143042678307298, + "grad_norm": 0.11167839169502258, + "learning_rate": 0.00017944003463703277, + "loss": 0.1656, + "step": 7128 + }, + { + "epoch": 0.5143764205057902, + "grad_norm": 0.11405834555625916, + "learning_rate": 0.00017943714821763604, + "loss": 0.1653, + "step": 7129 + }, + { + "epoch": 0.5144485731808507, + "grad_norm": 0.11706086993217468, + "learning_rate": 0.0001794342617982393, + "loss": 0.1301, + "step": 7130 + }, + { + "epoch": 0.5145207258559111, + "grad_norm": 0.11771078407764435, + "learning_rate": 0.00017943137537884256, + "loss": 0.1333, + "step": 7131 + }, + { + "epoch": 0.5145928785309716, + "grad_norm": 0.1398884356021881, + "learning_rate": 0.00017942848895944583, + "loss": 0.1846, + "step": 7132 + }, + { + "epoch": 0.5146650312060319, + "grad_norm": 0.10110854357481003, + "learning_rate": 0.0001794256025400491, + "loss": 0.1993, + "step": 7133 + }, + { + "epoch": 0.5147371838810924, + "grad_norm": 0.1052713468670845, + "learning_rate": 0.00017942271612065232, + "loss": 0.0889, + "step": 7134 + }, + { + "epoch": 0.5148093365561528, + "grad_norm": 0.10848946869373322, + "learning_rate": 0.0001794198297012556, + "loss": 0.1714, + "step": 7135 + }, + { + "epoch": 0.5148814892312132, + "grad_norm": 0.10938854515552521, + "learning_rate": 0.00017941694328185888, + "loss": 0.1233, + "step": 7136 + }, + { + "epoch": 0.5149536419062737, + "grad_norm": 0.15125346183776855, + "learning_rate": 0.00017941405686246214, + "loss": 0.1782, + "step": 7137 + }, + { + "epoch": 0.5150257945813341, + "grad_norm": 0.11715354770421982, + "learning_rate": 0.0001794111704430654, + "loss": 0.1256, + "step": 7138 + }, + { + "epoch": 0.5150979472563946, + "grad_norm": 0.15763883292675018, + "learning_rate": 0.00017940828402366864, + "loss": 0.1721, + "step": 7139 + }, + { + "epoch": 0.5151700999314549, + "grad_norm": 0.1265951693058014, + "learning_rate": 0.0001794053976042719, + "loss": 0.1218, + "step": 7140 + }, + { + "epoch": 0.5152422526065153, + "grad_norm": 0.1079411506652832, + "learning_rate": 0.00017940251118487517, + "loss": 0.1693, + "step": 7141 + }, + { + "epoch": 0.5153144052815758, + "grad_norm": 0.1086970716714859, + "learning_rate": 0.00017939962476547843, + "loss": 0.1467, + "step": 7142 + }, + { + "epoch": 0.5153865579566362, + "grad_norm": 0.11504565179347992, + "learning_rate": 0.00017939673834608172, + "loss": 0.121, + "step": 7143 + }, + { + "epoch": 0.5154587106316967, + "grad_norm": 0.12443605810403824, + "learning_rate": 0.00017939385192668495, + "loss": 0.1886, + "step": 7144 + }, + { + "epoch": 0.5155308633067571, + "grad_norm": 0.10076568275690079, + "learning_rate": 0.00017939096550728822, + "loss": 0.1106, + "step": 7145 + }, + { + "epoch": 0.5156030159818176, + "grad_norm": 0.12320218235254288, + "learning_rate": 0.00017938807908789148, + "loss": 0.1613, + "step": 7146 + }, + { + "epoch": 0.5156751686568779, + "grad_norm": 0.12105323374271393, + "learning_rate": 0.00017938519266849474, + "loss": 0.1779, + "step": 7147 + }, + { + "epoch": 0.5157473213319383, + "grad_norm": 0.11187251657247543, + "learning_rate": 0.000179382306249098, + "loss": 0.1617, + "step": 7148 + }, + { + "epoch": 0.5158194740069988, + "grad_norm": 0.13889268040657043, + "learning_rate": 0.00017937941982970127, + "loss": 0.142, + "step": 7149 + }, + { + "epoch": 0.5158916266820592, + "grad_norm": 0.15993313491344452, + "learning_rate": 0.0001793765334103045, + "loss": 0.1478, + "step": 7150 + }, + { + "epoch": 0.5159637793571197, + "grad_norm": 0.12263643741607666, + "learning_rate": 0.0001793736469909078, + "loss": 0.116, + "step": 7151 + }, + { + "epoch": 0.5160359320321801, + "grad_norm": 0.11724000424146652, + "learning_rate": 0.00017937076057151106, + "loss": 0.168, + "step": 7152 + }, + { + "epoch": 0.5161080847072406, + "grad_norm": 0.11943333595991135, + "learning_rate": 0.00017936787415211432, + "loss": 0.1757, + "step": 7153 + }, + { + "epoch": 0.5161802373823009, + "grad_norm": 0.12497618049383163, + "learning_rate": 0.00017936498773271758, + "loss": 0.1531, + "step": 7154 + }, + { + "epoch": 0.5162523900573613, + "grad_norm": 0.11900891363620758, + "learning_rate": 0.00017936210131332082, + "loss": 0.1362, + "step": 7155 + }, + { + "epoch": 0.5163245427324218, + "grad_norm": 0.12837271392345428, + "learning_rate": 0.00017935921489392408, + "loss": 0.148, + "step": 7156 + }, + { + "epoch": 0.5163966954074822, + "grad_norm": 0.11471260339021683, + "learning_rate": 0.00017935632847452734, + "loss": 0.1738, + "step": 7157 + }, + { + "epoch": 0.5164688480825427, + "grad_norm": 0.10337173193693161, + "learning_rate": 0.00017935344205513063, + "loss": 0.1604, + "step": 7158 + }, + { + "epoch": 0.5165410007576031, + "grad_norm": 0.12180466949939728, + "learning_rate": 0.0001793505556357339, + "loss": 0.1699, + "step": 7159 + }, + { + "epoch": 0.5166131534326636, + "grad_norm": 0.10087094455957413, + "learning_rate": 0.00017934766921633713, + "loss": 0.1819, + "step": 7160 + }, + { + "epoch": 0.5166853061077239, + "grad_norm": 0.12025085836648941, + "learning_rate": 0.0001793447827969404, + "loss": 0.1661, + "step": 7161 + }, + { + "epoch": 0.5167574587827843, + "grad_norm": 0.11435249447822571, + "learning_rate": 0.00017934189637754366, + "loss": 0.107, + "step": 7162 + }, + { + "epoch": 0.5168296114578448, + "grad_norm": 0.1328279674053192, + "learning_rate": 0.00017933900995814692, + "loss": 0.1972, + "step": 7163 + }, + { + "epoch": 0.5169017641329052, + "grad_norm": 0.11817678064107895, + "learning_rate": 0.00017933612353875019, + "loss": 0.1716, + "step": 7164 + }, + { + "epoch": 0.5169739168079657, + "grad_norm": 0.10996687412261963, + "learning_rate": 0.00017933323711935345, + "loss": 0.164, + "step": 7165 + }, + { + "epoch": 0.5170460694830261, + "grad_norm": 0.1188410222530365, + "learning_rate": 0.0001793303506999567, + "loss": 0.1349, + "step": 7166 + }, + { + "epoch": 0.5171182221580866, + "grad_norm": 0.13090239465236664, + "learning_rate": 0.00017932746428055997, + "loss": 0.2171, + "step": 7167 + }, + { + "epoch": 0.5171903748331469, + "grad_norm": 0.11918402463197708, + "learning_rate": 0.00017932457786116324, + "loss": 0.1671, + "step": 7168 + }, + { + "epoch": 0.5172625275082073, + "grad_norm": 0.12439822405576706, + "learning_rate": 0.0001793216914417665, + "loss": 0.1678, + "step": 7169 + }, + { + "epoch": 0.5173346801832678, + "grad_norm": 0.12405625730752945, + "learning_rate": 0.00017931880502236976, + "loss": 0.107, + "step": 7170 + }, + { + "epoch": 0.5174068328583282, + "grad_norm": 0.10050161182880402, + "learning_rate": 0.000179315918602973, + "loss": 0.1308, + "step": 7171 + }, + { + "epoch": 0.5174789855333887, + "grad_norm": 0.1161796823143959, + "learning_rate": 0.0001793130321835763, + "loss": 0.1474, + "step": 7172 + }, + { + "epoch": 0.5175511382084491, + "grad_norm": 0.1269403100013733, + "learning_rate": 0.00017931014576417955, + "loss": 0.2059, + "step": 7173 + }, + { + "epoch": 0.5176232908835096, + "grad_norm": 0.10315892100334167, + "learning_rate": 0.00017930725934478281, + "loss": 0.1052, + "step": 7174 + }, + { + "epoch": 0.5176954435585699, + "grad_norm": 0.1161118671298027, + "learning_rate": 0.00017930437292538608, + "loss": 0.1421, + "step": 7175 + }, + { + "epoch": 0.5177675962336303, + "grad_norm": 0.1218157634139061, + "learning_rate": 0.0001793014865059893, + "loss": 0.1585, + "step": 7176 + }, + { + "epoch": 0.5178397489086908, + "grad_norm": 0.11405991017818451, + "learning_rate": 0.00017929860008659258, + "loss": 0.1643, + "step": 7177 + }, + { + "epoch": 0.5179119015837512, + "grad_norm": 0.11038383096456528, + "learning_rate": 0.00017929571366719584, + "loss": 0.1415, + "step": 7178 + }, + { + "epoch": 0.5179840542588117, + "grad_norm": 0.12101422995328903, + "learning_rate": 0.00017929282724779913, + "loss": 0.163, + "step": 7179 + }, + { + "epoch": 0.5180562069338721, + "grad_norm": 0.10214401036500931, + "learning_rate": 0.0001792899408284024, + "loss": 0.1273, + "step": 7180 + }, + { + "epoch": 0.5181283596089326, + "grad_norm": 0.08696547150611877, + "learning_rate": 0.00017928705440900563, + "loss": 0.1835, + "step": 7181 + }, + { + "epoch": 0.5182005122839929, + "grad_norm": 0.1108965203166008, + "learning_rate": 0.0001792841679896089, + "loss": 0.1787, + "step": 7182 + }, + { + "epoch": 0.5182726649590533, + "grad_norm": 0.10518892854452133, + "learning_rate": 0.00017928128157021215, + "loss": 0.1906, + "step": 7183 + }, + { + "epoch": 0.5183448176341138, + "grad_norm": 0.11729423701763153, + "learning_rate": 0.00017927839515081542, + "loss": 0.1725, + "step": 7184 + }, + { + "epoch": 0.5184169703091742, + "grad_norm": 0.11098305135965347, + "learning_rate": 0.00017927550873141868, + "loss": 0.1754, + "step": 7185 + }, + { + "epoch": 0.5184891229842347, + "grad_norm": 0.14198029041290283, + "learning_rate": 0.00017927262231202194, + "loss": 0.1726, + "step": 7186 + }, + { + "epoch": 0.5185612756592951, + "grad_norm": 0.12816059589385986, + "learning_rate": 0.0001792697358926252, + "loss": 0.1231, + "step": 7187 + }, + { + "epoch": 0.5186334283343556, + "grad_norm": 0.20600976049900055, + "learning_rate": 0.00017926684947322847, + "loss": 0.1498, + "step": 7188 + }, + { + "epoch": 0.5187055810094159, + "grad_norm": 0.10582345724105835, + "learning_rate": 0.00017926396305383173, + "loss": 0.2063, + "step": 7189 + }, + { + "epoch": 0.5187777336844763, + "grad_norm": 0.11878283321857452, + "learning_rate": 0.000179261076634435, + "loss": 0.1777, + "step": 7190 + }, + { + "epoch": 0.5188498863595368, + "grad_norm": 0.13395068049430847, + "learning_rate": 0.00017925819021503826, + "loss": 0.1911, + "step": 7191 + }, + { + "epoch": 0.5189220390345972, + "grad_norm": 0.12442594021558762, + "learning_rate": 0.0001792553037956415, + "loss": 0.1398, + "step": 7192 + }, + { + "epoch": 0.5189941917096577, + "grad_norm": 0.10257423669099808, + "learning_rate": 0.00017925241737624478, + "loss": 0.1238, + "step": 7193 + }, + { + "epoch": 0.5190663443847181, + "grad_norm": 0.1495160013437271, + "learning_rate": 0.00017924953095684805, + "loss": 0.1993, + "step": 7194 + }, + { + "epoch": 0.5191384970597784, + "grad_norm": 0.11780592799186707, + "learning_rate": 0.0001792466445374513, + "loss": 0.1729, + "step": 7195 + }, + { + "epoch": 0.5192106497348389, + "grad_norm": 0.1251525729894638, + "learning_rate": 0.00017924375811805457, + "loss": 0.1138, + "step": 7196 + }, + { + "epoch": 0.5192828024098993, + "grad_norm": 0.14698529243469238, + "learning_rate": 0.0001792408716986578, + "loss": 0.151, + "step": 7197 + }, + { + "epoch": 0.5193549550849598, + "grad_norm": 0.11803632974624634, + "learning_rate": 0.00017923798527926107, + "loss": 0.1888, + "step": 7198 + }, + { + "epoch": 0.5194271077600202, + "grad_norm": 0.13017745316028595, + "learning_rate": 0.00017923509885986433, + "loss": 0.1845, + "step": 7199 + }, + { + "epoch": 0.5194992604350807, + "grad_norm": 0.11711041629314423, + "learning_rate": 0.00017923221244046762, + "loss": 0.1322, + "step": 7200 + }, + { + "epoch": 0.5195714131101411, + "grad_norm": 0.10870666801929474, + "learning_rate": 0.00017922932602107089, + "loss": 0.1238, + "step": 7201 + }, + { + "epoch": 0.5196435657852014, + "grad_norm": 0.1246069073677063, + "learning_rate": 0.00017922643960167412, + "loss": 0.1529, + "step": 7202 + }, + { + "epoch": 0.5197157184602619, + "grad_norm": 0.11276337504386902, + "learning_rate": 0.00017922355318227738, + "loss": 0.1522, + "step": 7203 + }, + { + "epoch": 0.5197878711353223, + "grad_norm": 0.10751859843730927, + "learning_rate": 0.00017922066676288065, + "loss": 0.2177, + "step": 7204 + }, + { + "epoch": 0.5198600238103828, + "grad_norm": 0.10630340874195099, + "learning_rate": 0.0001792177803434839, + "loss": 0.163, + "step": 7205 + }, + { + "epoch": 0.5199321764854432, + "grad_norm": 0.11413315683603287, + "learning_rate": 0.00017921489392408717, + "loss": 0.1517, + "step": 7206 + }, + { + "epoch": 0.5200043291605037, + "grad_norm": 0.12250448018312454, + "learning_rate": 0.00017921200750469044, + "loss": 0.1842, + "step": 7207 + }, + { + "epoch": 0.5200764818355641, + "grad_norm": 0.09859684854745865, + "learning_rate": 0.0001792091210852937, + "loss": 0.1106, + "step": 7208 + }, + { + "epoch": 0.5201486345106244, + "grad_norm": 0.09322864562273026, + "learning_rate": 0.00017920623466589696, + "loss": 0.145, + "step": 7209 + }, + { + "epoch": 0.5202207871856849, + "grad_norm": 0.11078327894210815, + "learning_rate": 0.00017920334824650023, + "loss": 0.1852, + "step": 7210 + }, + { + "epoch": 0.5202929398607453, + "grad_norm": 0.11507966369390488, + "learning_rate": 0.0001792004618271035, + "loss": 0.1514, + "step": 7211 + }, + { + "epoch": 0.5203650925358058, + "grad_norm": 0.14388643205165863, + "learning_rate": 0.00017919757540770675, + "loss": 0.1565, + "step": 7212 + }, + { + "epoch": 0.5204372452108662, + "grad_norm": 0.10419867932796478, + "learning_rate": 0.00017919468898831, + "loss": 0.111, + "step": 7213 + }, + { + "epoch": 0.5205093978859267, + "grad_norm": 0.14187461137771606, + "learning_rate": 0.00017919180256891328, + "loss": 0.1281, + "step": 7214 + }, + { + "epoch": 0.5205815505609871, + "grad_norm": 0.11850924044847488, + "learning_rate": 0.00017918891614951654, + "loss": 0.1794, + "step": 7215 + }, + { + "epoch": 0.5206537032360474, + "grad_norm": 0.13303731381893158, + "learning_rate": 0.0001791860297301198, + "loss": 0.1664, + "step": 7216 + }, + { + "epoch": 0.5207258559111079, + "grad_norm": 0.14410582184791565, + "learning_rate": 0.00017918314331072307, + "loss": 0.165, + "step": 7217 + }, + { + "epoch": 0.5207980085861683, + "grad_norm": 0.14188916981220245, + "learning_rate": 0.0001791802568913263, + "loss": 0.1256, + "step": 7218 + }, + { + "epoch": 0.5208701612612288, + "grad_norm": 0.15088891983032227, + "learning_rate": 0.00017917737047192956, + "loss": 0.1743, + "step": 7219 + }, + { + "epoch": 0.5209423139362892, + "grad_norm": 0.1348952054977417, + "learning_rate": 0.00017917448405253283, + "loss": 0.1547, + "step": 7220 + }, + { + "epoch": 0.5210144666113496, + "grad_norm": 0.15551945567131042, + "learning_rate": 0.00017917159763313612, + "loss": 0.1381, + "step": 7221 + }, + { + "epoch": 0.5210866192864101, + "grad_norm": 0.12490364164113998, + "learning_rate": 0.00017916871121373938, + "loss": 0.1634, + "step": 7222 + }, + { + "epoch": 0.5211587719614704, + "grad_norm": 0.11290589720010757, + "learning_rate": 0.00017916582479434262, + "loss": 0.141, + "step": 7223 + }, + { + "epoch": 0.5212309246365309, + "grad_norm": 0.10861491411924362, + "learning_rate": 0.00017916293837494588, + "loss": 0.1162, + "step": 7224 + }, + { + "epoch": 0.5213030773115913, + "grad_norm": 0.19713301956653595, + "learning_rate": 0.00017916005195554914, + "loss": 0.1338, + "step": 7225 + }, + { + "epoch": 0.5213752299866518, + "grad_norm": 0.13340701162815094, + "learning_rate": 0.0001791571655361524, + "loss": 0.171, + "step": 7226 + }, + { + "epoch": 0.5214473826617122, + "grad_norm": 0.13060644268989563, + "learning_rate": 0.00017915427911675567, + "loss": 0.1887, + "step": 7227 + }, + { + "epoch": 0.5215195353367726, + "grad_norm": 0.12915129959583282, + "learning_rate": 0.00017915139269735893, + "loss": 0.1207, + "step": 7228 + }, + { + "epoch": 0.5215916880118331, + "grad_norm": 0.11018068343400955, + "learning_rate": 0.0001791485062779622, + "loss": 0.1748, + "step": 7229 + }, + { + "epoch": 0.5216638406868934, + "grad_norm": 0.11852601170539856, + "learning_rate": 0.00017914561985856546, + "loss": 0.1858, + "step": 7230 + }, + { + "epoch": 0.5217359933619539, + "grad_norm": 0.12446949630975723, + "learning_rate": 0.00017914273343916872, + "loss": 0.1479, + "step": 7231 + }, + { + "epoch": 0.5218081460370143, + "grad_norm": 0.15044647455215454, + "learning_rate": 0.00017913984701977198, + "loss": 0.1515, + "step": 7232 + }, + { + "epoch": 0.5218802987120748, + "grad_norm": 0.11914116889238358, + "learning_rate": 0.00017913696060037525, + "loss": 0.1491, + "step": 7233 + }, + { + "epoch": 0.5219524513871352, + "grad_norm": 0.09492188692092896, + "learning_rate": 0.0001791340741809785, + "loss": 0.1718, + "step": 7234 + }, + { + "epoch": 0.5220246040621956, + "grad_norm": 0.13469257950782776, + "learning_rate": 0.00017913118776158177, + "loss": 0.1585, + "step": 7235 + }, + { + "epoch": 0.5220967567372561, + "grad_norm": 0.11164631694555283, + "learning_rate": 0.00017912830134218503, + "loss": 0.1524, + "step": 7236 + }, + { + "epoch": 0.5221689094123164, + "grad_norm": 0.15975221991539001, + "learning_rate": 0.0001791254149227883, + "loss": 0.219, + "step": 7237 + }, + { + "epoch": 0.5222410620873769, + "grad_norm": 0.12551988661289215, + "learning_rate": 0.00017912252850339156, + "loss": 0.1607, + "step": 7238 + }, + { + "epoch": 0.5223132147624373, + "grad_norm": 0.10428985953330994, + "learning_rate": 0.00017911964208399482, + "loss": 0.1552, + "step": 7239 + }, + { + "epoch": 0.5223853674374977, + "grad_norm": 0.11404060572385788, + "learning_rate": 0.00017911675566459806, + "loss": 0.1778, + "step": 7240 + }, + { + "epoch": 0.5224575201125582, + "grad_norm": 0.12849149107933044, + "learning_rate": 0.00017911386924520132, + "loss": 0.1412, + "step": 7241 + }, + { + "epoch": 0.5225296727876186, + "grad_norm": 0.12899062037467957, + "learning_rate": 0.0001791109828258046, + "loss": 0.1425, + "step": 7242 + }, + { + "epoch": 0.5226018254626791, + "grad_norm": 0.11130117624998093, + "learning_rate": 0.00017910809640640787, + "loss": 0.1233, + "step": 7243 + }, + { + "epoch": 0.5226739781377394, + "grad_norm": 0.12267808616161346, + "learning_rate": 0.00017910520998701114, + "loss": 0.1623, + "step": 7244 + }, + { + "epoch": 0.5227461308127999, + "grad_norm": 0.10102290660142899, + "learning_rate": 0.00017910232356761437, + "loss": 0.1738, + "step": 7245 + }, + { + "epoch": 0.5228182834878603, + "grad_norm": 0.10643978416919708, + "learning_rate": 0.00017909943714821764, + "loss": 0.1551, + "step": 7246 + }, + { + "epoch": 0.5228904361629207, + "grad_norm": 0.12526944279670715, + "learning_rate": 0.0001790965507288209, + "loss": 0.1375, + "step": 7247 + }, + { + "epoch": 0.5229625888379812, + "grad_norm": 0.11846707761287689, + "learning_rate": 0.00017909366430942416, + "loss": 0.1041, + "step": 7248 + }, + { + "epoch": 0.5230347415130416, + "grad_norm": 0.15429191291332245, + "learning_rate": 0.00017909077789002745, + "loss": 0.1766, + "step": 7249 + }, + { + "epoch": 0.5231068941881021, + "grad_norm": 0.1419401466846466, + "learning_rate": 0.0001790878914706307, + "loss": 0.2065, + "step": 7250 + }, + { + "epoch": 0.5231790468631624, + "grad_norm": 0.17341232299804688, + "learning_rate": 0.00017908500505123395, + "loss": 0.1787, + "step": 7251 + }, + { + "epoch": 0.5232511995382229, + "grad_norm": 0.11634615063667297, + "learning_rate": 0.00017908211863183721, + "loss": 0.1473, + "step": 7252 + }, + { + "epoch": 0.5233233522132833, + "grad_norm": 0.13800795376300812, + "learning_rate": 0.00017907923221244048, + "loss": 0.1643, + "step": 7253 + }, + { + "epoch": 0.5233955048883437, + "grad_norm": 0.138355553150177, + "learning_rate": 0.00017907634579304374, + "loss": 0.179, + "step": 7254 + }, + { + "epoch": 0.5234676575634042, + "grad_norm": 0.11804528534412384, + "learning_rate": 0.000179073459373647, + "loss": 0.1722, + "step": 7255 + }, + { + "epoch": 0.5235398102384646, + "grad_norm": 0.11212264001369476, + "learning_rate": 0.00017907057295425027, + "loss": 0.1425, + "step": 7256 + }, + { + "epoch": 0.523611962913525, + "grad_norm": 0.1296979933977127, + "learning_rate": 0.00017906768653485353, + "loss": 0.1733, + "step": 7257 + }, + { + "epoch": 0.5236841155885854, + "grad_norm": 0.1127726212143898, + "learning_rate": 0.0001790648001154568, + "loss": 0.1511, + "step": 7258 + }, + { + "epoch": 0.5237562682636459, + "grad_norm": 0.1350761502981186, + "learning_rate": 0.00017906191369606005, + "loss": 0.1661, + "step": 7259 + }, + { + "epoch": 0.5238284209387063, + "grad_norm": 0.10852393507957458, + "learning_rate": 0.00017905902727666332, + "loss": 0.1762, + "step": 7260 + }, + { + "epoch": 0.5239005736137667, + "grad_norm": 0.10514908283948898, + "learning_rate": 0.00017905614085726655, + "loss": 0.1299, + "step": 7261 + }, + { + "epoch": 0.5239727262888272, + "grad_norm": 0.10958047956228256, + "learning_rate": 0.00017905325443786982, + "loss": 0.1214, + "step": 7262 + }, + { + "epoch": 0.5240448789638876, + "grad_norm": 0.09741929918527603, + "learning_rate": 0.0001790503680184731, + "loss": 0.156, + "step": 7263 + }, + { + "epoch": 0.524117031638948, + "grad_norm": 0.10688262432813644, + "learning_rate": 0.00017904748159907637, + "loss": 0.149, + "step": 7264 + }, + { + "epoch": 0.5241891843140084, + "grad_norm": 0.12545837461948395, + "learning_rate": 0.00017904459517967963, + "loss": 0.1376, + "step": 7265 + }, + { + "epoch": 0.5242613369890688, + "grad_norm": 0.09972314536571503, + "learning_rate": 0.00017904170876028287, + "loss": 0.1856, + "step": 7266 + }, + { + "epoch": 0.5243334896641293, + "grad_norm": 0.12121710181236267, + "learning_rate": 0.00017903882234088613, + "loss": 0.1755, + "step": 7267 + }, + { + "epoch": 0.5244056423391897, + "grad_norm": 0.1155657023191452, + "learning_rate": 0.0001790359359214894, + "loss": 0.1547, + "step": 7268 + }, + { + "epoch": 0.5244777950142502, + "grad_norm": 0.12311001867055893, + "learning_rate": 0.00017903304950209266, + "loss": 0.1646, + "step": 7269 + }, + { + "epoch": 0.5245499476893106, + "grad_norm": 0.1461220383644104, + "learning_rate": 0.00017903016308269595, + "loss": 0.1963, + "step": 7270 + }, + { + "epoch": 0.524622100364371, + "grad_norm": 0.1390925645828247, + "learning_rate": 0.00017902727666329918, + "loss": 0.1499, + "step": 7271 + }, + { + "epoch": 0.5246942530394314, + "grad_norm": 0.10540948063135147, + "learning_rate": 0.00017902439024390245, + "loss": 0.1333, + "step": 7272 + }, + { + "epoch": 0.5247664057144918, + "grad_norm": 0.13809522986412048, + "learning_rate": 0.0001790215038245057, + "loss": 0.1216, + "step": 7273 + }, + { + "epoch": 0.5248385583895523, + "grad_norm": 0.13587015867233276, + "learning_rate": 0.00017901861740510897, + "loss": 0.182, + "step": 7274 + }, + { + "epoch": 0.5249107110646127, + "grad_norm": 0.11889909207820892, + "learning_rate": 0.00017901573098571223, + "loss": 0.1897, + "step": 7275 + }, + { + "epoch": 0.5249828637396732, + "grad_norm": 0.1114877238869667, + "learning_rate": 0.0001790128445663155, + "loss": 0.138, + "step": 7276 + }, + { + "epoch": 0.5250550164147336, + "grad_norm": 0.10427363961935043, + "learning_rate": 0.00017900995814691876, + "loss": 0.2297, + "step": 7277 + }, + { + "epoch": 0.525127169089794, + "grad_norm": 0.11984950304031372, + "learning_rate": 0.00017900707172752202, + "loss": 0.1384, + "step": 7278 + }, + { + "epoch": 0.5251993217648544, + "grad_norm": 0.10204624384641647, + "learning_rate": 0.00017900418530812529, + "loss": 0.133, + "step": 7279 + }, + { + "epoch": 0.5252714744399148, + "grad_norm": 0.1155349463224411, + "learning_rate": 0.00017900129888872855, + "loss": 0.1434, + "step": 7280 + }, + { + "epoch": 0.5253436271149753, + "grad_norm": 0.10468247532844543, + "learning_rate": 0.0001789984124693318, + "loss": 0.165, + "step": 7281 + }, + { + "epoch": 0.5254157797900357, + "grad_norm": 0.14522889256477356, + "learning_rate": 0.00017899552604993505, + "loss": 0.1742, + "step": 7282 + }, + { + "epoch": 0.5254879324650962, + "grad_norm": 0.11063766479492188, + "learning_rate": 0.0001789926396305383, + "loss": 0.159, + "step": 7283 + }, + { + "epoch": 0.5255600851401566, + "grad_norm": 0.0959901511669159, + "learning_rate": 0.0001789897532111416, + "loss": 0.1331, + "step": 7284 + }, + { + "epoch": 0.525632237815217, + "grad_norm": 0.10436305403709412, + "learning_rate": 0.00017898686679174486, + "loss": 0.154, + "step": 7285 + }, + { + "epoch": 0.5257043904902774, + "grad_norm": 0.14302986860275269, + "learning_rate": 0.00017898398037234813, + "loss": 0.153, + "step": 7286 + }, + { + "epoch": 0.5257765431653378, + "grad_norm": 0.10951834172010422, + "learning_rate": 0.00017898109395295136, + "loss": 0.1038, + "step": 7287 + }, + { + "epoch": 0.5258486958403983, + "grad_norm": 0.14024408161640167, + "learning_rate": 0.00017897820753355462, + "loss": 0.1774, + "step": 7288 + }, + { + "epoch": 0.5259208485154587, + "grad_norm": 0.14749003946781158, + "learning_rate": 0.0001789753211141579, + "loss": 0.1897, + "step": 7289 + }, + { + "epoch": 0.5259930011905192, + "grad_norm": 0.1298619657754898, + "learning_rate": 0.00017897243469476115, + "loss": 0.1918, + "step": 7290 + }, + { + "epoch": 0.5260651538655796, + "grad_norm": 0.1270139515399933, + "learning_rate": 0.00017896954827536444, + "loss": 0.1808, + "step": 7291 + }, + { + "epoch": 0.52613730654064, + "grad_norm": 0.11911728233098984, + "learning_rate": 0.00017896666185596768, + "loss": 0.1621, + "step": 7292 + }, + { + "epoch": 0.5262094592157004, + "grad_norm": 0.12327376753091812, + "learning_rate": 0.00017896377543657094, + "loss": 0.1415, + "step": 7293 + }, + { + "epoch": 0.5262816118907608, + "grad_norm": 0.10306958854198456, + "learning_rate": 0.0001789608890171742, + "loss": 0.1708, + "step": 7294 + }, + { + "epoch": 0.5263537645658213, + "grad_norm": 0.14238040149211884, + "learning_rate": 0.00017895800259777747, + "loss": 0.1303, + "step": 7295 + }, + { + "epoch": 0.5264259172408817, + "grad_norm": 0.11975669860839844, + "learning_rate": 0.00017895511617838073, + "loss": 0.1656, + "step": 7296 + }, + { + "epoch": 0.5264980699159422, + "grad_norm": 0.11540134996175766, + "learning_rate": 0.000178952229758984, + "loss": 0.1431, + "step": 7297 + }, + { + "epoch": 0.5265702225910026, + "grad_norm": 0.11716841161251068, + "learning_rate": 0.00017894934333958725, + "loss": 0.1022, + "step": 7298 + }, + { + "epoch": 0.526642375266063, + "grad_norm": 0.1370508372783661, + "learning_rate": 0.00017894645692019052, + "loss": 0.1944, + "step": 7299 + }, + { + "epoch": 0.5267145279411234, + "grad_norm": 0.14596012234687805, + "learning_rate": 0.00017894357050079378, + "loss": 0.1265, + "step": 7300 + }, + { + "epoch": 0.5267866806161838, + "grad_norm": 0.1146797239780426, + "learning_rate": 0.00017894068408139704, + "loss": 0.1364, + "step": 7301 + }, + { + "epoch": 0.5268588332912443, + "grad_norm": 0.12097577750682831, + "learning_rate": 0.0001789377976620003, + "loss": 0.1279, + "step": 7302 + }, + { + "epoch": 0.5269309859663047, + "grad_norm": 0.11120638996362686, + "learning_rate": 0.00017893491124260354, + "loss": 0.1288, + "step": 7303 + }, + { + "epoch": 0.5270031386413652, + "grad_norm": 0.11464061588048935, + "learning_rate": 0.0001789320248232068, + "loss": 0.1699, + "step": 7304 + }, + { + "epoch": 0.5270752913164256, + "grad_norm": 0.12002135068178177, + "learning_rate": 0.0001789291384038101, + "loss": 0.1712, + "step": 7305 + }, + { + "epoch": 0.5271474439914859, + "grad_norm": 0.1227092370390892, + "learning_rate": 0.00017892625198441336, + "loss": 0.2047, + "step": 7306 + }, + { + "epoch": 0.5272195966665464, + "grad_norm": 0.11020387709140778, + "learning_rate": 0.00017892336556501662, + "loss": 0.1425, + "step": 7307 + }, + { + "epoch": 0.5272917493416068, + "grad_norm": 0.0997639000415802, + "learning_rate": 0.00017892047914561986, + "loss": 0.1449, + "step": 7308 + }, + { + "epoch": 0.5273639020166673, + "grad_norm": 0.1353057473897934, + "learning_rate": 0.00017891759272622312, + "loss": 0.167, + "step": 7309 + }, + { + "epoch": 0.5274360546917277, + "grad_norm": 0.14489580690860748, + "learning_rate": 0.00017891470630682638, + "loss": 0.1542, + "step": 7310 + }, + { + "epoch": 0.5275082073667882, + "grad_norm": 0.12727104127407074, + "learning_rate": 0.00017891181988742964, + "loss": 0.1562, + "step": 7311 + }, + { + "epoch": 0.5275803600418486, + "grad_norm": 0.11234887689352036, + "learning_rate": 0.00017890893346803293, + "loss": 0.14, + "step": 7312 + }, + { + "epoch": 0.5276525127169089, + "grad_norm": 0.11045292019844055, + "learning_rate": 0.00017890604704863617, + "loss": 0.1741, + "step": 7313 + }, + { + "epoch": 0.5277246653919694, + "grad_norm": 0.1492622047662735, + "learning_rate": 0.00017890316062923943, + "loss": 0.147, + "step": 7314 + }, + { + "epoch": 0.5277968180670298, + "grad_norm": 0.1515076458454132, + "learning_rate": 0.0001789002742098427, + "loss": 0.2006, + "step": 7315 + }, + { + "epoch": 0.5278689707420903, + "grad_norm": 0.1236778125166893, + "learning_rate": 0.00017889738779044596, + "loss": 0.1518, + "step": 7316 + }, + { + "epoch": 0.5279411234171507, + "grad_norm": 0.15923312306404114, + "learning_rate": 0.00017889450137104922, + "loss": 0.1619, + "step": 7317 + }, + { + "epoch": 0.5280132760922112, + "grad_norm": 0.10383324325084686, + "learning_rate": 0.00017889161495165249, + "loss": 0.1007, + "step": 7318 + }, + { + "epoch": 0.5280854287672715, + "grad_norm": 0.11270064115524292, + "learning_rate": 0.00017888872853225575, + "loss": 0.194, + "step": 7319 + }, + { + "epoch": 0.5281575814423319, + "grad_norm": 0.12055972963571548, + "learning_rate": 0.000178885842112859, + "loss": 0.2315, + "step": 7320 + }, + { + "epoch": 0.5282297341173924, + "grad_norm": 0.11434181034564972, + "learning_rate": 0.00017888295569346227, + "loss": 0.1195, + "step": 7321 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.13705787062644958, + "learning_rate": 0.00017888006927406554, + "loss": 0.1512, + "step": 7322 + }, + { + "epoch": 0.5283740394675133, + "grad_norm": 0.09598613530397415, + "learning_rate": 0.0001788771828546688, + "loss": 0.1748, + "step": 7323 + }, + { + "epoch": 0.5284461921425737, + "grad_norm": 0.11793044209480286, + "learning_rate": 0.00017887429643527204, + "loss": 0.1213, + "step": 7324 + }, + { + "epoch": 0.5285183448176342, + "grad_norm": 0.15571290254592896, + "learning_rate": 0.0001788714100158753, + "loss": 0.1339, + "step": 7325 + }, + { + "epoch": 0.5285904974926945, + "grad_norm": 0.11369923502206802, + "learning_rate": 0.0001788685235964786, + "loss": 0.1997, + "step": 7326 + }, + { + "epoch": 0.5286626501677549, + "grad_norm": 0.0989232212305069, + "learning_rate": 0.00017886563717708185, + "loss": 0.2261, + "step": 7327 + }, + { + "epoch": 0.5287348028428154, + "grad_norm": 0.1456446349620819, + "learning_rate": 0.00017886275075768511, + "loss": 0.1508, + "step": 7328 + }, + { + "epoch": 0.5288069555178758, + "grad_norm": 0.13999877870082855, + "learning_rate": 0.00017885986433828835, + "loss": 0.1501, + "step": 7329 + }, + { + "epoch": 0.5288791081929363, + "grad_norm": 0.10752904415130615, + "learning_rate": 0.0001788569779188916, + "loss": 0.1255, + "step": 7330 + }, + { + "epoch": 0.5289512608679967, + "grad_norm": 0.11771759390830994, + "learning_rate": 0.00017885409149949488, + "loss": 0.1705, + "step": 7331 + }, + { + "epoch": 0.5290234135430572, + "grad_norm": 0.1180545911192894, + "learning_rate": 0.00017885120508009814, + "loss": 0.1774, + "step": 7332 + }, + { + "epoch": 0.5290955662181175, + "grad_norm": 0.14889933168888092, + "learning_rate": 0.00017884831866070143, + "loss": 0.1587, + "step": 7333 + }, + { + "epoch": 0.5291677188931779, + "grad_norm": 0.14299236238002777, + "learning_rate": 0.00017884543224130466, + "loss": 0.1442, + "step": 7334 + }, + { + "epoch": 0.5292398715682384, + "grad_norm": 0.1142808049917221, + "learning_rate": 0.00017884254582190793, + "loss": 0.1215, + "step": 7335 + }, + { + "epoch": 0.5293120242432988, + "grad_norm": 0.12313350290060043, + "learning_rate": 0.0001788396594025112, + "loss": 0.1455, + "step": 7336 + }, + { + "epoch": 0.5293841769183593, + "grad_norm": 0.14371417462825775, + "learning_rate": 0.00017883677298311445, + "loss": 0.1279, + "step": 7337 + }, + { + "epoch": 0.5294563295934197, + "grad_norm": 0.10964024811983109, + "learning_rate": 0.00017883388656371772, + "loss": 0.1436, + "step": 7338 + }, + { + "epoch": 0.5295284822684801, + "grad_norm": 0.1285284459590912, + "learning_rate": 0.00017883100014432098, + "loss": 0.1519, + "step": 7339 + }, + { + "epoch": 0.5296006349435405, + "grad_norm": 0.1490326076745987, + "learning_rate": 0.00017882811372492424, + "loss": 0.1315, + "step": 7340 + }, + { + "epoch": 0.5296727876186009, + "grad_norm": 0.13214173913002014, + "learning_rate": 0.0001788252273055275, + "loss": 0.156, + "step": 7341 + }, + { + "epoch": 0.5297449402936614, + "grad_norm": 0.1099911779165268, + "learning_rate": 0.00017882234088613077, + "loss": 0.1452, + "step": 7342 + }, + { + "epoch": 0.5298170929687218, + "grad_norm": 0.14715947210788727, + "learning_rate": 0.00017881945446673403, + "loss": 0.1482, + "step": 7343 + }, + { + "epoch": 0.5298892456437823, + "grad_norm": 0.11125681549310684, + "learning_rate": 0.0001788165680473373, + "loss": 0.1225, + "step": 7344 + }, + { + "epoch": 0.5299613983188427, + "grad_norm": 0.11176412552595139, + "learning_rate": 0.00017881368162794056, + "loss": 0.1581, + "step": 7345 + }, + { + "epoch": 0.5300335509939031, + "grad_norm": 0.13218072056770325, + "learning_rate": 0.0001788107952085438, + "loss": 0.1549, + "step": 7346 + }, + { + "epoch": 0.5301057036689635, + "grad_norm": 0.13086387515068054, + "learning_rate": 0.00017880790878914706, + "loss": 0.1645, + "step": 7347 + }, + { + "epoch": 0.5301778563440239, + "grad_norm": 0.1359982043504715, + "learning_rate": 0.00017880502236975035, + "loss": 0.1634, + "step": 7348 + }, + { + "epoch": 0.5302500090190844, + "grad_norm": 0.0855100080370903, + "learning_rate": 0.0001788021359503536, + "loss": 0.1359, + "step": 7349 + }, + { + "epoch": 0.5303221616941448, + "grad_norm": 0.154966801404953, + "learning_rate": 0.00017879924953095687, + "loss": 0.2241, + "step": 7350 + }, + { + "epoch": 0.5303943143692053, + "grad_norm": 0.1637096107006073, + "learning_rate": 0.0001787963631115601, + "loss": 0.1386, + "step": 7351 + }, + { + "epoch": 0.5304664670442657, + "grad_norm": 0.14011329412460327, + "learning_rate": 0.00017879347669216337, + "loss": 0.1895, + "step": 7352 + }, + { + "epoch": 0.5305386197193261, + "grad_norm": 0.13018305599689484, + "learning_rate": 0.00017879059027276663, + "loss": 0.1221, + "step": 7353 + }, + { + "epoch": 0.5306107723943865, + "grad_norm": 0.13092190027236938, + "learning_rate": 0.0001787877038533699, + "loss": 0.1585, + "step": 7354 + }, + { + "epoch": 0.5306829250694469, + "grad_norm": 0.10489319264888763, + "learning_rate": 0.00017878481743397319, + "loss": 0.1557, + "step": 7355 + }, + { + "epoch": 0.5307550777445074, + "grad_norm": 0.14457325637340546, + "learning_rate": 0.00017878193101457642, + "loss": 0.1921, + "step": 7356 + }, + { + "epoch": 0.5308272304195678, + "grad_norm": 0.12561573088169098, + "learning_rate": 0.00017877904459517969, + "loss": 0.1537, + "step": 7357 + }, + { + "epoch": 0.5308993830946283, + "grad_norm": 0.11675235629081726, + "learning_rate": 0.00017877615817578295, + "loss": 0.1242, + "step": 7358 + }, + { + "epoch": 0.5309715357696887, + "grad_norm": 0.12591566145420074, + "learning_rate": 0.0001787732717563862, + "loss": 0.1519, + "step": 7359 + }, + { + "epoch": 0.5310436884447491, + "grad_norm": 0.10922004282474518, + "learning_rate": 0.00017877038533698947, + "loss": 0.141, + "step": 7360 + }, + { + "epoch": 0.5311158411198095, + "grad_norm": 0.12277299910783768, + "learning_rate": 0.00017876749891759274, + "loss": 0.1774, + "step": 7361 + }, + { + "epoch": 0.5311879937948699, + "grad_norm": 0.14493514597415924, + "learning_rate": 0.000178764612498196, + "loss": 0.1453, + "step": 7362 + }, + { + "epoch": 0.5312601464699304, + "grad_norm": 0.10822132229804993, + "learning_rate": 0.00017876172607879926, + "loss": 0.1953, + "step": 7363 + }, + { + "epoch": 0.5313322991449908, + "grad_norm": 0.12490901350975037, + "learning_rate": 0.00017875883965940253, + "loss": 0.1105, + "step": 7364 + }, + { + "epoch": 0.5314044518200512, + "grad_norm": 0.11226848512887955, + "learning_rate": 0.0001787559532400058, + "loss": 0.1592, + "step": 7365 + }, + { + "epoch": 0.5314766044951117, + "grad_norm": 0.13065274059772491, + "learning_rate": 0.00017875306682060905, + "loss": 0.1858, + "step": 7366 + }, + { + "epoch": 0.5315487571701721, + "grad_norm": 0.14060640335083008, + "learning_rate": 0.0001787501804012123, + "loss": 0.1615, + "step": 7367 + }, + { + "epoch": 0.5316209098452325, + "grad_norm": 0.12865915894508362, + "learning_rate": 0.00017874729398181555, + "loss": 0.139, + "step": 7368 + }, + { + "epoch": 0.5316930625202929, + "grad_norm": 0.12481102347373962, + "learning_rate": 0.00017874440756241884, + "loss": 0.1293, + "step": 7369 + }, + { + "epoch": 0.5317652151953534, + "grad_norm": 0.11680778861045837, + "learning_rate": 0.0001787415211430221, + "loss": 0.1598, + "step": 7370 + }, + { + "epoch": 0.5318373678704138, + "grad_norm": 0.09632851928472519, + "learning_rate": 0.00017873863472362537, + "loss": 0.1169, + "step": 7371 + }, + { + "epoch": 0.5319095205454742, + "grad_norm": 0.13258910179138184, + "learning_rate": 0.0001787357483042286, + "loss": 0.1608, + "step": 7372 + }, + { + "epoch": 0.5319816732205347, + "grad_norm": 0.125102698802948, + "learning_rate": 0.00017873286188483186, + "loss": 0.1477, + "step": 7373 + }, + { + "epoch": 0.5320538258955951, + "grad_norm": 0.10045205801725388, + "learning_rate": 0.00017872997546543513, + "loss": 0.1474, + "step": 7374 + }, + { + "epoch": 0.5321259785706555, + "grad_norm": 0.11710695177316666, + "learning_rate": 0.0001787270890460384, + "loss": 0.1835, + "step": 7375 + }, + { + "epoch": 0.5321981312457159, + "grad_norm": 0.11347135901451111, + "learning_rate": 0.00017872420262664168, + "loss": 0.1541, + "step": 7376 + }, + { + "epoch": 0.5322702839207764, + "grad_norm": 0.09853716194629669, + "learning_rate": 0.00017872131620724492, + "loss": 0.1138, + "step": 7377 + }, + { + "epoch": 0.5323424365958368, + "grad_norm": 0.11942870169878006, + "learning_rate": 0.00017871842978784818, + "loss": 0.1378, + "step": 7378 + }, + { + "epoch": 0.5324145892708972, + "grad_norm": 0.08775313943624496, + "learning_rate": 0.00017871554336845144, + "loss": 0.1425, + "step": 7379 + }, + { + "epoch": 0.5324867419459577, + "grad_norm": 0.13760487735271454, + "learning_rate": 0.0001787126569490547, + "loss": 0.1515, + "step": 7380 + }, + { + "epoch": 0.532558894621018, + "grad_norm": 0.12147516757249832, + "learning_rate": 0.00017870977052965797, + "loss": 0.1927, + "step": 7381 + }, + { + "epoch": 0.5326310472960785, + "grad_norm": 0.11707384139299393, + "learning_rate": 0.00017870688411026123, + "loss": 0.1447, + "step": 7382 + }, + { + "epoch": 0.5327031999711389, + "grad_norm": 0.14905457198619843, + "learning_rate": 0.0001787039976908645, + "loss": 0.1562, + "step": 7383 + }, + { + "epoch": 0.5327753526461994, + "grad_norm": 0.11915255337953568, + "learning_rate": 0.00017870111127146776, + "loss": 0.1564, + "step": 7384 + }, + { + "epoch": 0.5328475053212598, + "grad_norm": 0.12020910531282425, + "learning_rate": 0.00017869822485207102, + "loss": 0.1396, + "step": 7385 + }, + { + "epoch": 0.5329196579963202, + "grad_norm": 0.14836715161800385, + "learning_rate": 0.00017869533843267428, + "loss": 0.1485, + "step": 7386 + }, + { + "epoch": 0.5329918106713807, + "grad_norm": 0.11415567994117737, + "learning_rate": 0.00017869245201327755, + "loss": 0.1299, + "step": 7387 + }, + { + "epoch": 0.533063963346441, + "grad_norm": 0.09554366767406464, + "learning_rate": 0.00017868956559388078, + "loss": 0.1387, + "step": 7388 + }, + { + "epoch": 0.5331361160215015, + "grad_norm": 0.12184850126504898, + "learning_rate": 0.00017868667917448404, + "loss": 0.1311, + "step": 7389 + }, + { + "epoch": 0.5332082686965619, + "grad_norm": 0.10899917781352997, + "learning_rate": 0.00017868379275508733, + "loss": 0.1887, + "step": 7390 + }, + { + "epoch": 0.5332804213716223, + "grad_norm": 0.12960898876190186, + "learning_rate": 0.0001786809063356906, + "loss": 0.1725, + "step": 7391 + }, + { + "epoch": 0.5333525740466828, + "grad_norm": 0.11854162812232971, + "learning_rate": 0.00017867801991629386, + "loss": 0.1437, + "step": 7392 + }, + { + "epoch": 0.5334247267217432, + "grad_norm": 0.15556089580059052, + "learning_rate": 0.0001786751334968971, + "loss": 0.1601, + "step": 7393 + }, + { + "epoch": 0.5334968793968037, + "grad_norm": 0.13274702429771423, + "learning_rate": 0.00017867224707750036, + "loss": 0.1342, + "step": 7394 + }, + { + "epoch": 0.533569032071864, + "grad_norm": 0.1248529702425003, + "learning_rate": 0.00017866936065810362, + "loss": 0.1499, + "step": 7395 + }, + { + "epoch": 0.5336411847469245, + "grad_norm": 0.12986032664775848, + "learning_rate": 0.00017866647423870688, + "loss": 0.1588, + "step": 7396 + }, + { + "epoch": 0.5337133374219849, + "grad_norm": 0.10592981427907944, + "learning_rate": 0.00017866358781931017, + "loss": 0.1684, + "step": 7397 + }, + { + "epoch": 0.5337854900970453, + "grad_norm": 0.12006526440382004, + "learning_rate": 0.0001786607013999134, + "loss": 0.1408, + "step": 7398 + }, + { + "epoch": 0.5338576427721058, + "grad_norm": 0.13237181305885315, + "learning_rate": 0.00017865781498051667, + "loss": 0.1512, + "step": 7399 + }, + { + "epoch": 0.5339297954471662, + "grad_norm": 0.12144875526428223, + "learning_rate": 0.00017865492856111994, + "loss": 0.1505, + "step": 7400 + }, + { + "epoch": 0.5340019481222267, + "grad_norm": 0.14751940965652466, + "learning_rate": 0.0001786520421417232, + "loss": 0.1466, + "step": 7401 + }, + { + "epoch": 0.534074100797287, + "grad_norm": 0.10068172216415405, + "learning_rate": 0.00017864915572232646, + "loss": 0.0937, + "step": 7402 + }, + { + "epoch": 0.5341462534723475, + "grad_norm": 0.13573504984378815, + "learning_rate": 0.00017864626930292973, + "loss": 0.1119, + "step": 7403 + }, + { + "epoch": 0.5342184061474079, + "grad_norm": 0.1610974818468094, + "learning_rate": 0.000178643382883533, + "loss": 0.1449, + "step": 7404 + }, + { + "epoch": 0.5342905588224683, + "grad_norm": 0.12172017246484756, + "learning_rate": 0.00017864049646413625, + "loss": 0.1585, + "step": 7405 + }, + { + "epoch": 0.5343627114975288, + "grad_norm": 0.11596543341875076, + "learning_rate": 0.00017863761004473951, + "loss": 0.1571, + "step": 7406 + }, + { + "epoch": 0.5344348641725892, + "grad_norm": 0.12293829023838043, + "learning_rate": 0.00017863472362534278, + "loss": 0.1388, + "step": 7407 + }, + { + "epoch": 0.5345070168476497, + "grad_norm": 0.12956799566745758, + "learning_rate": 0.00017863183720594604, + "loss": 0.1046, + "step": 7408 + }, + { + "epoch": 0.53457916952271, + "grad_norm": 0.13919049501419067, + "learning_rate": 0.00017862895078654928, + "loss": 0.1617, + "step": 7409 + }, + { + "epoch": 0.5346513221977705, + "grad_norm": 0.12343361228704453, + "learning_rate": 0.00017862606436715254, + "loss": 0.1887, + "step": 7410 + }, + { + "epoch": 0.5347234748728309, + "grad_norm": 0.1412951499223709, + "learning_rate": 0.00017862317794775583, + "loss": 0.1573, + "step": 7411 + }, + { + "epoch": 0.5347956275478913, + "grad_norm": 0.13477957248687744, + "learning_rate": 0.0001786202915283591, + "loss": 0.1599, + "step": 7412 + }, + { + "epoch": 0.5348677802229518, + "grad_norm": 0.14682935178279877, + "learning_rate": 0.00017861740510896235, + "loss": 0.1528, + "step": 7413 + }, + { + "epoch": 0.5349399328980122, + "grad_norm": 0.14483290910720825, + "learning_rate": 0.0001786145186895656, + "loss": 0.1552, + "step": 7414 + }, + { + "epoch": 0.5350120855730727, + "grad_norm": 0.12272778898477554, + "learning_rate": 0.00017861163227016885, + "loss": 0.1441, + "step": 7415 + }, + { + "epoch": 0.535084238248133, + "grad_norm": 0.09411150962114334, + "learning_rate": 0.00017860874585077212, + "loss": 0.1685, + "step": 7416 + }, + { + "epoch": 0.5351563909231934, + "grad_norm": 0.10887883603572845, + "learning_rate": 0.00017860585943137538, + "loss": 0.1683, + "step": 7417 + }, + { + "epoch": 0.5352285435982539, + "grad_norm": 0.1251358836889267, + "learning_rate": 0.00017860297301197867, + "loss": 0.1843, + "step": 7418 + }, + { + "epoch": 0.5353006962733143, + "grad_norm": 0.11616582423448563, + "learning_rate": 0.0001786000865925819, + "loss": 0.1655, + "step": 7419 + }, + { + "epoch": 0.5353728489483748, + "grad_norm": 0.10984798520803452, + "learning_rate": 0.00017859720017318517, + "loss": 0.1494, + "step": 7420 + }, + { + "epoch": 0.5354450016234352, + "grad_norm": 0.12896649539470673, + "learning_rate": 0.00017859431375378843, + "loss": 0.1559, + "step": 7421 + }, + { + "epoch": 0.5355171542984957, + "grad_norm": 0.1121734008193016, + "learning_rate": 0.0001785914273343917, + "loss": 0.1404, + "step": 7422 + }, + { + "epoch": 0.535589306973556, + "grad_norm": 0.12782329320907593, + "learning_rate": 0.00017858854091499496, + "loss": 0.1403, + "step": 7423 + }, + { + "epoch": 0.5356614596486164, + "grad_norm": 0.1383926272392273, + "learning_rate": 0.00017858565449559822, + "loss": 0.1697, + "step": 7424 + }, + { + "epoch": 0.5357336123236769, + "grad_norm": 0.12111084908246994, + "learning_rate": 0.00017858276807620148, + "loss": 0.1076, + "step": 7425 + }, + { + "epoch": 0.5358057649987373, + "grad_norm": 0.1440785825252533, + "learning_rate": 0.00017857988165680475, + "loss": 0.1437, + "step": 7426 + }, + { + "epoch": 0.5358779176737978, + "grad_norm": 0.10769060254096985, + "learning_rate": 0.000178576995237408, + "loss": 0.1804, + "step": 7427 + }, + { + "epoch": 0.5359500703488582, + "grad_norm": 0.13949033617973328, + "learning_rate": 0.00017857410881801127, + "loss": 0.1988, + "step": 7428 + }, + { + "epoch": 0.5360222230239187, + "grad_norm": 0.1220717653632164, + "learning_rate": 0.00017857122239861453, + "loss": 0.1741, + "step": 7429 + }, + { + "epoch": 0.536094375698979, + "grad_norm": 0.1301799863576889, + "learning_rate": 0.00017856833597921777, + "loss": 0.1792, + "step": 7430 + }, + { + "epoch": 0.5361665283740394, + "grad_norm": 0.1295902132987976, + "learning_rate": 0.00017856544955982103, + "loss": 0.0954, + "step": 7431 + }, + { + "epoch": 0.5362386810490999, + "grad_norm": 0.12458403408527374, + "learning_rate": 0.00017856256314042432, + "loss": 0.1745, + "step": 7432 + }, + { + "epoch": 0.5363108337241603, + "grad_norm": 0.14518317580223083, + "learning_rate": 0.00017855967672102759, + "loss": 0.2067, + "step": 7433 + }, + { + "epoch": 0.5363829863992208, + "grad_norm": 0.11274714767932892, + "learning_rate": 0.00017855679030163085, + "loss": 0.1549, + "step": 7434 + }, + { + "epoch": 0.5364551390742812, + "grad_norm": 0.1241229847073555, + "learning_rate": 0.00017855390388223408, + "loss": 0.1385, + "step": 7435 + }, + { + "epoch": 0.5365272917493417, + "grad_norm": 0.09956695884466171, + "learning_rate": 0.00017855101746283735, + "loss": 0.1277, + "step": 7436 + }, + { + "epoch": 0.536599444424402, + "grad_norm": 0.1887374222278595, + "learning_rate": 0.0001785481310434406, + "loss": 0.1822, + "step": 7437 + }, + { + "epoch": 0.5366715970994624, + "grad_norm": 0.09727919101715088, + "learning_rate": 0.00017854524462404387, + "loss": 0.1651, + "step": 7438 + }, + { + "epoch": 0.5367437497745229, + "grad_norm": 0.13736706972122192, + "learning_rate": 0.00017854235820464716, + "loss": 0.1952, + "step": 7439 + }, + { + "epoch": 0.5368159024495833, + "grad_norm": 0.1056545078754425, + "learning_rate": 0.0001785394717852504, + "loss": 0.1501, + "step": 7440 + }, + { + "epoch": 0.5368880551246438, + "grad_norm": 0.12971024215221405, + "learning_rate": 0.00017853658536585366, + "loss": 0.1367, + "step": 7441 + }, + { + "epoch": 0.5369602077997042, + "grad_norm": 0.13022993505001068, + "learning_rate": 0.00017853369894645692, + "loss": 0.1455, + "step": 7442 + }, + { + "epoch": 0.5370323604747645, + "grad_norm": 0.09962315857410431, + "learning_rate": 0.0001785308125270602, + "loss": 0.1498, + "step": 7443 + }, + { + "epoch": 0.537104513149825, + "grad_norm": 0.13888941705226898, + "learning_rate": 0.00017852792610766345, + "loss": 0.2137, + "step": 7444 + }, + { + "epoch": 0.5371766658248854, + "grad_norm": 0.12977571785449982, + "learning_rate": 0.00017852503968826671, + "loss": 0.1779, + "step": 7445 + }, + { + "epoch": 0.5372488184999459, + "grad_norm": 0.14706486463546753, + "learning_rate": 0.00017852215326886998, + "loss": 0.1396, + "step": 7446 + }, + { + "epoch": 0.5373209711750063, + "grad_norm": 0.11621315777301788, + "learning_rate": 0.00017851926684947324, + "loss": 0.1595, + "step": 7447 + }, + { + "epoch": 0.5373931238500668, + "grad_norm": 0.1178140938282013, + "learning_rate": 0.0001785163804300765, + "loss": 0.1704, + "step": 7448 + }, + { + "epoch": 0.5374652765251272, + "grad_norm": 0.12601010501384735, + "learning_rate": 0.00017851349401067977, + "loss": 0.1725, + "step": 7449 + }, + { + "epoch": 0.5375374292001875, + "grad_norm": 0.12485157698392868, + "learning_rate": 0.00017851060759128303, + "loss": 0.1638, + "step": 7450 + }, + { + "epoch": 0.537609581875248, + "grad_norm": 0.14232799410820007, + "learning_rate": 0.00017850772117188626, + "loss": 0.1912, + "step": 7451 + }, + { + "epoch": 0.5376817345503084, + "grad_norm": 0.12679298222064972, + "learning_rate": 0.00017850483475248953, + "loss": 0.1562, + "step": 7452 + }, + { + "epoch": 0.5377538872253689, + "grad_norm": 0.11403336375951767, + "learning_rate": 0.00017850194833309282, + "loss": 0.1204, + "step": 7453 + }, + { + "epoch": 0.5378260399004293, + "grad_norm": 0.11030253022909164, + "learning_rate": 0.00017849906191369608, + "loss": 0.1351, + "step": 7454 + }, + { + "epoch": 0.5378981925754898, + "grad_norm": 0.11188559234142303, + "learning_rate": 0.00017849617549429934, + "loss": 0.1371, + "step": 7455 + }, + { + "epoch": 0.5379703452505502, + "grad_norm": 0.14153379201889038, + "learning_rate": 0.00017849328907490258, + "loss": 0.1632, + "step": 7456 + }, + { + "epoch": 0.5380424979256105, + "grad_norm": 0.12268875539302826, + "learning_rate": 0.00017849040265550584, + "loss": 0.1593, + "step": 7457 + }, + { + "epoch": 0.538114650600671, + "grad_norm": 0.10495118796825409, + "learning_rate": 0.0001784875162361091, + "loss": 0.1816, + "step": 7458 + }, + { + "epoch": 0.5381868032757314, + "grad_norm": 0.15740640461444855, + "learning_rate": 0.00017848462981671237, + "loss": 0.1475, + "step": 7459 + }, + { + "epoch": 0.5382589559507919, + "grad_norm": 0.13259257376194, + "learning_rate": 0.00017848174339731566, + "loss": 0.1376, + "step": 7460 + }, + { + "epoch": 0.5383311086258523, + "grad_norm": 0.1220489889383316, + "learning_rate": 0.0001784788569779189, + "loss": 0.1532, + "step": 7461 + }, + { + "epoch": 0.5384032613009128, + "grad_norm": 0.150324746966362, + "learning_rate": 0.00017847597055852216, + "loss": 0.1654, + "step": 7462 + }, + { + "epoch": 0.5384754139759732, + "grad_norm": 0.12435202300548553, + "learning_rate": 0.00017847308413912542, + "loss": 0.1377, + "step": 7463 + }, + { + "epoch": 0.5385475666510335, + "grad_norm": 0.12755684554576874, + "learning_rate": 0.00017847019771972868, + "loss": 0.1627, + "step": 7464 + }, + { + "epoch": 0.538619719326094, + "grad_norm": 0.11508068442344666, + "learning_rate": 0.00017846731130033195, + "loss": 0.1258, + "step": 7465 + }, + { + "epoch": 0.5386918720011544, + "grad_norm": 0.1059078648686409, + "learning_rate": 0.0001784644248809352, + "loss": 0.1732, + "step": 7466 + }, + { + "epoch": 0.5387640246762149, + "grad_norm": 0.12024451792240143, + "learning_rate": 0.00017846153846153847, + "loss": 0.1186, + "step": 7467 + }, + { + "epoch": 0.5388361773512753, + "grad_norm": 0.12258029729127884, + "learning_rate": 0.00017845865204214173, + "loss": 0.1833, + "step": 7468 + }, + { + "epoch": 0.5389083300263358, + "grad_norm": 0.12107551097869873, + "learning_rate": 0.000178455765622745, + "loss": 0.0989, + "step": 7469 + }, + { + "epoch": 0.5389804827013962, + "grad_norm": 0.11932362616062164, + "learning_rate": 0.00017845287920334826, + "loss": 0.1489, + "step": 7470 + }, + { + "epoch": 0.5390526353764565, + "grad_norm": 0.11584921926259995, + "learning_rate": 0.00017844999278395152, + "loss": 0.1332, + "step": 7471 + }, + { + "epoch": 0.539124788051517, + "grad_norm": 0.1187252476811409, + "learning_rate": 0.00017844710636455479, + "loss": 0.1562, + "step": 7472 + }, + { + "epoch": 0.5391969407265774, + "grad_norm": 0.12253367155790329, + "learning_rate": 0.00017844421994515802, + "loss": 0.1184, + "step": 7473 + }, + { + "epoch": 0.5392690934016379, + "grad_norm": 0.11383675038814545, + "learning_rate": 0.0001784413335257613, + "loss": 0.1086, + "step": 7474 + }, + { + "epoch": 0.5393412460766983, + "grad_norm": 0.12146003544330597, + "learning_rate": 0.00017843844710636457, + "loss": 0.1607, + "step": 7475 + }, + { + "epoch": 0.5394133987517588, + "grad_norm": 0.14561280608177185, + "learning_rate": 0.00017843556068696784, + "loss": 0.1257, + "step": 7476 + }, + { + "epoch": 0.5394855514268192, + "grad_norm": 0.15061399340629578, + "learning_rate": 0.0001784326742675711, + "loss": 0.1643, + "step": 7477 + }, + { + "epoch": 0.5395577041018795, + "grad_norm": 0.13116496801376343, + "learning_rate": 0.00017842978784817434, + "loss": 0.1475, + "step": 7478 + }, + { + "epoch": 0.53962985677694, + "grad_norm": 0.1168837696313858, + "learning_rate": 0.0001784269014287776, + "loss": 0.1522, + "step": 7479 + }, + { + "epoch": 0.5397020094520004, + "grad_norm": 0.11416567862033844, + "learning_rate": 0.00017842401500938086, + "loss": 0.1505, + "step": 7480 + }, + { + "epoch": 0.5397741621270609, + "grad_norm": 0.13860619068145752, + "learning_rate": 0.00017842112858998415, + "loss": 0.145, + "step": 7481 + }, + { + "epoch": 0.5398463148021213, + "grad_norm": 0.10703293234109879, + "learning_rate": 0.00017841824217058741, + "loss": 0.1425, + "step": 7482 + }, + { + "epoch": 0.5399184674771818, + "grad_norm": 0.13385754823684692, + "learning_rate": 0.00017841535575119065, + "loss": 0.1485, + "step": 7483 + }, + { + "epoch": 0.5399906201522422, + "grad_norm": 0.11074848473072052, + "learning_rate": 0.0001784124693317939, + "loss": 0.1244, + "step": 7484 + }, + { + "epoch": 0.5400627728273025, + "grad_norm": 0.13621360063552856, + "learning_rate": 0.00017840958291239718, + "loss": 0.1685, + "step": 7485 + }, + { + "epoch": 0.540134925502363, + "grad_norm": 0.15088190138339996, + "learning_rate": 0.00017840669649300044, + "loss": 0.2403, + "step": 7486 + }, + { + "epoch": 0.5402070781774234, + "grad_norm": 0.1203557550907135, + "learning_rate": 0.0001784038100736037, + "loss": 0.1638, + "step": 7487 + }, + { + "epoch": 0.5402792308524839, + "grad_norm": 0.12648047506809235, + "learning_rate": 0.00017840092365420697, + "loss": 0.1408, + "step": 7488 + }, + { + "epoch": 0.5403513835275443, + "grad_norm": 0.12805725634098053, + "learning_rate": 0.00017839803723481023, + "loss": 0.1709, + "step": 7489 + }, + { + "epoch": 0.5404235362026047, + "grad_norm": 0.12517820298671722, + "learning_rate": 0.0001783951508154135, + "loss": 0.1697, + "step": 7490 + }, + { + "epoch": 0.5404956888776652, + "grad_norm": 0.16566172242164612, + "learning_rate": 0.00017839226439601675, + "loss": 0.1823, + "step": 7491 + }, + { + "epoch": 0.5405678415527255, + "grad_norm": 0.12278152257204056, + "learning_rate": 0.00017838937797662002, + "loss": 0.1122, + "step": 7492 + }, + { + "epoch": 0.540639994227786, + "grad_norm": 0.12212082743644714, + "learning_rate": 0.00017838649155722328, + "loss": 0.133, + "step": 7493 + }, + { + "epoch": 0.5407121469028464, + "grad_norm": 0.13311481475830078, + "learning_rate": 0.00017838360513782652, + "loss": 0.1068, + "step": 7494 + }, + { + "epoch": 0.5407842995779069, + "grad_norm": 0.10868926346302032, + "learning_rate": 0.0001783807187184298, + "loss": 0.1112, + "step": 7495 + }, + { + "epoch": 0.5408564522529673, + "grad_norm": 0.11439421772956848, + "learning_rate": 0.00017837783229903307, + "loss": 0.1431, + "step": 7496 + }, + { + "epoch": 0.5409286049280277, + "grad_norm": 0.1813734471797943, + "learning_rate": 0.00017837494587963633, + "loss": 0.1313, + "step": 7497 + }, + { + "epoch": 0.5410007576030882, + "grad_norm": 0.12070521712303162, + "learning_rate": 0.0001783720594602396, + "loss": 0.1214, + "step": 7498 + }, + { + "epoch": 0.5410729102781485, + "grad_norm": 0.12103161960840225, + "learning_rate": 0.00017836917304084283, + "loss": 0.1357, + "step": 7499 + }, + { + "epoch": 0.541145062953209, + "grad_norm": 0.11779063194990158, + "learning_rate": 0.0001783662866214461, + "loss": 0.1659, + "step": 7500 + }, + { + "epoch": 0.5412172156282694, + "grad_norm": 0.10207480937242508, + "learning_rate": 0.00017836340020204936, + "loss": 0.1414, + "step": 7501 + }, + { + "epoch": 0.5412893683033299, + "grad_norm": 0.15236796438694, + "learning_rate": 0.00017836051378265265, + "loss": 0.1169, + "step": 7502 + }, + { + "epoch": 0.5413615209783903, + "grad_norm": 0.12786640226840973, + "learning_rate": 0.0001783576273632559, + "loss": 0.1719, + "step": 7503 + }, + { + "epoch": 0.5414336736534507, + "grad_norm": 0.12417449802160263, + "learning_rate": 0.00017835474094385914, + "loss": 0.1625, + "step": 7504 + }, + { + "epoch": 0.5415058263285111, + "grad_norm": 0.11570677161216736, + "learning_rate": 0.0001783518545244624, + "loss": 0.1395, + "step": 7505 + }, + { + "epoch": 0.5415779790035715, + "grad_norm": 0.10178045928478241, + "learning_rate": 0.00017834896810506567, + "loss": 0.1621, + "step": 7506 + }, + { + "epoch": 0.541650131678632, + "grad_norm": 0.13424795866012573, + "learning_rate": 0.00017834608168566893, + "loss": 0.1552, + "step": 7507 + }, + { + "epoch": 0.5417222843536924, + "grad_norm": 0.15743964910507202, + "learning_rate": 0.0001783431952662722, + "loss": 0.1692, + "step": 7508 + }, + { + "epoch": 0.5417944370287529, + "grad_norm": 0.1120472252368927, + "learning_rate": 0.00017834030884687546, + "loss": 0.1583, + "step": 7509 + }, + { + "epoch": 0.5418665897038133, + "grad_norm": 0.13689734041690826, + "learning_rate": 0.00017833742242747872, + "loss": 0.1499, + "step": 7510 + }, + { + "epoch": 0.5419387423788737, + "grad_norm": 0.10957453399896622, + "learning_rate": 0.00017833453600808199, + "loss": 0.143, + "step": 7511 + }, + { + "epoch": 0.5420108950539341, + "grad_norm": 0.1020032986998558, + "learning_rate": 0.00017833164958868525, + "loss": 0.1344, + "step": 7512 + }, + { + "epoch": 0.5420830477289945, + "grad_norm": 0.10065440088510513, + "learning_rate": 0.0001783287631692885, + "loss": 0.1309, + "step": 7513 + }, + { + "epoch": 0.542155200404055, + "grad_norm": 0.10141412913799286, + "learning_rate": 0.00017832587674989177, + "loss": 0.1971, + "step": 7514 + }, + { + "epoch": 0.5422273530791154, + "grad_norm": 0.10940968990325928, + "learning_rate": 0.000178322990330495, + "loss": 0.2016, + "step": 7515 + }, + { + "epoch": 0.5422995057541758, + "grad_norm": 0.12552902102470398, + "learning_rate": 0.0001783201039110983, + "loss": 0.2007, + "step": 7516 + }, + { + "epoch": 0.5423716584292363, + "grad_norm": 0.11455725878477097, + "learning_rate": 0.00017831721749170156, + "loss": 0.1712, + "step": 7517 + }, + { + "epoch": 0.5424438111042967, + "grad_norm": 0.12817665934562683, + "learning_rate": 0.00017831433107230483, + "loss": 0.1344, + "step": 7518 + }, + { + "epoch": 0.5425159637793571, + "grad_norm": 0.11515673249959946, + "learning_rate": 0.0001783114446529081, + "loss": 0.1457, + "step": 7519 + }, + { + "epoch": 0.5425881164544175, + "grad_norm": 0.14225012063980103, + "learning_rate": 0.00017830855823351132, + "loss": 0.2118, + "step": 7520 + }, + { + "epoch": 0.542660269129478, + "grad_norm": 0.13224397599697113, + "learning_rate": 0.0001783056718141146, + "loss": 0.1475, + "step": 7521 + }, + { + "epoch": 0.5427324218045384, + "grad_norm": 0.1179685965180397, + "learning_rate": 0.00017830278539471785, + "loss": 0.167, + "step": 7522 + }, + { + "epoch": 0.5428045744795988, + "grad_norm": 0.15331527590751648, + "learning_rate": 0.00017829989897532114, + "loss": 0.1492, + "step": 7523 + }, + { + "epoch": 0.5428767271546593, + "grad_norm": 0.11871089786291122, + "learning_rate": 0.0001782970125559244, + "loss": 0.1001, + "step": 7524 + }, + { + "epoch": 0.5429488798297197, + "grad_norm": 0.10068800300359726, + "learning_rate": 0.00017829412613652764, + "loss": 0.1689, + "step": 7525 + }, + { + "epoch": 0.5430210325047801, + "grad_norm": 0.15152029693126678, + "learning_rate": 0.0001782912397171309, + "loss": 0.1281, + "step": 7526 + }, + { + "epoch": 0.5430931851798405, + "grad_norm": 0.13519147038459778, + "learning_rate": 0.00017828835329773416, + "loss": 0.1416, + "step": 7527 + }, + { + "epoch": 0.543165337854901, + "grad_norm": 0.13669776916503906, + "learning_rate": 0.00017828546687833743, + "loss": 0.1426, + "step": 7528 + }, + { + "epoch": 0.5432374905299614, + "grad_norm": 0.1263340562582016, + "learning_rate": 0.0001782825804589407, + "loss": 0.1462, + "step": 7529 + }, + { + "epoch": 0.5433096432050218, + "grad_norm": 0.1487351357936859, + "learning_rate": 0.00017827969403954395, + "loss": 0.2028, + "step": 7530 + }, + { + "epoch": 0.5433817958800823, + "grad_norm": 0.13854576647281647, + "learning_rate": 0.00017827680762014722, + "loss": 0.1274, + "step": 7531 + }, + { + "epoch": 0.5434539485551427, + "grad_norm": 0.111342653632164, + "learning_rate": 0.00017827392120075048, + "loss": 0.138, + "step": 7532 + }, + { + "epoch": 0.5435261012302031, + "grad_norm": 0.15238115191459656, + "learning_rate": 0.00017827103478135374, + "loss": 0.1189, + "step": 7533 + }, + { + "epoch": 0.5435982539052635, + "grad_norm": 0.1749115288257599, + "learning_rate": 0.000178268148361957, + "loss": 0.1141, + "step": 7534 + }, + { + "epoch": 0.543670406580324, + "grad_norm": 0.13412636518478394, + "learning_rate": 0.00017826526194256027, + "loss": 0.1245, + "step": 7535 + }, + { + "epoch": 0.5437425592553844, + "grad_norm": 0.13132649660110474, + "learning_rate": 0.0001782623755231635, + "loss": 0.18, + "step": 7536 + }, + { + "epoch": 0.5438147119304448, + "grad_norm": 0.11912304908037186, + "learning_rate": 0.0001782594891037668, + "loss": 0.1517, + "step": 7537 + }, + { + "epoch": 0.5438868646055053, + "grad_norm": 0.16066962480545044, + "learning_rate": 0.00017825660268437006, + "loss": 0.1757, + "step": 7538 + }, + { + "epoch": 0.5439590172805657, + "grad_norm": 0.12418557703495026, + "learning_rate": 0.00017825371626497332, + "loss": 0.0902, + "step": 7539 + }, + { + "epoch": 0.5440311699556261, + "grad_norm": 0.11531732976436615, + "learning_rate": 0.00017825082984557658, + "loss": 0.1782, + "step": 7540 + }, + { + "epoch": 0.5441033226306865, + "grad_norm": 0.11655911058187485, + "learning_rate": 0.00017824794342617982, + "loss": 0.1245, + "step": 7541 + }, + { + "epoch": 0.544175475305747, + "grad_norm": 0.11822719871997833, + "learning_rate": 0.00017824505700678308, + "loss": 0.1253, + "step": 7542 + }, + { + "epoch": 0.5442476279808074, + "grad_norm": 0.13040819764137268, + "learning_rate": 0.00017824217058738634, + "loss": 0.1286, + "step": 7543 + }, + { + "epoch": 0.5443197806558678, + "grad_norm": 0.12679016590118408, + "learning_rate": 0.0001782392841679896, + "loss": 0.1337, + "step": 7544 + }, + { + "epoch": 0.5443919333309283, + "grad_norm": 0.11832067370414734, + "learning_rate": 0.0001782363977485929, + "loss": 0.118, + "step": 7545 + }, + { + "epoch": 0.5444640860059887, + "grad_norm": 0.12346930801868439, + "learning_rate": 0.00017823351132919613, + "loss": 0.1466, + "step": 7546 + }, + { + "epoch": 0.5445362386810491, + "grad_norm": 0.1250762641429901, + "learning_rate": 0.0001782306249097994, + "loss": 0.1432, + "step": 7547 + }, + { + "epoch": 0.5446083913561095, + "grad_norm": 0.11732055991888046, + "learning_rate": 0.00017822773849040266, + "loss": 0.1582, + "step": 7548 + }, + { + "epoch": 0.5446805440311699, + "grad_norm": 0.16200515627861023, + "learning_rate": 0.00017822485207100592, + "loss": 0.1538, + "step": 7549 + }, + { + "epoch": 0.5447526967062304, + "grad_norm": 0.15169191360473633, + "learning_rate": 0.00017822196565160918, + "loss": 0.1744, + "step": 7550 + }, + { + "epoch": 0.5448248493812908, + "grad_norm": 0.1553208976984024, + "learning_rate": 0.00017821907923221245, + "loss": 0.1946, + "step": 7551 + }, + { + "epoch": 0.5448970020563513, + "grad_norm": 0.10830380022525787, + "learning_rate": 0.0001782161928128157, + "loss": 0.1781, + "step": 7552 + }, + { + "epoch": 0.5449691547314117, + "grad_norm": 0.11492560058832169, + "learning_rate": 0.00017821330639341897, + "loss": 0.1468, + "step": 7553 + }, + { + "epoch": 0.545041307406472, + "grad_norm": 0.10390572249889374, + "learning_rate": 0.00017821041997402224, + "loss": 0.1941, + "step": 7554 + }, + { + "epoch": 0.5451134600815325, + "grad_norm": 0.1058596521615982, + "learning_rate": 0.0001782075335546255, + "loss": 0.1417, + "step": 7555 + }, + { + "epoch": 0.5451856127565929, + "grad_norm": 0.11372096836566925, + "learning_rate": 0.00017820464713522876, + "loss": 0.0962, + "step": 7556 + }, + { + "epoch": 0.5452577654316534, + "grad_norm": 0.1294596791267395, + "learning_rate": 0.000178201760715832, + "loss": 0.1438, + "step": 7557 + }, + { + "epoch": 0.5453299181067138, + "grad_norm": 0.11079258471727371, + "learning_rate": 0.00017819887429643526, + "loss": 0.1097, + "step": 7558 + }, + { + "epoch": 0.5454020707817743, + "grad_norm": 0.11069349944591522, + "learning_rate": 0.00017819598787703855, + "loss": 0.1254, + "step": 7559 + }, + { + "epoch": 0.5454742234568347, + "grad_norm": 0.12368962913751602, + "learning_rate": 0.00017819310145764181, + "loss": 0.143, + "step": 7560 + }, + { + "epoch": 0.545546376131895, + "grad_norm": 0.1657244712114334, + "learning_rate": 0.00017819021503824508, + "loss": 0.1468, + "step": 7561 + }, + { + "epoch": 0.5456185288069555, + "grad_norm": 0.1279374361038208, + "learning_rate": 0.0001781873286188483, + "loss": 0.1696, + "step": 7562 + }, + { + "epoch": 0.5456906814820159, + "grad_norm": 0.12708182632923126, + "learning_rate": 0.00017818444219945158, + "loss": 0.1597, + "step": 7563 + }, + { + "epoch": 0.5457628341570764, + "grad_norm": 0.11969926208257675, + "learning_rate": 0.00017818155578005484, + "loss": 0.1275, + "step": 7564 + }, + { + "epoch": 0.5458349868321368, + "grad_norm": 0.10338714718818665, + "learning_rate": 0.0001781786693606581, + "loss": 0.1419, + "step": 7565 + }, + { + "epoch": 0.5459071395071973, + "grad_norm": 0.11943601071834564, + "learning_rate": 0.0001781757829412614, + "loss": 0.1026, + "step": 7566 + }, + { + "epoch": 0.5459792921822576, + "grad_norm": 0.10397817194461823, + "learning_rate": 0.00017817289652186463, + "loss": 0.1125, + "step": 7567 + }, + { + "epoch": 0.546051444857318, + "grad_norm": 0.11815197020769119, + "learning_rate": 0.0001781700101024679, + "loss": 0.1846, + "step": 7568 + }, + { + "epoch": 0.5461235975323785, + "grad_norm": 0.11354372650384903, + "learning_rate": 0.00017816712368307115, + "loss": 0.1755, + "step": 7569 + }, + { + "epoch": 0.5461957502074389, + "grad_norm": 0.11804115772247314, + "learning_rate": 0.00017816423726367442, + "loss": 0.1541, + "step": 7570 + }, + { + "epoch": 0.5462679028824994, + "grad_norm": 0.12533968687057495, + "learning_rate": 0.00017816135084427768, + "loss": 0.1538, + "step": 7571 + }, + { + "epoch": 0.5463400555575598, + "grad_norm": 0.09542267769575119, + "learning_rate": 0.00017815846442488094, + "loss": 0.1612, + "step": 7572 + }, + { + "epoch": 0.5464122082326203, + "grad_norm": 0.11053860932588577, + "learning_rate": 0.0001781555780054842, + "loss": 0.1906, + "step": 7573 + }, + { + "epoch": 0.5464843609076806, + "grad_norm": 0.1260577142238617, + "learning_rate": 0.00017815269158608747, + "loss": 0.184, + "step": 7574 + }, + { + "epoch": 0.546556513582741, + "grad_norm": 0.11573726683855057, + "learning_rate": 0.00017814980516669073, + "loss": 0.143, + "step": 7575 + }, + { + "epoch": 0.5466286662578015, + "grad_norm": 0.16125363111495972, + "learning_rate": 0.000178146918747294, + "loss": 0.1601, + "step": 7576 + }, + { + "epoch": 0.5467008189328619, + "grad_norm": 0.1232057511806488, + "learning_rate": 0.00017814403232789726, + "loss": 0.1597, + "step": 7577 + }, + { + "epoch": 0.5467729716079224, + "grad_norm": 0.11699820309877396, + "learning_rate": 0.00017814114590850052, + "loss": 0.1498, + "step": 7578 + }, + { + "epoch": 0.5468451242829828, + "grad_norm": 0.11260601133108139, + "learning_rate": 0.00017813825948910376, + "loss": 0.1171, + "step": 7579 + }, + { + "epoch": 0.5469172769580433, + "grad_norm": 0.10898337513208389, + "learning_rate": 0.00017813537306970705, + "loss": 0.1352, + "step": 7580 + }, + { + "epoch": 0.5469894296331036, + "grad_norm": 0.12603630125522614, + "learning_rate": 0.0001781324866503103, + "loss": 0.1892, + "step": 7581 + }, + { + "epoch": 0.547061582308164, + "grad_norm": 0.1058061271905899, + "learning_rate": 0.00017812960023091357, + "loss": 0.2125, + "step": 7582 + }, + { + "epoch": 0.5471337349832245, + "grad_norm": 0.11850077658891678, + "learning_rate": 0.00017812671381151683, + "loss": 0.1416, + "step": 7583 + }, + { + "epoch": 0.5472058876582849, + "grad_norm": 0.11276722699403763, + "learning_rate": 0.00017812382739212007, + "loss": 0.1597, + "step": 7584 + }, + { + "epoch": 0.5472780403333454, + "grad_norm": 0.15107154846191406, + "learning_rate": 0.00017812094097272333, + "loss": 0.1779, + "step": 7585 + }, + { + "epoch": 0.5473501930084058, + "grad_norm": 0.10331179201602936, + "learning_rate": 0.0001781180545533266, + "loss": 0.1795, + "step": 7586 + }, + { + "epoch": 0.5474223456834663, + "grad_norm": 0.13958831131458282, + "learning_rate": 0.00017811516813392989, + "loss": 0.1666, + "step": 7587 + }, + { + "epoch": 0.5474944983585266, + "grad_norm": 0.11555318534374237, + "learning_rate": 0.00017811228171453315, + "loss": 0.1879, + "step": 7588 + }, + { + "epoch": 0.547566651033587, + "grad_norm": 0.13235783576965332, + "learning_rate": 0.00017810939529513638, + "loss": 0.1313, + "step": 7589 + }, + { + "epoch": 0.5476388037086475, + "grad_norm": 0.14834065735340118, + "learning_rate": 0.00017810650887573965, + "loss": 0.1452, + "step": 7590 + }, + { + "epoch": 0.5477109563837079, + "grad_norm": 0.14114947617053986, + "learning_rate": 0.0001781036224563429, + "loss": 0.1599, + "step": 7591 + }, + { + "epoch": 0.5477831090587684, + "grad_norm": 0.1421668529510498, + "learning_rate": 0.00017810073603694617, + "loss": 0.1575, + "step": 7592 + }, + { + "epoch": 0.5478552617338288, + "grad_norm": 0.1561838686466217, + "learning_rate": 0.00017809784961754944, + "loss": 0.1774, + "step": 7593 + }, + { + "epoch": 0.5479274144088893, + "grad_norm": 0.12360503524541855, + "learning_rate": 0.0001780949631981527, + "loss": 0.1588, + "step": 7594 + }, + { + "epoch": 0.5479995670839496, + "grad_norm": 0.11868561804294586, + "learning_rate": 0.00017809207677875596, + "loss": 0.1474, + "step": 7595 + }, + { + "epoch": 0.54807171975901, + "grad_norm": 0.09299908578395844, + "learning_rate": 0.00017808919035935923, + "loss": 0.1211, + "step": 7596 + }, + { + "epoch": 0.5481438724340705, + "grad_norm": 0.14525295794010162, + "learning_rate": 0.0001780863039399625, + "loss": 0.2189, + "step": 7597 + }, + { + "epoch": 0.5482160251091309, + "grad_norm": 0.1374361664056778, + "learning_rate": 0.00017808341752056575, + "loss": 0.1785, + "step": 7598 + }, + { + "epoch": 0.5482881777841914, + "grad_norm": 0.10330330580472946, + "learning_rate": 0.00017808053110116901, + "loss": 0.1544, + "step": 7599 + }, + { + "epoch": 0.5483603304592518, + "grad_norm": 0.10700894892215729, + "learning_rate": 0.00017807764468177225, + "loss": 0.1815, + "step": 7600 + }, + { + "epoch": 0.5484324831343123, + "grad_norm": 0.11932407319545746, + "learning_rate": 0.00017807475826237554, + "loss": 0.1836, + "step": 7601 + }, + { + "epoch": 0.5485046358093726, + "grad_norm": 0.11152351647615433, + "learning_rate": 0.0001780718718429788, + "loss": 0.1673, + "step": 7602 + }, + { + "epoch": 0.548576788484433, + "grad_norm": 0.11458411812782288, + "learning_rate": 0.00017806898542358207, + "loss": 0.1754, + "step": 7603 + }, + { + "epoch": 0.5486489411594935, + "grad_norm": 0.1215558722615242, + "learning_rate": 0.00017806609900418533, + "loss": 0.148, + "step": 7604 + }, + { + "epoch": 0.5487210938345539, + "grad_norm": 0.1182718351483345, + "learning_rate": 0.00017806321258478856, + "loss": 0.1805, + "step": 7605 + }, + { + "epoch": 0.5487932465096144, + "grad_norm": 0.13227002322673798, + "learning_rate": 0.00017806032616539183, + "loss": 0.1678, + "step": 7606 + }, + { + "epoch": 0.5488653991846748, + "grad_norm": 0.09978866577148438, + "learning_rate": 0.0001780574397459951, + "loss": 0.1455, + "step": 7607 + }, + { + "epoch": 0.5489375518597353, + "grad_norm": 0.1297626942396164, + "learning_rate": 0.00017805455332659838, + "loss": 0.1643, + "step": 7608 + }, + { + "epoch": 0.5490097045347956, + "grad_norm": 0.13307395577430725, + "learning_rate": 0.00017805166690720164, + "loss": 0.1121, + "step": 7609 + }, + { + "epoch": 0.549081857209856, + "grad_norm": 0.09766818583011627, + "learning_rate": 0.00017804878048780488, + "loss": 0.182, + "step": 7610 + }, + { + "epoch": 0.5491540098849165, + "grad_norm": 0.1137261837720871, + "learning_rate": 0.00017804589406840814, + "loss": 0.142, + "step": 7611 + }, + { + "epoch": 0.5492261625599769, + "grad_norm": 0.1196482926607132, + "learning_rate": 0.0001780430076490114, + "loss": 0.1306, + "step": 7612 + }, + { + "epoch": 0.5492983152350374, + "grad_norm": 0.11196993291378021, + "learning_rate": 0.00017804012122961467, + "loss": 0.1153, + "step": 7613 + }, + { + "epoch": 0.5493704679100978, + "grad_norm": 0.10517489910125732, + "learning_rate": 0.00017803723481021793, + "loss": 0.1291, + "step": 7614 + }, + { + "epoch": 0.5494426205851582, + "grad_norm": 0.1116122379899025, + "learning_rate": 0.0001780343483908212, + "loss": 0.1371, + "step": 7615 + }, + { + "epoch": 0.5495147732602186, + "grad_norm": 0.21631592512130737, + "learning_rate": 0.00017803146197142446, + "loss": 0.1495, + "step": 7616 + }, + { + "epoch": 0.549586925935279, + "grad_norm": 0.13187208771705627, + "learning_rate": 0.00017802857555202772, + "loss": 0.1828, + "step": 7617 + }, + { + "epoch": 0.5496590786103395, + "grad_norm": 0.10498936474323273, + "learning_rate": 0.00017802568913263098, + "loss": 0.1785, + "step": 7618 + }, + { + "epoch": 0.5497312312853999, + "grad_norm": 0.10033317655324936, + "learning_rate": 0.00017802280271323425, + "loss": 0.19, + "step": 7619 + }, + { + "epoch": 0.5498033839604604, + "grad_norm": 0.11087717115879059, + "learning_rate": 0.0001780199162938375, + "loss": 0.1631, + "step": 7620 + }, + { + "epoch": 0.5498755366355208, + "grad_norm": 0.15313465893268585, + "learning_rate": 0.00017801702987444074, + "loss": 0.1384, + "step": 7621 + }, + { + "epoch": 0.5499476893105812, + "grad_norm": 0.12790971994400024, + "learning_rate": 0.00017801414345504403, + "loss": 0.1112, + "step": 7622 + }, + { + "epoch": 0.5500198419856416, + "grad_norm": 0.1123102530837059, + "learning_rate": 0.0001780112570356473, + "loss": 0.1493, + "step": 7623 + }, + { + "epoch": 0.550091994660702, + "grad_norm": 0.1404675394296646, + "learning_rate": 0.00017800837061625056, + "loss": 0.1352, + "step": 7624 + }, + { + "epoch": 0.5501641473357625, + "grad_norm": 0.12286730110645294, + "learning_rate": 0.00017800548419685382, + "loss": 0.1422, + "step": 7625 + }, + { + "epoch": 0.5502363000108229, + "grad_norm": 0.15925469994544983, + "learning_rate": 0.00017800259777745706, + "loss": 0.1666, + "step": 7626 + }, + { + "epoch": 0.5503084526858834, + "grad_norm": 0.1199231743812561, + "learning_rate": 0.00017799971135806032, + "loss": 0.1769, + "step": 7627 + }, + { + "epoch": 0.5503806053609438, + "grad_norm": 0.11914505064487457, + "learning_rate": 0.00017799682493866358, + "loss": 0.1695, + "step": 7628 + }, + { + "epoch": 0.5504527580360041, + "grad_norm": 0.13842357695102692, + "learning_rate": 0.00017799393851926687, + "loss": 0.1644, + "step": 7629 + }, + { + "epoch": 0.5505249107110646, + "grad_norm": 0.10801035910844803, + "learning_rate": 0.00017799105209987014, + "loss": 0.2006, + "step": 7630 + }, + { + "epoch": 0.550597063386125, + "grad_norm": 0.12393604218959808, + "learning_rate": 0.00017798816568047337, + "loss": 0.1494, + "step": 7631 + }, + { + "epoch": 0.5506692160611855, + "grad_norm": 0.13326948881149292, + "learning_rate": 0.00017798527926107664, + "loss": 0.1248, + "step": 7632 + }, + { + "epoch": 0.5507413687362459, + "grad_norm": 0.11505146324634552, + "learning_rate": 0.0001779823928416799, + "loss": 0.1555, + "step": 7633 + }, + { + "epoch": 0.5508135214113064, + "grad_norm": 0.1264219731092453, + "learning_rate": 0.00017797950642228316, + "loss": 0.1262, + "step": 7634 + }, + { + "epoch": 0.5508856740863668, + "grad_norm": 0.12235382199287415, + "learning_rate": 0.00017797662000288642, + "loss": 0.1525, + "step": 7635 + }, + { + "epoch": 0.5509578267614271, + "grad_norm": 0.11811922490596771, + "learning_rate": 0.0001779737335834897, + "loss": 0.1316, + "step": 7636 + }, + { + "epoch": 0.5510299794364876, + "grad_norm": 0.12360856682062149, + "learning_rate": 0.00017797084716409295, + "loss": 0.1944, + "step": 7637 + }, + { + "epoch": 0.551102132111548, + "grad_norm": 0.16416537761688232, + "learning_rate": 0.00017796796074469621, + "loss": 0.1796, + "step": 7638 + }, + { + "epoch": 0.5511742847866085, + "grad_norm": 0.15926092863082886, + "learning_rate": 0.00017796507432529948, + "loss": 0.1256, + "step": 7639 + }, + { + "epoch": 0.5512464374616689, + "grad_norm": 0.13481405377388, + "learning_rate": 0.00017796218790590274, + "loss": 0.1616, + "step": 7640 + }, + { + "epoch": 0.5513185901367293, + "grad_norm": 0.1308879256248474, + "learning_rate": 0.000177959301486506, + "loss": 0.1884, + "step": 7641 + }, + { + "epoch": 0.5513907428117898, + "grad_norm": 0.13799186050891876, + "learning_rate": 0.00017795641506710924, + "loss": 0.1155, + "step": 7642 + }, + { + "epoch": 0.5514628954868501, + "grad_norm": 0.11589345335960388, + "learning_rate": 0.00017795352864771253, + "loss": 0.1686, + "step": 7643 + }, + { + "epoch": 0.5515350481619106, + "grad_norm": 0.10249035060405731, + "learning_rate": 0.0001779506422283158, + "loss": 0.0997, + "step": 7644 + }, + { + "epoch": 0.551607200836971, + "grad_norm": 0.12435011565685272, + "learning_rate": 0.00017794775580891905, + "loss": 0.1612, + "step": 7645 + }, + { + "epoch": 0.5516793535120315, + "grad_norm": 0.14909322559833527, + "learning_rate": 0.00017794486938952232, + "loss": 0.1083, + "step": 7646 + }, + { + "epoch": 0.5517515061870919, + "grad_norm": 0.1289183795452118, + "learning_rate": 0.00017794198297012555, + "loss": 0.1677, + "step": 7647 + }, + { + "epoch": 0.5518236588621523, + "grad_norm": 0.10382948070764542, + "learning_rate": 0.00017793909655072882, + "loss": 0.091, + "step": 7648 + }, + { + "epoch": 0.5518958115372128, + "grad_norm": 0.11587633937597275, + "learning_rate": 0.00017793621013133208, + "loss": 0.132, + "step": 7649 + }, + { + "epoch": 0.5519679642122731, + "grad_norm": 0.15552467107772827, + "learning_rate": 0.00017793332371193537, + "loss": 0.1311, + "step": 7650 + }, + { + "epoch": 0.5520401168873336, + "grad_norm": 0.13075612485408783, + "learning_rate": 0.00017793043729253863, + "loss": 0.1812, + "step": 7651 + }, + { + "epoch": 0.552112269562394, + "grad_norm": 0.10492058098316193, + "learning_rate": 0.00017792755087314187, + "loss": 0.1694, + "step": 7652 + }, + { + "epoch": 0.5521844222374545, + "grad_norm": 0.10020974278450012, + "learning_rate": 0.00017792466445374513, + "loss": 0.1825, + "step": 7653 + }, + { + "epoch": 0.5522565749125149, + "grad_norm": 0.12632660567760468, + "learning_rate": 0.0001779217780343484, + "loss": 0.1432, + "step": 7654 + }, + { + "epoch": 0.5523287275875753, + "grad_norm": 0.10315108299255371, + "learning_rate": 0.00017791889161495166, + "loss": 0.1665, + "step": 7655 + }, + { + "epoch": 0.5524008802626358, + "grad_norm": 0.12094546854496002, + "learning_rate": 0.00017791600519555492, + "loss": 0.1668, + "step": 7656 + }, + { + "epoch": 0.5524730329376961, + "grad_norm": 0.09945043921470642, + "learning_rate": 0.00017791311877615818, + "loss": 0.1524, + "step": 7657 + }, + { + "epoch": 0.5525451856127566, + "grad_norm": 0.11824460327625275, + "learning_rate": 0.00017791023235676145, + "loss": 0.1465, + "step": 7658 + }, + { + "epoch": 0.552617338287817, + "grad_norm": 0.11674058437347412, + "learning_rate": 0.0001779073459373647, + "loss": 0.1636, + "step": 7659 + }, + { + "epoch": 0.5526894909628774, + "grad_norm": 0.18679635226726532, + "learning_rate": 0.00017790445951796797, + "loss": 0.1319, + "step": 7660 + }, + { + "epoch": 0.5527616436379379, + "grad_norm": 0.11995816230773926, + "learning_rate": 0.00017790157309857123, + "loss": 0.1507, + "step": 7661 + }, + { + "epoch": 0.5528337963129983, + "grad_norm": 0.11749674379825592, + "learning_rate": 0.0001778986866791745, + "loss": 0.1657, + "step": 7662 + }, + { + "epoch": 0.5529059489880588, + "grad_norm": 0.10846541821956635, + "learning_rate": 0.00017789580025977773, + "loss": 0.1576, + "step": 7663 + }, + { + "epoch": 0.5529781016631191, + "grad_norm": 0.1053551658987999, + "learning_rate": 0.00017789291384038102, + "loss": 0.1576, + "step": 7664 + }, + { + "epoch": 0.5530502543381796, + "grad_norm": 0.1170341745018959, + "learning_rate": 0.00017789002742098429, + "loss": 0.1297, + "step": 7665 + }, + { + "epoch": 0.55312240701324, + "grad_norm": 0.15445485711097717, + "learning_rate": 0.00017788714100158755, + "loss": 0.1411, + "step": 7666 + }, + { + "epoch": 0.5531945596883004, + "grad_norm": 0.11116831749677658, + "learning_rate": 0.0001778842545821908, + "loss": 0.1496, + "step": 7667 + }, + { + "epoch": 0.5532667123633609, + "grad_norm": 0.10674598067998886, + "learning_rate": 0.00017788136816279405, + "loss": 0.1317, + "step": 7668 + }, + { + "epoch": 0.5533388650384213, + "grad_norm": 0.11206822097301483, + "learning_rate": 0.0001778784817433973, + "loss": 0.1271, + "step": 7669 + }, + { + "epoch": 0.5534110177134818, + "grad_norm": 0.18465600907802582, + "learning_rate": 0.00017787559532400057, + "loss": 0.199, + "step": 7670 + }, + { + "epoch": 0.5534831703885421, + "grad_norm": 0.1241191029548645, + "learning_rate": 0.00017787270890460386, + "loss": 0.1434, + "step": 7671 + }, + { + "epoch": 0.5535553230636026, + "grad_norm": 0.1338217556476593, + "learning_rate": 0.00017786982248520713, + "loss": 0.1697, + "step": 7672 + }, + { + "epoch": 0.553627475738663, + "grad_norm": 0.1337876319885254, + "learning_rate": 0.00017786693606581036, + "loss": 0.1513, + "step": 7673 + }, + { + "epoch": 0.5536996284137234, + "grad_norm": 0.11135462671518326, + "learning_rate": 0.00017786404964641362, + "loss": 0.1336, + "step": 7674 + }, + { + "epoch": 0.5537717810887839, + "grad_norm": 0.11627933382987976, + "learning_rate": 0.0001778611632270169, + "loss": 0.1688, + "step": 7675 + }, + { + "epoch": 0.5538439337638443, + "grad_norm": 0.11941682547330856, + "learning_rate": 0.00017785827680762015, + "loss": 0.1504, + "step": 7676 + }, + { + "epoch": 0.5539160864389048, + "grad_norm": 0.14273981750011444, + "learning_rate": 0.0001778553903882234, + "loss": 0.1497, + "step": 7677 + }, + { + "epoch": 0.5539882391139651, + "grad_norm": 0.11758162081241608, + "learning_rate": 0.00017785250396882668, + "loss": 0.1439, + "step": 7678 + }, + { + "epoch": 0.5540603917890256, + "grad_norm": 0.12304052710533142, + "learning_rate": 0.00017784961754942994, + "loss": 0.1373, + "step": 7679 + }, + { + "epoch": 0.554132544464086, + "grad_norm": 0.11368243396282196, + "learning_rate": 0.0001778467311300332, + "loss": 0.1176, + "step": 7680 + }, + { + "epoch": 0.5542046971391464, + "grad_norm": 0.10586858540773392, + "learning_rate": 0.00017784384471063647, + "loss": 0.1657, + "step": 7681 + }, + { + "epoch": 0.5542768498142069, + "grad_norm": 0.12059576064348221, + "learning_rate": 0.00017784095829123973, + "loss": 0.1499, + "step": 7682 + }, + { + "epoch": 0.5543490024892673, + "grad_norm": 0.12533071637153625, + "learning_rate": 0.000177838071871843, + "loss": 0.2116, + "step": 7683 + }, + { + "epoch": 0.5544211551643278, + "grad_norm": 0.12862452864646912, + "learning_rate": 0.00017783518545244623, + "loss": 0.1547, + "step": 7684 + }, + { + "epoch": 0.5544933078393881, + "grad_norm": 0.18502777814865112, + "learning_rate": 0.00017783229903304952, + "loss": 0.1716, + "step": 7685 + }, + { + "epoch": 0.5545654605144485, + "grad_norm": 0.12029103934764862, + "learning_rate": 0.00017782941261365278, + "loss": 0.1329, + "step": 7686 + }, + { + "epoch": 0.554637613189509, + "grad_norm": 0.1410187929868698, + "learning_rate": 0.00017782652619425604, + "loss": 0.1179, + "step": 7687 + }, + { + "epoch": 0.5547097658645694, + "grad_norm": 0.1031469777226448, + "learning_rate": 0.0001778236397748593, + "loss": 0.1802, + "step": 7688 + }, + { + "epoch": 0.5547819185396299, + "grad_norm": 0.12865352630615234, + "learning_rate": 0.00017782075335546254, + "loss": 0.1739, + "step": 7689 + }, + { + "epoch": 0.5548540712146903, + "grad_norm": 0.16174176335334778, + "learning_rate": 0.0001778178669360658, + "loss": 0.1199, + "step": 7690 + }, + { + "epoch": 0.5549262238897507, + "grad_norm": 0.16539667546749115, + "learning_rate": 0.00017781498051666907, + "loss": 0.1874, + "step": 7691 + }, + { + "epoch": 0.5549983765648111, + "grad_norm": 0.1633646935224533, + "learning_rate": 0.00017781209409727236, + "loss": 0.1506, + "step": 7692 + }, + { + "epoch": 0.5550705292398715, + "grad_norm": 0.12402980774641037, + "learning_rate": 0.00017780920767787562, + "loss": 0.1601, + "step": 7693 + }, + { + "epoch": 0.555142681914932, + "grad_norm": 0.12489049881696701, + "learning_rate": 0.00017780632125847888, + "loss": 0.1614, + "step": 7694 + }, + { + "epoch": 0.5552148345899924, + "grad_norm": 0.09997840225696564, + "learning_rate": 0.00017780343483908212, + "loss": 0.1898, + "step": 7695 + }, + { + "epoch": 0.5552869872650529, + "grad_norm": 0.10950327664613724, + "learning_rate": 0.00017780054841968538, + "loss": 0.1264, + "step": 7696 + }, + { + "epoch": 0.5553591399401133, + "grad_norm": 0.14299684762954712, + "learning_rate": 0.00017779766200028864, + "loss": 0.1656, + "step": 7697 + }, + { + "epoch": 0.5554312926151737, + "grad_norm": 0.11332228034734726, + "learning_rate": 0.0001777947755808919, + "loss": 0.202, + "step": 7698 + }, + { + "epoch": 0.5555034452902341, + "grad_norm": 0.11944280564785004, + "learning_rate": 0.0001777918891614952, + "loss": 0.1409, + "step": 7699 + }, + { + "epoch": 0.5555755979652945, + "grad_norm": 0.1238955482840538, + "learning_rate": 0.00017778900274209843, + "loss": 0.2241, + "step": 7700 + }, + { + "epoch": 0.555647750640355, + "grad_norm": 0.12767411768436432, + "learning_rate": 0.0001777861163227017, + "loss": 0.1316, + "step": 7701 + }, + { + "epoch": 0.5557199033154154, + "grad_norm": 0.14630673825740814, + "learning_rate": 0.00017778322990330496, + "loss": 0.1333, + "step": 7702 + }, + { + "epoch": 0.5557920559904759, + "grad_norm": 0.15821218490600586, + "learning_rate": 0.00017778034348390822, + "loss": 0.1814, + "step": 7703 + }, + { + "epoch": 0.5558642086655363, + "grad_norm": 0.1112324520945549, + "learning_rate": 0.00017777745706451149, + "loss": 0.1542, + "step": 7704 + }, + { + "epoch": 0.5559363613405967, + "grad_norm": 0.12217922508716583, + "learning_rate": 0.00017777457064511475, + "loss": 0.1775, + "step": 7705 + }, + { + "epoch": 0.5560085140156571, + "grad_norm": 0.16339953243732452, + "learning_rate": 0.000177771684225718, + "loss": 0.0958, + "step": 7706 + }, + { + "epoch": 0.5560806666907175, + "grad_norm": 0.1335275024175644, + "learning_rate": 0.00017776879780632127, + "loss": 0.1565, + "step": 7707 + }, + { + "epoch": 0.556152819365778, + "grad_norm": 0.11807729303836823, + "learning_rate": 0.00017776591138692454, + "loss": 0.1073, + "step": 7708 + }, + { + "epoch": 0.5562249720408384, + "grad_norm": 0.13649781048297882, + "learning_rate": 0.0001777630249675278, + "loss": 0.118, + "step": 7709 + }, + { + "epoch": 0.5562971247158989, + "grad_norm": 0.11952555924654007, + "learning_rate": 0.00017776013854813106, + "loss": 0.1918, + "step": 7710 + }, + { + "epoch": 0.5563692773909593, + "grad_norm": 0.14145714044570923, + "learning_rate": 0.0001777572521287343, + "loss": 0.1392, + "step": 7711 + }, + { + "epoch": 0.5564414300660196, + "grad_norm": 0.12335800379514694, + "learning_rate": 0.00017775436570933756, + "loss": 0.1684, + "step": 7712 + }, + { + "epoch": 0.5565135827410801, + "grad_norm": 0.16189730167388916, + "learning_rate": 0.00017775147928994085, + "loss": 0.194, + "step": 7713 + }, + { + "epoch": 0.5565857354161405, + "grad_norm": 0.11122278869152069, + "learning_rate": 0.00017774859287054411, + "loss": 0.1809, + "step": 7714 + }, + { + "epoch": 0.556657888091201, + "grad_norm": 0.1196560338139534, + "learning_rate": 0.00017774570645114738, + "loss": 0.1621, + "step": 7715 + }, + { + "epoch": 0.5567300407662614, + "grad_norm": 0.11895806342363358, + "learning_rate": 0.0001777428200317506, + "loss": 0.1817, + "step": 7716 + }, + { + "epoch": 0.5568021934413219, + "grad_norm": 0.12140301614999771, + "learning_rate": 0.00017773993361235388, + "loss": 0.1284, + "step": 7717 + }, + { + "epoch": 0.5568743461163823, + "grad_norm": 0.10380367189645767, + "learning_rate": 0.00017773704719295714, + "loss": 0.169, + "step": 7718 + }, + { + "epoch": 0.5569464987914426, + "grad_norm": 0.12693089246749878, + "learning_rate": 0.0001777341607735604, + "loss": 0.1258, + "step": 7719 + }, + { + "epoch": 0.5570186514665031, + "grad_norm": 0.1187371239066124, + "learning_rate": 0.0001777312743541637, + "loss": 0.1467, + "step": 7720 + }, + { + "epoch": 0.5570908041415635, + "grad_norm": 0.1260175108909607, + "learning_rate": 0.00017772838793476693, + "loss": 0.172, + "step": 7721 + }, + { + "epoch": 0.557162956816624, + "grad_norm": 0.11985334753990173, + "learning_rate": 0.0001777255015153702, + "loss": 0.1496, + "step": 7722 + }, + { + "epoch": 0.5572351094916844, + "grad_norm": 0.11689590662717819, + "learning_rate": 0.00017772261509597345, + "loss": 0.1327, + "step": 7723 + }, + { + "epoch": 0.5573072621667449, + "grad_norm": 0.139571875333786, + "learning_rate": 0.00017771972867657672, + "loss": 0.1196, + "step": 7724 + }, + { + "epoch": 0.5573794148418053, + "grad_norm": 0.158230260014534, + "learning_rate": 0.00017771684225717998, + "loss": 0.138, + "step": 7725 + }, + { + "epoch": 0.5574515675168656, + "grad_norm": 0.14422976970672607, + "learning_rate": 0.00017771395583778324, + "loss": 0.1767, + "step": 7726 + }, + { + "epoch": 0.5575237201919261, + "grad_norm": 0.08607729524374008, + "learning_rate": 0.0001777110694183865, + "loss": 0.1117, + "step": 7727 + }, + { + "epoch": 0.5575958728669865, + "grad_norm": 0.11835624277591705, + "learning_rate": 0.00017770818299898977, + "loss": 0.1872, + "step": 7728 + }, + { + "epoch": 0.557668025542047, + "grad_norm": 0.16040194034576416, + "learning_rate": 0.00017770529657959303, + "loss": 0.1255, + "step": 7729 + }, + { + "epoch": 0.5577401782171074, + "grad_norm": 0.11539598554372787, + "learning_rate": 0.0001777024101601963, + "loss": 0.1598, + "step": 7730 + }, + { + "epoch": 0.5578123308921679, + "grad_norm": 0.1356775462627411, + "learning_rate": 0.00017769952374079956, + "loss": 0.161, + "step": 7731 + }, + { + "epoch": 0.5578844835672283, + "grad_norm": 0.11297253519296646, + "learning_rate": 0.0001776966373214028, + "loss": 0.1604, + "step": 7732 + }, + { + "epoch": 0.5579566362422886, + "grad_norm": 0.11831973493099213, + "learning_rate": 0.00017769375090200606, + "loss": 0.178, + "step": 7733 + }, + { + "epoch": 0.5580287889173491, + "grad_norm": 0.1246437281370163, + "learning_rate": 0.00017769086448260935, + "loss": 0.1518, + "step": 7734 + }, + { + "epoch": 0.5581009415924095, + "grad_norm": 0.15455931425094604, + "learning_rate": 0.0001776879780632126, + "loss": 0.173, + "step": 7735 + }, + { + "epoch": 0.55817309426747, + "grad_norm": 0.11363532394170761, + "learning_rate": 0.00017768509164381587, + "loss": 0.1335, + "step": 7736 + }, + { + "epoch": 0.5582452469425304, + "grad_norm": 0.11740487068891525, + "learning_rate": 0.0001776822052244191, + "loss": 0.1696, + "step": 7737 + }, + { + "epoch": 0.5583173996175909, + "grad_norm": 0.1704363375902176, + "learning_rate": 0.00017767931880502237, + "loss": 0.1614, + "step": 7738 + }, + { + "epoch": 0.5583895522926513, + "grad_norm": 0.11599764227867126, + "learning_rate": 0.00017767643238562563, + "loss": 0.1438, + "step": 7739 + }, + { + "epoch": 0.5584617049677116, + "grad_norm": 0.12581701576709747, + "learning_rate": 0.0001776735459662289, + "loss": 0.1482, + "step": 7740 + }, + { + "epoch": 0.5585338576427721, + "grad_norm": 0.13323472440242767, + "learning_rate": 0.00017767065954683216, + "loss": 0.1381, + "step": 7741 + }, + { + "epoch": 0.5586060103178325, + "grad_norm": 0.1303575187921524, + "learning_rate": 0.00017766777312743542, + "loss": 0.1571, + "step": 7742 + }, + { + "epoch": 0.558678162992893, + "grad_norm": 0.12017255276441574, + "learning_rate": 0.00017766488670803868, + "loss": 0.1685, + "step": 7743 + }, + { + "epoch": 0.5587503156679534, + "grad_norm": 0.12586411833763123, + "learning_rate": 0.00017766200028864195, + "loss": 0.1704, + "step": 7744 + }, + { + "epoch": 0.5588224683430139, + "grad_norm": 0.1185087338089943, + "learning_rate": 0.0001776591138692452, + "loss": 0.1719, + "step": 7745 + }, + { + "epoch": 0.5588946210180743, + "grad_norm": 0.1333579421043396, + "learning_rate": 0.00017765622744984847, + "loss": 0.1062, + "step": 7746 + }, + { + "epoch": 0.5589667736931346, + "grad_norm": 0.1394158899784088, + "learning_rate": 0.00017765334103045174, + "loss": 0.1787, + "step": 7747 + }, + { + "epoch": 0.5590389263681951, + "grad_norm": 0.1150098666548729, + "learning_rate": 0.00017765045461105497, + "loss": 0.1542, + "step": 7748 + }, + { + "epoch": 0.5591110790432555, + "grad_norm": 0.12589210271835327, + "learning_rate": 0.00017764756819165826, + "loss": 0.1628, + "step": 7749 + }, + { + "epoch": 0.559183231718316, + "grad_norm": 0.10655258595943451, + "learning_rate": 0.00017764468177226153, + "loss": 0.1381, + "step": 7750 + }, + { + "epoch": 0.5592553843933764, + "grad_norm": 0.16475246846675873, + "learning_rate": 0.0001776417953528648, + "loss": 0.1519, + "step": 7751 + }, + { + "epoch": 0.5593275370684369, + "grad_norm": 0.1504022479057312, + "learning_rate": 0.00017763890893346805, + "loss": 0.1332, + "step": 7752 + }, + { + "epoch": 0.5593996897434972, + "grad_norm": 0.11250517517328262, + "learning_rate": 0.0001776360225140713, + "loss": 0.1696, + "step": 7753 + }, + { + "epoch": 0.5594718424185576, + "grad_norm": 0.10336173325777054, + "learning_rate": 0.00017763313609467455, + "loss": 0.174, + "step": 7754 + }, + { + "epoch": 0.5595439950936181, + "grad_norm": 0.12454720586538315, + "learning_rate": 0.0001776302496752778, + "loss": 0.1843, + "step": 7755 + }, + { + "epoch": 0.5596161477686785, + "grad_norm": 0.11423910409212112, + "learning_rate": 0.0001776273632558811, + "loss": 0.1572, + "step": 7756 + }, + { + "epoch": 0.559688300443739, + "grad_norm": 0.13247184455394745, + "learning_rate": 0.00017762447683648437, + "loss": 0.1453, + "step": 7757 + }, + { + "epoch": 0.5597604531187994, + "grad_norm": 0.10575006902217865, + "learning_rate": 0.0001776215904170876, + "loss": 0.0785, + "step": 7758 + }, + { + "epoch": 0.5598326057938598, + "grad_norm": 0.11786968261003494, + "learning_rate": 0.00017761870399769086, + "loss": 0.1604, + "step": 7759 + }, + { + "epoch": 0.5599047584689202, + "grad_norm": 0.12994790077209473, + "learning_rate": 0.00017761581757829413, + "loss": 0.1449, + "step": 7760 + }, + { + "epoch": 0.5599769111439806, + "grad_norm": 0.15007157623767853, + "learning_rate": 0.0001776129311588974, + "loss": 0.1944, + "step": 7761 + }, + { + "epoch": 0.5600490638190411, + "grad_norm": 0.18437841534614563, + "learning_rate": 0.00017761004473950065, + "loss": 0.1149, + "step": 7762 + }, + { + "epoch": 0.5601212164941015, + "grad_norm": 0.11152983456850052, + "learning_rate": 0.00017760715832010392, + "loss": 0.1887, + "step": 7763 + }, + { + "epoch": 0.560193369169162, + "grad_norm": 0.12899920344352722, + "learning_rate": 0.00017760427190070718, + "loss": 0.1235, + "step": 7764 + }, + { + "epoch": 0.5602655218442224, + "grad_norm": 0.11892049759626389, + "learning_rate": 0.00017760138548131044, + "loss": 0.1411, + "step": 7765 + }, + { + "epoch": 0.5603376745192828, + "grad_norm": 0.10350506752729416, + "learning_rate": 0.0001775984990619137, + "loss": 0.1875, + "step": 7766 + }, + { + "epoch": 0.5604098271943432, + "grad_norm": 0.11183896660804749, + "learning_rate": 0.00017759561264251697, + "loss": 0.1592, + "step": 7767 + }, + { + "epoch": 0.5604819798694036, + "grad_norm": 0.11698009818792343, + "learning_rate": 0.00017759272622312023, + "loss": 0.144, + "step": 7768 + }, + { + "epoch": 0.5605541325444641, + "grad_norm": 0.11330104619264603, + "learning_rate": 0.00017758983980372347, + "loss": 0.1357, + "step": 7769 + }, + { + "epoch": 0.5606262852195245, + "grad_norm": 0.11938729137182236, + "learning_rate": 0.00017758695338432676, + "loss": 0.1204, + "step": 7770 + }, + { + "epoch": 0.560698437894585, + "grad_norm": 0.11675950139760971, + "learning_rate": 0.00017758406696493002, + "loss": 0.1239, + "step": 7771 + }, + { + "epoch": 0.5607705905696454, + "grad_norm": 0.12216418981552124, + "learning_rate": 0.00017758118054553328, + "loss": 0.1392, + "step": 7772 + }, + { + "epoch": 0.5608427432447058, + "grad_norm": 0.11849422752857208, + "learning_rate": 0.00017757829412613655, + "loss": 0.1702, + "step": 7773 + }, + { + "epoch": 0.5609148959197662, + "grad_norm": 0.12819740176200867, + "learning_rate": 0.00017757540770673978, + "loss": 0.1424, + "step": 7774 + }, + { + "epoch": 0.5609870485948266, + "grad_norm": 0.12744930386543274, + "learning_rate": 0.00017757252128734304, + "loss": 0.1797, + "step": 7775 + }, + { + "epoch": 0.5610592012698871, + "grad_norm": 0.11002054065465927, + "learning_rate": 0.0001775696348679463, + "loss": 0.1652, + "step": 7776 + }, + { + "epoch": 0.5611313539449475, + "grad_norm": 0.11307639628648758, + "learning_rate": 0.0001775667484485496, + "loss": 0.1562, + "step": 7777 + }, + { + "epoch": 0.561203506620008, + "grad_norm": 0.1282927691936493, + "learning_rate": 0.00017756386202915286, + "loss": 0.1614, + "step": 7778 + }, + { + "epoch": 0.5612756592950684, + "grad_norm": 0.10930712521076202, + "learning_rate": 0.0001775609756097561, + "loss": 0.123, + "step": 7779 + }, + { + "epoch": 0.5613478119701288, + "grad_norm": 0.08749040216207504, + "learning_rate": 0.00017755808919035936, + "loss": 0.1138, + "step": 7780 + }, + { + "epoch": 0.5614199646451892, + "grad_norm": 0.1288251429796219, + "learning_rate": 0.00017755520277096262, + "loss": 0.1957, + "step": 7781 + }, + { + "epoch": 0.5614921173202496, + "grad_norm": 0.10661497712135315, + "learning_rate": 0.00017755231635156588, + "loss": 0.1582, + "step": 7782 + }, + { + "epoch": 0.5615642699953101, + "grad_norm": 0.1347241997718811, + "learning_rate": 0.00017754942993216915, + "loss": 0.1825, + "step": 7783 + }, + { + "epoch": 0.5616364226703705, + "grad_norm": 0.1108936294913292, + "learning_rate": 0.0001775465435127724, + "loss": 0.1479, + "step": 7784 + }, + { + "epoch": 0.561708575345431, + "grad_norm": 0.1376233994960785, + "learning_rate": 0.00017754365709337567, + "loss": 0.157, + "step": 7785 + }, + { + "epoch": 0.5617807280204914, + "grad_norm": 0.10962553322315216, + "learning_rate": 0.00017754077067397894, + "loss": 0.12, + "step": 7786 + }, + { + "epoch": 0.5618528806955518, + "grad_norm": 0.1102694571018219, + "learning_rate": 0.0001775378842545822, + "loss": 0.1318, + "step": 7787 + }, + { + "epoch": 0.5619250333706122, + "grad_norm": 0.10344778746366501, + "learning_rate": 0.00017753499783518546, + "loss": 0.1434, + "step": 7788 + }, + { + "epoch": 0.5619971860456726, + "grad_norm": 0.15022890269756317, + "learning_rate": 0.00017753211141578873, + "loss": 0.1926, + "step": 7789 + }, + { + "epoch": 0.5620693387207331, + "grad_norm": 0.1173153966665268, + "learning_rate": 0.00017752922499639196, + "loss": 0.1327, + "step": 7790 + }, + { + "epoch": 0.5621414913957935, + "grad_norm": 0.1331547051668167, + "learning_rate": 0.00017752633857699525, + "loss": 0.1249, + "step": 7791 + }, + { + "epoch": 0.562213644070854, + "grad_norm": 0.11356645822525024, + "learning_rate": 0.00017752345215759851, + "loss": 0.1685, + "step": 7792 + }, + { + "epoch": 0.5622857967459144, + "grad_norm": 0.11696729809045792, + "learning_rate": 0.00017752056573820178, + "loss": 0.2, + "step": 7793 + }, + { + "epoch": 0.5623579494209748, + "grad_norm": 0.15943534672260284, + "learning_rate": 0.00017751767931880504, + "loss": 0.1918, + "step": 7794 + }, + { + "epoch": 0.5624301020960352, + "grad_norm": 0.12938512861728668, + "learning_rate": 0.00017751479289940828, + "loss": 0.133, + "step": 7795 + }, + { + "epoch": 0.5625022547710956, + "grad_norm": 0.11734830588102341, + "learning_rate": 0.00017751190648001154, + "loss": 0.1261, + "step": 7796 + }, + { + "epoch": 0.562574407446156, + "grad_norm": 0.11850757896900177, + "learning_rate": 0.0001775090200606148, + "loss": 0.1456, + "step": 7797 + }, + { + "epoch": 0.5626465601212165, + "grad_norm": 0.11291621625423431, + "learning_rate": 0.0001775061336412181, + "loss": 0.1857, + "step": 7798 + }, + { + "epoch": 0.5627187127962769, + "grad_norm": 0.09668024629354477, + "learning_rate": 0.00017750324722182135, + "loss": 0.1862, + "step": 7799 + }, + { + "epoch": 0.5627908654713374, + "grad_norm": 0.1215052530169487, + "learning_rate": 0.0001775003608024246, + "loss": 0.1662, + "step": 7800 + }, + { + "epoch": 0.5628630181463978, + "grad_norm": 0.1308431327342987, + "learning_rate": 0.00017749747438302785, + "loss": 0.2088, + "step": 7801 + }, + { + "epoch": 0.5629351708214582, + "grad_norm": 0.12154701352119446, + "learning_rate": 0.00017749458796363112, + "loss": 0.169, + "step": 7802 + }, + { + "epoch": 0.5630073234965186, + "grad_norm": 0.15033183991909027, + "learning_rate": 0.00017749170154423438, + "loss": 0.15, + "step": 7803 + }, + { + "epoch": 0.563079476171579, + "grad_norm": 0.11695659905672073, + "learning_rate": 0.00017748881512483764, + "loss": 0.0921, + "step": 7804 + }, + { + "epoch": 0.5631516288466395, + "grad_norm": 0.10494501143693924, + "learning_rate": 0.0001774859287054409, + "loss": 0.1731, + "step": 7805 + }, + { + "epoch": 0.5632237815216999, + "grad_norm": 0.10611262172460556, + "learning_rate": 0.00017748304228604417, + "loss": 0.1566, + "step": 7806 + }, + { + "epoch": 0.5632959341967604, + "grad_norm": 0.15995727479457855, + "learning_rate": 0.00017748015586664743, + "loss": 0.1528, + "step": 7807 + }, + { + "epoch": 0.5633680868718208, + "grad_norm": 0.14674517512321472, + "learning_rate": 0.0001774772694472507, + "loss": 0.148, + "step": 7808 + }, + { + "epoch": 0.5634402395468812, + "grad_norm": 0.13919983804225922, + "learning_rate": 0.00017747438302785396, + "loss": 0.1798, + "step": 7809 + }, + { + "epoch": 0.5635123922219416, + "grad_norm": 0.10498116910457611, + "learning_rate": 0.00017747149660845722, + "loss": 0.1377, + "step": 7810 + }, + { + "epoch": 0.563584544897002, + "grad_norm": 0.14174553751945496, + "learning_rate": 0.00017746861018906048, + "loss": 0.1471, + "step": 7811 + }, + { + "epoch": 0.5636566975720625, + "grad_norm": 0.11989643424749374, + "learning_rate": 0.00017746572376966375, + "loss": 0.1488, + "step": 7812 + }, + { + "epoch": 0.5637288502471229, + "grad_norm": 0.13585850596427917, + "learning_rate": 0.000177462837350267, + "loss": 0.2016, + "step": 7813 + }, + { + "epoch": 0.5638010029221834, + "grad_norm": 0.11506017297506332, + "learning_rate": 0.00017745995093087027, + "loss": 0.1419, + "step": 7814 + }, + { + "epoch": 0.5638731555972437, + "grad_norm": 0.13118226826190948, + "learning_rate": 0.00017745706451147353, + "loss": 0.1764, + "step": 7815 + }, + { + "epoch": 0.5639453082723042, + "grad_norm": 0.1246950775384903, + "learning_rate": 0.0001774541780920768, + "loss": 0.1055, + "step": 7816 + }, + { + "epoch": 0.5640174609473646, + "grad_norm": 0.13198302686214447, + "learning_rate": 0.00017745129167268003, + "loss": 0.118, + "step": 7817 + }, + { + "epoch": 0.564089613622425, + "grad_norm": 0.10069485753774643, + "learning_rate": 0.0001774484052532833, + "loss": 0.1893, + "step": 7818 + }, + { + "epoch": 0.5641617662974855, + "grad_norm": 0.10112564265727997, + "learning_rate": 0.00017744551883388659, + "loss": 0.1307, + "step": 7819 + }, + { + "epoch": 0.5642339189725459, + "grad_norm": 0.12570911645889282, + "learning_rate": 0.00017744263241448985, + "loss": 0.1566, + "step": 7820 + }, + { + "epoch": 0.5643060716476064, + "grad_norm": 0.12907375395298004, + "learning_rate": 0.0001774397459950931, + "loss": 0.1428, + "step": 7821 + }, + { + "epoch": 0.5643782243226667, + "grad_norm": 0.10935437679290771, + "learning_rate": 0.00017743685957569635, + "loss": 0.1633, + "step": 7822 + }, + { + "epoch": 0.5644503769977272, + "grad_norm": 0.11014962941408157, + "learning_rate": 0.0001774339731562996, + "loss": 0.1644, + "step": 7823 + }, + { + "epoch": 0.5645225296727876, + "grad_norm": 0.10696630924940109, + "learning_rate": 0.00017743108673690287, + "loss": 0.1317, + "step": 7824 + }, + { + "epoch": 0.564594682347848, + "grad_norm": 0.11150635778903961, + "learning_rate": 0.00017742820031750614, + "loss": 0.1669, + "step": 7825 + }, + { + "epoch": 0.5646668350229085, + "grad_norm": 0.17067500948905945, + "learning_rate": 0.00017742531389810943, + "loss": 0.2251, + "step": 7826 + }, + { + "epoch": 0.5647389876979689, + "grad_norm": 0.10682345926761627, + "learning_rate": 0.00017742242747871266, + "loss": 0.1934, + "step": 7827 + }, + { + "epoch": 0.5648111403730294, + "grad_norm": 0.10371333360671997, + "learning_rate": 0.00017741954105931592, + "loss": 0.1336, + "step": 7828 + }, + { + "epoch": 0.5648832930480897, + "grad_norm": 0.10731291025876999, + "learning_rate": 0.0001774166546399192, + "loss": 0.1173, + "step": 7829 + }, + { + "epoch": 0.5649554457231502, + "grad_norm": 0.10326001793146133, + "learning_rate": 0.00017741376822052245, + "loss": 0.1219, + "step": 7830 + }, + { + "epoch": 0.5650275983982106, + "grad_norm": 0.1006198599934578, + "learning_rate": 0.00017741088180112571, + "loss": 0.1796, + "step": 7831 + }, + { + "epoch": 0.565099751073271, + "grad_norm": 0.1153801754117012, + "learning_rate": 0.00017740799538172898, + "loss": 0.1585, + "step": 7832 + }, + { + "epoch": 0.5651719037483315, + "grad_norm": 0.11774788796901703, + "learning_rate": 0.00017740510896233224, + "loss": 0.2245, + "step": 7833 + }, + { + "epoch": 0.5652440564233919, + "grad_norm": 0.1453564465045929, + "learning_rate": 0.0001774022225429355, + "loss": 0.1558, + "step": 7834 + }, + { + "epoch": 0.5653162090984524, + "grad_norm": 0.10883069783449173, + "learning_rate": 0.00017739933612353877, + "loss": 0.1848, + "step": 7835 + }, + { + "epoch": 0.5653883617735127, + "grad_norm": 0.1174570843577385, + "learning_rate": 0.00017739644970414203, + "loss": 0.1599, + "step": 7836 + }, + { + "epoch": 0.5654605144485731, + "grad_norm": 0.10084299743175507, + "learning_rate": 0.0001773935632847453, + "loss": 0.1504, + "step": 7837 + }, + { + "epoch": 0.5655326671236336, + "grad_norm": 0.13395412266254425, + "learning_rate": 0.00017739067686534853, + "loss": 0.1339, + "step": 7838 + }, + { + "epoch": 0.565604819798694, + "grad_norm": 0.13974985480308533, + "learning_rate": 0.0001773877904459518, + "loss": 0.1623, + "step": 7839 + }, + { + "epoch": 0.5656769724737545, + "grad_norm": 0.14301472902297974, + "learning_rate": 0.00017738490402655508, + "loss": 0.1551, + "step": 7840 + }, + { + "epoch": 0.5657491251488149, + "grad_norm": 0.13463523983955383, + "learning_rate": 0.00017738201760715834, + "loss": 0.1478, + "step": 7841 + }, + { + "epoch": 0.5658212778238754, + "grad_norm": 0.1306980699300766, + "learning_rate": 0.0001773791311877616, + "loss": 0.1303, + "step": 7842 + }, + { + "epoch": 0.5658934304989357, + "grad_norm": 0.14141131937503815, + "learning_rate": 0.00017737624476836484, + "loss": 0.1289, + "step": 7843 + }, + { + "epoch": 0.5659655831739961, + "grad_norm": 0.183505579829216, + "learning_rate": 0.0001773733583489681, + "loss": 0.1868, + "step": 7844 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.11580534279346466, + "learning_rate": 0.00017737047192957137, + "loss": 0.1245, + "step": 7845 + }, + { + "epoch": 0.566109888524117, + "grad_norm": 0.12077432125806808, + "learning_rate": 0.00017736758551017463, + "loss": 0.1593, + "step": 7846 + }, + { + "epoch": 0.5661820411991775, + "grad_norm": 0.15405291318893433, + "learning_rate": 0.00017736469909077792, + "loss": 0.153, + "step": 7847 + }, + { + "epoch": 0.5662541938742379, + "grad_norm": 0.12104575335979462, + "learning_rate": 0.00017736181267138116, + "loss": 0.1468, + "step": 7848 + }, + { + "epoch": 0.5663263465492984, + "grad_norm": 0.12845638394355774, + "learning_rate": 0.00017735892625198442, + "loss": 0.1317, + "step": 7849 + }, + { + "epoch": 0.5663984992243587, + "grad_norm": 0.10479018837213516, + "learning_rate": 0.00017735603983258768, + "loss": 0.141, + "step": 7850 + }, + { + "epoch": 0.5664706518994191, + "grad_norm": 0.11782775074243546, + "learning_rate": 0.00017735315341319094, + "loss": 0.158, + "step": 7851 + }, + { + "epoch": 0.5665428045744796, + "grad_norm": 0.10476533323526382, + "learning_rate": 0.0001773502669937942, + "loss": 0.1423, + "step": 7852 + }, + { + "epoch": 0.56661495724954, + "grad_norm": 0.1525888293981552, + "learning_rate": 0.00017734738057439747, + "loss": 0.1121, + "step": 7853 + }, + { + "epoch": 0.5666871099246005, + "grad_norm": 0.10566260665655136, + "learning_rate": 0.00017734449415500073, + "loss": 0.164, + "step": 7854 + }, + { + "epoch": 0.5667592625996609, + "grad_norm": 0.12231025099754333, + "learning_rate": 0.000177341607735604, + "loss": 0.1728, + "step": 7855 + }, + { + "epoch": 0.5668314152747214, + "grad_norm": 0.11222410947084427, + "learning_rate": 0.00017733872131620726, + "loss": 0.1337, + "step": 7856 + }, + { + "epoch": 0.5669035679497817, + "grad_norm": 0.13453534245491028, + "learning_rate": 0.00017733583489681052, + "loss": 0.1505, + "step": 7857 + }, + { + "epoch": 0.5669757206248421, + "grad_norm": 0.12299520522356033, + "learning_rate": 0.00017733294847741379, + "loss": 0.1656, + "step": 7858 + }, + { + "epoch": 0.5670478732999026, + "grad_norm": 0.1308135837316513, + "learning_rate": 0.00017733006205801702, + "loss": 0.1198, + "step": 7859 + }, + { + "epoch": 0.567120025974963, + "grad_norm": 0.11543524265289307, + "learning_rate": 0.00017732717563862028, + "loss": 0.1835, + "step": 7860 + }, + { + "epoch": 0.5671921786500235, + "grad_norm": 0.16316701471805573, + "learning_rate": 0.00017732428921922357, + "loss": 0.1373, + "step": 7861 + }, + { + "epoch": 0.5672643313250839, + "grad_norm": 0.1554131805896759, + "learning_rate": 0.00017732140279982684, + "loss": 0.1163, + "step": 7862 + }, + { + "epoch": 0.5673364840001444, + "grad_norm": 0.11200838536024094, + "learning_rate": 0.0001773185163804301, + "loss": 0.1432, + "step": 7863 + }, + { + "epoch": 0.5674086366752047, + "grad_norm": 0.12736523151397705, + "learning_rate": 0.00017731562996103334, + "loss": 0.1543, + "step": 7864 + }, + { + "epoch": 0.5674807893502651, + "grad_norm": 0.13625681400299072, + "learning_rate": 0.0001773127435416366, + "loss": 0.1217, + "step": 7865 + }, + { + "epoch": 0.5675529420253256, + "grad_norm": 0.11235533654689789, + "learning_rate": 0.00017730985712223986, + "loss": 0.1119, + "step": 7866 + }, + { + "epoch": 0.567625094700386, + "grad_norm": 0.11463140696287155, + "learning_rate": 0.00017730697070284312, + "loss": 0.0992, + "step": 7867 + }, + { + "epoch": 0.5676972473754465, + "grad_norm": 0.12850303947925568, + "learning_rate": 0.00017730408428344641, + "loss": 0.158, + "step": 7868 + }, + { + "epoch": 0.5677694000505069, + "grad_norm": 0.10813012719154358, + "learning_rate": 0.00017730119786404965, + "loss": 0.1255, + "step": 7869 + }, + { + "epoch": 0.5678415527255674, + "grad_norm": 0.13995739817619324, + "learning_rate": 0.0001772983114446529, + "loss": 0.1458, + "step": 7870 + }, + { + "epoch": 0.5679137054006277, + "grad_norm": 0.12784218788146973, + "learning_rate": 0.00017729542502525618, + "loss": 0.1665, + "step": 7871 + }, + { + "epoch": 0.5679858580756881, + "grad_norm": 0.10078656673431396, + "learning_rate": 0.00017729253860585944, + "loss": 0.1704, + "step": 7872 + }, + { + "epoch": 0.5680580107507486, + "grad_norm": 0.12124498188495636, + "learning_rate": 0.0001772896521864627, + "loss": 0.1322, + "step": 7873 + }, + { + "epoch": 0.568130163425809, + "grad_norm": 0.11808305233716965, + "learning_rate": 0.00017728676576706597, + "loss": 0.1296, + "step": 7874 + }, + { + "epoch": 0.5682023161008695, + "grad_norm": 0.1439986675977707, + "learning_rate": 0.00017728387934766923, + "loss": 0.1617, + "step": 7875 + }, + { + "epoch": 0.5682744687759299, + "grad_norm": 0.14310899376869202, + "learning_rate": 0.0001772809929282725, + "loss": 0.1167, + "step": 7876 + }, + { + "epoch": 0.5683466214509902, + "grad_norm": 0.16115215420722961, + "learning_rate": 0.00017727810650887575, + "loss": 0.1566, + "step": 7877 + }, + { + "epoch": 0.5684187741260507, + "grad_norm": 0.13886554539203644, + "learning_rate": 0.00017727522008947902, + "loss": 0.1257, + "step": 7878 + }, + { + "epoch": 0.5684909268011111, + "grad_norm": 0.10506069660186768, + "learning_rate": 0.00017727233367008228, + "loss": 0.1358, + "step": 7879 + }, + { + "epoch": 0.5685630794761716, + "grad_norm": 0.14647378027439117, + "learning_rate": 0.00017726944725068552, + "loss": 0.1508, + "step": 7880 + }, + { + "epoch": 0.568635232151232, + "grad_norm": 0.12777476012706757, + "learning_rate": 0.00017726656083128878, + "loss": 0.1475, + "step": 7881 + }, + { + "epoch": 0.5687073848262925, + "grad_norm": 0.11274504661560059, + "learning_rate": 0.00017726367441189207, + "loss": 0.1705, + "step": 7882 + }, + { + "epoch": 0.5687795375013529, + "grad_norm": 0.11977039277553558, + "learning_rate": 0.00017726078799249533, + "loss": 0.1316, + "step": 7883 + }, + { + "epoch": 0.5688516901764132, + "grad_norm": 0.11919888108968735, + "learning_rate": 0.0001772579015730986, + "loss": 0.1237, + "step": 7884 + }, + { + "epoch": 0.5689238428514737, + "grad_norm": 0.14281430840492249, + "learning_rate": 0.00017725501515370183, + "loss": 0.1786, + "step": 7885 + }, + { + "epoch": 0.5689959955265341, + "grad_norm": 0.11657267063856125, + "learning_rate": 0.0001772521287343051, + "loss": 0.129, + "step": 7886 + }, + { + "epoch": 0.5690681482015946, + "grad_norm": 0.1586855798959732, + "learning_rate": 0.00017724924231490836, + "loss": 0.1367, + "step": 7887 + }, + { + "epoch": 0.569140300876655, + "grad_norm": 0.14276975393295288, + "learning_rate": 0.00017724635589551162, + "loss": 0.1211, + "step": 7888 + }, + { + "epoch": 0.5692124535517155, + "grad_norm": 0.1068420559167862, + "learning_rate": 0.0001772434694761149, + "loss": 0.1461, + "step": 7889 + }, + { + "epoch": 0.5692846062267759, + "grad_norm": 0.11131718754768372, + "learning_rate": 0.00017724058305671814, + "loss": 0.1772, + "step": 7890 + }, + { + "epoch": 0.5693567589018362, + "grad_norm": 0.11713793873786926, + "learning_rate": 0.0001772376966373214, + "loss": 0.1111, + "step": 7891 + }, + { + "epoch": 0.5694289115768967, + "grad_norm": 0.09942521899938583, + "learning_rate": 0.00017723481021792467, + "loss": 0.1052, + "step": 7892 + }, + { + "epoch": 0.5695010642519571, + "grad_norm": 0.11540911346673965, + "learning_rate": 0.00017723192379852793, + "loss": 0.1405, + "step": 7893 + }, + { + "epoch": 0.5695732169270176, + "grad_norm": 0.11284588277339935, + "learning_rate": 0.0001772290373791312, + "loss": 0.1201, + "step": 7894 + }, + { + "epoch": 0.569645369602078, + "grad_norm": 0.1278034895658493, + "learning_rate": 0.00017722615095973446, + "loss": 0.1076, + "step": 7895 + }, + { + "epoch": 0.5697175222771385, + "grad_norm": 0.1453760266304016, + "learning_rate": 0.00017722326454033772, + "loss": 0.2114, + "step": 7896 + }, + { + "epoch": 0.5697896749521989, + "grad_norm": 0.106064572930336, + "learning_rate": 0.00017722037812094099, + "loss": 0.1124, + "step": 7897 + }, + { + "epoch": 0.5698618276272592, + "grad_norm": 0.1113254651427269, + "learning_rate": 0.00017721749170154425, + "loss": 0.137, + "step": 7898 + }, + { + "epoch": 0.5699339803023197, + "grad_norm": 0.14589551091194153, + "learning_rate": 0.0001772146052821475, + "loss": 0.1727, + "step": 7899 + }, + { + "epoch": 0.5700061329773801, + "grad_norm": 0.1296132653951645, + "learning_rate": 0.00017721171886275077, + "loss": 0.15, + "step": 7900 + }, + { + "epoch": 0.5700782856524406, + "grad_norm": 0.10612339526414871, + "learning_rate": 0.000177208832443354, + "loss": 0.1453, + "step": 7901 + }, + { + "epoch": 0.570150438327501, + "grad_norm": 0.14799730479717255, + "learning_rate": 0.00017720594602395727, + "loss": 0.2194, + "step": 7902 + }, + { + "epoch": 0.5702225910025615, + "grad_norm": 0.1471415013074875, + "learning_rate": 0.00017720305960456056, + "loss": 0.1767, + "step": 7903 + }, + { + "epoch": 0.5702947436776219, + "grad_norm": 0.1280488222837448, + "learning_rate": 0.00017720017318516383, + "loss": 0.1058, + "step": 7904 + }, + { + "epoch": 0.5703668963526822, + "grad_norm": 0.1329127699136734, + "learning_rate": 0.0001771972867657671, + "loss": 0.164, + "step": 7905 + }, + { + "epoch": 0.5704390490277427, + "grad_norm": 0.11687469482421875, + "learning_rate": 0.00017719440034637032, + "loss": 0.1635, + "step": 7906 + }, + { + "epoch": 0.5705112017028031, + "grad_norm": 0.10324371606111526, + "learning_rate": 0.0001771915139269736, + "loss": 0.1563, + "step": 7907 + }, + { + "epoch": 0.5705833543778636, + "grad_norm": 0.1355886459350586, + "learning_rate": 0.00017718862750757685, + "loss": 0.2098, + "step": 7908 + }, + { + "epoch": 0.570655507052924, + "grad_norm": 0.13116252422332764, + "learning_rate": 0.0001771857410881801, + "loss": 0.1752, + "step": 7909 + }, + { + "epoch": 0.5707276597279844, + "grad_norm": 0.12008823454380035, + "learning_rate": 0.0001771828546687834, + "loss": 0.1882, + "step": 7910 + }, + { + "epoch": 0.5707998124030449, + "grad_norm": 0.1371777504682541, + "learning_rate": 0.00017717996824938664, + "loss": 0.1357, + "step": 7911 + }, + { + "epoch": 0.5708719650781052, + "grad_norm": 0.10057571530342102, + "learning_rate": 0.0001771770818299899, + "loss": 0.1335, + "step": 7912 + }, + { + "epoch": 0.5709441177531657, + "grad_norm": 0.11042293906211853, + "learning_rate": 0.00017717419541059316, + "loss": 0.1495, + "step": 7913 + }, + { + "epoch": 0.5710162704282261, + "grad_norm": 0.12480539083480835, + "learning_rate": 0.00017717130899119643, + "loss": 0.1661, + "step": 7914 + }, + { + "epoch": 0.5710884231032866, + "grad_norm": 0.1366511732339859, + "learning_rate": 0.0001771684225717997, + "loss": 0.1799, + "step": 7915 + }, + { + "epoch": 0.571160575778347, + "grad_norm": 0.12441026419401169, + "learning_rate": 0.00017716553615240295, + "loss": 0.1783, + "step": 7916 + }, + { + "epoch": 0.5712327284534074, + "grad_norm": 0.11140074580907822, + "learning_rate": 0.00017716264973300622, + "loss": 0.0865, + "step": 7917 + }, + { + "epoch": 0.5713048811284679, + "grad_norm": 0.11715801805257797, + "learning_rate": 0.00017715976331360948, + "loss": 0.1565, + "step": 7918 + }, + { + "epoch": 0.5713770338035282, + "grad_norm": 0.09649024903774261, + "learning_rate": 0.00017715687689421274, + "loss": 0.128, + "step": 7919 + }, + { + "epoch": 0.5714491864785887, + "grad_norm": 0.11736408621072769, + "learning_rate": 0.000177153990474816, + "loss": 0.1478, + "step": 7920 + }, + { + "epoch": 0.5715213391536491, + "grad_norm": 0.17529132962226868, + "learning_rate": 0.00017715110405541927, + "loss": 0.1721, + "step": 7921 + }, + { + "epoch": 0.5715934918287096, + "grad_norm": 0.11500423401594162, + "learning_rate": 0.00017714821763602253, + "loss": 0.1564, + "step": 7922 + }, + { + "epoch": 0.57166564450377, + "grad_norm": 0.12164665758609772, + "learning_rate": 0.00017714533121662577, + "loss": 0.1281, + "step": 7923 + }, + { + "epoch": 0.5717377971788304, + "grad_norm": 0.1208866536617279, + "learning_rate": 0.00017714244479722906, + "loss": 0.1502, + "step": 7924 + }, + { + "epoch": 0.5718099498538909, + "grad_norm": 0.11391763389110565, + "learning_rate": 0.00017713955837783232, + "loss": 0.1341, + "step": 7925 + }, + { + "epoch": 0.5718821025289512, + "grad_norm": 0.14047269523143768, + "learning_rate": 0.00017713667195843558, + "loss": 0.1277, + "step": 7926 + }, + { + "epoch": 0.5719542552040117, + "grad_norm": 0.10359933227300644, + "learning_rate": 0.00017713378553903885, + "loss": 0.1257, + "step": 7927 + }, + { + "epoch": 0.5720264078790721, + "grad_norm": 0.12921454012393951, + "learning_rate": 0.00017713089911964208, + "loss": 0.1704, + "step": 7928 + }, + { + "epoch": 0.5720985605541326, + "grad_norm": 0.12614530324935913, + "learning_rate": 0.00017712801270024534, + "loss": 0.1127, + "step": 7929 + }, + { + "epoch": 0.572170713229193, + "grad_norm": 0.1164608746767044, + "learning_rate": 0.0001771251262808486, + "loss": 0.124, + "step": 7930 + }, + { + "epoch": 0.5722428659042534, + "grad_norm": 0.1259773224592209, + "learning_rate": 0.0001771222398614519, + "loss": 0.1856, + "step": 7931 + }, + { + "epoch": 0.5723150185793139, + "grad_norm": 0.12784960865974426, + "learning_rate": 0.00017711935344205516, + "loss": 0.1383, + "step": 7932 + }, + { + "epoch": 0.5723871712543742, + "grad_norm": 0.14621759951114655, + "learning_rate": 0.0001771164670226584, + "loss": 0.1635, + "step": 7933 + }, + { + "epoch": 0.5724593239294347, + "grad_norm": 0.12372894585132599, + "learning_rate": 0.00017711358060326166, + "loss": 0.1801, + "step": 7934 + }, + { + "epoch": 0.5725314766044951, + "grad_norm": 0.1166391670703888, + "learning_rate": 0.00017711069418386492, + "loss": 0.1538, + "step": 7935 + }, + { + "epoch": 0.5726036292795555, + "grad_norm": 0.12102074921131134, + "learning_rate": 0.00017710780776446818, + "loss": 0.1643, + "step": 7936 + }, + { + "epoch": 0.572675781954616, + "grad_norm": 0.12680689990520477, + "learning_rate": 0.00017710492134507145, + "loss": 0.1837, + "step": 7937 + }, + { + "epoch": 0.5727479346296764, + "grad_norm": 0.12156582623720169, + "learning_rate": 0.0001771020349256747, + "loss": 0.1849, + "step": 7938 + }, + { + "epoch": 0.5728200873047368, + "grad_norm": 0.1272418349981308, + "learning_rate": 0.00017709914850627797, + "loss": 0.1726, + "step": 7939 + }, + { + "epoch": 0.5728922399797972, + "grad_norm": 0.11186318844556808, + "learning_rate": 0.00017709626208688124, + "loss": 0.1104, + "step": 7940 + }, + { + "epoch": 0.5729643926548577, + "grad_norm": 0.11623166501522064, + "learning_rate": 0.0001770933756674845, + "loss": 0.1697, + "step": 7941 + }, + { + "epoch": 0.5730365453299181, + "grad_norm": 0.09506198763847351, + "learning_rate": 0.00017709048924808776, + "loss": 0.1252, + "step": 7942 + }, + { + "epoch": 0.5731086980049785, + "grad_norm": 0.1287769079208374, + "learning_rate": 0.00017708760282869103, + "loss": 0.1801, + "step": 7943 + }, + { + "epoch": 0.573180850680039, + "grad_norm": 0.1254848688840866, + "learning_rate": 0.00017708471640929426, + "loss": 0.0905, + "step": 7944 + }, + { + "epoch": 0.5732530033550994, + "grad_norm": 0.13968469202518463, + "learning_rate": 0.00017708182998989752, + "loss": 0.1201, + "step": 7945 + }, + { + "epoch": 0.5733251560301598, + "grad_norm": 0.14913301169872284, + "learning_rate": 0.00017707894357050081, + "loss": 0.1704, + "step": 7946 + }, + { + "epoch": 0.5733973087052202, + "grad_norm": 0.12298578768968582, + "learning_rate": 0.00017707605715110408, + "loss": 0.1661, + "step": 7947 + }, + { + "epoch": 0.5734694613802807, + "grad_norm": 0.13233782351016998, + "learning_rate": 0.00017707317073170734, + "loss": 0.1633, + "step": 7948 + }, + { + "epoch": 0.5735416140553411, + "grad_norm": 0.11294033378362656, + "learning_rate": 0.00017707028431231058, + "loss": 0.1812, + "step": 7949 + }, + { + "epoch": 0.5736137667304015, + "grad_norm": 0.11323747038841248, + "learning_rate": 0.00017706739789291384, + "loss": 0.1808, + "step": 7950 + }, + { + "epoch": 0.573685919405462, + "grad_norm": 0.11524073779582977, + "learning_rate": 0.0001770645114735171, + "loss": 0.1934, + "step": 7951 + }, + { + "epoch": 0.5737580720805224, + "grad_norm": 0.1528632491827011, + "learning_rate": 0.00017706162505412036, + "loss": 0.2212, + "step": 7952 + }, + { + "epoch": 0.5738302247555828, + "grad_norm": 0.11348945647478104, + "learning_rate": 0.00017705873863472365, + "loss": 0.1783, + "step": 7953 + }, + { + "epoch": 0.5739023774306432, + "grad_norm": 0.1193523034453392, + "learning_rate": 0.0001770558522153269, + "loss": 0.1534, + "step": 7954 + }, + { + "epoch": 0.5739745301057037, + "grad_norm": 0.10278993844985962, + "learning_rate": 0.00017705296579593015, + "loss": 0.1076, + "step": 7955 + }, + { + "epoch": 0.5740466827807641, + "grad_norm": 0.22504210472106934, + "learning_rate": 0.00017705007937653342, + "loss": 0.1806, + "step": 7956 + }, + { + "epoch": 0.5741188354558245, + "grad_norm": 0.13772232830524445, + "learning_rate": 0.00017704719295713668, + "loss": 0.1451, + "step": 7957 + }, + { + "epoch": 0.574190988130885, + "grad_norm": 0.12129385769367218, + "learning_rate": 0.00017704430653773994, + "loss": 0.1349, + "step": 7958 + }, + { + "epoch": 0.5742631408059454, + "grad_norm": 0.11908667534589767, + "learning_rate": 0.0001770414201183432, + "loss": 0.1441, + "step": 7959 + }, + { + "epoch": 0.5743352934810058, + "grad_norm": 0.13737215101718903, + "learning_rate": 0.00017703853369894647, + "loss": 0.1959, + "step": 7960 + }, + { + "epoch": 0.5744074461560662, + "grad_norm": 0.10954530537128448, + "learning_rate": 0.00017703564727954973, + "loss": 0.134, + "step": 7961 + }, + { + "epoch": 0.5744795988311266, + "grad_norm": 0.1645643413066864, + "learning_rate": 0.000177032760860153, + "loss": 0.1874, + "step": 7962 + }, + { + "epoch": 0.5745517515061871, + "grad_norm": 0.1279773712158203, + "learning_rate": 0.00017702987444075626, + "loss": 0.1334, + "step": 7963 + }, + { + "epoch": 0.5746239041812475, + "grad_norm": 0.17158547043800354, + "learning_rate": 0.00017702698802135952, + "loss": 0.1986, + "step": 7964 + }, + { + "epoch": 0.574696056856308, + "grad_norm": 0.1652517318725586, + "learning_rate": 0.00017702410160196276, + "loss": 0.1859, + "step": 7965 + }, + { + "epoch": 0.5747682095313684, + "grad_norm": 0.17374536395072937, + "learning_rate": 0.00017702121518256602, + "loss": 0.1251, + "step": 7966 + }, + { + "epoch": 0.5748403622064288, + "grad_norm": 0.1305515170097351, + "learning_rate": 0.0001770183287631693, + "loss": 0.1499, + "step": 7967 + }, + { + "epoch": 0.5749125148814892, + "grad_norm": 0.1515403687953949, + "learning_rate": 0.00017701544234377257, + "loss": 0.1328, + "step": 7968 + }, + { + "epoch": 0.5749846675565496, + "grad_norm": 0.13701438903808594, + "learning_rate": 0.00017701255592437583, + "loss": 0.1975, + "step": 7969 + }, + { + "epoch": 0.5750568202316101, + "grad_norm": 0.12394005060195923, + "learning_rate": 0.00017700966950497907, + "loss": 0.1507, + "step": 7970 + }, + { + "epoch": 0.5751289729066705, + "grad_norm": 0.11440756171941757, + "learning_rate": 0.00017700678308558233, + "loss": 0.117, + "step": 7971 + }, + { + "epoch": 0.575201125581731, + "grad_norm": 0.1346004754304886, + "learning_rate": 0.0001770038966661856, + "loss": 0.1944, + "step": 7972 + }, + { + "epoch": 0.5752732782567914, + "grad_norm": 0.10825420916080475, + "learning_rate": 0.00017700101024678886, + "loss": 0.1583, + "step": 7973 + }, + { + "epoch": 0.5753454309318518, + "grad_norm": 0.1305004507303238, + "learning_rate": 0.00017699812382739215, + "loss": 0.1128, + "step": 7974 + }, + { + "epoch": 0.5754175836069122, + "grad_norm": 0.11992324888706207, + "learning_rate": 0.00017699523740799538, + "loss": 0.1341, + "step": 7975 + }, + { + "epoch": 0.5754897362819726, + "grad_norm": 0.13553780317306519, + "learning_rate": 0.00017699235098859865, + "loss": 0.1642, + "step": 7976 + }, + { + "epoch": 0.5755618889570331, + "grad_norm": 0.13199329376220703, + "learning_rate": 0.0001769894645692019, + "loss": 0.1994, + "step": 7977 + }, + { + "epoch": 0.5756340416320935, + "grad_norm": 0.1123315766453743, + "learning_rate": 0.00017698657814980517, + "loss": 0.1362, + "step": 7978 + }, + { + "epoch": 0.575706194307154, + "grad_norm": 0.12229757755994797, + "learning_rate": 0.00017698369173040844, + "loss": 0.1647, + "step": 7979 + }, + { + "epoch": 0.5757783469822144, + "grad_norm": 0.1358393430709839, + "learning_rate": 0.0001769808053110117, + "loss": 0.1466, + "step": 7980 + }, + { + "epoch": 0.5758504996572748, + "grad_norm": 0.1132424995303154, + "learning_rate": 0.00017697791889161496, + "loss": 0.1159, + "step": 7981 + }, + { + "epoch": 0.5759226523323352, + "grad_norm": 0.15970008075237274, + "learning_rate": 0.00017697503247221823, + "loss": 0.2145, + "step": 7982 + }, + { + "epoch": 0.5759948050073956, + "grad_norm": 0.12362741678953171, + "learning_rate": 0.0001769721460528215, + "loss": 0.1255, + "step": 7983 + }, + { + "epoch": 0.5760669576824561, + "grad_norm": 0.12955516576766968, + "learning_rate": 0.00017696925963342475, + "loss": 0.1118, + "step": 7984 + }, + { + "epoch": 0.5761391103575165, + "grad_norm": 0.14107045531272888, + "learning_rate": 0.00017696637321402801, + "loss": 0.1415, + "step": 7985 + }, + { + "epoch": 0.576211263032577, + "grad_norm": 0.11059821397066116, + "learning_rate": 0.00017696348679463125, + "loss": 0.1957, + "step": 7986 + }, + { + "epoch": 0.5762834157076374, + "grad_norm": 0.13710936903953552, + "learning_rate": 0.0001769606003752345, + "loss": 0.1357, + "step": 7987 + }, + { + "epoch": 0.5763555683826977, + "grad_norm": 0.12946854531764984, + "learning_rate": 0.0001769577139558378, + "loss": 0.127, + "step": 7988 + }, + { + "epoch": 0.5764277210577582, + "grad_norm": 0.11622809618711472, + "learning_rate": 0.00017695482753644107, + "loss": 0.1296, + "step": 7989 + }, + { + "epoch": 0.5764998737328186, + "grad_norm": 0.11052305996417999, + "learning_rate": 0.00017695194111704433, + "loss": 0.1659, + "step": 7990 + }, + { + "epoch": 0.5765720264078791, + "grad_norm": 0.12174511700868607, + "learning_rate": 0.00017694905469764756, + "loss": 0.1529, + "step": 7991 + }, + { + "epoch": 0.5766441790829395, + "grad_norm": 0.1198398694396019, + "learning_rate": 0.00017694616827825083, + "loss": 0.1802, + "step": 7992 + }, + { + "epoch": 0.576716331758, + "grad_norm": 0.1096002608537674, + "learning_rate": 0.0001769432818588541, + "loss": 0.1478, + "step": 7993 + }, + { + "epoch": 0.5767884844330604, + "grad_norm": 0.15888899564743042, + "learning_rate": 0.00017694039543945735, + "loss": 0.1951, + "step": 7994 + }, + { + "epoch": 0.5768606371081207, + "grad_norm": 0.10436540842056274, + "learning_rate": 0.00017693750902006064, + "loss": 0.1599, + "step": 7995 + }, + { + "epoch": 0.5769327897831812, + "grad_norm": 0.14138226211071014, + "learning_rate": 0.00017693462260066388, + "loss": 0.1255, + "step": 7996 + }, + { + "epoch": 0.5770049424582416, + "grad_norm": 0.10548333823680878, + "learning_rate": 0.00017693173618126714, + "loss": 0.1171, + "step": 7997 + }, + { + "epoch": 0.5770770951333021, + "grad_norm": 0.14108656346797943, + "learning_rate": 0.0001769288497618704, + "loss": 0.1835, + "step": 7998 + }, + { + "epoch": 0.5771492478083625, + "grad_norm": 0.128933846950531, + "learning_rate": 0.00017692596334247367, + "loss": 0.1689, + "step": 7999 + }, + { + "epoch": 0.577221400483423, + "grad_norm": 0.10365033894777298, + "learning_rate": 0.00017692307692307693, + "loss": 0.1361, + "step": 8000 + }, + { + "epoch": 0.5772935531584833, + "grad_norm": 0.09977944195270538, + "learning_rate": 0.0001769201905036802, + "loss": 0.1758, + "step": 8001 + }, + { + "epoch": 0.5773657058335437, + "grad_norm": 0.13828906416893005, + "learning_rate": 0.00017691730408428346, + "loss": 0.1466, + "step": 8002 + }, + { + "epoch": 0.5774378585086042, + "grad_norm": 0.09375912696123123, + "learning_rate": 0.00017691441766488672, + "loss": 0.151, + "step": 8003 + }, + { + "epoch": 0.5775100111836646, + "grad_norm": 0.1094922423362732, + "learning_rate": 0.00017691153124548998, + "loss": 0.1221, + "step": 8004 + }, + { + "epoch": 0.5775821638587251, + "grad_norm": 0.12363706529140472, + "learning_rate": 0.00017690864482609325, + "loss": 0.2183, + "step": 8005 + }, + { + "epoch": 0.5776543165337855, + "grad_norm": 0.11408735066652298, + "learning_rate": 0.0001769057584066965, + "loss": 0.1519, + "step": 8006 + }, + { + "epoch": 0.577726469208846, + "grad_norm": 0.11651131510734558, + "learning_rate": 0.00017690287198729974, + "loss": 0.1517, + "step": 8007 + }, + { + "epoch": 0.5777986218839063, + "grad_norm": 0.1508703976869583, + "learning_rate": 0.000176899985567903, + "loss": 0.1454, + "step": 8008 + }, + { + "epoch": 0.5778707745589667, + "grad_norm": 0.13194362819194794, + "learning_rate": 0.0001768970991485063, + "loss": 0.1315, + "step": 8009 + }, + { + "epoch": 0.5779429272340272, + "grad_norm": 0.1377791166305542, + "learning_rate": 0.00017689421272910956, + "loss": 0.1828, + "step": 8010 + }, + { + "epoch": 0.5780150799090876, + "grad_norm": 0.14916688203811646, + "learning_rate": 0.00017689132630971282, + "loss": 0.1301, + "step": 8011 + }, + { + "epoch": 0.5780872325841481, + "grad_norm": 0.12122716754674911, + "learning_rate": 0.00017688843989031606, + "loss": 0.1358, + "step": 8012 + }, + { + "epoch": 0.5781593852592085, + "grad_norm": 0.10407043993473053, + "learning_rate": 0.00017688555347091932, + "loss": 0.1897, + "step": 8013 + }, + { + "epoch": 0.578231537934269, + "grad_norm": 0.13571223616600037, + "learning_rate": 0.00017688266705152258, + "loss": 0.1735, + "step": 8014 + }, + { + "epoch": 0.5783036906093293, + "grad_norm": 0.12706823647022247, + "learning_rate": 0.00017687978063212585, + "loss": 0.1405, + "step": 8015 + }, + { + "epoch": 0.5783758432843897, + "grad_norm": 0.12309744209051132, + "learning_rate": 0.00017687689421272914, + "loss": 0.1399, + "step": 8016 + }, + { + "epoch": 0.5784479959594502, + "grad_norm": 0.11440018564462662, + "learning_rate": 0.00017687400779333237, + "loss": 0.1656, + "step": 8017 + }, + { + "epoch": 0.5785201486345106, + "grad_norm": 0.11489548534154892, + "learning_rate": 0.00017687112137393564, + "loss": 0.1434, + "step": 8018 + }, + { + "epoch": 0.5785923013095711, + "grad_norm": 0.14805598556995392, + "learning_rate": 0.0001768682349545389, + "loss": 0.2329, + "step": 8019 + }, + { + "epoch": 0.5786644539846315, + "grad_norm": 0.13587909936904907, + "learning_rate": 0.00017686534853514216, + "loss": 0.1575, + "step": 8020 + }, + { + "epoch": 0.578736606659692, + "grad_norm": 0.1194760873913765, + "learning_rate": 0.00017686246211574542, + "loss": 0.1489, + "step": 8021 + }, + { + "epoch": 0.5788087593347523, + "grad_norm": 0.1006384789943695, + "learning_rate": 0.0001768595756963487, + "loss": 0.1388, + "step": 8022 + }, + { + "epoch": 0.5788809120098127, + "grad_norm": 0.11905606836080551, + "learning_rate": 0.00017685668927695195, + "loss": 0.1608, + "step": 8023 + }, + { + "epoch": 0.5789530646848732, + "grad_norm": 0.10487359762191772, + "learning_rate": 0.0001768538028575552, + "loss": 0.1172, + "step": 8024 + }, + { + "epoch": 0.5790252173599336, + "grad_norm": 0.13513118028640747, + "learning_rate": 0.00017685091643815848, + "loss": 0.167, + "step": 8025 + }, + { + "epoch": 0.5790973700349941, + "grad_norm": 0.13310585916042328, + "learning_rate": 0.00017684803001876174, + "loss": 0.1645, + "step": 8026 + }, + { + "epoch": 0.5791695227100545, + "grad_norm": 0.12719425559043884, + "learning_rate": 0.000176845143599365, + "loss": 0.1273, + "step": 8027 + }, + { + "epoch": 0.579241675385115, + "grad_norm": 0.153107151389122, + "learning_rate": 0.00017684225717996824, + "loss": 0.1389, + "step": 8028 + }, + { + "epoch": 0.5793138280601753, + "grad_norm": 0.15313667058944702, + "learning_rate": 0.0001768393707605715, + "loss": 0.1454, + "step": 8029 + }, + { + "epoch": 0.5793859807352357, + "grad_norm": 0.12298291176557541, + "learning_rate": 0.0001768364843411748, + "loss": 0.1811, + "step": 8030 + }, + { + "epoch": 0.5794581334102962, + "grad_norm": 0.11076110601425171, + "learning_rate": 0.00017683359792177805, + "loss": 0.1416, + "step": 8031 + }, + { + "epoch": 0.5795302860853566, + "grad_norm": 0.10948219895362854, + "learning_rate": 0.00017683071150238132, + "loss": 0.1283, + "step": 8032 + }, + { + "epoch": 0.5796024387604171, + "grad_norm": 0.11751320958137512, + "learning_rate": 0.00017682782508298455, + "loss": 0.1924, + "step": 8033 + }, + { + "epoch": 0.5796745914354775, + "grad_norm": 0.13950355350971222, + "learning_rate": 0.00017682493866358782, + "loss": 0.1472, + "step": 8034 + }, + { + "epoch": 0.579746744110538, + "grad_norm": 0.13754689693450928, + "learning_rate": 0.00017682205224419108, + "loss": 0.1325, + "step": 8035 + }, + { + "epoch": 0.5798188967855983, + "grad_norm": 0.1298028528690338, + "learning_rate": 0.00017681916582479434, + "loss": 0.141, + "step": 8036 + }, + { + "epoch": 0.5798910494606587, + "grad_norm": 0.11332829296588898, + "learning_rate": 0.00017681627940539763, + "loss": 0.1247, + "step": 8037 + }, + { + "epoch": 0.5799632021357192, + "grad_norm": 0.11405795067548752, + "learning_rate": 0.00017681339298600087, + "loss": 0.1323, + "step": 8038 + }, + { + "epoch": 0.5800353548107796, + "grad_norm": 0.1341920644044876, + "learning_rate": 0.00017681050656660413, + "loss": 0.174, + "step": 8039 + }, + { + "epoch": 0.5801075074858401, + "grad_norm": 0.12394584715366364, + "learning_rate": 0.0001768076201472074, + "loss": 0.1567, + "step": 8040 + }, + { + "epoch": 0.5801796601609005, + "grad_norm": 0.1091771200299263, + "learning_rate": 0.00017680473372781066, + "loss": 0.1574, + "step": 8041 + }, + { + "epoch": 0.580251812835961, + "grad_norm": 0.1096918135881424, + "learning_rate": 0.00017680184730841392, + "loss": 0.1164, + "step": 8042 + }, + { + "epoch": 0.5803239655110213, + "grad_norm": 0.13219282031059265, + "learning_rate": 0.00017679896088901718, + "loss": 0.1471, + "step": 8043 + }, + { + "epoch": 0.5803961181860817, + "grad_norm": 0.14074230194091797, + "learning_rate": 0.00017679607446962044, + "loss": 0.1854, + "step": 8044 + }, + { + "epoch": 0.5804682708611422, + "grad_norm": 0.11911468207836151, + "learning_rate": 0.0001767931880502237, + "loss": 0.1105, + "step": 8045 + }, + { + "epoch": 0.5805404235362026, + "grad_norm": 0.15260189771652222, + "learning_rate": 0.00017679030163082697, + "loss": 0.1359, + "step": 8046 + }, + { + "epoch": 0.580612576211263, + "grad_norm": 0.12161078304052353, + "learning_rate": 0.00017678741521143023, + "loss": 0.1979, + "step": 8047 + }, + { + "epoch": 0.5806847288863235, + "grad_norm": 0.1545887142419815, + "learning_rate": 0.0001767845287920335, + "loss": 0.1833, + "step": 8048 + }, + { + "epoch": 0.5807568815613839, + "grad_norm": 0.10799109935760498, + "learning_rate": 0.00017678164237263676, + "loss": 0.1778, + "step": 8049 + }, + { + "epoch": 0.5808290342364443, + "grad_norm": 0.11913906037807465, + "learning_rate": 0.00017677875595324, + "loss": 0.1342, + "step": 8050 + }, + { + "epoch": 0.5809011869115047, + "grad_norm": 0.14187337458133698, + "learning_rate": 0.00017677586953384329, + "loss": 0.1379, + "step": 8051 + }, + { + "epoch": 0.5809733395865652, + "grad_norm": 0.13161632418632507, + "learning_rate": 0.00017677298311444655, + "loss": 0.1695, + "step": 8052 + }, + { + "epoch": 0.5810454922616256, + "grad_norm": 0.09995569288730621, + "learning_rate": 0.0001767700966950498, + "loss": 0.1552, + "step": 8053 + }, + { + "epoch": 0.581117644936686, + "grad_norm": 0.24291056394577026, + "learning_rate": 0.00017676721027565307, + "loss": 0.1736, + "step": 8054 + }, + { + "epoch": 0.5811897976117465, + "grad_norm": 0.12517669796943665, + "learning_rate": 0.0001767643238562563, + "loss": 0.1713, + "step": 8055 + }, + { + "epoch": 0.5812619502868069, + "grad_norm": 0.12074005603790283, + "learning_rate": 0.00017676143743685957, + "loss": 0.1906, + "step": 8056 + }, + { + "epoch": 0.5813341029618673, + "grad_norm": 0.11038137227296829, + "learning_rate": 0.00017675855101746284, + "loss": 0.2014, + "step": 8057 + }, + { + "epoch": 0.5814062556369277, + "grad_norm": 0.10443025827407837, + "learning_rate": 0.00017675566459806613, + "loss": 0.1752, + "step": 8058 + }, + { + "epoch": 0.5814784083119882, + "grad_norm": 0.10579415410757065, + "learning_rate": 0.0001767527781786694, + "loss": 0.1253, + "step": 8059 + }, + { + "epoch": 0.5815505609870486, + "grad_norm": 0.11671837419271469, + "learning_rate": 0.00017674989175927262, + "loss": 0.1062, + "step": 8060 + }, + { + "epoch": 0.581622713662109, + "grad_norm": 0.11670401692390442, + "learning_rate": 0.0001767470053398759, + "loss": 0.1615, + "step": 8061 + }, + { + "epoch": 0.5816948663371695, + "grad_norm": 0.1220296323299408, + "learning_rate": 0.00017674411892047915, + "loss": 0.1762, + "step": 8062 + }, + { + "epoch": 0.5817670190122298, + "grad_norm": 0.12593242526054382, + "learning_rate": 0.0001767412325010824, + "loss": 0.1555, + "step": 8063 + }, + { + "epoch": 0.5818391716872903, + "grad_norm": 0.11044905334711075, + "learning_rate": 0.00017673834608168568, + "loss": 0.1762, + "step": 8064 + }, + { + "epoch": 0.5819113243623507, + "grad_norm": 0.12676940858364105, + "learning_rate": 0.00017673545966228894, + "loss": 0.1826, + "step": 8065 + }, + { + "epoch": 0.5819834770374112, + "grad_norm": 0.12122169137001038, + "learning_rate": 0.0001767325732428922, + "loss": 0.1479, + "step": 8066 + }, + { + "epoch": 0.5820556297124716, + "grad_norm": 0.11513711512088776, + "learning_rate": 0.00017672968682349546, + "loss": 0.1332, + "step": 8067 + }, + { + "epoch": 0.582127782387532, + "grad_norm": 0.13474337756633759, + "learning_rate": 0.00017672680040409873, + "loss": 0.1417, + "step": 8068 + }, + { + "epoch": 0.5821999350625925, + "grad_norm": 0.11347810924053192, + "learning_rate": 0.000176723913984702, + "loss": 0.1159, + "step": 8069 + }, + { + "epoch": 0.5822720877376528, + "grad_norm": 0.1329052895307541, + "learning_rate": 0.00017672102756530525, + "loss": 0.1467, + "step": 8070 + }, + { + "epoch": 0.5823442404127133, + "grad_norm": 0.1284935027360916, + "learning_rate": 0.0001767181411459085, + "loss": 0.2003, + "step": 8071 + }, + { + "epoch": 0.5824163930877737, + "grad_norm": 0.11546535044908524, + "learning_rate": 0.00017671525472651178, + "loss": 0.1359, + "step": 8072 + }, + { + "epoch": 0.5824885457628342, + "grad_norm": 0.12746301293373108, + "learning_rate": 0.00017671236830711504, + "loss": 0.1333, + "step": 8073 + }, + { + "epoch": 0.5825606984378946, + "grad_norm": 0.13880078494548798, + "learning_rate": 0.0001767094818877183, + "loss": 0.1537, + "step": 8074 + }, + { + "epoch": 0.582632851112955, + "grad_norm": 0.11027500033378601, + "learning_rate": 0.00017670659546832157, + "loss": 0.1878, + "step": 8075 + }, + { + "epoch": 0.5827050037880155, + "grad_norm": 0.12961946427822113, + "learning_rate": 0.0001767037090489248, + "loss": 0.196, + "step": 8076 + }, + { + "epoch": 0.5827771564630758, + "grad_norm": 0.1460336446762085, + "learning_rate": 0.00017670082262952807, + "loss": 0.2021, + "step": 8077 + }, + { + "epoch": 0.5828493091381363, + "grad_norm": 0.13091175258159637, + "learning_rate": 0.00017669793621013133, + "loss": 0.1604, + "step": 8078 + }, + { + "epoch": 0.5829214618131967, + "grad_norm": 0.11825500428676605, + "learning_rate": 0.00017669504979073462, + "loss": 0.1935, + "step": 8079 + }, + { + "epoch": 0.5829936144882572, + "grad_norm": 0.13889840245246887, + "learning_rate": 0.00017669216337133788, + "loss": 0.132, + "step": 8080 + }, + { + "epoch": 0.5830657671633176, + "grad_norm": 0.09815270453691483, + "learning_rate": 0.00017668927695194112, + "loss": 0.1476, + "step": 8081 + }, + { + "epoch": 0.583137919838378, + "grad_norm": 0.12321629375219345, + "learning_rate": 0.00017668639053254438, + "loss": 0.149, + "step": 8082 + }, + { + "epoch": 0.5832100725134385, + "grad_norm": 0.10414538532495499, + "learning_rate": 0.00017668350411314764, + "loss": 0.1447, + "step": 8083 + }, + { + "epoch": 0.5832822251884988, + "grad_norm": 0.1281728446483612, + "learning_rate": 0.0001766806176937509, + "loss": 0.1721, + "step": 8084 + }, + { + "epoch": 0.5833543778635593, + "grad_norm": 0.12013985216617584, + "learning_rate": 0.00017667773127435417, + "loss": 0.1812, + "step": 8085 + }, + { + "epoch": 0.5834265305386197, + "grad_norm": 0.1369110494852066, + "learning_rate": 0.00017667484485495743, + "loss": 0.1566, + "step": 8086 + }, + { + "epoch": 0.5834986832136801, + "grad_norm": 0.12445517629384995, + "learning_rate": 0.0001766719584355607, + "loss": 0.1898, + "step": 8087 + }, + { + "epoch": 0.5835708358887406, + "grad_norm": 0.13069842755794525, + "learning_rate": 0.00017666907201616396, + "loss": 0.1904, + "step": 8088 + }, + { + "epoch": 0.583642988563801, + "grad_norm": 0.16488301753997803, + "learning_rate": 0.00017666618559676722, + "loss": 0.1951, + "step": 8089 + }, + { + "epoch": 0.5837151412388615, + "grad_norm": 0.09553637355566025, + "learning_rate": 0.00017666329917737049, + "loss": 0.1271, + "step": 8090 + }, + { + "epoch": 0.5837872939139218, + "grad_norm": 0.1081295982003212, + "learning_rate": 0.00017666041275797375, + "loss": 0.1486, + "step": 8091 + }, + { + "epoch": 0.5838594465889823, + "grad_norm": 0.10838132351636887, + "learning_rate": 0.00017665752633857698, + "loss": 0.1583, + "step": 8092 + }, + { + "epoch": 0.5839315992640427, + "grad_norm": 0.12616951763629913, + "learning_rate": 0.00017665463991918027, + "loss": 0.1385, + "step": 8093 + }, + { + "epoch": 0.5840037519391031, + "grad_norm": 0.128948375582695, + "learning_rate": 0.00017665175349978354, + "loss": 0.18, + "step": 8094 + }, + { + "epoch": 0.5840759046141636, + "grad_norm": 0.10365475714206696, + "learning_rate": 0.0001766488670803868, + "loss": 0.1544, + "step": 8095 + }, + { + "epoch": 0.584148057289224, + "grad_norm": 0.10040339082479477, + "learning_rate": 0.00017664598066099006, + "loss": 0.1165, + "step": 8096 + }, + { + "epoch": 0.5842202099642845, + "grad_norm": 0.0983489602804184, + "learning_rate": 0.0001766430942415933, + "loss": 0.1268, + "step": 8097 + }, + { + "epoch": 0.5842923626393448, + "grad_norm": 0.13310272991657257, + "learning_rate": 0.00017664020782219656, + "loss": 0.2079, + "step": 8098 + }, + { + "epoch": 0.5843645153144053, + "grad_norm": 0.1181897446513176, + "learning_rate": 0.00017663732140279982, + "loss": 0.1429, + "step": 8099 + }, + { + "epoch": 0.5844366679894657, + "grad_norm": 0.1250694841146469, + "learning_rate": 0.00017663443498340311, + "loss": 0.1173, + "step": 8100 + }, + { + "epoch": 0.5845088206645261, + "grad_norm": 0.14011278748512268, + "learning_rate": 0.00017663154856400638, + "loss": 0.134, + "step": 8101 + }, + { + "epoch": 0.5845809733395866, + "grad_norm": 0.12558667361736298, + "learning_rate": 0.0001766286621446096, + "loss": 0.1352, + "step": 8102 + }, + { + "epoch": 0.584653126014647, + "grad_norm": 0.14681829512119293, + "learning_rate": 0.00017662577572521288, + "loss": 0.1809, + "step": 8103 + }, + { + "epoch": 0.5847252786897075, + "grad_norm": 0.13540738821029663, + "learning_rate": 0.00017662288930581614, + "loss": 0.1622, + "step": 8104 + }, + { + "epoch": 0.5847974313647678, + "grad_norm": 0.13094480335712433, + "learning_rate": 0.0001766200028864194, + "loss": 0.1603, + "step": 8105 + }, + { + "epoch": 0.5848695840398282, + "grad_norm": 0.14726389944553375, + "learning_rate": 0.00017661711646702266, + "loss": 0.1949, + "step": 8106 + }, + { + "epoch": 0.5849417367148887, + "grad_norm": 0.1353321075439453, + "learning_rate": 0.00017661423004762593, + "loss": 0.1302, + "step": 8107 + }, + { + "epoch": 0.5850138893899491, + "grad_norm": 0.13002312183380127, + "learning_rate": 0.0001766113436282292, + "loss": 0.1433, + "step": 8108 + }, + { + "epoch": 0.5850860420650096, + "grad_norm": 0.14790894091129303, + "learning_rate": 0.00017660845720883245, + "loss": 0.1423, + "step": 8109 + }, + { + "epoch": 0.58515819474007, + "grad_norm": 0.11800483614206314, + "learning_rate": 0.00017660557078943572, + "loss": 0.1602, + "step": 8110 + }, + { + "epoch": 0.5852303474151305, + "grad_norm": 0.09049563109874725, + "learning_rate": 0.00017660268437003898, + "loss": 0.1143, + "step": 8111 + }, + { + "epoch": 0.5853025000901908, + "grad_norm": 0.10751941800117493, + "learning_rate": 0.00017659979795064224, + "loss": 0.176, + "step": 8112 + }, + { + "epoch": 0.5853746527652512, + "grad_norm": 0.10051951557397842, + "learning_rate": 0.00017659691153124548, + "loss": 0.1714, + "step": 8113 + }, + { + "epoch": 0.5854468054403117, + "grad_norm": 0.11118445545434952, + "learning_rate": 0.00017659402511184877, + "loss": 0.1561, + "step": 8114 + }, + { + "epoch": 0.5855189581153721, + "grad_norm": 0.09477000683546066, + "learning_rate": 0.00017659113869245203, + "loss": 0.1209, + "step": 8115 + }, + { + "epoch": 0.5855911107904326, + "grad_norm": 0.09346623718738556, + "learning_rate": 0.0001765882522730553, + "loss": 0.1768, + "step": 8116 + }, + { + "epoch": 0.585663263465493, + "grad_norm": 0.1038440689444542, + "learning_rate": 0.00017658536585365856, + "loss": 0.0868, + "step": 8117 + }, + { + "epoch": 0.5857354161405535, + "grad_norm": 0.12548695504665375, + "learning_rate": 0.0001765824794342618, + "loss": 0.1595, + "step": 8118 + }, + { + "epoch": 0.5858075688156138, + "grad_norm": 0.10288131237030029, + "learning_rate": 0.00017657959301486506, + "loss": 0.1708, + "step": 8119 + }, + { + "epoch": 0.5858797214906742, + "grad_norm": 0.12666648626327515, + "learning_rate": 0.00017657670659546832, + "loss": 0.1288, + "step": 8120 + }, + { + "epoch": 0.5859518741657347, + "grad_norm": 0.12510676681995392, + "learning_rate": 0.0001765738201760716, + "loss": 0.1497, + "step": 8121 + }, + { + "epoch": 0.5860240268407951, + "grad_norm": 0.13701088726520538, + "learning_rate": 0.00017657093375667487, + "loss": 0.121, + "step": 8122 + }, + { + "epoch": 0.5860961795158556, + "grad_norm": 0.1612546443939209, + "learning_rate": 0.0001765680473372781, + "loss": 0.1468, + "step": 8123 + }, + { + "epoch": 0.586168332190916, + "grad_norm": 0.1792639046907425, + "learning_rate": 0.00017656516091788137, + "loss": 0.1617, + "step": 8124 + }, + { + "epoch": 0.5862404848659764, + "grad_norm": 0.11944622546434402, + "learning_rate": 0.00017656227449848463, + "loss": 0.1339, + "step": 8125 + }, + { + "epoch": 0.5863126375410368, + "grad_norm": 0.1193036362528801, + "learning_rate": 0.0001765593880790879, + "loss": 0.1702, + "step": 8126 + }, + { + "epoch": 0.5863847902160972, + "grad_norm": 0.1043739914894104, + "learning_rate": 0.00017655650165969116, + "loss": 0.1453, + "step": 8127 + }, + { + "epoch": 0.5864569428911577, + "grad_norm": 0.08376028388738632, + "learning_rate": 0.00017655361524029442, + "loss": 0.1438, + "step": 8128 + }, + { + "epoch": 0.5865290955662181, + "grad_norm": 0.11524992436170578, + "learning_rate": 0.00017655072882089768, + "loss": 0.17, + "step": 8129 + }, + { + "epoch": 0.5866012482412786, + "grad_norm": 0.09101670235395432, + "learning_rate": 0.00017654784240150095, + "loss": 0.1545, + "step": 8130 + }, + { + "epoch": 0.586673400916339, + "grad_norm": 0.12667500972747803, + "learning_rate": 0.0001765449559821042, + "loss": 0.2263, + "step": 8131 + }, + { + "epoch": 0.5867455535913993, + "grad_norm": 0.11871380358934402, + "learning_rate": 0.00017654206956270747, + "loss": 0.174, + "step": 8132 + }, + { + "epoch": 0.5868177062664598, + "grad_norm": 0.09008299559354782, + "learning_rate": 0.00017653918314331074, + "loss": 0.1534, + "step": 8133 + }, + { + "epoch": 0.5868898589415202, + "grad_norm": 0.12811166048049927, + "learning_rate": 0.00017653629672391397, + "loss": 0.1238, + "step": 8134 + }, + { + "epoch": 0.5869620116165807, + "grad_norm": 0.12848630547523499, + "learning_rate": 0.00017653341030451724, + "loss": 0.226, + "step": 8135 + }, + { + "epoch": 0.5870341642916411, + "grad_norm": 0.11928470432758331, + "learning_rate": 0.00017653052388512053, + "loss": 0.1446, + "step": 8136 + }, + { + "epoch": 0.5871063169667016, + "grad_norm": 0.1059202253818512, + "learning_rate": 0.0001765276374657238, + "loss": 0.1667, + "step": 8137 + }, + { + "epoch": 0.587178469641762, + "grad_norm": 0.10207483917474747, + "learning_rate": 0.00017652475104632705, + "loss": 0.1564, + "step": 8138 + }, + { + "epoch": 0.5872506223168223, + "grad_norm": 0.1323583722114563, + "learning_rate": 0.0001765218646269303, + "loss": 0.1715, + "step": 8139 + }, + { + "epoch": 0.5873227749918828, + "grad_norm": 0.14230741560459137, + "learning_rate": 0.00017651897820753355, + "loss": 0.1531, + "step": 8140 + }, + { + "epoch": 0.5873949276669432, + "grad_norm": 0.17423692345619202, + "learning_rate": 0.0001765160917881368, + "loss": 0.1823, + "step": 8141 + }, + { + "epoch": 0.5874670803420037, + "grad_norm": 0.13893862068653107, + "learning_rate": 0.00017651320536874008, + "loss": 0.1552, + "step": 8142 + }, + { + "epoch": 0.5875392330170641, + "grad_norm": 0.13696856796741486, + "learning_rate": 0.00017651031894934337, + "loss": 0.1557, + "step": 8143 + }, + { + "epoch": 0.5876113856921246, + "grad_norm": 0.14471766352653503, + "learning_rate": 0.0001765074325299466, + "loss": 0.1854, + "step": 8144 + }, + { + "epoch": 0.587683538367185, + "grad_norm": 0.124466173350811, + "learning_rate": 0.00017650454611054986, + "loss": 0.168, + "step": 8145 + }, + { + "epoch": 0.5877556910422453, + "grad_norm": 0.12519100308418274, + "learning_rate": 0.00017650165969115313, + "loss": 0.144, + "step": 8146 + }, + { + "epoch": 0.5878278437173058, + "grad_norm": 0.12249601632356644, + "learning_rate": 0.0001764987732717564, + "loss": 0.1837, + "step": 8147 + }, + { + "epoch": 0.5878999963923662, + "grad_norm": 0.10754699259996414, + "learning_rate": 0.00017649588685235965, + "loss": 0.1643, + "step": 8148 + }, + { + "epoch": 0.5879721490674267, + "grad_norm": 0.12035319954156876, + "learning_rate": 0.00017649300043296292, + "loss": 0.157, + "step": 8149 + }, + { + "epoch": 0.5880443017424871, + "grad_norm": 0.12663905322551727, + "learning_rate": 0.00017649011401356618, + "loss": 0.1624, + "step": 8150 + }, + { + "epoch": 0.5881164544175476, + "grad_norm": 0.11968550831079483, + "learning_rate": 0.00017648722759416944, + "loss": 0.1206, + "step": 8151 + }, + { + "epoch": 0.588188607092608, + "grad_norm": 0.11833333969116211, + "learning_rate": 0.0001764843411747727, + "loss": 0.1622, + "step": 8152 + }, + { + "epoch": 0.5882607597676683, + "grad_norm": 0.13024753332138062, + "learning_rate": 0.00017648145475537597, + "loss": 0.1178, + "step": 8153 + }, + { + "epoch": 0.5883329124427288, + "grad_norm": 0.11432848125696182, + "learning_rate": 0.00017647856833597923, + "loss": 0.138, + "step": 8154 + }, + { + "epoch": 0.5884050651177892, + "grad_norm": 0.1305474489927292, + "learning_rate": 0.0001764756819165825, + "loss": 0.1701, + "step": 8155 + }, + { + "epoch": 0.5884772177928497, + "grad_norm": 0.10625302791595459, + "learning_rate": 0.00017647279549718573, + "loss": 0.1256, + "step": 8156 + }, + { + "epoch": 0.5885493704679101, + "grad_norm": 0.155123770236969, + "learning_rate": 0.00017646990907778902, + "loss": 0.1649, + "step": 8157 + }, + { + "epoch": 0.5886215231429706, + "grad_norm": 0.10322414338588715, + "learning_rate": 0.00017646702265839228, + "loss": 0.1806, + "step": 8158 + }, + { + "epoch": 0.588693675818031, + "grad_norm": 0.1403033435344696, + "learning_rate": 0.00017646413623899555, + "loss": 0.1803, + "step": 8159 + }, + { + "epoch": 0.5887658284930913, + "grad_norm": 0.16688910126686096, + "learning_rate": 0.0001764612498195988, + "loss": 0.1431, + "step": 8160 + }, + { + "epoch": 0.5888379811681518, + "grad_norm": 0.12197678536176682, + "learning_rate": 0.00017645836340020204, + "loss": 0.0995, + "step": 8161 + }, + { + "epoch": 0.5889101338432122, + "grad_norm": 0.12910448014736176, + "learning_rate": 0.0001764554769808053, + "loss": 0.105, + "step": 8162 + }, + { + "epoch": 0.5889822865182727, + "grad_norm": 0.12313324213027954, + "learning_rate": 0.00017645259056140857, + "loss": 0.167, + "step": 8163 + }, + { + "epoch": 0.5890544391933331, + "grad_norm": 0.14233951270580292, + "learning_rate": 0.00017644970414201186, + "loss": 0.1482, + "step": 8164 + }, + { + "epoch": 0.5891265918683936, + "grad_norm": 0.11455697566270828, + "learning_rate": 0.00017644681772261512, + "loss": 0.1856, + "step": 8165 + }, + { + "epoch": 0.589198744543454, + "grad_norm": 0.13419370353221893, + "learning_rate": 0.00017644393130321836, + "loss": 0.1675, + "step": 8166 + }, + { + "epoch": 0.5892708972185143, + "grad_norm": 0.1370537281036377, + "learning_rate": 0.00017644104488382162, + "loss": 0.1323, + "step": 8167 + }, + { + "epoch": 0.5893430498935748, + "grad_norm": 0.1115809828042984, + "learning_rate": 0.00017643815846442488, + "loss": 0.1097, + "step": 8168 + }, + { + "epoch": 0.5894152025686352, + "grad_norm": 0.13678720593452454, + "learning_rate": 0.00017643527204502815, + "loss": 0.1977, + "step": 8169 + }, + { + "epoch": 0.5894873552436957, + "grad_norm": 0.11522739380598068, + "learning_rate": 0.0001764323856256314, + "loss": 0.19, + "step": 8170 + }, + { + "epoch": 0.5895595079187561, + "grad_norm": 0.11245250701904297, + "learning_rate": 0.00017642949920623467, + "loss": 0.1756, + "step": 8171 + }, + { + "epoch": 0.5896316605938166, + "grad_norm": 0.13633395731449127, + "learning_rate": 0.00017642661278683794, + "loss": 0.1782, + "step": 8172 + }, + { + "epoch": 0.589703813268877, + "grad_norm": 0.12719576060771942, + "learning_rate": 0.0001764237263674412, + "loss": 0.1714, + "step": 8173 + }, + { + "epoch": 0.5897759659439373, + "grad_norm": 0.107512466609478, + "learning_rate": 0.00017642083994804446, + "loss": 0.1105, + "step": 8174 + }, + { + "epoch": 0.5898481186189978, + "grad_norm": 0.13118325173854828, + "learning_rate": 0.00017641795352864772, + "loss": 0.1097, + "step": 8175 + }, + { + "epoch": 0.5899202712940582, + "grad_norm": 0.1463913917541504, + "learning_rate": 0.000176415067109251, + "loss": 0.1743, + "step": 8176 + }, + { + "epoch": 0.5899924239691187, + "grad_norm": 0.1269274204969406, + "learning_rate": 0.00017641218068985422, + "loss": 0.138, + "step": 8177 + }, + { + "epoch": 0.5900645766441791, + "grad_norm": 0.12935011088848114, + "learning_rate": 0.00017640929427045751, + "loss": 0.1528, + "step": 8178 + }, + { + "epoch": 0.5901367293192396, + "grad_norm": 0.10900428891181946, + "learning_rate": 0.00017640640785106078, + "loss": 0.1439, + "step": 8179 + }, + { + "epoch": 0.5902088819942999, + "grad_norm": 0.13429750502109528, + "learning_rate": 0.00017640352143166404, + "loss": 0.1801, + "step": 8180 + }, + { + "epoch": 0.5902810346693603, + "grad_norm": 0.14954394102096558, + "learning_rate": 0.0001764006350122673, + "loss": 0.1817, + "step": 8181 + }, + { + "epoch": 0.5903531873444208, + "grad_norm": 0.13643895089626312, + "learning_rate": 0.00017639774859287054, + "loss": 0.1868, + "step": 8182 + }, + { + "epoch": 0.5904253400194812, + "grad_norm": 0.14662951231002808, + "learning_rate": 0.0001763948621734738, + "loss": 0.1059, + "step": 8183 + }, + { + "epoch": 0.5904974926945417, + "grad_norm": 0.1294011026620865, + "learning_rate": 0.00017639197575407706, + "loss": 0.1778, + "step": 8184 + }, + { + "epoch": 0.5905696453696021, + "grad_norm": 0.12653610110282898, + "learning_rate": 0.00017638908933468035, + "loss": 0.1412, + "step": 8185 + }, + { + "epoch": 0.5906417980446625, + "grad_norm": 0.12400539964437485, + "learning_rate": 0.00017638620291528362, + "loss": 0.192, + "step": 8186 + }, + { + "epoch": 0.5907139507197229, + "grad_norm": 0.0926087498664856, + "learning_rate": 0.00017638331649588685, + "loss": 0.1708, + "step": 8187 + }, + { + "epoch": 0.5907861033947833, + "grad_norm": 0.1382742077112198, + "learning_rate": 0.00017638043007649012, + "loss": 0.1597, + "step": 8188 + }, + { + "epoch": 0.5908582560698438, + "grad_norm": 0.14681148529052734, + "learning_rate": 0.00017637754365709338, + "loss": 0.1769, + "step": 8189 + }, + { + "epoch": 0.5909304087449042, + "grad_norm": 0.1179690808057785, + "learning_rate": 0.00017637465723769664, + "loss": 0.1475, + "step": 8190 + }, + { + "epoch": 0.5910025614199647, + "grad_norm": 0.10221707075834274, + "learning_rate": 0.0001763717708182999, + "loss": 0.1271, + "step": 8191 + }, + { + "epoch": 0.5910747140950251, + "grad_norm": 0.10415665060281754, + "learning_rate": 0.00017636888439890317, + "loss": 0.1735, + "step": 8192 + }, + { + "epoch": 0.5911468667700855, + "grad_norm": 0.11267755925655365, + "learning_rate": 0.00017636599797950643, + "loss": 0.1356, + "step": 8193 + }, + { + "epoch": 0.5912190194451459, + "grad_norm": 0.10448981821537018, + "learning_rate": 0.0001763631115601097, + "loss": 0.117, + "step": 8194 + }, + { + "epoch": 0.5912911721202063, + "grad_norm": 0.14107589423656464, + "learning_rate": 0.00017636022514071296, + "loss": 0.1642, + "step": 8195 + }, + { + "epoch": 0.5913633247952668, + "grad_norm": 0.12608906626701355, + "learning_rate": 0.00017635733872131622, + "loss": 0.1218, + "step": 8196 + }, + { + "epoch": 0.5914354774703272, + "grad_norm": 0.15684707462787628, + "learning_rate": 0.00017635445230191948, + "loss": 0.1486, + "step": 8197 + }, + { + "epoch": 0.5915076301453877, + "grad_norm": 0.1347835212945938, + "learning_rate": 0.00017635156588252272, + "loss": 0.1775, + "step": 8198 + }, + { + "epoch": 0.5915797828204481, + "grad_norm": 0.13035349547863007, + "learning_rate": 0.000176348679463126, + "loss": 0.1867, + "step": 8199 + }, + { + "epoch": 0.5916519354955085, + "grad_norm": 0.12555575370788574, + "learning_rate": 0.00017634579304372927, + "loss": 0.1035, + "step": 8200 + }, + { + "epoch": 0.5917240881705689, + "grad_norm": 0.13564147055149078, + "learning_rate": 0.00017634290662433253, + "loss": 0.1022, + "step": 8201 + }, + { + "epoch": 0.5917962408456293, + "grad_norm": 0.12636464834213257, + "learning_rate": 0.0001763400202049358, + "loss": 0.1745, + "step": 8202 + }, + { + "epoch": 0.5918683935206898, + "grad_norm": 0.12000800669193268, + "learning_rate": 0.00017633713378553903, + "loss": 0.1581, + "step": 8203 + }, + { + "epoch": 0.5919405461957502, + "grad_norm": 0.1053057610988617, + "learning_rate": 0.0001763342473661423, + "loss": 0.1535, + "step": 8204 + }, + { + "epoch": 0.5920126988708106, + "grad_norm": 0.11557186394929886, + "learning_rate": 0.00017633136094674556, + "loss": 0.1258, + "step": 8205 + }, + { + "epoch": 0.5920848515458711, + "grad_norm": 0.11119802296161652, + "learning_rate": 0.00017632847452734885, + "loss": 0.1633, + "step": 8206 + }, + { + "epoch": 0.5921570042209315, + "grad_norm": 0.11186131834983826, + "learning_rate": 0.0001763255881079521, + "loss": 0.1286, + "step": 8207 + }, + { + "epoch": 0.5922291568959919, + "grad_norm": 0.11805294454097748, + "learning_rate": 0.00017632270168855535, + "loss": 0.1443, + "step": 8208 + }, + { + "epoch": 0.5923013095710523, + "grad_norm": 0.1183740571141243, + "learning_rate": 0.0001763198152691586, + "loss": 0.1524, + "step": 8209 + }, + { + "epoch": 0.5923734622461128, + "grad_norm": 0.11456070095300674, + "learning_rate": 0.00017631692884976187, + "loss": 0.1872, + "step": 8210 + }, + { + "epoch": 0.5924456149211732, + "grad_norm": 0.10778361558914185, + "learning_rate": 0.00017631404243036514, + "loss": 0.1611, + "step": 8211 + }, + { + "epoch": 0.5925177675962336, + "grad_norm": 0.14650307595729828, + "learning_rate": 0.0001763111560109684, + "loss": 0.1643, + "step": 8212 + }, + { + "epoch": 0.5925899202712941, + "grad_norm": 0.10854461044073105, + "learning_rate": 0.00017630826959157166, + "loss": 0.1605, + "step": 8213 + }, + { + "epoch": 0.5926620729463545, + "grad_norm": 0.10862936824560165, + "learning_rate": 0.00017630538317217492, + "loss": 0.1706, + "step": 8214 + }, + { + "epoch": 0.5927342256214149, + "grad_norm": 0.14142727851867676, + "learning_rate": 0.0001763024967527782, + "loss": 0.1914, + "step": 8215 + }, + { + "epoch": 0.5928063782964753, + "grad_norm": 0.14297692477703094, + "learning_rate": 0.00017629961033338145, + "loss": 0.1399, + "step": 8216 + }, + { + "epoch": 0.5928785309715358, + "grad_norm": 0.0961894765496254, + "learning_rate": 0.0001762967239139847, + "loss": 0.1311, + "step": 8217 + }, + { + "epoch": 0.5929506836465962, + "grad_norm": 0.1122874915599823, + "learning_rate": 0.00017629383749458798, + "loss": 0.1667, + "step": 8218 + }, + { + "epoch": 0.5930228363216566, + "grad_norm": 0.12797664105892181, + "learning_rate": 0.0001762909510751912, + "loss": 0.1759, + "step": 8219 + }, + { + "epoch": 0.5930949889967171, + "grad_norm": 0.10802415013313293, + "learning_rate": 0.0001762880646557945, + "loss": 0.1064, + "step": 8220 + }, + { + "epoch": 0.5931671416717775, + "grad_norm": 0.11519049853086472, + "learning_rate": 0.00017628517823639777, + "loss": 0.1738, + "step": 8221 + }, + { + "epoch": 0.5932392943468379, + "grad_norm": 0.1328316330909729, + "learning_rate": 0.00017628229181700103, + "loss": 0.1477, + "step": 8222 + }, + { + "epoch": 0.5933114470218983, + "grad_norm": 0.1109108254313469, + "learning_rate": 0.0001762794053976043, + "loss": 0.1461, + "step": 8223 + }, + { + "epoch": 0.5933835996969588, + "grad_norm": 0.11819633841514587, + "learning_rate": 0.00017627651897820753, + "loss": 0.1343, + "step": 8224 + }, + { + "epoch": 0.5934557523720192, + "grad_norm": 0.12076713889837265, + "learning_rate": 0.0001762736325588108, + "loss": 0.2058, + "step": 8225 + }, + { + "epoch": 0.5935279050470796, + "grad_norm": 0.13241450488567352, + "learning_rate": 0.00017627074613941405, + "loss": 0.1176, + "step": 8226 + }, + { + "epoch": 0.5936000577221401, + "grad_norm": 0.11165394634008408, + "learning_rate": 0.00017626785972001734, + "loss": 0.1765, + "step": 8227 + }, + { + "epoch": 0.5936722103972005, + "grad_norm": 0.1249672994017601, + "learning_rate": 0.0001762649733006206, + "loss": 0.0977, + "step": 8228 + }, + { + "epoch": 0.5937443630722609, + "grad_norm": 0.1294601857662201, + "learning_rate": 0.00017626208688122384, + "loss": 0.1549, + "step": 8229 + }, + { + "epoch": 0.5938165157473213, + "grad_norm": 0.1573733389377594, + "learning_rate": 0.0001762592004618271, + "loss": 0.2001, + "step": 8230 + }, + { + "epoch": 0.5938886684223817, + "grad_norm": 0.12269971519708633, + "learning_rate": 0.00017625631404243037, + "loss": 0.1411, + "step": 8231 + }, + { + "epoch": 0.5939608210974422, + "grad_norm": 0.10967043042182922, + "learning_rate": 0.00017625342762303363, + "loss": 0.1858, + "step": 8232 + }, + { + "epoch": 0.5940329737725026, + "grad_norm": 0.10778573900461197, + "learning_rate": 0.0001762505412036369, + "loss": 0.1595, + "step": 8233 + }, + { + "epoch": 0.5941051264475631, + "grad_norm": 0.13718152046203613, + "learning_rate": 0.00017624765478424016, + "loss": 0.1512, + "step": 8234 + }, + { + "epoch": 0.5941772791226235, + "grad_norm": 0.11807660013437271, + "learning_rate": 0.00017624476836484342, + "loss": 0.1529, + "step": 8235 + }, + { + "epoch": 0.5942494317976839, + "grad_norm": 0.13162270188331604, + "learning_rate": 0.00017624188194544668, + "loss": 0.1738, + "step": 8236 + }, + { + "epoch": 0.5943215844727443, + "grad_norm": 0.09376419335603714, + "learning_rate": 0.00017623899552604994, + "loss": 0.0917, + "step": 8237 + }, + { + "epoch": 0.5943937371478047, + "grad_norm": 0.10288868844509125, + "learning_rate": 0.0001762361091066532, + "loss": 0.127, + "step": 8238 + }, + { + "epoch": 0.5944658898228652, + "grad_norm": 0.11284645646810532, + "learning_rate": 0.00017623322268725647, + "loss": 0.1761, + "step": 8239 + }, + { + "epoch": 0.5945380424979256, + "grad_norm": 0.09281051903963089, + "learning_rate": 0.0001762303362678597, + "loss": 0.103, + "step": 8240 + }, + { + "epoch": 0.5946101951729861, + "grad_norm": 0.10831299424171448, + "learning_rate": 0.000176227449848463, + "loss": 0.1427, + "step": 8241 + }, + { + "epoch": 0.5946823478480464, + "grad_norm": 0.13449856638908386, + "learning_rate": 0.00017622456342906626, + "loss": 0.1633, + "step": 8242 + }, + { + "epoch": 0.5947545005231069, + "grad_norm": 0.13355782628059387, + "learning_rate": 0.00017622167700966952, + "loss": 0.1517, + "step": 8243 + }, + { + "epoch": 0.5948266531981673, + "grad_norm": 0.13006524741649628, + "learning_rate": 0.00017621879059027279, + "loss": 0.1852, + "step": 8244 + }, + { + "epoch": 0.5948988058732277, + "grad_norm": 0.11725495010614395, + "learning_rate": 0.00017621590417087602, + "loss": 0.1499, + "step": 8245 + }, + { + "epoch": 0.5949709585482882, + "grad_norm": 0.12848377227783203, + "learning_rate": 0.00017621301775147928, + "loss": 0.1685, + "step": 8246 + }, + { + "epoch": 0.5950431112233486, + "grad_norm": 0.1165483221411705, + "learning_rate": 0.00017621013133208255, + "loss": 0.1683, + "step": 8247 + }, + { + "epoch": 0.5951152638984091, + "grad_norm": 0.11599836498498917, + "learning_rate": 0.00017620724491268584, + "loss": 0.2006, + "step": 8248 + }, + { + "epoch": 0.5951874165734694, + "grad_norm": 0.12383638322353363, + "learning_rate": 0.0001762043584932891, + "loss": 0.1609, + "step": 8249 + }, + { + "epoch": 0.5952595692485299, + "grad_norm": 0.12446649372577667, + "learning_rate": 0.00017620147207389234, + "loss": 0.1178, + "step": 8250 + }, + { + "epoch": 0.5953317219235903, + "grad_norm": 0.11396703869104385, + "learning_rate": 0.0001761985856544956, + "loss": 0.186, + "step": 8251 + }, + { + "epoch": 0.5954038745986507, + "grad_norm": 0.11467541754245758, + "learning_rate": 0.00017619569923509886, + "loss": 0.155, + "step": 8252 + }, + { + "epoch": 0.5954760272737112, + "grad_norm": 0.26076146960258484, + "learning_rate": 0.00017619281281570212, + "loss": 0.1501, + "step": 8253 + }, + { + "epoch": 0.5955481799487716, + "grad_norm": 0.12669560313224792, + "learning_rate": 0.0001761899263963054, + "loss": 0.1418, + "step": 8254 + }, + { + "epoch": 0.5956203326238321, + "grad_norm": 0.12748664617538452, + "learning_rate": 0.00017618703997690865, + "loss": 0.1941, + "step": 8255 + }, + { + "epoch": 0.5956924852988924, + "grad_norm": 0.18919895589351654, + "learning_rate": 0.0001761841535575119, + "loss": 0.1552, + "step": 8256 + }, + { + "epoch": 0.5957646379739528, + "grad_norm": Infinity, + "learning_rate": 0.0001761841535575119, + "loss": 0.2004, + "step": 8257 + }, + { + "epoch": 0.5958367906490133, + "grad_norm": 0.12094852328300476, + "learning_rate": 0.00017618126713811518, + "loss": 0.1832, + "step": 8258 + }, + { + "epoch": 0.5959089433240737, + "grad_norm": 0.1229955181479454, + "learning_rate": 0.00017617838071871844, + "loss": 0.0837, + "step": 8259 + }, + { + "epoch": 0.5959810959991342, + "grad_norm": 0.11439645290374756, + "learning_rate": 0.0001761754942993217, + "loss": 0.1419, + "step": 8260 + }, + { + "epoch": 0.5960532486741946, + "grad_norm": 0.12709204852581024, + "learning_rate": 0.00017617260787992496, + "loss": 0.1911, + "step": 8261 + }, + { + "epoch": 0.5961254013492551, + "grad_norm": 0.13323001563549042, + "learning_rate": 0.00017616972146052823, + "loss": 0.2032, + "step": 8262 + }, + { + "epoch": 0.5961975540243154, + "grad_norm": 0.12606051564216614, + "learning_rate": 0.0001761668350411315, + "loss": 0.1947, + "step": 8263 + }, + { + "epoch": 0.5962697066993758, + "grad_norm": 0.11966247111558914, + "learning_rate": 0.00017616394862173475, + "loss": 0.091, + "step": 8264 + }, + { + "epoch": 0.5963418593744363, + "grad_norm": 0.12878213822841644, + "learning_rate": 0.00017616106220233802, + "loss": 0.1934, + "step": 8265 + }, + { + "epoch": 0.5964140120494967, + "grad_norm": 0.17161764204502106, + "learning_rate": 0.00017615817578294128, + "loss": 0.1866, + "step": 8266 + }, + { + "epoch": 0.5964861647245572, + "grad_norm": 0.12299740314483643, + "learning_rate": 0.00017615528936354454, + "loss": 0.135, + "step": 8267 + }, + { + "epoch": 0.5965583173996176, + "grad_norm": 0.11680129915475845, + "learning_rate": 0.00017615240294414778, + "loss": 0.1201, + "step": 8268 + }, + { + "epoch": 0.5966304700746781, + "grad_norm": 0.1203378289937973, + "learning_rate": 0.00017614951652475104, + "loss": 0.1124, + "step": 8269 + }, + { + "epoch": 0.5967026227497384, + "grad_norm": 0.11593905836343765, + "learning_rate": 0.00017614663010535433, + "loss": 0.1568, + "step": 8270 + }, + { + "epoch": 0.5967747754247988, + "grad_norm": 0.10731831938028336, + "learning_rate": 0.0001761437436859576, + "loss": 0.1897, + "step": 8271 + }, + { + "epoch": 0.5968469280998593, + "grad_norm": 0.13110333681106567, + "learning_rate": 0.00017614085726656086, + "loss": 0.1156, + "step": 8272 + }, + { + "epoch": 0.5969190807749197, + "grad_norm": Infinity, + "learning_rate": 0.00017614085726656086, + "loss": 0.2281, + "step": 8273 + }, + { + "epoch": 0.5969912334499802, + "grad_norm": 0.11687818914651871, + "learning_rate": 0.0001761379708471641, + "loss": 0.1353, + "step": 8274 + }, + { + "epoch": 0.5970633861250406, + "grad_norm": 0.13382591307163239, + "learning_rate": 0.00017613508442776736, + "loss": 0.181, + "step": 8275 + }, + { + "epoch": 0.5971355388001011, + "grad_norm": 0.12436043471097946, + "learning_rate": 0.00017613219800837062, + "loss": 0.1696, + "step": 8276 + }, + { + "epoch": 0.5972076914751614, + "grad_norm": 0.13329434394836426, + "learning_rate": 0.00017612931158897388, + "loss": 0.1778, + "step": 8277 + }, + { + "epoch": 0.5972798441502218, + "grad_norm": 0.12108474224805832, + "learning_rate": 0.00017612642516957717, + "loss": 0.1643, + "step": 8278 + }, + { + "epoch": 0.5973519968252823, + "grad_norm": Infinity, + "learning_rate": 0.00017612642516957717, + "loss": 0.1723, + "step": 8279 + }, + { + "epoch": 0.5974241495003427, + "grad_norm": 0.10556795448064804, + "learning_rate": 0.0001761235387501804, + "loss": 0.147, + "step": 8280 + }, + { + "epoch": 0.5974963021754032, + "grad_norm": 0.14981569349765778, + "learning_rate": 0.00017612065233078367, + "loss": 0.1644, + "step": 8281 + }, + { + "epoch": 0.5975684548504636, + "grad_norm": 0.13146357238292694, + "learning_rate": 0.00017611776591138693, + "loss": 0.1755, + "step": 8282 + }, + { + "epoch": 0.5976406075255241, + "grad_norm": 1.4089605808258057, + "learning_rate": 0.0001761148794919902, + "loss": 0.1624, + "step": 8283 + }, + { + "epoch": 0.5977127602005844, + "grad_norm": 0.11864148825407028, + "learning_rate": 0.00017611199307259346, + "loss": 0.113, + "step": 8284 + }, + { + "epoch": 0.5977849128756448, + "grad_norm": 0.14449827373027802, + "learning_rate": 0.00017610910665319672, + "loss": 0.1887, + "step": 8285 + }, + { + "epoch": 0.5978570655507053, + "grad_norm": 0.17353913187980652, + "learning_rate": 0.00017610622023379998, + "loss": 0.1691, + "step": 8286 + }, + { + "epoch": 0.5979292182257657, + "grad_norm": 0.13227996230125427, + "learning_rate": 0.00017610333381440325, + "loss": 0.1468, + "step": 8287 + }, + { + "epoch": 0.5980013709008262, + "grad_norm": 0.12186557799577713, + "learning_rate": 0.0001761004473950065, + "loss": 0.152, + "step": 8288 + }, + { + "epoch": 0.5980735235758866, + "grad_norm": 0.11306702345609665, + "learning_rate": 0.00017609756097560977, + "loss": 0.1818, + "step": 8289 + }, + { + "epoch": 0.5981456762509471, + "grad_norm": 0.09941356629133224, + "learning_rate": 0.00017609467455621304, + "loss": 0.1653, + "step": 8290 + }, + { + "epoch": 0.5982178289260074, + "grad_norm": 0.10904739052057266, + "learning_rate": 0.00017609178813681627, + "loss": 0.1048, + "step": 8291 + }, + { + "epoch": 0.5982899816010678, + "grad_norm": 0.11784323304891586, + "learning_rate": 0.00017608890171741954, + "loss": 0.1864, + "step": 8292 + }, + { + "epoch": 0.5983621342761283, + "grad_norm": 0.10877453535795212, + "learning_rate": 0.00017608601529802283, + "loss": 0.1739, + "step": 8293 + }, + { + "epoch": 0.5984342869511887, + "grad_norm": 0.11221017688512802, + "learning_rate": 0.0001760831288786261, + "loss": 0.1518, + "step": 8294 + }, + { + "epoch": 0.5985064396262492, + "grad_norm": 0.11134081333875656, + "learning_rate": 0.00017608024245922935, + "loss": 0.1703, + "step": 8295 + }, + { + "epoch": 0.5985785923013096, + "grad_norm": 0.15727955102920532, + "learning_rate": 0.0001760773560398326, + "loss": 0.1345, + "step": 8296 + }, + { + "epoch": 0.59865074497637, + "grad_norm": 0.15842805802822113, + "learning_rate": 0.00017607446962043585, + "loss": 0.1875, + "step": 8297 + }, + { + "epoch": 0.5987228976514304, + "grad_norm": 0.1713588535785675, + "learning_rate": 0.0001760715832010391, + "loss": 0.1926, + "step": 8298 + }, + { + "epoch": 0.5987950503264908, + "grad_norm": 0.13431398570537567, + "learning_rate": 0.00017606869678164238, + "loss": 0.1158, + "step": 8299 + }, + { + "epoch": 0.5988672030015513, + "grad_norm": 0.14271295070648193, + "learning_rate": 0.00017606581036224567, + "loss": 0.1277, + "step": 8300 + }, + { + "epoch": 0.5989393556766117, + "grad_norm": 0.16294990479946136, + "learning_rate": 0.0001760629239428489, + "loss": 0.2188, + "step": 8301 + }, + { + "epoch": 0.5990115083516722, + "grad_norm": 0.14524459838867188, + "learning_rate": 0.00017606003752345216, + "loss": 0.1834, + "step": 8302 + }, + { + "epoch": 0.5990836610267326, + "grad_norm": 0.14850306510925293, + "learning_rate": 0.00017605715110405543, + "loss": 0.1632, + "step": 8303 + }, + { + "epoch": 0.5991558137017929, + "grad_norm": 0.152170330286026, + "learning_rate": 0.0001760542646846587, + "loss": 0.1828, + "step": 8304 + }, + { + "epoch": 0.5992279663768534, + "grad_norm": 0.13063561916351318, + "learning_rate": 0.00017605137826526195, + "loss": 0.0932, + "step": 8305 + }, + { + "epoch": 0.5993001190519138, + "grad_norm": 0.1113244891166687, + "learning_rate": 0.00017604849184586522, + "loss": 0.1761, + "step": 8306 + }, + { + "epoch": 0.5993722717269743, + "grad_norm": 0.18196213245391846, + "learning_rate": 0.00017604560542646848, + "loss": 0.1519, + "step": 8307 + }, + { + "epoch": 0.5994444244020347, + "grad_norm": 0.39838525652885437, + "learning_rate": 0.00017604271900707174, + "loss": 0.1334, + "step": 8308 + }, + { + "epoch": 0.5995165770770952, + "grad_norm": 0.13521502912044525, + "learning_rate": 0.000176039832587675, + "loss": 0.1795, + "step": 8309 + }, + { + "epoch": 0.5995887297521556, + "grad_norm": 0.10476474463939667, + "learning_rate": 0.00017603694616827827, + "loss": 0.1904, + "step": 8310 + }, + { + "epoch": 0.5996608824272159, + "grad_norm": 0.10485932976007462, + "learning_rate": 0.00017603405974888153, + "loss": 0.1499, + "step": 8311 + }, + { + "epoch": 0.5997330351022764, + "grad_norm": 0.1285116970539093, + "learning_rate": 0.00017603117332948477, + "loss": 0.1108, + "step": 8312 + }, + { + "epoch": 0.5998051877773368, + "grad_norm": 0.49555107951164246, + "learning_rate": 0.00017602828691008803, + "loss": 0.1378, + "step": 8313 + }, + { + "epoch": 0.5998773404523973, + "grad_norm": 0.1335631012916565, + "learning_rate": 0.00017602540049069132, + "loss": 0.1742, + "step": 8314 + }, + { + "epoch": 0.5999494931274577, + "grad_norm": 0.17374007403850555, + "learning_rate": 0.00017602251407129458, + "loss": 0.1292, + "step": 8315 + }, + { + "epoch": 0.6000216458025182, + "grad_norm": 0.11139317601919174, + "learning_rate": 0.00017601962765189785, + "loss": 0.153, + "step": 8316 + }, + { + "epoch": 0.6000937984775786, + "grad_norm": 0.1253328174352646, + "learning_rate": 0.00017601674123250108, + "loss": 0.1928, + "step": 8317 + }, + { + "epoch": 0.6001659511526389, + "grad_norm": 0.17890290915966034, + "learning_rate": 0.00017601385481310434, + "loss": 0.1429, + "step": 8318 + }, + { + "epoch": 0.6002381038276994, + "grad_norm": 0.15463121235370636, + "learning_rate": 0.0001760109683937076, + "loss": 0.1566, + "step": 8319 + }, + { + "epoch": 0.6003102565027598, + "grad_norm": 0.13858912885189056, + "learning_rate": 0.00017600808197431087, + "loss": 0.1606, + "step": 8320 + }, + { + "epoch": 0.6003824091778203, + "grad_norm": 0.17602328956127167, + "learning_rate": 0.00017600519555491416, + "loss": 0.165, + "step": 8321 + }, + { + "epoch": 0.6004545618528807, + "grad_norm": 0.1393021047115326, + "learning_rate": 0.0001760023091355174, + "loss": 0.1929, + "step": 8322 + }, + { + "epoch": 0.6005267145279412, + "grad_norm": 0.14253228902816772, + "learning_rate": 0.00017599942271612066, + "loss": 0.1145, + "step": 8323 + }, + { + "epoch": 0.6005988672030016, + "grad_norm": 0.15090863406658173, + "learning_rate": 0.00017599653629672392, + "loss": 0.1349, + "step": 8324 + }, + { + "epoch": 0.6006710198780619, + "grad_norm": 0.5086891055107117, + "learning_rate": 0.00017599364987732718, + "loss": 0.1871, + "step": 8325 + }, + { + "epoch": 0.6007431725531224, + "grad_norm": 0.11092958599328995, + "learning_rate": 0.00017599076345793045, + "loss": 0.1257, + "step": 8326 + }, + { + "epoch": 0.6008153252281828, + "grad_norm": 0.1473458707332611, + "learning_rate": 0.0001759878770385337, + "loss": 0.1264, + "step": 8327 + }, + { + "epoch": 0.6008874779032433, + "grad_norm": 0.14060139656066895, + "learning_rate": 0.00017598499061913695, + "loss": 0.1595, + "step": 8328 + }, + { + "epoch": 0.6009596305783037, + "grad_norm": 0.14718382060527802, + "learning_rate": 0.00017598210419974024, + "loss": 0.106, + "step": 8329 + }, + { + "epoch": 0.6010317832533641, + "grad_norm": 0.14671294391155243, + "learning_rate": 0.0001759792177803435, + "loss": 0.1141, + "step": 8330 + }, + { + "epoch": 0.6011039359284246, + "grad_norm": 0.17073041200637817, + "learning_rate": 0.00017597633136094676, + "loss": 0.171, + "step": 8331 + }, + { + "epoch": 0.6011760886034849, + "grad_norm": 0.1408713310956955, + "learning_rate": 0.00017597344494155003, + "loss": 0.1605, + "step": 8332 + }, + { + "epoch": 0.6012482412785454, + "grad_norm": 0.1482093334197998, + "learning_rate": 0.00017597055852215326, + "loss": 0.1601, + "step": 8333 + }, + { + "epoch": 0.6013203939536058, + "grad_norm": 0.14968480169773102, + "learning_rate": 0.00017596767210275652, + "loss": 0.1451, + "step": 8334 + }, + { + "epoch": 0.6013925466286663, + "grad_norm": 0.15854620933532715, + "learning_rate": 0.0001759647856833598, + "loss": 0.1609, + "step": 8335 + }, + { + "epoch": 0.6014646993037267, + "grad_norm": 0.1588122546672821, + "learning_rate": 0.00017596189926396308, + "loss": 0.1193, + "step": 8336 + }, + { + "epoch": 0.6015368519787871, + "grad_norm": 0.12753252685070038, + "learning_rate": 0.00017595901284456634, + "loss": 0.1469, + "step": 8337 + }, + { + "epoch": 0.6016090046538476, + "grad_norm": 0.17234788835048676, + "learning_rate": 0.00017595612642516958, + "loss": 0.1591, + "step": 8338 + }, + { + "epoch": 0.6016811573289079, + "grad_norm": 0.12896670401096344, + "learning_rate": 0.00017595324000577284, + "loss": 0.0929, + "step": 8339 + }, + { + "epoch": 0.6017533100039684, + "grad_norm": 0.11914967000484467, + "learning_rate": 0.0001759503535863761, + "loss": 0.2071, + "step": 8340 + }, + { + "epoch": 0.6018254626790288, + "grad_norm": 0.15577170252799988, + "learning_rate": 0.00017594746716697936, + "loss": 0.1752, + "step": 8341 + }, + { + "epoch": 0.6018976153540893, + "grad_norm": 0.16357608139514923, + "learning_rate": 0.00017594458074758263, + "loss": 0.138, + "step": 8342 + }, + { + "epoch": 0.6019697680291497, + "grad_norm": 0.13914862275123596, + "learning_rate": 0.0001759416943281859, + "loss": 0.1418, + "step": 8343 + }, + { + "epoch": 0.6020419207042101, + "grad_norm": 0.14447425305843353, + "learning_rate": 0.00017593880790878915, + "loss": 0.1567, + "step": 8344 + }, + { + "epoch": 0.6021140733792706, + "grad_norm": 0.15893866121768951, + "learning_rate": 0.00017593592148939242, + "loss": 0.1409, + "step": 8345 + }, + { + "epoch": 0.6021862260543309, + "grad_norm": 0.13015730679035187, + "learning_rate": 0.00017593303506999568, + "loss": 0.1905, + "step": 8346 + }, + { + "epoch": 0.6022583787293914, + "grad_norm": 0.12765511870384216, + "learning_rate": 0.00017593014865059894, + "loss": 0.1418, + "step": 8347 + }, + { + "epoch": 0.6023305314044518, + "grad_norm": 0.16375190019607544, + "learning_rate": 0.0001759272622312022, + "loss": 0.2088, + "step": 8348 + }, + { + "epoch": 0.6024026840795123, + "grad_norm": 0.12868079543113708, + "learning_rate": 0.00017592437581180544, + "loss": 0.1422, + "step": 8349 + }, + { + "epoch": 0.6024748367545727, + "grad_norm": 0.126992329955101, + "learning_rate": 0.00017592148939240873, + "loss": 0.211, + "step": 8350 + }, + { + "epoch": 0.6025469894296331, + "grad_norm": 0.11882653087377548, + "learning_rate": 0.000175918602973012, + "loss": 0.1504, + "step": 8351 + }, + { + "epoch": 0.6026191421046936, + "grad_norm": 0.13179273903369904, + "learning_rate": 0.00017591571655361526, + "loss": 0.1606, + "step": 8352 + }, + { + "epoch": 0.6026912947797539, + "grad_norm": 0.15020805597305298, + "learning_rate": 0.00017591283013421852, + "loss": 0.1343, + "step": 8353 + }, + { + "epoch": 0.6027634474548144, + "grad_norm": 0.13423669338226318, + "learning_rate": 0.00017590994371482176, + "loss": 0.1468, + "step": 8354 + }, + { + "epoch": 0.6028356001298748, + "grad_norm": 0.11200478672981262, + "learning_rate": 0.00017590705729542502, + "loss": 0.1351, + "step": 8355 + }, + { + "epoch": 0.6029077528049352, + "grad_norm": 0.11784103512763977, + "learning_rate": 0.00017590417087602828, + "loss": 0.1908, + "step": 8356 + }, + { + "epoch": 0.6029799054799957, + "grad_norm": 0.1697710007429123, + "learning_rate": 0.00017590128445663157, + "loss": 0.1684, + "step": 8357 + }, + { + "epoch": 0.6030520581550561, + "grad_norm": 0.14489956200122833, + "learning_rate": 0.00017589839803723483, + "loss": 0.1406, + "step": 8358 + }, + { + "epoch": 0.6031242108301166, + "grad_norm": 0.16728362441062927, + "learning_rate": 0.00017589551161783807, + "loss": 0.1702, + "step": 8359 + }, + { + "epoch": 0.6031963635051769, + "grad_norm": 0.15663135051727295, + "learning_rate": 0.00017589262519844133, + "loss": 0.1406, + "step": 8360 + }, + { + "epoch": 0.6032685161802374, + "grad_norm": 0.11991741508245468, + "learning_rate": 0.0001758897387790446, + "loss": 0.1781, + "step": 8361 + }, + { + "epoch": 0.6033406688552978, + "grad_norm": 0.12316525727510452, + "learning_rate": 0.00017588685235964786, + "loss": 0.1411, + "step": 8362 + }, + { + "epoch": 0.6034128215303582, + "grad_norm": 0.1290789693593979, + "learning_rate": 0.00017588396594025112, + "loss": 0.2423, + "step": 8363 + }, + { + "epoch": 0.6034849742054187, + "grad_norm": 0.12257073819637299, + "learning_rate": 0.00017588107952085438, + "loss": 0.0995, + "step": 8364 + }, + { + "epoch": 0.6035571268804791, + "grad_norm": 0.15544748306274414, + "learning_rate": 0.00017587819310145765, + "loss": 0.1324, + "step": 8365 + }, + { + "epoch": 0.6036292795555395, + "grad_norm": 0.1269799917936325, + "learning_rate": 0.0001758753066820609, + "loss": 0.1462, + "step": 8366 + }, + { + "epoch": 0.6037014322305999, + "grad_norm": 0.1171223595738411, + "learning_rate": 0.00017587242026266417, + "loss": 0.1556, + "step": 8367 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.13106483221054077, + "learning_rate": 0.00017586953384326744, + "loss": 0.1869, + "step": 8368 + }, + { + "epoch": 0.6038457375807208, + "grad_norm": 0.13471928238868713, + "learning_rate": 0.0001758666474238707, + "loss": 0.1953, + "step": 8369 + }, + { + "epoch": 0.6039178902557812, + "grad_norm": 0.11581786721944809, + "learning_rate": 0.00017586376100447393, + "loss": 0.16, + "step": 8370 + }, + { + "epoch": 0.6039900429308417, + "grad_norm": 0.12391266226768494, + "learning_rate": 0.00017586087458507722, + "loss": 0.1026, + "step": 8371 + }, + { + "epoch": 0.6040621956059021, + "grad_norm": 0.1501830667257309, + "learning_rate": 0.0001758579881656805, + "loss": 0.1543, + "step": 8372 + }, + { + "epoch": 0.6041343482809625, + "grad_norm": 0.13379184901714325, + "learning_rate": 0.00017585510174628375, + "loss": 0.172, + "step": 8373 + }, + { + "epoch": 0.6042065009560229, + "grad_norm": 0.1036381721496582, + "learning_rate": 0.00017585221532688701, + "loss": 0.1869, + "step": 8374 + }, + { + "epoch": 0.6042786536310834, + "grad_norm": 0.14931990206241608, + "learning_rate": 0.00017584932890749025, + "loss": 0.1331, + "step": 8375 + }, + { + "epoch": 0.6043508063061438, + "grad_norm": 0.13077418506145477, + "learning_rate": 0.0001758464424880935, + "loss": 0.1143, + "step": 8376 + }, + { + "epoch": 0.6044229589812042, + "grad_norm": 0.13980217278003693, + "learning_rate": 0.00017584355606869678, + "loss": 0.1828, + "step": 8377 + }, + { + "epoch": 0.6044951116562647, + "grad_norm": 0.13169902563095093, + "learning_rate": 0.00017584066964930007, + "loss": 0.1714, + "step": 8378 + }, + { + "epoch": 0.6045672643313251, + "grad_norm": 0.11600630730390549, + "learning_rate": 0.00017583778322990333, + "loss": 0.1674, + "step": 8379 + }, + { + "epoch": 0.6046394170063855, + "grad_norm": 0.12073885649442673, + "learning_rate": 0.00017583489681050656, + "loss": 0.1442, + "step": 8380 + }, + { + "epoch": 0.6047115696814459, + "grad_norm": 0.13687610626220703, + "learning_rate": 0.00017583201039110983, + "loss": 0.1384, + "step": 8381 + }, + { + "epoch": 0.6047837223565063, + "grad_norm": 0.21396295726299286, + "learning_rate": 0.0001758291239717131, + "loss": 0.1329, + "step": 8382 + }, + { + "epoch": 0.6048558750315668, + "grad_norm": 0.13280992209911346, + "learning_rate": 0.00017582623755231635, + "loss": 0.1466, + "step": 8383 + }, + { + "epoch": 0.6049280277066272, + "grad_norm": 0.16064168512821198, + "learning_rate": 0.00017582335113291962, + "loss": 0.1553, + "step": 8384 + }, + { + "epoch": 0.6050001803816877, + "grad_norm": 0.13958929479122162, + "learning_rate": 0.00017582046471352288, + "loss": 0.197, + "step": 8385 + }, + { + "epoch": 0.6050723330567481, + "grad_norm": 0.12864023447036743, + "learning_rate": 0.00017581757829412614, + "loss": 0.1839, + "step": 8386 + }, + { + "epoch": 0.6051444857318085, + "grad_norm": 0.13165269792079926, + "learning_rate": 0.0001758146918747294, + "loss": 0.1637, + "step": 8387 + }, + { + "epoch": 0.6052166384068689, + "grad_norm": 0.12166374921798706, + "learning_rate": 0.00017581180545533267, + "loss": 0.1869, + "step": 8388 + }, + { + "epoch": 0.6052887910819293, + "grad_norm": 0.17785470187664032, + "learning_rate": 0.00017580891903593593, + "loss": 0.2323, + "step": 8389 + }, + { + "epoch": 0.6053609437569898, + "grad_norm": 0.11101136356592178, + "learning_rate": 0.0001758060326165392, + "loss": 0.1265, + "step": 8390 + }, + { + "epoch": 0.6054330964320502, + "grad_norm": 0.13086219131946564, + "learning_rate": 0.00017580314619714246, + "loss": 0.1322, + "step": 8391 + }, + { + "epoch": 0.6055052491071107, + "grad_norm": 0.11813119053840637, + "learning_rate": 0.00017580025977774572, + "loss": 0.1202, + "step": 8392 + }, + { + "epoch": 0.6055774017821711, + "grad_norm": 0.12252860516309738, + "learning_rate": 0.00017579737335834898, + "loss": 0.1369, + "step": 8393 + }, + { + "epoch": 0.6056495544572315, + "grad_norm": 0.14627180993556976, + "learning_rate": 0.00017579448693895224, + "loss": 0.1393, + "step": 8394 + }, + { + "epoch": 0.6057217071322919, + "grad_norm": 0.14379194378852844, + "learning_rate": 0.0001757916005195555, + "loss": 0.2139, + "step": 8395 + }, + { + "epoch": 0.6057938598073523, + "grad_norm": 0.15863902866840363, + "learning_rate": 0.00017578871410015877, + "loss": 0.1719, + "step": 8396 + }, + { + "epoch": 0.6058660124824128, + "grad_norm": 0.13218453526496887, + "learning_rate": 0.000175785827680762, + "loss": 0.0966, + "step": 8397 + }, + { + "epoch": 0.6059381651574732, + "grad_norm": 0.12433724105358124, + "learning_rate": 0.00017578294126136527, + "loss": 0.1607, + "step": 8398 + }, + { + "epoch": 0.6060103178325337, + "grad_norm": 0.1300356090068817, + "learning_rate": 0.00017578005484196856, + "loss": 0.1506, + "step": 8399 + }, + { + "epoch": 0.6060824705075941, + "grad_norm": 0.11802296340465546, + "learning_rate": 0.00017577716842257182, + "loss": 0.148, + "step": 8400 + }, + { + "epoch": 0.6061546231826545, + "grad_norm": 0.10881169885396957, + "learning_rate": 0.00017577428200317509, + "loss": 0.1698, + "step": 8401 + }, + { + "epoch": 0.6062267758577149, + "grad_norm": 0.1484992504119873, + "learning_rate": 0.00017577139558377832, + "loss": 0.1707, + "step": 8402 + }, + { + "epoch": 0.6062989285327753, + "grad_norm": 0.15537378191947937, + "learning_rate": 0.00017576850916438158, + "loss": 0.1942, + "step": 8403 + }, + { + "epoch": 0.6063710812078358, + "grad_norm": 0.15242907404899597, + "learning_rate": 0.00017576562274498485, + "loss": 0.1153, + "step": 8404 + }, + { + "epoch": 0.6064432338828962, + "grad_norm": 0.12700161337852478, + "learning_rate": 0.0001757627363255881, + "loss": 0.1304, + "step": 8405 + }, + { + "epoch": 0.6065153865579567, + "grad_norm": 0.1762162744998932, + "learning_rate": 0.0001757598499061914, + "loss": 0.138, + "step": 8406 + }, + { + "epoch": 0.6065875392330171, + "grad_norm": 0.13309970498085022, + "learning_rate": 0.00017575696348679464, + "loss": 0.1119, + "step": 8407 + }, + { + "epoch": 0.6066596919080774, + "grad_norm": 0.123813696205616, + "learning_rate": 0.0001757540770673979, + "loss": 0.1493, + "step": 8408 + }, + { + "epoch": 0.6067318445831379, + "grad_norm": 0.11938238888978958, + "learning_rate": 0.00017575119064800116, + "loss": 0.1189, + "step": 8409 + }, + { + "epoch": 0.6068039972581983, + "grad_norm": 0.13266606628894806, + "learning_rate": 0.00017574830422860442, + "loss": 0.1454, + "step": 8410 + }, + { + "epoch": 0.6068761499332588, + "grad_norm": 0.14781904220581055, + "learning_rate": 0.0001757454178092077, + "loss": 0.1694, + "step": 8411 + }, + { + "epoch": 0.6069483026083192, + "grad_norm": 0.10925555229187012, + "learning_rate": 0.00017574253138981095, + "loss": 0.1823, + "step": 8412 + }, + { + "epoch": 0.6070204552833797, + "grad_norm": 0.12646707892417908, + "learning_rate": 0.0001757396449704142, + "loss": 0.159, + "step": 8413 + }, + { + "epoch": 0.6070926079584401, + "grad_norm": 0.1189049780368805, + "learning_rate": 0.00017573675855101748, + "loss": 0.1093, + "step": 8414 + }, + { + "epoch": 0.6071647606335004, + "grad_norm": 0.11102983355522156, + "learning_rate": 0.00017573387213162074, + "loss": 0.1654, + "step": 8415 + }, + { + "epoch": 0.6072369133085609, + "grad_norm": 0.13493704795837402, + "learning_rate": 0.000175730985712224, + "loss": 0.0933, + "step": 8416 + }, + { + "epoch": 0.6073090659836213, + "grad_norm": 0.14110951125621796, + "learning_rate": 0.00017572809929282727, + "loss": 0.1382, + "step": 8417 + }, + { + "epoch": 0.6073812186586818, + "grad_norm": 0.0968608409166336, + "learning_rate": 0.0001757252128734305, + "loss": 0.119, + "step": 8418 + }, + { + "epoch": 0.6074533713337422, + "grad_norm": 0.11105870455503464, + "learning_rate": 0.00017572232645403376, + "loss": 0.1332, + "step": 8419 + }, + { + "epoch": 0.6075255240088027, + "grad_norm": 0.1404876559972763, + "learning_rate": 0.00017571944003463705, + "loss": 0.1191, + "step": 8420 + }, + { + "epoch": 0.6075976766838631, + "grad_norm": 0.11005908250808716, + "learning_rate": 0.00017571655361524032, + "loss": 0.1517, + "step": 8421 + }, + { + "epoch": 0.6076698293589234, + "grad_norm": 0.1336730271577835, + "learning_rate": 0.00017571366719584358, + "loss": 0.1874, + "step": 8422 + }, + { + "epoch": 0.6077419820339839, + "grad_norm": 0.14844858646392822, + "learning_rate": 0.00017571078077644682, + "loss": 0.1909, + "step": 8423 + }, + { + "epoch": 0.6078141347090443, + "grad_norm": 0.15753395855426788, + "learning_rate": 0.00017570789435705008, + "loss": 0.1399, + "step": 8424 + }, + { + "epoch": 0.6078862873841048, + "grad_norm": 0.1073303371667862, + "learning_rate": 0.00017570500793765334, + "loss": 0.149, + "step": 8425 + }, + { + "epoch": 0.6079584400591652, + "grad_norm": 0.11728430539369583, + "learning_rate": 0.0001757021215182566, + "loss": 0.115, + "step": 8426 + }, + { + "epoch": 0.6080305927342257, + "grad_norm": 0.14064405858516693, + "learning_rate": 0.0001756992350988599, + "loss": 0.1693, + "step": 8427 + }, + { + "epoch": 0.608102745409286, + "grad_norm": 0.13664928078651428, + "learning_rate": 0.00017569634867946313, + "loss": 0.1718, + "step": 8428 + }, + { + "epoch": 0.6081748980843464, + "grad_norm": 0.12747839093208313, + "learning_rate": 0.0001756934622600664, + "loss": 0.1621, + "step": 8429 + }, + { + "epoch": 0.6082470507594069, + "grad_norm": 0.1261085867881775, + "learning_rate": 0.00017569057584066966, + "loss": 0.1809, + "step": 8430 + }, + { + "epoch": 0.6083192034344673, + "grad_norm": 0.12232770770788193, + "learning_rate": 0.00017568768942127292, + "loss": 0.1752, + "step": 8431 + }, + { + "epoch": 0.6083913561095278, + "grad_norm": 0.11349008232355118, + "learning_rate": 0.00017568480300187618, + "loss": 0.1542, + "step": 8432 + }, + { + "epoch": 0.6084635087845882, + "grad_norm": 0.1297486126422882, + "learning_rate": 0.00017568191658247944, + "loss": 0.1297, + "step": 8433 + }, + { + "epoch": 0.6085356614596487, + "grad_norm": 0.15281839668750763, + "learning_rate": 0.0001756790301630827, + "loss": 0.1011, + "step": 8434 + }, + { + "epoch": 0.608607814134709, + "grad_norm": 0.1351483166217804, + "learning_rate": 0.00017567614374368597, + "loss": 0.192, + "step": 8435 + }, + { + "epoch": 0.6086799668097694, + "grad_norm": 0.17875918745994568, + "learning_rate": 0.00017567325732428923, + "loss": 0.1881, + "step": 8436 + }, + { + "epoch": 0.6087521194848299, + "grad_norm": 0.148821622133255, + "learning_rate": 0.0001756703709048925, + "loss": 0.1612, + "step": 8437 + }, + { + "epoch": 0.6088242721598903, + "grad_norm": 0.12716643512248993, + "learning_rate": 0.00017566748448549576, + "loss": 0.1141, + "step": 8438 + }, + { + "epoch": 0.6088964248349508, + "grad_norm": 0.12227771431207657, + "learning_rate": 0.000175664598066099, + "loss": 0.2097, + "step": 8439 + }, + { + "epoch": 0.6089685775100112, + "grad_norm": 0.1000041589140892, + "learning_rate": 0.00017566171164670226, + "loss": 0.1184, + "step": 8440 + }, + { + "epoch": 0.6090407301850717, + "grad_norm": 0.09327298402786255, + "learning_rate": 0.00017565882522730555, + "loss": 0.1192, + "step": 8441 + }, + { + "epoch": 0.609112882860132, + "grad_norm": 0.11925908923149109, + "learning_rate": 0.0001756559388079088, + "loss": 0.1972, + "step": 8442 + }, + { + "epoch": 0.6091850355351924, + "grad_norm": 0.11328861862421036, + "learning_rate": 0.00017565305238851207, + "loss": 0.1384, + "step": 8443 + }, + { + "epoch": 0.6092571882102529, + "grad_norm": 0.11418649554252625, + "learning_rate": 0.0001756501659691153, + "loss": 0.1356, + "step": 8444 + }, + { + "epoch": 0.6093293408853133, + "grad_norm": 0.10574861615896225, + "learning_rate": 0.00017564727954971857, + "loss": 0.15, + "step": 8445 + }, + { + "epoch": 0.6094014935603738, + "grad_norm": 0.12365765869617462, + "learning_rate": 0.00017564439313032184, + "loss": 0.1564, + "step": 8446 + }, + { + "epoch": 0.6094736462354342, + "grad_norm": 0.09398072212934494, + "learning_rate": 0.0001756415067109251, + "loss": 0.1744, + "step": 8447 + }, + { + "epoch": 0.6095457989104947, + "grad_norm": 0.12091232091188431, + "learning_rate": 0.0001756386202915284, + "loss": 0.1232, + "step": 8448 + }, + { + "epoch": 0.609617951585555, + "grad_norm": 0.11692681163549423, + "learning_rate": 0.00017563573387213162, + "loss": 0.1815, + "step": 8449 + }, + { + "epoch": 0.6096901042606154, + "grad_norm": 0.12754695117473602, + "learning_rate": 0.0001756328474527349, + "loss": 0.1337, + "step": 8450 + }, + { + "epoch": 0.6097622569356759, + "grad_norm": 0.1740695983171463, + "learning_rate": 0.00017562996103333815, + "loss": 0.1787, + "step": 8451 + }, + { + "epoch": 0.6098344096107363, + "grad_norm": 0.12354660779237747, + "learning_rate": 0.0001756270746139414, + "loss": 0.1298, + "step": 8452 + }, + { + "epoch": 0.6099065622857968, + "grad_norm": 0.11192192882299423, + "learning_rate": 0.00017562418819454468, + "loss": 0.1585, + "step": 8453 + }, + { + "epoch": 0.6099787149608572, + "grad_norm": 0.10584592819213867, + "learning_rate": 0.00017562130177514794, + "loss": 0.1279, + "step": 8454 + }, + { + "epoch": 0.6100508676359176, + "grad_norm": 0.11145295947790146, + "learning_rate": 0.0001756184153557512, + "loss": 0.1279, + "step": 8455 + }, + { + "epoch": 0.610123020310978, + "grad_norm": 0.11601231247186661, + "learning_rate": 0.00017561552893635446, + "loss": 0.1752, + "step": 8456 + }, + { + "epoch": 0.6101951729860384, + "grad_norm": 0.1352606862783432, + "learning_rate": 0.00017561264251695773, + "loss": 0.1099, + "step": 8457 + }, + { + "epoch": 0.6102673256610989, + "grad_norm": 0.11708886921405792, + "learning_rate": 0.000175609756097561, + "loss": 0.1533, + "step": 8458 + }, + { + "epoch": 0.6103394783361593, + "grad_norm": 0.14400361478328705, + "learning_rate": 0.00017560686967816425, + "loss": 0.1418, + "step": 8459 + }, + { + "epoch": 0.6104116310112198, + "grad_norm": 0.11821573227643967, + "learning_rate": 0.0001756039832587675, + "loss": 0.1228, + "step": 8460 + }, + { + "epoch": 0.6104837836862802, + "grad_norm": 0.13207882642745972, + "learning_rate": 0.00017560109683937075, + "loss": 0.1242, + "step": 8461 + }, + { + "epoch": 0.6105559363613406, + "grad_norm": 0.13548435270786285, + "learning_rate": 0.00017559821041997404, + "loss": 0.1262, + "step": 8462 + }, + { + "epoch": 0.610628089036401, + "grad_norm": 0.1567934900522232, + "learning_rate": 0.0001755953240005773, + "loss": 0.1973, + "step": 8463 + }, + { + "epoch": 0.6107002417114614, + "grad_norm": 0.17090696096420288, + "learning_rate": 0.00017559243758118057, + "loss": 0.1539, + "step": 8464 + }, + { + "epoch": 0.6107723943865219, + "grad_norm": 0.1312856376171112, + "learning_rate": 0.0001755895511617838, + "loss": 0.1151, + "step": 8465 + }, + { + "epoch": 0.6108445470615823, + "grad_norm": 0.15751205384731293, + "learning_rate": 0.00017558666474238707, + "loss": 0.1696, + "step": 8466 + }, + { + "epoch": 0.6109166997366428, + "grad_norm": 0.14566873013973236, + "learning_rate": 0.00017558377832299033, + "loss": 0.1762, + "step": 8467 + }, + { + "epoch": 0.6109888524117032, + "grad_norm": 0.12950445711612701, + "learning_rate": 0.0001755808919035936, + "loss": 0.1188, + "step": 8468 + }, + { + "epoch": 0.6110610050867636, + "grad_norm": 0.1337715983390808, + "learning_rate": 0.00017557800548419688, + "loss": 0.1809, + "step": 8469 + }, + { + "epoch": 0.611133157761824, + "grad_norm": 0.13403064012527466, + "learning_rate": 0.00017557511906480012, + "loss": 0.1583, + "step": 8470 + }, + { + "epoch": 0.6112053104368844, + "grad_norm": 0.10216417163610458, + "learning_rate": 0.00017557223264540338, + "loss": 0.1674, + "step": 8471 + }, + { + "epoch": 0.6112774631119449, + "grad_norm": 0.1403636634349823, + "learning_rate": 0.00017556934622600664, + "loss": 0.1559, + "step": 8472 + }, + { + "epoch": 0.6113496157870053, + "grad_norm": 0.2026059478521347, + "learning_rate": 0.0001755664598066099, + "loss": 0.1158, + "step": 8473 + }, + { + "epoch": 0.6114217684620658, + "grad_norm": 0.10862534493207932, + "learning_rate": 0.00017556357338721317, + "loss": 0.1191, + "step": 8474 + }, + { + "epoch": 0.6114939211371262, + "grad_norm": 0.12901028990745544, + "learning_rate": 0.00017556068696781643, + "loss": 0.1377, + "step": 8475 + }, + { + "epoch": 0.6115660738121866, + "grad_norm": 0.11864591389894485, + "learning_rate": 0.0001755578005484197, + "loss": 0.118, + "step": 8476 + }, + { + "epoch": 0.611638226487247, + "grad_norm": 0.11161187291145325, + "learning_rate": 0.00017555491412902296, + "loss": 0.1901, + "step": 8477 + }, + { + "epoch": 0.6117103791623074, + "grad_norm": 0.10867039114236832, + "learning_rate": 0.00017555202770962622, + "loss": 0.0806, + "step": 8478 + }, + { + "epoch": 0.6117825318373679, + "grad_norm": 0.13876743614673615, + "learning_rate": 0.00017554914129022948, + "loss": 0.1455, + "step": 8479 + }, + { + "epoch": 0.6118546845124283, + "grad_norm": 0.10943257063627243, + "learning_rate": 0.00017554625487083275, + "loss": 0.1715, + "step": 8480 + }, + { + "epoch": 0.6119268371874887, + "grad_norm": 0.11604628711938858, + "learning_rate": 0.00017554336845143598, + "loss": 0.1693, + "step": 8481 + }, + { + "epoch": 0.6119989898625492, + "grad_norm": 0.1253431886434555, + "learning_rate": 0.00017554048203203925, + "loss": 0.1411, + "step": 8482 + }, + { + "epoch": 0.6120711425376096, + "grad_norm": 0.1400705724954605, + "learning_rate": 0.00017553759561264254, + "loss": 0.1572, + "step": 8483 + }, + { + "epoch": 0.61214329521267, + "grad_norm": 0.15838080644607544, + "learning_rate": 0.0001755347091932458, + "loss": 0.1528, + "step": 8484 + }, + { + "epoch": 0.6122154478877304, + "grad_norm": 0.11225708574056625, + "learning_rate": 0.00017553182277384906, + "loss": 0.1533, + "step": 8485 + }, + { + "epoch": 0.6122876005627909, + "grad_norm": 0.13094502687454224, + "learning_rate": 0.0001755289363544523, + "loss": 0.1668, + "step": 8486 + }, + { + "epoch": 0.6123597532378513, + "grad_norm": 0.13761380314826965, + "learning_rate": 0.00017552604993505556, + "loss": 0.1466, + "step": 8487 + }, + { + "epoch": 0.6124319059129117, + "grad_norm": 0.12459521740674973, + "learning_rate": 0.00017552316351565882, + "loss": 0.1122, + "step": 8488 + }, + { + "epoch": 0.6125040585879722, + "grad_norm": 0.13551610708236694, + "learning_rate": 0.0001755202770962621, + "loss": 0.1571, + "step": 8489 + }, + { + "epoch": 0.6125762112630325, + "grad_norm": 0.10906567424535751, + "learning_rate": 0.00017551739067686538, + "loss": 0.1432, + "step": 8490 + }, + { + "epoch": 0.612648363938093, + "grad_norm": 0.13732370734214783, + "learning_rate": 0.0001755145042574686, + "loss": 0.1786, + "step": 8491 + }, + { + "epoch": 0.6127205166131534, + "grad_norm": 0.11145670711994171, + "learning_rate": 0.00017551161783807188, + "loss": 0.1494, + "step": 8492 + }, + { + "epoch": 0.6127926692882139, + "grad_norm": 0.13747194409370422, + "learning_rate": 0.00017550873141867514, + "loss": 0.1684, + "step": 8493 + }, + { + "epoch": 0.6128648219632743, + "grad_norm": 0.15329043567180634, + "learning_rate": 0.0001755058449992784, + "loss": 0.1808, + "step": 8494 + }, + { + "epoch": 0.6129369746383347, + "grad_norm": 0.15293771028518677, + "learning_rate": 0.00017550295857988166, + "loss": 0.1691, + "step": 8495 + }, + { + "epoch": 0.6130091273133952, + "grad_norm": 0.10288719832897186, + "learning_rate": 0.00017550007216048493, + "loss": 0.1448, + "step": 8496 + }, + { + "epoch": 0.6130812799884555, + "grad_norm": 0.11011450737714767, + "learning_rate": 0.0001754971857410882, + "loss": 0.1423, + "step": 8497 + }, + { + "epoch": 0.613153432663516, + "grad_norm": 0.1155213788151741, + "learning_rate": 0.00017549429932169145, + "loss": 0.1385, + "step": 8498 + }, + { + "epoch": 0.6132255853385764, + "grad_norm": 0.1479150354862213, + "learning_rate": 0.00017549141290229472, + "loss": 0.1405, + "step": 8499 + }, + { + "epoch": 0.6132977380136369, + "grad_norm": 0.1567593663930893, + "learning_rate": 0.00017548852648289798, + "loss": 0.1805, + "step": 8500 + }, + { + "epoch": 0.6133698906886973, + "grad_norm": 0.1502797156572342, + "learning_rate": 0.00017548564006350124, + "loss": 0.1652, + "step": 8501 + }, + { + "epoch": 0.6134420433637577, + "grad_norm": 0.12881901860237122, + "learning_rate": 0.0001754827536441045, + "loss": 0.1654, + "step": 8502 + }, + { + "epoch": 0.6135141960388182, + "grad_norm": 0.15938179194927216, + "learning_rate": 0.00017547986722470774, + "loss": 0.1861, + "step": 8503 + }, + { + "epoch": 0.6135863487138785, + "grad_norm": 0.15454211831092834, + "learning_rate": 0.00017547698080531103, + "loss": 0.1632, + "step": 8504 + }, + { + "epoch": 0.613658501388939, + "grad_norm": 0.18221557140350342, + "learning_rate": 0.0001754740943859143, + "loss": 0.1617, + "step": 8505 + }, + { + "epoch": 0.6137306540639994, + "grad_norm": 0.10344431549310684, + "learning_rate": 0.00017547120796651756, + "loss": 0.1188, + "step": 8506 + }, + { + "epoch": 0.6138028067390598, + "grad_norm": 0.16359294950962067, + "learning_rate": 0.00017546832154712082, + "loss": 0.1974, + "step": 8507 + }, + { + "epoch": 0.6138749594141203, + "grad_norm": 0.12594661116600037, + "learning_rate": 0.00017546543512772406, + "loss": 0.1541, + "step": 8508 + }, + { + "epoch": 0.6139471120891807, + "grad_norm": 0.13963118195533752, + "learning_rate": 0.00017546254870832732, + "loss": 0.1701, + "step": 8509 + }, + { + "epoch": 0.6140192647642412, + "grad_norm": 0.13694241642951965, + "learning_rate": 0.00017545966228893058, + "loss": 0.1564, + "step": 8510 + }, + { + "epoch": 0.6140914174393015, + "grad_norm": 0.15386688709259033, + "learning_rate": 0.00017545677586953387, + "loss": 0.1247, + "step": 8511 + }, + { + "epoch": 0.614163570114362, + "grad_norm": 0.12560677528381348, + "learning_rate": 0.00017545388945013713, + "loss": 0.1475, + "step": 8512 + }, + { + "epoch": 0.6142357227894224, + "grad_norm": 0.15082158148288727, + "learning_rate": 0.00017545100303074037, + "loss": 0.154, + "step": 8513 + }, + { + "epoch": 0.6143078754644828, + "grad_norm": 0.12800118327140808, + "learning_rate": 0.00017544811661134363, + "loss": 0.1477, + "step": 8514 + }, + { + "epoch": 0.6143800281395433, + "grad_norm": 0.11983068287372589, + "learning_rate": 0.0001754452301919469, + "loss": 0.1787, + "step": 8515 + }, + { + "epoch": 0.6144521808146037, + "grad_norm": 0.11161981523036957, + "learning_rate": 0.00017544234377255016, + "loss": 0.177, + "step": 8516 + }, + { + "epoch": 0.6145243334896642, + "grad_norm": 0.12424928694963455, + "learning_rate": 0.00017543945735315342, + "loss": 0.1565, + "step": 8517 + }, + { + "epoch": 0.6145964861647245, + "grad_norm": 0.11708874255418777, + "learning_rate": 0.00017543657093375668, + "loss": 0.1713, + "step": 8518 + }, + { + "epoch": 0.614668638839785, + "grad_norm": 0.13190799951553345, + "learning_rate": 0.00017543368451435995, + "loss": 0.1117, + "step": 8519 + }, + { + "epoch": 0.6147407915148454, + "grad_norm": 0.1273881196975708, + "learning_rate": 0.0001754307980949632, + "loss": 0.117, + "step": 8520 + }, + { + "epoch": 0.6148129441899058, + "grad_norm": 0.15069957077503204, + "learning_rate": 0.00017542791167556647, + "loss": 0.1727, + "step": 8521 + }, + { + "epoch": 0.6148850968649663, + "grad_norm": 0.12246958911418915, + "learning_rate": 0.00017542502525616974, + "loss": 0.1283, + "step": 8522 + }, + { + "epoch": 0.6149572495400267, + "grad_norm": 0.12573544681072235, + "learning_rate": 0.000175422138836773, + "loss": 0.1428, + "step": 8523 + }, + { + "epoch": 0.6150294022150872, + "grad_norm": 0.15929429233074188, + "learning_rate": 0.00017541925241737624, + "loss": 0.1073, + "step": 8524 + }, + { + "epoch": 0.6151015548901475, + "grad_norm": 0.11118916422128677, + "learning_rate": 0.0001754163659979795, + "loss": 0.0931, + "step": 8525 + }, + { + "epoch": 0.615173707565208, + "grad_norm": 0.10511104762554169, + "learning_rate": 0.0001754134795785828, + "loss": 0.1519, + "step": 8526 + }, + { + "epoch": 0.6152458602402684, + "grad_norm": 0.16426770389080048, + "learning_rate": 0.00017541059315918605, + "loss": 0.1643, + "step": 8527 + }, + { + "epoch": 0.6153180129153288, + "grad_norm": 0.12183352559804916, + "learning_rate": 0.00017540770673978931, + "loss": 0.1171, + "step": 8528 + }, + { + "epoch": 0.6153901655903893, + "grad_norm": 0.10447021573781967, + "learning_rate": 0.00017540482032039255, + "loss": 0.1498, + "step": 8529 + }, + { + "epoch": 0.6154623182654497, + "grad_norm": 0.1412649154663086, + "learning_rate": 0.0001754019339009958, + "loss": 0.1806, + "step": 8530 + }, + { + "epoch": 0.6155344709405102, + "grad_norm": 0.1257966160774231, + "learning_rate": 0.00017539904748159908, + "loss": 0.2047, + "step": 8531 + }, + { + "epoch": 0.6156066236155705, + "grad_norm": 0.12649193406105042, + "learning_rate": 0.00017539616106220234, + "loss": 0.1712, + "step": 8532 + }, + { + "epoch": 0.615678776290631, + "grad_norm": 0.17213580012321472, + "learning_rate": 0.00017539327464280563, + "loss": 0.1278, + "step": 8533 + }, + { + "epoch": 0.6157509289656914, + "grad_norm": 0.13075965642929077, + "learning_rate": 0.00017539038822340886, + "loss": 0.1334, + "step": 8534 + }, + { + "epoch": 0.6158230816407518, + "grad_norm": 0.11880091577768326, + "learning_rate": 0.00017538750180401213, + "loss": 0.1513, + "step": 8535 + }, + { + "epoch": 0.6158952343158123, + "grad_norm": 0.1348184049129486, + "learning_rate": 0.0001753846153846154, + "loss": 0.1195, + "step": 8536 + }, + { + "epoch": 0.6159673869908727, + "grad_norm": 0.12139120697975159, + "learning_rate": 0.00017538172896521865, + "loss": 0.1647, + "step": 8537 + }, + { + "epoch": 0.6160395396659332, + "grad_norm": 0.21920908987522125, + "learning_rate": 0.00017537884254582192, + "loss": 0.1475, + "step": 8538 + }, + { + "epoch": 0.6161116923409935, + "grad_norm": 0.14035917818546295, + "learning_rate": 0.00017537595612642518, + "loss": 0.1817, + "step": 8539 + }, + { + "epoch": 0.6161838450160539, + "grad_norm": 0.11102382093667984, + "learning_rate": 0.00017537306970702844, + "loss": 0.1153, + "step": 8540 + }, + { + "epoch": 0.6162559976911144, + "grad_norm": 0.15595851838588715, + "learning_rate": 0.0001753701832876317, + "loss": 0.1358, + "step": 8541 + }, + { + "epoch": 0.6163281503661748, + "grad_norm": 0.14062592387199402, + "learning_rate": 0.00017536729686823497, + "loss": 0.2017, + "step": 8542 + }, + { + "epoch": 0.6164003030412353, + "grad_norm": 0.14323318004608154, + "learning_rate": 0.00017536441044883823, + "loss": 0.1508, + "step": 8543 + }, + { + "epoch": 0.6164724557162957, + "grad_norm": 0.14219464361667633, + "learning_rate": 0.0001753615240294415, + "loss": 0.178, + "step": 8544 + }, + { + "epoch": 0.6165446083913562, + "grad_norm": 0.1259685605764389, + "learning_rate": 0.00017535863761004473, + "loss": 0.1548, + "step": 8545 + }, + { + "epoch": 0.6166167610664165, + "grad_norm": 0.1387157440185547, + "learning_rate": 0.000175355751190648, + "loss": 0.175, + "step": 8546 + }, + { + "epoch": 0.6166889137414769, + "grad_norm": 0.1125483587384224, + "learning_rate": 0.00017535286477125128, + "loss": 0.1553, + "step": 8547 + }, + { + "epoch": 0.6167610664165374, + "grad_norm": 0.13433030247688293, + "learning_rate": 0.00017534997835185455, + "loss": 0.2003, + "step": 8548 + }, + { + "epoch": 0.6168332190915978, + "grad_norm": 0.13598592579364777, + "learning_rate": 0.0001753470919324578, + "loss": 0.1419, + "step": 8549 + }, + { + "epoch": 0.6169053717666583, + "grad_norm": 0.11717887222766876, + "learning_rate": 0.00017534420551306104, + "loss": 0.1611, + "step": 8550 + }, + { + "epoch": 0.6169775244417187, + "grad_norm": 0.129541277885437, + "learning_rate": 0.0001753413190936643, + "loss": 0.1321, + "step": 8551 + }, + { + "epoch": 0.617049677116779, + "grad_norm": 0.11288454383611679, + "learning_rate": 0.00017533843267426757, + "loss": 0.1598, + "step": 8552 + }, + { + "epoch": 0.6171218297918395, + "grad_norm": 0.1419808268547058, + "learning_rate": 0.00017533554625487083, + "loss": 0.1494, + "step": 8553 + }, + { + "epoch": 0.6171939824668999, + "grad_norm": 0.1335771232843399, + "learning_rate": 0.00017533265983547412, + "loss": 0.1457, + "step": 8554 + }, + { + "epoch": 0.6172661351419604, + "grad_norm": 0.1458192765712738, + "learning_rate": 0.00017532977341607736, + "loss": 0.1806, + "step": 8555 + }, + { + "epoch": 0.6173382878170208, + "grad_norm": 0.11284256726503372, + "learning_rate": 0.00017532688699668062, + "loss": 0.1395, + "step": 8556 + }, + { + "epoch": 0.6174104404920813, + "grad_norm": 0.10957358777523041, + "learning_rate": 0.00017532400057728388, + "loss": 0.131, + "step": 8557 + }, + { + "epoch": 0.6174825931671417, + "grad_norm": 0.1465001404285431, + "learning_rate": 0.00017532111415788715, + "loss": 0.1585, + "step": 8558 + }, + { + "epoch": 0.617554745842202, + "grad_norm": 0.14206071197986603, + "learning_rate": 0.0001753182277384904, + "loss": 0.14, + "step": 8559 + }, + { + "epoch": 0.6176268985172625, + "grad_norm": 0.1402553766965866, + "learning_rate": 0.00017531534131909367, + "loss": 0.1754, + "step": 8560 + }, + { + "epoch": 0.6176990511923229, + "grad_norm": 0.12297656387090683, + "learning_rate": 0.00017531245489969694, + "loss": 0.1783, + "step": 8561 + }, + { + "epoch": 0.6177712038673834, + "grad_norm": 0.1184375211596489, + "learning_rate": 0.0001753095684803002, + "loss": 0.1654, + "step": 8562 + }, + { + "epoch": 0.6178433565424438, + "grad_norm": 0.11259499192237854, + "learning_rate": 0.00017530668206090346, + "loss": 0.1434, + "step": 8563 + }, + { + "epoch": 0.6179155092175043, + "grad_norm": 0.13270528614521027, + "learning_rate": 0.00017530379564150672, + "loss": 0.131, + "step": 8564 + }, + { + "epoch": 0.6179876618925647, + "grad_norm": 0.17860640585422516, + "learning_rate": 0.00017530090922211, + "loss": 0.1351, + "step": 8565 + }, + { + "epoch": 0.618059814567625, + "grad_norm": 0.1279137283563614, + "learning_rate": 0.00017529802280271322, + "loss": 0.1286, + "step": 8566 + }, + { + "epoch": 0.6181319672426855, + "grad_norm": 0.15142486989498138, + "learning_rate": 0.0001752951363833165, + "loss": 0.1479, + "step": 8567 + }, + { + "epoch": 0.6182041199177459, + "grad_norm": 0.13252979516983032, + "learning_rate": 0.00017529224996391978, + "loss": 0.2062, + "step": 8568 + }, + { + "epoch": 0.6182762725928064, + "grad_norm": 0.11797452718019485, + "learning_rate": 0.00017528936354452304, + "loss": 0.125, + "step": 8569 + }, + { + "epoch": 0.6183484252678668, + "grad_norm": 0.12526576220989227, + "learning_rate": 0.0001752864771251263, + "loss": 0.1176, + "step": 8570 + }, + { + "epoch": 0.6184205779429273, + "grad_norm": 0.12063656747341156, + "learning_rate": 0.00017528359070572954, + "loss": 0.1391, + "step": 8571 + }, + { + "epoch": 0.6184927306179877, + "grad_norm": 0.10913331061601639, + "learning_rate": 0.0001752807042863328, + "loss": 0.1639, + "step": 8572 + }, + { + "epoch": 0.618564883293048, + "grad_norm": 0.1257828176021576, + "learning_rate": 0.00017527781786693606, + "loss": 0.153, + "step": 8573 + }, + { + "epoch": 0.6186370359681085, + "grad_norm": 0.13285456597805023, + "learning_rate": 0.00017527493144753933, + "loss": 0.1729, + "step": 8574 + }, + { + "epoch": 0.6187091886431689, + "grad_norm": 0.15405382215976715, + "learning_rate": 0.00017527204502814262, + "loss": 0.1654, + "step": 8575 + }, + { + "epoch": 0.6187813413182294, + "grad_norm": 0.11983431130647659, + "learning_rate": 0.00017526915860874585, + "loss": 0.1605, + "step": 8576 + }, + { + "epoch": 0.6188534939932898, + "grad_norm": 0.1335511952638626, + "learning_rate": 0.00017526627218934912, + "loss": 0.1281, + "step": 8577 + }, + { + "epoch": 0.6189256466683503, + "grad_norm": 0.13076375424861908, + "learning_rate": 0.00017526338576995238, + "loss": 0.1468, + "step": 8578 + }, + { + "epoch": 0.6189977993434107, + "grad_norm": 0.11791105568408966, + "learning_rate": 0.00017526049935055564, + "loss": 0.133, + "step": 8579 + }, + { + "epoch": 0.619069952018471, + "grad_norm": 0.15978433191776276, + "learning_rate": 0.0001752576129311589, + "loss": 0.1464, + "step": 8580 + }, + { + "epoch": 0.6191421046935315, + "grad_norm": 0.15569712221622467, + "learning_rate": 0.00017525472651176217, + "loss": 0.1407, + "step": 8581 + }, + { + "epoch": 0.6192142573685919, + "grad_norm": 0.1182025671005249, + "learning_rate": 0.00017525184009236543, + "loss": 0.1861, + "step": 8582 + }, + { + "epoch": 0.6192864100436524, + "grad_norm": 0.13768304884433746, + "learning_rate": 0.0001752489536729687, + "loss": 0.1786, + "step": 8583 + }, + { + "epoch": 0.6193585627187128, + "grad_norm": 0.10402512550354004, + "learning_rate": 0.00017524606725357196, + "loss": 0.1489, + "step": 8584 + }, + { + "epoch": 0.6194307153937733, + "grad_norm": 0.11849159002304077, + "learning_rate": 0.00017524318083417522, + "loss": 0.1867, + "step": 8585 + }, + { + "epoch": 0.6195028680688337, + "grad_norm": 0.11498197168111801, + "learning_rate": 0.00017524029441477848, + "loss": 0.1748, + "step": 8586 + }, + { + "epoch": 0.619575020743894, + "grad_norm": 0.11893530935049057, + "learning_rate": 0.00017523740799538172, + "loss": 0.1826, + "step": 8587 + }, + { + "epoch": 0.6196471734189545, + "grad_norm": 0.1262560784816742, + "learning_rate": 0.00017523452157598498, + "loss": 0.1247, + "step": 8588 + }, + { + "epoch": 0.6197193260940149, + "grad_norm": 0.10598855465650558, + "learning_rate": 0.00017523163515658827, + "loss": 0.135, + "step": 8589 + }, + { + "epoch": 0.6197914787690754, + "grad_norm": 0.1007552519440651, + "learning_rate": 0.00017522874873719153, + "loss": 0.1218, + "step": 8590 + }, + { + "epoch": 0.6198636314441358, + "grad_norm": 0.12365041673183441, + "learning_rate": 0.0001752258623177948, + "loss": 0.1183, + "step": 8591 + }, + { + "epoch": 0.6199357841191963, + "grad_norm": 0.12275546044111252, + "learning_rate": 0.00017522297589839803, + "loss": 0.137, + "step": 8592 + }, + { + "epoch": 0.6200079367942567, + "grad_norm": 0.1261817216873169, + "learning_rate": 0.0001752200894790013, + "loss": 0.1373, + "step": 8593 + }, + { + "epoch": 0.620080089469317, + "grad_norm": 0.14289404451847076, + "learning_rate": 0.00017521720305960456, + "loss": 0.1445, + "step": 8594 + }, + { + "epoch": 0.6201522421443775, + "grad_norm": 0.1267629861831665, + "learning_rate": 0.00017521431664020782, + "loss": 0.134, + "step": 8595 + }, + { + "epoch": 0.6202243948194379, + "grad_norm": 0.11549125611782074, + "learning_rate": 0.0001752114302208111, + "loss": 0.174, + "step": 8596 + }, + { + "epoch": 0.6202965474944984, + "grad_norm": 0.16596673429012299, + "learning_rate": 0.00017520854380141435, + "loss": 0.1311, + "step": 8597 + }, + { + "epoch": 0.6203687001695588, + "grad_norm": 0.13521885871887207, + "learning_rate": 0.0001752056573820176, + "loss": 0.1945, + "step": 8598 + }, + { + "epoch": 0.6204408528446193, + "grad_norm": 0.11223003268241882, + "learning_rate": 0.00017520277096262087, + "loss": 0.1864, + "step": 8599 + }, + { + "epoch": 0.6205130055196797, + "grad_norm": 0.1500011831521988, + "learning_rate": 0.00017519988454322414, + "loss": 0.1917, + "step": 8600 + }, + { + "epoch": 0.62058515819474, + "grad_norm": 0.1375226080417633, + "learning_rate": 0.0001751969981238274, + "loss": 0.1331, + "step": 8601 + }, + { + "epoch": 0.6206573108698005, + "grad_norm": 0.15558865666389465, + "learning_rate": 0.00017519411170443066, + "loss": 0.1416, + "step": 8602 + }, + { + "epoch": 0.6207294635448609, + "grad_norm": 0.11568037420511246, + "learning_rate": 0.00017519122528503392, + "loss": 0.1475, + "step": 8603 + }, + { + "epoch": 0.6208016162199214, + "grad_norm": 0.13693024218082428, + "learning_rate": 0.0001751883388656372, + "loss": 0.1409, + "step": 8604 + }, + { + "epoch": 0.6208737688949818, + "grad_norm": 0.14244809746742249, + "learning_rate": 0.00017518545244624045, + "loss": 0.1655, + "step": 8605 + }, + { + "epoch": 0.6209459215700422, + "grad_norm": 0.14547933638095856, + "learning_rate": 0.0001751825660268437, + "loss": 0.188, + "step": 8606 + }, + { + "epoch": 0.6210180742451027, + "grad_norm": 0.14529702067375183, + "learning_rate": 0.00017517967960744698, + "loss": 0.1711, + "step": 8607 + }, + { + "epoch": 0.621090226920163, + "grad_norm": 0.11163908988237381, + "learning_rate": 0.0001751767931880502, + "loss": 0.128, + "step": 8608 + }, + { + "epoch": 0.6211623795952235, + "grad_norm": 0.15089009702205658, + "learning_rate": 0.00017517390676865347, + "loss": 0.1739, + "step": 8609 + }, + { + "epoch": 0.6212345322702839, + "grad_norm": 0.10636797547340393, + "learning_rate": 0.00017517102034925676, + "loss": 0.1474, + "step": 8610 + }, + { + "epoch": 0.6213066849453444, + "grad_norm": 0.14714211225509644, + "learning_rate": 0.00017516813392986003, + "loss": 0.1739, + "step": 8611 + }, + { + "epoch": 0.6213788376204048, + "grad_norm": 0.12578116357326508, + "learning_rate": 0.0001751652475104633, + "loss": 0.1828, + "step": 8612 + }, + { + "epoch": 0.6214509902954652, + "grad_norm": 0.12373779714107513, + "learning_rate": 0.00017516236109106655, + "loss": 0.1375, + "step": 8613 + }, + { + "epoch": 0.6215231429705256, + "grad_norm": 0.16616494953632355, + "learning_rate": 0.0001751594746716698, + "loss": 0.1831, + "step": 8614 + }, + { + "epoch": 0.621595295645586, + "grad_norm": 0.14434950053691864, + "learning_rate": 0.00017515658825227305, + "loss": 0.1662, + "step": 8615 + }, + { + "epoch": 0.6216674483206465, + "grad_norm": 0.15726368129253387, + "learning_rate": 0.00017515370183287632, + "loss": 0.1119, + "step": 8616 + }, + { + "epoch": 0.6217396009957069, + "grad_norm": 0.13260503113269806, + "learning_rate": 0.0001751508154134796, + "loss": 0.1706, + "step": 8617 + }, + { + "epoch": 0.6218117536707674, + "grad_norm": 0.13859052956104279, + "learning_rate": 0.00017514792899408287, + "loss": 0.1466, + "step": 8618 + }, + { + "epoch": 0.6218839063458278, + "grad_norm": 0.15725117921829224, + "learning_rate": 0.0001751450425746861, + "loss": 0.1442, + "step": 8619 + }, + { + "epoch": 0.6219560590208882, + "grad_norm": 0.15142501890659332, + "learning_rate": 0.00017514215615528937, + "loss": 0.1715, + "step": 8620 + }, + { + "epoch": 0.6220282116959486, + "grad_norm": 0.12751902639865875, + "learning_rate": 0.00017513926973589263, + "loss": 0.1523, + "step": 8621 + }, + { + "epoch": 0.622100364371009, + "grad_norm": 0.1309266984462738, + "learning_rate": 0.0001751363833164959, + "loss": 0.2041, + "step": 8622 + }, + { + "epoch": 0.6221725170460695, + "grad_norm": 0.13360129296779633, + "learning_rate": 0.00017513349689709916, + "loss": 0.1556, + "step": 8623 + }, + { + "epoch": 0.6222446697211299, + "grad_norm": 0.12475387752056122, + "learning_rate": 0.00017513061047770242, + "loss": 0.1112, + "step": 8624 + }, + { + "epoch": 0.6223168223961904, + "grad_norm": 0.11016663908958435, + "learning_rate": 0.00017512772405830568, + "loss": 0.1715, + "step": 8625 + }, + { + "epoch": 0.6223889750712508, + "grad_norm": 0.12061906605958939, + "learning_rate": 0.00017512483763890894, + "loss": 0.172, + "step": 8626 + }, + { + "epoch": 0.6224611277463112, + "grad_norm": 0.1120067685842514, + "learning_rate": 0.0001751219512195122, + "loss": 0.1668, + "step": 8627 + }, + { + "epoch": 0.6225332804213716, + "grad_norm": 0.10962363332509995, + "learning_rate": 0.00017511906480011547, + "loss": 0.1077, + "step": 8628 + }, + { + "epoch": 0.622605433096432, + "grad_norm": 0.12538926303386688, + "learning_rate": 0.00017511617838071873, + "loss": 0.1826, + "step": 8629 + }, + { + "epoch": 0.6226775857714925, + "grad_norm": 0.13419729471206665, + "learning_rate": 0.00017511329196132197, + "loss": 0.1384, + "step": 8630 + }, + { + "epoch": 0.6227497384465529, + "grad_norm": 0.1244427040219307, + "learning_rate": 0.00017511040554192526, + "loss": 0.1417, + "step": 8631 + }, + { + "epoch": 0.6228218911216133, + "grad_norm": 0.14291410148143768, + "learning_rate": 0.00017510751912252852, + "loss": 0.1516, + "step": 8632 + }, + { + "epoch": 0.6228940437966738, + "grad_norm": 0.09003730118274689, + "learning_rate": 0.00017510463270313179, + "loss": 0.1391, + "step": 8633 + }, + { + "epoch": 0.6229661964717342, + "grad_norm": 0.1254691481590271, + "learning_rate": 0.00017510174628373505, + "loss": 0.1943, + "step": 8634 + }, + { + "epoch": 0.6230383491467946, + "grad_norm": 0.13230058550834656, + "learning_rate": 0.00017509885986433828, + "loss": 0.1199, + "step": 8635 + }, + { + "epoch": 0.623110501821855, + "grad_norm": 0.11702358722686768, + "learning_rate": 0.00017509597344494155, + "loss": 0.1576, + "step": 8636 + }, + { + "epoch": 0.6231826544969155, + "grad_norm": 0.12280598282814026, + "learning_rate": 0.0001750930870255448, + "loss": 0.1592, + "step": 8637 + }, + { + "epoch": 0.6232548071719759, + "grad_norm": 0.1280744969844818, + "learning_rate": 0.0001750902006061481, + "loss": 0.1572, + "step": 8638 + }, + { + "epoch": 0.6233269598470363, + "grad_norm": 0.14713403582572937, + "learning_rate": 0.00017508731418675136, + "loss": 0.1358, + "step": 8639 + }, + { + "epoch": 0.6233991125220968, + "grad_norm": 0.1206742525100708, + "learning_rate": 0.0001750844277673546, + "loss": 0.1492, + "step": 8640 + }, + { + "epoch": 0.6234712651971572, + "grad_norm": 0.1162571832537651, + "learning_rate": 0.00017508154134795786, + "loss": 0.137, + "step": 8641 + }, + { + "epoch": 0.6235434178722176, + "grad_norm": 0.10991784930229187, + "learning_rate": 0.00017507865492856112, + "loss": 0.1663, + "step": 8642 + }, + { + "epoch": 0.623615570547278, + "grad_norm": 0.117439404129982, + "learning_rate": 0.0001750757685091644, + "loss": 0.1518, + "step": 8643 + }, + { + "epoch": 0.6236877232223385, + "grad_norm": 0.12584786117076874, + "learning_rate": 0.00017507288208976765, + "loss": 0.1558, + "step": 8644 + }, + { + "epoch": 0.6237598758973989, + "grad_norm": 0.11283411830663681, + "learning_rate": 0.0001750699956703709, + "loss": 0.1734, + "step": 8645 + }, + { + "epoch": 0.6238320285724593, + "grad_norm": 0.1332118809223175, + "learning_rate": 0.00017506710925097418, + "loss": 0.116, + "step": 8646 + }, + { + "epoch": 0.6239041812475198, + "grad_norm": 0.1143932119011879, + "learning_rate": 0.00017506422283157744, + "loss": 0.1296, + "step": 8647 + }, + { + "epoch": 0.6239763339225802, + "grad_norm": 0.1439419388771057, + "learning_rate": 0.0001750613364121807, + "loss": 0.1776, + "step": 8648 + }, + { + "epoch": 0.6240484865976406, + "grad_norm": 0.10962345451116562, + "learning_rate": 0.00017505844999278396, + "loss": 0.1534, + "step": 8649 + }, + { + "epoch": 0.624120639272701, + "grad_norm": 0.1164950579404831, + "learning_rate": 0.00017505556357338723, + "loss": 0.1169, + "step": 8650 + }, + { + "epoch": 0.6241927919477614, + "grad_norm": 0.11711418628692627, + "learning_rate": 0.00017505267715399046, + "loss": 0.1254, + "step": 8651 + }, + { + "epoch": 0.6242649446228219, + "grad_norm": 0.14272616803646088, + "learning_rate": 0.00017504979073459375, + "loss": 0.156, + "step": 8652 + }, + { + "epoch": 0.6243370972978823, + "grad_norm": 0.11704165488481522, + "learning_rate": 0.00017504690431519702, + "loss": 0.1106, + "step": 8653 + }, + { + "epoch": 0.6244092499729428, + "grad_norm": 0.12983237206935883, + "learning_rate": 0.00017504401789580028, + "loss": 0.1266, + "step": 8654 + }, + { + "epoch": 0.6244814026480032, + "grad_norm": 0.2319333404302597, + "learning_rate": 0.00017504113147640354, + "loss": 0.1721, + "step": 8655 + }, + { + "epoch": 0.6245535553230636, + "grad_norm": 0.13093380630016327, + "learning_rate": 0.00017503824505700678, + "loss": 0.1971, + "step": 8656 + }, + { + "epoch": 0.624625707998124, + "grad_norm": 0.12470812350511551, + "learning_rate": 0.00017503535863761004, + "loss": 0.1382, + "step": 8657 + }, + { + "epoch": 0.6246978606731844, + "grad_norm": 0.1307307928800583, + "learning_rate": 0.0001750324722182133, + "loss": 0.1292, + "step": 8658 + }, + { + "epoch": 0.6247700133482449, + "grad_norm": 0.12445079535245895, + "learning_rate": 0.0001750295857988166, + "loss": 0.157, + "step": 8659 + }, + { + "epoch": 0.6248421660233053, + "grad_norm": 0.16324906051158905, + "learning_rate": 0.00017502669937941986, + "loss": 0.1094, + "step": 8660 + }, + { + "epoch": 0.6249143186983658, + "grad_norm": 0.14993080496788025, + "learning_rate": 0.0001750238129600231, + "loss": 0.1342, + "step": 8661 + }, + { + "epoch": 0.6249864713734262, + "grad_norm": 0.12553778290748596, + "learning_rate": 0.00017502092654062636, + "loss": 0.1262, + "step": 8662 + }, + { + "epoch": 0.6250586240484866, + "grad_norm": 0.11642362177371979, + "learning_rate": 0.00017501804012122962, + "loss": 0.1321, + "step": 8663 + }, + { + "epoch": 0.625130776723547, + "grad_norm": 0.12395073473453522, + "learning_rate": 0.00017501515370183288, + "loss": 0.1437, + "step": 8664 + }, + { + "epoch": 0.6252029293986074, + "grad_norm": 0.14258527755737305, + "learning_rate": 0.00017501226728243614, + "loss": 0.1619, + "step": 8665 + }, + { + "epoch": 0.6252750820736679, + "grad_norm": 0.10943171381950378, + "learning_rate": 0.0001750093808630394, + "loss": 0.093, + "step": 8666 + }, + { + "epoch": 0.6253472347487283, + "grad_norm": 0.09779515862464905, + "learning_rate": 0.00017500649444364267, + "loss": 0.101, + "step": 8667 + }, + { + "epoch": 0.6254193874237888, + "grad_norm": 0.11287999153137207, + "learning_rate": 0.00017500360802424593, + "loss": 0.1274, + "step": 8668 + }, + { + "epoch": 0.6254915400988492, + "grad_norm": 0.13966453075408936, + "learning_rate": 0.0001750007216048492, + "loss": 0.1559, + "step": 8669 + }, + { + "epoch": 0.6255636927739096, + "grad_norm": 0.12718842923641205, + "learning_rate": 0.00017499783518545246, + "loss": 0.1294, + "step": 8670 + }, + { + "epoch": 0.62563584544897, + "grad_norm": 0.11159928888082504, + "learning_rate": 0.00017499494876605572, + "loss": 0.1533, + "step": 8671 + }, + { + "epoch": 0.6257079981240304, + "grad_norm": 0.1387571096420288, + "learning_rate": 0.00017499206234665896, + "loss": 0.1852, + "step": 8672 + }, + { + "epoch": 0.6257801507990909, + "grad_norm": 0.11032717674970627, + "learning_rate": 0.00017498917592726225, + "loss": 0.1292, + "step": 8673 + }, + { + "epoch": 0.6258523034741513, + "grad_norm": 0.11329511553049088, + "learning_rate": 0.0001749862895078655, + "loss": 0.1227, + "step": 8674 + }, + { + "epoch": 0.6259244561492118, + "grad_norm": 0.15416178107261658, + "learning_rate": 0.00017498340308846877, + "loss": 0.1241, + "step": 8675 + }, + { + "epoch": 0.6259966088242721, + "grad_norm": 0.12209072709083557, + "learning_rate": 0.00017498051666907204, + "loss": 0.1208, + "step": 8676 + }, + { + "epoch": 0.6260687614993325, + "grad_norm": 0.11543633043766022, + "learning_rate": 0.00017497763024967527, + "loss": 0.1848, + "step": 8677 + }, + { + "epoch": 0.626140914174393, + "grad_norm": 0.10174732655286789, + "learning_rate": 0.00017497474383027854, + "loss": 0.1551, + "step": 8678 + }, + { + "epoch": 0.6262130668494534, + "grad_norm": 0.11609652638435364, + "learning_rate": 0.0001749718574108818, + "loss": 0.1391, + "step": 8679 + }, + { + "epoch": 0.6262852195245139, + "grad_norm": 0.12952245771884918, + "learning_rate": 0.0001749689709914851, + "loss": 0.1448, + "step": 8680 + }, + { + "epoch": 0.6263573721995743, + "grad_norm": 0.128461092710495, + "learning_rate": 0.00017496608457208835, + "loss": 0.157, + "step": 8681 + }, + { + "epoch": 0.6264295248746348, + "grad_norm": 0.2677597105503082, + "learning_rate": 0.0001749631981526916, + "loss": 0.1382, + "step": 8682 + }, + { + "epoch": 0.6265016775496951, + "grad_norm": 0.1268332302570343, + "learning_rate": 0.00017496031173329485, + "loss": 0.171, + "step": 8683 + }, + { + "epoch": 0.6265738302247555, + "grad_norm": 0.15028905868530273, + "learning_rate": 0.0001749574253138981, + "loss": 0.1555, + "step": 8684 + }, + { + "epoch": 0.626645982899816, + "grad_norm": 0.14424365758895874, + "learning_rate": 0.00017495453889450138, + "loss": 0.0959, + "step": 8685 + }, + { + "epoch": 0.6267181355748764, + "grad_norm": 0.12823861837387085, + "learning_rate": 0.00017495165247510464, + "loss": 0.1587, + "step": 8686 + }, + { + "epoch": 0.6267902882499369, + "grad_norm": 0.12356722354888916, + "learning_rate": 0.0001749487660557079, + "loss": 0.1459, + "step": 8687 + }, + { + "epoch": 0.6268624409249973, + "grad_norm": 0.13832490146160126, + "learning_rate": 0.00017494587963631116, + "loss": 0.1678, + "step": 8688 + }, + { + "epoch": 0.6269345936000578, + "grad_norm": 0.13942967355251312, + "learning_rate": 0.00017494299321691443, + "loss": 0.1195, + "step": 8689 + }, + { + "epoch": 0.6270067462751181, + "grad_norm": 0.09987753629684448, + "learning_rate": 0.0001749401067975177, + "loss": 0.1307, + "step": 8690 + }, + { + "epoch": 0.6270788989501785, + "grad_norm": 0.16269667446613312, + "learning_rate": 0.00017493722037812095, + "loss": 0.1805, + "step": 8691 + }, + { + "epoch": 0.627151051625239, + "grad_norm": 0.1342119574546814, + "learning_rate": 0.00017493433395872422, + "loss": 0.1414, + "step": 8692 + }, + { + "epoch": 0.6272232043002994, + "grad_norm": 0.13867583870887756, + "learning_rate": 0.00017493144753932745, + "loss": 0.1808, + "step": 8693 + }, + { + "epoch": 0.6272953569753599, + "grad_norm": 0.1276274472475052, + "learning_rate": 0.00017492856111993074, + "loss": 0.1457, + "step": 8694 + }, + { + "epoch": 0.6273675096504203, + "grad_norm": 0.11411254107952118, + "learning_rate": 0.000174925674700534, + "loss": 0.1462, + "step": 8695 + }, + { + "epoch": 0.6274396623254808, + "grad_norm": 0.12012795358896255, + "learning_rate": 0.00017492278828113727, + "loss": 0.1684, + "step": 8696 + }, + { + "epoch": 0.6275118150005411, + "grad_norm": 0.11880932003259659, + "learning_rate": 0.00017491990186174053, + "loss": 0.1186, + "step": 8697 + }, + { + "epoch": 0.6275839676756015, + "grad_norm": 0.15673860907554626, + "learning_rate": 0.00017491701544234377, + "loss": 0.2067, + "step": 8698 + }, + { + "epoch": 0.627656120350662, + "grad_norm": 0.13303562998771667, + "learning_rate": 0.00017491412902294703, + "loss": 0.1834, + "step": 8699 + }, + { + "epoch": 0.6277282730257224, + "grad_norm": 0.10776332765817642, + "learning_rate": 0.0001749112426035503, + "loss": 0.162, + "step": 8700 + }, + { + "epoch": 0.6278004257007829, + "grad_norm": 0.12505806982517242, + "learning_rate": 0.00017490835618415358, + "loss": 0.1254, + "step": 8701 + }, + { + "epoch": 0.6278725783758433, + "grad_norm": 0.09923242032527924, + "learning_rate": 0.00017490546976475685, + "loss": 0.1769, + "step": 8702 + }, + { + "epoch": 0.6279447310509038, + "grad_norm": 0.12002786248922348, + "learning_rate": 0.00017490258334536008, + "loss": 0.1534, + "step": 8703 + }, + { + "epoch": 0.6280168837259641, + "grad_norm": 0.15881898999214172, + "learning_rate": 0.00017489969692596334, + "loss": 0.2111, + "step": 8704 + }, + { + "epoch": 0.6280890364010245, + "grad_norm": 0.1061813086271286, + "learning_rate": 0.0001748968105065666, + "loss": 0.1912, + "step": 8705 + }, + { + "epoch": 0.628161189076085, + "grad_norm": 0.11571187525987625, + "learning_rate": 0.00017489392408716987, + "loss": 0.1243, + "step": 8706 + }, + { + "epoch": 0.6282333417511454, + "grad_norm": 0.10322290658950806, + "learning_rate": 0.00017489103766777313, + "loss": 0.1472, + "step": 8707 + }, + { + "epoch": 0.6283054944262059, + "grad_norm": 0.11178483814001083, + "learning_rate": 0.0001748881512483764, + "loss": 0.1167, + "step": 8708 + }, + { + "epoch": 0.6283776471012663, + "grad_norm": 0.12025123834609985, + "learning_rate": 0.00017488526482897966, + "loss": 0.1446, + "step": 8709 + }, + { + "epoch": 0.6284497997763268, + "grad_norm": 0.13027307391166687, + "learning_rate": 0.00017488237840958292, + "loss": 0.1324, + "step": 8710 + }, + { + "epoch": 0.6285219524513871, + "grad_norm": 0.1130695715546608, + "learning_rate": 0.00017487949199018618, + "loss": 0.1665, + "step": 8711 + }, + { + "epoch": 0.6285941051264475, + "grad_norm": 0.13819366693496704, + "learning_rate": 0.00017487660557078945, + "loss": 0.108, + "step": 8712 + }, + { + "epoch": 0.628666257801508, + "grad_norm": 0.11559824645519257, + "learning_rate": 0.0001748737191513927, + "loss": 0.0957, + "step": 8713 + }, + { + "epoch": 0.6287384104765684, + "grad_norm": 0.15998490154743195, + "learning_rate": 0.00017487083273199595, + "loss": 0.1218, + "step": 8714 + }, + { + "epoch": 0.6288105631516289, + "grad_norm": 0.11568142473697662, + "learning_rate": 0.00017486794631259924, + "loss": 0.1278, + "step": 8715 + }, + { + "epoch": 0.6288827158266893, + "grad_norm": 0.10960458219051361, + "learning_rate": 0.0001748650598932025, + "loss": 0.1357, + "step": 8716 + }, + { + "epoch": 0.6289548685017498, + "grad_norm": 0.13815350830554962, + "learning_rate": 0.00017486217347380576, + "loss": 0.1773, + "step": 8717 + }, + { + "epoch": 0.6290270211768101, + "grad_norm": 0.1252228170633316, + "learning_rate": 0.00017485928705440902, + "loss": 0.1545, + "step": 8718 + }, + { + "epoch": 0.6290991738518705, + "grad_norm": 0.11478475481271744, + "learning_rate": 0.00017485640063501226, + "loss": 0.1353, + "step": 8719 + }, + { + "epoch": 0.629171326526931, + "grad_norm": 0.1047094315290451, + "learning_rate": 0.00017485351421561552, + "loss": 0.1488, + "step": 8720 + }, + { + "epoch": 0.6292434792019914, + "grad_norm": 0.11537590622901917, + "learning_rate": 0.0001748506277962188, + "loss": 0.117, + "step": 8721 + }, + { + "epoch": 0.6293156318770519, + "grad_norm": 0.13007521629333496, + "learning_rate": 0.00017484774137682205, + "loss": 0.1519, + "step": 8722 + }, + { + "epoch": 0.6293877845521123, + "grad_norm": 0.14826463162899017, + "learning_rate": 0.00017484485495742534, + "loss": 0.1953, + "step": 8723 + }, + { + "epoch": 0.6294599372271727, + "grad_norm": 0.12364893406629562, + "learning_rate": 0.00017484196853802858, + "loss": 0.1411, + "step": 8724 + }, + { + "epoch": 0.6295320899022331, + "grad_norm": 0.11580374091863632, + "learning_rate": 0.00017483908211863184, + "loss": 0.1244, + "step": 8725 + }, + { + "epoch": 0.6296042425772935, + "grad_norm": 0.10067996382713318, + "learning_rate": 0.0001748361956992351, + "loss": 0.1506, + "step": 8726 + }, + { + "epoch": 0.629676395252354, + "grad_norm": 0.1357460469007492, + "learning_rate": 0.00017483330927983836, + "loss": 0.1553, + "step": 8727 + }, + { + "epoch": 0.6297485479274144, + "grad_norm": 0.12918460369110107, + "learning_rate": 0.00017483042286044163, + "loss": 0.1572, + "step": 8728 + }, + { + "epoch": 0.6298207006024749, + "grad_norm": 0.12484171241521835, + "learning_rate": 0.0001748275364410449, + "loss": 0.1436, + "step": 8729 + }, + { + "epoch": 0.6298928532775353, + "grad_norm": 0.14255495369434357, + "learning_rate": 0.00017482465002164815, + "loss": 0.1387, + "step": 8730 + }, + { + "epoch": 0.6299650059525957, + "grad_norm": 0.10925611853599548, + "learning_rate": 0.00017482176360225142, + "loss": 0.1287, + "step": 8731 + }, + { + "epoch": 0.6300371586276561, + "grad_norm": 0.13331304490566254, + "learning_rate": 0.00017481887718285468, + "loss": 0.1441, + "step": 8732 + }, + { + "epoch": 0.6301093113027165, + "grad_norm": 0.1275538206100464, + "learning_rate": 0.00017481599076345794, + "loss": 0.1368, + "step": 8733 + }, + { + "epoch": 0.630181463977777, + "grad_norm": 0.11866793036460876, + "learning_rate": 0.0001748131043440612, + "loss": 0.1594, + "step": 8734 + }, + { + "epoch": 0.6302536166528374, + "grad_norm": 0.13025593757629395, + "learning_rate": 0.00017481021792466447, + "loss": 0.13, + "step": 8735 + }, + { + "epoch": 0.6303257693278979, + "grad_norm": 0.15139830112457275, + "learning_rate": 0.0001748073315052677, + "loss": 0.1629, + "step": 8736 + }, + { + "epoch": 0.6303979220029583, + "grad_norm": 0.1346716433763504, + "learning_rate": 0.000174804445085871, + "loss": 0.1279, + "step": 8737 + }, + { + "epoch": 0.6304700746780186, + "grad_norm": 0.12959204614162445, + "learning_rate": 0.00017480155866647426, + "loss": 0.1375, + "step": 8738 + }, + { + "epoch": 0.6305422273530791, + "grad_norm": 0.13815361261367798, + "learning_rate": 0.00017479867224707752, + "loss": 0.1194, + "step": 8739 + }, + { + "epoch": 0.6306143800281395, + "grad_norm": 0.11633475869894028, + "learning_rate": 0.00017479578582768078, + "loss": 0.14, + "step": 8740 + }, + { + "epoch": 0.6306865327032, + "grad_norm": 0.14336754381656647, + "learning_rate": 0.00017479289940828402, + "loss": 0.134, + "step": 8741 + }, + { + "epoch": 0.6307586853782604, + "grad_norm": 0.11213643848896027, + "learning_rate": 0.00017479001298888728, + "loss": 0.1408, + "step": 8742 + }, + { + "epoch": 0.6308308380533209, + "grad_norm": 0.11513768136501312, + "learning_rate": 0.00017478712656949054, + "loss": 0.1488, + "step": 8743 + }, + { + "epoch": 0.6309029907283813, + "grad_norm": 0.1266719400882721, + "learning_rate": 0.00017478424015009383, + "loss": 0.1407, + "step": 8744 + }, + { + "epoch": 0.6309751434034416, + "grad_norm": 0.12178874760866165, + "learning_rate": 0.0001747813537306971, + "loss": 0.1267, + "step": 8745 + }, + { + "epoch": 0.6310472960785021, + "grad_norm": 0.11389937251806259, + "learning_rate": 0.00017477846731130033, + "loss": 0.2012, + "step": 8746 + }, + { + "epoch": 0.6311194487535625, + "grad_norm": 0.1470189392566681, + "learning_rate": 0.0001747755808919036, + "loss": 0.1914, + "step": 8747 + }, + { + "epoch": 0.631191601428623, + "grad_norm": 0.10151471942663193, + "learning_rate": 0.00017477269447250686, + "loss": 0.1635, + "step": 8748 + }, + { + "epoch": 0.6312637541036834, + "grad_norm": 0.1164185032248497, + "learning_rate": 0.00017476980805311012, + "loss": 0.1088, + "step": 8749 + }, + { + "epoch": 0.6313359067787438, + "grad_norm": 0.12604768574237823, + "learning_rate": 0.00017476692163371338, + "loss": 0.1445, + "step": 8750 + }, + { + "epoch": 0.6314080594538043, + "grad_norm": 0.12387518584728241, + "learning_rate": 0.00017476403521431665, + "loss": 0.1584, + "step": 8751 + }, + { + "epoch": 0.6314802121288646, + "grad_norm": 0.1140308678150177, + "learning_rate": 0.0001747611487949199, + "loss": 0.1565, + "step": 8752 + }, + { + "epoch": 0.6315523648039251, + "grad_norm": 0.10060878098011017, + "learning_rate": 0.00017475826237552317, + "loss": 0.1516, + "step": 8753 + }, + { + "epoch": 0.6316245174789855, + "grad_norm": 0.12557385861873627, + "learning_rate": 0.00017475537595612644, + "loss": 0.165, + "step": 8754 + }, + { + "epoch": 0.631696670154046, + "grad_norm": 0.1297946721315384, + "learning_rate": 0.0001747524895367297, + "loss": 0.1414, + "step": 8755 + }, + { + "epoch": 0.6317688228291064, + "grad_norm": 0.12463095784187317, + "learning_rate": 0.00017474960311733296, + "loss": 0.1583, + "step": 8756 + }, + { + "epoch": 0.6318409755041668, + "grad_norm": 0.11979036033153534, + "learning_rate": 0.0001747467166979362, + "loss": 0.1665, + "step": 8757 + }, + { + "epoch": 0.6319131281792273, + "grad_norm": 0.12523148953914642, + "learning_rate": 0.0001747438302785395, + "loss": 0.1585, + "step": 8758 + }, + { + "epoch": 0.6319852808542876, + "grad_norm": 0.1275649517774582, + "learning_rate": 0.00017474094385914275, + "loss": 0.1434, + "step": 8759 + }, + { + "epoch": 0.6320574335293481, + "grad_norm": 0.1115640178322792, + "learning_rate": 0.000174738057439746, + "loss": 0.1405, + "step": 8760 + }, + { + "epoch": 0.6321295862044085, + "grad_norm": 0.11878620833158493, + "learning_rate": 0.00017473517102034928, + "loss": 0.1416, + "step": 8761 + }, + { + "epoch": 0.632201738879469, + "grad_norm": 0.1362275630235672, + "learning_rate": 0.0001747322846009525, + "loss": 0.159, + "step": 8762 + }, + { + "epoch": 0.6322738915545294, + "grad_norm": 0.13509535789489746, + "learning_rate": 0.00017472939818155578, + "loss": 0.1644, + "step": 8763 + }, + { + "epoch": 0.6323460442295898, + "grad_norm": 0.12418070435523987, + "learning_rate": 0.00017472651176215904, + "loss": 0.1594, + "step": 8764 + }, + { + "epoch": 0.6324181969046503, + "grad_norm": 0.12860502302646637, + "learning_rate": 0.00017472362534276233, + "loss": 0.1132, + "step": 8765 + }, + { + "epoch": 0.6324903495797106, + "grad_norm": 0.14515362679958344, + "learning_rate": 0.0001747207389233656, + "loss": 0.1654, + "step": 8766 + }, + { + "epoch": 0.6325625022547711, + "grad_norm": 0.12168245762586594, + "learning_rate": 0.00017471785250396883, + "loss": 0.1178, + "step": 8767 + }, + { + "epoch": 0.6326346549298315, + "grad_norm": 0.13350696861743927, + "learning_rate": 0.0001747149660845721, + "loss": 0.1459, + "step": 8768 + }, + { + "epoch": 0.632706807604892, + "grad_norm": 0.12162505090236664, + "learning_rate": 0.00017471207966517535, + "loss": 0.1625, + "step": 8769 + }, + { + "epoch": 0.6327789602799524, + "grad_norm": 0.1261981874704361, + "learning_rate": 0.00017470919324577862, + "loss": 0.1498, + "step": 8770 + }, + { + "epoch": 0.6328511129550128, + "grad_norm": 0.12764649093151093, + "learning_rate": 0.00017470630682638188, + "loss": 0.1942, + "step": 8771 + }, + { + "epoch": 0.6329232656300733, + "grad_norm": 0.12060592323541641, + "learning_rate": 0.00017470342040698514, + "loss": 0.1494, + "step": 8772 + }, + { + "epoch": 0.6329954183051336, + "grad_norm": 0.12116419523954391, + "learning_rate": 0.0001747005339875884, + "loss": 0.1392, + "step": 8773 + }, + { + "epoch": 0.6330675709801941, + "grad_norm": 0.12815189361572266, + "learning_rate": 0.00017469764756819167, + "loss": 0.1444, + "step": 8774 + }, + { + "epoch": 0.6331397236552545, + "grad_norm": 0.15026380121707916, + "learning_rate": 0.00017469476114879493, + "loss": 0.1727, + "step": 8775 + }, + { + "epoch": 0.633211876330315, + "grad_norm": 0.17085008323192596, + "learning_rate": 0.0001746918747293982, + "loss": 0.1534, + "step": 8776 + }, + { + "epoch": 0.6332840290053754, + "grad_norm": 0.14156273007392883, + "learning_rate": 0.00017468898831000146, + "loss": 0.1646, + "step": 8777 + }, + { + "epoch": 0.6333561816804358, + "grad_norm": 0.13254617154598236, + "learning_rate": 0.0001746861018906047, + "loss": 0.1259, + "step": 8778 + }, + { + "epoch": 0.6334283343554963, + "grad_norm": 0.1390049159526825, + "learning_rate": 0.00017468321547120798, + "loss": 0.1702, + "step": 8779 + }, + { + "epoch": 0.6335004870305566, + "grad_norm": 0.1458684653043747, + "learning_rate": 0.00017468032905181124, + "loss": 0.1466, + "step": 8780 + }, + { + "epoch": 0.6335726397056171, + "grad_norm": 0.1377117782831192, + "learning_rate": 0.0001746774426324145, + "loss": 0.1531, + "step": 8781 + }, + { + "epoch": 0.6336447923806775, + "grad_norm": 0.11346470564603806, + "learning_rate": 0.00017467455621301777, + "loss": 0.1724, + "step": 8782 + }, + { + "epoch": 0.633716945055738, + "grad_norm": 0.1253042221069336, + "learning_rate": 0.000174671669793621, + "loss": 0.1973, + "step": 8783 + }, + { + "epoch": 0.6337890977307984, + "grad_norm": 0.11717258393764496, + "learning_rate": 0.00017466878337422427, + "loss": 0.1256, + "step": 8784 + }, + { + "epoch": 0.6338612504058588, + "grad_norm": 0.11658114939928055, + "learning_rate": 0.00017466589695482753, + "loss": 0.1705, + "step": 8785 + }, + { + "epoch": 0.6339334030809193, + "grad_norm": 0.11756215244531631, + "learning_rate": 0.00017466301053543082, + "loss": 0.1484, + "step": 8786 + }, + { + "epoch": 0.6340055557559796, + "grad_norm": 0.11114584654569626, + "learning_rate": 0.00017466012411603409, + "loss": 0.091, + "step": 8787 + }, + { + "epoch": 0.63407770843104, + "grad_norm": 0.09902235120534897, + "learning_rate": 0.00017465723769663732, + "loss": 0.1089, + "step": 8788 + }, + { + "epoch": 0.6341498611061005, + "grad_norm": 0.14965693652629852, + "learning_rate": 0.00017465435127724058, + "loss": 0.1519, + "step": 8789 + }, + { + "epoch": 0.6342220137811609, + "grad_norm": 0.11054068803787231, + "learning_rate": 0.00017465146485784385, + "loss": 0.1564, + "step": 8790 + }, + { + "epoch": 0.6342941664562214, + "grad_norm": 0.10102395713329315, + "learning_rate": 0.0001746485784384471, + "loss": 0.1928, + "step": 8791 + }, + { + "epoch": 0.6343663191312818, + "grad_norm": 0.11846046894788742, + "learning_rate": 0.00017464569201905037, + "loss": 0.1286, + "step": 8792 + }, + { + "epoch": 0.6344384718063423, + "grad_norm": 0.13022750616073608, + "learning_rate": 0.00017464280559965364, + "loss": 0.1428, + "step": 8793 + }, + { + "epoch": 0.6345106244814026, + "grad_norm": 0.11548298597335815, + "learning_rate": 0.0001746399191802569, + "loss": 0.1116, + "step": 8794 + }, + { + "epoch": 0.634582777156463, + "grad_norm": 0.12897050380706787, + "learning_rate": 0.00017463703276086016, + "loss": 0.1657, + "step": 8795 + }, + { + "epoch": 0.6346549298315235, + "grad_norm": 0.11598154902458191, + "learning_rate": 0.00017463414634146342, + "loss": 0.1812, + "step": 8796 + }, + { + "epoch": 0.6347270825065839, + "grad_norm": 0.1394772082567215, + "learning_rate": 0.0001746312599220667, + "loss": 0.1886, + "step": 8797 + }, + { + "epoch": 0.6347992351816444, + "grad_norm": 0.1729123890399933, + "learning_rate": 0.00017462837350266995, + "loss": 0.1067, + "step": 8798 + }, + { + "epoch": 0.6348713878567048, + "grad_norm": 0.12441124767065048, + "learning_rate": 0.00017462548708327319, + "loss": 0.2004, + "step": 8799 + }, + { + "epoch": 0.6349435405317652, + "grad_norm": 0.13342799246311188, + "learning_rate": 0.00017462260066387648, + "loss": 0.1619, + "step": 8800 + }, + { + "epoch": 0.6350156932068256, + "grad_norm": 0.13017302751541138, + "learning_rate": 0.00017461971424447974, + "loss": 0.1307, + "step": 8801 + }, + { + "epoch": 0.635087845881886, + "grad_norm": 0.09042824059724808, + "learning_rate": 0.000174616827825083, + "loss": 0.1751, + "step": 8802 + }, + { + "epoch": 0.6351599985569465, + "grad_norm": 0.13923433423042297, + "learning_rate": 0.00017461394140568626, + "loss": 0.1526, + "step": 8803 + }, + { + "epoch": 0.6352321512320069, + "grad_norm": 0.12554508447647095, + "learning_rate": 0.0001746110549862895, + "loss": 0.1528, + "step": 8804 + }, + { + "epoch": 0.6353043039070674, + "grad_norm": 0.11458244919776917, + "learning_rate": 0.00017460816856689276, + "loss": 0.1277, + "step": 8805 + }, + { + "epoch": 0.6353764565821278, + "grad_norm": 0.1285858452320099, + "learning_rate": 0.00017460528214749603, + "loss": 0.1692, + "step": 8806 + }, + { + "epoch": 0.6354486092571882, + "grad_norm": 0.13324186205863953, + "learning_rate": 0.00017460239572809932, + "loss": 0.175, + "step": 8807 + }, + { + "epoch": 0.6355207619322486, + "grad_norm": 0.12661926448345184, + "learning_rate": 0.00017459950930870258, + "loss": 0.1376, + "step": 8808 + }, + { + "epoch": 0.635592914607309, + "grad_norm": 0.11739029735326767, + "learning_rate": 0.00017459662288930582, + "loss": 0.1313, + "step": 8809 + }, + { + "epoch": 0.6356650672823695, + "grad_norm": 0.10466552525758743, + "learning_rate": 0.00017459373646990908, + "loss": 0.1541, + "step": 8810 + }, + { + "epoch": 0.6357372199574299, + "grad_norm": 0.12793004512786865, + "learning_rate": 0.00017459085005051234, + "loss": 0.2179, + "step": 8811 + }, + { + "epoch": 0.6358093726324904, + "grad_norm": 0.10991460084915161, + "learning_rate": 0.0001745879636311156, + "loss": 0.1911, + "step": 8812 + }, + { + "epoch": 0.6358815253075508, + "grad_norm": 0.12637649476528168, + "learning_rate": 0.00017458507721171887, + "loss": 0.1155, + "step": 8813 + }, + { + "epoch": 0.6359536779826112, + "grad_norm": 0.10572929680347443, + "learning_rate": 0.00017458219079232213, + "loss": 0.0975, + "step": 8814 + }, + { + "epoch": 0.6360258306576716, + "grad_norm": 0.11248910427093506, + "learning_rate": 0.0001745793043729254, + "loss": 0.1795, + "step": 8815 + }, + { + "epoch": 0.636097983332732, + "grad_norm": 0.1279258131980896, + "learning_rate": 0.00017457641795352866, + "loss": 0.1528, + "step": 8816 + }, + { + "epoch": 0.6361701360077925, + "grad_norm": 0.10839273780584335, + "learning_rate": 0.00017457353153413192, + "loss": 0.1751, + "step": 8817 + }, + { + "epoch": 0.6362422886828529, + "grad_norm": 0.13171249628067017, + "learning_rate": 0.00017457064511473518, + "loss": 0.1549, + "step": 8818 + }, + { + "epoch": 0.6363144413579134, + "grad_norm": 0.11827006936073303, + "learning_rate": 0.00017456775869533844, + "loss": 0.1295, + "step": 8819 + }, + { + "epoch": 0.6363865940329738, + "grad_norm": 0.11012952774763107, + "learning_rate": 0.00017456487227594168, + "loss": 0.1051, + "step": 8820 + }, + { + "epoch": 0.6364587467080342, + "grad_norm": 0.11319731920957565, + "learning_rate": 0.00017456198585654497, + "loss": 0.1582, + "step": 8821 + }, + { + "epoch": 0.6365308993830946, + "grad_norm": 0.10944925248622894, + "learning_rate": 0.00017455909943714823, + "loss": 0.1749, + "step": 8822 + }, + { + "epoch": 0.636603052058155, + "grad_norm": 0.10941876471042633, + "learning_rate": 0.0001745562130177515, + "loss": 0.1563, + "step": 8823 + }, + { + "epoch": 0.6366752047332155, + "grad_norm": 0.1071227639913559, + "learning_rate": 0.00017455332659835476, + "loss": 0.1394, + "step": 8824 + }, + { + "epoch": 0.6367473574082759, + "grad_norm": 0.11995568871498108, + "learning_rate": 0.000174550440178958, + "loss": 0.1243, + "step": 8825 + }, + { + "epoch": 0.6368195100833364, + "grad_norm": 0.12740518152713776, + "learning_rate": 0.00017454755375956126, + "loss": 0.1442, + "step": 8826 + }, + { + "epoch": 0.6368916627583968, + "grad_norm": 0.11265741288661957, + "learning_rate": 0.00017454466734016452, + "loss": 0.1692, + "step": 8827 + }, + { + "epoch": 0.6369638154334571, + "grad_norm": 0.1335940957069397, + "learning_rate": 0.0001745417809207678, + "loss": 0.0964, + "step": 8828 + }, + { + "epoch": 0.6370359681085176, + "grad_norm": 0.12412536889314651, + "learning_rate": 0.00017453889450137107, + "loss": 0.1094, + "step": 8829 + }, + { + "epoch": 0.637108120783578, + "grad_norm": 0.11655788123607635, + "learning_rate": 0.0001745360080819743, + "loss": 0.1524, + "step": 8830 + }, + { + "epoch": 0.6371802734586385, + "grad_norm": 0.11918830871582031, + "learning_rate": 0.00017453312166257757, + "loss": 0.1182, + "step": 8831 + }, + { + "epoch": 0.6372524261336989, + "grad_norm": 0.14078155159950256, + "learning_rate": 0.00017453023524318084, + "loss": 0.1709, + "step": 8832 + }, + { + "epoch": 0.6373245788087594, + "grad_norm": 0.11668020486831665, + "learning_rate": 0.0001745273488237841, + "loss": 0.0906, + "step": 8833 + }, + { + "epoch": 0.6373967314838198, + "grad_norm": 0.10997610539197922, + "learning_rate": 0.00017452446240438736, + "loss": 0.155, + "step": 8834 + }, + { + "epoch": 0.6374688841588801, + "grad_norm": 0.11578045785427094, + "learning_rate": 0.00017452157598499062, + "loss": 0.1799, + "step": 8835 + }, + { + "epoch": 0.6375410368339406, + "grad_norm": 0.12486160546541214, + "learning_rate": 0.0001745186895655939, + "loss": 0.1332, + "step": 8836 + }, + { + "epoch": 0.637613189509001, + "grad_norm": 0.17098915576934814, + "learning_rate": 0.00017451580314619715, + "loss": 0.1367, + "step": 8837 + }, + { + "epoch": 0.6376853421840615, + "grad_norm": 0.13832706212997437, + "learning_rate": 0.0001745129167268004, + "loss": 0.1714, + "step": 8838 + }, + { + "epoch": 0.6377574948591219, + "grad_norm": 0.17727001011371613, + "learning_rate": 0.00017451003030740368, + "loss": 0.1732, + "step": 8839 + }, + { + "epoch": 0.6378296475341824, + "grad_norm": 0.1226031705737114, + "learning_rate": 0.00017450714388800694, + "loss": 0.1622, + "step": 8840 + }, + { + "epoch": 0.6379018002092428, + "grad_norm": 0.11801004409790039, + "learning_rate": 0.0001745042574686102, + "loss": 0.1565, + "step": 8841 + }, + { + "epoch": 0.6379739528843031, + "grad_norm": 0.14041762053966522, + "learning_rate": 0.00017450137104921346, + "loss": 0.162, + "step": 8842 + }, + { + "epoch": 0.6380461055593636, + "grad_norm": 0.10397140681743622, + "learning_rate": 0.00017449848462981673, + "loss": 0.1237, + "step": 8843 + }, + { + "epoch": 0.638118258234424, + "grad_norm": 0.1438121199607849, + "learning_rate": 0.00017449559821042, + "loss": 0.1204, + "step": 8844 + }, + { + "epoch": 0.6381904109094845, + "grad_norm": 0.1460319459438324, + "learning_rate": 0.00017449271179102325, + "loss": 0.181, + "step": 8845 + }, + { + "epoch": 0.6382625635845449, + "grad_norm": 0.10840224474668503, + "learning_rate": 0.00017448982537162652, + "loss": 0.1114, + "step": 8846 + }, + { + "epoch": 0.6383347162596054, + "grad_norm": 0.15132613480091095, + "learning_rate": 0.00017448693895222975, + "loss": 0.1205, + "step": 8847 + }, + { + "epoch": 0.6384068689346658, + "grad_norm": 0.0996752679347992, + "learning_rate": 0.00017448405253283302, + "loss": 0.1632, + "step": 8848 + }, + { + "epoch": 0.6384790216097261, + "grad_norm": 0.131254181265831, + "learning_rate": 0.0001744811661134363, + "loss": 0.1482, + "step": 8849 + }, + { + "epoch": 0.6385511742847866, + "grad_norm": 0.11335877329111099, + "learning_rate": 0.00017447827969403957, + "loss": 0.1318, + "step": 8850 + }, + { + "epoch": 0.638623326959847, + "grad_norm": 0.131666362285614, + "learning_rate": 0.00017447539327464283, + "loss": 0.1272, + "step": 8851 + }, + { + "epoch": 0.6386954796349075, + "grad_norm": 0.12124770134687424, + "learning_rate": 0.00017447250685524607, + "loss": 0.2281, + "step": 8852 + }, + { + "epoch": 0.6387676323099679, + "grad_norm": 0.11714229732751846, + "learning_rate": 0.00017446962043584933, + "loss": 0.1716, + "step": 8853 + }, + { + "epoch": 0.6388397849850284, + "grad_norm": 0.13586623966693878, + "learning_rate": 0.0001744667340164526, + "loss": 0.1225, + "step": 8854 + }, + { + "epoch": 0.6389119376600888, + "grad_norm": 0.18940232694149017, + "learning_rate": 0.00017446384759705586, + "loss": 0.1713, + "step": 8855 + }, + { + "epoch": 0.6389840903351491, + "grad_norm": 0.13319925963878632, + "learning_rate": 0.00017446096117765915, + "loss": 0.1439, + "step": 8856 + }, + { + "epoch": 0.6390562430102096, + "grad_norm": 0.10407208651304245, + "learning_rate": 0.00017445807475826238, + "loss": 0.1557, + "step": 8857 + }, + { + "epoch": 0.63912839568527, + "grad_norm": 0.11361634731292725, + "learning_rate": 0.00017445518833886564, + "loss": 0.1159, + "step": 8858 + }, + { + "epoch": 0.6392005483603305, + "grad_norm": 0.1038358137011528, + "learning_rate": 0.0001744523019194689, + "loss": 0.143, + "step": 8859 + }, + { + "epoch": 0.6392727010353909, + "grad_norm": 0.10394181311130524, + "learning_rate": 0.00017444941550007217, + "loss": 0.0907, + "step": 8860 + }, + { + "epoch": 0.6393448537104514, + "grad_norm": 0.12115704268217087, + "learning_rate": 0.00017444652908067543, + "loss": 0.1268, + "step": 8861 + }, + { + "epoch": 0.6394170063855117, + "grad_norm": 0.1325000524520874, + "learning_rate": 0.0001744436426612787, + "loss": 0.1396, + "step": 8862 + }, + { + "epoch": 0.6394891590605721, + "grad_norm": 0.14049799740314484, + "learning_rate": 0.00017444075624188196, + "loss": 0.1837, + "step": 8863 + }, + { + "epoch": 0.6395613117356326, + "grad_norm": 0.1779290735721588, + "learning_rate": 0.00017443786982248522, + "loss": 0.1752, + "step": 8864 + }, + { + "epoch": 0.639633464410693, + "grad_norm": 0.12939026951789856, + "learning_rate": 0.00017443498340308848, + "loss": 0.0974, + "step": 8865 + }, + { + "epoch": 0.6397056170857535, + "grad_norm": 0.11895648390054703, + "learning_rate": 0.00017443209698369175, + "loss": 0.1321, + "step": 8866 + }, + { + "epoch": 0.6397777697608139, + "grad_norm": 0.13495993614196777, + "learning_rate": 0.000174429210564295, + "loss": 0.1731, + "step": 8867 + }, + { + "epoch": 0.6398499224358744, + "grad_norm": 0.12926140427589417, + "learning_rate": 0.00017442632414489825, + "loss": 0.1683, + "step": 8868 + }, + { + "epoch": 0.6399220751109347, + "grad_norm": 0.12302546203136444, + "learning_rate": 0.0001744234377255015, + "loss": 0.1318, + "step": 8869 + }, + { + "epoch": 0.6399942277859951, + "grad_norm": 0.12506909668445587, + "learning_rate": 0.0001744205513061048, + "loss": 0.178, + "step": 8870 + }, + { + "epoch": 0.6400663804610556, + "grad_norm": 0.15225814282894135, + "learning_rate": 0.00017441766488670806, + "loss": 0.1534, + "step": 8871 + }, + { + "epoch": 0.640138533136116, + "grad_norm": 0.12990593910217285, + "learning_rate": 0.00017441477846731133, + "loss": 0.1337, + "step": 8872 + }, + { + "epoch": 0.6402106858111765, + "grad_norm": 0.1329309344291687, + "learning_rate": 0.00017441189204791456, + "loss": 0.1332, + "step": 8873 + }, + { + "epoch": 0.6402828384862369, + "grad_norm": 0.1620033234357834, + "learning_rate": 0.00017440900562851782, + "loss": 0.1796, + "step": 8874 + }, + { + "epoch": 0.6403549911612973, + "grad_norm": 0.11278960108757019, + "learning_rate": 0.0001744061192091211, + "loss": 0.1394, + "step": 8875 + }, + { + "epoch": 0.6404271438363577, + "grad_norm": 0.12775515019893646, + "learning_rate": 0.00017440323278972435, + "loss": 0.1686, + "step": 8876 + }, + { + "epoch": 0.6404992965114181, + "grad_norm": 0.13999012112617493, + "learning_rate": 0.00017440034637032764, + "loss": 0.1519, + "step": 8877 + }, + { + "epoch": 0.6405714491864786, + "grad_norm": 0.13053064048290253, + "learning_rate": 0.00017439745995093088, + "loss": 0.1398, + "step": 8878 + }, + { + "epoch": 0.640643601861539, + "grad_norm": 0.11259409785270691, + "learning_rate": 0.00017439457353153414, + "loss": 0.1681, + "step": 8879 + }, + { + "epoch": 0.6407157545365995, + "grad_norm": 0.12789352238178253, + "learning_rate": 0.0001743916871121374, + "loss": 0.1797, + "step": 8880 + }, + { + "epoch": 0.6407879072116599, + "grad_norm": 0.11534351855516434, + "learning_rate": 0.00017438880069274066, + "loss": 0.1227, + "step": 8881 + }, + { + "epoch": 0.6408600598867203, + "grad_norm": 0.16358792781829834, + "learning_rate": 0.00017438591427334393, + "loss": 0.1178, + "step": 8882 + }, + { + "epoch": 0.6409322125617807, + "grad_norm": 0.1316683292388916, + "learning_rate": 0.0001743830278539472, + "loss": 0.1611, + "step": 8883 + }, + { + "epoch": 0.6410043652368411, + "grad_norm": 0.14574764668941498, + "learning_rate": 0.00017438014143455045, + "loss": 0.1929, + "step": 8884 + }, + { + "epoch": 0.6410765179119016, + "grad_norm": 0.12572930753231049, + "learning_rate": 0.00017437725501515372, + "loss": 0.1585, + "step": 8885 + }, + { + "epoch": 0.641148670586962, + "grad_norm": 0.17408978939056396, + "learning_rate": 0.00017437436859575698, + "loss": 0.1606, + "step": 8886 + }, + { + "epoch": 0.6412208232620225, + "grad_norm": 0.10904928296804428, + "learning_rate": 0.00017437148217636024, + "loss": 0.1288, + "step": 8887 + }, + { + "epoch": 0.6412929759370829, + "grad_norm": 0.1295899897813797, + "learning_rate": 0.0001743685957569635, + "loss": 0.1674, + "step": 8888 + }, + { + "epoch": 0.6413651286121433, + "grad_norm": 0.13774964213371277, + "learning_rate": 0.00017436570933756674, + "loss": 0.1628, + "step": 8889 + }, + { + "epoch": 0.6414372812872037, + "grad_norm": 0.10904145985841751, + "learning_rate": 0.00017436282291817, + "loss": 0.1532, + "step": 8890 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.17066548764705658, + "learning_rate": 0.0001743599364987733, + "loss": 0.1818, + "step": 8891 + }, + { + "epoch": 0.6415815866373246, + "grad_norm": 0.09667755663394928, + "learning_rate": 0.00017435705007937656, + "loss": 0.1036, + "step": 8892 + }, + { + "epoch": 0.641653739312385, + "grad_norm": 0.09876111894845963, + "learning_rate": 0.00017435416365997982, + "loss": 0.1071, + "step": 8893 + }, + { + "epoch": 0.6417258919874455, + "grad_norm": 0.12817133963108063, + "learning_rate": 0.00017435127724058306, + "loss": 0.152, + "step": 8894 + }, + { + "epoch": 0.6417980446625059, + "grad_norm": 0.12401299178600311, + "learning_rate": 0.00017434839082118632, + "loss": 0.1233, + "step": 8895 + }, + { + "epoch": 0.6418701973375663, + "grad_norm": 0.1025814563035965, + "learning_rate": 0.00017434550440178958, + "loss": 0.1495, + "step": 8896 + }, + { + "epoch": 0.6419423500126267, + "grad_norm": 0.15205642580986023, + "learning_rate": 0.00017434261798239284, + "loss": 0.1213, + "step": 8897 + }, + { + "epoch": 0.6420145026876871, + "grad_norm": 0.11617343127727509, + "learning_rate": 0.00017433973156299613, + "loss": 0.19, + "step": 8898 + }, + { + "epoch": 0.6420866553627476, + "grad_norm": 0.1051785945892334, + "learning_rate": 0.00017433684514359937, + "loss": 0.0935, + "step": 8899 + }, + { + "epoch": 0.642158808037808, + "grad_norm": 0.13081860542297363, + "learning_rate": 0.00017433395872420263, + "loss": 0.1629, + "step": 8900 + }, + { + "epoch": 0.6422309607128684, + "grad_norm": 0.10840879380702972, + "learning_rate": 0.0001743310723048059, + "loss": 0.1534, + "step": 8901 + }, + { + "epoch": 0.6423031133879289, + "grad_norm": 0.1379835158586502, + "learning_rate": 0.00017432818588540916, + "loss": 0.1478, + "step": 8902 + }, + { + "epoch": 0.6423752660629893, + "grad_norm": 0.14373107254505157, + "learning_rate": 0.00017432529946601242, + "loss": 0.142, + "step": 8903 + }, + { + "epoch": 0.6424474187380497, + "grad_norm": 0.13795924186706543, + "learning_rate": 0.00017432241304661568, + "loss": 0.131, + "step": 8904 + }, + { + "epoch": 0.6425195714131101, + "grad_norm": 0.13038359582424164, + "learning_rate": 0.00017431952662721895, + "loss": 0.2, + "step": 8905 + }, + { + "epoch": 0.6425917240881706, + "grad_norm": 0.13648727536201477, + "learning_rate": 0.0001743166402078222, + "loss": 0.116, + "step": 8906 + }, + { + "epoch": 0.642663876763231, + "grad_norm": 0.1238555833697319, + "learning_rate": 0.00017431375378842547, + "loss": 0.1402, + "step": 8907 + }, + { + "epoch": 0.6427360294382914, + "grad_norm": 0.09869746118783951, + "learning_rate": 0.00017431086736902874, + "loss": 0.1483, + "step": 8908 + }, + { + "epoch": 0.6428081821133519, + "grad_norm": 0.13061855733394623, + "learning_rate": 0.000174307980949632, + "loss": 0.1738, + "step": 8909 + }, + { + "epoch": 0.6428803347884123, + "grad_norm": 0.12091786414384842, + "learning_rate": 0.00017430509453023523, + "loss": 0.126, + "step": 8910 + }, + { + "epoch": 0.6429524874634727, + "grad_norm": 0.13954919576644897, + "learning_rate": 0.0001743022081108385, + "loss": 0.1315, + "step": 8911 + }, + { + "epoch": 0.6430246401385331, + "grad_norm": 0.1259092539548874, + "learning_rate": 0.0001742993216914418, + "loss": 0.1744, + "step": 8912 + }, + { + "epoch": 0.6430967928135936, + "grad_norm": 0.0919126346707344, + "learning_rate": 0.00017429643527204505, + "loss": 0.139, + "step": 8913 + }, + { + "epoch": 0.643168945488654, + "grad_norm": 0.11809264123439789, + "learning_rate": 0.00017429354885264831, + "loss": 0.2004, + "step": 8914 + }, + { + "epoch": 0.6432410981637144, + "grad_norm": 0.1017296239733696, + "learning_rate": 0.00017429066243325155, + "loss": 0.1515, + "step": 8915 + }, + { + "epoch": 0.6433132508387749, + "grad_norm": 0.10862316936254501, + "learning_rate": 0.0001742877760138548, + "loss": 0.1471, + "step": 8916 + }, + { + "epoch": 0.6433854035138353, + "grad_norm": 0.1337960660457611, + "learning_rate": 0.00017428488959445808, + "loss": 0.1852, + "step": 8917 + }, + { + "epoch": 0.6434575561888957, + "grad_norm": 0.12023504823446274, + "learning_rate": 0.00017428200317506134, + "loss": 0.2022, + "step": 8918 + }, + { + "epoch": 0.6435297088639561, + "grad_norm": 0.1266653686761856, + "learning_rate": 0.0001742791167556646, + "loss": 0.1578, + "step": 8919 + }, + { + "epoch": 0.6436018615390166, + "grad_norm": 0.09703958034515381, + "learning_rate": 0.00017427623033626786, + "loss": 0.1417, + "step": 8920 + }, + { + "epoch": 0.643674014214077, + "grad_norm": 0.10218293219804764, + "learning_rate": 0.00017427334391687113, + "loss": 0.1621, + "step": 8921 + }, + { + "epoch": 0.6437461668891374, + "grad_norm": 0.10678768157958984, + "learning_rate": 0.0001742704574974744, + "loss": 0.122, + "step": 8922 + }, + { + "epoch": 0.6438183195641979, + "grad_norm": 0.1419648379087448, + "learning_rate": 0.00017426757107807765, + "loss": 0.2476, + "step": 8923 + }, + { + "epoch": 0.6438904722392582, + "grad_norm": 0.10171731561422348, + "learning_rate": 0.00017426468465868092, + "loss": 0.1387, + "step": 8924 + }, + { + "epoch": 0.6439626249143187, + "grad_norm": 0.1244988664984703, + "learning_rate": 0.00017426179823928418, + "loss": 0.1465, + "step": 8925 + }, + { + "epoch": 0.6440347775893791, + "grad_norm": 0.09062091261148453, + "learning_rate": 0.00017425891181988741, + "loss": 0.1233, + "step": 8926 + }, + { + "epoch": 0.6441069302644395, + "grad_norm": 0.1125463992357254, + "learning_rate": 0.0001742560254004907, + "loss": 0.1311, + "step": 8927 + }, + { + "epoch": 0.6441790829395, + "grad_norm": 0.12110617756843567, + "learning_rate": 0.00017425313898109397, + "loss": 0.1166, + "step": 8928 + }, + { + "epoch": 0.6442512356145604, + "grad_norm": 0.153212770819664, + "learning_rate": 0.00017425025256169723, + "loss": 0.1534, + "step": 8929 + }, + { + "epoch": 0.6443233882896209, + "grad_norm": 0.15019434690475464, + "learning_rate": 0.0001742473661423005, + "loss": 0.1215, + "step": 8930 + }, + { + "epoch": 0.6443955409646812, + "grad_norm": 0.13318707048892975, + "learning_rate": 0.00017424447972290373, + "loss": 0.1641, + "step": 8931 + }, + { + "epoch": 0.6444676936397417, + "grad_norm": 0.1360812485218048, + "learning_rate": 0.000174241593303507, + "loss": 0.1578, + "step": 8932 + }, + { + "epoch": 0.6445398463148021, + "grad_norm": 0.15063872933387756, + "learning_rate": 0.00017423870688411025, + "loss": 0.1781, + "step": 8933 + }, + { + "epoch": 0.6446119989898625, + "grad_norm": 0.11997667700052261, + "learning_rate": 0.00017423582046471355, + "loss": 0.1412, + "step": 8934 + }, + { + "epoch": 0.644684151664923, + "grad_norm": 0.11618088185787201, + "learning_rate": 0.0001742329340453168, + "loss": 0.1667, + "step": 8935 + }, + { + "epoch": 0.6447563043399834, + "grad_norm": 0.1366327404975891, + "learning_rate": 0.00017423004762592004, + "loss": 0.1746, + "step": 8936 + }, + { + "epoch": 0.6448284570150439, + "grad_norm": 0.11012442409992218, + "learning_rate": 0.0001742271612065233, + "loss": 0.1424, + "step": 8937 + }, + { + "epoch": 0.6449006096901042, + "grad_norm": 0.10074211657047272, + "learning_rate": 0.00017422427478712657, + "loss": 0.1463, + "step": 8938 + }, + { + "epoch": 0.6449727623651647, + "grad_norm": 0.09732308983802795, + "learning_rate": 0.00017422138836772983, + "loss": 0.2021, + "step": 8939 + }, + { + "epoch": 0.6450449150402251, + "grad_norm": 0.1292310208082199, + "learning_rate": 0.0001742185019483331, + "loss": 0.158, + "step": 8940 + }, + { + "epoch": 0.6451170677152855, + "grad_norm": 0.11873973906040192, + "learning_rate": 0.00017421561552893636, + "loss": 0.1379, + "step": 8941 + }, + { + "epoch": 0.645189220390346, + "grad_norm": 0.12597502768039703, + "learning_rate": 0.00017421272910953962, + "loss": 0.1949, + "step": 8942 + }, + { + "epoch": 0.6452613730654064, + "grad_norm": 0.1288425624370575, + "learning_rate": 0.00017420984269014288, + "loss": 0.1049, + "step": 8943 + }, + { + "epoch": 0.6453335257404669, + "grad_norm": 0.11511608958244324, + "learning_rate": 0.00017420695627074615, + "loss": 0.185, + "step": 8944 + }, + { + "epoch": 0.6454056784155272, + "grad_norm": 0.11685147881507874, + "learning_rate": 0.0001742040698513494, + "loss": 0.1677, + "step": 8945 + }, + { + "epoch": 0.6454778310905877, + "grad_norm": 0.13405561447143555, + "learning_rate": 0.00017420118343195267, + "loss": 0.1731, + "step": 8946 + }, + { + "epoch": 0.6455499837656481, + "grad_norm": 0.11815590411424637, + "learning_rate": 0.0001741982970125559, + "loss": 0.1125, + "step": 8947 + }, + { + "epoch": 0.6456221364407085, + "grad_norm": 0.15212565660476685, + "learning_rate": 0.0001741954105931592, + "loss": 0.1492, + "step": 8948 + }, + { + "epoch": 0.645694289115769, + "grad_norm": 0.13262511789798737, + "learning_rate": 0.00017419252417376246, + "loss": 0.1658, + "step": 8949 + }, + { + "epoch": 0.6457664417908294, + "grad_norm": 0.12056512385606766, + "learning_rate": 0.00017418963775436572, + "loss": 0.1551, + "step": 8950 + }, + { + "epoch": 0.6458385944658899, + "grad_norm": 0.14746679365634918, + "learning_rate": 0.000174186751334969, + "loss": 0.1721, + "step": 8951 + }, + { + "epoch": 0.6459107471409502, + "grad_norm": 0.11994044482707977, + "learning_rate": 0.00017418386491557222, + "loss": 0.1043, + "step": 8952 + }, + { + "epoch": 0.6459828998160106, + "grad_norm": 0.1168675571680069, + "learning_rate": 0.00017418097849617549, + "loss": 0.1317, + "step": 8953 + }, + { + "epoch": 0.6460550524910711, + "grad_norm": 0.14804407954216003, + "learning_rate": 0.00017417809207677875, + "loss": 0.1971, + "step": 8954 + }, + { + "epoch": 0.6461272051661315, + "grad_norm": 0.13889257609844208, + "learning_rate": 0.00017417520565738204, + "loss": 0.1625, + "step": 8955 + }, + { + "epoch": 0.646199357841192, + "grad_norm": 0.13769084215164185, + "learning_rate": 0.0001741723192379853, + "loss": 0.1459, + "step": 8956 + }, + { + "epoch": 0.6462715105162524, + "grad_norm": 0.13194069266319275, + "learning_rate": 0.00017416943281858854, + "loss": 0.1474, + "step": 8957 + }, + { + "epoch": 0.6463436631913129, + "grad_norm": 0.14091116189956665, + "learning_rate": 0.0001741665463991918, + "loss": 0.1615, + "step": 8958 + }, + { + "epoch": 0.6464158158663732, + "grad_norm": 0.15515783429145813, + "learning_rate": 0.00017416365997979506, + "loss": 0.1721, + "step": 8959 + }, + { + "epoch": 0.6464879685414336, + "grad_norm": 0.12409847974777222, + "learning_rate": 0.00017416077356039833, + "loss": 0.1612, + "step": 8960 + }, + { + "epoch": 0.6465601212164941, + "grad_norm": 0.12766936421394348, + "learning_rate": 0.0001741578871410016, + "loss": 0.0999, + "step": 8961 + }, + { + "epoch": 0.6466322738915545, + "grad_norm": 0.11388033628463745, + "learning_rate": 0.00017415500072160488, + "loss": 0.156, + "step": 8962 + }, + { + "epoch": 0.646704426566615, + "grad_norm": 0.12239213287830353, + "learning_rate": 0.00017415211430220812, + "loss": 0.1523, + "step": 8963 + }, + { + "epoch": 0.6467765792416754, + "grad_norm": 0.11749175935983658, + "learning_rate": 0.00017414922788281138, + "loss": 0.1223, + "step": 8964 + }, + { + "epoch": 0.6468487319167359, + "grad_norm": 0.15253546833992004, + "learning_rate": 0.00017414634146341464, + "loss": 0.1802, + "step": 8965 + }, + { + "epoch": 0.6469208845917962, + "grad_norm": 0.32105064392089844, + "learning_rate": 0.0001741434550440179, + "loss": 0.1777, + "step": 8966 + }, + { + "epoch": 0.6469930372668566, + "grad_norm": 0.11545057594776154, + "learning_rate": 0.00017414056862462117, + "loss": 0.1665, + "step": 8967 + }, + { + "epoch": 0.6470651899419171, + "grad_norm": 0.13333509862422943, + "learning_rate": 0.00017413768220522443, + "loss": 0.1473, + "step": 8968 + }, + { + "epoch": 0.6471373426169775, + "grad_norm": 0.12783385813236237, + "learning_rate": 0.0001741347957858277, + "loss": 0.1507, + "step": 8969 + }, + { + "epoch": 0.647209495292038, + "grad_norm": 0.1804548054933548, + "learning_rate": 0.00017413190936643096, + "loss": 0.1961, + "step": 8970 + }, + { + "epoch": 0.6472816479670984, + "grad_norm": 0.11268200725317001, + "learning_rate": 0.00017412902294703422, + "loss": 0.1626, + "step": 8971 + }, + { + "epoch": 0.6473538006421589, + "grad_norm": 0.127872034907341, + "learning_rate": 0.00017412613652763748, + "loss": 0.1689, + "step": 8972 + }, + { + "epoch": 0.6474259533172192, + "grad_norm": 0.15813906490802765, + "learning_rate": 0.00017412325010824074, + "loss": 0.1737, + "step": 8973 + }, + { + "epoch": 0.6474981059922796, + "grad_norm": 0.15613089501857758, + "learning_rate": 0.00017412036368884398, + "loss": 0.1417, + "step": 8974 + }, + { + "epoch": 0.6475702586673401, + "grad_norm": 0.16650298237800598, + "learning_rate": 0.00017411747726944724, + "loss": 0.142, + "step": 8975 + }, + { + "epoch": 0.6476424113424005, + "grad_norm": 0.10688599199056625, + "learning_rate": 0.00017411459085005053, + "loss": 0.1724, + "step": 8976 + }, + { + "epoch": 0.647714564017461, + "grad_norm": 0.14942897856235504, + "learning_rate": 0.0001741117044306538, + "loss": 0.1122, + "step": 8977 + }, + { + "epoch": 0.6477867166925214, + "grad_norm": 0.13502287864685059, + "learning_rate": 0.00017410881801125706, + "loss": 0.1792, + "step": 8978 + }, + { + "epoch": 0.6478588693675819, + "grad_norm": 0.10939890146255493, + "learning_rate": 0.0001741059315918603, + "loss": 0.1645, + "step": 8979 + }, + { + "epoch": 0.6479310220426422, + "grad_norm": 0.1340005099773407, + "learning_rate": 0.00017410304517246356, + "loss": 0.169, + "step": 8980 + }, + { + "epoch": 0.6480031747177026, + "grad_norm": 0.11436860263347626, + "learning_rate": 0.00017410015875306682, + "loss": 0.1316, + "step": 8981 + }, + { + "epoch": 0.6480753273927631, + "grad_norm": 0.10513529926538467, + "learning_rate": 0.00017409727233367008, + "loss": 0.1081, + "step": 8982 + }, + { + "epoch": 0.6481474800678235, + "grad_norm": 0.13902191817760468, + "learning_rate": 0.00017409438591427337, + "loss": 0.1259, + "step": 8983 + }, + { + "epoch": 0.648219632742884, + "grad_norm": 0.13189220428466797, + "learning_rate": 0.0001740914994948766, + "loss": 0.1382, + "step": 8984 + }, + { + "epoch": 0.6482917854179444, + "grad_norm": 0.1712450236082077, + "learning_rate": 0.00017408861307547987, + "loss": 0.165, + "step": 8985 + }, + { + "epoch": 0.6483639380930047, + "grad_norm": 0.15805469453334808, + "learning_rate": 0.00017408572665608314, + "loss": 0.1587, + "step": 8986 + }, + { + "epoch": 0.6484360907680652, + "grad_norm": 0.18048928678035736, + "learning_rate": 0.0001740828402366864, + "loss": 0.1628, + "step": 8987 + }, + { + "epoch": 0.6485082434431256, + "grad_norm": 0.14291101694107056, + "learning_rate": 0.00017407995381728966, + "loss": 0.144, + "step": 8988 + }, + { + "epoch": 0.6485803961181861, + "grad_norm": 0.1614530086517334, + "learning_rate": 0.00017407706739789292, + "loss": 0.1113, + "step": 8989 + }, + { + "epoch": 0.6486525487932465, + "grad_norm": 0.14932601153850555, + "learning_rate": 0.0001740741809784962, + "loss": 0.1439, + "step": 8990 + }, + { + "epoch": 0.648724701468307, + "grad_norm": 0.1545875370502472, + "learning_rate": 0.00017407129455909945, + "loss": 0.1403, + "step": 8991 + }, + { + "epoch": 0.6487968541433674, + "grad_norm": 0.14962558448314667, + "learning_rate": 0.0001740684081397027, + "loss": 0.1506, + "step": 8992 + }, + { + "epoch": 0.6488690068184277, + "grad_norm": 0.11427868902683258, + "learning_rate": 0.00017406552172030598, + "loss": 0.1574, + "step": 8993 + }, + { + "epoch": 0.6489411594934882, + "grad_norm": 0.13557592034339905, + "learning_rate": 0.00017406263530090924, + "loss": 0.1226, + "step": 8994 + }, + { + "epoch": 0.6490133121685486, + "grad_norm": 0.12659603357315063, + "learning_rate": 0.00017405974888151247, + "loss": 0.1653, + "step": 8995 + }, + { + "epoch": 0.6490854648436091, + "grad_norm": 0.11794587969779968, + "learning_rate": 0.00017405686246211574, + "loss": 0.1686, + "step": 8996 + }, + { + "epoch": 0.6491576175186695, + "grad_norm": 0.14208902418613434, + "learning_rate": 0.00017405397604271903, + "loss": 0.1705, + "step": 8997 + }, + { + "epoch": 0.64922977019373, + "grad_norm": 0.1434546709060669, + "learning_rate": 0.0001740510896233223, + "loss": 0.1735, + "step": 8998 + }, + { + "epoch": 0.6493019228687904, + "grad_norm": 0.11880837380886078, + "learning_rate": 0.00017404820320392555, + "loss": 0.1294, + "step": 8999 + }, + { + "epoch": 0.6493740755438507, + "grad_norm": 0.11917240172624588, + "learning_rate": 0.0001740453167845288, + "loss": 0.1591, + "step": 9000 + }, + { + "epoch": 0.6494462282189112, + "grad_norm": 0.13792914152145386, + "learning_rate": 0.00017404243036513205, + "loss": 0.1422, + "step": 9001 + }, + { + "epoch": 0.6495183808939716, + "grad_norm": 0.1305783987045288, + "learning_rate": 0.00017403954394573532, + "loss": 0.144, + "step": 9002 + }, + { + "epoch": 0.6495905335690321, + "grad_norm": 0.1063060387969017, + "learning_rate": 0.00017403665752633858, + "loss": 0.154, + "step": 9003 + }, + { + "epoch": 0.6496626862440925, + "grad_norm": 0.11954107135534286, + "learning_rate": 0.00017403377110694187, + "loss": 0.1124, + "step": 9004 + }, + { + "epoch": 0.649734838919153, + "grad_norm": 0.1425926238298416, + "learning_rate": 0.0001740308846875451, + "loss": 0.1338, + "step": 9005 + }, + { + "epoch": 0.6498069915942134, + "grad_norm": 0.11237387359142303, + "learning_rate": 0.00017402799826814837, + "loss": 0.1435, + "step": 9006 + }, + { + "epoch": 0.6498791442692737, + "grad_norm": 0.12347462028265, + "learning_rate": 0.00017402511184875163, + "loss": 0.1599, + "step": 9007 + }, + { + "epoch": 0.6499512969443342, + "grad_norm": 0.10732217133045197, + "learning_rate": 0.0001740222254293549, + "loss": 0.1328, + "step": 9008 + }, + { + "epoch": 0.6500234496193946, + "grad_norm": 0.24294838309288025, + "learning_rate": 0.00017401933900995816, + "loss": 0.148, + "step": 9009 + }, + { + "epoch": 0.6500956022944551, + "grad_norm": 0.10149053484201431, + "learning_rate": 0.00017401645259056142, + "loss": 0.1174, + "step": 9010 + }, + { + "epoch": 0.6501677549695155, + "grad_norm": 0.11952211707830429, + "learning_rate": 0.00017401356617116468, + "loss": 0.1687, + "step": 9011 + }, + { + "epoch": 0.650239907644576, + "grad_norm": 0.13148994743824005, + "learning_rate": 0.00017401067975176794, + "loss": 0.1652, + "step": 9012 + }, + { + "epoch": 0.6503120603196364, + "grad_norm": 0.12399965524673462, + "learning_rate": 0.0001740077933323712, + "loss": 0.1636, + "step": 9013 + }, + { + "epoch": 0.6503842129946967, + "grad_norm": 0.114295594394207, + "learning_rate": 0.00017400490691297447, + "loss": 0.1434, + "step": 9014 + }, + { + "epoch": 0.6504563656697572, + "grad_norm": 0.13758529722690582, + "learning_rate": 0.00017400202049357773, + "loss": 0.1302, + "step": 9015 + }, + { + "epoch": 0.6505285183448176, + "grad_norm": 0.12450076639652252, + "learning_rate": 0.00017399913407418097, + "loss": 0.2087, + "step": 9016 + }, + { + "epoch": 0.6506006710198781, + "grad_norm": 0.12183558940887451, + "learning_rate": 0.00017399624765478423, + "loss": 0.1517, + "step": 9017 + }, + { + "epoch": 0.6506728236949385, + "grad_norm": 0.1189873218536377, + "learning_rate": 0.00017399336123538752, + "loss": 0.1552, + "step": 9018 + }, + { + "epoch": 0.650744976369999, + "grad_norm": 0.12722237408161163, + "learning_rate": 0.00017399047481599078, + "loss": 0.1544, + "step": 9019 + }, + { + "epoch": 0.6508171290450594, + "grad_norm": 0.17670875787734985, + "learning_rate": 0.00017398758839659405, + "loss": 0.1583, + "step": 9020 + }, + { + "epoch": 0.6508892817201197, + "grad_norm": 0.12485279142856598, + "learning_rate": 0.00017398470197719728, + "loss": 0.1588, + "step": 9021 + }, + { + "epoch": 0.6509614343951802, + "grad_norm": 0.15111367404460907, + "learning_rate": 0.00017398181555780055, + "loss": 0.1757, + "step": 9022 + }, + { + "epoch": 0.6510335870702406, + "grad_norm": 0.1123456209897995, + "learning_rate": 0.0001739789291384038, + "loss": 0.1306, + "step": 9023 + }, + { + "epoch": 0.6511057397453011, + "grad_norm": 0.13833723962306976, + "learning_rate": 0.00017397604271900707, + "loss": 0.1329, + "step": 9024 + }, + { + "epoch": 0.6511778924203615, + "grad_norm": 0.11428412050008774, + "learning_rate": 0.00017397315629961036, + "loss": 0.1491, + "step": 9025 + }, + { + "epoch": 0.651250045095422, + "grad_norm": 0.12801140546798706, + "learning_rate": 0.0001739702698802136, + "loss": 0.172, + "step": 9026 + }, + { + "epoch": 0.6513221977704824, + "grad_norm": 0.10705477744340897, + "learning_rate": 0.00017396738346081686, + "loss": 0.1163, + "step": 9027 + }, + { + "epoch": 0.6513943504455427, + "grad_norm": 0.14140252768993378, + "learning_rate": 0.00017396449704142012, + "loss": 0.1914, + "step": 9028 + }, + { + "epoch": 0.6514665031206032, + "grad_norm": 0.11550770699977875, + "learning_rate": 0.0001739616106220234, + "loss": 0.1464, + "step": 9029 + }, + { + "epoch": 0.6515386557956636, + "grad_norm": 0.11913446336984634, + "learning_rate": 0.00017395872420262665, + "loss": 0.1302, + "step": 9030 + }, + { + "epoch": 0.6516108084707241, + "grad_norm": 0.12009747326374054, + "learning_rate": 0.0001739558377832299, + "loss": 0.1749, + "step": 9031 + }, + { + "epoch": 0.6516829611457845, + "grad_norm": 0.11971908062696457, + "learning_rate": 0.00017395295136383318, + "loss": 0.133, + "step": 9032 + }, + { + "epoch": 0.651755113820845, + "grad_norm": 0.13815897703170776, + "learning_rate": 0.00017395006494443644, + "loss": 0.144, + "step": 9033 + }, + { + "epoch": 0.6518272664959054, + "grad_norm": 0.12359744310379028, + "learning_rate": 0.0001739471785250397, + "loss": 0.1585, + "step": 9034 + }, + { + "epoch": 0.6518994191709657, + "grad_norm": 0.13417810201644897, + "learning_rate": 0.00017394429210564296, + "loss": 0.1407, + "step": 9035 + }, + { + "epoch": 0.6519715718460262, + "grad_norm": 0.12911564111709595, + "learning_rate": 0.00017394140568624623, + "loss": 0.1524, + "step": 9036 + }, + { + "epoch": 0.6520437245210866, + "grad_norm": 0.1165081039071083, + "learning_rate": 0.00017393851926684946, + "loss": 0.1445, + "step": 9037 + }, + { + "epoch": 0.652115877196147, + "grad_norm": 0.14567390084266663, + "learning_rate": 0.00017393563284745273, + "loss": 0.1455, + "step": 9038 + }, + { + "epoch": 0.6521880298712075, + "grad_norm": 0.12246451526880264, + "learning_rate": 0.00017393274642805602, + "loss": 0.1476, + "step": 9039 + }, + { + "epoch": 0.6522601825462679, + "grad_norm": 0.13728214800357819, + "learning_rate": 0.00017392986000865928, + "loss": 0.1845, + "step": 9040 + }, + { + "epoch": 0.6523323352213284, + "grad_norm": 0.1250610500574112, + "learning_rate": 0.00017392697358926254, + "loss": 0.0996, + "step": 9041 + }, + { + "epoch": 0.6524044878963887, + "grad_norm": 0.12806789577007294, + "learning_rate": 0.00017392408716986578, + "loss": 0.1788, + "step": 9042 + }, + { + "epoch": 0.6524766405714492, + "grad_norm": 0.1574259102344513, + "learning_rate": 0.00017392120075046904, + "loss": 0.1679, + "step": 9043 + }, + { + "epoch": 0.6525487932465096, + "grad_norm": 0.09644627571105957, + "learning_rate": 0.0001739183143310723, + "loss": 0.1305, + "step": 9044 + }, + { + "epoch": 0.65262094592157, + "grad_norm": 0.18874594569206238, + "learning_rate": 0.00017391542791167557, + "loss": 0.1565, + "step": 9045 + }, + { + "epoch": 0.6526930985966305, + "grad_norm": 0.1280926764011383, + "learning_rate": 0.00017391254149227886, + "loss": 0.1642, + "step": 9046 + }, + { + "epoch": 0.6527652512716909, + "grad_norm": 0.12547607719898224, + "learning_rate": 0.0001739096550728821, + "loss": 0.1266, + "step": 9047 + }, + { + "epoch": 0.6528374039467513, + "grad_norm": 0.15735046565532684, + "learning_rate": 0.00017390676865348536, + "loss": 0.1283, + "step": 9048 + }, + { + "epoch": 0.6529095566218117, + "grad_norm": 0.1462177336215973, + "learning_rate": 0.00017390388223408862, + "loss": 0.1425, + "step": 9049 + }, + { + "epoch": 0.6529817092968722, + "grad_norm": 0.12556686997413635, + "learning_rate": 0.00017390099581469188, + "loss": 0.1284, + "step": 9050 + }, + { + "epoch": 0.6530538619719326, + "grad_norm": 0.12326043099164963, + "learning_rate": 0.00017389810939529514, + "loss": 0.1789, + "step": 9051 + }, + { + "epoch": 0.653126014646993, + "grad_norm": 0.13321638107299805, + "learning_rate": 0.0001738952229758984, + "loss": 0.1553, + "step": 9052 + }, + { + "epoch": 0.6531981673220535, + "grad_norm": 0.12841051816940308, + "learning_rate": 0.00017389233655650167, + "loss": 0.1218, + "step": 9053 + }, + { + "epoch": 0.6532703199971139, + "grad_norm": 0.1277436763048172, + "learning_rate": 0.00017388945013710493, + "loss": 0.1623, + "step": 9054 + }, + { + "epoch": 0.6533424726721743, + "grad_norm": 0.13014984130859375, + "learning_rate": 0.0001738865637177082, + "loss": 0.1872, + "step": 9055 + }, + { + "epoch": 0.6534146253472347, + "grad_norm": 0.08620619028806686, + "learning_rate": 0.00017388367729831146, + "loss": 0.1381, + "step": 9056 + }, + { + "epoch": 0.6534867780222952, + "grad_norm": 0.12672410905361176, + "learning_rate": 0.00017388079087891472, + "loss": 0.1348, + "step": 9057 + }, + { + "epoch": 0.6535589306973556, + "grad_norm": 0.1097164899110794, + "learning_rate": 0.00017387790445951796, + "loss": 0.1504, + "step": 9058 + }, + { + "epoch": 0.653631083372416, + "grad_norm": 0.11512299627065659, + "learning_rate": 0.00017387501804012122, + "loss": 0.1701, + "step": 9059 + }, + { + "epoch": 0.6537032360474765, + "grad_norm": 0.11480879038572311, + "learning_rate": 0.0001738721316207245, + "loss": 0.1297, + "step": 9060 + }, + { + "epoch": 0.6537753887225369, + "grad_norm": 0.11241624504327774, + "learning_rate": 0.00017386924520132777, + "loss": 0.1218, + "step": 9061 + }, + { + "epoch": 0.6538475413975973, + "grad_norm": 0.12165973335504532, + "learning_rate": 0.00017386635878193104, + "loss": 0.1612, + "step": 9062 + }, + { + "epoch": 0.6539196940726577, + "grad_norm": 0.12013527750968933, + "learning_rate": 0.00017386347236253427, + "loss": 0.1698, + "step": 9063 + }, + { + "epoch": 0.6539918467477182, + "grad_norm": 0.12046132236719131, + "learning_rate": 0.00017386058594313754, + "loss": 0.1403, + "step": 9064 + }, + { + "epoch": 0.6540639994227786, + "grad_norm": 0.19139885902404785, + "learning_rate": 0.0001738576995237408, + "loss": 0.1486, + "step": 9065 + }, + { + "epoch": 0.654136152097839, + "grad_norm": 0.10963544249534607, + "learning_rate": 0.00017385481310434406, + "loss": 0.1256, + "step": 9066 + }, + { + "epoch": 0.6542083047728995, + "grad_norm": 0.12036189436912537, + "learning_rate": 0.00017385192668494735, + "loss": 0.1456, + "step": 9067 + }, + { + "epoch": 0.6542804574479599, + "grad_norm": 0.15908774733543396, + "learning_rate": 0.0001738490402655506, + "loss": 0.1481, + "step": 9068 + }, + { + "epoch": 0.6543526101230203, + "grad_norm": 0.10817126929759979, + "learning_rate": 0.00017384615384615385, + "loss": 0.1329, + "step": 9069 + }, + { + "epoch": 0.6544247627980807, + "grad_norm": 0.15718476474285126, + "learning_rate": 0.0001738432674267571, + "loss": 0.176, + "step": 9070 + }, + { + "epoch": 0.6544969154731411, + "grad_norm": 0.11672906577587128, + "learning_rate": 0.00017384038100736038, + "loss": 0.1484, + "step": 9071 + }, + { + "epoch": 0.6545690681482016, + "grad_norm": 0.10921874642372131, + "learning_rate": 0.00017383749458796364, + "loss": 0.1706, + "step": 9072 + }, + { + "epoch": 0.654641220823262, + "grad_norm": 0.11707517504692078, + "learning_rate": 0.0001738346081685669, + "loss": 0.1011, + "step": 9073 + }, + { + "epoch": 0.6547133734983225, + "grad_norm": 0.12148593366146088, + "learning_rate": 0.00017383172174917016, + "loss": 0.1497, + "step": 9074 + }, + { + "epoch": 0.6547855261733829, + "grad_norm": 0.12721076607704163, + "learning_rate": 0.00017382883532977343, + "loss": 0.1131, + "step": 9075 + }, + { + "epoch": 0.6548576788484433, + "grad_norm": 0.1254507154226303, + "learning_rate": 0.0001738259489103767, + "loss": 0.1319, + "step": 9076 + }, + { + "epoch": 0.6549298315235037, + "grad_norm": 0.1127161830663681, + "learning_rate": 0.00017382306249097995, + "loss": 0.1671, + "step": 9077 + }, + { + "epoch": 0.6550019841985641, + "grad_norm": 0.1015736311674118, + "learning_rate": 0.00017382017607158322, + "loss": 0.1358, + "step": 9078 + }, + { + "epoch": 0.6550741368736246, + "grad_norm": 0.12445895373821259, + "learning_rate": 0.00017381728965218648, + "loss": 0.1781, + "step": 9079 + }, + { + "epoch": 0.655146289548685, + "grad_norm": 0.11472073197364807, + "learning_rate": 0.00017381440323278971, + "loss": 0.1472, + "step": 9080 + }, + { + "epoch": 0.6552184422237455, + "grad_norm": 0.1182158887386322, + "learning_rate": 0.000173811516813393, + "loss": 0.1353, + "step": 9081 + }, + { + "epoch": 0.6552905948988059, + "grad_norm": 0.10132770985364914, + "learning_rate": 0.00017380863039399627, + "loss": 0.0996, + "step": 9082 + }, + { + "epoch": 0.6553627475738663, + "grad_norm": 0.13250702619552612, + "learning_rate": 0.00017380574397459953, + "loss": 0.1399, + "step": 9083 + }, + { + "epoch": 0.6554349002489267, + "grad_norm": 0.13307896256446838, + "learning_rate": 0.0001738028575552028, + "loss": 0.1609, + "step": 9084 + }, + { + "epoch": 0.6555070529239871, + "grad_norm": 0.10817299783229828, + "learning_rate": 0.00017379997113580603, + "loss": 0.1663, + "step": 9085 + }, + { + "epoch": 0.6555792055990476, + "grad_norm": 0.12967552244663239, + "learning_rate": 0.0001737970847164093, + "loss": 0.195, + "step": 9086 + }, + { + "epoch": 0.655651358274108, + "grad_norm": 0.1627797782421112, + "learning_rate": 0.00017379419829701256, + "loss": 0.1271, + "step": 9087 + }, + { + "epoch": 0.6557235109491685, + "grad_norm": 0.39876589179039, + "learning_rate": 0.00017379131187761585, + "loss": 0.1646, + "step": 9088 + }, + { + "epoch": 0.6557956636242289, + "grad_norm": 0.12808945775032043, + "learning_rate": 0.0001737884254582191, + "loss": 0.1721, + "step": 9089 + }, + { + "epoch": 0.6558678162992893, + "grad_norm": 0.10217764228582382, + "learning_rate": 0.00017378553903882234, + "loss": 0.1559, + "step": 9090 + }, + { + "epoch": 0.6559399689743497, + "grad_norm": 0.13511773943901062, + "learning_rate": 0.0001737826526194256, + "loss": 0.1269, + "step": 9091 + }, + { + "epoch": 0.6560121216494101, + "grad_norm": 0.12675148248672485, + "learning_rate": 0.00017377976620002887, + "loss": 0.1626, + "step": 9092 + }, + { + "epoch": 0.6560842743244706, + "grad_norm": 0.1097366064786911, + "learning_rate": 0.00017377687978063213, + "loss": 0.171, + "step": 9093 + }, + { + "epoch": 0.656156426999531, + "grad_norm": 0.1200672835111618, + "learning_rate": 0.0001737739933612354, + "loss": 0.1106, + "step": 9094 + }, + { + "epoch": 0.6562285796745915, + "grad_norm": 0.09111147373914719, + "learning_rate": 0.00017377110694183866, + "loss": 0.1084, + "step": 9095 + }, + { + "epoch": 0.6563007323496519, + "grad_norm": 0.13823656737804413, + "learning_rate": 0.00017376822052244192, + "loss": 0.1241, + "step": 9096 + }, + { + "epoch": 0.6563728850247122, + "grad_norm": 0.1446205973625183, + "learning_rate": 0.00017376533410304518, + "loss": 0.1689, + "step": 9097 + }, + { + "epoch": 0.6564450376997727, + "grad_norm": 0.13478872179985046, + "learning_rate": 0.00017376244768364845, + "loss": 0.1042, + "step": 9098 + }, + { + "epoch": 0.6565171903748331, + "grad_norm": 0.12716075778007507, + "learning_rate": 0.0001737595612642517, + "loss": 0.14, + "step": 9099 + }, + { + "epoch": 0.6565893430498936, + "grad_norm": 0.1401834487915039, + "learning_rate": 0.00017375667484485497, + "loss": 0.1949, + "step": 9100 + }, + { + "epoch": 0.656661495724954, + "grad_norm": 0.1363379806280136, + "learning_rate": 0.0001737537884254582, + "loss": 0.1919, + "step": 9101 + }, + { + "epoch": 0.6567336484000145, + "grad_norm": 0.12148359417915344, + "learning_rate": 0.0001737509020060615, + "loss": 0.0931, + "step": 9102 + }, + { + "epoch": 0.6568058010750749, + "grad_norm": 0.17094339430332184, + "learning_rate": 0.00017374801558666476, + "loss": 0.1411, + "step": 9103 + }, + { + "epoch": 0.6568779537501352, + "grad_norm": 0.11943087726831436, + "learning_rate": 0.00017374512916726802, + "loss": 0.2096, + "step": 9104 + }, + { + "epoch": 0.6569501064251957, + "grad_norm": 0.1346716731786728, + "learning_rate": 0.0001737422427478713, + "loss": 0.1242, + "step": 9105 + }, + { + "epoch": 0.6570222591002561, + "grad_norm": 0.14123597741127014, + "learning_rate": 0.00017373935632847452, + "loss": 0.1607, + "step": 9106 + }, + { + "epoch": 0.6570944117753166, + "grad_norm": 0.12001238763332367, + "learning_rate": 0.0001737364699090778, + "loss": 0.0955, + "step": 9107 + }, + { + "epoch": 0.657166564450377, + "grad_norm": 0.1153496503829956, + "learning_rate": 0.00017373358348968105, + "loss": 0.1066, + "step": 9108 + }, + { + "epoch": 0.6572387171254375, + "grad_norm": 0.1329270452260971, + "learning_rate": 0.00017373069707028434, + "loss": 0.213, + "step": 9109 + }, + { + "epoch": 0.6573108698004978, + "grad_norm": 0.13615714013576508, + "learning_rate": 0.0001737278106508876, + "loss": 0.1278, + "step": 9110 + }, + { + "epoch": 0.6573830224755582, + "grad_norm": 0.14342476427555084, + "learning_rate": 0.00017372492423149084, + "loss": 0.1512, + "step": 9111 + }, + { + "epoch": 0.6574551751506187, + "grad_norm": 0.1282559186220169, + "learning_rate": 0.0001737220378120941, + "loss": 0.1891, + "step": 9112 + }, + { + "epoch": 0.6575273278256791, + "grad_norm": 0.14406467974185944, + "learning_rate": 0.00017371915139269736, + "loss": 0.1585, + "step": 9113 + }, + { + "epoch": 0.6575994805007396, + "grad_norm": 0.22428575158119202, + "learning_rate": 0.00017371626497330063, + "loss": 0.1143, + "step": 9114 + }, + { + "epoch": 0.6576716331758, + "grad_norm": 0.13172686100006104, + "learning_rate": 0.0001737133785539039, + "loss": 0.1947, + "step": 9115 + }, + { + "epoch": 0.6577437858508605, + "grad_norm": 0.11411601305007935, + "learning_rate": 0.00017371049213450715, + "loss": 0.1748, + "step": 9116 + }, + { + "epoch": 0.6578159385259208, + "grad_norm": 0.12008052319288254, + "learning_rate": 0.00017370760571511042, + "loss": 0.1556, + "step": 9117 + }, + { + "epoch": 0.6578880912009812, + "grad_norm": 0.26494452357292175, + "learning_rate": 0.00017370471929571368, + "loss": 0.1572, + "step": 9118 + }, + { + "epoch": 0.6579602438760417, + "grad_norm": 0.11839305609464645, + "learning_rate": 0.00017370183287631694, + "loss": 0.1664, + "step": 9119 + }, + { + "epoch": 0.6580323965511021, + "grad_norm": 0.10700162500143051, + "learning_rate": 0.0001736989464569202, + "loss": 0.1318, + "step": 9120 + }, + { + "epoch": 0.6581045492261626, + "grad_norm": 0.11779207736253738, + "learning_rate": 0.00017369606003752347, + "loss": 0.1172, + "step": 9121 + }, + { + "epoch": 0.658176701901223, + "grad_norm": 0.32551857829093933, + "learning_rate": 0.0001736931736181267, + "loss": 0.1116, + "step": 9122 + }, + { + "epoch": 0.6582488545762835, + "grad_norm": 0.11007951945066452, + "learning_rate": 0.00017369028719872997, + "loss": 0.1587, + "step": 9123 + }, + { + "epoch": 0.6583210072513438, + "grad_norm": 0.10492239147424698, + "learning_rate": 0.00017368740077933326, + "loss": 0.1242, + "step": 9124 + }, + { + "epoch": 0.6583931599264042, + "grad_norm": 0.12001994997262955, + "learning_rate": 0.00017368451435993652, + "loss": 0.166, + "step": 9125 + }, + { + "epoch": 0.6584653126014647, + "grad_norm": 0.14339768886566162, + "learning_rate": 0.00017368162794053978, + "loss": 0.1323, + "step": 9126 + }, + { + "epoch": 0.6585374652765251, + "grad_norm": 0.12687118351459503, + "learning_rate": 0.00017367874152114302, + "loss": 0.1348, + "step": 9127 + }, + { + "epoch": 0.6586096179515856, + "grad_norm": 0.1434631049633026, + "learning_rate": 0.00017367585510174628, + "loss": 0.1581, + "step": 9128 + }, + { + "epoch": 0.658681770626646, + "grad_norm": 0.11574066430330276, + "learning_rate": 0.00017367296868234954, + "loss": 0.1262, + "step": 9129 + }, + { + "epoch": 0.6587539233017065, + "grad_norm": 0.13802607357501984, + "learning_rate": 0.0001736700822629528, + "loss": 0.1637, + "step": 9130 + }, + { + "epoch": 0.6588260759767668, + "grad_norm": 0.12027553468942642, + "learning_rate": 0.0001736671958435561, + "loss": 0.1266, + "step": 9131 + }, + { + "epoch": 0.6588982286518272, + "grad_norm": 0.13257108628749847, + "learning_rate": 0.00017366430942415933, + "loss": 0.2181, + "step": 9132 + }, + { + "epoch": 0.6589703813268877, + "grad_norm": 0.15435339510440826, + "learning_rate": 0.0001736614230047626, + "loss": 0.1721, + "step": 9133 + }, + { + "epoch": 0.6590425340019481, + "grad_norm": 0.12928619980812073, + "learning_rate": 0.00017365853658536586, + "loss": 0.1218, + "step": 9134 + }, + { + "epoch": 0.6591146866770086, + "grad_norm": 0.13578961789608002, + "learning_rate": 0.00017365565016596912, + "loss": 0.1625, + "step": 9135 + }, + { + "epoch": 0.659186839352069, + "grad_norm": 0.11385858058929443, + "learning_rate": 0.00017365276374657238, + "loss": 0.177, + "step": 9136 + }, + { + "epoch": 0.6592589920271295, + "grad_norm": 0.11839604377746582, + "learning_rate": 0.00017364987732717565, + "loss": 0.1303, + "step": 9137 + }, + { + "epoch": 0.6593311447021898, + "grad_norm": 0.15037642419338226, + "learning_rate": 0.0001736469909077789, + "loss": 0.2064, + "step": 9138 + }, + { + "epoch": 0.6594032973772502, + "grad_norm": 0.15002579987049103, + "learning_rate": 0.00017364410448838217, + "loss": 0.116, + "step": 9139 + }, + { + "epoch": 0.6594754500523107, + "grad_norm": 0.14399875700473785, + "learning_rate": 0.00017364121806898544, + "loss": 0.1374, + "step": 9140 + }, + { + "epoch": 0.6595476027273711, + "grad_norm": 0.161982923746109, + "learning_rate": 0.0001736383316495887, + "loss": 0.1634, + "step": 9141 + }, + { + "epoch": 0.6596197554024316, + "grad_norm": 0.11750172823667526, + "learning_rate": 0.00017363544523019196, + "loss": 0.1982, + "step": 9142 + }, + { + "epoch": 0.659691908077492, + "grad_norm": 0.1385720819234848, + "learning_rate": 0.0001736325588107952, + "loss": 0.1265, + "step": 9143 + }, + { + "epoch": 0.6597640607525525, + "grad_norm": 0.11476393043994904, + "learning_rate": 0.00017362967239139846, + "loss": 0.1465, + "step": 9144 + }, + { + "epoch": 0.6598362134276128, + "grad_norm": 0.15320426225662231, + "learning_rate": 0.00017362678597200175, + "loss": 0.1539, + "step": 9145 + }, + { + "epoch": 0.6599083661026732, + "grad_norm": 0.1807917058467865, + "learning_rate": 0.000173623899552605, + "loss": 0.1649, + "step": 9146 + }, + { + "epoch": 0.6599805187777337, + "grad_norm": 0.12906044721603394, + "learning_rate": 0.00017362101313320828, + "loss": 0.1785, + "step": 9147 + }, + { + "epoch": 0.6600526714527941, + "grad_norm": 0.12782162427902222, + "learning_rate": 0.0001736181267138115, + "loss": 0.223, + "step": 9148 + }, + { + "epoch": 0.6601248241278546, + "grad_norm": 0.11752726882696152, + "learning_rate": 0.00017361524029441477, + "loss": 0.1313, + "step": 9149 + }, + { + "epoch": 0.660196976802915, + "grad_norm": 0.13953953981399536, + "learning_rate": 0.00017361235387501804, + "loss": 0.1489, + "step": 9150 + }, + { + "epoch": 0.6602691294779754, + "grad_norm": 0.14270111918449402, + "learning_rate": 0.0001736094674556213, + "loss": 0.1692, + "step": 9151 + }, + { + "epoch": 0.6603412821530358, + "grad_norm": 0.1215793639421463, + "learning_rate": 0.0001736065810362246, + "loss": 0.1602, + "step": 9152 + }, + { + "epoch": 0.6604134348280962, + "grad_norm": 0.11785700917243958, + "learning_rate": 0.00017360369461682783, + "loss": 0.1667, + "step": 9153 + }, + { + "epoch": 0.6604855875031567, + "grad_norm": 0.13058754801750183, + "learning_rate": 0.0001736008081974311, + "loss": 0.1413, + "step": 9154 + }, + { + "epoch": 0.6605577401782171, + "grad_norm": 0.14015406370162964, + "learning_rate": 0.00017359792177803435, + "loss": 0.1907, + "step": 9155 + }, + { + "epoch": 0.6606298928532776, + "grad_norm": 0.13626953959465027, + "learning_rate": 0.00017359503535863762, + "loss": 0.1652, + "step": 9156 + }, + { + "epoch": 0.660702045528338, + "grad_norm": 0.1171763464808464, + "learning_rate": 0.00017359214893924088, + "loss": 0.1636, + "step": 9157 + }, + { + "epoch": 0.6607741982033984, + "grad_norm": 0.13445504009723663, + "learning_rate": 0.00017358926251984414, + "loss": 0.1257, + "step": 9158 + }, + { + "epoch": 0.6608463508784588, + "grad_norm": 0.11587665975093842, + "learning_rate": 0.0001735863761004474, + "loss": 0.1388, + "step": 9159 + }, + { + "epoch": 0.6609185035535192, + "grad_norm": 0.12831035256385803, + "learning_rate": 0.00017358348968105067, + "loss": 0.1398, + "step": 9160 + }, + { + "epoch": 0.6609906562285797, + "grad_norm": 0.1297786980867386, + "learning_rate": 0.00017358060326165393, + "loss": 0.1714, + "step": 9161 + }, + { + "epoch": 0.6610628089036401, + "grad_norm": 0.11348583549261093, + "learning_rate": 0.0001735777168422572, + "loss": 0.2014, + "step": 9162 + }, + { + "epoch": 0.6611349615787006, + "grad_norm": 0.10949632525444031, + "learning_rate": 0.00017357483042286046, + "loss": 0.1695, + "step": 9163 + }, + { + "epoch": 0.661207114253761, + "grad_norm": 0.10830409079790115, + "learning_rate": 0.0001735719440034637, + "loss": 0.1603, + "step": 9164 + }, + { + "epoch": 0.6612792669288214, + "grad_norm": 0.12362265586853027, + "learning_rate": 0.00017356905758406695, + "loss": 0.1288, + "step": 9165 + }, + { + "epoch": 0.6613514196038818, + "grad_norm": 0.14946235716342926, + "learning_rate": 0.00017356617116467024, + "loss": 0.1215, + "step": 9166 + }, + { + "epoch": 0.6614235722789422, + "grad_norm": 0.10959585756063461, + "learning_rate": 0.0001735632847452735, + "loss": 0.1563, + "step": 9167 + }, + { + "epoch": 0.6614957249540027, + "grad_norm": 0.11575577408075333, + "learning_rate": 0.00017356039832587677, + "loss": 0.1416, + "step": 9168 + }, + { + "epoch": 0.6615678776290631, + "grad_norm": 0.14225371181964874, + "learning_rate": 0.00017355751190648, + "loss": 0.112, + "step": 9169 + }, + { + "epoch": 0.6616400303041235, + "grad_norm": 0.1122872456908226, + "learning_rate": 0.00017355462548708327, + "loss": 0.1053, + "step": 9170 + }, + { + "epoch": 0.661712182979184, + "grad_norm": 0.12862779200077057, + "learning_rate": 0.00017355173906768653, + "loss": 0.1571, + "step": 9171 + }, + { + "epoch": 0.6617843356542443, + "grad_norm": 0.09694743156433105, + "learning_rate": 0.0001735488526482898, + "loss": 0.1853, + "step": 9172 + }, + { + "epoch": 0.6618564883293048, + "grad_norm": 0.12143151462078094, + "learning_rate": 0.00017354596622889309, + "loss": 0.1153, + "step": 9173 + }, + { + "epoch": 0.6619286410043652, + "grad_norm": 0.1144392341375351, + "learning_rate": 0.00017354307980949632, + "loss": 0.1797, + "step": 9174 + }, + { + "epoch": 0.6620007936794257, + "grad_norm": 0.11424224823713303, + "learning_rate": 0.00017354019339009958, + "loss": 0.1396, + "step": 9175 + }, + { + "epoch": 0.6620729463544861, + "grad_norm": 0.10566970705986023, + "learning_rate": 0.00017353730697070285, + "loss": 0.1309, + "step": 9176 + }, + { + "epoch": 0.6621450990295465, + "grad_norm": 0.11206743121147156, + "learning_rate": 0.0001735344205513061, + "loss": 0.1465, + "step": 9177 + }, + { + "epoch": 0.662217251704607, + "grad_norm": 0.12112920731306076, + "learning_rate": 0.00017353153413190937, + "loss": 0.1493, + "step": 9178 + }, + { + "epoch": 0.6622894043796673, + "grad_norm": 0.12534743547439575, + "learning_rate": 0.00017352864771251264, + "loss": 0.1683, + "step": 9179 + }, + { + "epoch": 0.6623615570547278, + "grad_norm": 0.1410726010799408, + "learning_rate": 0.0001735257612931159, + "loss": 0.1698, + "step": 9180 + }, + { + "epoch": 0.6624337097297882, + "grad_norm": 0.14639095962047577, + "learning_rate": 0.00017352287487371916, + "loss": 0.15, + "step": 9181 + }, + { + "epoch": 0.6625058624048487, + "grad_norm": 0.13366194069385529, + "learning_rate": 0.00017351998845432242, + "loss": 0.1467, + "step": 9182 + }, + { + "epoch": 0.6625780150799091, + "grad_norm": 0.1295410394668579, + "learning_rate": 0.0001735171020349257, + "loss": 0.1067, + "step": 9183 + }, + { + "epoch": 0.6626501677549695, + "grad_norm": 0.13306112587451935, + "learning_rate": 0.00017351421561552895, + "loss": 0.1788, + "step": 9184 + }, + { + "epoch": 0.66272232043003, + "grad_norm": 0.12532058358192444, + "learning_rate": 0.0001735113291961322, + "loss": 0.188, + "step": 9185 + }, + { + "epoch": 0.6627944731050903, + "grad_norm": 0.1394512951374054, + "learning_rate": 0.00017350844277673545, + "loss": 0.1339, + "step": 9186 + }, + { + "epoch": 0.6628666257801508, + "grad_norm": 0.1411266326904297, + "learning_rate": 0.00017350555635733874, + "loss": 0.1815, + "step": 9187 + }, + { + "epoch": 0.6629387784552112, + "grad_norm": 0.14007483422756195, + "learning_rate": 0.000173502669937942, + "loss": 0.1358, + "step": 9188 + }, + { + "epoch": 0.6630109311302717, + "grad_norm": 0.11793815344572067, + "learning_rate": 0.00017349978351854526, + "loss": 0.1542, + "step": 9189 + }, + { + "epoch": 0.6630830838053321, + "grad_norm": 0.12423793226480484, + "learning_rate": 0.00017349689709914853, + "loss": 0.1299, + "step": 9190 + }, + { + "epoch": 0.6631552364803925, + "grad_norm": 0.10026775300502777, + "learning_rate": 0.00017349401067975176, + "loss": 0.1334, + "step": 9191 + }, + { + "epoch": 0.663227389155453, + "grad_norm": 0.12117689102888107, + "learning_rate": 0.00017349112426035503, + "loss": 0.1354, + "step": 9192 + }, + { + "epoch": 0.6632995418305133, + "grad_norm": 0.10784576833248138, + "learning_rate": 0.0001734882378409583, + "loss": 0.144, + "step": 9193 + }, + { + "epoch": 0.6633716945055738, + "grad_norm": 0.12422692775726318, + "learning_rate": 0.00017348535142156158, + "loss": 0.2063, + "step": 9194 + }, + { + "epoch": 0.6634438471806342, + "grad_norm": 0.11072385311126709, + "learning_rate": 0.00017348246500216484, + "loss": 0.1621, + "step": 9195 + }, + { + "epoch": 0.6635159998556946, + "grad_norm": 0.11303888261318207, + "learning_rate": 0.00017347957858276808, + "loss": 0.1967, + "step": 9196 + }, + { + "epoch": 0.6635881525307551, + "grad_norm": 0.1248398870229721, + "learning_rate": 0.00017347669216337134, + "loss": 0.135, + "step": 9197 + }, + { + "epoch": 0.6636603052058155, + "grad_norm": 0.15116511285305023, + "learning_rate": 0.0001734738057439746, + "loss": 0.1321, + "step": 9198 + }, + { + "epoch": 0.663732457880876, + "grad_norm": 0.14752434194087982, + "learning_rate": 0.00017347091932457787, + "loss": 0.1263, + "step": 9199 + }, + { + "epoch": 0.6638046105559363, + "grad_norm": 0.14145156741142273, + "learning_rate": 0.00017346803290518113, + "loss": 0.1105, + "step": 9200 + }, + { + "epoch": 0.6638767632309968, + "grad_norm": 0.12625358998775482, + "learning_rate": 0.0001734651464857844, + "loss": 0.1207, + "step": 9201 + }, + { + "epoch": 0.6639489159060572, + "grad_norm": 0.1078922376036644, + "learning_rate": 0.00017346226006638766, + "loss": 0.1343, + "step": 9202 + }, + { + "epoch": 0.6640210685811176, + "grad_norm": 0.11718551069498062, + "learning_rate": 0.00017345937364699092, + "loss": 0.1425, + "step": 9203 + }, + { + "epoch": 0.6640932212561781, + "grad_norm": 0.11266859620809555, + "learning_rate": 0.00017345648722759418, + "loss": 0.1679, + "step": 9204 + }, + { + "epoch": 0.6641653739312385, + "grad_norm": 0.0987582877278328, + "learning_rate": 0.00017345360080819744, + "loss": 0.1503, + "step": 9205 + }, + { + "epoch": 0.664237526606299, + "grad_norm": 0.11114714294672012, + "learning_rate": 0.0001734507143888007, + "loss": 0.1958, + "step": 9206 + }, + { + "epoch": 0.6643096792813593, + "grad_norm": 0.13825024664402008, + "learning_rate": 0.00017344782796940394, + "loss": 0.1484, + "step": 9207 + }, + { + "epoch": 0.6643818319564198, + "grad_norm": 0.1525411456823349, + "learning_rate": 0.00017344494155000723, + "loss": 0.1482, + "step": 9208 + }, + { + "epoch": 0.6644539846314802, + "grad_norm": 0.11406046152114868, + "learning_rate": 0.0001734420551306105, + "loss": 0.1675, + "step": 9209 + }, + { + "epoch": 0.6645261373065406, + "grad_norm": 0.12549228966236115, + "learning_rate": 0.00017343916871121376, + "loss": 0.1388, + "step": 9210 + }, + { + "epoch": 0.6645982899816011, + "grad_norm": 0.10623666644096375, + "learning_rate": 0.00017343628229181702, + "loss": 0.1478, + "step": 9211 + }, + { + "epoch": 0.6646704426566615, + "grad_norm": 0.11086919903755188, + "learning_rate": 0.00017343339587242026, + "loss": 0.1588, + "step": 9212 + }, + { + "epoch": 0.664742595331722, + "grad_norm": 0.14076271653175354, + "learning_rate": 0.00017343050945302352, + "loss": 0.1376, + "step": 9213 + }, + { + "epoch": 0.6648147480067823, + "grad_norm": 0.14605247974395752, + "learning_rate": 0.00017342762303362678, + "loss": 0.1864, + "step": 9214 + }, + { + "epoch": 0.6648869006818428, + "grad_norm": 0.11169461905956268, + "learning_rate": 0.00017342473661423007, + "loss": 0.1709, + "step": 9215 + }, + { + "epoch": 0.6649590533569032, + "grad_norm": 0.1111663356423378, + "learning_rate": 0.00017342185019483334, + "loss": 0.1568, + "step": 9216 + }, + { + "epoch": 0.6650312060319636, + "grad_norm": 0.11549469828605652, + "learning_rate": 0.00017341896377543657, + "loss": 0.1018, + "step": 9217 + }, + { + "epoch": 0.6651033587070241, + "grad_norm": 0.12490471452474594, + "learning_rate": 0.00017341607735603984, + "loss": 0.1482, + "step": 9218 + }, + { + "epoch": 0.6651755113820845, + "grad_norm": 0.14729900658130646, + "learning_rate": 0.0001734131909366431, + "loss": 0.1523, + "step": 9219 + }, + { + "epoch": 0.665247664057145, + "grad_norm": 0.12454581260681152, + "learning_rate": 0.00017341030451724636, + "loss": 0.1901, + "step": 9220 + }, + { + "epoch": 0.6653198167322053, + "grad_norm": 0.14527574181556702, + "learning_rate": 0.00017340741809784962, + "loss": 0.1269, + "step": 9221 + }, + { + "epoch": 0.6653919694072657, + "grad_norm": 0.11533478647470474, + "learning_rate": 0.0001734045316784529, + "loss": 0.1096, + "step": 9222 + }, + { + "epoch": 0.6654641220823262, + "grad_norm": 0.10546639561653137, + "learning_rate": 0.00017340164525905615, + "loss": 0.1381, + "step": 9223 + }, + { + "epoch": 0.6655362747573866, + "grad_norm": 0.16379734873771667, + "learning_rate": 0.0001733987588396594, + "loss": 0.1197, + "step": 9224 + }, + { + "epoch": 0.6656084274324471, + "grad_norm": 0.1349085569381714, + "learning_rate": 0.00017339587242026268, + "loss": 0.1796, + "step": 9225 + }, + { + "epoch": 0.6656805801075075, + "grad_norm": 0.11026155203580856, + "learning_rate": 0.00017339298600086594, + "loss": 0.1436, + "step": 9226 + }, + { + "epoch": 0.665752732782568, + "grad_norm": 0.11905506253242493, + "learning_rate": 0.0001733900995814692, + "loss": 0.146, + "step": 9227 + }, + { + "epoch": 0.6658248854576283, + "grad_norm": 0.11506408452987671, + "learning_rate": 0.00017338721316207244, + "loss": 0.0963, + "step": 9228 + }, + { + "epoch": 0.6658970381326887, + "grad_norm": 0.11228253692388535, + "learning_rate": 0.00017338432674267573, + "loss": 0.1216, + "step": 9229 + }, + { + "epoch": 0.6659691908077492, + "grad_norm": 0.12005262076854706, + "learning_rate": 0.000173381440323279, + "loss": 0.1218, + "step": 9230 + }, + { + "epoch": 0.6660413434828096, + "grad_norm": 0.15088176727294922, + "learning_rate": 0.00017337855390388225, + "loss": 0.2027, + "step": 9231 + }, + { + "epoch": 0.6661134961578701, + "grad_norm": 0.11610926687717438, + "learning_rate": 0.00017337566748448552, + "loss": 0.1593, + "step": 9232 + }, + { + "epoch": 0.6661856488329305, + "grad_norm": 0.12422781437635422, + "learning_rate": 0.00017337278106508875, + "loss": 0.1488, + "step": 9233 + }, + { + "epoch": 0.6662578015079909, + "grad_norm": 0.12029799818992615, + "learning_rate": 0.00017336989464569201, + "loss": 0.1523, + "step": 9234 + }, + { + "epoch": 0.6663299541830513, + "grad_norm": 0.12359961867332458, + "learning_rate": 0.00017336700822629528, + "loss": 0.1469, + "step": 9235 + }, + { + "epoch": 0.6664021068581117, + "grad_norm": 0.10292413085699081, + "learning_rate": 0.00017336412180689857, + "loss": 0.1257, + "step": 9236 + }, + { + "epoch": 0.6664742595331722, + "grad_norm": 0.09860517829656601, + "learning_rate": 0.00017336123538750183, + "loss": 0.1214, + "step": 9237 + }, + { + "epoch": 0.6665464122082326, + "grad_norm": 0.10806264728307724, + "learning_rate": 0.00017335834896810507, + "loss": 0.1466, + "step": 9238 + }, + { + "epoch": 0.6666185648832931, + "grad_norm": 0.1720741242170334, + "learning_rate": 0.00017335546254870833, + "loss": 0.2037, + "step": 9239 + }, + { + "epoch": 0.6666907175583535, + "grad_norm": 0.1357303261756897, + "learning_rate": 0.0001733525761293116, + "loss": 0.1505, + "step": 9240 + }, + { + "epoch": 0.6667628702334139, + "grad_norm": 0.11138720065355301, + "learning_rate": 0.00017334968970991486, + "loss": 0.117, + "step": 9241 + }, + { + "epoch": 0.6668350229084743, + "grad_norm": 0.1529517024755478, + "learning_rate": 0.00017334680329051812, + "loss": 0.1293, + "step": 9242 + }, + { + "epoch": 0.6669071755835347, + "grad_norm": 0.1390070766210556, + "learning_rate": 0.00017334391687112138, + "loss": 0.1433, + "step": 9243 + }, + { + "epoch": 0.6669793282585952, + "grad_norm": 0.1554955691099167, + "learning_rate": 0.00017334103045172464, + "loss": 0.1329, + "step": 9244 + }, + { + "epoch": 0.6670514809336556, + "grad_norm": 0.13039268553256989, + "learning_rate": 0.0001733381440323279, + "loss": 0.1956, + "step": 9245 + }, + { + "epoch": 0.6671236336087161, + "grad_norm": 0.1186772882938385, + "learning_rate": 0.00017333525761293117, + "loss": 0.1512, + "step": 9246 + }, + { + "epoch": 0.6671957862837765, + "grad_norm": 0.12815089523792267, + "learning_rate": 0.00017333237119353443, + "loss": 0.1297, + "step": 9247 + }, + { + "epoch": 0.6672679389588368, + "grad_norm": 0.13035452365875244, + "learning_rate": 0.0001733294847741377, + "loss": 0.1845, + "step": 9248 + }, + { + "epoch": 0.6673400916338973, + "grad_norm": 0.12902598083019257, + "learning_rate": 0.00017332659835474093, + "loss": 0.125, + "step": 9249 + }, + { + "epoch": 0.6674122443089577, + "grad_norm": 0.13073696196079254, + "learning_rate": 0.00017332371193534422, + "loss": 0.1342, + "step": 9250 + }, + { + "epoch": 0.6674843969840182, + "grad_norm": 0.12306298315525055, + "learning_rate": 0.00017332082551594748, + "loss": 0.1453, + "step": 9251 + }, + { + "epoch": 0.6675565496590786, + "grad_norm": 0.1435564160346985, + "learning_rate": 0.00017331793909655075, + "loss": 0.2049, + "step": 9252 + }, + { + "epoch": 0.6676287023341391, + "grad_norm": 0.1250120848417282, + "learning_rate": 0.000173315052677154, + "loss": 0.1394, + "step": 9253 + }, + { + "epoch": 0.6677008550091995, + "grad_norm": 0.1147419810295105, + "learning_rate": 0.00017331216625775725, + "loss": 0.1613, + "step": 9254 + }, + { + "epoch": 0.6677730076842598, + "grad_norm": 0.15764589607715607, + "learning_rate": 0.0001733092798383605, + "loss": 0.1241, + "step": 9255 + }, + { + "epoch": 0.6678451603593203, + "grad_norm": 0.16571545600891113, + "learning_rate": 0.00017330639341896377, + "loss": 0.1939, + "step": 9256 + }, + { + "epoch": 0.6679173130343807, + "grad_norm": 0.1149231493473053, + "learning_rate": 0.00017330350699956706, + "loss": 0.133, + "step": 9257 + }, + { + "epoch": 0.6679894657094412, + "grad_norm": 0.15089252591133118, + "learning_rate": 0.00017330062058017033, + "loss": 0.1276, + "step": 9258 + }, + { + "epoch": 0.6680616183845016, + "grad_norm": 0.11390020698308945, + "learning_rate": 0.00017329773416077356, + "loss": 0.1727, + "step": 9259 + }, + { + "epoch": 0.6681337710595621, + "grad_norm": 0.14585581421852112, + "learning_rate": 0.00017329484774137682, + "loss": 0.153, + "step": 9260 + }, + { + "epoch": 0.6682059237346225, + "grad_norm": 0.10556942224502563, + "learning_rate": 0.0001732919613219801, + "loss": 0.1383, + "step": 9261 + }, + { + "epoch": 0.6682780764096828, + "grad_norm": 0.1368727684020996, + "learning_rate": 0.00017328907490258335, + "loss": 0.1672, + "step": 9262 + }, + { + "epoch": 0.6683502290847433, + "grad_norm": 0.10373331606388092, + "learning_rate": 0.0001732861884831866, + "loss": 0.1535, + "step": 9263 + }, + { + "epoch": 0.6684223817598037, + "grad_norm": 0.1099395751953125, + "learning_rate": 0.00017328330206378988, + "loss": 0.1044, + "step": 9264 + }, + { + "epoch": 0.6684945344348642, + "grad_norm": 0.16567417979240417, + "learning_rate": 0.00017328041564439314, + "loss": 0.2068, + "step": 9265 + }, + { + "epoch": 0.6685666871099246, + "grad_norm": 0.12957066297531128, + "learning_rate": 0.0001732775292249964, + "loss": 0.1651, + "step": 9266 + }, + { + "epoch": 0.6686388397849851, + "grad_norm": 0.1365097612142563, + "learning_rate": 0.00017327464280559966, + "loss": 0.1454, + "step": 9267 + }, + { + "epoch": 0.6687109924600455, + "grad_norm": 0.11914641410112381, + "learning_rate": 0.00017327175638620293, + "loss": 0.1368, + "step": 9268 + }, + { + "epoch": 0.6687831451351058, + "grad_norm": 0.13403435051441193, + "learning_rate": 0.0001732688699668062, + "loss": 0.1677, + "step": 9269 + }, + { + "epoch": 0.6688552978101663, + "grad_norm": 0.12896324694156647, + "learning_rate": 0.00017326598354740943, + "loss": 0.1832, + "step": 9270 + }, + { + "epoch": 0.6689274504852267, + "grad_norm": 0.13894294202327728, + "learning_rate": 0.00017326309712801272, + "loss": 0.1239, + "step": 9271 + }, + { + "epoch": 0.6689996031602872, + "grad_norm": 0.1334586888551712, + "learning_rate": 0.00017326021070861598, + "loss": 0.1633, + "step": 9272 + }, + { + "epoch": 0.6690717558353476, + "grad_norm": 0.11441192030906677, + "learning_rate": 0.00017325732428921924, + "loss": 0.1351, + "step": 9273 + }, + { + "epoch": 0.6691439085104081, + "grad_norm": 0.12247227877378464, + "learning_rate": 0.0001732544378698225, + "loss": 0.1434, + "step": 9274 + }, + { + "epoch": 0.6692160611854685, + "grad_norm": 0.10820969194173813, + "learning_rate": 0.00017325155145042574, + "loss": 0.131, + "step": 9275 + }, + { + "epoch": 0.6692882138605288, + "grad_norm": 0.11150060594081879, + "learning_rate": 0.000173248665031029, + "loss": 0.1278, + "step": 9276 + }, + { + "epoch": 0.6693603665355893, + "grad_norm": 0.11507835239171982, + "learning_rate": 0.00017324577861163227, + "loss": 0.1424, + "step": 9277 + }, + { + "epoch": 0.6694325192106497, + "grad_norm": 0.14963656663894653, + "learning_rate": 0.00017324289219223556, + "loss": 0.2403, + "step": 9278 + }, + { + "epoch": 0.6695046718857102, + "grad_norm": 0.15234751999378204, + "learning_rate": 0.00017324000577283882, + "loss": 0.1937, + "step": 9279 + }, + { + "epoch": 0.6695768245607706, + "grad_norm": 0.12259795516729355, + "learning_rate": 0.00017323711935344206, + "loss": 0.1788, + "step": 9280 + }, + { + "epoch": 0.669648977235831, + "grad_norm": 0.11492430418729782, + "learning_rate": 0.00017323423293404532, + "loss": 0.103, + "step": 9281 + }, + { + "epoch": 0.6697211299108915, + "grad_norm": 0.13131506741046906, + "learning_rate": 0.00017323134651464858, + "loss": 0.1534, + "step": 9282 + }, + { + "epoch": 0.6697932825859518, + "grad_norm": 0.14066502451896667, + "learning_rate": 0.00017322846009525184, + "loss": 0.1237, + "step": 9283 + }, + { + "epoch": 0.6698654352610123, + "grad_norm": 0.12199169397354126, + "learning_rate": 0.0001732255736758551, + "loss": 0.1902, + "step": 9284 + }, + { + "epoch": 0.6699375879360727, + "grad_norm": 0.14018568396568298, + "learning_rate": 0.00017322268725645837, + "loss": 0.1269, + "step": 9285 + }, + { + "epoch": 0.6700097406111332, + "grad_norm": 0.14004381000995636, + "learning_rate": 0.00017321980083706163, + "loss": 0.1613, + "step": 9286 + }, + { + "epoch": 0.6700818932861936, + "grad_norm": 0.11310358345508575, + "learning_rate": 0.0001732169144176649, + "loss": 0.1388, + "step": 9287 + }, + { + "epoch": 0.670154045961254, + "grad_norm": 0.11956499516963959, + "learning_rate": 0.00017321402799826816, + "loss": 0.1646, + "step": 9288 + }, + { + "epoch": 0.6702261986363145, + "grad_norm": 0.13938401639461517, + "learning_rate": 0.00017321114157887142, + "loss": 0.1247, + "step": 9289 + }, + { + "epoch": 0.6702983513113748, + "grad_norm": 0.12736207246780396, + "learning_rate": 0.00017320825515947468, + "loss": 0.1567, + "step": 9290 + }, + { + "epoch": 0.6703705039864353, + "grad_norm": 0.1147407665848732, + "learning_rate": 0.00017320536874007792, + "loss": 0.1721, + "step": 9291 + }, + { + "epoch": 0.6704426566614957, + "grad_norm": 0.12425895780324936, + "learning_rate": 0.0001732024823206812, + "loss": 0.1532, + "step": 9292 + }, + { + "epoch": 0.6705148093365562, + "grad_norm": 0.13098202645778656, + "learning_rate": 0.00017319959590128447, + "loss": 0.1793, + "step": 9293 + }, + { + "epoch": 0.6705869620116166, + "grad_norm": 0.13009867072105408, + "learning_rate": 0.00017319670948188774, + "loss": 0.1198, + "step": 9294 + }, + { + "epoch": 0.670659114686677, + "grad_norm": 0.14394155144691467, + "learning_rate": 0.000173193823062491, + "loss": 0.1742, + "step": 9295 + }, + { + "epoch": 0.6707312673617374, + "grad_norm": 0.14598698914051056, + "learning_rate": 0.00017319093664309423, + "loss": 0.1242, + "step": 9296 + }, + { + "epoch": 0.6708034200367978, + "grad_norm": 0.14718224108219147, + "learning_rate": 0.0001731880502236975, + "loss": 0.1691, + "step": 9297 + }, + { + "epoch": 0.6708755727118583, + "grad_norm": 0.1442996710538864, + "learning_rate": 0.00017318516380430076, + "loss": 0.167, + "step": 9298 + }, + { + "epoch": 0.6709477253869187, + "grad_norm": 0.15796516835689545, + "learning_rate": 0.00017318227738490405, + "loss": 0.1633, + "step": 9299 + }, + { + "epoch": 0.6710198780619792, + "grad_norm": 0.13265594840049744, + "learning_rate": 0.0001731793909655073, + "loss": 0.1334, + "step": 9300 + }, + { + "epoch": 0.6710920307370396, + "grad_norm": 0.10171502083539963, + "learning_rate": 0.00017317650454611055, + "loss": 0.1917, + "step": 9301 + }, + { + "epoch": 0.6711641834121, + "grad_norm": 0.11402595788240433, + "learning_rate": 0.0001731736181267138, + "loss": 0.1237, + "step": 9302 + }, + { + "epoch": 0.6712363360871604, + "grad_norm": 0.11267706751823425, + "learning_rate": 0.00017317073170731708, + "loss": 0.1131, + "step": 9303 + }, + { + "epoch": 0.6713084887622208, + "grad_norm": 0.1277383267879486, + "learning_rate": 0.00017316784528792034, + "loss": 0.1939, + "step": 9304 + }, + { + "epoch": 0.6713806414372813, + "grad_norm": 0.11373007297515869, + "learning_rate": 0.0001731649588685236, + "loss": 0.1375, + "step": 9305 + }, + { + "epoch": 0.6714527941123417, + "grad_norm": 0.12875738739967346, + "learning_rate": 0.00017316207244912686, + "loss": 0.1629, + "step": 9306 + }, + { + "epoch": 0.6715249467874022, + "grad_norm": 0.1210503950715065, + "learning_rate": 0.00017315918602973013, + "loss": 0.1695, + "step": 9307 + }, + { + "epoch": 0.6715970994624626, + "grad_norm": 0.12265054881572723, + "learning_rate": 0.0001731562996103334, + "loss": 0.1549, + "step": 9308 + }, + { + "epoch": 0.671669252137523, + "grad_norm": 0.16889280080795288, + "learning_rate": 0.00017315341319093665, + "loss": 0.1612, + "step": 9309 + }, + { + "epoch": 0.6717414048125834, + "grad_norm": 0.10836353152990341, + "learning_rate": 0.00017315052677153992, + "loss": 0.0958, + "step": 9310 + }, + { + "epoch": 0.6718135574876438, + "grad_norm": 0.13138365745544434, + "learning_rate": 0.00017314764035214318, + "loss": 0.189, + "step": 9311 + }, + { + "epoch": 0.6718857101627043, + "grad_norm": 0.1403338760137558, + "learning_rate": 0.00017314475393274644, + "loss": 0.1797, + "step": 9312 + }, + { + "epoch": 0.6719578628377647, + "grad_norm": 0.11391955614089966, + "learning_rate": 0.00017314186751334968, + "loss": 0.1524, + "step": 9313 + }, + { + "epoch": 0.6720300155128252, + "grad_norm": 0.1363377422094345, + "learning_rate": 0.00017313898109395297, + "loss": 0.1743, + "step": 9314 + }, + { + "epoch": 0.6721021681878856, + "grad_norm": 0.11498959362506866, + "learning_rate": 0.00017313609467455623, + "loss": 0.1127, + "step": 9315 + }, + { + "epoch": 0.672174320862946, + "grad_norm": 0.15003925561904907, + "learning_rate": 0.0001731332082551595, + "loss": 0.1359, + "step": 9316 + }, + { + "epoch": 0.6722464735380064, + "grad_norm": 0.2588501572608948, + "learning_rate": 0.00017313032183576276, + "loss": 0.1256, + "step": 9317 + }, + { + "epoch": 0.6723186262130668, + "grad_norm": 0.11945448815822601, + "learning_rate": 0.000173127435416366, + "loss": 0.1448, + "step": 9318 + }, + { + "epoch": 0.6723907788881273, + "grad_norm": 0.1260116547346115, + "learning_rate": 0.00017312454899696925, + "loss": 0.1636, + "step": 9319 + }, + { + "epoch": 0.6724629315631877, + "grad_norm": 0.15767249464988708, + "learning_rate": 0.00017312166257757252, + "loss": 0.1541, + "step": 9320 + }, + { + "epoch": 0.6725350842382481, + "grad_norm": 0.10866338014602661, + "learning_rate": 0.0001731187761581758, + "loss": 0.1175, + "step": 9321 + }, + { + "epoch": 0.6726072369133086, + "grad_norm": 0.11621398478746414, + "learning_rate": 0.00017311588973877907, + "loss": 0.1795, + "step": 9322 + }, + { + "epoch": 0.672679389588369, + "grad_norm": 0.12204501032829285, + "learning_rate": 0.0001731130033193823, + "loss": 0.1531, + "step": 9323 + }, + { + "epoch": 0.6727515422634294, + "grad_norm": 0.1150672510266304, + "learning_rate": 0.00017311011689998557, + "loss": 0.1294, + "step": 9324 + }, + { + "epoch": 0.6728236949384898, + "grad_norm": 0.1237521544098854, + "learning_rate": 0.00017310723048058883, + "loss": 0.1397, + "step": 9325 + }, + { + "epoch": 0.6728958476135503, + "grad_norm": 0.10224903374910355, + "learning_rate": 0.0001731043440611921, + "loss": 0.1182, + "step": 9326 + }, + { + "epoch": 0.6729680002886107, + "grad_norm": 0.10159409791231155, + "learning_rate": 0.00017310145764179536, + "loss": 0.1351, + "step": 9327 + }, + { + "epoch": 0.6730401529636711, + "grad_norm": 0.10354913026094437, + "learning_rate": 0.00017309857122239862, + "loss": 0.1667, + "step": 9328 + }, + { + "epoch": 0.6731123056387316, + "grad_norm": 0.13224786520004272, + "learning_rate": 0.00017309568480300188, + "loss": 0.1567, + "step": 9329 + }, + { + "epoch": 0.673184458313792, + "grad_norm": 0.1250360608100891, + "learning_rate": 0.00017309279838360515, + "loss": 0.1538, + "step": 9330 + }, + { + "epoch": 0.6732566109888524, + "grad_norm": 0.11552729457616806, + "learning_rate": 0.0001730899119642084, + "loss": 0.1454, + "step": 9331 + }, + { + "epoch": 0.6733287636639128, + "grad_norm": 0.12813526391983032, + "learning_rate": 0.00017308702554481167, + "loss": 0.1015, + "step": 9332 + }, + { + "epoch": 0.6734009163389733, + "grad_norm": 0.1277819573879242, + "learning_rate": 0.00017308413912541494, + "loss": 0.1689, + "step": 9333 + }, + { + "epoch": 0.6734730690140337, + "grad_norm": 0.11043453216552734, + "learning_rate": 0.00017308125270601817, + "loss": 0.1426, + "step": 9334 + }, + { + "epoch": 0.6735452216890941, + "grad_norm": 0.12549757957458496, + "learning_rate": 0.00017307836628662146, + "loss": 0.1157, + "step": 9335 + }, + { + "epoch": 0.6736173743641546, + "grad_norm": 0.12948933243751526, + "learning_rate": 0.00017307547986722472, + "loss": 0.122, + "step": 9336 + }, + { + "epoch": 0.673689527039215, + "grad_norm": 0.12379142642021179, + "learning_rate": 0.000173072593447828, + "loss": 0.1467, + "step": 9337 + }, + { + "epoch": 0.6737616797142754, + "grad_norm": 0.12186525762081146, + "learning_rate": 0.00017306970702843125, + "loss": 0.0807, + "step": 9338 + }, + { + "epoch": 0.6738338323893358, + "grad_norm": 0.12237106263637543, + "learning_rate": 0.00017306682060903449, + "loss": 0.1665, + "step": 9339 + }, + { + "epoch": 0.6739059850643963, + "grad_norm": 0.1131465956568718, + "learning_rate": 0.00017306393418963775, + "loss": 0.1921, + "step": 9340 + }, + { + "epoch": 0.6739781377394567, + "grad_norm": 0.11823917180299759, + "learning_rate": 0.000173061047770241, + "loss": 0.1777, + "step": 9341 + }, + { + "epoch": 0.6740502904145171, + "grad_norm": 0.11592920869588852, + "learning_rate": 0.0001730581613508443, + "loss": 0.1456, + "step": 9342 + }, + { + "epoch": 0.6741224430895776, + "grad_norm": 0.13973605632781982, + "learning_rate": 0.00017305527493144756, + "loss": 0.1273, + "step": 9343 + }, + { + "epoch": 0.674194595764638, + "grad_norm": 0.13213014602661133, + "learning_rate": 0.0001730523885120508, + "loss": 0.135, + "step": 9344 + }, + { + "epoch": 0.6742667484396984, + "grad_norm": 0.1320028156042099, + "learning_rate": 0.00017304950209265406, + "loss": 0.1549, + "step": 9345 + }, + { + "epoch": 0.6743389011147588, + "grad_norm": 0.126231849193573, + "learning_rate": 0.00017304661567325733, + "loss": 0.1158, + "step": 9346 + }, + { + "epoch": 0.6744110537898192, + "grad_norm": 0.12511806190013885, + "learning_rate": 0.0001730437292538606, + "loss": 0.1436, + "step": 9347 + }, + { + "epoch": 0.6744832064648797, + "grad_norm": 0.12003560364246368, + "learning_rate": 0.00017304084283446385, + "loss": 0.124, + "step": 9348 + }, + { + "epoch": 0.6745553591399401, + "grad_norm": 0.14016613364219666, + "learning_rate": 0.00017303795641506712, + "loss": 0.1371, + "step": 9349 + }, + { + "epoch": 0.6746275118150006, + "grad_norm": 0.14404606819152832, + "learning_rate": 0.00017303506999567038, + "loss": 0.1867, + "step": 9350 + }, + { + "epoch": 0.674699664490061, + "grad_norm": 0.1769256889820099, + "learning_rate": 0.00017303218357627364, + "loss": 0.1762, + "step": 9351 + }, + { + "epoch": 0.6747718171651214, + "grad_norm": 0.11872179061174393, + "learning_rate": 0.0001730292971568769, + "loss": 0.1419, + "step": 9352 + }, + { + "epoch": 0.6748439698401818, + "grad_norm": 0.16135835647583008, + "learning_rate": 0.00017302641073748017, + "loss": 0.1975, + "step": 9353 + }, + { + "epoch": 0.6749161225152422, + "grad_norm": 0.1160866841673851, + "learning_rate": 0.00017302352431808343, + "loss": 0.1376, + "step": 9354 + }, + { + "epoch": 0.6749882751903027, + "grad_norm": 0.12021545320749283, + "learning_rate": 0.00017302063789868667, + "loss": 0.1097, + "step": 9355 + }, + { + "epoch": 0.6750604278653631, + "grad_norm": 0.12414583563804626, + "learning_rate": 0.00017301775147928996, + "loss": 0.1826, + "step": 9356 + }, + { + "epoch": 0.6751325805404236, + "grad_norm": 0.10644211620092392, + "learning_rate": 0.00017301486505989322, + "loss": 0.195, + "step": 9357 + }, + { + "epoch": 0.6752047332154839, + "grad_norm": 0.11322013288736343, + "learning_rate": 0.00017301197864049648, + "loss": 0.1603, + "step": 9358 + }, + { + "epoch": 0.6752768858905444, + "grad_norm": 0.11740998178720474, + "learning_rate": 0.00017300909222109974, + "loss": 0.173, + "step": 9359 + }, + { + "epoch": 0.6753490385656048, + "grad_norm": 0.13309647142887115, + "learning_rate": 0.00017300620580170298, + "loss": 0.0993, + "step": 9360 + }, + { + "epoch": 0.6754211912406652, + "grad_norm": 0.12983696162700653, + "learning_rate": 0.00017300331938230624, + "loss": 0.141, + "step": 9361 + }, + { + "epoch": 0.6754933439157257, + "grad_norm": 0.11454816907644272, + "learning_rate": 0.0001730004329629095, + "loss": 0.1624, + "step": 9362 + }, + { + "epoch": 0.6755654965907861, + "grad_norm": 0.1439279317855835, + "learning_rate": 0.0001729975465435128, + "loss": 0.1423, + "step": 9363 + }, + { + "epoch": 0.6756376492658466, + "grad_norm": 0.11750374734401703, + "learning_rate": 0.00017299466012411606, + "loss": 0.1883, + "step": 9364 + }, + { + "epoch": 0.6757098019409069, + "grad_norm": 0.10431762784719467, + "learning_rate": 0.0001729917737047193, + "loss": 0.1434, + "step": 9365 + }, + { + "epoch": 0.6757819546159674, + "grad_norm": 0.13653522729873657, + "learning_rate": 0.00017298888728532256, + "loss": 0.0969, + "step": 9366 + }, + { + "epoch": 0.6758541072910278, + "grad_norm": 0.14985524117946625, + "learning_rate": 0.00017298600086592582, + "loss": 0.1314, + "step": 9367 + }, + { + "epoch": 0.6759262599660882, + "grad_norm": 0.1311166137456894, + "learning_rate": 0.00017298311444652908, + "loss": 0.1129, + "step": 9368 + }, + { + "epoch": 0.6759984126411487, + "grad_norm": 0.1330045461654663, + "learning_rate": 0.00017298022802713235, + "loss": 0.1555, + "step": 9369 + }, + { + "epoch": 0.6760705653162091, + "grad_norm": 0.14036214351654053, + "learning_rate": 0.0001729773416077356, + "loss": 0.1691, + "step": 9370 + }, + { + "epoch": 0.6761427179912696, + "grad_norm": 0.14099359512329102, + "learning_rate": 0.00017297445518833887, + "loss": 0.2007, + "step": 9371 + }, + { + "epoch": 0.6762148706663299, + "grad_norm": 0.12093814462423325, + "learning_rate": 0.00017297156876894214, + "loss": 0.1089, + "step": 9372 + }, + { + "epoch": 0.6762870233413903, + "grad_norm": 0.14383164048194885, + "learning_rate": 0.0001729686823495454, + "loss": 0.1937, + "step": 9373 + }, + { + "epoch": 0.6763591760164508, + "grad_norm": 0.12347964197397232, + "learning_rate": 0.00017296579593014866, + "loss": 0.1303, + "step": 9374 + }, + { + "epoch": 0.6764313286915112, + "grad_norm": 0.10055216401815414, + "learning_rate": 0.00017296290951075192, + "loss": 0.1261, + "step": 9375 + }, + { + "epoch": 0.6765034813665717, + "grad_norm": 0.11191868782043457, + "learning_rate": 0.00017296002309135516, + "loss": 0.1554, + "step": 9376 + }, + { + "epoch": 0.6765756340416321, + "grad_norm": 0.115901418030262, + "learning_rate": 0.00017295713667195845, + "loss": 0.1105, + "step": 9377 + }, + { + "epoch": 0.6766477867166926, + "grad_norm": 0.14895713329315186, + "learning_rate": 0.0001729542502525617, + "loss": 0.1164, + "step": 9378 + }, + { + "epoch": 0.6767199393917529, + "grad_norm": 0.11927609145641327, + "learning_rate": 0.00017295136383316498, + "loss": 0.1252, + "step": 9379 + }, + { + "epoch": 0.6767920920668133, + "grad_norm": 0.14049996435642242, + "learning_rate": 0.00017294847741376824, + "loss": 0.1873, + "step": 9380 + }, + { + "epoch": 0.6768642447418738, + "grad_norm": 0.12153515964746475, + "learning_rate": 0.00017294559099437147, + "loss": 0.1326, + "step": 9381 + }, + { + "epoch": 0.6769363974169342, + "grad_norm": 0.10713379085063934, + "learning_rate": 0.00017294270457497474, + "loss": 0.1187, + "step": 9382 + }, + { + "epoch": 0.6770085500919947, + "grad_norm": 0.1224794015288353, + "learning_rate": 0.000172939818155578, + "loss": 0.2151, + "step": 9383 + }, + { + "epoch": 0.6770807027670551, + "grad_norm": 0.11354351788759232, + "learning_rate": 0.0001729369317361813, + "loss": 0.1237, + "step": 9384 + }, + { + "epoch": 0.6771528554421156, + "grad_norm": 0.13472294807434082, + "learning_rate": 0.00017293404531678455, + "loss": 0.1396, + "step": 9385 + }, + { + "epoch": 0.6772250081171759, + "grad_norm": 0.1110520288348198, + "learning_rate": 0.0001729311588973878, + "loss": 0.1332, + "step": 9386 + }, + { + "epoch": 0.6772971607922363, + "grad_norm": 0.13699764013290405, + "learning_rate": 0.00017292827247799105, + "loss": 0.1321, + "step": 9387 + }, + { + "epoch": 0.6773693134672968, + "grad_norm": 0.11775174736976624, + "learning_rate": 0.00017292538605859432, + "loss": 0.1417, + "step": 9388 + }, + { + "epoch": 0.6774414661423572, + "grad_norm": 0.13674022257328033, + "learning_rate": 0.00017292249963919758, + "loss": 0.1393, + "step": 9389 + }, + { + "epoch": 0.6775136188174177, + "grad_norm": 0.12976695597171783, + "learning_rate": 0.00017291961321980084, + "loss": 0.1223, + "step": 9390 + }, + { + "epoch": 0.6775857714924781, + "grad_norm": 0.11459603160619736, + "learning_rate": 0.0001729167268004041, + "loss": 0.1396, + "step": 9391 + }, + { + "epoch": 0.6776579241675386, + "grad_norm": 0.11260157078504562, + "learning_rate": 0.00017291384038100737, + "loss": 0.1094, + "step": 9392 + }, + { + "epoch": 0.6777300768425989, + "grad_norm": 0.15651853382587433, + "learning_rate": 0.00017291095396161063, + "loss": 0.1238, + "step": 9393 + }, + { + "epoch": 0.6778022295176593, + "grad_norm": 0.11085481941699982, + "learning_rate": 0.0001729080675422139, + "loss": 0.111, + "step": 9394 + }, + { + "epoch": 0.6778743821927198, + "grad_norm": 0.13176412880420685, + "learning_rate": 0.00017290518112281716, + "loss": 0.125, + "step": 9395 + }, + { + "epoch": 0.6779465348677802, + "grad_norm": 0.12989777326583862, + "learning_rate": 0.00017290229470342042, + "loss": 0.1401, + "step": 9396 + }, + { + "epoch": 0.6780186875428407, + "grad_norm": 0.11334443837404251, + "learning_rate": 0.00017289940828402365, + "loss": 0.1895, + "step": 9397 + }, + { + "epoch": 0.6780908402179011, + "grad_norm": 0.12922954559326172, + "learning_rate": 0.00017289652186462694, + "loss": 0.1404, + "step": 9398 + }, + { + "epoch": 0.6781629928929616, + "grad_norm": 0.12999506294727325, + "learning_rate": 0.0001728936354452302, + "loss": 0.1793, + "step": 9399 + }, + { + "epoch": 0.6782351455680219, + "grad_norm": 0.14302140474319458, + "learning_rate": 0.00017289074902583347, + "loss": 0.1557, + "step": 9400 + }, + { + "epoch": 0.6783072982430823, + "grad_norm": 0.13245822489261627, + "learning_rate": 0.00017288786260643673, + "loss": 0.1292, + "step": 9401 + }, + { + "epoch": 0.6783794509181428, + "grad_norm": 0.15688206255435944, + "learning_rate": 0.00017288497618703997, + "loss": 0.1519, + "step": 9402 + }, + { + "epoch": 0.6784516035932032, + "grad_norm": 0.12463241070508957, + "learning_rate": 0.00017288208976764323, + "loss": 0.1406, + "step": 9403 + }, + { + "epoch": 0.6785237562682637, + "grad_norm": 0.12490706145763397, + "learning_rate": 0.0001728792033482465, + "loss": 0.1746, + "step": 9404 + }, + { + "epoch": 0.6785959089433241, + "grad_norm": 0.1181960180401802, + "learning_rate": 0.00017287631692884978, + "loss": 0.1423, + "step": 9405 + }, + { + "epoch": 0.6786680616183846, + "grad_norm": 0.13778024911880493, + "learning_rate": 0.00017287343050945305, + "loss": 0.1425, + "step": 9406 + }, + { + "epoch": 0.6787402142934449, + "grad_norm": 0.12254858762025833, + "learning_rate": 0.00017287054409005628, + "loss": 0.1267, + "step": 9407 + }, + { + "epoch": 0.6788123669685053, + "grad_norm": 0.14109854400157928, + "learning_rate": 0.00017286765767065955, + "loss": 0.1313, + "step": 9408 + }, + { + "epoch": 0.6788845196435658, + "grad_norm": 0.1068730428814888, + "learning_rate": 0.0001728647712512628, + "loss": 0.1553, + "step": 9409 + }, + { + "epoch": 0.6789566723186262, + "grad_norm": 0.137517049908638, + "learning_rate": 0.00017286188483186607, + "loss": 0.1921, + "step": 9410 + }, + { + "epoch": 0.6790288249936867, + "grad_norm": 0.14153388142585754, + "learning_rate": 0.00017285899841246934, + "loss": 0.1704, + "step": 9411 + }, + { + "epoch": 0.6791009776687471, + "grad_norm": 0.1350041925907135, + "learning_rate": 0.0001728561119930726, + "loss": 0.1646, + "step": 9412 + }, + { + "epoch": 0.6791731303438076, + "grad_norm": 0.12162260711193085, + "learning_rate": 0.00017285322557367586, + "loss": 0.1823, + "step": 9413 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.1428827941417694, + "learning_rate": 0.00017285033915427912, + "loss": 0.1548, + "step": 9414 + }, + { + "epoch": 0.6793174356939283, + "grad_norm": 0.11298346519470215, + "learning_rate": 0.0001728474527348824, + "loss": 0.1114, + "step": 9415 + }, + { + "epoch": 0.6793895883689888, + "grad_norm": 0.14177198708057404, + "learning_rate": 0.00017284456631548565, + "loss": 0.1665, + "step": 9416 + }, + { + "epoch": 0.6794617410440492, + "grad_norm": 0.14046108722686768, + "learning_rate": 0.0001728416798960889, + "loss": 0.1998, + "step": 9417 + }, + { + "epoch": 0.6795338937191097, + "grad_norm": 0.13591402769088745, + "learning_rate": 0.00017283879347669218, + "loss": 0.1313, + "step": 9418 + }, + { + "epoch": 0.6796060463941701, + "grad_norm": 0.1478928178548813, + "learning_rate": 0.00017283590705729544, + "loss": 0.1595, + "step": 9419 + }, + { + "epoch": 0.6796781990692304, + "grad_norm": 0.10648253560066223, + "learning_rate": 0.0001728330206378987, + "loss": 0.1338, + "step": 9420 + }, + { + "epoch": 0.6797503517442909, + "grad_norm": 0.1144508346915245, + "learning_rate": 0.00017283013421850196, + "loss": 0.1517, + "step": 9421 + }, + { + "epoch": 0.6798225044193513, + "grad_norm": 0.12182658165693283, + "learning_rate": 0.00017282724779910523, + "loss": 0.1726, + "step": 9422 + }, + { + "epoch": 0.6798946570944118, + "grad_norm": 0.09788574278354645, + "learning_rate": 0.0001728243613797085, + "loss": 0.1191, + "step": 9423 + }, + { + "epoch": 0.6799668097694722, + "grad_norm": 0.12478946894407272, + "learning_rate": 0.00017282147496031173, + "loss": 0.1847, + "step": 9424 + }, + { + "epoch": 0.6800389624445327, + "grad_norm": 0.13396452367305756, + "learning_rate": 0.000172818588540915, + "loss": 0.1459, + "step": 9425 + }, + { + "epoch": 0.6801111151195931, + "grad_norm": 0.14490275084972382, + "learning_rate": 0.00017281570212151828, + "loss": 0.1488, + "step": 9426 + }, + { + "epoch": 0.6801832677946534, + "grad_norm": 0.11920656263828278, + "learning_rate": 0.00017281281570212154, + "loss": 0.1479, + "step": 9427 + }, + { + "epoch": 0.6802554204697139, + "grad_norm": 0.13172681629657745, + "learning_rate": 0.0001728099292827248, + "loss": 0.1396, + "step": 9428 + }, + { + "epoch": 0.6803275731447743, + "grad_norm": 0.11047541350126266, + "learning_rate": 0.00017280704286332804, + "loss": 0.1122, + "step": 9429 + }, + { + "epoch": 0.6803997258198348, + "grad_norm": 0.10893700271844864, + "learning_rate": 0.0001728041564439313, + "loss": 0.1294, + "step": 9430 + }, + { + "epoch": 0.6804718784948952, + "grad_norm": 0.12578247487545013, + "learning_rate": 0.00017280127002453457, + "loss": 0.1333, + "step": 9431 + }, + { + "epoch": 0.6805440311699557, + "grad_norm": 0.13303852081298828, + "learning_rate": 0.00017279838360513783, + "loss": 0.1471, + "step": 9432 + }, + { + "epoch": 0.6806161838450161, + "grad_norm": 0.13911806046962738, + "learning_rate": 0.00017279549718574112, + "loss": 0.1831, + "step": 9433 + }, + { + "epoch": 0.6806883365200764, + "grad_norm": 0.10774588584899902, + "learning_rate": 0.00017279261076634436, + "loss": 0.1762, + "step": 9434 + }, + { + "epoch": 0.6807604891951369, + "grad_norm": 0.12354011088609695, + "learning_rate": 0.00017278972434694762, + "loss": 0.2078, + "step": 9435 + }, + { + "epoch": 0.6808326418701973, + "grad_norm": 0.1513359546661377, + "learning_rate": 0.00017278683792755088, + "loss": 0.1283, + "step": 9436 + }, + { + "epoch": 0.6809047945452578, + "grad_norm": 0.13830235600471497, + "learning_rate": 0.00017278395150815414, + "loss": 0.1261, + "step": 9437 + }, + { + "epoch": 0.6809769472203182, + "grad_norm": 0.13394580781459808, + "learning_rate": 0.0001727810650887574, + "loss": 0.179, + "step": 9438 + }, + { + "epoch": 0.6810490998953787, + "grad_norm": 0.11969741433858871, + "learning_rate": 0.00017277817866936067, + "loss": 0.1396, + "step": 9439 + }, + { + "epoch": 0.6811212525704391, + "grad_norm": 0.13217154145240784, + "learning_rate": 0.00017277529224996393, + "loss": 0.1401, + "step": 9440 + }, + { + "epoch": 0.6811934052454994, + "grad_norm": 0.11761936545372009, + "learning_rate": 0.0001727724058305672, + "loss": 0.1316, + "step": 9441 + }, + { + "epoch": 0.6812655579205599, + "grad_norm": 0.1200450211763382, + "learning_rate": 0.00017276951941117046, + "loss": 0.1436, + "step": 9442 + }, + { + "epoch": 0.6813377105956203, + "grad_norm": 0.1316228061914444, + "learning_rate": 0.00017276663299177372, + "loss": 0.1564, + "step": 9443 + }, + { + "epoch": 0.6814098632706808, + "grad_norm": 0.12965944409370422, + "learning_rate": 0.00017276374657237698, + "loss": 0.1458, + "step": 9444 + }, + { + "epoch": 0.6814820159457412, + "grad_norm": 0.13953819870948792, + "learning_rate": 0.00017276086015298022, + "loss": 0.1425, + "step": 9445 + }, + { + "epoch": 0.6815541686208016, + "grad_norm": 0.1284487396478653, + "learning_rate": 0.00017275797373358348, + "loss": 0.1378, + "step": 9446 + }, + { + "epoch": 0.6816263212958621, + "grad_norm": 0.09896153956651688, + "learning_rate": 0.00017275508731418677, + "loss": 0.1547, + "step": 9447 + }, + { + "epoch": 0.6816984739709224, + "grad_norm": 0.11616410315036774, + "learning_rate": 0.00017275220089479004, + "loss": 0.116, + "step": 9448 + }, + { + "epoch": 0.6817706266459829, + "grad_norm": 0.150199756026268, + "learning_rate": 0.0001727493144753933, + "loss": 0.1362, + "step": 9449 + }, + { + "epoch": 0.6818427793210433, + "grad_norm": 0.15000593662261963, + "learning_rate": 0.00017274642805599653, + "loss": 0.1047, + "step": 9450 + }, + { + "epoch": 0.6819149319961038, + "grad_norm": 0.1399821788072586, + "learning_rate": 0.0001727435416365998, + "loss": 0.1229, + "step": 9451 + }, + { + "epoch": 0.6819870846711642, + "grad_norm": 0.12325285375118256, + "learning_rate": 0.00017274065521720306, + "loss": 0.1374, + "step": 9452 + }, + { + "epoch": 0.6820592373462246, + "grad_norm": 0.10625096410512924, + "learning_rate": 0.00017273776879780632, + "loss": 0.0805, + "step": 9453 + }, + { + "epoch": 0.6821313900212851, + "grad_norm": 0.1238563284277916, + "learning_rate": 0.00017273488237840961, + "loss": 0.1878, + "step": 9454 + }, + { + "epoch": 0.6822035426963454, + "grad_norm": 0.14863990247249603, + "learning_rate": 0.00017273199595901285, + "loss": 0.1547, + "step": 9455 + }, + { + "epoch": 0.6822756953714059, + "grad_norm": 0.14600786566734314, + "learning_rate": 0.0001727291095396161, + "loss": 0.187, + "step": 9456 + }, + { + "epoch": 0.6823478480464663, + "grad_norm": 0.13600049912929535, + "learning_rate": 0.00017272622312021938, + "loss": 0.1425, + "step": 9457 + }, + { + "epoch": 0.6824200007215268, + "grad_norm": 0.14593887329101562, + "learning_rate": 0.00017272333670082264, + "loss": 0.1396, + "step": 9458 + }, + { + "epoch": 0.6824921533965872, + "grad_norm": 0.10805844515562057, + "learning_rate": 0.0001727204502814259, + "loss": 0.1403, + "step": 9459 + }, + { + "epoch": 0.6825643060716476, + "grad_norm": 0.13881978392601013, + "learning_rate": 0.00017271756386202916, + "loss": 0.1365, + "step": 9460 + }, + { + "epoch": 0.6826364587467081, + "grad_norm": 0.11945115774869919, + "learning_rate": 0.00017271467744263243, + "loss": 0.1848, + "step": 9461 + }, + { + "epoch": 0.6827086114217684, + "grad_norm": 0.10048525780439377, + "learning_rate": 0.0001727117910232357, + "loss": 0.1184, + "step": 9462 + }, + { + "epoch": 0.6827807640968289, + "grad_norm": 0.1330692172050476, + "learning_rate": 0.00017270890460383895, + "loss": 0.124, + "step": 9463 + }, + { + "epoch": 0.6828529167718893, + "grad_norm": 0.1183803454041481, + "learning_rate": 0.00017270601818444222, + "loss": 0.1459, + "step": 9464 + }, + { + "epoch": 0.6829250694469498, + "grad_norm": 0.1102006658911705, + "learning_rate": 0.00017270313176504548, + "loss": 0.1913, + "step": 9465 + }, + { + "epoch": 0.6829972221220102, + "grad_norm": 0.1035265251994133, + "learning_rate": 0.00017270024534564871, + "loss": 0.16, + "step": 9466 + }, + { + "epoch": 0.6830693747970706, + "grad_norm": 0.13262054324150085, + "learning_rate": 0.00017269735892625198, + "loss": 0.1612, + "step": 9467 + }, + { + "epoch": 0.6831415274721311, + "grad_norm": 0.12814365327358246, + "learning_rate": 0.00017269447250685527, + "loss": 0.1059, + "step": 9468 + }, + { + "epoch": 0.6832136801471914, + "grad_norm": 0.10215715318918228, + "learning_rate": 0.00017269158608745853, + "loss": 0.1637, + "step": 9469 + }, + { + "epoch": 0.6832858328222519, + "grad_norm": 0.13353413343429565, + "learning_rate": 0.0001726886996680618, + "loss": 0.1213, + "step": 9470 + }, + { + "epoch": 0.6833579854973123, + "grad_norm": 0.1456976979970932, + "learning_rate": 0.00017268581324866503, + "loss": 0.1989, + "step": 9471 + }, + { + "epoch": 0.6834301381723727, + "grad_norm": 0.11505065858364105, + "learning_rate": 0.0001726829268292683, + "loss": 0.1887, + "step": 9472 + }, + { + "epoch": 0.6835022908474332, + "grad_norm": 0.11727729439735413, + "learning_rate": 0.00017268004040987156, + "loss": 0.1246, + "step": 9473 + }, + { + "epoch": 0.6835744435224936, + "grad_norm": 0.1266585737466812, + "learning_rate": 0.00017267715399047482, + "loss": 0.1695, + "step": 9474 + }, + { + "epoch": 0.6836465961975541, + "grad_norm": 0.13724775612354279, + "learning_rate": 0.0001726742675710781, + "loss": 0.1464, + "step": 9475 + }, + { + "epoch": 0.6837187488726144, + "grad_norm": 0.11454111337661743, + "learning_rate": 0.00017267138115168134, + "loss": 0.1337, + "step": 9476 + }, + { + "epoch": 0.6837909015476749, + "grad_norm": 0.13361407816410065, + "learning_rate": 0.0001726684947322846, + "loss": 0.1239, + "step": 9477 + }, + { + "epoch": 0.6838630542227353, + "grad_norm": 0.1123400554060936, + "learning_rate": 0.00017266560831288787, + "loss": 0.1107, + "step": 9478 + }, + { + "epoch": 0.6839352068977957, + "grad_norm": 0.12301890552043915, + "learning_rate": 0.00017266272189349113, + "loss": 0.1448, + "step": 9479 + }, + { + "epoch": 0.6840073595728562, + "grad_norm": 0.13098479807376862, + "learning_rate": 0.0001726598354740944, + "loss": 0.1417, + "step": 9480 + }, + { + "epoch": 0.6840795122479166, + "grad_norm": 0.12436211854219437, + "learning_rate": 0.00017265694905469766, + "loss": 0.1333, + "step": 9481 + }, + { + "epoch": 0.684151664922977, + "grad_norm": 0.11635185778141022, + "learning_rate": 0.00017265406263530092, + "loss": 0.2084, + "step": 9482 + }, + { + "epoch": 0.6842238175980374, + "grad_norm": 0.14020775258541107, + "learning_rate": 0.00017265117621590418, + "loss": 0.15, + "step": 9483 + }, + { + "epoch": 0.6842959702730979, + "grad_norm": 0.14138399064540863, + "learning_rate": 0.00017264828979650745, + "loss": 0.2069, + "step": 9484 + }, + { + "epoch": 0.6843681229481583, + "grad_norm": 0.13020771741867065, + "learning_rate": 0.0001726454033771107, + "loss": 0.1489, + "step": 9485 + }, + { + "epoch": 0.6844402756232187, + "grad_norm": 0.11580926924943924, + "learning_rate": 0.00017264251695771397, + "loss": 0.1375, + "step": 9486 + }, + { + "epoch": 0.6845124282982792, + "grad_norm": 0.12390075623989105, + "learning_rate": 0.0001726396305383172, + "loss": 0.1811, + "step": 9487 + }, + { + "epoch": 0.6845845809733396, + "grad_norm": 0.11783546954393387, + "learning_rate": 0.00017263674411892047, + "loss": 0.13, + "step": 9488 + }, + { + "epoch": 0.6846567336484, + "grad_norm": 0.13768181204795837, + "learning_rate": 0.00017263385769952376, + "loss": 0.1394, + "step": 9489 + }, + { + "epoch": 0.6847288863234604, + "grad_norm": 0.13023586571216583, + "learning_rate": 0.00017263097128012702, + "loss": 0.1207, + "step": 9490 + }, + { + "epoch": 0.6848010389985209, + "grad_norm": 0.12852199375629425, + "learning_rate": 0.0001726280848607303, + "loss": 0.1538, + "step": 9491 + }, + { + "epoch": 0.6848731916735813, + "grad_norm": 0.15153183043003082, + "learning_rate": 0.00017262519844133352, + "loss": 0.1209, + "step": 9492 + }, + { + "epoch": 0.6849453443486417, + "grad_norm": 0.12875881791114807, + "learning_rate": 0.00017262231202193679, + "loss": 0.1907, + "step": 9493 + }, + { + "epoch": 0.6850174970237022, + "grad_norm": 0.11426623910665512, + "learning_rate": 0.00017261942560254005, + "loss": 0.1444, + "step": 9494 + }, + { + "epoch": 0.6850896496987626, + "grad_norm": 0.15419016778469086, + "learning_rate": 0.0001726165391831433, + "loss": 0.1739, + "step": 9495 + }, + { + "epoch": 0.685161802373823, + "grad_norm": 0.09922267496585846, + "learning_rate": 0.0001726136527637466, + "loss": 0.0887, + "step": 9496 + }, + { + "epoch": 0.6852339550488834, + "grad_norm": 0.12559925019741058, + "learning_rate": 0.00017261076634434984, + "loss": 0.1384, + "step": 9497 + }, + { + "epoch": 0.6853061077239438, + "grad_norm": 0.16568002104759216, + "learning_rate": 0.0001726078799249531, + "loss": 0.1957, + "step": 9498 + }, + { + "epoch": 0.6853782603990043, + "grad_norm": 0.13108040392398834, + "learning_rate": 0.00017260499350555636, + "loss": 0.1209, + "step": 9499 + }, + { + "epoch": 0.6854504130740647, + "grad_norm": 0.12693406641483307, + "learning_rate": 0.00017260210708615963, + "loss": 0.1452, + "step": 9500 + }, + { + "epoch": 0.6855225657491252, + "grad_norm": 0.15336719155311584, + "learning_rate": 0.0001725992206667629, + "loss": 0.1389, + "step": 9501 + }, + { + "epoch": 0.6855947184241856, + "grad_norm": 0.11107275635004044, + "learning_rate": 0.00017259633424736615, + "loss": 0.0965, + "step": 9502 + }, + { + "epoch": 0.685666871099246, + "grad_norm": 0.13186608254909515, + "learning_rate": 0.0001725934478279694, + "loss": 0.1351, + "step": 9503 + }, + { + "epoch": 0.6857390237743064, + "grad_norm": 0.11895376443862915, + "learning_rate": 0.00017259056140857268, + "loss": 0.0987, + "step": 9504 + }, + { + "epoch": 0.6858111764493668, + "grad_norm": 0.11779962480068207, + "learning_rate": 0.00017258767498917594, + "loss": 0.1417, + "step": 9505 + }, + { + "epoch": 0.6858833291244273, + "grad_norm": 0.12343606352806091, + "learning_rate": 0.0001725847885697792, + "loss": 0.1554, + "step": 9506 + }, + { + "epoch": 0.6859554817994877, + "grad_norm": 0.13738082349300385, + "learning_rate": 0.00017258190215038247, + "loss": 0.198, + "step": 9507 + }, + { + "epoch": 0.6860276344745482, + "grad_norm": 0.13566620647907257, + "learning_rate": 0.0001725790157309857, + "loss": 0.1304, + "step": 9508 + }, + { + "epoch": 0.6860997871496086, + "grad_norm": 0.12368319928646088, + "learning_rate": 0.00017257612931158897, + "loss": 0.1367, + "step": 9509 + }, + { + "epoch": 0.686171939824669, + "grad_norm": 0.16261732578277588, + "learning_rate": 0.00017257324289219223, + "loss": 0.18, + "step": 9510 + }, + { + "epoch": 0.6862440924997294, + "grad_norm": 0.11897819489240646, + "learning_rate": 0.00017257035647279552, + "loss": 0.1463, + "step": 9511 + }, + { + "epoch": 0.6863162451747898, + "grad_norm": 0.10302698612213135, + "learning_rate": 0.00017256747005339878, + "loss": 0.1263, + "step": 9512 + }, + { + "epoch": 0.6863883978498503, + "grad_norm": 0.10292990505695343, + "learning_rate": 0.00017256458363400202, + "loss": 0.1433, + "step": 9513 + }, + { + "epoch": 0.6864605505249107, + "grad_norm": 0.13708685338497162, + "learning_rate": 0.00017256169721460528, + "loss": 0.1819, + "step": 9514 + }, + { + "epoch": 0.6865327031999712, + "grad_norm": 0.1115337535738945, + "learning_rate": 0.00017255881079520854, + "loss": 0.1542, + "step": 9515 + }, + { + "epoch": 0.6866048558750316, + "grad_norm": 0.15068112313747406, + "learning_rate": 0.0001725559243758118, + "loss": 0.1547, + "step": 9516 + }, + { + "epoch": 0.686677008550092, + "grad_norm": 0.13998079299926758, + "learning_rate": 0.00017255303795641507, + "loss": 0.1274, + "step": 9517 + }, + { + "epoch": 0.6867491612251524, + "grad_norm": 0.1232718825340271, + "learning_rate": 0.00017255015153701833, + "loss": 0.1221, + "step": 9518 + }, + { + "epoch": 0.6868213139002128, + "grad_norm": 0.11890007555484772, + "learning_rate": 0.0001725472651176216, + "loss": 0.1534, + "step": 9519 + }, + { + "epoch": 0.6868934665752733, + "grad_norm": 0.12487497925758362, + "learning_rate": 0.00017254437869822486, + "loss": 0.1456, + "step": 9520 + }, + { + "epoch": 0.6869656192503337, + "grad_norm": 0.13111813366413116, + "learning_rate": 0.00017254149227882812, + "loss": 0.124, + "step": 9521 + }, + { + "epoch": 0.6870377719253942, + "grad_norm": 0.1577240377664566, + "learning_rate": 0.00017253860585943138, + "loss": 0.1544, + "step": 9522 + }, + { + "epoch": 0.6871099246004546, + "grad_norm": 0.14386539161205292, + "learning_rate": 0.00017253571944003465, + "loss": 0.1515, + "step": 9523 + }, + { + "epoch": 0.687182077275515, + "grad_norm": 0.12887921929359436, + "learning_rate": 0.00017253283302063788, + "loss": 0.1457, + "step": 9524 + }, + { + "epoch": 0.6872542299505754, + "grad_norm": 0.1316738724708557, + "learning_rate": 0.00017252994660124117, + "loss": 0.1257, + "step": 9525 + }, + { + "epoch": 0.6873263826256358, + "grad_norm": 0.12598678469657898, + "learning_rate": 0.00017252706018184444, + "loss": 0.1165, + "step": 9526 + }, + { + "epoch": 0.6873985353006963, + "grad_norm": 0.11530796438455582, + "learning_rate": 0.0001725241737624477, + "loss": 0.1135, + "step": 9527 + }, + { + "epoch": 0.6874706879757567, + "grad_norm": 0.1386406421661377, + "learning_rate": 0.00017252128734305096, + "loss": 0.1847, + "step": 9528 + }, + { + "epoch": 0.6875428406508172, + "grad_norm": 0.12268256396055222, + "learning_rate": 0.00017251840092365422, + "loss": 0.2082, + "step": 9529 + }, + { + "epoch": 0.6876149933258776, + "grad_norm": 0.15390999615192413, + "learning_rate": 0.00017251551450425746, + "loss": 0.128, + "step": 9530 + }, + { + "epoch": 0.6876871460009379, + "grad_norm": 0.10604900121688843, + "learning_rate": 0.00017251262808486072, + "loss": 0.106, + "step": 9531 + }, + { + "epoch": 0.6877592986759984, + "grad_norm": 0.1284235268831253, + "learning_rate": 0.000172509741665464, + "loss": 0.1408, + "step": 9532 + }, + { + "epoch": 0.6878314513510588, + "grad_norm": 0.1350274533033371, + "learning_rate": 0.00017250685524606728, + "loss": 0.1856, + "step": 9533 + }, + { + "epoch": 0.6879036040261193, + "grad_norm": 0.11651486158370972, + "learning_rate": 0.00017250396882667054, + "loss": 0.1913, + "step": 9534 + }, + { + "epoch": 0.6879757567011797, + "grad_norm": 0.15933158993721008, + "learning_rate": 0.00017250108240727377, + "loss": 0.1646, + "step": 9535 + }, + { + "epoch": 0.6880479093762402, + "grad_norm": 0.10325729101896286, + "learning_rate": 0.00017249819598787704, + "loss": 0.1412, + "step": 9536 + }, + { + "epoch": 0.6881200620513006, + "grad_norm": 0.12476102262735367, + "learning_rate": 0.0001724953095684803, + "loss": 0.14, + "step": 9537 + }, + { + "epoch": 0.6881922147263609, + "grad_norm": 0.10627015680074692, + "learning_rate": 0.00017249242314908356, + "loss": 0.1665, + "step": 9538 + }, + { + "epoch": 0.6882643674014214, + "grad_norm": 0.10776442289352417, + "learning_rate": 0.00017248953672968685, + "loss": 0.1371, + "step": 9539 + }, + { + "epoch": 0.6883365200764818, + "grad_norm": 0.13613662123680115, + "learning_rate": 0.0001724866503102901, + "loss": 0.1821, + "step": 9540 + }, + { + "epoch": 0.6884086727515423, + "grad_norm": 0.11171852797269821, + "learning_rate": 0.00017248376389089335, + "loss": 0.1647, + "step": 9541 + }, + { + "epoch": 0.6884808254266027, + "grad_norm": 0.11853030323982239, + "learning_rate": 0.00017248087747149662, + "loss": 0.1447, + "step": 9542 + }, + { + "epoch": 0.6885529781016632, + "grad_norm": 0.11910616606473923, + "learning_rate": 0.00017247799105209988, + "loss": 0.1787, + "step": 9543 + }, + { + "epoch": 0.6886251307767235, + "grad_norm": 0.11468103528022766, + "learning_rate": 0.00017247510463270314, + "loss": 0.1179, + "step": 9544 + }, + { + "epoch": 0.6886972834517839, + "grad_norm": 0.1218499168753624, + "learning_rate": 0.0001724722182133064, + "loss": 0.1822, + "step": 9545 + }, + { + "epoch": 0.6887694361268444, + "grad_norm": 0.1380220502614975, + "learning_rate": 0.00017246933179390967, + "loss": 0.142, + "step": 9546 + }, + { + "epoch": 0.6888415888019048, + "grad_norm": 0.1014731153845787, + "learning_rate": 0.00017246644537451293, + "loss": 0.158, + "step": 9547 + }, + { + "epoch": 0.6889137414769653, + "grad_norm": 0.12388917803764343, + "learning_rate": 0.0001724635589551162, + "loss": 0.1516, + "step": 9548 + }, + { + "epoch": 0.6889858941520257, + "grad_norm": 0.13061605393886566, + "learning_rate": 0.00017246067253571946, + "loss": 0.1361, + "step": 9549 + }, + { + "epoch": 0.6890580468270862, + "grad_norm": 0.11290262639522552, + "learning_rate": 0.00017245778611632272, + "loss": 0.1292, + "step": 9550 + }, + { + "epoch": 0.6891301995021465, + "grad_norm": 0.10076184570789337, + "learning_rate": 0.00017245489969692595, + "loss": 0.1734, + "step": 9551 + }, + { + "epoch": 0.6892023521772069, + "grad_norm": 0.1263117641210556, + "learning_rate": 0.00017245201327752922, + "loss": 0.1593, + "step": 9552 + }, + { + "epoch": 0.6892745048522674, + "grad_norm": 0.13651815056800842, + "learning_rate": 0.0001724491268581325, + "loss": 0.1683, + "step": 9553 + }, + { + "epoch": 0.6893466575273278, + "grad_norm": 0.13438211381435394, + "learning_rate": 0.00017244624043873577, + "loss": 0.1387, + "step": 9554 + }, + { + "epoch": 0.6894188102023883, + "grad_norm": 0.11674962937831879, + "learning_rate": 0.00017244335401933903, + "loss": 0.1842, + "step": 9555 + }, + { + "epoch": 0.6894909628774487, + "grad_norm": 0.14424452185630798, + "learning_rate": 0.00017244046759994227, + "loss": 0.1309, + "step": 9556 + }, + { + "epoch": 0.6895631155525092, + "grad_norm": 0.1262129247188568, + "learning_rate": 0.00017243758118054553, + "loss": 0.1803, + "step": 9557 + }, + { + "epoch": 0.6896352682275695, + "grad_norm": 0.13307301700115204, + "learning_rate": 0.0001724346947611488, + "loss": 0.1321, + "step": 9558 + }, + { + "epoch": 0.6897074209026299, + "grad_norm": 0.13330549001693726, + "learning_rate": 0.00017243180834175206, + "loss": 0.1213, + "step": 9559 + }, + { + "epoch": 0.6897795735776904, + "grad_norm": 0.11524797230958939, + "learning_rate": 0.00017242892192235535, + "loss": 0.1487, + "step": 9560 + }, + { + "epoch": 0.6898517262527508, + "grad_norm": 0.12419763952493668, + "learning_rate": 0.00017242603550295858, + "loss": 0.1586, + "step": 9561 + }, + { + "epoch": 0.6899238789278113, + "grad_norm": 0.1287110149860382, + "learning_rate": 0.00017242314908356185, + "loss": 0.1274, + "step": 9562 + }, + { + "epoch": 0.6899960316028717, + "grad_norm": 0.15064340829849243, + "learning_rate": 0.0001724202626641651, + "loss": 0.1192, + "step": 9563 + }, + { + "epoch": 0.6900681842779322, + "grad_norm": 0.11350834369659424, + "learning_rate": 0.00017241737624476837, + "loss": 0.2087, + "step": 9564 + }, + { + "epoch": 0.6901403369529925, + "grad_norm": 0.11448984593153, + "learning_rate": 0.00017241448982537164, + "loss": 0.178, + "step": 9565 + }, + { + "epoch": 0.6902124896280529, + "grad_norm": 0.09771303087472916, + "learning_rate": 0.0001724116034059749, + "loss": 0.143, + "step": 9566 + }, + { + "epoch": 0.6902846423031134, + "grad_norm": 0.12573154270648956, + "learning_rate": 0.00017240871698657816, + "loss": 0.1395, + "step": 9567 + }, + { + "epoch": 0.6903567949781738, + "grad_norm": 0.10742811858654022, + "learning_rate": 0.00017240583056718142, + "loss": 0.1624, + "step": 9568 + }, + { + "epoch": 0.6904289476532343, + "grad_norm": 0.13596831262111664, + "learning_rate": 0.0001724029441477847, + "loss": 0.131, + "step": 9569 + }, + { + "epoch": 0.6905011003282947, + "grad_norm": 0.12550032138824463, + "learning_rate": 0.00017240005772838795, + "loss": 0.1523, + "step": 9570 + }, + { + "epoch": 0.6905732530033551, + "grad_norm": 0.1494235396385193, + "learning_rate": 0.0001723971713089912, + "loss": 0.1294, + "step": 9571 + }, + { + "epoch": 0.6906454056784155, + "grad_norm": 0.11433825641870499, + "learning_rate": 0.00017239428488959445, + "loss": 0.0807, + "step": 9572 + }, + { + "epoch": 0.6907175583534759, + "grad_norm": 0.12817634642124176, + "learning_rate": 0.0001723913984701977, + "loss": 0.1258, + "step": 9573 + }, + { + "epoch": 0.6907897110285364, + "grad_norm": 0.1334555745124817, + "learning_rate": 0.000172388512050801, + "loss": 0.1566, + "step": 9574 + }, + { + "epoch": 0.6908618637035968, + "grad_norm": 0.1532028317451477, + "learning_rate": 0.00017238562563140426, + "loss": 0.1331, + "step": 9575 + }, + { + "epoch": 0.6909340163786573, + "grad_norm": 0.11318008601665497, + "learning_rate": 0.00017238273921200753, + "loss": 0.1324, + "step": 9576 + }, + { + "epoch": 0.6910061690537177, + "grad_norm": 0.13026483356952667, + "learning_rate": 0.00017237985279261076, + "loss": 0.1817, + "step": 9577 + }, + { + "epoch": 0.6910783217287781, + "grad_norm": 0.1172034814953804, + "learning_rate": 0.00017237696637321403, + "loss": 0.1744, + "step": 9578 + }, + { + "epoch": 0.6911504744038385, + "grad_norm": 0.11258107423782349, + "learning_rate": 0.0001723740799538173, + "loss": 0.1678, + "step": 9579 + }, + { + "epoch": 0.6912226270788989, + "grad_norm": 0.12172319740056992, + "learning_rate": 0.00017237119353442055, + "loss": 0.1469, + "step": 9580 + }, + { + "epoch": 0.6912947797539594, + "grad_norm": 0.1422765851020813, + "learning_rate": 0.00017236830711502384, + "loss": 0.1624, + "step": 9581 + }, + { + "epoch": 0.6913669324290198, + "grad_norm": 0.10180949419736862, + "learning_rate": 0.00017236542069562708, + "loss": 0.1255, + "step": 9582 + }, + { + "epoch": 0.6914390851040803, + "grad_norm": 0.11146809905767441, + "learning_rate": 0.00017236253427623034, + "loss": 0.1089, + "step": 9583 + }, + { + "epoch": 0.6915112377791407, + "grad_norm": 0.10429064929485321, + "learning_rate": 0.0001723596478568336, + "loss": 0.1667, + "step": 9584 + }, + { + "epoch": 0.6915833904542011, + "grad_norm": 0.12864717841148376, + "learning_rate": 0.00017235676143743687, + "loss": 0.1105, + "step": 9585 + }, + { + "epoch": 0.6916555431292615, + "grad_norm": 0.16162557899951935, + "learning_rate": 0.00017235387501804013, + "loss": 0.1241, + "step": 9586 + }, + { + "epoch": 0.6917276958043219, + "grad_norm": 0.14276309311389923, + "learning_rate": 0.0001723509885986434, + "loss": 0.1546, + "step": 9587 + }, + { + "epoch": 0.6917998484793824, + "grad_norm": 0.11883494257926941, + "learning_rate": 0.00017234810217924666, + "loss": 0.1405, + "step": 9588 + }, + { + "epoch": 0.6918720011544428, + "grad_norm": 0.16758319735527039, + "learning_rate": 0.00017234521575984992, + "loss": 0.1938, + "step": 9589 + }, + { + "epoch": 0.6919441538295033, + "grad_norm": 0.12192453444004059, + "learning_rate": 0.00017234232934045318, + "loss": 0.1321, + "step": 9590 + }, + { + "epoch": 0.6920163065045637, + "grad_norm": 0.12941285967826843, + "learning_rate": 0.00017233944292105644, + "loss": 0.1712, + "step": 9591 + }, + { + "epoch": 0.6920884591796241, + "grad_norm": 0.11735109984874725, + "learning_rate": 0.0001723365565016597, + "loss": 0.1034, + "step": 9592 + }, + { + "epoch": 0.6921606118546845, + "grad_norm": 0.11214487999677658, + "learning_rate": 0.00017233367008226294, + "loss": 0.1359, + "step": 9593 + }, + { + "epoch": 0.6922327645297449, + "grad_norm": 0.12300753593444824, + "learning_rate": 0.0001723307836628662, + "loss": 0.1582, + "step": 9594 + }, + { + "epoch": 0.6923049172048054, + "grad_norm": 0.13960377871990204, + "learning_rate": 0.0001723278972434695, + "loss": 0.2027, + "step": 9595 + }, + { + "epoch": 0.6923770698798658, + "grad_norm": 0.10137218236923218, + "learning_rate": 0.00017232501082407276, + "loss": 0.0833, + "step": 9596 + }, + { + "epoch": 0.6924492225549262, + "grad_norm": 0.11234530061483383, + "learning_rate": 0.00017232212440467602, + "loss": 0.1472, + "step": 9597 + }, + { + "epoch": 0.6925213752299867, + "grad_norm": 0.11832275986671448, + "learning_rate": 0.00017231923798527926, + "loss": 0.1347, + "step": 9598 + }, + { + "epoch": 0.6925935279050471, + "grad_norm": 0.11702826619148254, + "learning_rate": 0.00017231635156588252, + "loss": 0.1819, + "step": 9599 + }, + { + "epoch": 0.6926656805801075, + "grad_norm": 0.12916803359985352, + "learning_rate": 0.00017231346514648578, + "loss": 0.1322, + "step": 9600 + }, + { + "epoch": 0.6927378332551679, + "grad_norm": 0.13468345999717712, + "learning_rate": 0.00017231057872708905, + "loss": 0.2088, + "step": 9601 + }, + { + "epoch": 0.6928099859302284, + "grad_norm": 0.13211590051651, + "learning_rate": 0.00017230769230769234, + "loss": 0.1602, + "step": 9602 + }, + { + "epoch": 0.6928821386052888, + "grad_norm": 0.13652805984020233, + "learning_rate": 0.00017230480588829557, + "loss": 0.1893, + "step": 9603 + }, + { + "epoch": 0.6929542912803492, + "grad_norm": 0.14025311172008514, + "learning_rate": 0.00017230191946889884, + "loss": 0.1307, + "step": 9604 + }, + { + "epoch": 0.6930264439554097, + "grad_norm": 0.10820788890123367, + "learning_rate": 0.0001722990330495021, + "loss": 0.1834, + "step": 9605 + }, + { + "epoch": 0.69309859663047, + "grad_norm": 0.14675457775592804, + "learning_rate": 0.00017229614663010536, + "loss": 0.1804, + "step": 9606 + }, + { + "epoch": 0.6931707493055305, + "grad_norm": 0.11689016968011856, + "learning_rate": 0.00017229326021070862, + "loss": 0.1217, + "step": 9607 + }, + { + "epoch": 0.6932429019805909, + "grad_norm": 0.24402357637882233, + "learning_rate": 0.0001722903737913119, + "loss": 0.1643, + "step": 9608 + }, + { + "epoch": 0.6933150546556514, + "grad_norm": 0.11813085526227951, + "learning_rate": 0.00017228748737191515, + "loss": 0.1675, + "step": 9609 + }, + { + "epoch": 0.6933872073307118, + "grad_norm": 0.10200902819633484, + "learning_rate": 0.0001722846009525184, + "loss": 0.1088, + "step": 9610 + }, + { + "epoch": 0.6934593600057722, + "grad_norm": 0.11175123602151871, + "learning_rate": 0.00017228171453312168, + "loss": 0.1709, + "step": 9611 + }, + { + "epoch": 0.6935315126808327, + "grad_norm": 0.10914164036512375, + "learning_rate": 0.00017227882811372494, + "loss": 0.1564, + "step": 9612 + }, + { + "epoch": 0.693603665355893, + "grad_norm": 0.1380518078804016, + "learning_rate": 0.0001722759416943282, + "loss": 0.1299, + "step": 9613 + }, + { + "epoch": 0.6936758180309535, + "grad_norm": 0.12954658269882202, + "learning_rate": 0.00017227305527493144, + "loss": 0.1477, + "step": 9614 + }, + { + "epoch": 0.6937479707060139, + "grad_norm": 0.13407106697559357, + "learning_rate": 0.0001722701688555347, + "loss": 0.1991, + "step": 9615 + }, + { + "epoch": 0.6938201233810743, + "grad_norm": 0.1745564341545105, + "learning_rate": 0.000172267282436138, + "loss": 0.1809, + "step": 9616 + }, + { + "epoch": 0.6938922760561348, + "grad_norm": 0.112023264169693, + "learning_rate": 0.00017226439601674125, + "loss": 0.2297, + "step": 9617 + }, + { + "epoch": 0.6939644287311952, + "grad_norm": 0.1244281604886055, + "learning_rate": 0.00017226150959734452, + "loss": 0.1197, + "step": 9618 + }, + { + "epoch": 0.6940365814062557, + "grad_norm": 0.11674293130636215, + "learning_rate": 0.00017225862317794775, + "loss": 0.1706, + "step": 9619 + }, + { + "epoch": 0.694108734081316, + "grad_norm": 0.10764952003955841, + "learning_rate": 0.00017225573675855101, + "loss": 0.1818, + "step": 9620 + }, + { + "epoch": 0.6941808867563765, + "grad_norm": 0.1448487639427185, + "learning_rate": 0.00017225285033915428, + "loss": 0.1375, + "step": 9621 + }, + { + "epoch": 0.6942530394314369, + "grad_norm": 0.14177432656288147, + "learning_rate": 0.00017224996391975754, + "loss": 0.1386, + "step": 9622 + }, + { + "epoch": 0.6943251921064973, + "grad_norm": 0.11783361434936523, + "learning_rate": 0.00017224707750036083, + "loss": 0.1665, + "step": 9623 + }, + { + "epoch": 0.6943973447815578, + "grad_norm": 0.137073814868927, + "learning_rate": 0.00017224419108096407, + "loss": 0.1446, + "step": 9624 + }, + { + "epoch": 0.6944694974566182, + "grad_norm": 0.12243133783340454, + "learning_rate": 0.00017224130466156733, + "loss": 0.1175, + "step": 9625 + }, + { + "epoch": 0.6945416501316787, + "grad_norm": 0.13991667330265045, + "learning_rate": 0.0001722384182421706, + "loss": 0.1561, + "step": 9626 + }, + { + "epoch": 0.694613802806739, + "grad_norm": 0.14223317801952362, + "learning_rate": 0.00017223553182277386, + "loss": 0.1893, + "step": 9627 + }, + { + "epoch": 0.6946859554817995, + "grad_norm": 0.12393701821565628, + "learning_rate": 0.00017223264540337712, + "loss": 0.2033, + "step": 9628 + }, + { + "epoch": 0.6947581081568599, + "grad_norm": 0.16162846982479095, + "learning_rate": 0.00017222975898398038, + "loss": 0.1483, + "step": 9629 + }, + { + "epoch": 0.6948302608319203, + "grad_norm": 0.10883046686649323, + "learning_rate": 0.00017222687256458364, + "loss": 0.1971, + "step": 9630 + }, + { + "epoch": 0.6949024135069808, + "grad_norm": 0.13543400168418884, + "learning_rate": 0.0001722239861451869, + "loss": 0.1575, + "step": 9631 + }, + { + "epoch": 0.6949745661820412, + "grad_norm": 0.11675167083740234, + "learning_rate": 0.00017222109972579017, + "loss": 0.1327, + "step": 9632 + }, + { + "epoch": 0.6950467188571017, + "grad_norm": 0.12616172432899475, + "learning_rate": 0.00017221821330639343, + "loss": 0.1708, + "step": 9633 + }, + { + "epoch": 0.695118871532162, + "grad_norm": 0.12379202991724014, + "learning_rate": 0.0001722153268869967, + "loss": 0.1934, + "step": 9634 + }, + { + "epoch": 0.6951910242072225, + "grad_norm": 0.12510265409946442, + "learning_rate": 0.00017221244046759993, + "loss": 0.1411, + "step": 9635 + }, + { + "epoch": 0.6952631768822829, + "grad_norm": 0.1382853239774704, + "learning_rate": 0.0001722095540482032, + "loss": 0.1537, + "step": 9636 + }, + { + "epoch": 0.6953353295573433, + "grad_norm": 0.14490236341953278, + "learning_rate": 0.00017220666762880648, + "loss": 0.1318, + "step": 9637 + }, + { + "epoch": 0.6954074822324038, + "grad_norm": 0.20690834522247314, + "learning_rate": 0.00017220378120940975, + "loss": 0.1717, + "step": 9638 + }, + { + "epoch": 0.6954796349074642, + "grad_norm": 0.14243018627166748, + "learning_rate": 0.000172200894790013, + "loss": 0.1559, + "step": 9639 + }, + { + "epoch": 0.6955517875825247, + "grad_norm": 0.1079765185713768, + "learning_rate": 0.00017219800837061625, + "loss": 0.1484, + "step": 9640 + }, + { + "epoch": 0.695623940257585, + "grad_norm": 0.12757904827594757, + "learning_rate": 0.0001721951219512195, + "loss": 0.1159, + "step": 9641 + }, + { + "epoch": 0.6956960929326454, + "grad_norm": 0.09857811033725739, + "learning_rate": 0.00017219223553182277, + "loss": 0.1791, + "step": 9642 + }, + { + "epoch": 0.6957682456077059, + "grad_norm": 0.10688071697950363, + "learning_rate": 0.00017218934911242603, + "loss": 0.1583, + "step": 9643 + }, + { + "epoch": 0.6958403982827663, + "grad_norm": 0.1281883716583252, + "learning_rate": 0.00017218646269302932, + "loss": 0.1197, + "step": 9644 + }, + { + "epoch": 0.6959125509578268, + "grad_norm": 0.11293533444404602, + "learning_rate": 0.00017218357627363256, + "loss": 0.1473, + "step": 9645 + }, + { + "epoch": 0.6959847036328872, + "grad_norm": 0.09393231570720673, + "learning_rate": 0.00017218068985423582, + "loss": 0.1865, + "step": 9646 + }, + { + "epoch": 0.6960568563079477, + "grad_norm": 0.10949627310037613, + "learning_rate": 0.0001721778034348391, + "loss": 0.1177, + "step": 9647 + }, + { + "epoch": 0.696129008983008, + "grad_norm": 0.10585692524909973, + "learning_rate": 0.00017217491701544235, + "loss": 0.162, + "step": 9648 + }, + { + "epoch": 0.6962011616580684, + "grad_norm": 0.20258155465126038, + "learning_rate": 0.0001721720305960456, + "loss": 0.1325, + "step": 9649 + }, + { + "epoch": 0.6962733143331289, + "grad_norm": 0.11715184152126312, + "learning_rate": 0.00017216914417664888, + "loss": 0.0812, + "step": 9650 + }, + { + "epoch": 0.6963454670081893, + "grad_norm": 0.12601949274539948, + "learning_rate": 0.00017216625775725214, + "loss": 0.1534, + "step": 9651 + }, + { + "epoch": 0.6964176196832498, + "grad_norm": 0.12976346909999847, + "learning_rate": 0.0001721633713378554, + "loss": 0.1311, + "step": 9652 + }, + { + "epoch": 0.6964897723583102, + "grad_norm": 0.11151674389839172, + "learning_rate": 0.00017216048491845866, + "loss": 0.1125, + "step": 9653 + }, + { + "epoch": 0.6965619250333707, + "grad_norm": 0.11534590274095535, + "learning_rate": 0.00017215759849906193, + "loss": 0.1359, + "step": 9654 + }, + { + "epoch": 0.696634077708431, + "grad_norm": 0.11569371819496155, + "learning_rate": 0.0001721547120796652, + "loss": 0.1609, + "step": 9655 + }, + { + "epoch": 0.6967062303834914, + "grad_norm": 0.10117107629776001, + "learning_rate": 0.00017215182566026845, + "loss": 0.1113, + "step": 9656 + }, + { + "epoch": 0.6967783830585519, + "grad_norm": 0.20143665373325348, + "learning_rate": 0.0001721489392408717, + "loss": 0.2259, + "step": 9657 + }, + { + "epoch": 0.6968505357336123, + "grad_norm": 0.13160905241966248, + "learning_rate": 0.00017214605282147498, + "loss": 0.1889, + "step": 9658 + }, + { + "epoch": 0.6969226884086728, + "grad_norm": 0.12414603680372238, + "learning_rate": 0.00017214316640207824, + "loss": 0.1555, + "step": 9659 + }, + { + "epoch": 0.6969948410837332, + "grad_norm": 0.12692929804325104, + "learning_rate": 0.0001721402799826815, + "loss": 0.135, + "step": 9660 + }, + { + "epoch": 0.6970669937587937, + "grad_norm": 0.14018724858760834, + "learning_rate": 0.00017213739356328477, + "loss": 0.1293, + "step": 9661 + }, + { + "epoch": 0.697139146433854, + "grad_norm": 0.12804187834262848, + "learning_rate": 0.000172134507143888, + "loss": 0.1518, + "step": 9662 + }, + { + "epoch": 0.6972112991089144, + "grad_norm": 0.11388224363327026, + "learning_rate": 0.00017213162072449127, + "loss": 0.1431, + "step": 9663 + }, + { + "epoch": 0.6972834517839749, + "grad_norm": 0.12438242137432098, + "learning_rate": 0.00017212873430509453, + "loss": 0.2015, + "step": 9664 + }, + { + "epoch": 0.6973556044590353, + "grad_norm": 0.1338079273700714, + "learning_rate": 0.00017212584788569782, + "loss": 0.1611, + "step": 9665 + }, + { + "epoch": 0.6974277571340958, + "grad_norm": 0.14716395735740662, + "learning_rate": 0.00017212296146630108, + "loss": 0.1185, + "step": 9666 + }, + { + "epoch": 0.6974999098091562, + "grad_norm": 0.12072800099849701, + "learning_rate": 0.00017212007504690432, + "loss": 0.1504, + "step": 9667 + }, + { + "epoch": 0.6975720624842165, + "grad_norm": 0.11952561885118484, + "learning_rate": 0.00017211718862750758, + "loss": 0.2027, + "step": 9668 + }, + { + "epoch": 0.697644215159277, + "grad_norm": 0.14002487063407898, + "learning_rate": 0.00017211430220811084, + "loss": 0.1722, + "step": 9669 + }, + { + "epoch": 0.6977163678343374, + "grad_norm": 0.17951540648937225, + "learning_rate": 0.0001721114157887141, + "loss": 0.1605, + "step": 9670 + }, + { + "epoch": 0.6977885205093979, + "grad_norm": 0.12686780095100403, + "learning_rate": 0.00017210852936931737, + "loss": 0.1333, + "step": 9671 + }, + { + "epoch": 0.6978606731844583, + "grad_norm": 0.1033802255988121, + "learning_rate": 0.00017210564294992063, + "loss": 0.1672, + "step": 9672 + }, + { + "epoch": 0.6979328258595188, + "grad_norm": 0.13104183971881866, + "learning_rate": 0.0001721027565305239, + "loss": 0.1433, + "step": 9673 + }, + { + "epoch": 0.6980049785345792, + "grad_norm": 0.12744154036045074, + "learning_rate": 0.00017209987011112716, + "loss": 0.1628, + "step": 9674 + }, + { + "epoch": 0.6980771312096395, + "grad_norm": 0.12353894114494324, + "learning_rate": 0.00017209698369173042, + "loss": 0.15, + "step": 9675 + }, + { + "epoch": 0.6981492838847, + "grad_norm": 0.13690710067749023, + "learning_rate": 0.00017209409727233368, + "loss": 0.1135, + "step": 9676 + }, + { + "epoch": 0.6982214365597604, + "grad_norm": 0.1367052048444748, + "learning_rate": 0.00017209121085293695, + "loss": 0.1601, + "step": 9677 + }, + { + "epoch": 0.6982935892348209, + "grad_norm": 0.10972071439027786, + "learning_rate": 0.00017208832443354018, + "loss": 0.1109, + "step": 9678 + }, + { + "epoch": 0.6983657419098813, + "grad_norm": 0.19588278234004974, + "learning_rate": 0.00017208543801414347, + "loss": 0.1736, + "step": 9679 + }, + { + "epoch": 0.6984378945849418, + "grad_norm": 0.14738193154335022, + "learning_rate": 0.00017208255159474674, + "loss": 0.1543, + "step": 9680 + }, + { + "epoch": 0.6985100472600022, + "grad_norm": 0.1534094214439392, + "learning_rate": 0.00017207966517535, + "loss": 0.1204, + "step": 9681 + }, + { + "epoch": 0.6985821999350625, + "grad_norm": 0.18039028346538544, + "learning_rate": 0.00017207677875595326, + "loss": 0.1981, + "step": 9682 + }, + { + "epoch": 0.698654352610123, + "grad_norm": 0.12097857892513275, + "learning_rate": 0.0001720738923365565, + "loss": 0.1441, + "step": 9683 + }, + { + "epoch": 0.6987265052851834, + "grad_norm": 0.13416697084903717, + "learning_rate": 0.00017207100591715976, + "loss": 0.1491, + "step": 9684 + }, + { + "epoch": 0.6987986579602439, + "grad_norm": 0.17525966465473175, + "learning_rate": 0.00017206811949776302, + "loss": 0.1977, + "step": 9685 + }, + { + "epoch": 0.6988708106353043, + "grad_norm": 0.1118546798825264, + "learning_rate": 0.0001720652330783663, + "loss": 0.1563, + "step": 9686 + }, + { + "epoch": 0.6989429633103648, + "grad_norm": 0.1599108874797821, + "learning_rate": 0.00017206234665896958, + "loss": 0.2525, + "step": 9687 + }, + { + "epoch": 0.6990151159854252, + "grad_norm": 0.10533977299928665, + "learning_rate": 0.0001720594602395728, + "loss": 0.1827, + "step": 9688 + }, + { + "epoch": 0.6990872686604855, + "grad_norm": 0.1353236883878708, + "learning_rate": 0.00017205657382017608, + "loss": 0.1175, + "step": 9689 + }, + { + "epoch": 0.699159421335546, + "grad_norm": 0.21189481019973755, + "learning_rate": 0.00017205368740077934, + "loss": 0.1894, + "step": 9690 + }, + { + "epoch": 0.6992315740106064, + "grad_norm": 0.13929972052574158, + "learning_rate": 0.0001720508009813826, + "loss": 0.1996, + "step": 9691 + }, + { + "epoch": 0.6993037266856669, + "grad_norm": 0.1412792205810547, + "learning_rate": 0.00017204791456198586, + "loss": 0.1683, + "step": 9692 + }, + { + "epoch": 0.6993758793607273, + "grad_norm": 0.11568436771631241, + "learning_rate": 0.00017204502814258913, + "loss": 0.1659, + "step": 9693 + }, + { + "epoch": 0.6994480320357878, + "grad_norm": 0.11832127720117569, + "learning_rate": 0.0001720421417231924, + "loss": 0.1163, + "step": 9694 + }, + { + "epoch": 0.6995201847108482, + "grad_norm": 0.12044627219438553, + "learning_rate": 0.00017203925530379565, + "loss": 0.1622, + "step": 9695 + }, + { + "epoch": 0.6995923373859085, + "grad_norm": 0.11714223027229309, + "learning_rate": 0.00017203636888439892, + "loss": 0.1213, + "step": 9696 + }, + { + "epoch": 0.699664490060969, + "grad_norm": 0.11178726702928543, + "learning_rate": 0.00017203348246500218, + "loss": 0.1417, + "step": 9697 + }, + { + "epoch": 0.6997366427360294, + "grad_norm": 0.10355254262685776, + "learning_rate": 0.00017203059604560544, + "loss": 0.1369, + "step": 9698 + }, + { + "epoch": 0.6998087954110899, + "grad_norm": 0.15807823836803436, + "learning_rate": 0.00017202770962620868, + "loss": 0.1503, + "step": 9699 + }, + { + "epoch": 0.6998809480861503, + "grad_norm": 0.09935374557971954, + "learning_rate": 0.00017202482320681194, + "loss": 0.1474, + "step": 9700 + }, + { + "epoch": 0.6999531007612108, + "grad_norm": 0.10886482894420624, + "learning_rate": 0.00017202193678741523, + "loss": 0.1866, + "step": 9701 + }, + { + "epoch": 0.7000252534362712, + "grad_norm": 0.11364229768514633, + "learning_rate": 0.0001720190503680185, + "loss": 0.1196, + "step": 9702 + }, + { + "epoch": 0.7000974061113315, + "grad_norm": 0.12444400787353516, + "learning_rate": 0.00017201616394862176, + "loss": 0.1216, + "step": 9703 + }, + { + "epoch": 0.700169558786392, + "grad_norm": 0.14167913794517517, + "learning_rate": 0.000172013277529225, + "loss": 0.1237, + "step": 9704 + }, + { + "epoch": 0.7002417114614524, + "grad_norm": 0.12108553946018219, + "learning_rate": 0.00017201039110982825, + "loss": 0.1821, + "step": 9705 + }, + { + "epoch": 0.7003138641365129, + "grad_norm": 0.1302456557750702, + "learning_rate": 0.00017200750469043152, + "loss": 0.1074, + "step": 9706 + }, + { + "epoch": 0.7003860168115733, + "grad_norm": 0.13411375880241394, + "learning_rate": 0.00017200461827103478, + "loss": 0.1422, + "step": 9707 + }, + { + "epoch": 0.7004581694866338, + "grad_norm": 0.13024677336215973, + "learning_rate": 0.00017200173185163807, + "loss": 0.1648, + "step": 9708 + }, + { + "epoch": 0.7005303221616942, + "grad_norm": 0.12400952726602554, + "learning_rate": 0.0001719988454322413, + "loss": 0.1654, + "step": 9709 + }, + { + "epoch": 0.7006024748367545, + "grad_norm": 0.121695376932621, + "learning_rate": 0.00017199595901284457, + "loss": 0.1818, + "step": 9710 + }, + { + "epoch": 0.700674627511815, + "grad_norm": 0.1169712096452713, + "learning_rate": 0.00017199307259344783, + "loss": 0.1702, + "step": 9711 + }, + { + "epoch": 0.7007467801868754, + "grad_norm": 0.1264851689338684, + "learning_rate": 0.0001719901861740511, + "loss": 0.13, + "step": 9712 + }, + { + "epoch": 0.7008189328619359, + "grad_norm": 0.11403610557317734, + "learning_rate": 0.00017198729975465436, + "loss": 0.1638, + "step": 9713 + }, + { + "epoch": 0.7008910855369963, + "grad_norm": 0.12154081463813782, + "learning_rate": 0.00017198441333525762, + "loss": 0.1206, + "step": 9714 + }, + { + "epoch": 0.7009632382120567, + "grad_norm": 0.12071266770362854, + "learning_rate": 0.00017198152691586088, + "loss": 0.1472, + "step": 9715 + }, + { + "epoch": 0.7010353908871172, + "grad_norm": 0.12383043020963669, + "learning_rate": 0.00017197864049646415, + "loss": 0.1187, + "step": 9716 + }, + { + "epoch": 0.7011075435621775, + "grad_norm": 0.14367400109767914, + "learning_rate": 0.0001719757540770674, + "loss": 0.1238, + "step": 9717 + }, + { + "epoch": 0.701179696237238, + "grad_norm": 0.1460137665271759, + "learning_rate": 0.00017197286765767067, + "loss": 0.1454, + "step": 9718 + }, + { + "epoch": 0.7012518489122984, + "grad_norm": 0.10463149100542068, + "learning_rate": 0.00017196998123827394, + "loss": 0.1509, + "step": 9719 + }, + { + "epoch": 0.7013240015873589, + "grad_norm": 0.1418483406305313, + "learning_rate": 0.00017196709481887717, + "loss": 0.1841, + "step": 9720 + }, + { + "epoch": 0.7013961542624193, + "grad_norm": 0.14183053374290466, + "learning_rate": 0.00017196420839948043, + "loss": 0.2072, + "step": 9721 + }, + { + "epoch": 0.7014683069374797, + "grad_norm": 0.14547039568424225, + "learning_rate": 0.00017196132198008372, + "loss": 0.1647, + "step": 9722 + }, + { + "epoch": 0.7015404596125402, + "grad_norm": 0.11831801384687424, + "learning_rate": 0.000171958435560687, + "loss": 0.1648, + "step": 9723 + }, + { + "epoch": 0.7016126122876005, + "grad_norm": 0.15214595198631287, + "learning_rate": 0.00017195554914129025, + "loss": 0.1577, + "step": 9724 + }, + { + "epoch": 0.701684764962661, + "grad_norm": 0.1515510529279709, + "learning_rate": 0.00017195266272189349, + "loss": 0.1071, + "step": 9725 + }, + { + "epoch": 0.7017569176377214, + "grad_norm": 0.1443483531475067, + "learning_rate": 0.00017194977630249675, + "loss": 0.1358, + "step": 9726 + }, + { + "epoch": 0.7018290703127819, + "grad_norm": 0.11696092784404755, + "learning_rate": 0.0001719468898831, + "loss": 0.086, + "step": 9727 + }, + { + "epoch": 0.7019012229878423, + "grad_norm": 0.1589919775724411, + "learning_rate": 0.00017194400346370327, + "loss": 0.1566, + "step": 9728 + }, + { + "epoch": 0.7019733756629027, + "grad_norm": 0.1331232637166977, + "learning_rate": 0.00017194111704430656, + "loss": 0.1431, + "step": 9729 + }, + { + "epoch": 0.7020455283379631, + "grad_norm": 0.12140674144029617, + "learning_rate": 0.0001719382306249098, + "loss": 0.1458, + "step": 9730 + }, + { + "epoch": 0.7021176810130235, + "grad_norm": 0.16071046888828278, + "learning_rate": 0.00017193534420551306, + "loss": 0.1976, + "step": 9731 + }, + { + "epoch": 0.702189833688084, + "grad_norm": 0.11735766381025314, + "learning_rate": 0.00017193245778611633, + "loss": 0.1283, + "step": 9732 + }, + { + "epoch": 0.7022619863631444, + "grad_norm": 0.12011319398880005, + "learning_rate": 0.0001719295713667196, + "loss": 0.1559, + "step": 9733 + }, + { + "epoch": 0.7023341390382049, + "grad_norm": 0.10947341471910477, + "learning_rate": 0.00017192668494732285, + "loss": 0.1376, + "step": 9734 + }, + { + "epoch": 0.7024062917132653, + "grad_norm": 0.1267535537481308, + "learning_rate": 0.00017192379852792612, + "loss": 0.1407, + "step": 9735 + }, + { + "epoch": 0.7024784443883257, + "grad_norm": 0.10268282145261765, + "learning_rate": 0.00017192091210852938, + "loss": 0.1855, + "step": 9736 + }, + { + "epoch": 0.7025505970633861, + "grad_norm": 0.12966306507587433, + "learning_rate": 0.00017191802568913264, + "loss": 0.1203, + "step": 9737 + }, + { + "epoch": 0.7026227497384465, + "grad_norm": 0.12109366059303284, + "learning_rate": 0.0001719151392697359, + "loss": 0.1425, + "step": 9738 + }, + { + "epoch": 0.702694902413507, + "grad_norm": 0.11503426730632782, + "learning_rate": 0.00017191225285033917, + "loss": 0.1411, + "step": 9739 + }, + { + "epoch": 0.7027670550885674, + "grad_norm": 0.12528328597545624, + "learning_rate": 0.00017190936643094243, + "loss": 0.1367, + "step": 9740 + }, + { + "epoch": 0.7028392077636278, + "grad_norm": 0.11384246498346329, + "learning_rate": 0.00017190648001154567, + "loss": 0.1325, + "step": 9741 + }, + { + "epoch": 0.7029113604386883, + "grad_norm": 0.1350889950990677, + "learning_rate": 0.00017190359359214893, + "loss": 0.1155, + "step": 9742 + }, + { + "epoch": 0.7029835131137487, + "grad_norm": 0.13169564306735992, + "learning_rate": 0.00017190070717275222, + "loss": 0.1646, + "step": 9743 + }, + { + "epoch": 0.7030556657888091, + "grad_norm": 0.12558674812316895, + "learning_rate": 0.00017189782075335548, + "loss": 0.1683, + "step": 9744 + }, + { + "epoch": 0.7031278184638695, + "grad_norm": 0.12045971304178238, + "learning_rate": 0.00017189493433395874, + "loss": 0.1713, + "step": 9745 + }, + { + "epoch": 0.70319997113893, + "grad_norm": 0.10757529735565186, + "learning_rate": 0.00017189204791456198, + "loss": 0.138, + "step": 9746 + }, + { + "epoch": 0.7032721238139904, + "grad_norm": 0.11273236572742462, + "learning_rate": 0.00017188916149516524, + "loss": 0.0946, + "step": 9747 + }, + { + "epoch": 0.7033442764890508, + "grad_norm": 0.100577212870121, + "learning_rate": 0.0001718862750757685, + "loss": 0.1557, + "step": 9748 + }, + { + "epoch": 0.7034164291641113, + "grad_norm": 0.13702300190925598, + "learning_rate": 0.00017188338865637177, + "loss": 0.149, + "step": 9749 + }, + { + "epoch": 0.7034885818391717, + "grad_norm": 0.15205080807209015, + "learning_rate": 0.00017188050223697506, + "loss": 0.1669, + "step": 9750 + }, + { + "epoch": 0.7035607345142321, + "grad_norm": 0.10914731025695801, + "learning_rate": 0.0001718776158175783, + "loss": 0.1311, + "step": 9751 + }, + { + "epoch": 0.7036328871892925, + "grad_norm": 0.11806619167327881, + "learning_rate": 0.00017187472939818156, + "loss": 0.1528, + "step": 9752 + }, + { + "epoch": 0.703705039864353, + "grad_norm": 0.12875840067863464, + "learning_rate": 0.00017187184297878482, + "loss": 0.1028, + "step": 9753 + }, + { + "epoch": 0.7037771925394134, + "grad_norm": 0.12242988497018814, + "learning_rate": 0.00017186895655938808, + "loss": 0.1797, + "step": 9754 + }, + { + "epoch": 0.7038493452144738, + "grad_norm": 0.15106521546840668, + "learning_rate": 0.00017186607013999135, + "loss": 0.1605, + "step": 9755 + }, + { + "epoch": 0.7039214978895343, + "grad_norm": 0.1285814642906189, + "learning_rate": 0.0001718631837205946, + "loss": 0.1856, + "step": 9756 + }, + { + "epoch": 0.7039936505645947, + "grad_norm": 0.12011777609586716, + "learning_rate": 0.00017186029730119787, + "loss": 0.135, + "step": 9757 + }, + { + "epoch": 0.7040658032396551, + "grad_norm": 0.13019108772277832, + "learning_rate": 0.00017185741088180114, + "loss": 0.1589, + "step": 9758 + }, + { + "epoch": 0.7041379559147155, + "grad_norm": 0.12479769438505173, + "learning_rate": 0.0001718545244624044, + "loss": 0.1187, + "step": 9759 + }, + { + "epoch": 0.704210108589776, + "grad_norm": 0.30191943049430847, + "learning_rate": 0.00017185163804300766, + "loss": 0.1201, + "step": 9760 + }, + { + "epoch": 0.7042822612648364, + "grad_norm": 0.12107078731060028, + "learning_rate": 0.00017184875162361092, + "loss": 0.1464, + "step": 9761 + }, + { + "epoch": 0.7043544139398968, + "grad_norm": 0.1304750144481659, + "learning_rate": 0.0001718458652042142, + "loss": 0.1765, + "step": 9762 + }, + { + "epoch": 0.7044265666149573, + "grad_norm": 0.11234626173973083, + "learning_rate": 0.00017184297878481742, + "loss": 0.1156, + "step": 9763 + }, + { + "epoch": 0.7044987192900177, + "grad_norm": 0.11644095182418823, + "learning_rate": 0.0001718400923654207, + "loss": 0.124, + "step": 9764 + }, + { + "epoch": 0.7045708719650781, + "grad_norm": 0.14146173000335693, + "learning_rate": 0.00017183720594602398, + "loss": 0.1258, + "step": 9765 + }, + { + "epoch": 0.7046430246401385, + "grad_norm": 0.13076259195804596, + "learning_rate": 0.00017183431952662724, + "loss": 0.1368, + "step": 9766 + }, + { + "epoch": 0.704715177315199, + "grad_norm": 0.1335371434688568, + "learning_rate": 0.0001718314331072305, + "loss": 0.162, + "step": 9767 + }, + { + "epoch": 0.7047873299902594, + "grad_norm": 0.15889213979244232, + "learning_rate": 0.00017182854668783374, + "loss": 0.1524, + "step": 9768 + }, + { + "epoch": 0.7048594826653198, + "grad_norm": 0.11813731491565704, + "learning_rate": 0.000171825660268437, + "loss": 0.1364, + "step": 9769 + }, + { + "epoch": 0.7049316353403803, + "grad_norm": 0.11796697974205017, + "learning_rate": 0.00017182277384904026, + "loss": 0.1635, + "step": 9770 + }, + { + "epoch": 0.7050037880154407, + "grad_norm": 0.13033068180084229, + "learning_rate": 0.00017181988742964355, + "loss": 0.1523, + "step": 9771 + }, + { + "epoch": 0.7050759406905011, + "grad_norm": 0.10769747197628021, + "learning_rate": 0.00017181700101024682, + "loss": 0.1449, + "step": 9772 + }, + { + "epoch": 0.7051480933655615, + "grad_norm": 0.1245947927236557, + "learning_rate": 0.00017181411459085005, + "loss": 0.1393, + "step": 9773 + }, + { + "epoch": 0.705220246040622, + "grad_norm": 0.11615555733442307, + "learning_rate": 0.00017181122817145331, + "loss": 0.2104, + "step": 9774 + }, + { + "epoch": 0.7052923987156824, + "grad_norm": 0.1361609548330307, + "learning_rate": 0.00017180834175205658, + "loss": 0.1473, + "step": 9775 + }, + { + "epoch": 0.7053645513907428, + "grad_norm": 0.11919081211090088, + "learning_rate": 0.00017180545533265984, + "loss": 0.1078, + "step": 9776 + }, + { + "epoch": 0.7054367040658033, + "grad_norm": 0.11736463755369186, + "learning_rate": 0.0001718025689132631, + "loss": 0.1614, + "step": 9777 + }, + { + "epoch": 0.7055088567408637, + "grad_norm": 0.09773008525371552, + "learning_rate": 0.00017179968249386637, + "loss": 0.146, + "step": 9778 + }, + { + "epoch": 0.705581009415924, + "grad_norm": 0.15151232481002808, + "learning_rate": 0.00017179679607446963, + "loss": 0.1969, + "step": 9779 + }, + { + "epoch": 0.7056531620909845, + "grad_norm": 0.12831969559192657, + "learning_rate": 0.0001717939096550729, + "loss": 0.1403, + "step": 9780 + }, + { + "epoch": 0.7057253147660449, + "grad_norm": 0.11678869277238846, + "learning_rate": 0.00017179102323567616, + "loss": 0.1455, + "step": 9781 + }, + { + "epoch": 0.7057974674411054, + "grad_norm": 0.10794802755117416, + "learning_rate": 0.00017178813681627942, + "loss": 0.1364, + "step": 9782 + }, + { + "epoch": 0.7058696201161658, + "grad_norm": 0.15175952017307281, + "learning_rate": 0.00017178525039688268, + "loss": 0.1973, + "step": 9783 + }, + { + "epoch": 0.7059417727912263, + "grad_norm": 0.13190539181232452, + "learning_rate": 0.00017178236397748592, + "loss": 0.1294, + "step": 9784 + }, + { + "epoch": 0.7060139254662866, + "grad_norm": 0.13258545100688934, + "learning_rate": 0.0001717794775580892, + "loss": 0.1977, + "step": 9785 + }, + { + "epoch": 0.706086078141347, + "grad_norm": 0.11998707056045532, + "learning_rate": 0.00017177659113869247, + "loss": 0.1372, + "step": 9786 + }, + { + "epoch": 0.7061582308164075, + "grad_norm": 0.1670992523431778, + "learning_rate": 0.00017177370471929573, + "loss": 0.1864, + "step": 9787 + }, + { + "epoch": 0.7062303834914679, + "grad_norm": 0.14586926996707916, + "learning_rate": 0.000171770818299899, + "loss": 0.1557, + "step": 9788 + }, + { + "epoch": 0.7063025361665284, + "grad_norm": 0.12285439670085907, + "learning_rate": 0.00017176793188050223, + "loss": 0.157, + "step": 9789 + }, + { + "epoch": 0.7063746888415888, + "grad_norm": 0.10460318624973297, + "learning_rate": 0.0001717650454611055, + "loss": 0.1386, + "step": 9790 + }, + { + "epoch": 0.7064468415166493, + "grad_norm": 0.14111445844173431, + "learning_rate": 0.00017176215904170876, + "loss": 0.1299, + "step": 9791 + }, + { + "epoch": 0.7065189941917096, + "grad_norm": 0.1285100281238556, + "learning_rate": 0.00017175927262231205, + "loss": 0.153, + "step": 9792 + }, + { + "epoch": 0.70659114686677, + "grad_norm": 0.1267320066690445, + "learning_rate": 0.0001717563862029153, + "loss": 0.1916, + "step": 9793 + }, + { + "epoch": 0.7066632995418305, + "grad_norm": 0.14147202670574188, + "learning_rate": 0.00017175349978351855, + "loss": 0.1354, + "step": 9794 + }, + { + "epoch": 0.7067354522168909, + "grad_norm": 0.15187636017799377, + "learning_rate": 0.0001717506133641218, + "loss": 0.1668, + "step": 9795 + }, + { + "epoch": 0.7068076048919514, + "grad_norm": 0.1322856992483139, + "learning_rate": 0.00017174772694472507, + "loss": 0.2105, + "step": 9796 + }, + { + "epoch": 0.7068797575670118, + "grad_norm": 0.11827823519706726, + "learning_rate": 0.00017174484052532834, + "loss": 0.1454, + "step": 9797 + }, + { + "epoch": 0.7069519102420723, + "grad_norm": 0.13164323568344116, + "learning_rate": 0.0001717419541059316, + "loss": 0.1443, + "step": 9798 + }, + { + "epoch": 0.7070240629171326, + "grad_norm": 0.1065685823559761, + "learning_rate": 0.00017173906768653486, + "loss": 0.1235, + "step": 9799 + }, + { + "epoch": 0.707096215592193, + "grad_norm": 0.6572790741920471, + "learning_rate": 0.00017173618126713812, + "loss": 0.1534, + "step": 9800 + }, + { + "epoch": 0.7071683682672535, + "grad_norm": 0.12067247182130814, + "learning_rate": 0.0001717332948477414, + "loss": 0.185, + "step": 9801 + }, + { + "epoch": 0.7072405209423139, + "grad_norm": 0.10997969657182693, + "learning_rate": 0.00017173040842834465, + "loss": 0.1665, + "step": 9802 + }, + { + "epoch": 0.7073126736173744, + "grad_norm": 0.1059444323182106, + "learning_rate": 0.0001717275220089479, + "loss": 0.1168, + "step": 9803 + }, + { + "epoch": 0.7073848262924348, + "grad_norm": 0.10876884311437607, + "learning_rate": 0.00017172463558955118, + "loss": 0.1466, + "step": 9804 + }, + { + "epoch": 0.7074569789674953, + "grad_norm": 0.11825942248106003, + "learning_rate": 0.0001717217491701544, + "loss": 0.1516, + "step": 9805 + }, + { + "epoch": 0.7075291316425556, + "grad_norm": 0.1341393142938614, + "learning_rate": 0.0001717188627507577, + "loss": 0.1745, + "step": 9806 + }, + { + "epoch": 0.707601284317616, + "grad_norm": 0.10476581007242203, + "learning_rate": 0.00017171597633136096, + "loss": 0.1466, + "step": 9807 + }, + { + "epoch": 0.7076734369926765, + "grad_norm": 0.11815085262060165, + "learning_rate": 0.00017171308991196423, + "loss": 0.1593, + "step": 9808 + }, + { + "epoch": 0.7077455896677369, + "grad_norm": 0.12892809510231018, + "learning_rate": 0.0001717102034925675, + "loss": 0.1365, + "step": 9809 + }, + { + "epoch": 0.7078177423427974, + "grad_norm": 0.13001443445682526, + "learning_rate": 0.00017170731707317073, + "loss": 0.1805, + "step": 9810 + }, + { + "epoch": 0.7078898950178578, + "grad_norm": 0.1401710957288742, + "learning_rate": 0.000171704430653774, + "loss": 0.1604, + "step": 9811 + }, + { + "epoch": 0.7079620476929183, + "grad_norm": 0.1300428807735443, + "learning_rate": 0.00017170154423437725, + "loss": 0.1425, + "step": 9812 + }, + { + "epoch": 0.7080342003679786, + "grad_norm": 0.11680950224399567, + "learning_rate": 0.00017169865781498054, + "loss": 0.1626, + "step": 9813 + }, + { + "epoch": 0.708106353043039, + "grad_norm": 0.12791261076927185, + "learning_rate": 0.0001716957713955838, + "loss": 0.1756, + "step": 9814 + }, + { + "epoch": 0.7081785057180995, + "grad_norm": 0.1518712341785431, + "learning_rate": 0.00017169288497618704, + "loss": 0.1706, + "step": 9815 + }, + { + "epoch": 0.7082506583931599, + "grad_norm": 0.12080371379852295, + "learning_rate": 0.0001716899985567903, + "loss": 0.1604, + "step": 9816 + }, + { + "epoch": 0.7083228110682204, + "grad_norm": 0.13024398684501648, + "learning_rate": 0.00017168711213739357, + "loss": 0.1538, + "step": 9817 + }, + { + "epoch": 0.7083949637432808, + "grad_norm": 0.14617067575454712, + "learning_rate": 0.00017168422571799683, + "loss": 0.1482, + "step": 9818 + }, + { + "epoch": 0.7084671164183413, + "grad_norm": 0.11080318689346313, + "learning_rate": 0.0001716813392986001, + "loss": 0.1428, + "step": 9819 + }, + { + "epoch": 0.7085392690934016, + "grad_norm": 0.13892851769924164, + "learning_rate": 0.00017167845287920336, + "loss": 0.1448, + "step": 9820 + }, + { + "epoch": 0.708611421768462, + "grad_norm": 0.18297190964221954, + "learning_rate": 0.00017167556645980662, + "loss": 0.1453, + "step": 9821 + }, + { + "epoch": 0.7086835744435225, + "grad_norm": 0.10961407423019409, + "learning_rate": 0.00017167268004040988, + "loss": 0.1476, + "step": 9822 + }, + { + "epoch": 0.7087557271185829, + "grad_norm": 0.15383893251419067, + "learning_rate": 0.00017166979362101314, + "loss": 0.1899, + "step": 9823 + }, + { + "epoch": 0.7088278797936434, + "grad_norm": 0.11609875410795212, + "learning_rate": 0.0001716669072016164, + "loss": 0.124, + "step": 9824 + }, + { + "epoch": 0.7089000324687038, + "grad_norm": 0.1328735500574112, + "learning_rate": 0.00017166402078221967, + "loss": 0.1486, + "step": 9825 + }, + { + "epoch": 0.7089721851437643, + "grad_norm": 0.1313145011663437, + "learning_rate": 0.0001716611343628229, + "loss": 0.159, + "step": 9826 + }, + { + "epoch": 0.7090443378188246, + "grad_norm": 0.11288633942604065, + "learning_rate": 0.0001716582479434262, + "loss": 0.1553, + "step": 9827 + }, + { + "epoch": 0.709116490493885, + "grad_norm": 0.14243653416633606, + "learning_rate": 0.00017165536152402946, + "loss": 0.1693, + "step": 9828 + }, + { + "epoch": 0.7091886431689455, + "grad_norm": 0.12032989412546158, + "learning_rate": 0.00017165247510463272, + "loss": 0.1274, + "step": 9829 + }, + { + "epoch": 0.7092607958440059, + "grad_norm": 0.13511602580547333, + "learning_rate": 0.00017164958868523598, + "loss": 0.1661, + "step": 9830 + }, + { + "epoch": 0.7093329485190664, + "grad_norm": 0.1368311047554016, + "learning_rate": 0.00017164670226583922, + "loss": 0.1767, + "step": 9831 + }, + { + "epoch": 0.7094051011941268, + "grad_norm": 0.10124894976615906, + "learning_rate": 0.00017164381584644248, + "loss": 0.1765, + "step": 9832 + }, + { + "epoch": 0.7094772538691873, + "grad_norm": 0.11749887466430664, + "learning_rate": 0.00017164092942704575, + "loss": 0.1343, + "step": 9833 + }, + { + "epoch": 0.7095494065442476, + "grad_norm": 0.12168819457292557, + "learning_rate": 0.00017163804300764904, + "loss": 0.1573, + "step": 9834 + }, + { + "epoch": 0.709621559219308, + "grad_norm": 0.11898388713598251, + "learning_rate": 0.0001716351565882523, + "loss": 0.1558, + "step": 9835 + }, + { + "epoch": 0.7096937118943685, + "grad_norm": 0.1410788744688034, + "learning_rate": 0.00017163227016885553, + "loss": 0.1722, + "step": 9836 + }, + { + "epoch": 0.7097658645694289, + "grad_norm": 0.10520344972610474, + "learning_rate": 0.0001716293837494588, + "loss": 0.0989, + "step": 9837 + }, + { + "epoch": 0.7098380172444894, + "grad_norm": 0.1149938628077507, + "learning_rate": 0.00017162649733006206, + "loss": 0.1037, + "step": 9838 + }, + { + "epoch": 0.7099101699195498, + "grad_norm": 0.15595239400863647, + "learning_rate": 0.00017162361091066532, + "loss": 0.1389, + "step": 9839 + }, + { + "epoch": 0.7099823225946102, + "grad_norm": 0.12930183112621307, + "learning_rate": 0.0001716207244912686, + "loss": 0.0971, + "step": 9840 + }, + { + "epoch": 0.7100544752696706, + "grad_norm": 0.11761047691106796, + "learning_rate": 0.00017161783807187185, + "loss": 0.1513, + "step": 9841 + }, + { + "epoch": 0.710126627944731, + "grad_norm": 0.18295948207378387, + "learning_rate": 0.0001716149516524751, + "loss": 0.1786, + "step": 9842 + }, + { + "epoch": 0.7101987806197915, + "grad_norm": 0.15927353501319885, + "learning_rate": 0.00017161206523307838, + "loss": 0.1635, + "step": 9843 + }, + { + "epoch": 0.7102709332948519, + "grad_norm": 0.14733755588531494, + "learning_rate": 0.00017160917881368164, + "loss": 0.1586, + "step": 9844 + }, + { + "epoch": 0.7103430859699124, + "grad_norm": 0.11530192941427231, + "learning_rate": 0.0001716062923942849, + "loss": 0.1309, + "step": 9845 + }, + { + "epoch": 0.7104152386449728, + "grad_norm": 0.5042710304260254, + "learning_rate": 0.00017160340597488816, + "loss": 0.1704, + "step": 9846 + }, + { + "epoch": 0.7104873913200331, + "grad_norm": 0.13949906826019287, + "learning_rate": 0.0001716005195554914, + "loss": 0.1263, + "step": 9847 + }, + { + "epoch": 0.7105595439950936, + "grad_norm": 0.10125601291656494, + "learning_rate": 0.0001715976331360947, + "loss": 0.1121, + "step": 9848 + }, + { + "epoch": 0.710631696670154, + "grad_norm": 0.11898714303970337, + "learning_rate": 0.00017159474671669795, + "loss": 0.1455, + "step": 9849 + }, + { + "epoch": 0.7107038493452145, + "grad_norm": 0.13794389367103577, + "learning_rate": 0.00017159186029730122, + "loss": 0.1873, + "step": 9850 + }, + { + "epoch": 0.7107760020202749, + "grad_norm": 0.11388465762138367, + "learning_rate": 0.00017158897387790448, + "loss": 0.1974, + "step": 9851 + }, + { + "epoch": 0.7108481546953354, + "grad_norm": 0.169645294547081, + "learning_rate": 0.00017158608745850771, + "loss": 0.1842, + "step": 9852 + }, + { + "epoch": 0.7109203073703958, + "grad_norm": 0.11859288066625595, + "learning_rate": 0.00017158320103911098, + "loss": 0.1438, + "step": 9853 + }, + { + "epoch": 0.7109924600454561, + "grad_norm": 0.11447511613368988, + "learning_rate": 0.00017158031461971424, + "loss": 0.1258, + "step": 9854 + }, + { + "epoch": 0.7110646127205166, + "grad_norm": 0.1500857025384903, + "learning_rate": 0.00017157742820031753, + "loss": 0.1707, + "step": 9855 + }, + { + "epoch": 0.711136765395577, + "grad_norm": 0.1037248969078064, + "learning_rate": 0.0001715745417809208, + "loss": 0.1653, + "step": 9856 + }, + { + "epoch": 0.7112089180706375, + "grad_norm": 0.14994950592517853, + "learning_rate": 0.00017157165536152403, + "loss": 0.1192, + "step": 9857 + }, + { + "epoch": 0.7112810707456979, + "grad_norm": 0.11467479914426804, + "learning_rate": 0.0001715687689421273, + "loss": 0.1513, + "step": 9858 + }, + { + "epoch": 0.7113532234207584, + "grad_norm": 0.11612613499164581, + "learning_rate": 0.00017156588252273055, + "loss": 0.1455, + "step": 9859 + }, + { + "epoch": 0.7114253760958188, + "grad_norm": 0.14025089144706726, + "learning_rate": 0.00017156299610333382, + "loss": 0.1282, + "step": 9860 + }, + { + "epoch": 0.7114975287708791, + "grad_norm": 0.10642143338918686, + "learning_rate": 0.00017156010968393708, + "loss": 0.175, + "step": 9861 + }, + { + "epoch": 0.7115696814459396, + "grad_norm": 0.1332758218050003, + "learning_rate": 0.00017155722326454034, + "loss": 0.1791, + "step": 9862 + }, + { + "epoch": 0.711641834121, + "grad_norm": 0.1146327331662178, + "learning_rate": 0.0001715543368451436, + "loss": 0.1294, + "step": 9863 + }, + { + "epoch": 0.7117139867960605, + "grad_norm": 0.1492261290550232, + "learning_rate": 0.00017155145042574687, + "loss": 0.152, + "step": 9864 + }, + { + "epoch": 0.7117861394711209, + "grad_norm": 0.14946997165679932, + "learning_rate": 0.00017154856400635013, + "loss": 0.1694, + "step": 9865 + }, + { + "epoch": 0.7118582921461813, + "grad_norm": 0.14650723338127136, + "learning_rate": 0.0001715456775869534, + "loss": 0.1241, + "step": 9866 + }, + { + "epoch": 0.7119304448212418, + "grad_norm": 0.12067387253046036, + "learning_rate": 0.00017154279116755666, + "loss": 0.1527, + "step": 9867 + }, + { + "epoch": 0.7120025974963021, + "grad_norm": 0.12674711644649506, + "learning_rate": 0.0001715399047481599, + "loss": 0.1037, + "step": 9868 + }, + { + "epoch": 0.7120747501713626, + "grad_norm": 0.12338986992835999, + "learning_rate": 0.00017153701832876318, + "loss": 0.1904, + "step": 9869 + }, + { + "epoch": 0.712146902846423, + "grad_norm": 0.13921062648296356, + "learning_rate": 0.00017153413190936645, + "loss": 0.1651, + "step": 9870 + }, + { + "epoch": 0.7122190555214835, + "grad_norm": 0.1583152711391449, + "learning_rate": 0.0001715312454899697, + "loss": 0.1568, + "step": 9871 + }, + { + "epoch": 0.7122912081965439, + "grad_norm": 0.11268026381731033, + "learning_rate": 0.00017152835907057297, + "loss": 0.1469, + "step": 9872 + }, + { + "epoch": 0.7123633608716043, + "grad_norm": 0.14031006395816803, + "learning_rate": 0.00017152547265117624, + "loss": 0.207, + "step": 9873 + }, + { + "epoch": 0.7124355135466648, + "grad_norm": 0.13599331676959991, + "learning_rate": 0.00017152258623177947, + "loss": 0.1778, + "step": 9874 + }, + { + "epoch": 0.7125076662217251, + "grad_norm": 0.14373207092285156, + "learning_rate": 0.00017151969981238273, + "loss": 0.1517, + "step": 9875 + }, + { + "epoch": 0.7125798188967856, + "grad_norm": 0.14559577405452728, + "learning_rate": 0.00017151681339298602, + "loss": 0.1573, + "step": 9876 + }, + { + "epoch": 0.712651971571846, + "grad_norm": 0.13349908590316772, + "learning_rate": 0.0001715139269735893, + "loss": 0.1683, + "step": 9877 + }, + { + "epoch": 0.7127241242469065, + "grad_norm": 0.13450081646442413, + "learning_rate": 0.00017151104055419255, + "loss": 0.2079, + "step": 9878 + }, + { + "epoch": 0.7127962769219669, + "grad_norm": 0.1484353095293045, + "learning_rate": 0.00017150815413479579, + "loss": 0.1633, + "step": 9879 + }, + { + "epoch": 0.7128684295970273, + "grad_norm": 0.12456122785806656, + "learning_rate": 0.00017150526771539905, + "loss": 0.1789, + "step": 9880 + }, + { + "epoch": 0.7129405822720878, + "grad_norm": 0.11555665731430054, + "learning_rate": 0.0001715023812960023, + "loss": 0.1746, + "step": 9881 + }, + { + "epoch": 0.7130127349471481, + "grad_norm": 0.1268804520368576, + "learning_rate": 0.00017149949487660557, + "loss": 0.1736, + "step": 9882 + }, + { + "epoch": 0.7130848876222086, + "grad_norm": 0.10452228039503098, + "learning_rate": 0.00017149660845720886, + "loss": 0.1, + "step": 9883 + }, + { + "epoch": 0.713157040297269, + "grad_norm": 0.10974579304456711, + "learning_rate": 0.0001714937220378121, + "loss": 0.154, + "step": 9884 + }, + { + "epoch": 0.7132291929723295, + "grad_norm": 0.09949525445699692, + "learning_rate": 0.00017149083561841536, + "loss": 0.1743, + "step": 9885 + }, + { + "epoch": 0.7133013456473899, + "grad_norm": 0.13148930668830872, + "learning_rate": 0.00017148794919901863, + "loss": 0.1599, + "step": 9886 + }, + { + "epoch": 0.7133734983224503, + "grad_norm": 0.14373208582401276, + "learning_rate": 0.0001714850627796219, + "loss": 0.1374, + "step": 9887 + }, + { + "epoch": 0.7134456509975108, + "grad_norm": 0.11444774270057678, + "learning_rate": 0.00017148217636022515, + "loss": 0.1616, + "step": 9888 + }, + { + "epoch": 0.7135178036725711, + "grad_norm": 0.17785054445266724, + "learning_rate": 0.00017147928994082842, + "loss": 0.1814, + "step": 9889 + }, + { + "epoch": 0.7135899563476316, + "grad_norm": 0.11420541256666183, + "learning_rate": 0.00017147640352143168, + "loss": 0.14, + "step": 9890 + }, + { + "epoch": 0.713662109022692, + "grad_norm": 0.12373317778110504, + "learning_rate": 0.00017147351710203494, + "loss": 0.1239, + "step": 9891 + }, + { + "epoch": 0.7137342616977524, + "grad_norm": 0.1315503716468811, + "learning_rate": 0.0001714706306826382, + "loss": 0.1345, + "step": 9892 + }, + { + "epoch": 0.7138064143728129, + "grad_norm": 0.13126040995121002, + "learning_rate": 0.00017146774426324147, + "loss": 0.1735, + "step": 9893 + }, + { + "epoch": 0.7138785670478733, + "grad_norm": 0.11406406760215759, + "learning_rate": 0.00017146485784384473, + "loss": 0.1266, + "step": 9894 + }, + { + "epoch": 0.7139507197229338, + "grad_norm": 0.12042064964771271, + "learning_rate": 0.00017146197142444797, + "loss": 0.1657, + "step": 9895 + }, + { + "epoch": 0.7140228723979941, + "grad_norm": 0.13797765970230103, + "learning_rate": 0.00017145908500505123, + "loss": 0.1272, + "step": 9896 + }, + { + "epoch": 0.7140950250730546, + "grad_norm": 0.12352259457111359, + "learning_rate": 0.0001714561985856545, + "loss": 0.1665, + "step": 9897 + }, + { + "epoch": 0.714167177748115, + "grad_norm": 0.13089880347251892, + "learning_rate": 0.00017145331216625778, + "loss": 0.1745, + "step": 9898 + }, + { + "epoch": 0.7142393304231754, + "grad_norm": 0.13658390939235687, + "learning_rate": 0.00017145042574686104, + "loss": 0.1538, + "step": 9899 + }, + { + "epoch": 0.7143114830982359, + "grad_norm": 0.13549038767814636, + "learning_rate": 0.00017144753932746428, + "loss": 0.1206, + "step": 9900 + }, + { + "epoch": 0.7143836357732963, + "grad_norm": 0.13650107383728027, + "learning_rate": 0.00017144465290806754, + "loss": 0.1378, + "step": 9901 + }, + { + "epoch": 0.7144557884483568, + "grad_norm": 0.12035445123910904, + "learning_rate": 0.0001714417664886708, + "loss": 0.1522, + "step": 9902 + }, + { + "epoch": 0.7145279411234171, + "grad_norm": 0.15744692087173462, + "learning_rate": 0.00017143888006927407, + "loss": 0.1576, + "step": 9903 + }, + { + "epoch": 0.7146000937984776, + "grad_norm": 0.14461341500282288, + "learning_rate": 0.00017143599364987733, + "loss": 0.2089, + "step": 9904 + }, + { + "epoch": 0.714672246473538, + "grad_norm": 0.15803925693035126, + "learning_rate": 0.0001714331072304806, + "loss": 0.2166, + "step": 9905 + }, + { + "epoch": 0.7147443991485984, + "grad_norm": 0.12885110080242157, + "learning_rate": 0.00017143022081108386, + "loss": 0.1752, + "step": 9906 + }, + { + "epoch": 0.7148165518236589, + "grad_norm": 0.1436045616865158, + "learning_rate": 0.00017142733439168712, + "loss": 0.2196, + "step": 9907 + }, + { + "epoch": 0.7148887044987193, + "grad_norm": 0.10944678634405136, + "learning_rate": 0.00017142444797229038, + "loss": 0.1131, + "step": 9908 + }, + { + "epoch": 0.7149608571737797, + "grad_norm": 0.12081919610500336, + "learning_rate": 0.00017142156155289365, + "loss": 0.1596, + "step": 9909 + }, + { + "epoch": 0.7150330098488401, + "grad_norm": 0.10796798765659332, + "learning_rate": 0.0001714186751334969, + "loss": 0.1817, + "step": 9910 + }, + { + "epoch": 0.7151051625239006, + "grad_norm": 0.12811782956123352, + "learning_rate": 0.00017141578871410015, + "loss": 0.1859, + "step": 9911 + }, + { + "epoch": 0.715177315198961, + "grad_norm": 0.12989024817943573, + "learning_rate": 0.00017141290229470344, + "loss": 0.172, + "step": 9912 + }, + { + "epoch": 0.7152494678740214, + "grad_norm": 0.12131687253713608, + "learning_rate": 0.0001714100158753067, + "loss": 0.1459, + "step": 9913 + }, + { + "epoch": 0.7153216205490819, + "grad_norm": 0.12954425811767578, + "learning_rate": 0.00017140712945590996, + "loss": 0.1669, + "step": 9914 + }, + { + "epoch": 0.7153937732241423, + "grad_norm": 0.09515898674726486, + "learning_rate": 0.00017140424303651322, + "loss": 0.1473, + "step": 9915 + }, + { + "epoch": 0.7154659258992027, + "grad_norm": 0.14139306545257568, + "learning_rate": 0.00017140135661711646, + "loss": 0.1211, + "step": 9916 + }, + { + "epoch": 0.7155380785742631, + "grad_norm": 0.118651382625103, + "learning_rate": 0.00017139847019771972, + "loss": 0.1129, + "step": 9917 + }, + { + "epoch": 0.7156102312493235, + "grad_norm": 0.11456239968538284, + "learning_rate": 0.00017139558377832299, + "loss": 0.1543, + "step": 9918 + }, + { + "epoch": 0.715682383924384, + "grad_norm": 0.11746779084205627, + "learning_rate": 0.00017139269735892628, + "loss": 0.1007, + "step": 9919 + }, + { + "epoch": 0.7157545365994444, + "grad_norm": 0.12484867870807648, + "learning_rate": 0.00017138981093952954, + "loss": 0.1402, + "step": 9920 + }, + { + "epoch": 0.7158266892745049, + "grad_norm": 0.12649855017662048, + "learning_rate": 0.00017138692452013277, + "loss": 0.1495, + "step": 9921 + }, + { + "epoch": 0.7158988419495653, + "grad_norm": 0.12505637109279633, + "learning_rate": 0.00017138403810073604, + "loss": 0.1251, + "step": 9922 + }, + { + "epoch": 0.7159709946246257, + "grad_norm": 0.15270036458969116, + "learning_rate": 0.0001713811516813393, + "loss": 0.1469, + "step": 9923 + }, + { + "epoch": 0.7160431472996861, + "grad_norm": 0.14083799719810486, + "learning_rate": 0.00017137826526194256, + "loss": 0.1705, + "step": 9924 + }, + { + "epoch": 0.7161152999747465, + "grad_norm": 0.1292945295572281, + "learning_rate": 0.00017137537884254583, + "loss": 0.1094, + "step": 9925 + }, + { + "epoch": 0.716187452649807, + "grad_norm": 0.11223212629556656, + "learning_rate": 0.0001713724924231491, + "loss": 0.1134, + "step": 9926 + }, + { + "epoch": 0.7162596053248674, + "grad_norm": 0.11963280290365219, + "learning_rate": 0.00017136960600375235, + "loss": 0.1646, + "step": 9927 + }, + { + "epoch": 0.7163317579999279, + "grad_norm": 0.11987181752920151, + "learning_rate": 0.00017136671958435562, + "loss": 0.1701, + "step": 9928 + }, + { + "epoch": 0.7164039106749883, + "grad_norm": 0.1538725346326828, + "learning_rate": 0.00017136383316495888, + "loss": 0.1631, + "step": 9929 + }, + { + "epoch": 0.7164760633500487, + "grad_norm": 0.11311746388673782, + "learning_rate": 0.00017136094674556214, + "loss": 0.1808, + "step": 9930 + }, + { + "epoch": 0.7165482160251091, + "grad_norm": 0.16025066375732422, + "learning_rate": 0.0001713580603261654, + "loss": 0.1725, + "step": 9931 + }, + { + "epoch": 0.7166203687001695, + "grad_norm": 0.09958884865045547, + "learning_rate": 0.00017135517390676864, + "loss": 0.1903, + "step": 9932 + }, + { + "epoch": 0.71669252137523, + "grad_norm": 0.1386653184890747, + "learning_rate": 0.00017135228748737193, + "loss": 0.151, + "step": 9933 + }, + { + "epoch": 0.7167646740502904, + "grad_norm": 0.14528487622737885, + "learning_rate": 0.0001713494010679752, + "loss": 0.2039, + "step": 9934 + }, + { + "epoch": 0.7168368267253509, + "grad_norm": 0.11425530910491943, + "learning_rate": 0.00017134651464857846, + "loss": 0.1829, + "step": 9935 + }, + { + "epoch": 0.7169089794004113, + "grad_norm": 0.11770536750555038, + "learning_rate": 0.00017134362822918172, + "loss": 0.2005, + "step": 9936 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.11351843178272247, + "learning_rate": 0.00017134074180978495, + "loss": 0.1404, + "step": 9937 + }, + { + "epoch": 0.7170532847505321, + "grad_norm": 0.14741669595241547, + "learning_rate": 0.00017133785539038822, + "loss": 0.1254, + "step": 9938 + }, + { + "epoch": 0.7171254374255925, + "grad_norm": 0.10124711692333221, + "learning_rate": 0.00017133496897099148, + "loss": 0.1792, + "step": 9939 + }, + { + "epoch": 0.717197590100653, + "grad_norm": 0.12266574800014496, + "learning_rate": 0.00017133208255159477, + "loss": 0.1897, + "step": 9940 + }, + { + "epoch": 0.7172697427757134, + "grad_norm": 0.15079885721206665, + "learning_rate": 0.00017132919613219803, + "loss": 0.2157, + "step": 9941 + }, + { + "epoch": 0.7173418954507739, + "grad_norm": 0.14475995302200317, + "learning_rate": 0.00017132630971280127, + "loss": 0.1779, + "step": 9942 + }, + { + "epoch": 0.7174140481258343, + "grad_norm": 0.1366308331489563, + "learning_rate": 0.00017132342329340453, + "loss": 0.1503, + "step": 9943 + }, + { + "epoch": 0.7174862008008946, + "grad_norm": 0.13412003219127655, + "learning_rate": 0.0001713205368740078, + "loss": 0.1522, + "step": 9944 + }, + { + "epoch": 0.7175583534759551, + "grad_norm": 0.15304332971572876, + "learning_rate": 0.00017131765045461106, + "loss": 0.1418, + "step": 9945 + }, + { + "epoch": 0.7176305061510155, + "grad_norm": 0.12837323546409607, + "learning_rate": 0.00017131476403521432, + "loss": 0.155, + "step": 9946 + }, + { + "epoch": 0.717702658826076, + "grad_norm": 0.14579154551029205, + "learning_rate": 0.00017131187761581758, + "loss": 0.152, + "step": 9947 + }, + { + "epoch": 0.7177748115011364, + "grad_norm": 0.1218762919306755, + "learning_rate": 0.00017130899119642085, + "loss": 0.1179, + "step": 9948 + }, + { + "epoch": 0.7178469641761969, + "grad_norm": 0.11870580911636353, + "learning_rate": 0.0001713061047770241, + "loss": 0.1741, + "step": 9949 + }, + { + "epoch": 0.7179191168512573, + "grad_norm": 0.1070319414138794, + "learning_rate": 0.00017130321835762737, + "loss": 0.1778, + "step": 9950 + }, + { + "epoch": 0.7179912695263176, + "grad_norm": 0.11091592162847519, + "learning_rate": 0.00017130033193823064, + "loss": 0.1717, + "step": 9951 + }, + { + "epoch": 0.7180634222013781, + "grad_norm": 0.13534674048423767, + "learning_rate": 0.0001712974455188339, + "loss": 0.1926, + "step": 9952 + }, + { + "epoch": 0.7181355748764385, + "grad_norm": 0.10683610290288925, + "learning_rate": 0.00017129455909943713, + "loss": 0.1329, + "step": 9953 + }, + { + "epoch": 0.718207727551499, + "grad_norm": 0.13404878973960876, + "learning_rate": 0.00017129167268004042, + "loss": 0.1191, + "step": 9954 + }, + { + "epoch": 0.7182798802265594, + "grad_norm": 0.1568976789712906, + "learning_rate": 0.0001712887862606437, + "loss": 0.1917, + "step": 9955 + }, + { + "epoch": 0.7183520329016199, + "grad_norm": 0.09924539923667908, + "learning_rate": 0.00017128589984124695, + "loss": 0.129, + "step": 9956 + }, + { + "epoch": 0.7184241855766803, + "grad_norm": 0.14538629353046417, + "learning_rate": 0.0001712830134218502, + "loss": 0.1502, + "step": 9957 + }, + { + "epoch": 0.7184963382517406, + "grad_norm": 0.11760757118463516, + "learning_rate": 0.00017128012700245345, + "loss": 0.1622, + "step": 9958 + }, + { + "epoch": 0.7185684909268011, + "grad_norm": 0.12738563120365143, + "learning_rate": 0.0001712772405830567, + "loss": 0.1684, + "step": 9959 + }, + { + "epoch": 0.7186406436018615, + "grad_norm": 0.1411866843700409, + "learning_rate": 0.00017127435416365997, + "loss": 0.1755, + "step": 9960 + }, + { + "epoch": 0.718712796276922, + "grad_norm": 0.1326630711555481, + "learning_rate": 0.00017127146774426326, + "loss": 0.1994, + "step": 9961 + }, + { + "epoch": 0.7187849489519824, + "grad_norm": 0.12362193316221237, + "learning_rate": 0.00017126858132486653, + "loss": 0.1716, + "step": 9962 + }, + { + "epoch": 0.7188571016270429, + "grad_norm": 0.11543554812669754, + "learning_rate": 0.00017126569490546976, + "loss": 0.2053, + "step": 9963 + }, + { + "epoch": 0.7189292543021033, + "grad_norm": 0.18158745765686035, + "learning_rate": 0.00017126280848607303, + "loss": 0.1318, + "step": 9964 + }, + { + "epoch": 0.7190014069771636, + "grad_norm": 0.17341430485248566, + "learning_rate": 0.0001712599220666763, + "loss": 0.1776, + "step": 9965 + }, + { + "epoch": 0.7190735596522241, + "grad_norm": 0.12372343242168427, + "learning_rate": 0.00017125703564727955, + "loss": 0.0971, + "step": 9966 + }, + { + "epoch": 0.7191457123272845, + "grad_norm": 0.14443086087703705, + "learning_rate": 0.00017125414922788281, + "loss": 0.1811, + "step": 9967 + }, + { + "epoch": 0.719217865002345, + "grad_norm": 0.12786145508289337, + "learning_rate": 0.00017125126280848608, + "loss": 0.1371, + "step": 9968 + }, + { + "epoch": 0.7192900176774054, + "grad_norm": 0.12725846469402313, + "learning_rate": 0.00017124837638908934, + "loss": 0.1724, + "step": 9969 + }, + { + "epoch": 0.7193621703524659, + "grad_norm": 0.14343729615211487, + "learning_rate": 0.0001712454899696926, + "loss": 0.1309, + "step": 9970 + }, + { + "epoch": 0.7194343230275262, + "grad_norm": 0.13157957792282104, + "learning_rate": 0.00017124260355029587, + "loss": 0.149, + "step": 9971 + }, + { + "epoch": 0.7195064757025866, + "grad_norm": 0.12144739925861359, + "learning_rate": 0.00017123971713089913, + "loss": 0.0983, + "step": 9972 + }, + { + "epoch": 0.7195786283776471, + "grad_norm": 0.11445478349924088, + "learning_rate": 0.0001712368307115024, + "loss": 0.1063, + "step": 9973 + }, + { + "epoch": 0.7196507810527075, + "grad_norm": 0.10009689629077911, + "learning_rate": 0.00017123394429210563, + "loss": 0.1739, + "step": 9974 + }, + { + "epoch": 0.719722933727768, + "grad_norm": 0.16738224029541016, + "learning_rate": 0.00017123105787270892, + "loss": 0.1399, + "step": 9975 + }, + { + "epoch": 0.7197950864028284, + "grad_norm": 0.11219814419746399, + "learning_rate": 0.00017122817145331218, + "loss": 0.1006, + "step": 9976 + }, + { + "epoch": 0.7198672390778889, + "grad_norm": 0.10597624629735947, + "learning_rate": 0.00017122528503391544, + "loss": 0.1562, + "step": 9977 + }, + { + "epoch": 0.7199393917529492, + "grad_norm": 0.1117899939417839, + "learning_rate": 0.0001712223986145187, + "loss": 0.1558, + "step": 9978 + }, + { + "epoch": 0.7200115444280096, + "grad_norm": 0.14814738929271698, + "learning_rate": 0.00017121951219512194, + "loss": 0.2113, + "step": 9979 + }, + { + "epoch": 0.7200836971030701, + "grad_norm": 0.1278420090675354, + "learning_rate": 0.0001712166257757252, + "loss": 0.1927, + "step": 9980 + }, + { + "epoch": 0.7201558497781305, + "grad_norm": 0.12248340994119644, + "learning_rate": 0.00017121373935632847, + "loss": 0.1348, + "step": 9981 + }, + { + "epoch": 0.720228002453191, + "grad_norm": 0.1193833276629448, + "learning_rate": 0.00017121085293693176, + "loss": 0.1439, + "step": 9982 + }, + { + "epoch": 0.7203001551282514, + "grad_norm": 0.1236564889550209, + "learning_rate": 0.00017120796651753502, + "loss": 0.1097, + "step": 9983 + }, + { + "epoch": 0.7203723078033119, + "grad_norm": 0.12870290875434875, + "learning_rate": 0.00017120508009813826, + "loss": 0.1418, + "step": 9984 + }, + { + "epoch": 0.7204444604783722, + "grad_norm": 0.13253255188465118, + "learning_rate": 0.00017120219367874152, + "loss": 0.1525, + "step": 9985 + }, + { + "epoch": 0.7205166131534326, + "grad_norm": 0.1435869038105011, + "learning_rate": 0.00017119930725934478, + "loss": 0.1625, + "step": 9986 + }, + { + "epoch": 0.7205887658284931, + "grad_norm": 0.11423452943563461, + "learning_rate": 0.00017119642083994805, + "loss": 0.1222, + "step": 9987 + }, + { + "epoch": 0.7206609185035535, + "grad_norm": 0.11768162250518799, + "learning_rate": 0.0001711935344205513, + "loss": 0.1332, + "step": 9988 + }, + { + "epoch": 0.720733071178614, + "grad_norm": 0.12453066557645798, + "learning_rate": 0.00017119064800115457, + "loss": 0.1189, + "step": 9989 + }, + { + "epoch": 0.7208052238536744, + "grad_norm": 0.14446821808815002, + "learning_rate": 0.00017118776158175783, + "loss": 0.1886, + "step": 9990 + }, + { + "epoch": 0.7208773765287348, + "grad_norm": 0.1353866159915924, + "learning_rate": 0.0001711848751623611, + "loss": 0.17, + "step": 9991 + }, + { + "epoch": 0.7209495292037952, + "grad_norm": 0.12891019880771637, + "learning_rate": 0.00017118198874296436, + "loss": 0.1298, + "step": 9992 + }, + { + "epoch": 0.7210216818788556, + "grad_norm": 0.13144133985042572, + "learning_rate": 0.00017117910232356762, + "loss": 0.1379, + "step": 9993 + }, + { + "epoch": 0.7210938345539161, + "grad_norm": 0.14286787807941437, + "learning_rate": 0.0001711762159041709, + "loss": 0.1333, + "step": 9994 + }, + { + "epoch": 0.7211659872289765, + "grad_norm": 0.1285174936056137, + "learning_rate": 0.00017117332948477415, + "loss": 0.1381, + "step": 9995 + }, + { + "epoch": 0.721238139904037, + "grad_norm": 0.12918663024902344, + "learning_rate": 0.0001711704430653774, + "loss": 0.1332, + "step": 9996 + }, + { + "epoch": 0.7213102925790974, + "grad_norm": 0.13201268017292023, + "learning_rate": 0.00017116755664598068, + "loss": 0.1679, + "step": 9997 + }, + { + "epoch": 0.7213824452541578, + "grad_norm": 0.12824837863445282, + "learning_rate": 0.00017116467022658394, + "loss": 0.1709, + "step": 9998 + }, + { + "epoch": 0.7214545979292182, + "grad_norm": 0.12619931995868683, + "learning_rate": 0.0001711617838071872, + "loss": 0.1428, + "step": 9999 + }, + { + "epoch": 0.7215267506042786, + "grad_norm": 0.14253847301006317, + "learning_rate": 0.00017115889738779046, + "loss": 0.1476, + "step": 10000 + }, + { + "epoch": 0.7215989032793391, + "grad_norm": 0.20578481256961823, + "learning_rate": 0.0001711560109683937, + "loss": 0.1211, + "step": 10001 + }, + { + "epoch": 0.7216710559543995, + "grad_norm": 0.15455134212970734, + "learning_rate": 0.00017115312454899696, + "loss": 0.1585, + "step": 10002 + }, + { + "epoch": 0.72174320862946, + "grad_norm": 0.12016787379980087, + "learning_rate": 0.00017115023812960025, + "loss": 0.1748, + "step": 10003 + }, + { + "epoch": 0.7218153613045204, + "grad_norm": 0.12311887741088867, + "learning_rate": 0.00017114735171020352, + "loss": 0.1412, + "step": 10004 + }, + { + "epoch": 0.7218875139795808, + "grad_norm": 0.15775811672210693, + "learning_rate": 0.00017114446529080678, + "loss": 0.1747, + "step": 10005 + }, + { + "epoch": 0.7219596666546412, + "grad_norm": 0.12297017872333527, + "learning_rate": 0.00017114157887141001, + "loss": 0.1616, + "step": 10006 + }, + { + "epoch": 0.7220318193297016, + "grad_norm": 0.10638998448848724, + "learning_rate": 0.00017113869245201328, + "loss": 0.1458, + "step": 10007 + }, + { + "epoch": 0.7221039720047621, + "grad_norm": 0.13442695140838623, + "learning_rate": 0.00017113580603261654, + "loss": 0.155, + "step": 10008 + }, + { + "epoch": 0.7221761246798225, + "grad_norm": 0.11203104257583618, + "learning_rate": 0.0001711329196132198, + "loss": 0.1521, + "step": 10009 + }, + { + "epoch": 0.722248277354883, + "grad_norm": 0.12732189893722534, + "learning_rate": 0.0001711300331938231, + "loss": 0.1849, + "step": 10010 + }, + { + "epoch": 0.7223204300299434, + "grad_norm": 0.1229824498295784, + "learning_rate": 0.00017112714677442633, + "loss": 0.1829, + "step": 10011 + }, + { + "epoch": 0.7223925827050038, + "grad_norm": 0.1607559472322464, + "learning_rate": 0.0001711242603550296, + "loss": 0.1613, + "step": 10012 + }, + { + "epoch": 0.7224647353800642, + "grad_norm": 0.1528715193271637, + "learning_rate": 0.00017112137393563286, + "loss": 0.2003, + "step": 10013 + }, + { + "epoch": 0.7225368880551246, + "grad_norm": 0.12777936458587646, + "learning_rate": 0.00017111848751623612, + "loss": 0.1339, + "step": 10014 + }, + { + "epoch": 0.7226090407301851, + "grad_norm": 0.12514327466487885, + "learning_rate": 0.00017111560109683938, + "loss": 0.1781, + "step": 10015 + }, + { + "epoch": 0.7226811934052455, + "grad_norm": 0.12802931666374207, + "learning_rate": 0.00017111271467744264, + "loss": 0.1376, + "step": 10016 + }, + { + "epoch": 0.722753346080306, + "grad_norm": 0.12756921350955963, + "learning_rate": 0.0001711098282580459, + "loss": 0.1606, + "step": 10017 + }, + { + "epoch": 0.7228254987553664, + "grad_norm": 0.129941925406456, + "learning_rate": 0.00017110694183864917, + "loss": 0.1805, + "step": 10018 + }, + { + "epoch": 0.7228976514304268, + "grad_norm": 0.11695303022861481, + "learning_rate": 0.00017110405541925243, + "loss": 0.1748, + "step": 10019 + }, + { + "epoch": 0.7229698041054872, + "grad_norm": 0.13325800001621246, + "learning_rate": 0.0001711011689998557, + "loss": 0.147, + "step": 10020 + }, + { + "epoch": 0.7230419567805476, + "grad_norm": 0.14300288259983063, + "learning_rate": 0.00017109828258045896, + "loss": 0.1306, + "step": 10021 + }, + { + "epoch": 0.7231141094556081, + "grad_norm": 0.1265317052602768, + "learning_rate": 0.0001710953961610622, + "loss": 0.1317, + "step": 10022 + }, + { + "epoch": 0.7231862621306685, + "grad_norm": 0.13566988706588745, + "learning_rate": 0.00017109250974166546, + "loss": 0.175, + "step": 10023 + }, + { + "epoch": 0.7232584148057289, + "grad_norm": 0.1730663776397705, + "learning_rate": 0.00017108962332226875, + "loss": 0.1616, + "step": 10024 + }, + { + "epoch": 0.7233305674807894, + "grad_norm": 0.13771170377731323, + "learning_rate": 0.000171086736902872, + "loss": 0.1969, + "step": 10025 + }, + { + "epoch": 0.7234027201558498, + "grad_norm": 0.12104084342718124, + "learning_rate": 0.00017108385048347527, + "loss": 0.1646, + "step": 10026 + }, + { + "epoch": 0.7234748728309102, + "grad_norm": 0.13200148940086365, + "learning_rate": 0.0001710809640640785, + "loss": 0.2274, + "step": 10027 + }, + { + "epoch": 0.7235470255059706, + "grad_norm": 0.11547756940126419, + "learning_rate": 0.00017107807764468177, + "loss": 0.1619, + "step": 10028 + }, + { + "epoch": 0.723619178181031, + "grad_norm": 0.12168179452419281, + "learning_rate": 0.00017107519122528503, + "loss": 0.1962, + "step": 10029 + }, + { + "epoch": 0.7236913308560915, + "grad_norm": 0.13543535768985748, + "learning_rate": 0.0001710723048058883, + "loss": 0.1609, + "step": 10030 + }, + { + "epoch": 0.7237634835311519, + "grad_norm": 0.13941948115825653, + "learning_rate": 0.0001710694183864916, + "loss": 0.1634, + "step": 10031 + }, + { + "epoch": 0.7238356362062124, + "grad_norm": 0.11474108695983887, + "learning_rate": 0.00017106653196709482, + "loss": 0.1859, + "step": 10032 + }, + { + "epoch": 0.7239077888812727, + "grad_norm": 0.12846776843070984, + "learning_rate": 0.00017106364554769809, + "loss": 0.149, + "step": 10033 + }, + { + "epoch": 0.7239799415563332, + "grad_norm": 0.10760863125324249, + "learning_rate": 0.00017106075912830135, + "loss": 0.1786, + "step": 10034 + }, + { + "epoch": 0.7240520942313936, + "grad_norm": 0.12543274462223053, + "learning_rate": 0.0001710578727089046, + "loss": 0.1386, + "step": 10035 + }, + { + "epoch": 0.724124246906454, + "grad_norm": 0.11868203431367874, + "learning_rate": 0.00017105498628950788, + "loss": 0.1529, + "step": 10036 + }, + { + "epoch": 0.7241963995815145, + "grad_norm": 0.11668435484170914, + "learning_rate": 0.00017105209987011114, + "loss": 0.1325, + "step": 10037 + }, + { + "epoch": 0.7242685522565749, + "grad_norm": 0.15729379653930664, + "learning_rate": 0.0001710492134507144, + "loss": 0.1503, + "step": 10038 + }, + { + "epoch": 0.7243407049316354, + "grad_norm": 0.1290607750415802, + "learning_rate": 0.00017104632703131766, + "loss": 0.1718, + "step": 10039 + }, + { + "epoch": 0.7244128576066957, + "grad_norm": 0.12674786150455475, + "learning_rate": 0.00017104344061192093, + "loss": 0.0857, + "step": 10040 + }, + { + "epoch": 0.7244850102817562, + "grad_norm": 0.11376924812793732, + "learning_rate": 0.0001710405541925242, + "loss": 0.142, + "step": 10041 + }, + { + "epoch": 0.7245571629568166, + "grad_norm": 0.11870646476745605, + "learning_rate": 0.00017103766777312745, + "loss": 0.1461, + "step": 10042 + }, + { + "epoch": 0.724629315631877, + "grad_norm": 0.13887138664722443, + "learning_rate": 0.0001710347813537307, + "loss": 0.129, + "step": 10043 + }, + { + "epoch": 0.7247014683069375, + "grad_norm": 0.11356978863477707, + "learning_rate": 0.00017103189493433395, + "loss": 0.097, + "step": 10044 + }, + { + "epoch": 0.7247736209819979, + "grad_norm": 0.1348128318786621, + "learning_rate": 0.00017102900851493724, + "loss": 0.1446, + "step": 10045 + }, + { + "epoch": 0.7248457736570584, + "grad_norm": 0.13882333040237427, + "learning_rate": 0.0001710261220955405, + "loss": 0.1242, + "step": 10046 + }, + { + "epoch": 0.7249179263321187, + "grad_norm": 0.13452178239822388, + "learning_rate": 0.00017102323567614377, + "loss": 0.1336, + "step": 10047 + }, + { + "epoch": 0.7249900790071792, + "grad_norm": 0.10064591467380524, + "learning_rate": 0.000171020349256747, + "loss": 0.0977, + "step": 10048 + }, + { + "epoch": 0.7250622316822396, + "grad_norm": 0.14382019639015198, + "learning_rate": 0.00017101746283735027, + "loss": 0.1991, + "step": 10049 + }, + { + "epoch": 0.7251343843573, + "grad_norm": 0.11695842444896698, + "learning_rate": 0.00017101457641795353, + "loss": 0.1164, + "step": 10050 + }, + { + "epoch": 0.7252065370323605, + "grad_norm": 0.1313917487859726, + "learning_rate": 0.0001710116899985568, + "loss": 0.1575, + "step": 10051 + }, + { + "epoch": 0.7252786897074209, + "grad_norm": 0.16316965222358704, + "learning_rate": 0.00017100880357916008, + "loss": 0.1643, + "step": 10052 + }, + { + "epoch": 0.7253508423824814, + "grad_norm": 0.1231640875339508, + "learning_rate": 0.00017100591715976332, + "loss": 0.1688, + "step": 10053 + }, + { + "epoch": 0.7254229950575417, + "grad_norm": 0.1439533829689026, + "learning_rate": 0.00017100303074036658, + "loss": 0.1871, + "step": 10054 + }, + { + "epoch": 0.7254951477326022, + "grad_norm": 0.12452032417058945, + "learning_rate": 0.00017100014432096984, + "loss": 0.1418, + "step": 10055 + }, + { + "epoch": 0.7255673004076626, + "grad_norm": 0.12324023991823196, + "learning_rate": 0.0001709972579015731, + "loss": 0.1924, + "step": 10056 + }, + { + "epoch": 0.725639453082723, + "grad_norm": 0.13798697292804718, + "learning_rate": 0.00017099437148217637, + "loss": 0.1033, + "step": 10057 + }, + { + "epoch": 0.7257116057577835, + "grad_norm": 0.13696545362472534, + "learning_rate": 0.00017099148506277963, + "loss": 0.1339, + "step": 10058 + }, + { + "epoch": 0.7257837584328439, + "grad_norm": 0.1420350968837738, + "learning_rate": 0.0001709885986433829, + "loss": 0.2039, + "step": 10059 + }, + { + "epoch": 0.7258559111079044, + "grad_norm": 0.1186450943350792, + "learning_rate": 0.00017098571222398616, + "loss": 0.1643, + "step": 10060 + }, + { + "epoch": 0.7259280637829647, + "grad_norm": 0.1391008347272873, + "learning_rate": 0.00017098282580458942, + "loss": 0.1168, + "step": 10061 + }, + { + "epoch": 0.7260002164580251, + "grad_norm": 0.14331486821174622, + "learning_rate": 0.00017097993938519268, + "loss": 0.212, + "step": 10062 + }, + { + "epoch": 0.7260723691330856, + "grad_norm": 0.0992756262421608, + "learning_rate": 0.00017097705296579595, + "loss": 0.1528, + "step": 10063 + }, + { + "epoch": 0.726144521808146, + "grad_norm": 0.14541590213775635, + "learning_rate": 0.00017097416654639918, + "loss": 0.2204, + "step": 10064 + }, + { + "epoch": 0.7262166744832065, + "grad_norm": 0.13561774790287018, + "learning_rate": 0.00017097128012700245, + "loss": 0.1278, + "step": 10065 + }, + { + "epoch": 0.7262888271582669, + "grad_norm": 0.12054046988487244, + "learning_rate": 0.00017096839370760574, + "loss": 0.1575, + "step": 10066 + }, + { + "epoch": 0.7263609798333274, + "grad_norm": 0.12773770093917847, + "learning_rate": 0.000170965507288209, + "loss": 0.1479, + "step": 10067 + }, + { + "epoch": 0.7264331325083877, + "grad_norm": 0.12233958393335342, + "learning_rate": 0.00017096262086881226, + "loss": 0.1576, + "step": 10068 + }, + { + "epoch": 0.7265052851834481, + "grad_norm": 0.12692715227603912, + "learning_rate": 0.0001709597344494155, + "loss": 0.1406, + "step": 10069 + }, + { + "epoch": 0.7265774378585086, + "grad_norm": 0.14090755581855774, + "learning_rate": 0.00017095684803001876, + "loss": 0.1059, + "step": 10070 + }, + { + "epoch": 0.726649590533569, + "grad_norm": 0.13065628707408905, + "learning_rate": 0.00017095396161062202, + "loss": 0.1638, + "step": 10071 + }, + { + "epoch": 0.7267217432086295, + "grad_norm": 0.13230380415916443, + "learning_rate": 0.00017095107519122529, + "loss": 0.1357, + "step": 10072 + }, + { + "epoch": 0.7267938958836899, + "grad_norm": 0.14219319820404053, + "learning_rate": 0.00017094818877182858, + "loss": 0.2059, + "step": 10073 + }, + { + "epoch": 0.7268660485587504, + "grad_norm": 0.12111040204763412, + "learning_rate": 0.0001709453023524318, + "loss": 0.1343, + "step": 10074 + }, + { + "epoch": 0.7269382012338107, + "grad_norm": 0.11151017248630524, + "learning_rate": 0.00017094241593303507, + "loss": 0.1541, + "step": 10075 + }, + { + "epoch": 0.7270103539088711, + "grad_norm": 0.13076148927211761, + "learning_rate": 0.00017093952951363834, + "loss": 0.1342, + "step": 10076 + }, + { + "epoch": 0.7270825065839316, + "grad_norm": 0.15508994460105896, + "learning_rate": 0.0001709366430942416, + "loss": 0.1355, + "step": 10077 + }, + { + "epoch": 0.727154659258992, + "grad_norm": 0.13253220915794373, + "learning_rate": 0.00017093375667484486, + "loss": 0.1287, + "step": 10078 + }, + { + "epoch": 0.7272268119340525, + "grad_norm": 0.1186075359582901, + "learning_rate": 0.00017093087025544813, + "loss": 0.1317, + "step": 10079 + }, + { + "epoch": 0.7272989646091129, + "grad_norm": 0.15678545832633972, + "learning_rate": 0.0001709279838360514, + "loss": 0.1704, + "step": 10080 + }, + { + "epoch": 0.7273711172841734, + "grad_norm": 0.09805598109960556, + "learning_rate": 0.00017092509741665465, + "loss": 0.1376, + "step": 10081 + }, + { + "epoch": 0.7274432699592337, + "grad_norm": 0.13805170357227325, + "learning_rate": 0.00017092221099725792, + "loss": 0.204, + "step": 10082 + }, + { + "epoch": 0.7275154226342941, + "grad_norm": 0.12763634324073792, + "learning_rate": 0.00017091932457786118, + "loss": 0.1744, + "step": 10083 + }, + { + "epoch": 0.7275875753093546, + "grad_norm": 0.1276688426733017, + "learning_rate": 0.00017091643815846444, + "loss": 0.1901, + "step": 10084 + }, + { + "epoch": 0.727659727984415, + "grad_norm": 0.14280658960342407, + "learning_rate": 0.00017091355173906768, + "loss": 0.144, + "step": 10085 + }, + { + "epoch": 0.7277318806594755, + "grad_norm": 0.11498956382274628, + "learning_rate": 0.00017091066531967094, + "loss": 0.1353, + "step": 10086 + }, + { + "epoch": 0.7278040333345359, + "grad_norm": 0.10629608482122421, + "learning_rate": 0.00017090777890027423, + "loss": 0.145, + "step": 10087 + }, + { + "epoch": 0.7278761860095964, + "grad_norm": 0.1334647238254547, + "learning_rate": 0.0001709048924808775, + "loss": 0.1681, + "step": 10088 + }, + { + "epoch": 0.7279483386846567, + "grad_norm": 0.12539075314998627, + "learning_rate": 0.00017090200606148076, + "loss": 0.1383, + "step": 10089 + }, + { + "epoch": 0.7280204913597171, + "grad_norm": 0.13498573005199432, + "learning_rate": 0.000170899119642084, + "loss": 0.1435, + "step": 10090 + }, + { + "epoch": 0.7280926440347776, + "grad_norm": 0.11439192295074463, + "learning_rate": 0.00017089623322268725, + "loss": 0.1539, + "step": 10091 + }, + { + "epoch": 0.728164796709838, + "grad_norm": 0.10842323303222656, + "learning_rate": 0.00017089334680329052, + "loss": 0.1098, + "step": 10092 + }, + { + "epoch": 0.7282369493848985, + "grad_norm": 0.1083604246377945, + "learning_rate": 0.00017089046038389378, + "loss": 0.1522, + "step": 10093 + }, + { + "epoch": 0.7283091020599589, + "grad_norm": 0.15302124619483948, + "learning_rate": 0.00017088757396449704, + "loss": 0.1659, + "step": 10094 + }, + { + "epoch": 0.7283812547350192, + "grad_norm": 0.1261652261018753, + "learning_rate": 0.0001708846875451003, + "loss": 0.1497, + "step": 10095 + }, + { + "epoch": 0.7284534074100797, + "grad_norm": 0.1251983344554901, + "learning_rate": 0.00017088180112570357, + "loss": 0.1364, + "step": 10096 + }, + { + "epoch": 0.7285255600851401, + "grad_norm": 0.13810065388679504, + "learning_rate": 0.00017087891470630683, + "loss": 0.1888, + "step": 10097 + }, + { + "epoch": 0.7285977127602006, + "grad_norm": 0.16389283537864685, + "learning_rate": 0.0001708760282869101, + "loss": 0.1574, + "step": 10098 + }, + { + "epoch": 0.728669865435261, + "grad_norm": 0.14504991471767426, + "learning_rate": 0.00017087314186751336, + "loss": 0.1818, + "step": 10099 + }, + { + "epoch": 0.7287420181103215, + "grad_norm": 0.10631627589464188, + "learning_rate": 0.00017087025544811662, + "loss": 0.1367, + "step": 10100 + }, + { + "epoch": 0.7288141707853819, + "grad_norm": 0.14447440207004547, + "learning_rate": 0.00017086736902871988, + "loss": 0.1231, + "step": 10101 + }, + { + "epoch": 0.7288863234604422, + "grad_norm": 0.11821649223566055, + "learning_rate": 0.00017086448260932315, + "loss": 0.1434, + "step": 10102 + }, + { + "epoch": 0.7289584761355027, + "grad_norm": 0.1422772854566574, + "learning_rate": 0.0001708615961899264, + "loss": 0.2335, + "step": 10103 + }, + { + "epoch": 0.7290306288105631, + "grad_norm": 0.13319611549377441, + "learning_rate": 0.00017085870977052967, + "loss": 0.1332, + "step": 10104 + }, + { + "epoch": 0.7291027814856236, + "grad_norm": 0.13768510520458221, + "learning_rate": 0.00017085582335113294, + "loss": 0.1334, + "step": 10105 + }, + { + "epoch": 0.729174934160684, + "grad_norm": 0.12326176464557648, + "learning_rate": 0.0001708529369317362, + "loss": 0.1608, + "step": 10106 + }, + { + "epoch": 0.7292470868357445, + "grad_norm": 0.10238928347826004, + "learning_rate": 0.00017085005051233943, + "loss": 0.1859, + "step": 10107 + }, + { + "epoch": 0.7293192395108049, + "grad_norm": 0.12593914568424225, + "learning_rate": 0.0001708471640929427, + "loss": 0.1376, + "step": 10108 + }, + { + "epoch": 0.7293913921858652, + "grad_norm": 0.12407062202692032, + "learning_rate": 0.000170844277673546, + "loss": 0.1306, + "step": 10109 + }, + { + "epoch": 0.7294635448609257, + "grad_norm": 0.1254005879163742, + "learning_rate": 0.00017084139125414925, + "loss": 0.2009, + "step": 10110 + }, + { + "epoch": 0.7295356975359861, + "grad_norm": 0.12667208909988403, + "learning_rate": 0.0001708385048347525, + "loss": 0.1096, + "step": 10111 + }, + { + "epoch": 0.7296078502110466, + "grad_norm": 0.11722003668546677, + "learning_rate": 0.00017083561841535575, + "loss": 0.1369, + "step": 10112 + }, + { + "epoch": 0.729680002886107, + "grad_norm": 0.13819009065628052, + "learning_rate": 0.000170832731995959, + "loss": 0.1957, + "step": 10113 + }, + { + "epoch": 0.7297521555611675, + "grad_norm": 0.15020692348480225, + "learning_rate": 0.00017082984557656227, + "loss": 0.1156, + "step": 10114 + }, + { + "epoch": 0.7298243082362279, + "grad_norm": 0.126190185546875, + "learning_rate": 0.00017082695915716554, + "loss": 0.1554, + "step": 10115 + }, + { + "epoch": 0.7298964609112882, + "grad_norm": 0.17837029695510864, + "learning_rate": 0.00017082407273776883, + "loss": 0.1639, + "step": 10116 + }, + { + "epoch": 0.7299686135863487, + "grad_norm": 0.15776686370372772, + "learning_rate": 0.00017082118631837206, + "loss": 0.1393, + "step": 10117 + }, + { + "epoch": 0.7300407662614091, + "grad_norm": 0.14482903480529785, + "learning_rate": 0.00017081829989897533, + "loss": 0.1971, + "step": 10118 + }, + { + "epoch": 0.7301129189364696, + "grad_norm": 0.13167844712734222, + "learning_rate": 0.0001708154134795786, + "loss": 0.1216, + "step": 10119 + }, + { + "epoch": 0.73018507161153, + "grad_norm": 0.12372057139873505, + "learning_rate": 0.00017081252706018185, + "loss": 0.1852, + "step": 10120 + }, + { + "epoch": 0.7302572242865905, + "grad_norm": 0.12731702625751495, + "learning_rate": 0.00017080964064078512, + "loss": 0.1643, + "step": 10121 + }, + { + "epoch": 0.7303293769616509, + "grad_norm": 0.1208278015255928, + "learning_rate": 0.00017080675422138838, + "loss": 0.1215, + "step": 10122 + }, + { + "epoch": 0.7304015296367112, + "grad_norm": 0.11122459918260574, + "learning_rate": 0.00017080386780199164, + "loss": 0.1746, + "step": 10123 + }, + { + "epoch": 0.7304736823117717, + "grad_norm": 0.1511441320180893, + "learning_rate": 0.0001708009813825949, + "loss": 0.1628, + "step": 10124 + }, + { + "epoch": 0.7305458349868321, + "grad_norm": 0.11985379457473755, + "learning_rate": 0.00017079809496319817, + "loss": 0.1583, + "step": 10125 + }, + { + "epoch": 0.7306179876618926, + "grad_norm": 0.12886056303977966, + "learning_rate": 0.00017079520854380143, + "loss": 0.1738, + "step": 10126 + }, + { + "epoch": 0.730690140336953, + "grad_norm": 0.1353933960199356, + "learning_rate": 0.0001707923221244047, + "loss": 0.1565, + "step": 10127 + }, + { + "epoch": 0.7307622930120135, + "grad_norm": 0.1290416270494461, + "learning_rate": 0.00017078943570500793, + "loss": 0.1337, + "step": 10128 + }, + { + "epoch": 0.7308344456870739, + "grad_norm": 0.13235312700271606, + "learning_rate": 0.0001707865492856112, + "loss": 0.1311, + "step": 10129 + }, + { + "epoch": 0.7309065983621342, + "grad_norm": 0.10793448984622955, + "learning_rate": 0.00017078366286621448, + "loss": 0.1722, + "step": 10130 + }, + { + "epoch": 0.7309787510371947, + "grad_norm": 0.1269800215959549, + "learning_rate": 0.00017078077644681774, + "loss": 0.114, + "step": 10131 + }, + { + "epoch": 0.7310509037122551, + "grad_norm": 0.13012513518333435, + "learning_rate": 0.000170777890027421, + "loss": 0.1407, + "step": 10132 + }, + { + "epoch": 0.7311230563873156, + "grad_norm": 0.1555079072713852, + "learning_rate": 0.00017077500360802424, + "loss": 0.1749, + "step": 10133 + }, + { + "epoch": 0.731195209062376, + "grad_norm": 0.12376718968153, + "learning_rate": 0.0001707721171886275, + "loss": 0.1088, + "step": 10134 + }, + { + "epoch": 0.7312673617374364, + "grad_norm": 0.10243170708417892, + "learning_rate": 0.00017076923076923077, + "loss": 0.1269, + "step": 10135 + }, + { + "epoch": 0.7313395144124969, + "grad_norm": 0.14888563752174377, + "learning_rate": 0.00017076634434983403, + "loss": 0.19, + "step": 10136 + }, + { + "epoch": 0.7314116670875572, + "grad_norm": 0.12645818293094635, + "learning_rate": 0.00017076345793043732, + "loss": 0.1304, + "step": 10137 + }, + { + "epoch": 0.7314838197626177, + "grad_norm": 0.13674162328243256, + "learning_rate": 0.00017076057151104056, + "loss": 0.1357, + "step": 10138 + }, + { + "epoch": 0.7315559724376781, + "grad_norm": 0.13103006780147552, + "learning_rate": 0.00017075768509164382, + "loss": 0.145, + "step": 10139 + }, + { + "epoch": 0.7316281251127386, + "grad_norm": 0.10523130744695663, + "learning_rate": 0.00017075479867224708, + "loss": 0.1585, + "step": 10140 + }, + { + "epoch": 0.731700277787799, + "grad_norm": 0.10682960599660873, + "learning_rate": 0.00017075191225285035, + "loss": 0.1467, + "step": 10141 + }, + { + "epoch": 0.7317724304628594, + "grad_norm": 0.12744466960430145, + "learning_rate": 0.0001707490258334536, + "loss": 0.17, + "step": 10142 + }, + { + "epoch": 0.7318445831379199, + "grad_norm": 0.1741938292980194, + "learning_rate": 0.00017074613941405687, + "loss": 0.1534, + "step": 10143 + }, + { + "epoch": 0.7319167358129802, + "grad_norm": 0.11006762832403183, + "learning_rate": 0.00017074325299466014, + "loss": 0.1436, + "step": 10144 + }, + { + "epoch": 0.7319888884880407, + "grad_norm": 0.1253357082605362, + "learning_rate": 0.0001707403665752634, + "loss": 0.1665, + "step": 10145 + }, + { + "epoch": 0.7320610411631011, + "grad_norm": 0.11946944147348404, + "learning_rate": 0.00017073748015586666, + "loss": 0.1685, + "step": 10146 + }, + { + "epoch": 0.7321331938381616, + "grad_norm": 0.3890308737754822, + "learning_rate": 0.00017073459373646992, + "loss": 0.1891, + "step": 10147 + }, + { + "epoch": 0.732205346513222, + "grad_norm": 0.13302887976169586, + "learning_rate": 0.0001707317073170732, + "loss": 0.1052, + "step": 10148 + }, + { + "epoch": 0.7322774991882824, + "grad_norm": 0.1204490065574646, + "learning_rate": 0.00017072882089767642, + "loss": 0.128, + "step": 10149 + }, + { + "epoch": 0.7323496518633429, + "grad_norm": 0.15260469913482666, + "learning_rate": 0.00017072593447827969, + "loss": 0.1898, + "step": 10150 + }, + { + "epoch": 0.7324218045384032, + "grad_norm": 0.15671607851982117, + "learning_rate": 0.00017072304805888298, + "loss": 0.1479, + "step": 10151 + }, + { + "epoch": 0.7324939572134637, + "grad_norm": 0.16737323999404907, + "learning_rate": 0.00017072016163948624, + "loss": 0.1342, + "step": 10152 + }, + { + "epoch": 0.7325661098885241, + "grad_norm": 0.14424218237400055, + "learning_rate": 0.0001707172752200895, + "loss": 0.1619, + "step": 10153 + }, + { + "epoch": 0.7326382625635846, + "grad_norm": 0.12221214920282364, + "learning_rate": 0.00017071438880069274, + "loss": 0.1382, + "step": 10154 + }, + { + "epoch": 0.732710415238645, + "grad_norm": 0.12235883623361588, + "learning_rate": 0.000170711502381296, + "loss": 0.143, + "step": 10155 + }, + { + "epoch": 0.7327825679137054, + "grad_norm": 0.12458932399749756, + "learning_rate": 0.00017070861596189926, + "loss": 0.1641, + "step": 10156 + }, + { + "epoch": 0.7328547205887658, + "grad_norm": 0.12354271113872528, + "learning_rate": 0.00017070572954250253, + "loss": 0.1511, + "step": 10157 + }, + { + "epoch": 0.7329268732638262, + "grad_norm": 0.142563596367836, + "learning_rate": 0.00017070284312310582, + "loss": 0.1252, + "step": 10158 + }, + { + "epoch": 0.7329990259388867, + "grad_norm": 0.13783249258995056, + "learning_rate": 0.00017069995670370905, + "loss": 0.1016, + "step": 10159 + }, + { + "epoch": 0.7330711786139471, + "grad_norm": 0.14102815091609955, + "learning_rate": 0.00017069707028431231, + "loss": 0.171, + "step": 10160 + }, + { + "epoch": 0.7331433312890075, + "grad_norm": 0.1071409359574318, + "learning_rate": 0.00017069418386491558, + "loss": 0.1769, + "step": 10161 + }, + { + "epoch": 0.733215483964068, + "grad_norm": 0.1648252159357071, + "learning_rate": 0.00017069129744551884, + "loss": 0.1868, + "step": 10162 + }, + { + "epoch": 0.7332876366391284, + "grad_norm": 0.11002007126808167, + "learning_rate": 0.0001706884110261221, + "loss": 0.1612, + "step": 10163 + }, + { + "epoch": 0.7333597893141888, + "grad_norm": 0.13496430218219757, + "learning_rate": 0.00017068552460672537, + "loss": 0.1275, + "step": 10164 + }, + { + "epoch": 0.7334319419892492, + "grad_norm": 0.14498379826545715, + "learning_rate": 0.00017068263818732863, + "loss": 0.1544, + "step": 10165 + }, + { + "epoch": 0.7335040946643097, + "grad_norm": 0.1313927322626114, + "learning_rate": 0.0001706797517679319, + "loss": 0.1489, + "step": 10166 + }, + { + "epoch": 0.7335762473393701, + "grad_norm": 0.12810350954532623, + "learning_rate": 0.00017067686534853516, + "loss": 0.1256, + "step": 10167 + }, + { + "epoch": 0.7336484000144305, + "grad_norm": 0.1490558534860611, + "learning_rate": 0.00017067397892913842, + "loss": 0.1415, + "step": 10168 + }, + { + "epoch": 0.733720552689491, + "grad_norm": 0.1265607327222824, + "learning_rate": 0.00017067109250974168, + "loss": 0.138, + "step": 10169 + }, + { + "epoch": 0.7337927053645514, + "grad_norm": 0.14467424154281616, + "learning_rate": 0.00017066820609034492, + "loss": 0.164, + "step": 10170 + }, + { + "epoch": 0.7338648580396118, + "grad_norm": 0.14883024990558624, + "learning_rate": 0.00017066531967094818, + "loss": 0.1691, + "step": 10171 + }, + { + "epoch": 0.7339370107146722, + "grad_norm": 0.11522503942251205, + "learning_rate": 0.00017066243325155147, + "loss": 0.1184, + "step": 10172 + }, + { + "epoch": 0.7340091633897327, + "grad_norm": 0.18318067491054535, + "learning_rate": 0.00017065954683215473, + "loss": 0.1523, + "step": 10173 + }, + { + "epoch": 0.7340813160647931, + "grad_norm": 0.11408477276563644, + "learning_rate": 0.000170656660412758, + "loss": 0.1236, + "step": 10174 + }, + { + "epoch": 0.7341534687398535, + "grad_norm": 0.11548970639705658, + "learning_rate": 0.00017065377399336123, + "loss": 0.1361, + "step": 10175 + }, + { + "epoch": 0.734225621414914, + "grad_norm": 0.11234814673662186, + "learning_rate": 0.0001706508875739645, + "loss": 0.1877, + "step": 10176 + }, + { + "epoch": 0.7342977740899744, + "grad_norm": 0.12228648364543915, + "learning_rate": 0.00017064800115456776, + "loss": 0.1763, + "step": 10177 + }, + { + "epoch": 0.7343699267650348, + "grad_norm": 0.12698303163051605, + "learning_rate": 0.00017064511473517102, + "loss": 0.1457, + "step": 10178 + }, + { + "epoch": 0.7344420794400952, + "grad_norm": 0.10683344304561615, + "learning_rate": 0.0001706422283157743, + "loss": 0.1815, + "step": 10179 + }, + { + "epoch": 0.7345142321151557, + "grad_norm": 0.15215647220611572, + "learning_rate": 0.00017063934189637755, + "loss": 0.126, + "step": 10180 + }, + { + "epoch": 0.7345863847902161, + "grad_norm": 0.11738741397857666, + "learning_rate": 0.0001706364554769808, + "loss": 0.1444, + "step": 10181 + }, + { + "epoch": 0.7346585374652765, + "grad_norm": 0.1048831045627594, + "learning_rate": 0.00017063356905758407, + "loss": 0.1873, + "step": 10182 + }, + { + "epoch": 0.734730690140337, + "grad_norm": 0.1351441740989685, + "learning_rate": 0.00017063068263818733, + "loss": 0.133, + "step": 10183 + }, + { + "epoch": 0.7348028428153974, + "grad_norm": 0.14018692076206207, + "learning_rate": 0.0001706277962187906, + "loss": 0.1705, + "step": 10184 + }, + { + "epoch": 0.7348749954904578, + "grad_norm": 0.10088642686605453, + "learning_rate": 0.00017062490979939386, + "loss": 0.1964, + "step": 10185 + }, + { + "epoch": 0.7349471481655182, + "grad_norm": 0.13162271678447723, + "learning_rate": 0.00017062202337999712, + "loss": 0.1733, + "step": 10186 + }, + { + "epoch": 0.7350193008405786, + "grad_norm": 0.11622099578380585, + "learning_rate": 0.0001706191369606004, + "loss": 0.1714, + "step": 10187 + }, + { + "epoch": 0.7350914535156391, + "grad_norm": 0.11611003428697586, + "learning_rate": 0.00017061625054120365, + "loss": 0.1416, + "step": 10188 + }, + { + "epoch": 0.7351636061906995, + "grad_norm": 0.1257910132408142, + "learning_rate": 0.0001706133641218069, + "loss": 0.1274, + "step": 10189 + }, + { + "epoch": 0.73523575886576, + "grad_norm": 0.1268366575241089, + "learning_rate": 0.00017061047770241018, + "loss": 0.1629, + "step": 10190 + }, + { + "epoch": 0.7353079115408204, + "grad_norm": 0.12874042987823486, + "learning_rate": 0.0001706075912830134, + "loss": 0.1698, + "step": 10191 + }, + { + "epoch": 0.7353800642158808, + "grad_norm": 0.1255289614200592, + "learning_rate": 0.00017060470486361667, + "loss": 0.1547, + "step": 10192 + }, + { + "epoch": 0.7354522168909412, + "grad_norm": 0.11216352880001068, + "learning_rate": 0.00017060181844421996, + "loss": 0.1335, + "step": 10193 + }, + { + "epoch": 0.7355243695660016, + "grad_norm": 0.09830225259065628, + "learning_rate": 0.00017059893202482323, + "loss": 0.1087, + "step": 10194 + }, + { + "epoch": 0.7355965222410621, + "grad_norm": 0.12806928157806396, + "learning_rate": 0.0001705960456054265, + "loss": 0.1885, + "step": 10195 + }, + { + "epoch": 0.7356686749161225, + "grad_norm": 0.11917254328727722, + "learning_rate": 0.00017059315918602973, + "loss": 0.155, + "step": 10196 + }, + { + "epoch": 0.735740827591183, + "grad_norm": 0.11708007752895355, + "learning_rate": 0.000170590272766633, + "loss": 0.0909, + "step": 10197 + }, + { + "epoch": 0.7358129802662434, + "grad_norm": 0.1161891296505928, + "learning_rate": 0.00017058738634723625, + "loss": 0.1694, + "step": 10198 + }, + { + "epoch": 0.7358851329413038, + "grad_norm": 0.12696947157382965, + "learning_rate": 0.00017058449992783951, + "loss": 0.1794, + "step": 10199 + }, + { + "epoch": 0.7359572856163642, + "grad_norm": 0.16883036494255066, + "learning_rate": 0.0001705816135084428, + "loss": 0.1698, + "step": 10200 + }, + { + "epoch": 0.7360294382914246, + "grad_norm": 0.11908348649740219, + "learning_rate": 0.00017057872708904604, + "loss": 0.1056, + "step": 10201 + }, + { + "epoch": 0.7361015909664851, + "grad_norm": 0.10129093378782272, + "learning_rate": 0.0001705758406696493, + "loss": 0.1728, + "step": 10202 + }, + { + "epoch": 0.7361737436415455, + "grad_norm": 0.11746098101139069, + "learning_rate": 0.00017057295425025257, + "loss": 0.146, + "step": 10203 + }, + { + "epoch": 0.736245896316606, + "grad_norm": 0.13154010474681854, + "learning_rate": 0.00017057006783085583, + "loss": 0.1349, + "step": 10204 + }, + { + "epoch": 0.7363180489916664, + "grad_norm": 0.12824952602386475, + "learning_rate": 0.0001705671814114591, + "loss": 0.1214, + "step": 10205 + }, + { + "epoch": 0.7363902016667268, + "grad_norm": 0.10343913733959198, + "learning_rate": 0.00017056429499206235, + "loss": 0.129, + "step": 10206 + }, + { + "epoch": 0.7364623543417872, + "grad_norm": 0.14011070132255554, + "learning_rate": 0.00017056140857266562, + "loss": 0.19, + "step": 10207 + }, + { + "epoch": 0.7365345070168476, + "grad_norm": 0.15326650440692902, + "learning_rate": 0.00017055852215326888, + "loss": 0.1614, + "step": 10208 + }, + { + "epoch": 0.7366066596919081, + "grad_norm": 0.11808323860168457, + "learning_rate": 0.00017055563573387214, + "loss": 0.1423, + "step": 10209 + }, + { + "epoch": 0.7366788123669685, + "grad_norm": 0.12559036910533905, + "learning_rate": 0.0001705527493144754, + "loss": 0.2252, + "step": 10210 + }, + { + "epoch": 0.736750965042029, + "grad_norm": 0.11885204911231995, + "learning_rate": 0.00017054986289507867, + "loss": 0.1672, + "step": 10211 + }, + { + "epoch": 0.7368231177170894, + "grad_norm": 0.11948374658823013, + "learning_rate": 0.0001705469764756819, + "loss": 0.1679, + "step": 10212 + }, + { + "epoch": 0.7368952703921497, + "grad_norm": 0.11641169339418411, + "learning_rate": 0.00017054409005628517, + "loss": 0.1811, + "step": 10213 + }, + { + "epoch": 0.7369674230672102, + "grad_norm": 0.14575567841529846, + "learning_rate": 0.00017054120363688846, + "loss": 0.1539, + "step": 10214 + }, + { + "epoch": 0.7370395757422706, + "grad_norm": 0.12116880714893341, + "learning_rate": 0.00017053831721749172, + "loss": 0.1403, + "step": 10215 + }, + { + "epoch": 0.7371117284173311, + "grad_norm": 0.12820352613925934, + "learning_rate": 0.00017053543079809498, + "loss": 0.171, + "step": 10216 + }, + { + "epoch": 0.7371838810923915, + "grad_norm": 0.12892208993434906, + "learning_rate": 0.00017053254437869822, + "loss": 0.1217, + "step": 10217 + }, + { + "epoch": 0.737256033767452, + "grad_norm": 0.11848312616348267, + "learning_rate": 0.00017052965795930148, + "loss": 0.1366, + "step": 10218 + }, + { + "epoch": 0.7373281864425123, + "grad_norm": 0.10166176408529282, + "learning_rate": 0.00017052677153990475, + "loss": 0.1273, + "step": 10219 + }, + { + "epoch": 0.7374003391175727, + "grad_norm": 0.13539794087409973, + "learning_rate": 0.000170523885120508, + "loss": 0.1311, + "step": 10220 + }, + { + "epoch": 0.7374724917926332, + "grad_norm": 0.10737601667642593, + "learning_rate": 0.0001705209987011113, + "loss": 0.1346, + "step": 10221 + }, + { + "epoch": 0.7375446444676936, + "grad_norm": 0.11928819864988327, + "learning_rate": 0.00017051811228171456, + "loss": 0.2049, + "step": 10222 + }, + { + "epoch": 0.7376167971427541, + "grad_norm": 0.12231762707233429, + "learning_rate": 0.0001705152258623178, + "loss": 0.1663, + "step": 10223 + }, + { + "epoch": 0.7376889498178145, + "grad_norm": 0.12005126476287842, + "learning_rate": 0.00017051233944292106, + "loss": 0.1627, + "step": 10224 + }, + { + "epoch": 0.737761102492875, + "grad_norm": 0.14178934693336487, + "learning_rate": 0.00017050945302352432, + "loss": 0.1194, + "step": 10225 + }, + { + "epoch": 0.7378332551679353, + "grad_norm": 0.12870538234710693, + "learning_rate": 0.00017050656660412759, + "loss": 0.1542, + "step": 10226 + }, + { + "epoch": 0.7379054078429957, + "grad_norm": 0.11229836195707321, + "learning_rate": 0.00017050368018473085, + "loss": 0.1437, + "step": 10227 + }, + { + "epoch": 0.7379775605180562, + "grad_norm": 0.1517128050327301, + "learning_rate": 0.0001705007937653341, + "loss": 0.1417, + "step": 10228 + }, + { + "epoch": 0.7380497131931166, + "grad_norm": 0.12081311643123627, + "learning_rate": 0.00017049790734593738, + "loss": 0.1452, + "step": 10229 + }, + { + "epoch": 0.7381218658681771, + "grad_norm": 0.1069989800453186, + "learning_rate": 0.00017049502092654064, + "loss": 0.107, + "step": 10230 + }, + { + "epoch": 0.7381940185432375, + "grad_norm": 0.11483065038919449, + "learning_rate": 0.0001704921345071439, + "loss": 0.1321, + "step": 10231 + }, + { + "epoch": 0.738266171218298, + "grad_norm": 0.15927647054195404, + "learning_rate": 0.00017048924808774716, + "loss": 0.1807, + "step": 10232 + }, + { + "epoch": 0.7383383238933583, + "grad_norm": 0.12965205311775208, + "learning_rate": 0.00017048636166835043, + "loss": 0.1774, + "step": 10233 + }, + { + "epoch": 0.7384104765684187, + "grad_norm": 0.11923392862081528, + "learning_rate": 0.00017048347524895366, + "loss": 0.1272, + "step": 10234 + }, + { + "epoch": 0.7384826292434792, + "grad_norm": 0.11872682720422745, + "learning_rate": 0.00017048058882955695, + "loss": 0.1845, + "step": 10235 + }, + { + "epoch": 0.7385547819185396, + "grad_norm": 0.1188027635216713, + "learning_rate": 0.00017047770241016022, + "loss": 0.156, + "step": 10236 + }, + { + "epoch": 0.7386269345936001, + "grad_norm": 0.11587867140769958, + "learning_rate": 0.00017047481599076348, + "loss": 0.1246, + "step": 10237 + }, + { + "epoch": 0.7386990872686605, + "grad_norm": 0.10874349623918533, + "learning_rate": 0.00017047192957136674, + "loss": 0.1273, + "step": 10238 + }, + { + "epoch": 0.738771239943721, + "grad_norm": 0.12459474056959152, + "learning_rate": 0.00017046904315196998, + "loss": 0.2181, + "step": 10239 + }, + { + "epoch": 0.7388433926187813, + "grad_norm": 0.11925427615642548, + "learning_rate": 0.00017046615673257324, + "loss": 0.1334, + "step": 10240 + }, + { + "epoch": 0.7389155452938417, + "grad_norm": 0.09950932115316391, + "learning_rate": 0.0001704632703131765, + "loss": 0.1162, + "step": 10241 + }, + { + "epoch": 0.7389876979689022, + "grad_norm": 0.15613147616386414, + "learning_rate": 0.0001704603838937798, + "loss": 0.1572, + "step": 10242 + }, + { + "epoch": 0.7390598506439626, + "grad_norm": 0.10930442810058594, + "learning_rate": 0.00017045749747438306, + "loss": 0.1569, + "step": 10243 + }, + { + "epoch": 0.7391320033190231, + "grad_norm": 0.14993628859519958, + "learning_rate": 0.0001704546110549863, + "loss": 0.116, + "step": 10244 + }, + { + "epoch": 0.7392041559940835, + "grad_norm": 0.12143708020448685, + "learning_rate": 0.00017045172463558955, + "loss": 0.1969, + "step": 10245 + }, + { + "epoch": 0.739276308669144, + "grad_norm": 0.13101693987846375, + "learning_rate": 0.00017044883821619282, + "loss": 0.1173, + "step": 10246 + }, + { + "epoch": 0.7393484613442043, + "grad_norm": 0.14847607910633087, + "learning_rate": 0.00017044595179679608, + "loss": 0.1377, + "step": 10247 + }, + { + "epoch": 0.7394206140192647, + "grad_norm": 0.14123043417930603, + "learning_rate": 0.00017044306537739934, + "loss": 0.2099, + "step": 10248 + }, + { + "epoch": 0.7394927666943252, + "grad_norm": 0.13185706734657288, + "learning_rate": 0.0001704401789580026, + "loss": 0.1807, + "step": 10249 + }, + { + "epoch": 0.7395649193693856, + "grad_norm": 0.12274592369794846, + "learning_rate": 0.00017043729253860587, + "loss": 0.1357, + "step": 10250 + }, + { + "epoch": 0.7396370720444461, + "grad_norm": 0.12145980447530746, + "learning_rate": 0.00017043440611920913, + "loss": 0.1451, + "step": 10251 + }, + { + "epoch": 0.7397092247195065, + "grad_norm": 0.15183459222316742, + "learning_rate": 0.0001704315196998124, + "loss": 0.1306, + "step": 10252 + }, + { + "epoch": 0.739781377394567, + "grad_norm": 0.17286445200443268, + "learning_rate": 0.00017042863328041566, + "loss": 0.1253, + "step": 10253 + }, + { + "epoch": 0.7398535300696273, + "grad_norm": 0.12338947504758835, + "learning_rate": 0.00017042574686101892, + "loss": 0.1671, + "step": 10254 + }, + { + "epoch": 0.7399256827446877, + "grad_norm": 0.13943032920360565, + "learning_rate": 0.00017042286044162216, + "loss": 0.1527, + "step": 10255 + }, + { + "epoch": 0.7399978354197482, + "grad_norm": 0.11988866329193115, + "learning_rate": 0.00017041997402222545, + "loss": 0.1655, + "step": 10256 + }, + { + "epoch": 0.7400699880948086, + "grad_norm": 0.1152622401714325, + "learning_rate": 0.0001704170876028287, + "loss": 0.1276, + "step": 10257 + }, + { + "epoch": 0.7401421407698691, + "grad_norm": 0.15951548516750336, + "learning_rate": 0.00017041420118343197, + "loss": 0.1386, + "step": 10258 + }, + { + "epoch": 0.7402142934449295, + "grad_norm": 0.12054353952407837, + "learning_rate": 0.00017041131476403524, + "loss": 0.1663, + "step": 10259 + }, + { + "epoch": 0.74028644611999, + "grad_norm": 0.11046476662158966, + "learning_rate": 0.00017040842834463847, + "loss": 0.1466, + "step": 10260 + }, + { + "epoch": 0.7403585987950503, + "grad_norm": 0.10996885597705841, + "learning_rate": 0.00017040554192524173, + "loss": 0.1292, + "step": 10261 + }, + { + "epoch": 0.7404307514701107, + "grad_norm": 0.16070950031280518, + "learning_rate": 0.000170402655505845, + "loss": 0.2011, + "step": 10262 + }, + { + "epoch": 0.7405029041451712, + "grad_norm": 0.11673824489116669, + "learning_rate": 0.0001703997690864483, + "loss": 0.1772, + "step": 10263 + }, + { + "epoch": 0.7405750568202316, + "grad_norm": 0.11503014713525772, + "learning_rate": 0.00017039688266705155, + "loss": 0.1636, + "step": 10264 + }, + { + "epoch": 0.7406472094952921, + "grad_norm": 0.1217779591679573, + "learning_rate": 0.00017039399624765479, + "loss": 0.1235, + "step": 10265 + }, + { + "epoch": 0.7407193621703525, + "grad_norm": 0.14887690544128418, + "learning_rate": 0.00017039110982825805, + "loss": 0.1662, + "step": 10266 + }, + { + "epoch": 0.740791514845413, + "grad_norm": 0.15789437294006348, + "learning_rate": 0.0001703882234088613, + "loss": 0.1578, + "step": 10267 + }, + { + "epoch": 0.7408636675204733, + "grad_norm": 0.149520143866539, + "learning_rate": 0.00017038533698946457, + "loss": 0.1287, + "step": 10268 + }, + { + "epoch": 0.7409358201955337, + "grad_norm": 0.1349458545446396, + "learning_rate": 0.00017038245057006784, + "loss": 0.1216, + "step": 10269 + }, + { + "epoch": 0.7410079728705942, + "grad_norm": 0.12127988785505295, + "learning_rate": 0.0001703795641506711, + "loss": 0.1378, + "step": 10270 + }, + { + "epoch": 0.7410801255456546, + "grad_norm": 0.12569235265254974, + "learning_rate": 0.00017037667773127436, + "loss": 0.1641, + "step": 10271 + }, + { + "epoch": 0.741152278220715, + "grad_norm": 0.15909765660762787, + "learning_rate": 0.00017037379131187763, + "loss": 0.1541, + "step": 10272 + }, + { + "epoch": 0.7412244308957755, + "grad_norm": 0.13231971859931946, + "learning_rate": 0.0001703709048924809, + "loss": 0.1246, + "step": 10273 + }, + { + "epoch": 0.7412965835708359, + "grad_norm": 0.13595962524414062, + "learning_rate": 0.00017036801847308415, + "loss": 0.1856, + "step": 10274 + }, + { + "epoch": 0.7413687362458963, + "grad_norm": 0.11069751530885696, + "learning_rate": 0.00017036513205368742, + "loss": 0.1249, + "step": 10275 + }, + { + "epoch": 0.7414408889209567, + "grad_norm": 0.1497296690940857, + "learning_rate": 0.00017036224563429065, + "loss": 0.131, + "step": 10276 + }, + { + "epoch": 0.7415130415960172, + "grad_norm": 0.13204748928546906, + "learning_rate": 0.00017035935921489394, + "loss": 0.1537, + "step": 10277 + }, + { + "epoch": 0.7415851942710776, + "grad_norm": 0.1311349719762802, + "learning_rate": 0.0001703564727954972, + "loss": 0.1527, + "step": 10278 + }, + { + "epoch": 0.741657346946138, + "grad_norm": 0.13212205469608307, + "learning_rate": 0.00017035358637610047, + "loss": 0.1667, + "step": 10279 + }, + { + "epoch": 0.7417294996211985, + "grad_norm": 0.1319914013147354, + "learning_rate": 0.00017035069995670373, + "loss": 0.1303, + "step": 10280 + }, + { + "epoch": 0.7418016522962588, + "grad_norm": 0.14723023772239685, + "learning_rate": 0.00017034781353730697, + "loss": 0.1533, + "step": 10281 + }, + { + "epoch": 0.7418738049713193, + "grad_norm": 0.107989601790905, + "learning_rate": 0.00017034492711791023, + "loss": 0.1736, + "step": 10282 + }, + { + "epoch": 0.7419459576463797, + "grad_norm": 0.1135607436299324, + "learning_rate": 0.0001703420406985135, + "loss": 0.1548, + "step": 10283 + }, + { + "epoch": 0.7420181103214402, + "grad_norm": 0.12351679801940918, + "learning_rate": 0.00017033915427911678, + "loss": 0.1967, + "step": 10284 + }, + { + "epoch": 0.7420902629965006, + "grad_norm": 0.12371385842561722, + "learning_rate": 0.00017033626785972004, + "loss": 0.1234, + "step": 10285 + }, + { + "epoch": 0.742162415671561, + "grad_norm": 0.09827817231416702, + "learning_rate": 0.00017033338144032328, + "loss": 0.1501, + "step": 10286 + }, + { + "epoch": 0.7422345683466215, + "grad_norm": 0.14295990765094757, + "learning_rate": 0.00017033049502092654, + "loss": 0.1303, + "step": 10287 + }, + { + "epoch": 0.7423067210216818, + "grad_norm": 0.11243745684623718, + "learning_rate": 0.0001703276086015298, + "loss": 0.1327, + "step": 10288 + }, + { + "epoch": 0.7423788736967423, + "grad_norm": 0.10786975920200348, + "learning_rate": 0.00017032472218213307, + "loss": 0.1222, + "step": 10289 + }, + { + "epoch": 0.7424510263718027, + "grad_norm": 0.1218876987695694, + "learning_rate": 0.00017032183576273633, + "loss": 0.1701, + "step": 10290 + }, + { + "epoch": 0.7425231790468632, + "grad_norm": 0.11511393636465073, + "learning_rate": 0.0001703189493433396, + "loss": 0.1492, + "step": 10291 + }, + { + "epoch": 0.7425953317219236, + "grad_norm": 0.12704148888587952, + "learning_rate": 0.00017031606292394286, + "loss": 0.1061, + "step": 10292 + }, + { + "epoch": 0.742667484396984, + "grad_norm": 0.17741963267326355, + "learning_rate": 0.00017031317650454612, + "loss": 0.1653, + "step": 10293 + }, + { + "epoch": 0.7427396370720445, + "grad_norm": 0.1387127786874771, + "learning_rate": 0.00017031029008514938, + "loss": 0.1746, + "step": 10294 + }, + { + "epoch": 0.7428117897471048, + "grad_norm": 0.1183629259467125, + "learning_rate": 0.00017030740366575265, + "loss": 0.1156, + "step": 10295 + }, + { + "epoch": 0.7428839424221653, + "grad_norm": 0.13635767996311188, + "learning_rate": 0.0001703045172463559, + "loss": 0.1429, + "step": 10296 + }, + { + "epoch": 0.7429560950972257, + "grad_norm": 0.13225290179252625, + "learning_rate": 0.00017030163082695915, + "loss": 0.1052, + "step": 10297 + }, + { + "epoch": 0.7430282477722862, + "grad_norm": 0.13401448726654053, + "learning_rate": 0.0001702987444075624, + "loss": 0.1197, + "step": 10298 + }, + { + "epoch": 0.7431004004473466, + "grad_norm": 0.1291186511516571, + "learning_rate": 0.0001702958579881657, + "loss": 0.1424, + "step": 10299 + }, + { + "epoch": 0.743172553122407, + "grad_norm": 0.11414799094200134, + "learning_rate": 0.00017029297156876896, + "loss": 0.2152, + "step": 10300 + }, + { + "epoch": 0.7432447057974675, + "grad_norm": 0.10049829632043839, + "learning_rate": 0.00017029008514937222, + "loss": 0.2259, + "step": 10301 + }, + { + "epoch": 0.7433168584725278, + "grad_norm": 0.1451665759086609, + "learning_rate": 0.00017028719872997546, + "loss": 0.1854, + "step": 10302 + }, + { + "epoch": 0.7433890111475883, + "grad_norm": 0.12422867119312286, + "learning_rate": 0.00017028431231057872, + "loss": 0.1514, + "step": 10303 + }, + { + "epoch": 0.7434611638226487, + "grad_norm": 0.12117260694503784, + "learning_rate": 0.00017028142589118199, + "loss": 0.1644, + "step": 10304 + }, + { + "epoch": 0.7435333164977092, + "grad_norm": 0.1230030283331871, + "learning_rate": 0.00017027853947178525, + "loss": 0.1242, + "step": 10305 + }, + { + "epoch": 0.7436054691727696, + "grad_norm": 0.12640705704689026, + "learning_rate": 0.00017027565305238854, + "loss": 0.1934, + "step": 10306 + }, + { + "epoch": 0.74367762184783, + "grad_norm": 0.10742591321468353, + "learning_rate": 0.00017027276663299177, + "loss": 0.1581, + "step": 10307 + }, + { + "epoch": 0.7437497745228905, + "grad_norm": 0.12577266991138458, + "learning_rate": 0.00017026988021359504, + "loss": 0.1343, + "step": 10308 + }, + { + "epoch": 0.7438219271979508, + "grad_norm": 0.1116676852107048, + "learning_rate": 0.0001702669937941983, + "loss": 0.099, + "step": 10309 + }, + { + "epoch": 0.7438940798730113, + "grad_norm": 0.13507837057113647, + "learning_rate": 0.00017026410737480156, + "loss": 0.1237, + "step": 10310 + }, + { + "epoch": 0.7439662325480717, + "grad_norm": 0.15094028413295746, + "learning_rate": 0.00017026122095540483, + "loss": 0.1319, + "step": 10311 + }, + { + "epoch": 0.7440383852231321, + "grad_norm": 0.150686115026474, + "learning_rate": 0.0001702583345360081, + "loss": 0.1256, + "step": 10312 + }, + { + "epoch": 0.7441105378981926, + "grad_norm": 0.11062533408403397, + "learning_rate": 0.00017025544811661135, + "loss": 0.1243, + "step": 10313 + }, + { + "epoch": 0.744182690573253, + "grad_norm": 0.12666769325733185, + "learning_rate": 0.00017025256169721461, + "loss": 0.1767, + "step": 10314 + }, + { + "epoch": 0.7442548432483135, + "grad_norm": 0.12366464734077454, + "learning_rate": 0.00017024967527781788, + "loss": 0.15, + "step": 10315 + }, + { + "epoch": 0.7443269959233738, + "grad_norm": 0.14813701808452606, + "learning_rate": 0.00017024678885842114, + "loss": 0.1266, + "step": 10316 + }, + { + "epoch": 0.7443991485984343, + "grad_norm": 0.12138532847166061, + "learning_rate": 0.0001702439024390244, + "loss": 0.1147, + "step": 10317 + }, + { + "epoch": 0.7444713012734947, + "grad_norm": 0.12905026972293854, + "learning_rate": 0.00017024101601962764, + "loss": 0.0911, + "step": 10318 + }, + { + "epoch": 0.7445434539485551, + "grad_norm": 0.13295404613018036, + "learning_rate": 0.0001702381296002309, + "loss": 0.1484, + "step": 10319 + }, + { + "epoch": 0.7446156066236156, + "grad_norm": 0.15890981256961823, + "learning_rate": 0.0001702352431808342, + "loss": 0.1794, + "step": 10320 + }, + { + "epoch": 0.744687759298676, + "grad_norm": 0.11637790501117706, + "learning_rate": 0.00017023235676143746, + "loss": 0.1104, + "step": 10321 + }, + { + "epoch": 0.7447599119737365, + "grad_norm": 0.1495571881532669, + "learning_rate": 0.00017022947034204072, + "loss": 0.1615, + "step": 10322 + }, + { + "epoch": 0.7448320646487968, + "grad_norm": 0.12739881873130798, + "learning_rate": 0.00017022658392264395, + "loss": 0.1152, + "step": 10323 + }, + { + "epoch": 0.7449042173238573, + "grad_norm": 0.13501571118831635, + "learning_rate": 0.00017022369750324722, + "loss": 0.1711, + "step": 10324 + }, + { + "epoch": 0.7449763699989177, + "grad_norm": 0.1249973401427269, + "learning_rate": 0.00017022081108385048, + "loss": 0.1366, + "step": 10325 + }, + { + "epoch": 0.7450485226739781, + "grad_norm": 0.13027076423168182, + "learning_rate": 0.00017021792466445374, + "loss": 0.1628, + "step": 10326 + }, + { + "epoch": 0.7451206753490386, + "grad_norm": 0.1260368376970291, + "learning_rate": 0.00017021503824505703, + "loss": 0.1419, + "step": 10327 + }, + { + "epoch": 0.745192828024099, + "grad_norm": 0.12586842477321625, + "learning_rate": 0.00017021215182566027, + "loss": 0.0825, + "step": 10328 + }, + { + "epoch": 0.7452649806991595, + "grad_norm": 0.13267827033996582, + "learning_rate": 0.00017020926540626353, + "loss": 0.1424, + "step": 10329 + }, + { + "epoch": 0.7453371333742198, + "grad_norm": 0.1316024363040924, + "learning_rate": 0.0001702063789868668, + "loss": 0.1184, + "step": 10330 + }, + { + "epoch": 0.7454092860492803, + "grad_norm": 0.12148045748472214, + "learning_rate": 0.00017020349256747006, + "loss": 0.1377, + "step": 10331 + }, + { + "epoch": 0.7454814387243407, + "grad_norm": 0.1150650829076767, + "learning_rate": 0.00017020060614807332, + "loss": 0.209, + "step": 10332 + }, + { + "epoch": 0.7455535913994011, + "grad_norm": 0.14753451943397522, + "learning_rate": 0.00017019771972867658, + "loss": 0.1344, + "step": 10333 + }, + { + "epoch": 0.7456257440744616, + "grad_norm": 0.1898769736289978, + "learning_rate": 0.00017019483330927985, + "loss": 0.1278, + "step": 10334 + }, + { + "epoch": 0.745697896749522, + "grad_norm": 0.1436893194913864, + "learning_rate": 0.0001701919468898831, + "loss": 0.1581, + "step": 10335 + }, + { + "epoch": 0.7457700494245825, + "grad_norm": 0.10907220095396042, + "learning_rate": 0.00017018906047048637, + "loss": 0.1198, + "step": 10336 + }, + { + "epoch": 0.7458422020996428, + "grad_norm": 0.09962081164121628, + "learning_rate": 0.00017018617405108964, + "loss": 0.1721, + "step": 10337 + }, + { + "epoch": 0.7459143547747032, + "grad_norm": 0.10710161179304123, + "learning_rate": 0.0001701832876316929, + "loss": 0.1383, + "step": 10338 + }, + { + "epoch": 0.7459865074497637, + "grad_norm": 0.13437522947788239, + "learning_rate": 0.00017018040121229616, + "loss": 0.1758, + "step": 10339 + }, + { + "epoch": 0.7460586601248241, + "grad_norm": 0.19935932755470276, + "learning_rate": 0.0001701775147928994, + "loss": 0.1387, + "step": 10340 + }, + { + "epoch": 0.7461308127998846, + "grad_norm": 0.12037497013807297, + "learning_rate": 0.0001701746283735027, + "loss": 0.1502, + "step": 10341 + }, + { + "epoch": 0.746202965474945, + "grad_norm": 0.17062495648860931, + "learning_rate": 0.00017017174195410595, + "loss": 0.182, + "step": 10342 + }, + { + "epoch": 0.7462751181500054, + "grad_norm": 0.11846476793289185, + "learning_rate": 0.0001701688555347092, + "loss": 0.1571, + "step": 10343 + }, + { + "epoch": 0.7463472708250658, + "grad_norm": 0.12169267237186432, + "learning_rate": 0.00017016596911531248, + "loss": 0.1633, + "step": 10344 + }, + { + "epoch": 0.7464194235001262, + "grad_norm": 0.1215028464794159, + "learning_rate": 0.0001701630826959157, + "loss": 0.1245, + "step": 10345 + }, + { + "epoch": 0.7464915761751867, + "grad_norm": 0.13800732791423798, + "learning_rate": 0.00017016019627651897, + "loss": 0.1209, + "step": 10346 + }, + { + "epoch": 0.7465637288502471, + "grad_norm": 0.1455557644367218, + "learning_rate": 0.00017015730985712224, + "loss": 0.106, + "step": 10347 + }, + { + "epoch": 0.7466358815253076, + "grad_norm": 0.11333151161670685, + "learning_rate": 0.00017015442343772553, + "loss": 0.1287, + "step": 10348 + }, + { + "epoch": 0.746708034200368, + "grad_norm": 0.12079031765460968, + "learning_rate": 0.0001701515370183288, + "loss": 0.1287, + "step": 10349 + }, + { + "epoch": 0.7467801868754284, + "grad_norm": 0.12342115491628647, + "learning_rate": 0.00017014865059893203, + "loss": 0.1223, + "step": 10350 + }, + { + "epoch": 0.7468523395504888, + "grad_norm": 0.1584462970495224, + "learning_rate": 0.0001701457641795353, + "loss": 0.1633, + "step": 10351 + }, + { + "epoch": 0.7469244922255492, + "grad_norm": 0.15562903881072998, + "learning_rate": 0.00017014287776013855, + "loss": 0.1489, + "step": 10352 + }, + { + "epoch": 0.7469966449006097, + "grad_norm": 0.22772414982318878, + "learning_rate": 0.00017013999134074181, + "loss": 0.2305, + "step": 10353 + }, + { + "epoch": 0.7470687975756701, + "grad_norm": 0.15519371628761292, + "learning_rate": 0.00017013710492134508, + "loss": 0.0979, + "step": 10354 + }, + { + "epoch": 0.7471409502507306, + "grad_norm": 0.11990796774625778, + "learning_rate": 0.00017013421850194834, + "loss": 0.1244, + "step": 10355 + }, + { + "epoch": 0.747213102925791, + "grad_norm": 0.13046614825725555, + "learning_rate": 0.0001701313320825516, + "loss": 0.1048, + "step": 10356 + }, + { + "epoch": 0.7472852556008514, + "grad_norm": 0.13911251723766327, + "learning_rate": 0.00017012844566315487, + "loss": 0.1657, + "step": 10357 + }, + { + "epoch": 0.7473574082759118, + "grad_norm": 0.16226904094219208, + "learning_rate": 0.00017012555924375813, + "loss": 0.1899, + "step": 10358 + }, + { + "epoch": 0.7474295609509722, + "grad_norm": 0.14956645667552948, + "learning_rate": 0.0001701226728243614, + "loss": 0.1472, + "step": 10359 + }, + { + "epoch": 0.7475017136260327, + "grad_norm": 0.11555131524801254, + "learning_rate": 0.00017011978640496466, + "loss": 0.1899, + "step": 10360 + }, + { + "epoch": 0.7475738663010931, + "grad_norm": 0.13167062401771545, + "learning_rate": 0.0001701168999855679, + "loss": 0.1666, + "step": 10361 + }, + { + "epoch": 0.7476460189761536, + "grad_norm": 0.1495990753173828, + "learning_rate": 0.00017011401356617118, + "loss": 0.1483, + "step": 10362 + }, + { + "epoch": 0.747718171651214, + "grad_norm": 0.13280245661735535, + "learning_rate": 0.00017011112714677444, + "loss": 0.1396, + "step": 10363 + }, + { + "epoch": 0.7477903243262743, + "grad_norm": 0.14321625232696533, + "learning_rate": 0.0001701082407273777, + "loss": 0.1656, + "step": 10364 + }, + { + "epoch": 0.7478624770013348, + "grad_norm": 0.14525526762008667, + "learning_rate": 0.00017010535430798097, + "loss": 0.132, + "step": 10365 + }, + { + "epoch": 0.7479346296763952, + "grad_norm": 0.12012545019388199, + "learning_rate": 0.0001701024678885842, + "loss": 0.1542, + "step": 10366 + }, + { + "epoch": 0.7480067823514557, + "grad_norm": 0.11860865354537964, + "learning_rate": 0.00017009958146918747, + "loss": 0.1519, + "step": 10367 + }, + { + "epoch": 0.7480789350265161, + "grad_norm": 0.11256777495145798, + "learning_rate": 0.00017009669504979073, + "loss": 0.1418, + "step": 10368 + }, + { + "epoch": 0.7481510877015766, + "grad_norm": 0.12375520169734955, + "learning_rate": 0.00017009380863039402, + "loss": 0.1606, + "step": 10369 + }, + { + "epoch": 0.748223240376637, + "grad_norm": 0.1487514227628708, + "learning_rate": 0.00017009092221099728, + "loss": 0.1531, + "step": 10370 + }, + { + "epoch": 0.7482953930516973, + "grad_norm": 0.1397939920425415, + "learning_rate": 0.00017008803579160052, + "loss": 0.1043, + "step": 10371 + }, + { + "epoch": 0.7483675457267578, + "grad_norm": 0.15424096584320068, + "learning_rate": 0.00017008514937220378, + "loss": 0.1452, + "step": 10372 + }, + { + "epoch": 0.7484396984018182, + "grad_norm": 0.16209803521633148, + "learning_rate": 0.00017008226295280705, + "loss": 0.1851, + "step": 10373 + }, + { + "epoch": 0.7485118510768787, + "grad_norm": 0.10724832862615585, + "learning_rate": 0.0001700793765334103, + "loss": 0.1372, + "step": 10374 + }, + { + "epoch": 0.7485840037519391, + "grad_norm": 0.1410454362630844, + "learning_rate": 0.00017007649011401357, + "loss": 0.1291, + "step": 10375 + }, + { + "epoch": 0.7486561564269996, + "grad_norm": 0.11869491636753082, + "learning_rate": 0.00017007360369461683, + "loss": 0.1405, + "step": 10376 + }, + { + "epoch": 0.74872830910206, + "grad_norm": 0.12462595850229263, + "learning_rate": 0.0001700707172752201, + "loss": 0.1124, + "step": 10377 + }, + { + "epoch": 0.7488004617771203, + "grad_norm": 0.148727685213089, + "learning_rate": 0.00017006783085582336, + "loss": 0.1731, + "step": 10378 + }, + { + "epoch": 0.7488726144521808, + "grad_norm": 0.1503356695175171, + "learning_rate": 0.00017006494443642662, + "loss": 0.1683, + "step": 10379 + }, + { + "epoch": 0.7489447671272412, + "grad_norm": 0.11363843083381653, + "learning_rate": 0.0001700620580170299, + "loss": 0.1039, + "step": 10380 + }, + { + "epoch": 0.7490169198023017, + "grad_norm": 0.10472255200147629, + "learning_rate": 0.00017005917159763315, + "loss": 0.1464, + "step": 10381 + }, + { + "epoch": 0.7490890724773621, + "grad_norm": 0.11832225322723389, + "learning_rate": 0.00017005628517823639, + "loss": 0.139, + "step": 10382 + }, + { + "epoch": 0.7491612251524226, + "grad_norm": 0.16063863039016724, + "learning_rate": 0.00017005339875883968, + "loss": 0.1847, + "step": 10383 + }, + { + "epoch": 0.749233377827483, + "grad_norm": 0.11803101003170013, + "learning_rate": 0.00017005051233944294, + "loss": 0.1772, + "step": 10384 + }, + { + "epoch": 0.7493055305025433, + "grad_norm": 0.11719434708356857, + "learning_rate": 0.0001700476259200462, + "loss": 0.1745, + "step": 10385 + }, + { + "epoch": 0.7493776831776038, + "grad_norm": 0.1297706663608551, + "learning_rate": 0.00017004473950064946, + "loss": 0.1692, + "step": 10386 + }, + { + "epoch": 0.7494498358526642, + "grad_norm": 0.13485105335712433, + "learning_rate": 0.0001700418530812527, + "loss": 0.1523, + "step": 10387 + }, + { + "epoch": 0.7495219885277247, + "grad_norm": 0.15161189436912537, + "learning_rate": 0.00017003896666185596, + "loss": 0.1668, + "step": 10388 + }, + { + "epoch": 0.7495941412027851, + "grad_norm": 0.12917101383209229, + "learning_rate": 0.00017003608024245923, + "loss": 0.1824, + "step": 10389 + }, + { + "epoch": 0.7496662938778456, + "grad_norm": 0.12237484008073807, + "learning_rate": 0.00017003319382306252, + "loss": 0.1413, + "step": 10390 + }, + { + "epoch": 0.749738446552906, + "grad_norm": 0.12186992168426514, + "learning_rate": 0.00017003030740366578, + "loss": 0.1002, + "step": 10391 + }, + { + "epoch": 0.7498105992279663, + "grad_norm": 0.12338005006313324, + "learning_rate": 0.00017002742098426901, + "loss": 0.1595, + "step": 10392 + }, + { + "epoch": 0.7498827519030268, + "grad_norm": 0.12393818795681, + "learning_rate": 0.00017002453456487228, + "loss": 0.1045, + "step": 10393 + }, + { + "epoch": 0.7499549045780872, + "grad_norm": 0.1437109261751175, + "learning_rate": 0.00017002164814547554, + "loss": 0.134, + "step": 10394 + }, + { + "epoch": 0.7500270572531477, + "grad_norm": 0.12134461849927902, + "learning_rate": 0.0001700187617260788, + "loss": 0.1285, + "step": 10395 + }, + { + "epoch": 0.7500992099282081, + "grad_norm": 0.14531710743904114, + "learning_rate": 0.00017001587530668207, + "loss": 0.1368, + "step": 10396 + }, + { + "epoch": 0.7501713626032686, + "grad_norm": 0.10627005249261856, + "learning_rate": 0.00017001298888728533, + "loss": 0.157, + "step": 10397 + }, + { + "epoch": 0.750243515278329, + "grad_norm": 0.15446774661540985, + "learning_rate": 0.0001700101024678886, + "loss": 0.1568, + "step": 10398 + }, + { + "epoch": 0.7503156679533893, + "grad_norm": 0.12825755774974823, + "learning_rate": 0.00017000721604849185, + "loss": 0.125, + "step": 10399 + }, + { + "epoch": 0.7503878206284498, + "grad_norm": 0.12873336672782898, + "learning_rate": 0.00017000432962909512, + "loss": 0.1751, + "step": 10400 + }, + { + "epoch": 0.7504599733035102, + "grad_norm": 0.1349041610956192, + "learning_rate": 0.00017000144320969838, + "loss": 0.222, + "step": 10401 + }, + { + "epoch": 0.7505321259785707, + "grad_norm": 0.11576125025749207, + "learning_rate": 0.00016999855679030164, + "loss": 0.1801, + "step": 10402 + }, + { + "epoch": 0.7506042786536311, + "grad_norm": 0.16132929921150208, + "learning_rate": 0.00016999567037090488, + "loss": 0.1499, + "step": 10403 + }, + { + "epoch": 0.7506764313286916, + "grad_norm": 0.09981893748044968, + "learning_rate": 0.00016999278395150817, + "loss": 0.1179, + "step": 10404 + }, + { + "epoch": 0.7507485840037519, + "grad_norm": 0.1367056667804718, + "learning_rate": 0.00016998989753211143, + "loss": 0.1519, + "step": 10405 + }, + { + "epoch": 0.7508207366788123, + "grad_norm": 0.11750783026218414, + "learning_rate": 0.0001699870111127147, + "loss": 0.1608, + "step": 10406 + }, + { + "epoch": 0.7508928893538728, + "grad_norm": 0.10363408923149109, + "learning_rate": 0.00016998412469331796, + "loss": 0.1685, + "step": 10407 + }, + { + "epoch": 0.7509650420289332, + "grad_norm": 0.12339738011360168, + "learning_rate": 0.0001699812382739212, + "loss": 0.1216, + "step": 10408 + }, + { + "epoch": 0.7510371947039937, + "grad_norm": 0.15196317434310913, + "learning_rate": 0.00016997835185452446, + "loss": 0.1887, + "step": 10409 + }, + { + "epoch": 0.7511093473790541, + "grad_norm": 0.1309935450553894, + "learning_rate": 0.00016997546543512772, + "loss": 0.1506, + "step": 10410 + }, + { + "epoch": 0.7511815000541145, + "grad_norm": 0.1456453651189804, + "learning_rate": 0.000169972579015731, + "loss": 0.1851, + "step": 10411 + }, + { + "epoch": 0.7512536527291749, + "grad_norm": 0.12276160717010498, + "learning_rate": 0.00016996969259633427, + "loss": 0.1703, + "step": 10412 + }, + { + "epoch": 0.7513258054042353, + "grad_norm": 0.12643969058990479, + "learning_rate": 0.0001699668061769375, + "loss": 0.1499, + "step": 10413 + }, + { + "epoch": 0.7513979580792958, + "grad_norm": 0.11966102570295334, + "learning_rate": 0.00016996391975754077, + "loss": 0.1294, + "step": 10414 + }, + { + "epoch": 0.7514701107543562, + "grad_norm": 0.12556810677051544, + "learning_rate": 0.00016996103333814403, + "loss": 0.1216, + "step": 10415 + }, + { + "epoch": 0.7515422634294167, + "grad_norm": 0.12193518131971359, + "learning_rate": 0.0001699581469187473, + "loss": 0.1218, + "step": 10416 + }, + { + "epoch": 0.7516144161044771, + "grad_norm": 0.11967793107032776, + "learning_rate": 0.00016995526049935056, + "loss": 0.1809, + "step": 10417 + }, + { + "epoch": 0.7516865687795375, + "grad_norm": 0.1303376853466034, + "learning_rate": 0.00016995237407995382, + "loss": 0.156, + "step": 10418 + }, + { + "epoch": 0.7517587214545979, + "grad_norm": 0.11569512635469437, + "learning_rate": 0.00016994948766055709, + "loss": 0.1307, + "step": 10419 + }, + { + "epoch": 0.7518308741296583, + "grad_norm": 0.1179572120308876, + "learning_rate": 0.00016994660124116035, + "loss": 0.1423, + "step": 10420 + }, + { + "epoch": 0.7519030268047188, + "grad_norm": 0.13549353182315826, + "learning_rate": 0.0001699437148217636, + "loss": 0.1471, + "step": 10421 + }, + { + "epoch": 0.7519751794797792, + "grad_norm": 0.13920718431472778, + "learning_rate": 0.00016994082840236688, + "loss": 0.1169, + "step": 10422 + }, + { + "epoch": 0.7520473321548397, + "grad_norm": 0.14348579943180084, + "learning_rate": 0.00016993794198297014, + "loss": 0.1132, + "step": 10423 + }, + { + "epoch": 0.7521194848299001, + "grad_norm": 0.12128528952598572, + "learning_rate": 0.00016993505556357337, + "loss": 0.1564, + "step": 10424 + }, + { + "epoch": 0.7521916375049605, + "grad_norm": 0.13051150739192963, + "learning_rate": 0.00016993216914417666, + "loss": 0.1195, + "step": 10425 + }, + { + "epoch": 0.7522637901800209, + "grad_norm": 0.14699678122997284, + "learning_rate": 0.00016992928272477993, + "loss": 0.1884, + "step": 10426 + }, + { + "epoch": 0.7523359428550813, + "grad_norm": 0.12040664255619049, + "learning_rate": 0.0001699263963053832, + "loss": 0.1212, + "step": 10427 + }, + { + "epoch": 0.7524080955301418, + "grad_norm": 0.16848629713058472, + "learning_rate": 0.00016992350988598645, + "loss": 0.1749, + "step": 10428 + }, + { + "epoch": 0.7524802482052022, + "grad_norm": 0.13176479935646057, + "learning_rate": 0.0001699206234665897, + "loss": 0.178, + "step": 10429 + }, + { + "epoch": 0.7525524008802627, + "grad_norm": 0.12817110121250153, + "learning_rate": 0.00016991773704719295, + "loss": 0.1494, + "step": 10430 + }, + { + "epoch": 0.7526245535553231, + "grad_norm": 0.11498573422431946, + "learning_rate": 0.00016991485062779621, + "loss": 0.1385, + "step": 10431 + }, + { + "epoch": 0.7526967062303835, + "grad_norm": 0.12366468459367752, + "learning_rate": 0.0001699119642083995, + "loss": 0.1304, + "step": 10432 + }, + { + "epoch": 0.7527688589054439, + "grad_norm": 0.13612817227840424, + "learning_rate": 0.00016990907778900277, + "loss": 0.1738, + "step": 10433 + }, + { + "epoch": 0.7528410115805043, + "grad_norm": 0.10676132887601852, + "learning_rate": 0.000169906191369606, + "loss": 0.1567, + "step": 10434 + }, + { + "epoch": 0.7529131642555648, + "grad_norm": 0.12812548875808716, + "learning_rate": 0.00016990330495020927, + "loss": 0.1694, + "step": 10435 + }, + { + "epoch": 0.7529853169306252, + "grad_norm": 0.12463195621967316, + "learning_rate": 0.00016990041853081253, + "loss": 0.167, + "step": 10436 + }, + { + "epoch": 0.7530574696056856, + "grad_norm": 0.130820631980896, + "learning_rate": 0.0001698975321114158, + "loss": 0.1442, + "step": 10437 + }, + { + "epoch": 0.7531296222807461, + "grad_norm": 0.21053113043308258, + "learning_rate": 0.00016989464569201905, + "loss": 0.1621, + "step": 10438 + }, + { + "epoch": 0.7532017749558065, + "grad_norm": 0.1375279277563095, + "learning_rate": 0.00016989175927262232, + "loss": 0.1263, + "step": 10439 + }, + { + "epoch": 0.7532739276308669, + "grad_norm": 0.1391715258359909, + "learning_rate": 0.00016988887285322558, + "loss": 0.1442, + "step": 10440 + }, + { + "epoch": 0.7533460803059273, + "grad_norm": 0.09525745362043381, + "learning_rate": 0.00016988598643382884, + "loss": 0.1582, + "step": 10441 + }, + { + "epoch": 0.7534182329809878, + "grad_norm": 0.12608473002910614, + "learning_rate": 0.0001698831000144321, + "loss": 0.1562, + "step": 10442 + }, + { + "epoch": 0.7534903856560482, + "grad_norm": 0.14218218624591827, + "learning_rate": 0.00016988021359503537, + "loss": 0.1569, + "step": 10443 + }, + { + "epoch": 0.7535625383311086, + "grad_norm": 0.1170210987329483, + "learning_rate": 0.00016987732717563863, + "loss": 0.1118, + "step": 10444 + }, + { + "epoch": 0.7536346910061691, + "grad_norm": 0.1179591715335846, + "learning_rate": 0.0001698744407562419, + "loss": 0.1414, + "step": 10445 + }, + { + "epoch": 0.7537068436812295, + "grad_norm": 0.1263728141784668, + "learning_rate": 0.00016987155433684516, + "loss": 0.1607, + "step": 10446 + }, + { + "epoch": 0.7537789963562899, + "grad_norm": 0.12224898487329483, + "learning_rate": 0.00016986866791744842, + "loss": 0.1155, + "step": 10447 + }, + { + "epoch": 0.7538511490313503, + "grad_norm": 0.12241674959659576, + "learning_rate": 0.00016986578149805168, + "loss": 0.1496, + "step": 10448 + }, + { + "epoch": 0.7539233017064108, + "grad_norm": 0.13699893653392792, + "learning_rate": 0.00016986289507865495, + "loss": 0.1763, + "step": 10449 + }, + { + "epoch": 0.7539954543814712, + "grad_norm": 0.12991581857204437, + "learning_rate": 0.0001698600086592582, + "loss": 0.1727, + "step": 10450 + }, + { + "epoch": 0.7540676070565316, + "grad_norm": 0.11489979177713394, + "learning_rate": 0.00016985712223986145, + "loss": 0.0994, + "step": 10451 + }, + { + "epoch": 0.7541397597315921, + "grad_norm": 0.12281400710344315, + "learning_rate": 0.0001698542358204647, + "loss": 0.0968, + "step": 10452 + }, + { + "epoch": 0.7542119124066525, + "grad_norm": 0.1341441124677658, + "learning_rate": 0.000169851349401068, + "loss": 0.1217, + "step": 10453 + }, + { + "epoch": 0.7542840650817129, + "grad_norm": 0.11157387495040894, + "learning_rate": 0.00016984846298167126, + "loss": 0.1332, + "step": 10454 + }, + { + "epoch": 0.7543562177567733, + "grad_norm": 0.13739007711410522, + "learning_rate": 0.00016984557656227452, + "loss": 0.1817, + "step": 10455 + }, + { + "epoch": 0.7544283704318338, + "grad_norm": 0.11979958415031433, + "learning_rate": 0.00016984269014287776, + "loss": 0.1797, + "step": 10456 + }, + { + "epoch": 0.7545005231068942, + "grad_norm": 0.13151156902313232, + "learning_rate": 0.00016983980372348102, + "loss": 0.1484, + "step": 10457 + }, + { + "epoch": 0.7545726757819546, + "grad_norm": 0.10742885619401932, + "learning_rate": 0.00016983691730408429, + "loss": 0.1095, + "step": 10458 + }, + { + "epoch": 0.7546448284570151, + "grad_norm": 0.127263605594635, + "learning_rate": 0.00016983403088468755, + "loss": 0.1483, + "step": 10459 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.13081799447536469, + "learning_rate": 0.00016983114446529084, + "loss": 0.1322, + "step": 10460 + }, + { + "epoch": 0.7547891338071359, + "grad_norm": 0.1259460151195526, + "learning_rate": 0.00016982825804589407, + "loss": 0.1523, + "step": 10461 + }, + { + "epoch": 0.7548612864821963, + "grad_norm": 0.1344456672668457, + "learning_rate": 0.00016982537162649734, + "loss": 0.1354, + "step": 10462 + }, + { + "epoch": 0.7549334391572567, + "grad_norm": 0.0938878208398819, + "learning_rate": 0.0001698224852071006, + "loss": 0.1613, + "step": 10463 + }, + { + "epoch": 0.7550055918323172, + "grad_norm": 0.14028160274028778, + "learning_rate": 0.00016981959878770386, + "loss": 0.1738, + "step": 10464 + }, + { + "epoch": 0.7550777445073776, + "grad_norm": 0.11873999983072281, + "learning_rate": 0.00016981671236830713, + "loss": 0.171, + "step": 10465 + }, + { + "epoch": 0.7551498971824381, + "grad_norm": 0.14979086816310883, + "learning_rate": 0.0001698138259489104, + "loss": 0.1222, + "step": 10466 + }, + { + "epoch": 0.7552220498574984, + "grad_norm": 0.13354110717773438, + "learning_rate": 0.00016981093952951365, + "loss": 0.1766, + "step": 10467 + }, + { + "epoch": 0.7552942025325589, + "grad_norm": 0.1624889224767685, + "learning_rate": 0.00016980805311011692, + "loss": 0.157, + "step": 10468 + }, + { + "epoch": 0.7553663552076193, + "grad_norm": 0.11007406562566757, + "learning_rate": 0.00016980516669072018, + "loss": 0.1467, + "step": 10469 + }, + { + "epoch": 0.7554385078826797, + "grad_norm": 0.1187017410993576, + "learning_rate": 0.00016980228027132344, + "loss": 0.1675, + "step": 10470 + }, + { + "epoch": 0.7555106605577402, + "grad_norm": 0.1325860619544983, + "learning_rate": 0.0001697993938519267, + "loss": 0.1588, + "step": 10471 + }, + { + "epoch": 0.7555828132328006, + "grad_norm": 0.1073029488325119, + "learning_rate": 0.00016979650743252994, + "loss": 0.1438, + "step": 10472 + }, + { + "epoch": 0.7556549659078611, + "grad_norm": 0.1197071373462677, + "learning_rate": 0.0001697936210131332, + "loss": 0.1207, + "step": 10473 + }, + { + "epoch": 0.7557271185829214, + "grad_norm": 0.10033522546291351, + "learning_rate": 0.0001697907345937365, + "loss": 0.1403, + "step": 10474 + }, + { + "epoch": 0.7557992712579819, + "grad_norm": 0.15383054316043854, + "learning_rate": 0.00016978784817433976, + "loss": 0.1566, + "step": 10475 + }, + { + "epoch": 0.7558714239330423, + "grad_norm": 0.1383810043334961, + "learning_rate": 0.00016978496175494302, + "loss": 0.1637, + "step": 10476 + }, + { + "epoch": 0.7559435766081027, + "grad_norm": 0.12929747998714447, + "learning_rate": 0.00016978207533554625, + "loss": 0.1333, + "step": 10477 + }, + { + "epoch": 0.7560157292831632, + "grad_norm": 0.14632758498191833, + "learning_rate": 0.00016977918891614952, + "loss": 0.1209, + "step": 10478 + }, + { + "epoch": 0.7560878819582236, + "grad_norm": 0.12172812968492508, + "learning_rate": 0.00016977630249675278, + "loss": 0.1562, + "step": 10479 + }, + { + "epoch": 0.7561600346332841, + "grad_norm": 0.12071457505226135, + "learning_rate": 0.00016977341607735604, + "loss": 0.1862, + "step": 10480 + }, + { + "epoch": 0.7562321873083444, + "grad_norm": 0.17001500725746155, + "learning_rate": 0.0001697705296579593, + "loss": 0.18, + "step": 10481 + }, + { + "epoch": 0.7563043399834048, + "grad_norm": 0.1405080109834671, + "learning_rate": 0.00016976764323856257, + "loss": 0.1594, + "step": 10482 + }, + { + "epoch": 0.7563764926584653, + "grad_norm": 0.13267336785793304, + "learning_rate": 0.00016976475681916583, + "loss": 0.1662, + "step": 10483 + }, + { + "epoch": 0.7564486453335257, + "grad_norm": 0.11440946161746979, + "learning_rate": 0.0001697618703997691, + "loss": 0.1922, + "step": 10484 + }, + { + "epoch": 0.7565207980085862, + "grad_norm": 0.11692952364683151, + "learning_rate": 0.00016975898398037236, + "loss": 0.1521, + "step": 10485 + }, + { + "epoch": 0.7565929506836466, + "grad_norm": 0.11950668692588806, + "learning_rate": 0.00016975609756097562, + "loss": 0.1226, + "step": 10486 + }, + { + "epoch": 0.7566651033587071, + "grad_norm": 0.18788966536521912, + "learning_rate": 0.00016975321114157888, + "loss": 0.1664, + "step": 10487 + }, + { + "epoch": 0.7567372560337674, + "grad_norm": 0.14929738640785217, + "learning_rate": 0.00016975032472218212, + "loss": 0.2299, + "step": 10488 + }, + { + "epoch": 0.7568094087088278, + "grad_norm": 0.12198912352323532, + "learning_rate": 0.0001697474383027854, + "loss": 0.1459, + "step": 10489 + }, + { + "epoch": 0.7568815613838883, + "grad_norm": 0.12246627360582352, + "learning_rate": 0.00016974455188338867, + "loss": 0.1054, + "step": 10490 + }, + { + "epoch": 0.7569537140589487, + "grad_norm": 0.2314099669456482, + "learning_rate": 0.00016974166546399194, + "loss": 0.1356, + "step": 10491 + }, + { + "epoch": 0.7570258667340092, + "grad_norm": 0.14936254918575287, + "learning_rate": 0.0001697387790445952, + "loss": 0.1425, + "step": 10492 + }, + { + "epoch": 0.7570980194090696, + "grad_norm": 0.25354865193367004, + "learning_rate": 0.00016973589262519843, + "loss": 0.1332, + "step": 10493 + }, + { + "epoch": 0.7571701720841301, + "grad_norm": 0.1566449999809265, + "learning_rate": 0.0001697330062058017, + "loss": 0.1457, + "step": 10494 + }, + { + "epoch": 0.7572423247591904, + "grad_norm": 0.11155090481042862, + "learning_rate": 0.00016973011978640496, + "loss": 0.1403, + "step": 10495 + }, + { + "epoch": 0.7573144774342508, + "grad_norm": 0.16458825767040253, + "learning_rate": 0.00016972723336700825, + "loss": 0.1304, + "step": 10496 + }, + { + "epoch": 0.7573866301093113, + "grad_norm": 0.157064750790596, + "learning_rate": 0.0001697243469476115, + "loss": 0.164, + "step": 10497 + }, + { + "epoch": 0.7574587827843717, + "grad_norm": 0.11798777431249619, + "learning_rate": 0.00016972146052821475, + "loss": 0.1347, + "step": 10498 + }, + { + "epoch": 0.7575309354594322, + "grad_norm": 0.12378858029842377, + "learning_rate": 0.000169718574108818, + "loss": 0.1091, + "step": 10499 + }, + { + "epoch": 0.7576030881344926, + "grad_norm": 0.10340747237205505, + "learning_rate": 0.00016971568768942127, + "loss": 0.1362, + "step": 10500 + }, + { + "epoch": 0.7576752408095531, + "grad_norm": 0.12779733538627625, + "learning_rate": 0.00016971280127002454, + "loss": 0.1408, + "step": 10501 + }, + { + "epoch": 0.7577473934846134, + "grad_norm": 0.11349274218082428, + "learning_rate": 0.0001697099148506278, + "loss": 0.1894, + "step": 10502 + }, + { + "epoch": 0.7578195461596738, + "grad_norm": 0.12187425792217255, + "learning_rate": 0.00016970702843123106, + "loss": 0.1075, + "step": 10503 + }, + { + "epoch": 0.7578916988347343, + "grad_norm": 0.11838438361883163, + "learning_rate": 0.00016970414201183433, + "loss": 0.1302, + "step": 10504 + }, + { + "epoch": 0.7579638515097947, + "grad_norm": 0.16571784019470215, + "learning_rate": 0.0001697012555924376, + "loss": 0.1419, + "step": 10505 + }, + { + "epoch": 0.7580360041848552, + "grad_norm": 0.13391056656837463, + "learning_rate": 0.00016969836917304085, + "loss": 0.1051, + "step": 10506 + }, + { + "epoch": 0.7581081568599156, + "grad_norm": 0.1178104355931282, + "learning_rate": 0.00016969548275364411, + "loss": 0.123, + "step": 10507 + }, + { + "epoch": 0.7581803095349761, + "grad_norm": 0.120695561170578, + "learning_rate": 0.00016969259633424738, + "loss": 0.138, + "step": 10508 + }, + { + "epoch": 0.7582524622100364, + "grad_norm": 0.1247735545039177, + "learning_rate": 0.00016968970991485061, + "loss": 0.1937, + "step": 10509 + }, + { + "epoch": 0.7583246148850968, + "grad_norm": 0.14644338190555573, + "learning_rate": 0.0001696868234954539, + "loss": 0.1871, + "step": 10510 + }, + { + "epoch": 0.7583967675601573, + "grad_norm": 0.13256101310253143, + "learning_rate": 0.00016968393707605717, + "loss": 0.1359, + "step": 10511 + }, + { + "epoch": 0.7584689202352177, + "grad_norm": 0.16652289032936096, + "learning_rate": 0.00016968105065666043, + "loss": 0.2052, + "step": 10512 + }, + { + "epoch": 0.7585410729102782, + "grad_norm": 0.12736444175243378, + "learning_rate": 0.0001696781642372637, + "loss": 0.2008, + "step": 10513 + }, + { + "epoch": 0.7586132255853386, + "grad_norm": 0.15626487135887146, + "learning_rate": 0.00016967527781786693, + "loss": 0.1391, + "step": 10514 + }, + { + "epoch": 0.7586853782603991, + "grad_norm": 0.11295124888420105, + "learning_rate": 0.0001696723913984702, + "loss": 0.1772, + "step": 10515 + }, + { + "epoch": 0.7587575309354594, + "grad_norm": 0.12680451571941376, + "learning_rate": 0.00016966950497907345, + "loss": 0.1579, + "step": 10516 + }, + { + "epoch": 0.7588296836105198, + "grad_norm": 0.13611815869808197, + "learning_rate": 0.00016966661855967674, + "loss": 0.1583, + "step": 10517 + }, + { + "epoch": 0.7589018362855803, + "grad_norm": 0.13193492591381073, + "learning_rate": 0.00016966373214028, + "loss": 0.126, + "step": 10518 + }, + { + "epoch": 0.7589739889606407, + "grad_norm": 0.12841257452964783, + "learning_rate": 0.00016966084572088324, + "loss": 0.1759, + "step": 10519 + }, + { + "epoch": 0.7590461416357012, + "grad_norm": 0.1348455548286438, + "learning_rate": 0.0001696579593014865, + "loss": 0.1695, + "step": 10520 + }, + { + "epoch": 0.7591182943107616, + "grad_norm": 0.18122850358486176, + "learning_rate": 0.00016965507288208977, + "loss": 0.1138, + "step": 10521 + }, + { + "epoch": 0.759190446985822, + "grad_norm": 0.13468194007873535, + "learning_rate": 0.00016965218646269303, + "loss": 0.1631, + "step": 10522 + }, + { + "epoch": 0.7592625996608824, + "grad_norm": 0.10488202422857285, + "learning_rate": 0.0001696493000432963, + "loss": 0.1398, + "step": 10523 + }, + { + "epoch": 0.7593347523359428, + "grad_norm": 0.12960539758205414, + "learning_rate": 0.00016964641362389956, + "loss": 0.1449, + "step": 10524 + }, + { + "epoch": 0.7594069050110033, + "grad_norm": 0.13000430166721344, + "learning_rate": 0.00016964352720450282, + "loss": 0.226, + "step": 10525 + }, + { + "epoch": 0.7594790576860637, + "grad_norm": 0.13472294807434082, + "learning_rate": 0.00016964064078510608, + "loss": 0.153, + "step": 10526 + }, + { + "epoch": 0.7595512103611242, + "grad_norm": 0.12741820514202118, + "learning_rate": 0.00016963775436570935, + "loss": 0.1445, + "step": 10527 + }, + { + "epoch": 0.7596233630361846, + "grad_norm": 0.1185845211148262, + "learning_rate": 0.0001696348679463126, + "loss": 0.1334, + "step": 10528 + }, + { + "epoch": 0.7596955157112449, + "grad_norm": 0.13726027309894562, + "learning_rate": 0.00016963198152691587, + "loss": 0.1478, + "step": 10529 + }, + { + "epoch": 0.7597676683863054, + "grad_norm": 0.13547062873840332, + "learning_rate": 0.0001696290951075191, + "loss": 0.1653, + "step": 10530 + }, + { + "epoch": 0.7598398210613658, + "grad_norm": 0.12183656543493271, + "learning_rate": 0.0001696262086881224, + "loss": 0.1175, + "step": 10531 + }, + { + "epoch": 0.7599119737364263, + "grad_norm": 0.11496282368898392, + "learning_rate": 0.00016962332226872566, + "loss": 0.1361, + "step": 10532 + }, + { + "epoch": 0.7599841264114867, + "grad_norm": 0.10563007742166519, + "learning_rate": 0.00016962043584932892, + "loss": 0.0923, + "step": 10533 + }, + { + "epoch": 0.7600562790865472, + "grad_norm": 0.12999393045902252, + "learning_rate": 0.0001696175494299322, + "loss": 0.193, + "step": 10534 + }, + { + "epoch": 0.7601284317616076, + "grad_norm": 0.12223460525274277, + "learning_rate": 0.00016961466301053542, + "loss": 0.138, + "step": 10535 + }, + { + "epoch": 0.7602005844366679, + "grad_norm": 0.10693982243537903, + "learning_rate": 0.00016961177659113869, + "loss": 0.1323, + "step": 10536 + }, + { + "epoch": 0.7602727371117284, + "grad_norm": 0.15222221612930298, + "learning_rate": 0.00016960889017174195, + "loss": 0.1668, + "step": 10537 + }, + { + "epoch": 0.7603448897867888, + "grad_norm": 0.12483417242765427, + "learning_rate": 0.00016960600375234524, + "loss": 0.1504, + "step": 10538 + }, + { + "epoch": 0.7604170424618493, + "grad_norm": 0.12700700759887695, + "learning_rate": 0.0001696031173329485, + "loss": 0.1705, + "step": 10539 + }, + { + "epoch": 0.7604891951369097, + "grad_norm": 0.1095048263669014, + "learning_rate": 0.00016960023091355174, + "loss": 0.134, + "step": 10540 + }, + { + "epoch": 0.7605613478119702, + "grad_norm": 0.11708509922027588, + "learning_rate": 0.000169597344494155, + "loss": 0.1653, + "step": 10541 + }, + { + "epoch": 0.7606335004870306, + "grad_norm": 0.11617586761713028, + "learning_rate": 0.00016959445807475826, + "loss": 0.1648, + "step": 10542 + }, + { + "epoch": 0.7607056531620909, + "grad_norm": 0.11328215897083282, + "learning_rate": 0.00016959157165536153, + "loss": 0.1126, + "step": 10543 + }, + { + "epoch": 0.7607778058371514, + "grad_norm": 0.13360731303691864, + "learning_rate": 0.0001695886852359648, + "loss": 0.1805, + "step": 10544 + }, + { + "epoch": 0.7608499585122118, + "grad_norm": 0.11923332512378693, + "learning_rate": 0.00016958579881656805, + "loss": 0.195, + "step": 10545 + }, + { + "epoch": 0.7609221111872723, + "grad_norm": 0.11156607419252396, + "learning_rate": 0.00016958291239717131, + "loss": 0.1654, + "step": 10546 + }, + { + "epoch": 0.7609942638623327, + "grad_norm": 0.11855777353048325, + "learning_rate": 0.00016958002597777458, + "loss": 0.163, + "step": 10547 + }, + { + "epoch": 0.7610664165373932, + "grad_norm": 0.12136217206716537, + "learning_rate": 0.00016957713955837784, + "loss": 0.1644, + "step": 10548 + }, + { + "epoch": 0.7611385692124536, + "grad_norm": 0.11538812518119812, + "learning_rate": 0.0001695742531389811, + "loss": 0.1351, + "step": 10549 + }, + { + "epoch": 0.7612107218875139, + "grad_norm": 0.1355658620595932, + "learning_rate": 0.00016957136671958437, + "loss": 0.1553, + "step": 10550 + }, + { + "epoch": 0.7612828745625744, + "grad_norm": 0.12112399190664291, + "learning_rate": 0.0001695684803001876, + "loss": 0.1589, + "step": 10551 + }, + { + "epoch": 0.7613550272376348, + "grad_norm": 0.12598440051078796, + "learning_rate": 0.0001695655938807909, + "loss": 0.1808, + "step": 10552 + }, + { + "epoch": 0.7614271799126953, + "grad_norm": 0.11427775770425797, + "learning_rate": 0.00016956270746139416, + "loss": 0.1173, + "step": 10553 + }, + { + "epoch": 0.7614993325877557, + "grad_norm": 0.10798707604408264, + "learning_rate": 0.00016955982104199742, + "loss": 0.0849, + "step": 10554 + }, + { + "epoch": 0.7615714852628162, + "grad_norm": 0.1420150250196457, + "learning_rate": 0.00016955693462260068, + "loss": 0.1353, + "step": 10555 + }, + { + "epoch": 0.7616436379378766, + "grad_norm": 0.14471231400966644, + "learning_rate": 0.00016955404820320392, + "loss": 0.1246, + "step": 10556 + }, + { + "epoch": 0.7617157906129369, + "grad_norm": 0.1214648112654686, + "learning_rate": 0.00016955116178380718, + "loss": 0.1547, + "step": 10557 + }, + { + "epoch": 0.7617879432879974, + "grad_norm": 0.10652025043964386, + "learning_rate": 0.00016954827536441044, + "loss": 0.1363, + "step": 10558 + }, + { + "epoch": 0.7618600959630578, + "grad_norm": 0.13901583850383759, + "learning_rate": 0.00016954538894501373, + "loss": 0.1473, + "step": 10559 + }, + { + "epoch": 0.7619322486381183, + "grad_norm": 0.1238698959350586, + "learning_rate": 0.000169542502525617, + "loss": 0.1511, + "step": 10560 + }, + { + "epoch": 0.7620044013131787, + "grad_norm": 0.12141493707895279, + "learning_rate": 0.00016953961610622023, + "loss": 0.117, + "step": 10561 + }, + { + "epoch": 0.7620765539882391, + "grad_norm": 0.16087163984775543, + "learning_rate": 0.0001695367296868235, + "loss": 0.1778, + "step": 10562 + }, + { + "epoch": 0.7621487066632996, + "grad_norm": 0.11237693578004837, + "learning_rate": 0.00016953384326742676, + "loss": 0.1186, + "step": 10563 + }, + { + "epoch": 0.7622208593383599, + "grad_norm": 0.12508907914161682, + "learning_rate": 0.00016953095684803002, + "loss": 0.1679, + "step": 10564 + }, + { + "epoch": 0.7622930120134204, + "grad_norm": 0.11945411562919617, + "learning_rate": 0.00016952807042863328, + "loss": 0.1131, + "step": 10565 + }, + { + "epoch": 0.7623651646884808, + "grad_norm": 0.10769058018922806, + "learning_rate": 0.00016952518400923655, + "loss": 0.1321, + "step": 10566 + }, + { + "epoch": 0.7624373173635413, + "grad_norm": 0.12163474410772324, + "learning_rate": 0.0001695222975898398, + "loss": 0.1591, + "step": 10567 + }, + { + "epoch": 0.7625094700386017, + "grad_norm": 0.11006736755371094, + "learning_rate": 0.00016951941117044307, + "loss": 0.1283, + "step": 10568 + }, + { + "epoch": 0.7625816227136621, + "grad_norm": 0.10961829870939255, + "learning_rate": 0.00016951652475104633, + "loss": 0.2023, + "step": 10569 + }, + { + "epoch": 0.7626537753887226, + "grad_norm": 0.12149278819561005, + "learning_rate": 0.0001695136383316496, + "loss": 0.1072, + "step": 10570 + }, + { + "epoch": 0.7627259280637829, + "grad_norm": 0.14779675006866455, + "learning_rate": 0.00016951075191225286, + "loss": 0.1284, + "step": 10571 + }, + { + "epoch": 0.7627980807388434, + "grad_norm": 0.12754885852336884, + "learning_rate": 0.00016950786549285612, + "loss": 0.1409, + "step": 10572 + }, + { + "epoch": 0.7628702334139038, + "grad_norm": 0.12782339751720428, + "learning_rate": 0.00016950497907345939, + "loss": 0.1769, + "step": 10573 + }, + { + "epoch": 0.7629423860889643, + "grad_norm": 0.1684998869895935, + "learning_rate": 0.00016950209265406265, + "loss": 0.1587, + "step": 10574 + }, + { + "epoch": 0.7630145387640247, + "grad_norm": 0.14323611557483673, + "learning_rate": 0.0001694992062346659, + "loss": 0.128, + "step": 10575 + }, + { + "epoch": 0.7630866914390851, + "grad_norm": 0.13388456404209137, + "learning_rate": 0.00016949631981526918, + "loss": 0.114, + "step": 10576 + }, + { + "epoch": 0.7631588441141456, + "grad_norm": 0.15299002826213837, + "learning_rate": 0.00016949343339587244, + "loss": 0.1677, + "step": 10577 + }, + { + "epoch": 0.7632309967892059, + "grad_norm": 0.1399703323841095, + "learning_rate": 0.00016949054697647567, + "loss": 0.1231, + "step": 10578 + }, + { + "epoch": 0.7633031494642664, + "grad_norm": 0.12333228439092636, + "learning_rate": 0.00016948766055707894, + "loss": 0.1611, + "step": 10579 + }, + { + "epoch": 0.7633753021393268, + "grad_norm": 0.11283402144908905, + "learning_rate": 0.00016948477413768223, + "loss": 0.1616, + "step": 10580 + }, + { + "epoch": 0.7634474548143872, + "grad_norm": 0.12879428267478943, + "learning_rate": 0.0001694818877182855, + "loss": 0.1887, + "step": 10581 + }, + { + "epoch": 0.7635196074894477, + "grad_norm": 0.12961862981319427, + "learning_rate": 0.00016947900129888875, + "loss": 0.1694, + "step": 10582 + }, + { + "epoch": 0.7635917601645081, + "grad_norm": 0.11353792995214462, + "learning_rate": 0.000169476114879492, + "loss": 0.1381, + "step": 10583 + }, + { + "epoch": 0.7636639128395686, + "grad_norm": 0.13557705283164978, + "learning_rate": 0.00016947322846009525, + "loss": 0.1359, + "step": 10584 + }, + { + "epoch": 0.7637360655146289, + "grad_norm": 0.13384157419204712, + "learning_rate": 0.00016947034204069851, + "loss": 0.1468, + "step": 10585 + }, + { + "epoch": 0.7638082181896894, + "grad_norm": 0.11125749349594116, + "learning_rate": 0.00016946745562130178, + "loss": 0.1409, + "step": 10586 + }, + { + "epoch": 0.7638803708647498, + "grad_norm": 0.1282130628824234, + "learning_rate": 0.00016946456920190507, + "loss": 0.1239, + "step": 10587 + }, + { + "epoch": 0.7639525235398102, + "grad_norm": 0.14103315770626068, + "learning_rate": 0.0001694616827825083, + "loss": 0.1109, + "step": 10588 + }, + { + "epoch": 0.7640246762148707, + "grad_norm": 0.129958376288414, + "learning_rate": 0.00016945879636311157, + "loss": 0.1855, + "step": 10589 + }, + { + "epoch": 0.7640968288899311, + "grad_norm": 0.15713174641132355, + "learning_rate": 0.00016945590994371483, + "loss": 0.1798, + "step": 10590 + }, + { + "epoch": 0.7641689815649915, + "grad_norm": 0.1206924319267273, + "learning_rate": 0.0001694530235243181, + "loss": 0.1376, + "step": 10591 + }, + { + "epoch": 0.7642411342400519, + "grad_norm": 0.11607380211353302, + "learning_rate": 0.00016945013710492135, + "loss": 0.0943, + "step": 10592 + }, + { + "epoch": 0.7643132869151124, + "grad_norm": 0.11553899198770523, + "learning_rate": 0.00016944725068552462, + "loss": 0.1038, + "step": 10593 + }, + { + "epoch": 0.7643854395901728, + "grad_norm": 0.12019038945436478, + "learning_rate": 0.00016944436426612788, + "loss": 0.153, + "step": 10594 + }, + { + "epoch": 0.7644575922652332, + "grad_norm": 0.14442084729671478, + "learning_rate": 0.00016944147784673114, + "loss": 0.1657, + "step": 10595 + }, + { + "epoch": 0.7645297449402937, + "grad_norm": 0.1325032263994217, + "learning_rate": 0.0001694385914273344, + "loss": 0.1738, + "step": 10596 + }, + { + "epoch": 0.7646018976153541, + "grad_norm": 0.11100459843873978, + "learning_rate": 0.00016943570500793767, + "loss": 0.1271, + "step": 10597 + }, + { + "epoch": 0.7646740502904145, + "grad_norm": 0.1309879720211029, + "learning_rate": 0.00016943281858854093, + "loss": 0.1298, + "step": 10598 + }, + { + "epoch": 0.7647462029654749, + "grad_norm": 0.10606569796800613, + "learning_rate": 0.00016942993216914417, + "loss": 0.0989, + "step": 10599 + }, + { + "epoch": 0.7648183556405354, + "grad_norm": 0.11444351077079773, + "learning_rate": 0.00016942704574974743, + "loss": 0.1487, + "step": 10600 + }, + { + "epoch": 0.7648905083155958, + "grad_norm": 0.1355319619178772, + "learning_rate": 0.00016942415933035072, + "loss": 0.1473, + "step": 10601 + }, + { + "epoch": 0.7649626609906562, + "grad_norm": 0.10285015404224396, + "learning_rate": 0.00016942127291095398, + "loss": 0.085, + "step": 10602 + }, + { + "epoch": 0.7650348136657167, + "grad_norm": 0.1468527615070343, + "learning_rate": 0.00016941838649155725, + "loss": 0.1947, + "step": 10603 + }, + { + "epoch": 0.7651069663407771, + "grad_norm": 0.11259424686431885, + "learning_rate": 0.00016941550007216048, + "loss": 0.1345, + "step": 10604 + }, + { + "epoch": 0.7651791190158375, + "grad_norm": 0.10993609577417374, + "learning_rate": 0.00016941261365276375, + "loss": 0.1777, + "step": 10605 + }, + { + "epoch": 0.7652512716908979, + "grad_norm": 0.11793410778045654, + "learning_rate": 0.000169409727233367, + "loss": 0.1205, + "step": 10606 + }, + { + "epoch": 0.7653234243659583, + "grad_norm": 0.14657844603061676, + "learning_rate": 0.00016940684081397027, + "loss": 0.1635, + "step": 10607 + }, + { + "epoch": 0.7653955770410188, + "grad_norm": 0.1182616651058197, + "learning_rate": 0.00016940395439457356, + "loss": 0.1611, + "step": 10608 + }, + { + "epoch": 0.7654677297160792, + "grad_norm": 0.14213289320468903, + "learning_rate": 0.0001694010679751768, + "loss": 0.1835, + "step": 10609 + }, + { + "epoch": 0.7655398823911397, + "grad_norm": 0.12534356117248535, + "learning_rate": 0.00016939818155578006, + "loss": 0.1351, + "step": 10610 + }, + { + "epoch": 0.7656120350662001, + "grad_norm": 0.11216719448566437, + "learning_rate": 0.00016939529513638332, + "loss": 0.1542, + "step": 10611 + }, + { + "epoch": 0.7656841877412605, + "grad_norm": 0.12680353224277496, + "learning_rate": 0.00016939240871698659, + "loss": 0.1498, + "step": 10612 + }, + { + "epoch": 0.7657563404163209, + "grad_norm": 0.1329881250858307, + "learning_rate": 0.00016938952229758985, + "loss": 0.1257, + "step": 10613 + }, + { + "epoch": 0.7658284930913813, + "grad_norm": 0.12916041910648346, + "learning_rate": 0.0001693866358781931, + "loss": 0.1436, + "step": 10614 + }, + { + "epoch": 0.7659006457664418, + "grad_norm": 0.12449880689382553, + "learning_rate": 0.00016938374945879637, + "loss": 0.1535, + "step": 10615 + }, + { + "epoch": 0.7659727984415022, + "grad_norm": 0.14934857189655304, + "learning_rate": 0.00016938086303939964, + "loss": 0.175, + "step": 10616 + }, + { + "epoch": 0.7660449511165627, + "grad_norm": 0.10569044202566147, + "learning_rate": 0.0001693779766200029, + "loss": 0.1539, + "step": 10617 + }, + { + "epoch": 0.7661171037916231, + "grad_norm": 0.11117196083068848, + "learning_rate": 0.00016937509020060616, + "loss": 0.1819, + "step": 10618 + }, + { + "epoch": 0.7661892564666835, + "grad_norm": 0.15367351472377777, + "learning_rate": 0.00016937220378120943, + "loss": 0.1558, + "step": 10619 + }, + { + "epoch": 0.7662614091417439, + "grad_norm": 0.11300353705883026, + "learning_rate": 0.00016936931736181266, + "loss": 0.1581, + "step": 10620 + }, + { + "epoch": 0.7663335618168043, + "grad_norm": 0.11891332268714905, + "learning_rate": 0.00016936643094241593, + "loss": 0.1779, + "step": 10621 + }, + { + "epoch": 0.7664057144918648, + "grad_norm": 0.11424824595451355, + "learning_rate": 0.00016936354452301922, + "loss": 0.1732, + "step": 10622 + }, + { + "epoch": 0.7664778671669252, + "grad_norm": 0.10762421786785126, + "learning_rate": 0.00016936065810362248, + "loss": 0.1504, + "step": 10623 + }, + { + "epoch": 0.7665500198419857, + "grad_norm": 0.10763785988092422, + "learning_rate": 0.00016935777168422574, + "loss": 0.1497, + "step": 10624 + }, + { + "epoch": 0.7666221725170461, + "grad_norm": 0.13233208656311035, + "learning_rate": 0.00016935488526482898, + "loss": 0.1385, + "step": 10625 + }, + { + "epoch": 0.7666943251921065, + "grad_norm": 0.143177330493927, + "learning_rate": 0.00016935199884543224, + "loss": 0.2356, + "step": 10626 + }, + { + "epoch": 0.7667664778671669, + "grad_norm": 0.10598569363355637, + "learning_rate": 0.0001693491124260355, + "loss": 0.1418, + "step": 10627 + }, + { + "epoch": 0.7668386305422273, + "grad_norm": 0.13420173525810242, + "learning_rate": 0.00016934622600663877, + "loss": 0.1654, + "step": 10628 + }, + { + "epoch": 0.7669107832172878, + "grad_norm": 0.13415329158306122, + "learning_rate": 0.00016934333958724206, + "loss": 0.163, + "step": 10629 + }, + { + "epoch": 0.7669829358923482, + "grad_norm": 0.12192816287279129, + "learning_rate": 0.0001693404531678453, + "loss": 0.1422, + "step": 10630 + }, + { + "epoch": 0.7670550885674087, + "grad_norm": 0.10028716921806335, + "learning_rate": 0.00016933756674844855, + "loss": 0.1549, + "step": 10631 + }, + { + "epoch": 0.7671272412424691, + "grad_norm": 0.14095130562782288, + "learning_rate": 0.00016933468032905182, + "loss": 0.1345, + "step": 10632 + }, + { + "epoch": 0.7671993939175294, + "grad_norm": 0.1579335480928421, + "learning_rate": 0.00016933179390965508, + "loss": 0.1835, + "step": 10633 + }, + { + "epoch": 0.7672715465925899, + "grad_norm": 0.11538061499595642, + "learning_rate": 0.00016932890749025834, + "loss": 0.1737, + "step": 10634 + }, + { + "epoch": 0.7673436992676503, + "grad_norm": 0.12305877357721329, + "learning_rate": 0.0001693260210708616, + "loss": 0.1435, + "step": 10635 + }, + { + "epoch": 0.7674158519427108, + "grad_norm": 0.11222461611032486, + "learning_rate": 0.00016932313465146487, + "loss": 0.1926, + "step": 10636 + }, + { + "epoch": 0.7674880046177712, + "grad_norm": 0.141736701130867, + "learning_rate": 0.00016932024823206813, + "loss": 0.1649, + "step": 10637 + }, + { + "epoch": 0.7675601572928317, + "grad_norm": 0.11970090121030807, + "learning_rate": 0.0001693173618126714, + "loss": 0.1496, + "step": 10638 + }, + { + "epoch": 0.7676323099678921, + "grad_norm": 0.12373533844947815, + "learning_rate": 0.00016931447539327466, + "loss": 0.1525, + "step": 10639 + }, + { + "epoch": 0.7677044626429524, + "grad_norm": 0.10975643992424011, + "learning_rate": 0.00016931158897387792, + "loss": 0.1319, + "step": 10640 + }, + { + "epoch": 0.7677766153180129, + "grad_norm": 0.1512438952922821, + "learning_rate": 0.00016930870255448116, + "loss": 0.1895, + "step": 10641 + }, + { + "epoch": 0.7678487679930733, + "grad_norm": 0.15371336042881012, + "learning_rate": 0.00016930581613508442, + "loss": 0.1561, + "step": 10642 + }, + { + "epoch": 0.7679209206681338, + "grad_norm": 0.16743223369121552, + "learning_rate": 0.0001693029297156877, + "loss": 0.1434, + "step": 10643 + }, + { + "epoch": 0.7679930733431942, + "grad_norm": 0.11173707991838455, + "learning_rate": 0.00016930004329629097, + "loss": 0.2314, + "step": 10644 + }, + { + "epoch": 0.7680652260182547, + "grad_norm": 0.13833162188529968, + "learning_rate": 0.00016929715687689424, + "loss": 0.1643, + "step": 10645 + }, + { + "epoch": 0.7681373786933151, + "grad_norm": 0.14580032229423523, + "learning_rate": 0.00016929427045749747, + "loss": 0.1791, + "step": 10646 + }, + { + "epoch": 0.7682095313683754, + "grad_norm": 0.12167628109455109, + "learning_rate": 0.00016929138403810073, + "loss": 0.1377, + "step": 10647 + }, + { + "epoch": 0.7682816840434359, + "grad_norm": 0.10616665333509445, + "learning_rate": 0.000169288497618704, + "loss": 0.1351, + "step": 10648 + }, + { + "epoch": 0.7683538367184963, + "grad_norm": 0.1406809240579605, + "learning_rate": 0.00016928561119930726, + "loss": 0.1399, + "step": 10649 + }, + { + "epoch": 0.7684259893935568, + "grad_norm": 0.12610119581222534, + "learning_rate": 0.00016928272477991055, + "loss": 0.1451, + "step": 10650 + }, + { + "epoch": 0.7684981420686172, + "grad_norm": 0.12259224057197571, + "learning_rate": 0.00016927983836051379, + "loss": 0.1398, + "step": 10651 + }, + { + "epoch": 0.7685702947436777, + "grad_norm": 0.17353153228759766, + "learning_rate": 0.00016927695194111705, + "loss": 0.1219, + "step": 10652 + }, + { + "epoch": 0.768642447418738, + "grad_norm": 0.12674789130687714, + "learning_rate": 0.0001692740655217203, + "loss": 0.1321, + "step": 10653 + }, + { + "epoch": 0.7687146000937984, + "grad_norm": 0.14105506241321564, + "learning_rate": 0.00016927117910232357, + "loss": 0.1645, + "step": 10654 + }, + { + "epoch": 0.7687867527688589, + "grad_norm": 0.1468188464641571, + "learning_rate": 0.00016926829268292684, + "loss": 0.1863, + "step": 10655 + }, + { + "epoch": 0.7688589054439193, + "grad_norm": 0.10780065506696701, + "learning_rate": 0.0001692654062635301, + "loss": 0.1524, + "step": 10656 + }, + { + "epoch": 0.7689310581189798, + "grad_norm": 0.12532243132591248, + "learning_rate": 0.00016926251984413336, + "loss": 0.131, + "step": 10657 + }, + { + "epoch": 0.7690032107940402, + "grad_norm": 0.14364686608314514, + "learning_rate": 0.00016925963342473663, + "loss": 0.1368, + "step": 10658 + }, + { + "epoch": 0.7690753634691007, + "grad_norm": 0.11589592695236206, + "learning_rate": 0.0001692567470053399, + "loss": 0.1454, + "step": 10659 + }, + { + "epoch": 0.769147516144161, + "grad_norm": 0.12993989884853363, + "learning_rate": 0.00016925386058594315, + "loss": 0.1301, + "step": 10660 + }, + { + "epoch": 0.7692196688192214, + "grad_norm": 0.10442299395799637, + "learning_rate": 0.00016925097416654642, + "loss": 0.1113, + "step": 10661 + }, + { + "epoch": 0.7692918214942819, + "grad_norm": 0.11771980673074722, + "learning_rate": 0.00016924808774714965, + "loss": 0.1237, + "step": 10662 + }, + { + "epoch": 0.7693639741693423, + "grad_norm": 0.10796065628528595, + "learning_rate": 0.00016924520132775291, + "loss": 0.1148, + "step": 10663 + }, + { + "epoch": 0.7694361268444028, + "grad_norm": 0.13033612072467804, + "learning_rate": 0.0001692423149083562, + "loss": 0.13, + "step": 10664 + }, + { + "epoch": 0.7695082795194632, + "grad_norm": 0.12694084644317627, + "learning_rate": 0.00016923942848895947, + "loss": 0.1824, + "step": 10665 + }, + { + "epoch": 0.7695804321945237, + "grad_norm": 0.11404303461313248, + "learning_rate": 0.00016923654206956273, + "loss": 0.1272, + "step": 10666 + }, + { + "epoch": 0.769652584869584, + "grad_norm": 0.11634194105863571, + "learning_rate": 0.00016923365565016597, + "loss": 0.143, + "step": 10667 + }, + { + "epoch": 0.7697247375446444, + "grad_norm": 0.15671080350875854, + "learning_rate": 0.00016923076923076923, + "loss": 0.2003, + "step": 10668 + }, + { + "epoch": 0.7697968902197049, + "grad_norm": 0.13332685828208923, + "learning_rate": 0.0001692278828113725, + "loss": 0.1697, + "step": 10669 + }, + { + "epoch": 0.7698690428947653, + "grad_norm": 0.13197067379951477, + "learning_rate": 0.00016922499639197575, + "loss": 0.145, + "step": 10670 + }, + { + "epoch": 0.7699411955698258, + "grad_norm": 0.12754791975021362, + "learning_rate": 0.00016922210997257904, + "loss": 0.1259, + "step": 10671 + }, + { + "epoch": 0.7700133482448862, + "grad_norm": 0.15722058713436127, + "learning_rate": 0.00016921922355318228, + "loss": 0.1884, + "step": 10672 + }, + { + "epoch": 0.7700855009199467, + "grad_norm": 0.13386334478855133, + "learning_rate": 0.00016921633713378554, + "loss": 0.1332, + "step": 10673 + }, + { + "epoch": 0.770157653595007, + "grad_norm": 0.08790024369955063, + "learning_rate": 0.0001692134507143888, + "loss": 0.1512, + "step": 10674 + }, + { + "epoch": 0.7702298062700674, + "grad_norm": 0.11426102370023727, + "learning_rate": 0.00016921056429499207, + "loss": 0.1388, + "step": 10675 + }, + { + "epoch": 0.7703019589451279, + "grad_norm": 0.12486805021762848, + "learning_rate": 0.00016920767787559533, + "loss": 0.1354, + "step": 10676 + }, + { + "epoch": 0.7703741116201883, + "grad_norm": 0.1404605358839035, + "learning_rate": 0.0001692047914561986, + "loss": 0.1629, + "step": 10677 + }, + { + "epoch": 0.7704462642952488, + "grad_norm": 0.11838164925575256, + "learning_rate": 0.00016920190503680186, + "loss": 0.1379, + "step": 10678 + }, + { + "epoch": 0.7705184169703092, + "grad_norm": 0.11654802411794662, + "learning_rate": 0.00016919901861740512, + "loss": 0.1385, + "step": 10679 + }, + { + "epoch": 0.7705905696453696, + "grad_norm": 0.12266696989536285, + "learning_rate": 0.00016919613219800838, + "loss": 0.2171, + "step": 10680 + }, + { + "epoch": 0.77066272232043, + "grad_norm": 0.15840482711791992, + "learning_rate": 0.00016919324577861165, + "loss": 0.1579, + "step": 10681 + }, + { + "epoch": 0.7707348749954904, + "grad_norm": 0.12705667316913605, + "learning_rate": 0.0001691903593592149, + "loss": 0.1098, + "step": 10682 + }, + { + "epoch": 0.7708070276705509, + "grad_norm": 0.09946004301309586, + "learning_rate": 0.00016918747293981817, + "loss": 0.1467, + "step": 10683 + }, + { + "epoch": 0.7708791803456113, + "grad_norm": 0.12914541363716125, + "learning_rate": 0.0001691845865204214, + "loss": 0.1541, + "step": 10684 + }, + { + "epoch": 0.7709513330206718, + "grad_norm": 0.11290688812732697, + "learning_rate": 0.00016918170010102467, + "loss": 0.151, + "step": 10685 + }, + { + "epoch": 0.7710234856957322, + "grad_norm": 0.32748568058013916, + "learning_rate": 0.00016917881368162796, + "loss": 0.1804, + "step": 10686 + }, + { + "epoch": 0.7710956383707926, + "grad_norm": 0.14599604904651642, + "learning_rate": 0.00016917592726223122, + "loss": 0.1855, + "step": 10687 + }, + { + "epoch": 0.771167791045853, + "grad_norm": 0.11095081269741058, + "learning_rate": 0.0001691730408428345, + "loss": 0.179, + "step": 10688 + }, + { + "epoch": 0.7712399437209134, + "grad_norm": 0.14396479725837708, + "learning_rate": 0.00016917015442343772, + "loss": 0.1457, + "step": 10689 + }, + { + "epoch": 0.7713120963959739, + "grad_norm": 0.11976207047700882, + "learning_rate": 0.00016916726800404099, + "loss": 0.2047, + "step": 10690 + }, + { + "epoch": 0.7713842490710343, + "grad_norm": 0.14696775376796722, + "learning_rate": 0.00016916438158464425, + "loss": 0.1652, + "step": 10691 + }, + { + "epoch": 0.7714564017460948, + "grad_norm": 0.14759984612464905, + "learning_rate": 0.0001691614951652475, + "loss": 0.1867, + "step": 10692 + }, + { + "epoch": 0.7715285544211552, + "grad_norm": 0.15106844902038574, + "learning_rate": 0.0001691586087458508, + "loss": 0.1715, + "step": 10693 + }, + { + "epoch": 0.7716007070962156, + "grad_norm": 0.1324152797460556, + "learning_rate": 0.00016915572232645404, + "loss": 0.1529, + "step": 10694 + }, + { + "epoch": 0.771672859771276, + "grad_norm": 0.1258843094110489, + "learning_rate": 0.0001691528359070573, + "loss": 0.1704, + "step": 10695 + }, + { + "epoch": 0.7717450124463364, + "grad_norm": 0.12931056320667267, + "learning_rate": 0.00016914994948766056, + "loss": 0.141, + "step": 10696 + }, + { + "epoch": 0.7718171651213969, + "grad_norm": 0.1137009933590889, + "learning_rate": 0.00016914706306826383, + "loss": 0.1317, + "step": 10697 + }, + { + "epoch": 0.7718893177964573, + "grad_norm": 0.12198218703269958, + "learning_rate": 0.0001691441766488671, + "loss": 0.1589, + "step": 10698 + }, + { + "epoch": 0.7719614704715178, + "grad_norm": 0.13116715848445892, + "learning_rate": 0.00016914129022947035, + "loss": 0.1776, + "step": 10699 + }, + { + "epoch": 0.7720336231465782, + "grad_norm": 0.15571896731853485, + "learning_rate": 0.00016913840381007361, + "loss": 0.1644, + "step": 10700 + }, + { + "epoch": 0.7721057758216386, + "grad_norm": 0.15015654265880585, + "learning_rate": 0.00016913551739067688, + "loss": 0.142, + "step": 10701 + }, + { + "epoch": 0.772177928496699, + "grad_norm": 0.14481572806835175, + "learning_rate": 0.00016913263097128014, + "loss": 0.1455, + "step": 10702 + }, + { + "epoch": 0.7722500811717594, + "grad_norm": 0.11075858026742935, + "learning_rate": 0.0001691297445518834, + "loss": 0.1451, + "step": 10703 + }, + { + "epoch": 0.7723222338468199, + "grad_norm": 0.14457841217517853, + "learning_rate": 0.00016912685813248667, + "loss": 0.1397, + "step": 10704 + }, + { + "epoch": 0.7723943865218803, + "grad_norm": 0.12623868882656097, + "learning_rate": 0.0001691239717130899, + "loss": 0.1117, + "step": 10705 + }, + { + "epoch": 0.7724665391969407, + "grad_norm": 0.1958254724740982, + "learning_rate": 0.00016912108529369317, + "loss": 0.192, + "step": 10706 + }, + { + "epoch": 0.7725386918720012, + "grad_norm": 0.12673625349998474, + "learning_rate": 0.00016911819887429646, + "loss": 0.1742, + "step": 10707 + }, + { + "epoch": 0.7726108445470616, + "grad_norm": 0.16572904586791992, + "learning_rate": 0.00016911531245489972, + "loss": 0.1314, + "step": 10708 + }, + { + "epoch": 0.772682997222122, + "grad_norm": 0.1162322536110878, + "learning_rate": 0.00016911242603550298, + "loss": 0.1218, + "step": 10709 + }, + { + "epoch": 0.7727551498971824, + "grad_norm": 0.11697287857532501, + "learning_rate": 0.00016910953961610622, + "loss": 0.1442, + "step": 10710 + }, + { + "epoch": 0.7728273025722429, + "grad_norm": 0.11673609167337418, + "learning_rate": 0.00016910665319670948, + "loss": 0.1322, + "step": 10711 + }, + { + "epoch": 0.7728994552473033, + "grad_norm": 0.13033811748027802, + "learning_rate": 0.00016910376677731274, + "loss": 0.1588, + "step": 10712 + }, + { + "epoch": 0.7729716079223637, + "grad_norm": 0.11955799907445908, + "learning_rate": 0.000169100880357916, + "loss": 0.1487, + "step": 10713 + }, + { + "epoch": 0.7730437605974242, + "grad_norm": 0.1232030987739563, + "learning_rate": 0.0001690979939385193, + "loss": 0.1041, + "step": 10714 + }, + { + "epoch": 0.7731159132724845, + "grad_norm": 0.13681338727474213, + "learning_rate": 0.00016909510751912253, + "loss": 0.1624, + "step": 10715 + }, + { + "epoch": 0.773188065947545, + "grad_norm": 0.16010960936546326, + "learning_rate": 0.0001690922210997258, + "loss": 0.1497, + "step": 10716 + }, + { + "epoch": 0.7732602186226054, + "grad_norm": 0.10944380611181259, + "learning_rate": 0.00016908933468032906, + "loss": 0.0934, + "step": 10717 + }, + { + "epoch": 0.7733323712976659, + "grad_norm": 0.15146002173423767, + "learning_rate": 0.00016908644826093232, + "loss": 0.1349, + "step": 10718 + }, + { + "epoch": 0.7734045239727263, + "grad_norm": 0.12636396288871765, + "learning_rate": 0.00016908356184153558, + "loss": 0.0996, + "step": 10719 + }, + { + "epoch": 0.7734766766477867, + "grad_norm": 0.1262812614440918, + "learning_rate": 0.00016908067542213885, + "loss": 0.1905, + "step": 10720 + }, + { + "epoch": 0.7735488293228472, + "grad_norm": 0.12517490983009338, + "learning_rate": 0.0001690777890027421, + "loss": 0.1575, + "step": 10721 + }, + { + "epoch": 0.7736209819979075, + "grad_norm": 0.1311482936143875, + "learning_rate": 0.00016907490258334537, + "loss": 0.1676, + "step": 10722 + }, + { + "epoch": 0.773693134672968, + "grad_norm": 0.11919399350881577, + "learning_rate": 0.00016907201616394863, + "loss": 0.1316, + "step": 10723 + }, + { + "epoch": 0.7737652873480284, + "grad_norm": 0.13207107782363892, + "learning_rate": 0.0001690691297445519, + "loss": 0.1424, + "step": 10724 + }, + { + "epoch": 0.7738374400230889, + "grad_norm": 0.13004319369792938, + "learning_rate": 0.00016906624332515516, + "loss": 0.1912, + "step": 10725 + }, + { + "epoch": 0.7739095926981493, + "grad_norm": 0.1217634528875351, + "learning_rate": 0.0001690633569057584, + "loss": 0.143, + "step": 10726 + }, + { + "epoch": 0.7739817453732097, + "grad_norm": 0.1645490676164627, + "learning_rate": 0.00016906047048636166, + "loss": 0.1609, + "step": 10727 + }, + { + "epoch": 0.7740538980482702, + "grad_norm": 0.15471287071704865, + "learning_rate": 0.00016905758406696495, + "loss": 0.1037, + "step": 10728 + }, + { + "epoch": 0.7741260507233305, + "grad_norm": 0.15667590498924255, + "learning_rate": 0.0001690546976475682, + "loss": 0.2011, + "step": 10729 + }, + { + "epoch": 0.774198203398391, + "grad_norm": 0.1259351372718811, + "learning_rate": 0.00016905181122817148, + "loss": 0.1646, + "step": 10730 + }, + { + "epoch": 0.7742703560734514, + "grad_norm": 0.12652148306369781, + "learning_rate": 0.0001690489248087747, + "loss": 0.119, + "step": 10731 + }, + { + "epoch": 0.7743425087485118, + "grad_norm": 0.1419692188501358, + "learning_rate": 0.00016904603838937797, + "loss": 0.1098, + "step": 10732 + }, + { + "epoch": 0.7744146614235723, + "grad_norm": 0.12805712223052979, + "learning_rate": 0.00016904315196998124, + "loss": 0.1245, + "step": 10733 + }, + { + "epoch": 0.7744868140986327, + "grad_norm": 0.13761387765407562, + "learning_rate": 0.0001690402655505845, + "loss": 0.1687, + "step": 10734 + }, + { + "epoch": 0.7745589667736932, + "grad_norm": 0.1373915672302246, + "learning_rate": 0.0001690373791311878, + "loss": 0.1557, + "step": 10735 + }, + { + "epoch": 0.7746311194487535, + "grad_norm": 0.16077910363674164, + "learning_rate": 0.00016903449271179103, + "loss": 0.1386, + "step": 10736 + }, + { + "epoch": 0.774703272123814, + "grad_norm": 0.16774095594882965, + "learning_rate": 0.0001690316062923943, + "loss": 0.1649, + "step": 10737 + }, + { + "epoch": 0.7747754247988744, + "grad_norm": 0.14782363176345825, + "learning_rate": 0.00016902871987299755, + "loss": 0.1749, + "step": 10738 + }, + { + "epoch": 0.7748475774739348, + "grad_norm": 0.15641121566295624, + "learning_rate": 0.00016902583345360081, + "loss": 0.1631, + "step": 10739 + }, + { + "epoch": 0.7749197301489953, + "grad_norm": 0.12619055807590485, + "learning_rate": 0.00016902294703420408, + "loss": 0.1684, + "step": 10740 + }, + { + "epoch": 0.7749918828240557, + "grad_norm": 0.12410851567983627, + "learning_rate": 0.00016902006061480734, + "loss": 0.1097, + "step": 10741 + }, + { + "epoch": 0.7750640354991162, + "grad_norm": 0.11382365971803665, + "learning_rate": 0.0001690171741954106, + "loss": 0.1824, + "step": 10742 + }, + { + "epoch": 0.7751361881741765, + "grad_norm": 0.11722074449062347, + "learning_rate": 0.00016901428777601387, + "loss": 0.1302, + "step": 10743 + }, + { + "epoch": 0.775208340849237, + "grad_norm": 0.15282313525676727, + "learning_rate": 0.00016901140135661713, + "loss": 0.1719, + "step": 10744 + }, + { + "epoch": 0.7752804935242974, + "grad_norm": 0.09096650779247284, + "learning_rate": 0.0001690085149372204, + "loss": 0.1648, + "step": 10745 + }, + { + "epoch": 0.7753526461993578, + "grad_norm": 0.10087671130895615, + "learning_rate": 0.00016900562851782366, + "loss": 0.1545, + "step": 10746 + }, + { + "epoch": 0.7754247988744183, + "grad_norm": 0.11997248977422714, + "learning_rate": 0.0001690027420984269, + "loss": 0.1821, + "step": 10747 + }, + { + "epoch": 0.7754969515494787, + "grad_norm": 0.10546719282865524, + "learning_rate": 0.00016899985567903015, + "loss": 0.1186, + "step": 10748 + }, + { + "epoch": 0.7755691042245392, + "grad_norm": 0.1350906938314438, + "learning_rate": 0.00016899696925963344, + "loss": 0.143, + "step": 10749 + }, + { + "epoch": 0.7756412568995995, + "grad_norm": 0.12140945345163345, + "learning_rate": 0.0001689940828402367, + "loss": 0.1197, + "step": 10750 + }, + { + "epoch": 0.77571340957466, + "grad_norm": 0.13864918053150177, + "learning_rate": 0.00016899119642083997, + "loss": 0.1636, + "step": 10751 + }, + { + "epoch": 0.7757855622497204, + "grad_norm": 0.12148050963878632, + "learning_rate": 0.0001689883100014432, + "loss": 0.1389, + "step": 10752 + }, + { + "epoch": 0.7758577149247808, + "grad_norm": 0.11241988092660904, + "learning_rate": 0.00016898542358204647, + "loss": 0.1512, + "step": 10753 + }, + { + "epoch": 0.7759298675998413, + "grad_norm": 0.11895349621772766, + "learning_rate": 0.00016898253716264973, + "loss": 0.0942, + "step": 10754 + }, + { + "epoch": 0.7760020202749017, + "grad_norm": 0.10460896044969559, + "learning_rate": 0.000168979650743253, + "loss": 0.1165, + "step": 10755 + }, + { + "epoch": 0.7760741729499622, + "grad_norm": 0.12709875404834747, + "learning_rate": 0.00016897676432385628, + "loss": 0.1779, + "step": 10756 + }, + { + "epoch": 0.7761463256250225, + "grad_norm": 0.12685567140579224, + "learning_rate": 0.00016897387790445952, + "loss": 0.1404, + "step": 10757 + }, + { + "epoch": 0.776218478300083, + "grad_norm": 0.12755173444747925, + "learning_rate": 0.00016897099148506278, + "loss": 0.1457, + "step": 10758 + }, + { + "epoch": 0.7762906309751434, + "grad_norm": 0.12302425503730774, + "learning_rate": 0.00016896810506566605, + "loss": 0.1253, + "step": 10759 + }, + { + "epoch": 0.7763627836502038, + "grad_norm": 0.1417432427406311, + "learning_rate": 0.0001689652186462693, + "loss": 0.1744, + "step": 10760 + }, + { + "epoch": 0.7764349363252643, + "grad_norm": 0.1446855515241623, + "learning_rate": 0.00016896233222687257, + "loss": 0.1304, + "step": 10761 + }, + { + "epoch": 0.7765070890003247, + "grad_norm": 0.13084016740322113, + "learning_rate": 0.00016895944580747583, + "loss": 0.1248, + "step": 10762 + }, + { + "epoch": 0.7765792416753852, + "grad_norm": 0.126628115773201, + "learning_rate": 0.0001689565593880791, + "loss": 0.176, + "step": 10763 + }, + { + "epoch": 0.7766513943504455, + "grad_norm": 0.11866142600774765, + "learning_rate": 0.00016895367296868236, + "loss": 0.1711, + "step": 10764 + }, + { + "epoch": 0.776723547025506, + "grad_norm": 0.10277380049228668, + "learning_rate": 0.00016895078654928562, + "loss": 0.1761, + "step": 10765 + }, + { + "epoch": 0.7767956997005664, + "grad_norm": 0.15225686132907867, + "learning_rate": 0.00016894790012988889, + "loss": 0.1789, + "step": 10766 + }, + { + "epoch": 0.7768678523756268, + "grad_norm": 0.12679462134838104, + "learning_rate": 0.00016894501371049215, + "loss": 0.1518, + "step": 10767 + }, + { + "epoch": 0.7769400050506873, + "grad_norm": 0.1197810247540474, + "learning_rate": 0.00016894212729109539, + "loss": 0.1386, + "step": 10768 + }, + { + "epoch": 0.7770121577257477, + "grad_norm": 0.12745292484760284, + "learning_rate": 0.00016893924087169865, + "loss": 0.0812, + "step": 10769 + }, + { + "epoch": 0.7770843104008082, + "grad_norm": 0.10420835763216019, + "learning_rate": 0.00016893635445230194, + "loss": 0.1773, + "step": 10770 + }, + { + "epoch": 0.7771564630758685, + "grad_norm": 0.13833922147750854, + "learning_rate": 0.0001689334680329052, + "loss": 0.1549, + "step": 10771 + }, + { + "epoch": 0.7772286157509289, + "grad_norm": 0.1223774328827858, + "learning_rate": 0.00016893058161350846, + "loss": 0.1592, + "step": 10772 + }, + { + "epoch": 0.7773007684259894, + "grad_norm": 0.1366223692893982, + "learning_rate": 0.0001689276951941117, + "loss": 0.1607, + "step": 10773 + }, + { + "epoch": 0.7773729211010498, + "grad_norm": 0.14215941727161407, + "learning_rate": 0.00016892480877471496, + "loss": 0.143, + "step": 10774 + }, + { + "epoch": 0.7774450737761103, + "grad_norm": 0.12098676711320877, + "learning_rate": 0.00016892192235531823, + "loss": 0.1474, + "step": 10775 + }, + { + "epoch": 0.7775172264511707, + "grad_norm": 0.14428995549678802, + "learning_rate": 0.0001689190359359215, + "loss": 0.1557, + "step": 10776 + }, + { + "epoch": 0.777589379126231, + "grad_norm": 0.12384141981601715, + "learning_rate": 0.00016891614951652478, + "loss": 0.1735, + "step": 10777 + }, + { + "epoch": 0.7776615318012915, + "grad_norm": 0.1443023979663849, + "learning_rate": 0.00016891326309712801, + "loss": 0.1317, + "step": 10778 + }, + { + "epoch": 0.7777336844763519, + "grad_norm": 0.11942099034786224, + "learning_rate": 0.00016891037667773128, + "loss": 0.1694, + "step": 10779 + }, + { + "epoch": 0.7778058371514124, + "grad_norm": 0.13776026666164398, + "learning_rate": 0.00016890749025833454, + "loss": 0.147, + "step": 10780 + }, + { + "epoch": 0.7778779898264728, + "grad_norm": 0.12703008949756622, + "learning_rate": 0.0001689046038389378, + "loss": 0.1834, + "step": 10781 + }, + { + "epoch": 0.7779501425015333, + "grad_norm": 0.1265055388212204, + "learning_rate": 0.00016890171741954107, + "loss": 0.1549, + "step": 10782 + }, + { + "epoch": 0.7780222951765937, + "grad_norm": 0.12888294458389282, + "learning_rate": 0.00016889883100014433, + "loss": 0.1315, + "step": 10783 + }, + { + "epoch": 0.778094447851654, + "grad_norm": 0.11409862339496613, + "learning_rate": 0.0001688959445807476, + "loss": 0.1035, + "step": 10784 + }, + { + "epoch": 0.7781666005267145, + "grad_norm": 0.12385515868663788, + "learning_rate": 0.00016889305816135085, + "loss": 0.187, + "step": 10785 + }, + { + "epoch": 0.7782387532017749, + "grad_norm": 0.16930937767028809, + "learning_rate": 0.00016889017174195412, + "loss": 0.1942, + "step": 10786 + }, + { + "epoch": 0.7783109058768354, + "grad_norm": 0.11819856613874435, + "learning_rate": 0.00016888728532255738, + "loss": 0.103, + "step": 10787 + }, + { + "epoch": 0.7783830585518958, + "grad_norm": 0.1241755485534668, + "learning_rate": 0.00016888439890316064, + "loss": 0.127, + "step": 10788 + }, + { + "epoch": 0.7784552112269563, + "grad_norm": 0.14113487303256989, + "learning_rate": 0.0001688815124837639, + "loss": 0.1682, + "step": 10789 + }, + { + "epoch": 0.7785273639020167, + "grad_norm": 0.12371841818094254, + "learning_rate": 0.00016887862606436714, + "loss": 0.1559, + "step": 10790 + }, + { + "epoch": 0.778599516577077, + "grad_norm": 0.12854477763175964, + "learning_rate": 0.00016887573964497043, + "loss": 0.1635, + "step": 10791 + }, + { + "epoch": 0.7786716692521375, + "grad_norm": 0.12481623888015747, + "learning_rate": 0.0001688728532255737, + "loss": 0.1517, + "step": 10792 + }, + { + "epoch": 0.7787438219271979, + "grad_norm": 0.12101232260465622, + "learning_rate": 0.00016886996680617696, + "loss": 0.181, + "step": 10793 + }, + { + "epoch": 0.7788159746022584, + "grad_norm": 0.15985621511936188, + "learning_rate": 0.00016886708038678022, + "loss": 0.1382, + "step": 10794 + }, + { + "epoch": 0.7788881272773188, + "grad_norm": 0.12230433523654938, + "learning_rate": 0.00016886419396738346, + "loss": 0.129, + "step": 10795 + }, + { + "epoch": 0.7789602799523793, + "grad_norm": 0.124177947640419, + "learning_rate": 0.00016886130754798672, + "loss": 0.138, + "step": 10796 + }, + { + "epoch": 0.7790324326274397, + "grad_norm": 0.16062316298484802, + "learning_rate": 0.00016885842112858998, + "loss": 0.1646, + "step": 10797 + }, + { + "epoch": 0.7791045853025, + "grad_norm": 0.12008413672447205, + "learning_rate": 0.00016885553470919327, + "loss": 0.157, + "step": 10798 + }, + { + "epoch": 0.7791767379775605, + "grad_norm": 0.1269167959690094, + "learning_rate": 0.00016885264828979654, + "loss": 0.1238, + "step": 10799 + }, + { + "epoch": 0.7792488906526209, + "grad_norm": 0.10606911778450012, + "learning_rate": 0.00016884976187039977, + "loss": 0.1213, + "step": 10800 + }, + { + "epoch": 0.7793210433276814, + "grad_norm": 0.10247175395488739, + "learning_rate": 0.00016884687545100303, + "loss": 0.1617, + "step": 10801 + }, + { + "epoch": 0.7793931960027418, + "grad_norm": 0.107964888215065, + "learning_rate": 0.0001688439890316063, + "loss": 0.1601, + "step": 10802 + }, + { + "epoch": 0.7794653486778023, + "grad_norm": 0.13006651401519775, + "learning_rate": 0.00016884110261220956, + "loss": 0.1439, + "step": 10803 + }, + { + "epoch": 0.7795375013528627, + "grad_norm": 0.1302005499601364, + "learning_rate": 0.00016883821619281282, + "loss": 0.1269, + "step": 10804 + }, + { + "epoch": 0.779609654027923, + "grad_norm": 0.12471070140600204, + "learning_rate": 0.00016883532977341609, + "loss": 0.1638, + "step": 10805 + }, + { + "epoch": 0.7796818067029835, + "grad_norm": 0.10785996168851852, + "learning_rate": 0.00016883244335401935, + "loss": 0.1934, + "step": 10806 + }, + { + "epoch": 0.7797539593780439, + "grad_norm": 0.12432392686605453, + "learning_rate": 0.0001688295569346226, + "loss": 0.1791, + "step": 10807 + }, + { + "epoch": 0.7798261120531044, + "grad_norm": 0.13929963111877441, + "learning_rate": 0.00016882667051522587, + "loss": 0.1332, + "step": 10808 + }, + { + "epoch": 0.7798982647281648, + "grad_norm": 0.12116377800703049, + "learning_rate": 0.00016882378409582914, + "loss": 0.1829, + "step": 10809 + }, + { + "epoch": 0.7799704174032253, + "grad_norm": 0.11945638060569763, + "learning_rate": 0.0001688208976764324, + "loss": 0.1106, + "step": 10810 + }, + { + "epoch": 0.7800425700782857, + "grad_norm": 0.1397213488817215, + "learning_rate": 0.00016881801125703564, + "loss": 0.1425, + "step": 10811 + }, + { + "epoch": 0.780114722753346, + "grad_norm": 0.11561702191829681, + "learning_rate": 0.00016881512483763893, + "loss": 0.1579, + "step": 10812 + }, + { + "epoch": 0.7801868754284065, + "grad_norm": 0.12192624807357788, + "learning_rate": 0.0001688122384182422, + "loss": 0.1477, + "step": 10813 + }, + { + "epoch": 0.7802590281034669, + "grad_norm": 0.12877143919467926, + "learning_rate": 0.00016880935199884545, + "loss": 0.1478, + "step": 10814 + }, + { + "epoch": 0.7803311807785274, + "grad_norm": 0.14012739062309265, + "learning_rate": 0.00016880646557944872, + "loss": 0.1527, + "step": 10815 + }, + { + "epoch": 0.7804033334535878, + "grad_norm": 0.14249911904335022, + "learning_rate": 0.00016880357916005195, + "loss": 0.1467, + "step": 10816 + }, + { + "epoch": 0.7804754861286483, + "grad_norm": 0.1142381876707077, + "learning_rate": 0.00016880069274065521, + "loss": 0.1031, + "step": 10817 + }, + { + "epoch": 0.7805476388037087, + "grad_norm": 0.14810532331466675, + "learning_rate": 0.00016879780632125848, + "loss": 0.1363, + "step": 10818 + }, + { + "epoch": 0.780619791478769, + "grad_norm": 0.11812330037355423, + "learning_rate": 0.00016879491990186177, + "loss": 0.1941, + "step": 10819 + }, + { + "epoch": 0.7806919441538295, + "grad_norm": 0.13109090924263, + "learning_rate": 0.00016879203348246503, + "loss": 0.1461, + "step": 10820 + }, + { + "epoch": 0.7807640968288899, + "grad_norm": 0.1146184429526329, + "learning_rate": 0.00016878914706306827, + "loss": 0.1562, + "step": 10821 + }, + { + "epoch": 0.7808362495039504, + "grad_norm": 0.14675399661064148, + "learning_rate": 0.00016878626064367153, + "loss": 0.134, + "step": 10822 + }, + { + "epoch": 0.7809084021790108, + "grad_norm": 0.13877572119235992, + "learning_rate": 0.0001687833742242748, + "loss": 0.1441, + "step": 10823 + }, + { + "epoch": 0.7809805548540713, + "grad_norm": 0.13332527875900269, + "learning_rate": 0.00016878048780487805, + "loss": 0.189, + "step": 10824 + }, + { + "epoch": 0.7810527075291317, + "grad_norm": 0.11128589510917664, + "learning_rate": 0.00016877760138548132, + "loss": 0.1317, + "step": 10825 + }, + { + "epoch": 0.781124860204192, + "grad_norm": 0.13484609127044678, + "learning_rate": 0.00016877471496608458, + "loss": 0.1502, + "step": 10826 + }, + { + "epoch": 0.7811970128792525, + "grad_norm": 0.1292797327041626, + "learning_rate": 0.00016877182854668784, + "loss": 0.1171, + "step": 10827 + }, + { + "epoch": 0.7812691655543129, + "grad_norm": 0.14283114671707153, + "learning_rate": 0.0001687689421272911, + "loss": 0.1898, + "step": 10828 + }, + { + "epoch": 0.7813413182293734, + "grad_norm": 0.13625703752040863, + "learning_rate": 0.00016876605570789437, + "loss": 0.1963, + "step": 10829 + }, + { + "epoch": 0.7814134709044338, + "grad_norm": 0.11518652737140656, + "learning_rate": 0.00016876316928849763, + "loss": 0.1691, + "step": 10830 + }, + { + "epoch": 0.7814856235794942, + "grad_norm": 0.13726310431957245, + "learning_rate": 0.0001687602828691009, + "loss": 0.1371, + "step": 10831 + }, + { + "epoch": 0.7815577762545547, + "grad_norm": 0.1303400844335556, + "learning_rate": 0.00016875739644970413, + "loss": 0.1665, + "step": 10832 + }, + { + "epoch": 0.781629928929615, + "grad_norm": 0.14024718105793, + "learning_rate": 0.00016875451003030742, + "loss": 0.1337, + "step": 10833 + }, + { + "epoch": 0.7817020816046755, + "grad_norm": 0.116419218480587, + "learning_rate": 0.00016875162361091068, + "loss": 0.16, + "step": 10834 + }, + { + "epoch": 0.7817742342797359, + "grad_norm": 0.1419404000043869, + "learning_rate": 0.00016874873719151395, + "loss": 0.1306, + "step": 10835 + }, + { + "epoch": 0.7818463869547964, + "grad_norm": 0.11938845366239548, + "learning_rate": 0.0001687458507721172, + "loss": 0.1363, + "step": 10836 + }, + { + "epoch": 0.7819185396298568, + "grad_norm": 0.12528158724308014, + "learning_rate": 0.00016874296435272045, + "loss": 0.1374, + "step": 10837 + }, + { + "epoch": 0.7819906923049172, + "grad_norm": 0.11019222438335419, + "learning_rate": 0.0001687400779333237, + "loss": 0.1364, + "step": 10838 + }, + { + "epoch": 0.7820628449799776, + "grad_norm": 0.1004524901509285, + "learning_rate": 0.00016873719151392697, + "loss": 0.1293, + "step": 10839 + }, + { + "epoch": 0.782134997655038, + "grad_norm": 0.13266058266162872, + "learning_rate": 0.00016873430509453026, + "loss": 0.1314, + "step": 10840 + }, + { + "epoch": 0.7822071503300985, + "grad_norm": 0.17364564538002014, + "learning_rate": 0.00016873141867513352, + "loss": 0.1968, + "step": 10841 + }, + { + "epoch": 0.7822793030051589, + "grad_norm": 0.1177988052368164, + "learning_rate": 0.00016872853225573676, + "loss": 0.1591, + "step": 10842 + }, + { + "epoch": 0.7823514556802194, + "grad_norm": 0.18249043822288513, + "learning_rate": 0.00016872564583634002, + "loss": 0.2081, + "step": 10843 + }, + { + "epoch": 0.7824236083552798, + "grad_norm": 0.11414939910173416, + "learning_rate": 0.00016872275941694329, + "loss": 0.1286, + "step": 10844 + }, + { + "epoch": 0.7824957610303402, + "grad_norm": 0.11555025726556778, + "learning_rate": 0.00016871987299754655, + "loss": 0.1155, + "step": 10845 + }, + { + "epoch": 0.7825679137054006, + "grad_norm": 0.12041687965393066, + "learning_rate": 0.0001687169865781498, + "loss": 0.109, + "step": 10846 + }, + { + "epoch": 0.782640066380461, + "grad_norm": 0.12202759832143784, + "learning_rate": 0.00016871410015875307, + "loss": 0.1369, + "step": 10847 + }, + { + "epoch": 0.7827122190555215, + "grad_norm": 0.11626517027616501, + "learning_rate": 0.00016871121373935634, + "loss": 0.1648, + "step": 10848 + }, + { + "epoch": 0.7827843717305819, + "grad_norm": 0.113328717648983, + "learning_rate": 0.0001687083273199596, + "loss": 0.178, + "step": 10849 + }, + { + "epoch": 0.7828565244056424, + "grad_norm": 0.15602675080299377, + "learning_rate": 0.00016870544090056286, + "loss": 0.1751, + "step": 10850 + }, + { + "epoch": 0.7829286770807028, + "grad_norm": 0.13897716999053955, + "learning_rate": 0.00016870255448116613, + "loss": 0.1409, + "step": 10851 + }, + { + "epoch": 0.7830008297557632, + "grad_norm": 0.13525345921516418, + "learning_rate": 0.0001686996680617694, + "loss": 0.1135, + "step": 10852 + }, + { + "epoch": 0.7830729824308236, + "grad_norm": 0.1421322375535965, + "learning_rate": 0.00016869678164237263, + "loss": 0.1788, + "step": 10853 + }, + { + "epoch": 0.783145135105884, + "grad_norm": 0.1331518441438675, + "learning_rate": 0.00016869389522297592, + "loss": 0.2281, + "step": 10854 + }, + { + "epoch": 0.7832172877809445, + "grad_norm": 0.13175399601459503, + "learning_rate": 0.00016869100880357918, + "loss": 0.1567, + "step": 10855 + }, + { + "epoch": 0.7832894404560049, + "grad_norm": 0.13462431728839874, + "learning_rate": 0.00016868812238418244, + "loss": 0.1574, + "step": 10856 + }, + { + "epoch": 0.7833615931310653, + "grad_norm": 0.15298853814601898, + "learning_rate": 0.0001686852359647857, + "loss": 0.1195, + "step": 10857 + }, + { + "epoch": 0.7834337458061258, + "grad_norm": 0.14155510067939758, + "learning_rate": 0.00016868234954538894, + "loss": 0.1517, + "step": 10858 + }, + { + "epoch": 0.7835058984811862, + "grad_norm": 0.12201669067144394, + "learning_rate": 0.0001686794631259922, + "loss": 0.1525, + "step": 10859 + }, + { + "epoch": 0.7835780511562466, + "grad_norm": 0.12332157045602798, + "learning_rate": 0.00016867657670659547, + "loss": 0.1462, + "step": 10860 + }, + { + "epoch": 0.783650203831307, + "grad_norm": 0.13681064546108246, + "learning_rate": 0.00016867369028719876, + "loss": 0.1072, + "step": 10861 + }, + { + "epoch": 0.7837223565063675, + "grad_norm": 0.16225147247314453, + "learning_rate": 0.00016867080386780202, + "loss": 0.119, + "step": 10862 + }, + { + "epoch": 0.7837945091814279, + "grad_norm": 0.12531347572803497, + "learning_rate": 0.00016866791744840525, + "loss": 0.1352, + "step": 10863 + }, + { + "epoch": 0.7838666618564883, + "grad_norm": 0.11706455051898956, + "learning_rate": 0.00016866503102900852, + "loss": 0.1303, + "step": 10864 + }, + { + "epoch": 0.7839388145315488, + "grad_norm": 0.12031789124011993, + "learning_rate": 0.00016866214460961178, + "loss": 0.2086, + "step": 10865 + }, + { + "epoch": 0.7840109672066092, + "grad_norm": 0.12857460975646973, + "learning_rate": 0.00016865925819021504, + "loss": 0.1688, + "step": 10866 + }, + { + "epoch": 0.7840831198816696, + "grad_norm": 0.12610378861427307, + "learning_rate": 0.0001686563717708183, + "loss": 0.1963, + "step": 10867 + }, + { + "epoch": 0.78415527255673, + "grad_norm": 0.12777261435985565, + "learning_rate": 0.00016865348535142157, + "loss": 0.1576, + "step": 10868 + }, + { + "epoch": 0.7842274252317905, + "grad_norm": 0.09360314160585403, + "learning_rate": 0.00016865059893202483, + "loss": 0.1196, + "step": 10869 + }, + { + "epoch": 0.7842995779068509, + "grad_norm": 0.12106601148843765, + "learning_rate": 0.0001686477125126281, + "loss": 0.0844, + "step": 10870 + }, + { + "epoch": 0.7843717305819113, + "grad_norm": 0.12097611278295517, + "learning_rate": 0.00016864482609323136, + "loss": 0.1942, + "step": 10871 + }, + { + "epoch": 0.7844438832569718, + "grad_norm": 0.12000215798616409, + "learning_rate": 0.00016864193967383462, + "loss": 0.1234, + "step": 10872 + }, + { + "epoch": 0.7845160359320322, + "grad_norm": 0.13257797062397003, + "learning_rate": 0.00016863905325443788, + "loss": 0.1488, + "step": 10873 + }, + { + "epoch": 0.7845881886070926, + "grad_norm": 0.11641601473093033, + "learning_rate": 0.00016863616683504112, + "loss": 0.1159, + "step": 10874 + }, + { + "epoch": 0.784660341282153, + "grad_norm": 0.11885194480419159, + "learning_rate": 0.00016863328041564438, + "loss": 0.1736, + "step": 10875 + }, + { + "epoch": 0.7847324939572135, + "grad_norm": 0.12223616987466812, + "learning_rate": 0.00016863039399624767, + "loss": 0.1634, + "step": 10876 + }, + { + "epoch": 0.7848046466322739, + "grad_norm": 0.1331803798675537, + "learning_rate": 0.00016862750757685094, + "loss": 0.1812, + "step": 10877 + }, + { + "epoch": 0.7848767993073343, + "grad_norm": 0.13615085184574127, + "learning_rate": 0.0001686246211574542, + "loss": 0.1512, + "step": 10878 + }, + { + "epoch": 0.7849489519823948, + "grad_norm": 0.12675344944000244, + "learning_rate": 0.00016862173473805743, + "loss": 0.2002, + "step": 10879 + }, + { + "epoch": 0.7850211046574552, + "grad_norm": 0.13016042113304138, + "learning_rate": 0.0001686188483186607, + "loss": 0.1745, + "step": 10880 + }, + { + "epoch": 0.7850932573325156, + "grad_norm": 0.15857744216918945, + "learning_rate": 0.00016861596189926396, + "loss": 0.1854, + "step": 10881 + }, + { + "epoch": 0.785165410007576, + "grad_norm": 0.1491554081439972, + "learning_rate": 0.00016861307547986722, + "loss": 0.1416, + "step": 10882 + }, + { + "epoch": 0.7852375626826364, + "grad_norm": 0.1276445835828781, + "learning_rate": 0.0001686101890604705, + "loss": 0.1493, + "step": 10883 + }, + { + "epoch": 0.7853097153576969, + "grad_norm": 0.1050001010298729, + "learning_rate": 0.00016860730264107375, + "loss": 0.163, + "step": 10884 + }, + { + "epoch": 0.7853818680327573, + "grad_norm": 0.16622294485569, + "learning_rate": 0.000168604416221677, + "loss": 0.1567, + "step": 10885 + }, + { + "epoch": 0.7854540207078178, + "grad_norm": 0.1504763662815094, + "learning_rate": 0.00016860152980228027, + "loss": 0.1235, + "step": 10886 + }, + { + "epoch": 0.7855261733828782, + "grad_norm": 0.11783110350370407, + "learning_rate": 0.00016859864338288354, + "loss": 0.1368, + "step": 10887 + }, + { + "epoch": 0.7855983260579386, + "grad_norm": 0.13893578946590424, + "learning_rate": 0.0001685957569634868, + "loss": 0.1315, + "step": 10888 + }, + { + "epoch": 0.785670478732999, + "grad_norm": 0.12017761915922165, + "learning_rate": 0.00016859287054409006, + "loss": 0.181, + "step": 10889 + }, + { + "epoch": 0.7857426314080594, + "grad_norm": 0.13865232467651367, + "learning_rate": 0.00016858998412469333, + "loss": 0.1214, + "step": 10890 + }, + { + "epoch": 0.7858147840831199, + "grad_norm": 0.1186799556016922, + "learning_rate": 0.0001685870977052966, + "loss": 0.1217, + "step": 10891 + }, + { + "epoch": 0.7858869367581803, + "grad_norm": 0.1285744309425354, + "learning_rate": 0.00016858421128589985, + "loss": 0.1181, + "step": 10892 + }, + { + "epoch": 0.7859590894332408, + "grad_norm": 0.11852288991212845, + "learning_rate": 0.00016858132486650311, + "loss": 0.1499, + "step": 10893 + }, + { + "epoch": 0.7860312421083012, + "grad_norm": 0.12144551426172256, + "learning_rate": 0.00016857843844710638, + "loss": 0.1436, + "step": 10894 + }, + { + "epoch": 0.7861033947833616, + "grad_norm": 0.13382890820503235, + "learning_rate": 0.0001685755520277096, + "loss": 0.1288, + "step": 10895 + }, + { + "epoch": 0.786175547458422, + "grad_norm": 0.12314509600400925, + "learning_rate": 0.00016857266560831288, + "loss": 0.1561, + "step": 10896 + }, + { + "epoch": 0.7862477001334824, + "grad_norm": 0.11607926338911057, + "learning_rate": 0.00016856977918891617, + "loss": 0.1555, + "step": 10897 + }, + { + "epoch": 0.7863198528085429, + "grad_norm": 0.13606394827365875, + "learning_rate": 0.00016856689276951943, + "loss": 0.1663, + "step": 10898 + }, + { + "epoch": 0.7863920054836033, + "grad_norm": 0.16237007081508636, + "learning_rate": 0.0001685640063501227, + "loss": 0.1275, + "step": 10899 + }, + { + "epoch": 0.7864641581586638, + "grad_norm": 0.13067243993282318, + "learning_rate": 0.00016856111993072593, + "loss": 0.1582, + "step": 10900 + }, + { + "epoch": 0.7865363108337241, + "grad_norm": 0.14115722477436066, + "learning_rate": 0.0001685582335113292, + "loss": 0.1358, + "step": 10901 + }, + { + "epoch": 0.7866084635087846, + "grad_norm": 0.09878934174776077, + "learning_rate": 0.00016855534709193245, + "loss": 0.1509, + "step": 10902 + }, + { + "epoch": 0.786680616183845, + "grad_norm": 0.132870152592659, + "learning_rate": 0.00016855246067253572, + "loss": 0.1154, + "step": 10903 + }, + { + "epoch": 0.7867527688589054, + "grad_norm": 0.08644016832113266, + "learning_rate": 0.000168549574253139, + "loss": 0.1532, + "step": 10904 + }, + { + "epoch": 0.7868249215339659, + "grad_norm": 0.12310770153999329, + "learning_rate": 0.00016854668783374224, + "loss": 0.1308, + "step": 10905 + }, + { + "epoch": 0.7868970742090263, + "grad_norm": 0.11368894577026367, + "learning_rate": 0.0001685438014143455, + "loss": 0.1171, + "step": 10906 + }, + { + "epoch": 0.7869692268840868, + "grad_norm": 0.11752180010080338, + "learning_rate": 0.00016854091499494877, + "loss": 0.1616, + "step": 10907 + }, + { + "epoch": 0.7870413795591471, + "grad_norm": 0.1096261516213417, + "learning_rate": 0.00016853802857555203, + "loss": 0.1141, + "step": 10908 + }, + { + "epoch": 0.7871135322342075, + "grad_norm": 0.1345599889755249, + "learning_rate": 0.0001685351421561553, + "loss": 0.1711, + "step": 10909 + }, + { + "epoch": 0.787185684909268, + "grad_norm": 0.11580589413642883, + "learning_rate": 0.00016853225573675856, + "loss": 0.144, + "step": 10910 + }, + { + "epoch": 0.7872578375843284, + "grad_norm": 0.16636911034584045, + "learning_rate": 0.00016852936931736182, + "loss": 0.1685, + "step": 10911 + }, + { + "epoch": 0.7873299902593889, + "grad_norm": 0.12539048492908478, + "learning_rate": 0.00016852648289796508, + "loss": 0.1341, + "step": 10912 + }, + { + "epoch": 0.7874021429344493, + "grad_norm": 0.12115386128425598, + "learning_rate": 0.00016852359647856835, + "loss": 0.1485, + "step": 10913 + }, + { + "epoch": 0.7874742956095098, + "grad_norm": 0.12285510450601578, + "learning_rate": 0.0001685207100591716, + "loss": 0.1301, + "step": 10914 + }, + { + "epoch": 0.7875464482845701, + "grad_norm": 0.11459830403327942, + "learning_rate": 0.00016851782363977487, + "loss": 0.1605, + "step": 10915 + }, + { + "epoch": 0.7876186009596305, + "grad_norm": 0.1197519302368164, + "learning_rate": 0.00016851493722037813, + "loss": 0.1275, + "step": 10916 + }, + { + "epoch": 0.787690753634691, + "grad_norm": 0.13795311748981476, + "learning_rate": 0.00016851205080098137, + "loss": 0.1568, + "step": 10917 + }, + { + "epoch": 0.7877629063097514, + "grad_norm": 0.14670489728450775, + "learning_rate": 0.00016850916438158466, + "loss": 0.2084, + "step": 10918 + }, + { + "epoch": 0.7878350589848119, + "grad_norm": 0.11156128346920013, + "learning_rate": 0.00016850627796218792, + "loss": 0.1654, + "step": 10919 + }, + { + "epoch": 0.7879072116598723, + "grad_norm": 0.13015100359916687, + "learning_rate": 0.0001685033915427912, + "loss": 0.1147, + "step": 10920 + }, + { + "epoch": 0.7879793643349328, + "grad_norm": 0.1453978270292282, + "learning_rate": 0.00016850050512339445, + "loss": 0.1496, + "step": 10921 + }, + { + "epoch": 0.7880515170099931, + "grad_norm": 0.12433165311813354, + "learning_rate": 0.00016849761870399769, + "loss": 0.1396, + "step": 10922 + }, + { + "epoch": 0.7881236696850535, + "grad_norm": 0.1300925463438034, + "learning_rate": 0.00016849473228460095, + "loss": 0.146, + "step": 10923 + }, + { + "epoch": 0.788195822360114, + "grad_norm": 0.15568086504936218, + "learning_rate": 0.0001684918458652042, + "loss": 0.1706, + "step": 10924 + }, + { + "epoch": 0.7882679750351744, + "grad_norm": 0.12890571355819702, + "learning_rate": 0.0001684889594458075, + "loss": 0.1484, + "step": 10925 + }, + { + "epoch": 0.7883401277102349, + "grad_norm": 0.12381922453641891, + "learning_rate": 0.00016848607302641076, + "loss": 0.1536, + "step": 10926 + }, + { + "epoch": 0.7884122803852953, + "grad_norm": 0.10168445855379105, + "learning_rate": 0.000168483186607014, + "loss": 0.1703, + "step": 10927 + }, + { + "epoch": 0.7884844330603558, + "grad_norm": 0.10643867403268814, + "learning_rate": 0.00016848030018761726, + "loss": 0.149, + "step": 10928 + }, + { + "epoch": 0.7885565857354161, + "grad_norm": 0.14253583550453186, + "learning_rate": 0.00016847741376822053, + "loss": 0.1532, + "step": 10929 + }, + { + "epoch": 0.7886287384104765, + "grad_norm": 0.1330222636461258, + "learning_rate": 0.0001684745273488238, + "loss": 0.1518, + "step": 10930 + }, + { + "epoch": 0.788700891085537, + "grad_norm": 0.10836239159107208, + "learning_rate": 0.00016847164092942705, + "loss": 0.1459, + "step": 10931 + }, + { + "epoch": 0.7887730437605974, + "grad_norm": 0.11788875609636307, + "learning_rate": 0.00016846875451003031, + "loss": 0.1645, + "step": 10932 + }, + { + "epoch": 0.7888451964356579, + "grad_norm": 0.13345979154109955, + "learning_rate": 0.00016846586809063358, + "loss": 0.1267, + "step": 10933 + }, + { + "epoch": 0.7889173491107183, + "grad_norm": 0.11228634417057037, + "learning_rate": 0.00016846298167123684, + "loss": 0.1121, + "step": 10934 + }, + { + "epoch": 0.7889895017857788, + "grad_norm": 0.13591991364955902, + "learning_rate": 0.0001684600952518401, + "loss": 0.1445, + "step": 10935 + }, + { + "epoch": 0.7890616544608391, + "grad_norm": 0.12425404042005539, + "learning_rate": 0.00016845720883244337, + "loss": 0.1689, + "step": 10936 + }, + { + "epoch": 0.7891338071358995, + "grad_norm": 0.11064977943897247, + "learning_rate": 0.00016845432241304663, + "loss": 0.1406, + "step": 10937 + }, + { + "epoch": 0.78920595981096, + "grad_norm": 0.12111656367778778, + "learning_rate": 0.00016845143599364986, + "loss": 0.142, + "step": 10938 + }, + { + "epoch": 0.7892781124860204, + "grad_norm": 0.13789787888526917, + "learning_rate": 0.00016844854957425315, + "loss": 0.1613, + "step": 10939 + }, + { + "epoch": 0.7893502651610809, + "grad_norm": 0.12854664027690887, + "learning_rate": 0.00016844566315485642, + "loss": 0.1634, + "step": 10940 + }, + { + "epoch": 0.7894224178361413, + "grad_norm": 0.12651242315769196, + "learning_rate": 0.00016844277673545968, + "loss": 0.1214, + "step": 10941 + }, + { + "epoch": 0.7894945705112018, + "grad_norm": 0.12767834961414337, + "learning_rate": 0.00016843989031606294, + "loss": 0.1311, + "step": 10942 + }, + { + "epoch": 0.7895667231862621, + "grad_norm": 0.13253939151763916, + "learning_rate": 0.00016843700389666618, + "loss": 0.107, + "step": 10943 + }, + { + "epoch": 0.7896388758613225, + "grad_norm": 0.130879208445549, + "learning_rate": 0.00016843411747726944, + "loss": 0.2173, + "step": 10944 + }, + { + "epoch": 0.789711028536383, + "grad_norm": 0.12859420478343964, + "learning_rate": 0.0001684312310578727, + "loss": 0.158, + "step": 10945 + }, + { + "epoch": 0.7897831812114434, + "grad_norm": 0.15572106838226318, + "learning_rate": 0.000168428344638476, + "loss": 0.1779, + "step": 10946 + }, + { + "epoch": 0.7898553338865039, + "grad_norm": 0.1253107190132141, + "learning_rate": 0.00016842545821907926, + "loss": 0.1792, + "step": 10947 + }, + { + "epoch": 0.7899274865615643, + "grad_norm": 0.13750699162483215, + "learning_rate": 0.0001684225717996825, + "loss": 0.1698, + "step": 10948 + }, + { + "epoch": 0.7899996392366248, + "grad_norm": 0.1363247185945511, + "learning_rate": 0.00016841968538028576, + "loss": 0.1339, + "step": 10949 + }, + { + "epoch": 0.7900717919116851, + "grad_norm": 0.11178048700094223, + "learning_rate": 0.00016841679896088902, + "loss": 0.1553, + "step": 10950 + }, + { + "epoch": 0.7901439445867455, + "grad_norm": 0.14459486305713654, + "learning_rate": 0.00016841391254149228, + "loss": 0.1378, + "step": 10951 + }, + { + "epoch": 0.790216097261806, + "grad_norm": 0.11802060902118683, + "learning_rate": 0.00016841102612209555, + "loss": 0.1492, + "step": 10952 + }, + { + "epoch": 0.7902882499368664, + "grad_norm": 0.12663447856903076, + "learning_rate": 0.0001684081397026988, + "loss": 0.1783, + "step": 10953 + }, + { + "epoch": 0.7903604026119269, + "grad_norm": 0.12220313400030136, + "learning_rate": 0.00016840525328330207, + "loss": 0.1298, + "step": 10954 + }, + { + "epoch": 0.7904325552869873, + "grad_norm": 0.12975122034549713, + "learning_rate": 0.00016840236686390533, + "loss": 0.195, + "step": 10955 + }, + { + "epoch": 0.7905047079620477, + "grad_norm": 0.10023821890354156, + "learning_rate": 0.0001683994804445086, + "loss": 0.1572, + "step": 10956 + }, + { + "epoch": 0.7905768606371081, + "grad_norm": 0.10845848172903061, + "learning_rate": 0.00016839659402511186, + "loss": 0.159, + "step": 10957 + }, + { + "epoch": 0.7906490133121685, + "grad_norm": 0.10611424595117569, + "learning_rate": 0.00016839370760571512, + "loss": 0.1648, + "step": 10958 + }, + { + "epoch": 0.790721165987229, + "grad_norm": 0.1480133831501007, + "learning_rate": 0.00016839082118631836, + "loss": 0.1169, + "step": 10959 + }, + { + "epoch": 0.7907933186622894, + "grad_norm": 0.1449739634990692, + "learning_rate": 0.00016838793476692165, + "loss": 0.2015, + "step": 10960 + }, + { + "epoch": 0.7908654713373499, + "grad_norm": 0.1358804553747177, + "learning_rate": 0.0001683850483475249, + "loss": 0.1521, + "step": 10961 + }, + { + "epoch": 0.7909376240124103, + "grad_norm": 0.12293657660484314, + "learning_rate": 0.00016838216192812818, + "loss": 0.1589, + "step": 10962 + }, + { + "epoch": 0.7910097766874706, + "grad_norm": 0.12123416364192963, + "learning_rate": 0.00016837927550873144, + "loss": 0.1311, + "step": 10963 + }, + { + "epoch": 0.7910819293625311, + "grad_norm": 0.17734867334365845, + "learning_rate": 0.00016837638908933467, + "loss": 0.1805, + "step": 10964 + }, + { + "epoch": 0.7911540820375915, + "grad_norm": 0.12703557312488556, + "learning_rate": 0.00016837350266993794, + "loss": 0.1313, + "step": 10965 + }, + { + "epoch": 0.791226234712652, + "grad_norm": 0.13336026668548584, + "learning_rate": 0.0001683706162505412, + "loss": 0.1366, + "step": 10966 + }, + { + "epoch": 0.7912983873877124, + "grad_norm": 0.13004052639007568, + "learning_rate": 0.0001683677298311445, + "loss": 0.186, + "step": 10967 + }, + { + "epoch": 0.7913705400627729, + "grad_norm": 0.1224091425538063, + "learning_rate": 0.00016836484341174775, + "loss": 0.1608, + "step": 10968 + }, + { + "epoch": 0.7914426927378333, + "grad_norm": 0.12723009288311005, + "learning_rate": 0.000168361956992351, + "loss": 0.1769, + "step": 10969 + }, + { + "epoch": 0.7915148454128936, + "grad_norm": 0.11376544088125229, + "learning_rate": 0.00016835907057295425, + "loss": 0.1576, + "step": 10970 + }, + { + "epoch": 0.7915869980879541, + "grad_norm": 0.12775452435016632, + "learning_rate": 0.00016835618415355751, + "loss": 0.1666, + "step": 10971 + }, + { + "epoch": 0.7916591507630145, + "grad_norm": 0.1533483862876892, + "learning_rate": 0.00016835329773416078, + "loss": 0.1292, + "step": 10972 + }, + { + "epoch": 0.791731303438075, + "grad_norm": 0.11418499052524567, + "learning_rate": 0.00016835041131476404, + "loss": 0.1533, + "step": 10973 + }, + { + "epoch": 0.7918034561131354, + "grad_norm": 0.19840049743652344, + "learning_rate": 0.0001683475248953673, + "loss": 0.1527, + "step": 10974 + }, + { + "epoch": 0.7918756087881959, + "grad_norm": 0.1294814646244049, + "learning_rate": 0.00016834463847597057, + "loss": 0.131, + "step": 10975 + }, + { + "epoch": 0.7919477614632563, + "grad_norm": 0.11488209664821625, + "learning_rate": 0.00016834175205657383, + "loss": 0.1398, + "step": 10976 + }, + { + "epoch": 0.7920199141383166, + "grad_norm": 0.13387158513069153, + "learning_rate": 0.0001683388656371771, + "loss": 0.1738, + "step": 10977 + }, + { + "epoch": 0.7920920668133771, + "grad_norm": 0.10368437319993973, + "learning_rate": 0.00016833597921778035, + "loss": 0.1688, + "step": 10978 + }, + { + "epoch": 0.7921642194884375, + "grad_norm": 0.11186818778514862, + "learning_rate": 0.00016833309279838362, + "loss": 0.1874, + "step": 10979 + }, + { + "epoch": 0.792236372163498, + "grad_norm": 0.12419915944337845, + "learning_rate": 0.00016833020637898685, + "loss": 0.1283, + "step": 10980 + }, + { + "epoch": 0.7923085248385584, + "grad_norm": 0.12867562472820282, + "learning_rate": 0.00016832731995959014, + "loss": 0.1525, + "step": 10981 + }, + { + "epoch": 0.7923806775136188, + "grad_norm": 0.12154834717512131, + "learning_rate": 0.0001683244335401934, + "loss": 0.1687, + "step": 10982 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.1212034672498703, + "learning_rate": 0.00016832154712079667, + "loss": 0.1295, + "step": 10983 + }, + { + "epoch": 0.7925249828637396, + "grad_norm": 0.14599169790744781, + "learning_rate": 0.00016831866070139993, + "loss": 0.1702, + "step": 10984 + }, + { + "epoch": 0.7925971355388001, + "grad_norm": 0.13765747845172882, + "learning_rate": 0.00016831577428200317, + "loss": 0.1584, + "step": 10985 + }, + { + "epoch": 0.7926692882138605, + "grad_norm": 0.14818070828914642, + "learning_rate": 0.00016831288786260643, + "loss": 0.1627, + "step": 10986 + }, + { + "epoch": 0.792741440888921, + "grad_norm": 0.1253470629453659, + "learning_rate": 0.0001683100014432097, + "loss": 0.1414, + "step": 10987 + }, + { + "epoch": 0.7928135935639814, + "grad_norm": 0.10206609964370728, + "learning_rate": 0.00016830711502381298, + "loss": 0.1615, + "step": 10988 + }, + { + "epoch": 0.7928857462390418, + "grad_norm": 0.13145264983177185, + "learning_rate": 0.00016830422860441625, + "loss": 0.1741, + "step": 10989 + }, + { + "epoch": 0.7929578989141023, + "grad_norm": 0.1456836611032486, + "learning_rate": 0.00016830134218501948, + "loss": 0.1373, + "step": 10990 + }, + { + "epoch": 0.7930300515891626, + "grad_norm": 0.1337760090827942, + "learning_rate": 0.00016829845576562275, + "loss": 0.1051, + "step": 10991 + }, + { + "epoch": 0.7931022042642231, + "grad_norm": 0.12097540497779846, + "learning_rate": 0.000168295569346226, + "loss": 0.1076, + "step": 10992 + }, + { + "epoch": 0.7931743569392835, + "grad_norm": 0.12801945209503174, + "learning_rate": 0.00016829268292682927, + "loss": 0.146, + "step": 10993 + }, + { + "epoch": 0.793246509614344, + "grad_norm": 0.11971257627010345, + "learning_rate": 0.00016828979650743253, + "loss": 0.157, + "step": 10994 + }, + { + "epoch": 0.7933186622894044, + "grad_norm": 0.13249745965003967, + "learning_rate": 0.0001682869100880358, + "loss": 0.12, + "step": 10995 + }, + { + "epoch": 0.7933908149644648, + "grad_norm": 0.2097712904214859, + "learning_rate": 0.00016828402366863906, + "loss": 0.1284, + "step": 10996 + }, + { + "epoch": 0.7934629676395253, + "grad_norm": 0.12385252863168716, + "learning_rate": 0.00016828113724924232, + "loss": 0.2047, + "step": 10997 + }, + { + "epoch": 0.7935351203145856, + "grad_norm": 0.13884519040584564, + "learning_rate": 0.00016827825082984559, + "loss": 0.1494, + "step": 10998 + }, + { + "epoch": 0.7936072729896461, + "grad_norm": 0.14670537412166595, + "learning_rate": 0.00016827536441044885, + "loss": 0.1556, + "step": 10999 + }, + { + "epoch": 0.7936794256647065, + "grad_norm": 0.11171022057533264, + "learning_rate": 0.0001682724779910521, + "loss": 0.1081, + "step": 11000 + }, + { + "epoch": 0.793751578339767, + "grad_norm": 0.15498478710651398, + "learning_rate": 0.00016826959157165535, + "loss": 0.1346, + "step": 11001 + }, + { + "epoch": 0.7938237310148274, + "grad_norm": 0.13899146020412445, + "learning_rate": 0.00016826670515225864, + "loss": 0.1592, + "step": 11002 + }, + { + "epoch": 0.7938958836898878, + "grad_norm": 0.1698872596025467, + "learning_rate": 0.0001682638187328619, + "loss": 0.1465, + "step": 11003 + }, + { + "epoch": 0.7939680363649483, + "grad_norm": 0.13369418680667877, + "learning_rate": 0.00016826093231346516, + "loss": 0.1208, + "step": 11004 + }, + { + "epoch": 0.7940401890400086, + "grad_norm": 0.15157140791416168, + "learning_rate": 0.00016825804589406843, + "loss": 0.1777, + "step": 11005 + }, + { + "epoch": 0.7941123417150691, + "grad_norm": 0.15633326768875122, + "learning_rate": 0.00016825515947467166, + "loss": 0.1924, + "step": 11006 + }, + { + "epoch": 0.7941844943901295, + "grad_norm": 0.15235872566699982, + "learning_rate": 0.00016825227305527493, + "loss": 0.1608, + "step": 11007 + }, + { + "epoch": 0.79425664706519, + "grad_norm": 0.13778749108314514, + "learning_rate": 0.0001682493866358782, + "loss": 0.1851, + "step": 11008 + }, + { + "epoch": 0.7943287997402504, + "grad_norm": 0.11128794401884079, + "learning_rate": 0.00016824650021648148, + "loss": 0.1282, + "step": 11009 + }, + { + "epoch": 0.7944009524153108, + "grad_norm": 0.11886956542730331, + "learning_rate": 0.00016824361379708474, + "loss": 0.1332, + "step": 11010 + }, + { + "epoch": 0.7944731050903713, + "grad_norm": 0.1259932816028595, + "learning_rate": 0.00016824072737768798, + "loss": 0.1581, + "step": 11011 + }, + { + "epoch": 0.7945452577654316, + "grad_norm": 0.12389802187681198, + "learning_rate": 0.00016823784095829124, + "loss": 0.135, + "step": 11012 + }, + { + "epoch": 0.7946174104404921, + "grad_norm": 0.13104520738124847, + "learning_rate": 0.0001682349545388945, + "loss": 0.1377, + "step": 11013 + }, + { + "epoch": 0.7946895631155525, + "grad_norm": 0.14885981380939484, + "learning_rate": 0.00016823206811949777, + "loss": 0.1501, + "step": 11014 + }, + { + "epoch": 0.7947617157906129, + "grad_norm": 0.14056426286697388, + "learning_rate": 0.00016822918170010103, + "loss": 0.135, + "step": 11015 + }, + { + "epoch": 0.7948338684656734, + "grad_norm": 0.14511126279830933, + "learning_rate": 0.0001682262952807043, + "loss": 0.1362, + "step": 11016 + }, + { + "epoch": 0.7949060211407338, + "grad_norm": 0.11810924112796783, + "learning_rate": 0.00016822340886130755, + "loss": 0.1169, + "step": 11017 + }, + { + "epoch": 0.7949781738157943, + "grad_norm": 0.13882525265216827, + "learning_rate": 0.00016822052244191082, + "loss": 0.1672, + "step": 11018 + }, + { + "epoch": 0.7950503264908546, + "grad_norm": 0.11954337358474731, + "learning_rate": 0.00016821763602251408, + "loss": 0.1856, + "step": 11019 + }, + { + "epoch": 0.795122479165915, + "grad_norm": 0.11523205041885376, + "learning_rate": 0.00016821474960311734, + "loss": 0.1555, + "step": 11020 + }, + { + "epoch": 0.7951946318409755, + "grad_norm": 0.14944764971733093, + "learning_rate": 0.0001682118631837206, + "loss": 0.168, + "step": 11021 + }, + { + "epoch": 0.7952667845160359, + "grad_norm": 0.12593041360378265, + "learning_rate": 0.00016820897676432387, + "loss": 0.2075, + "step": 11022 + }, + { + "epoch": 0.7953389371910964, + "grad_norm": 0.12796074151992798, + "learning_rate": 0.00016820609034492713, + "loss": 0.1472, + "step": 11023 + }, + { + "epoch": 0.7954110898661568, + "grad_norm": 0.13617539405822754, + "learning_rate": 0.0001682032039255304, + "loss": 0.1355, + "step": 11024 + }, + { + "epoch": 0.7954832425412172, + "grad_norm": 0.12220434844493866, + "learning_rate": 0.00016820031750613366, + "loss": 0.1286, + "step": 11025 + }, + { + "epoch": 0.7955553952162776, + "grad_norm": 0.13229826092720032, + "learning_rate": 0.00016819743108673692, + "loss": 0.1169, + "step": 11026 + }, + { + "epoch": 0.795627547891338, + "grad_norm": 0.1340683549642563, + "learning_rate": 0.00016819454466734018, + "loss": 0.1285, + "step": 11027 + }, + { + "epoch": 0.7956997005663985, + "grad_norm": 0.11346662789583206, + "learning_rate": 0.00016819165824794342, + "loss": 0.1267, + "step": 11028 + }, + { + "epoch": 0.7957718532414589, + "grad_norm": 0.1273529827594757, + "learning_rate": 0.00016818877182854668, + "loss": 0.1172, + "step": 11029 + }, + { + "epoch": 0.7958440059165194, + "grad_norm": 0.12470119446516037, + "learning_rate": 0.00016818588540914997, + "loss": 0.1863, + "step": 11030 + }, + { + "epoch": 0.7959161585915798, + "grad_norm": 0.15152983367443085, + "learning_rate": 0.00016818299898975324, + "loss": 0.1409, + "step": 11031 + }, + { + "epoch": 0.7959883112666402, + "grad_norm": 0.13718275725841522, + "learning_rate": 0.0001681801125703565, + "loss": 0.1661, + "step": 11032 + }, + { + "epoch": 0.7960604639417006, + "grad_norm": 0.13387587666511536, + "learning_rate": 0.00016817722615095973, + "loss": 0.1327, + "step": 11033 + }, + { + "epoch": 0.796132616616761, + "grad_norm": 0.13916029036045074, + "learning_rate": 0.000168174339731563, + "loss": 0.1708, + "step": 11034 + }, + { + "epoch": 0.7962047692918215, + "grad_norm": 0.12818798422813416, + "learning_rate": 0.00016817145331216626, + "loss": 0.1445, + "step": 11035 + }, + { + "epoch": 0.7962769219668819, + "grad_norm": 0.11526653915643692, + "learning_rate": 0.00016816856689276952, + "loss": 0.1111, + "step": 11036 + }, + { + "epoch": 0.7963490746419424, + "grad_norm": 0.13559386134147644, + "learning_rate": 0.0001681656804733728, + "loss": 0.168, + "step": 11037 + }, + { + "epoch": 0.7964212273170028, + "grad_norm": 0.10860492289066315, + "learning_rate": 0.00016816279405397605, + "loss": 0.1433, + "step": 11038 + }, + { + "epoch": 0.7964933799920632, + "grad_norm": 0.14890626072883606, + "learning_rate": 0.0001681599076345793, + "loss": 0.1484, + "step": 11039 + }, + { + "epoch": 0.7965655326671236, + "grad_norm": 0.13415803015232086, + "learning_rate": 0.00016815702121518257, + "loss": 0.1534, + "step": 11040 + }, + { + "epoch": 0.796637685342184, + "grad_norm": 0.1619867980480194, + "learning_rate": 0.00016815413479578584, + "loss": 0.2313, + "step": 11041 + }, + { + "epoch": 0.7967098380172445, + "grad_norm": 0.1239086389541626, + "learning_rate": 0.0001681512483763891, + "loss": 0.18, + "step": 11042 + }, + { + "epoch": 0.7967819906923049, + "grad_norm": 0.13515089452266693, + "learning_rate": 0.00016814836195699236, + "loss": 0.1975, + "step": 11043 + }, + { + "epoch": 0.7968541433673654, + "grad_norm": 0.15039147436618805, + "learning_rate": 0.00016814547553759563, + "loss": 0.1609, + "step": 11044 + }, + { + "epoch": 0.7969262960424258, + "grad_norm": 0.12758326530456543, + "learning_rate": 0.0001681425891181989, + "loss": 0.1529, + "step": 11045 + }, + { + "epoch": 0.7969984487174862, + "grad_norm": 0.14930225908756256, + "learning_rate": 0.00016813970269880215, + "loss": 0.1397, + "step": 11046 + }, + { + "epoch": 0.7970706013925466, + "grad_norm": 0.15626980364322662, + "learning_rate": 0.00016813681627940541, + "loss": 0.1611, + "step": 11047 + }, + { + "epoch": 0.797142754067607, + "grad_norm": 0.18415814638137817, + "learning_rate": 0.00016813392986000868, + "loss": 0.131, + "step": 11048 + }, + { + "epoch": 0.7972149067426675, + "grad_norm": 0.11131515353918076, + "learning_rate": 0.00016813104344061191, + "loss": 0.1164, + "step": 11049 + }, + { + "epoch": 0.7972870594177279, + "grad_norm": 0.12445852905511856, + "learning_rate": 0.00016812815702121518, + "loss": 0.1369, + "step": 11050 + }, + { + "epoch": 0.7973592120927884, + "grad_norm": 0.13122273981571198, + "learning_rate": 0.00016812527060181847, + "loss": 0.1133, + "step": 11051 + }, + { + "epoch": 0.7974313647678488, + "grad_norm": 0.12259101122617722, + "learning_rate": 0.00016812238418242173, + "loss": 0.1821, + "step": 11052 + }, + { + "epoch": 0.7975035174429091, + "grad_norm": 0.14598500728607178, + "learning_rate": 0.000168119497763025, + "loss": 0.1431, + "step": 11053 + }, + { + "epoch": 0.7975756701179696, + "grad_norm": 0.1440744400024414, + "learning_rate": 0.00016811661134362823, + "loss": 0.141, + "step": 11054 + }, + { + "epoch": 0.79764782279303, + "grad_norm": 0.12835662066936493, + "learning_rate": 0.0001681137249242315, + "loss": 0.1309, + "step": 11055 + }, + { + "epoch": 0.7977199754680905, + "grad_norm": 0.12210889160633087, + "learning_rate": 0.00016811083850483475, + "loss": 0.1485, + "step": 11056 + }, + { + "epoch": 0.7977921281431509, + "grad_norm": 0.11877714842557907, + "learning_rate": 0.00016810795208543802, + "loss": 0.1407, + "step": 11057 + }, + { + "epoch": 0.7978642808182114, + "grad_norm": 0.11467092484235764, + "learning_rate": 0.0001681050656660413, + "loss": 0.1181, + "step": 11058 + }, + { + "epoch": 0.7979364334932718, + "grad_norm": 0.13680388033390045, + "learning_rate": 0.00016810217924664454, + "loss": 0.2024, + "step": 11059 + }, + { + "epoch": 0.7980085861683321, + "grad_norm": 0.1774633377790451, + "learning_rate": 0.0001680992928272478, + "loss": 0.1748, + "step": 11060 + }, + { + "epoch": 0.7980807388433926, + "grad_norm": 0.11040361225605011, + "learning_rate": 0.00016809640640785107, + "loss": 0.1621, + "step": 11061 + }, + { + "epoch": 0.798152891518453, + "grad_norm": 0.3321816027164459, + "learning_rate": 0.00016809351998845433, + "loss": 0.1643, + "step": 11062 + }, + { + "epoch": 0.7982250441935135, + "grad_norm": 0.1258113533258438, + "learning_rate": 0.0001680906335690576, + "loss": 0.1304, + "step": 11063 + }, + { + "epoch": 0.7982971968685739, + "grad_norm": 0.14619180560112, + "learning_rate": 0.00016808774714966086, + "loss": 0.1457, + "step": 11064 + }, + { + "epoch": 0.7983693495436344, + "grad_norm": 0.11650705337524414, + "learning_rate": 0.00016808486073026412, + "loss": 0.177, + "step": 11065 + }, + { + "epoch": 0.7984415022186948, + "grad_norm": 0.16568635404109955, + "learning_rate": 0.00016808197431086738, + "loss": 0.126, + "step": 11066 + }, + { + "epoch": 0.7985136548937551, + "grad_norm": 0.11353182047605515, + "learning_rate": 0.00016807908789147065, + "loss": 0.1656, + "step": 11067 + }, + { + "epoch": 0.7985858075688156, + "grad_norm": 0.16108182072639465, + "learning_rate": 0.0001680762014720739, + "loss": 0.1462, + "step": 11068 + }, + { + "epoch": 0.798657960243876, + "grad_norm": 0.13784301280975342, + "learning_rate": 0.00016807331505267717, + "loss": 0.1297, + "step": 11069 + }, + { + "epoch": 0.7987301129189365, + "grad_norm": 0.1316288560628891, + "learning_rate": 0.0001680704286332804, + "loss": 0.177, + "step": 11070 + }, + { + "epoch": 0.7988022655939969, + "grad_norm": 0.12894243001937866, + "learning_rate": 0.00016806754221388367, + "loss": 0.1345, + "step": 11071 + }, + { + "epoch": 0.7988744182690574, + "grad_norm": 0.10570898652076721, + "learning_rate": 0.00016806465579448693, + "loss": 0.1641, + "step": 11072 + }, + { + "epoch": 0.7989465709441178, + "grad_norm": 0.16653867065906525, + "learning_rate": 0.00016806176937509022, + "loss": 0.1412, + "step": 11073 + }, + { + "epoch": 0.7990187236191781, + "grad_norm": 0.13186757266521454, + "learning_rate": 0.0001680588829556935, + "loss": 0.1499, + "step": 11074 + }, + { + "epoch": 0.7990908762942386, + "grad_norm": 0.16151700913906097, + "learning_rate": 0.00016805599653629672, + "loss": 0.1783, + "step": 11075 + }, + { + "epoch": 0.799163028969299, + "grad_norm": 0.14202305674552917, + "learning_rate": 0.00016805311011689999, + "loss": 0.1441, + "step": 11076 + }, + { + "epoch": 0.7992351816443595, + "grad_norm": 0.13662859797477722, + "learning_rate": 0.00016805022369750325, + "loss": 0.1678, + "step": 11077 + }, + { + "epoch": 0.7993073343194199, + "grad_norm": 0.13601934909820557, + "learning_rate": 0.0001680473372781065, + "loss": 0.1353, + "step": 11078 + }, + { + "epoch": 0.7993794869944804, + "grad_norm": 0.14189928770065308, + "learning_rate": 0.00016804445085870977, + "loss": 0.1134, + "step": 11079 + }, + { + "epoch": 0.7994516396695408, + "grad_norm": 0.12195149064064026, + "learning_rate": 0.00016804156443931304, + "loss": 0.1695, + "step": 11080 + }, + { + "epoch": 0.7995237923446011, + "grad_norm": 0.12974140048027039, + "learning_rate": 0.0001680386780199163, + "loss": 0.1421, + "step": 11081 + }, + { + "epoch": 0.7995959450196616, + "grad_norm": 0.12792320549488068, + "learning_rate": 0.00016803579160051956, + "loss": 0.1302, + "step": 11082 + }, + { + "epoch": 0.799668097694722, + "grad_norm": 0.14331893622875214, + "learning_rate": 0.00016803290518112283, + "loss": 0.1269, + "step": 11083 + }, + { + "epoch": 0.7997402503697825, + "grad_norm": 0.12498790770769119, + "learning_rate": 0.0001680300187617261, + "loss": 0.1295, + "step": 11084 + }, + { + "epoch": 0.7998124030448429, + "grad_norm": 0.12381463497877121, + "learning_rate": 0.00016802713234232935, + "loss": 0.185, + "step": 11085 + }, + { + "epoch": 0.7998845557199034, + "grad_norm": 0.16269150376319885, + "learning_rate": 0.0001680242459229326, + "loss": 0.1914, + "step": 11086 + }, + { + "epoch": 0.7999567083949637, + "grad_norm": 0.13072358071804047, + "learning_rate": 0.00016802135950353588, + "loss": 0.1577, + "step": 11087 + }, + { + "epoch": 0.8000288610700241, + "grad_norm": 0.15264281630516052, + "learning_rate": 0.00016801847308413914, + "loss": 0.202, + "step": 11088 + }, + { + "epoch": 0.8001010137450846, + "grad_norm": 0.11712154746055603, + "learning_rate": 0.0001680155866647424, + "loss": 0.1266, + "step": 11089 + }, + { + "epoch": 0.800173166420145, + "grad_norm": 0.12803013622760773, + "learning_rate": 0.00016801270024534567, + "loss": 0.1535, + "step": 11090 + }, + { + "epoch": 0.8002453190952055, + "grad_norm": 0.10732483118772507, + "learning_rate": 0.0001680098138259489, + "loss": 0.1794, + "step": 11091 + }, + { + "epoch": 0.8003174717702659, + "grad_norm": 0.12305649369955063, + "learning_rate": 0.00016800692740655217, + "loss": 0.1655, + "step": 11092 + }, + { + "epoch": 0.8003896244453264, + "grad_norm": 0.13291281461715698, + "learning_rate": 0.00016800404098715543, + "loss": 0.1491, + "step": 11093 + }, + { + "epoch": 0.8004617771203867, + "grad_norm": 0.11650607734918594, + "learning_rate": 0.00016800115456775872, + "loss": 0.1201, + "step": 11094 + }, + { + "epoch": 0.8005339297954471, + "grad_norm": 0.13787731528282166, + "learning_rate": 0.00016799826814836198, + "loss": 0.1982, + "step": 11095 + }, + { + "epoch": 0.8006060824705076, + "grad_norm": 0.13782642781734467, + "learning_rate": 0.00016799538172896522, + "loss": 0.1443, + "step": 11096 + }, + { + "epoch": 0.800678235145568, + "grad_norm": 0.1185268759727478, + "learning_rate": 0.00016799249530956848, + "loss": 0.0963, + "step": 11097 + }, + { + "epoch": 0.8007503878206285, + "grad_norm": 0.15925146639347076, + "learning_rate": 0.00016798960889017174, + "loss": 0.1398, + "step": 11098 + }, + { + "epoch": 0.8008225404956889, + "grad_norm": 0.11412061750888824, + "learning_rate": 0.000167986722470775, + "loss": 0.1256, + "step": 11099 + }, + { + "epoch": 0.8008946931707493, + "grad_norm": 0.12418772280216217, + "learning_rate": 0.00016798383605137827, + "loss": 0.17, + "step": 11100 + }, + { + "epoch": 0.8009668458458097, + "grad_norm": 0.14615289866924286, + "learning_rate": 0.00016798094963198153, + "loss": 0.1543, + "step": 11101 + }, + { + "epoch": 0.8010389985208701, + "grad_norm": 0.12303359806537628, + "learning_rate": 0.0001679780632125848, + "loss": 0.1103, + "step": 11102 + }, + { + "epoch": 0.8011111511959306, + "grad_norm": 0.11707977950572968, + "learning_rate": 0.00016797517679318806, + "loss": 0.1468, + "step": 11103 + }, + { + "epoch": 0.801183303870991, + "grad_norm": 0.13978630304336548, + "learning_rate": 0.00016797229037379132, + "loss": 0.1343, + "step": 11104 + }, + { + "epoch": 0.8012554565460515, + "grad_norm": 0.11165934056043625, + "learning_rate": 0.00016796940395439458, + "loss": 0.1898, + "step": 11105 + }, + { + "epoch": 0.8013276092211119, + "grad_norm": 0.1523934006690979, + "learning_rate": 0.00016796651753499785, + "loss": 0.1602, + "step": 11106 + }, + { + "epoch": 0.8013997618961723, + "grad_norm": 0.16679495573043823, + "learning_rate": 0.00016796363111560108, + "loss": 0.1846, + "step": 11107 + }, + { + "epoch": 0.8014719145712327, + "grad_norm": 0.11543601751327515, + "learning_rate": 0.00016796074469620437, + "loss": 0.1513, + "step": 11108 + }, + { + "epoch": 0.8015440672462931, + "grad_norm": 0.12580257654190063, + "learning_rate": 0.00016795785827680763, + "loss": 0.1397, + "step": 11109 + }, + { + "epoch": 0.8016162199213536, + "grad_norm": 0.16165702044963837, + "learning_rate": 0.0001679549718574109, + "loss": 0.1716, + "step": 11110 + }, + { + "epoch": 0.801688372596414, + "grad_norm": 0.18446248769760132, + "learning_rate": 0.00016795208543801416, + "loss": 0.1899, + "step": 11111 + }, + { + "epoch": 0.8017605252714745, + "grad_norm": 0.11582392454147339, + "learning_rate": 0.0001679491990186174, + "loss": 0.1481, + "step": 11112 + }, + { + "epoch": 0.8018326779465349, + "grad_norm": 0.11042899638414383, + "learning_rate": 0.00016794631259922066, + "loss": 0.1733, + "step": 11113 + }, + { + "epoch": 0.8019048306215953, + "grad_norm": 0.1459999531507492, + "learning_rate": 0.00016794342617982392, + "loss": 0.1677, + "step": 11114 + }, + { + "epoch": 0.8019769832966557, + "grad_norm": 0.12890112400054932, + "learning_rate": 0.0001679405397604272, + "loss": 0.1541, + "step": 11115 + }, + { + "epoch": 0.8020491359717161, + "grad_norm": 0.1220753863453865, + "learning_rate": 0.00016793765334103048, + "loss": 0.1718, + "step": 11116 + }, + { + "epoch": 0.8021212886467766, + "grad_norm": 0.122266486287117, + "learning_rate": 0.0001679347669216337, + "loss": 0.144, + "step": 11117 + }, + { + "epoch": 0.802193441321837, + "grad_norm": 0.12399003654718399, + "learning_rate": 0.00016793188050223697, + "loss": 0.137, + "step": 11118 + }, + { + "epoch": 0.8022655939968975, + "grad_norm": 0.15937049686908722, + "learning_rate": 0.00016792899408284024, + "loss": 0.1457, + "step": 11119 + }, + { + "epoch": 0.8023377466719579, + "grad_norm": 0.10514838248491287, + "learning_rate": 0.0001679261076634435, + "loss": 0.1775, + "step": 11120 + }, + { + "epoch": 0.8024098993470183, + "grad_norm": 0.1084451898932457, + "learning_rate": 0.00016792322124404676, + "loss": 0.152, + "step": 11121 + }, + { + "epoch": 0.8024820520220787, + "grad_norm": 0.11995755881071091, + "learning_rate": 0.00016792033482465003, + "loss": 0.1656, + "step": 11122 + }, + { + "epoch": 0.8025542046971391, + "grad_norm": 0.11566449701786041, + "learning_rate": 0.0001679174484052533, + "loss": 0.1407, + "step": 11123 + }, + { + "epoch": 0.8026263573721996, + "grad_norm": 0.1932581067085266, + "learning_rate": 0.00016791456198585655, + "loss": 0.2038, + "step": 11124 + }, + { + "epoch": 0.80269851004726, + "grad_norm": 0.12143067270517349, + "learning_rate": 0.00016791167556645981, + "loss": 0.1021, + "step": 11125 + }, + { + "epoch": 0.8027706627223204, + "grad_norm": 0.11793456971645355, + "learning_rate": 0.00016790878914706308, + "loss": 0.1047, + "step": 11126 + }, + { + "epoch": 0.8028428153973809, + "grad_norm": 0.13719895482063293, + "learning_rate": 0.00016790590272766634, + "loss": 0.1541, + "step": 11127 + }, + { + "epoch": 0.8029149680724413, + "grad_norm": 0.12765516340732574, + "learning_rate": 0.00016790301630826958, + "loss": 0.1213, + "step": 11128 + }, + { + "epoch": 0.8029871207475017, + "grad_norm": 0.1204436868429184, + "learning_rate": 0.00016790012988887287, + "loss": 0.1423, + "step": 11129 + }, + { + "epoch": 0.8030592734225621, + "grad_norm": 0.10415612161159515, + "learning_rate": 0.00016789724346947613, + "loss": 0.1272, + "step": 11130 + }, + { + "epoch": 0.8031314260976226, + "grad_norm": 0.12991686165332794, + "learning_rate": 0.0001678943570500794, + "loss": 0.1381, + "step": 11131 + }, + { + "epoch": 0.803203578772683, + "grad_norm": 0.14686962962150574, + "learning_rate": 0.00016789147063068265, + "loss": 0.159, + "step": 11132 + }, + { + "epoch": 0.8032757314477434, + "grad_norm": 0.11568587273359299, + "learning_rate": 0.0001678885842112859, + "loss": 0.1534, + "step": 11133 + }, + { + "epoch": 0.8033478841228039, + "grad_norm": 0.13723793625831604, + "learning_rate": 0.00016788569779188915, + "loss": 0.1367, + "step": 11134 + }, + { + "epoch": 0.8034200367978643, + "grad_norm": 0.12334664165973663, + "learning_rate": 0.00016788281137249242, + "loss": 0.1297, + "step": 11135 + }, + { + "epoch": 0.8034921894729247, + "grad_norm": 0.1709556132555008, + "learning_rate": 0.0001678799249530957, + "loss": 0.1469, + "step": 11136 + }, + { + "epoch": 0.8035643421479851, + "grad_norm": 0.12486976385116577, + "learning_rate": 0.00016787703853369897, + "loss": 0.1848, + "step": 11137 + }, + { + "epoch": 0.8036364948230456, + "grad_norm": 0.12414836883544922, + "learning_rate": 0.00016787415211430223, + "loss": 0.1363, + "step": 11138 + }, + { + "epoch": 0.803708647498106, + "grad_norm": 0.1451658457517624, + "learning_rate": 0.00016787126569490547, + "loss": 0.1283, + "step": 11139 + }, + { + "epoch": 0.8037808001731664, + "grad_norm": 0.12030114978551865, + "learning_rate": 0.00016786837927550873, + "loss": 0.0882, + "step": 11140 + }, + { + "epoch": 0.8038529528482269, + "grad_norm": 0.13571010529994965, + "learning_rate": 0.000167865492856112, + "loss": 0.1278, + "step": 11141 + }, + { + "epoch": 0.8039251055232873, + "grad_norm": 0.12487173825502396, + "learning_rate": 0.00016786260643671526, + "loss": 0.1593, + "step": 11142 + }, + { + "epoch": 0.8039972581983477, + "grad_norm": 0.12655773758888245, + "learning_rate": 0.00016785972001731855, + "loss": 0.1221, + "step": 11143 + }, + { + "epoch": 0.8040694108734081, + "grad_norm": 0.1122177243232727, + "learning_rate": 0.00016785683359792178, + "loss": 0.1716, + "step": 11144 + }, + { + "epoch": 0.8041415635484686, + "grad_norm": 0.11251802742481232, + "learning_rate": 0.00016785394717852505, + "loss": 0.117, + "step": 11145 + }, + { + "epoch": 0.804213716223529, + "grad_norm": 0.11069377511739731, + "learning_rate": 0.0001678510607591283, + "loss": 0.1195, + "step": 11146 + }, + { + "epoch": 0.8042858688985894, + "grad_norm": 0.14175282418727875, + "learning_rate": 0.00016784817433973157, + "loss": 0.1554, + "step": 11147 + }, + { + "epoch": 0.8043580215736499, + "grad_norm": 0.14564311504364014, + "learning_rate": 0.00016784528792033483, + "loss": 0.2149, + "step": 11148 + }, + { + "epoch": 0.8044301742487102, + "grad_norm": 0.1261538416147232, + "learning_rate": 0.0001678424015009381, + "loss": 0.1294, + "step": 11149 + }, + { + "epoch": 0.8045023269237707, + "grad_norm": 0.16583268344402313, + "learning_rate": 0.00016783951508154136, + "loss": 0.1325, + "step": 11150 + }, + { + "epoch": 0.8045744795988311, + "grad_norm": 0.17326302826404572, + "learning_rate": 0.00016783662866214462, + "loss": 0.1394, + "step": 11151 + }, + { + "epoch": 0.8046466322738915, + "grad_norm": 0.10651998221874237, + "learning_rate": 0.00016783374224274789, + "loss": 0.1718, + "step": 11152 + }, + { + "epoch": 0.804718784948952, + "grad_norm": 0.155874565243721, + "learning_rate": 0.00016783085582335115, + "loss": 0.1603, + "step": 11153 + }, + { + "epoch": 0.8047909376240124, + "grad_norm": 0.1414341926574707, + "learning_rate": 0.0001678279694039544, + "loss": 0.1659, + "step": 11154 + }, + { + "epoch": 0.8048630902990729, + "grad_norm": 0.10812345892190933, + "learning_rate": 0.00016782508298455765, + "loss": 0.1377, + "step": 11155 + }, + { + "epoch": 0.8049352429741332, + "grad_norm": 0.11542447656393051, + "learning_rate": 0.0001678221965651609, + "loss": 0.1732, + "step": 11156 + }, + { + "epoch": 0.8050073956491937, + "grad_norm": 0.1083124428987503, + "learning_rate": 0.0001678193101457642, + "loss": 0.0919, + "step": 11157 + }, + { + "epoch": 0.8050795483242541, + "grad_norm": 0.14099064469337463, + "learning_rate": 0.00016781642372636746, + "loss": 0.151, + "step": 11158 + }, + { + "epoch": 0.8051517009993145, + "grad_norm": 0.13833631575107574, + "learning_rate": 0.00016781353730697073, + "loss": 0.1222, + "step": 11159 + }, + { + "epoch": 0.805223853674375, + "grad_norm": 0.11046667397022247, + "learning_rate": 0.00016781065088757396, + "loss": 0.1266, + "step": 11160 + }, + { + "epoch": 0.8052960063494354, + "grad_norm": 0.11911126971244812, + "learning_rate": 0.00016780776446817723, + "loss": 0.1513, + "step": 11161 + }, + { + "epoch": 0.8053681590244959, + "grad_norm": 0.11474090814590454, + "learning_rate": 0.0001678048780487805, + "loss": 0.1932, + "step": 11162 + }, + { + "epoch": 0.8054403116995562, + "grad_norm": 0.13939101994037628, + "learning_rate": 0.00016780199162938375, + "loss": 0.1614, + "step": 11163 + }, + { + "epoch": 0.8055124643746167, + "grad_norm": 0.12426722049713135, + "learning_rate": 0.00016779910520998704, + "loss": 0.1221, + "step": 11164 + }, + { + "epoch": 0.8055846170496771, + "grad_norm": 0.13918867707252502, + "learning_rate": 0.00016779621879059028, + "loss": 0.1437, + "step": 11165 + }, + { + "epoch": 0.8056567697247375, + "grad_norm": 0.14279721677303314, + "learning_rate": 0.00016779333237119354, + "loss": 0.1442, + "step": 11166 + }, + { + "epoch": 0.805728922399798, + "grad_norm": 0.09675594419240952, + "learning_rate": 0.0001677904459517968, + "loss": 0.0808, + "step": 11167 + }, + { + "epoch": 0.8058010750748584, + "grad_norm": 0.14310872554779053, + "learning_rate": 0.00016778755953240007, + "loss": 0.1837, + "step": 11168 + }, + { + "epoch": 0.8058732277499189, + "grad_norm": 0.12977318465709686, + "learning_rate": 0.00016778467311300333, + "loss": 0.1405, + "step": 11169 + }, + { + "epoch": 0.8059453804249792, + "grad_norm": 0.15566326677799225, + "learning_rate": 0.0001677817866936066, + "loss": 0.1557, + "step": 11170 + }, + { + "epoch": 0.8060175331000397, + "grad_norm": 0.12260245531797409, + "learning_rate": 0.00016777890027420985, + "loss": 0.152, + "step": 11171 + }, + { + "epoch": 0.8060896857751001, + "grad_norm": 0.1256342977285385, + "learning_rate": 0.00016777601385481312, + "loss": 0.1045, + "step": 11172 + }, + { + "epoch": 0.8061618384501605, + "grad_norm": 0.13613678514957428, + "learning_rate": 0.00016777312743541638, + "loss": 0.1467, + "step": 11173 + }, + { + "epoch": 0.806233991125221, + "grad_norm": 0.11526962369680405, + "learning_rate": 0.00016777024101601964, + "loss": 0.1202, + "step": 11174 + }, + { + "epoch": 0.8063061438002814, + "grad_norm": 0.1624159812927246, + "learning_rate": 0.0001677673545966229, + "loss": 0.1944, + "step": 11175 + }, + { + "epoch": 0.8063782964753419, + "grad_norm": 0.12479659169912338, + "learning_rate": 0.00016776446817722614, + "loss": 0.2056, + "step": 11176 + }, + { + "epoch": 0.8064504491504022, + "grad_norm": 0.1443111151456833, + "learning_rate": 0.0001677615817578294, + "loss": 0.1321, + "step": 11177 + }, + { + "epoch": 0.8065226018254626, + "grad_norm": 0.1252567321062088, + "learning_rate": 0.0001677586953384327, + "loss": 0.1626, + "step": 11178 + }, + { + "epoch": 0.8065947545005231, + "grad_norm": 0.11821672320365906, + "learning_rate": 0.00016775580891903596, + "loss": 0.1475, + "step": 11179 + }, + { + "epoch": 0.8066669071755835, + "grad_norm": 0.12168832868337631, + "learning_rate": 0.00016775292249963922, + "loss": 0.1922, + "step": 11180 + }, + { + "epoch": 0.806739059850644, + "grad_norm": 0.15275044739246368, + "learning_rate": 0.00016775003608024246, + "loss": 0.1887, + "step": 11181 + }, + { + "epoch": 0.8068112125257044, + "grad_norm": 0.10763371735811234, + "learning_rate": 0.00016774714966084572, + "loss": 0.1638, + "step": 11182 + }, + { + "epoch": 0.8068833652007649, + "grad_norm": 0.11747167259454727, + "learning_rate": 0.00016774426324144898, + "loss": 0.1281, + "step": 11183 + }, + { + "epoch": 0.8069555178758252, + "grad_norm": 0.1161046177148819, + "learning_rate": 0.00016774137682205225, + "loss": 0.1831, + "step": 11184 + }, + { + "epoch": 0.8070276705508856, + "grad_norm": 0.12537787854671478, + "learning_rate": 0.00016773849040265554, + "loss": 0.143, + "step": 11185 + }, + { + "epoch": 0.8070998232259461, + "grad_norm": 0.1212264746427536, + "learning_rate": 0.00016773560398325877, + "loss": 0.1711, + "step": 11186 + }, + { + "epoch": 0.8071719759010065, + "grad_norm": 0.12325877696275711, + "learning_rate": 0.00016773271756386203, + "loss": 0.1536, + "step": 11187 + }, + { + "epoch": 0.807244128576067, + "grad_norm": 0.11776801943778992, + "learning_rate": 0.0001677298311444653, + "loss": 0.1784, + "step": 11188 + }, + { + "epoch": 0.8073162812511274, + "grad_norm": 0.12928734719753265, + "learning_rate": 0.00016772694472506856, + "loss": 0.1933, + "step": 11189 + }, + { + "epoch": 0.8073884339261879, + "grad_norm": 0.1151927262544632, + "learning_rate": 0.00016772405830567182, + "loss": 0.1473, + "step": 11190 + }, + { + "epoch": 0.8074605866012482, + "grad_norm": 0.16673599183559418, + "learning_rate": 0.00016772117188627509, + "loss": 0.1819, + "step": 11191 + }, + { + "epoch": 0.8075327392763086, + "grad_norm": 0.11783953756093979, + "learning_rate": 0.00016771828546687835, + "loss": 0.1417, + "step": 11192 + }, + { + "epoch": 0.8076048919513691, + "grad_norm": 0.123351089656353, + "learning_rate": 0.0001677153990474816, + "loss": 0.1522, + "step": 11193 + }, + { + "epoch": 0.8076770446264295, + "grad_norm": 0.1466146856546402, + "learning_rate": 0.00016771251262808487, + "loss": 0.1501, + "step": 11194 + }, + { + "epoch": 0.80774919730149, + "grad_norm": 0.1161920353770256, + "learning_rate": 0.00016770962620868814, + "loss": 0.1799, + "step": 11195 + }, + { + "epoch": 0.8078213499765504, + "grad_norm": 0.11827120929956436, + "learning_rate": 0.0001677067397892914, + "loss": 0.1913, + "step": 11196 + }, + { + "epoch": 0.8078935026516109, + "grad_norm": 0.14861279726028442, + "learning_rate": 0.00016770385336989464, + "loss": 0.1656, + "step": 11197 + }, + { + "epoch": 0.8079656553266712, + "grad_norm": 0.12918280065059662, + "learning_rate": 0.0001677009669504979, + "loss": 0.1735, + "step": 11198 + }, + { + "epoch": 0.8080378080017316, + "grad_norm": 0.11571779102087021, + "learning_rate": 0.0001676980805311012, + "loss": 0.1383, + "step": 11199 + }, + { + "epoch": 0.8081099606767921, + "grad_norm": 0.11656660586595535, + "learning_rate": 0.00016769519411170445, + "loss": 0.1501, + "step": 11200 + }, + { + "epoch": 0.8081821133518525, + "grad_norm": 0.12739737331867218, + "learning_rate": 0.00016769230769230772, + "loss": 0.1368, + "step": 11201 + }, + { + "epoch": 0.808254266026913, + "grad_norm": 0.14029301702976227, + "learning_rate": 0.00016768942127291095, + "loss": 0.1191, + "step": 11202 + }, + { + "epoch": 0.8083264187019734, + "grad_norm": 0.10776422917842865, + "learning_rate": 0.00016768653485351421, + "loss": 0.1448, + "step": 11203 + }, + { + "epoch": 0.8083985713770339, + "grad_norm": 0.12312967330217361, + "learning_rate": 0.00016768364843411748, + "loss": 0.1387, + "step": 11204 + }, + { + "epoch": 0.8084707240520942, + "grad_norm": 0.1375785768032074, + "learning_rate": 0.00016768076201472074, + "loss": 0.1581, + "step": 11205 + }, + { + "epoch": 0.8085428767271546, + "grad_norm": 0.11886519193649292, + "learning_rate": 0.00016767787559532403, + "loss": 0.1384, + "step": 11206 + }, + { + "epoch": 0.8086150294022151, + "grad_norm": 0.1250735968351364, + "learning_rate": 0.00016767498917592727, + "loss": 0.1567, + "step": 11207 + }, + { + "epoch": 0.8086871820772755, + "grad_norm": 0.1060570776462555, + "learning_rate": 0.00016767210275653053, + "loss": 0.175, + "step": 11208 + }, + { + "epoch": 0.808759334752336, + "grad_norm": 0.11892693489789963, + "learning_rate": 0.0001676692163371338, + "loss": 0.1379, + "step": 11209 + }, + { + "epoch": 0.8088314874273964, + "grad_norm": 0.10399611294269562, + "learning_rate": 0.00016766632991773705, + "loss": 0.116, + "step": 11210 + }, + { + "epoch": 0.8089036401024567, + "grad_norm": 0.10794928669929504, + "learning_rate": 0.00016766344349834032, + "loss": 0.1365, + "step": 11211 + }, + { + "epoch": 0.8089757927775172, + "grad_norm": 0.14975190162658691, + "learning_rate": 0.00016766055707894358, + "loss": 0.1608, + "step": 11212 + }, + { + "epoch": 0.8090479454525776, + "grad_norm": 0.11688051372766495, + "learning_rate": 0.00016765767065954684, + "loss": 0.1273, + "step": 11213 + }, + { + "epoch": 0.8091200981276381, + "grad_norm": 0.11928575485944748, + "learning_rate": 0.0001676547842401501, + "loss": 0.1428, + "step": 11214 + }, + { + "epoch": 0.8091922508026985, + "grad_norm": 0.13377560675144196, + "learning_rate": 0.00016765189782075337, + "loss": 0.1409, + "step": 11215 + }, + { + "epoch": 0.809264403477759, + "grad_norm": 0.13069269061088562, + "learning_rate": 0.00016764901140135663, + "loss": 0.1968, + "step": 11216 + }, + { + "epoch": 0.8093365561528194, + "grad_norm": 0.11220591515302658, + "learning_rate": 0.0001676461249819599, + "loss": 0.1061, + "step": 11217 + }, + { + "epoch": 0.8094087088278797, + "grad_norm": 0.15183371305465698, + "learning_rate": 0.00016764323856256313, + "loss": 0.1728, + "step": 11218 + }, + { + "epoch": 0.8094808615029402, + "grad_norm": 0.13244639337062836, + "learning_rate": 0.0001676403521431664, + "loss": 0.1345, + "step": 11219 + }, + { + "epoch": 0.8095530141780006, + "grad_norm": 0.1347954273223877, + "learning_rate": 0.00016763746572376968, + "loss": 0.1207, + "step": 11220 + }, + { + "epoch": 0.8096251668530611, + "grad_norm": 0.11808697134256363, + "learning_rate": 0.00016763457930437295, + "loss": 0.1422, + "step": 11221 + }, + { + "epoch": 0.8096973195281215, + "grad_norm": 0.1248321384191513, + "learning_rate": 0.0001676316928849762, + "loss": 0.141, + "step": 11222 + }, + { + "epoch": 0.809769472203182, + "grad_norm": 0.12465531378984451, + "learning_rate": 0.00016762880646557945, + "loss": 0.1774, + "step": 11223 + }, + { + "epoch": 0.8098416248782424, + "grad_norm": 0.12220194935798645, + "learning_rate": 0.0001676259200461827, + "loss": 0.1766, + "step": 11224 + }, + { + "epoch": 0.8099137775533027, + "grad_norm": 0.14990898966789246, + "learning_rate": 0.00016762303362678597, + "loss": 0.1763, + "step": 11225 + }, + { + "epoch": 0.8099859302283632, + "grad_norm": 0.15053509175777435, + "learning_rate": 0.00016762014720738923, + "loss": 0.1366, + "step": 11226 + }, + { + "epoch": 0.8100580829034236, + "grad_norm": 0.13961613178253174, + "learning_rate": 0.00016761726078799252, + "loss": 0.1554, + "step": 11227 + }, + { + "epoch": 0.8101302355784841, + "grad_norm": 0.14634525775909424, + "learning_rate": 0.00016761437436859576, + "loss": 0.1336, + "step": 11228 + }, + { + "epoch": 0.8102023882535445, + "grad_norm": 0.10927371680736542, + "learning_rate": 0.00016761148794919902, + "loss": 0.1568, + "step": 11229 + }, + { + "epoch": 0.810274540928605, + "grad_norm": 0.13111984729766846, + "learning_rate": 0.00016760860152980229, + "loss": 0.103, + "step": 11230 + }, + { + "epoch": 0.8103466936036654, + "grad_norm": 0.12341060489416122, + "learning_rate": 0.00016760571511040555, + "loss": 0.11, + "step": 11231 + }, + { + "epoch": 0.8104188462787257, + "grad_norm": 0.1280965507030487, + "learning_rate": 0.0001676028286910088, + "loss": 0.0989, + "step": 11232 + }, + { + "epoch": 0.8104909989537862, + "grad_norm": 0.12683764100074768, + "learning_rate": 0.00016759994227161207, + "loss": 0.1371, + "step": 11233 + }, + { + "epoch": 0.8105631516288466, + "grad_norm": 0.16215945780277252, + "learning_rate": 0.00016759705585221534, + "loss": 0.1319, + "step": 11234 + }, + { + "epoch": 0.8106353043039071, + "grad_norm": 0.11233913153409958, + "learning_rate": 0.0001675941694328186, + "loss": 0.1328, + "step": 11235 + }, + { + "epoch": 0.8107074569789675, + "grad_norm": 0.12480802834033966, + "learning_rate": 0.00016759128301342186, + "loss": 0.1461, + "step": 11236 + }, + { + "epoch": 0.810779609654028, + "grad_norm": 0.14508230984210968, + "learning_rate": 0.00016758839659402513, + "loss": 0.1261, + "step": 11237 + }, + { + "epoch": 0.8108517623290884, + "grad_norm": 0.11494122445583344, + "learning_rate": 0.0001675855101746284, + "loss": 0.1382, + "step": 11238 + }, + { + "epoch": 0.8109239150041487, + "grad_norm": 0.10430081188678741, + "learning_rate": 0.00016758262375523162, + "loss": 0.1159, + "step": 11239 + }, + { + "epoch": 0.8109960676792092, + "grad_norm": 0.11663941293954849, + "learning_rate": 0.0001675797373358349, + "loss": 0.1413, + "step": 11240 + }, + { + "epoch": 0.8110682203542696, + "grad_norm": 0.12681062519550323, + "learning_rate": 0.00016757685091643818, + "loss": 0.1445, + "step": 11241 + }, + { + "epoch": 0.8111403730293301, + "grad_norm": 0.12679167091846466, + "learning_rate": 0.00016757396449704144, + "loss": 0.1555, + "step": 11242 + }, + { + "epoch": 0.8112125257043905, + "grad_norm": 0.1359633356332779, + "learning_rate": 0.0001675710780776447, + "loss": 0.1371, + "step": 11243 + }, + { + "epoch": 0.811284678379451, + "grad_norm": 0.10687170177698135, + "learning_rate": 0.00016756819165824794, + "loss": 0.1833, + "step": 11244 + }, + { + "epoch": 0.8113568310545114, + "grad_norm": 0.12260755151510239, + "learning_rate": 0.0001675653052388512, + "loss": 0.1535, + "step": 11245 + }, + { + "epoch": 0.8114289837295717, + "grad_norm": 0.11183297634124756, + "learning_rate": 0.00016756241881945447, + "loss": 0.1613, + "step": 11246 + }, + { + "epoch": 0.8115011364046322, + "grad_norm": 0.12028773874044418, + "learning_rate": 0.00016755953240005773, + "loss": 0.1303, + "step": 11247 + }, + { + "epoch": 0.8115732890796926, + "grad_norm": 0.11625336855649948, + "learning_rate": 0.00016755664598066102, + "loss": 0.1652, + "step": 11248 + }, + { + "epoch": 0.8116454417547531, + "grad_norm": 0.12436410784721375, + "learning_rate": 0.00016755375956126425, + "loss": 0.1558, + "step": 11249 + }, + { + "epoch": 0.8117175944298135, + "grad_norm": 0.13242529332637787, + "learning_rate": 0.00016755087314186752, + "loss": 0.1258, + "step": 11250 + }, + { + "epoch": 0.811789747104874, + "grad_norm": 0.12705907225608826, + "learning_rate": 0.00016754798672247078, + "loss": 0.124, + "step": 11251 + }, + { + "epoch": 0.8118618997799344, + "grad_norm": 0.12154010683298111, + "learning_rate": 0.00016754510030307404, + "loss": 0.168, + "step": 11252 + }, + { + "epoch": 0.8119340524549947, + "grad_norm": 0.12976540625095367, + "learning_rate": 0.0001675422138836773, + "loss": 0.1559, + "step": 11253 + }, + { + "epoch": 0.8120062051300552, + "grad_norm": 0.11038821190595627, + "learning_rate": 0.00016753932746428057, + "loss": 0.1713, + "step": 11254 + }, + { + "epoch": 0.8120783578051156, + "grad_norm": 0.11370693147182465, + "learning_rate": 0.00016753644104488383, + "loss": 0.1736, + "step": 11255 + }, + { + "epoch": 0.8121505104801761, + "grad_norm": 0.12086302787065506, + "learning_rate": 0.0001675335546254871, + "loss": 0.1648, + "step": 11256 + }, + { + "epoch": 0.8122226631552365, + "grad_norm": 0.1379534900188446, + "learning_rate": 0.00016753066820609036, + "loss": 0.1458, + "step": 11257 + }, + { + "epoch": 0.812294815830297, + "grad_norm": 0.1315435767173767, + "learning_rate": 0.00016752778178669362, + "loss": 0.1855, + "step": 11258 + }, + { + "epoch": 0.8123669685053574, + "grad_norm": 0.10974664986133575, + "learning_rate": 0.00016752489536729688, + "loss": 0.1462, + "step": 11259 + }, + { + "epoch": 0.8124391211804177, + "grad_norm": 0.11963661760091782, + "learning_rate": 0.00016752200894790015, + "loss": 0.1593, + "step": 11260 + }, + { + "epoch": 0.8125112738554782, + "grad_norm": 0.12325388193130493, + "learning_rate": 0.00016751912252850338, + "loss": 0.1379, + "step": 11261 + }, + { + "epoch": 0.8125834265305386, + "grad_norm": 0.11024509370326996, + "learning_rate": 0.00016751623610910667, + "loss": 0.1094, + "step": 11262 + }, + { + "epoch": 0.812655579205599, + "grad_norm": 0.11602330207824707, + "learning_rate": 0.00016751334968970993, + "loss": 0.1302, + "step": 11263 + }, + { + "epoch": 0.8127277318806595, + "grad_norm": 0.1416502147912979, + "learning_rate": 0.0001675104632703132, + "loss": 0.1638, + "step": 11264 + }, + { + "epoch": 0.8127998845557199, + "grad_norm": 0.12808161973953247, + "learning_rate": 0.00016750757685091646, + "loss": 0.1488, + "step": 11265 + }, + { + "epoch": 0.8128720372307804, + "grad_norm": 0.14989076554775238, + "learning_rate": 0.0001675046904315197, + "loss": 0.174, + "step": 11266 + }, + { + "epoch": 0.8129441899058407, + "grad_norm": 0.11965425312519073, + "learning_rate": 0.00016750180401212296, + "loss": 0.1437, + "step": 11267 + }, + { + "epoch": 0.8130163425809012, + "grad_norm": 0.12085084617137909, + "learning_rate": 0.00016749891759272622, + "loss": 0.1232, + "step": 11268 + }, + { + "epoch": 0.8130884952559616, + "grad_norm": 0.1350867748260498, + "learning_rate": 0.00016749603117332949, + "loss": 0.1333, + "step": 11269 + }, + { + "epoch": 0.813160647931022, + "grad_norm": 0.14008502662181854, + "learning_rate": 0.00016749314475393278, + "loss": 0.1382, + "step": 11270 + }, + { + "epoch": 0.8132328006060825, + "grad_norm": 0.12756000459194183, + "learning_rate": 0.000167490258334536, + "loss": 0.1764, + "step": 11271 + }, + { + "epoch": 0.8133049532811429, + "grad_norm": 0.12092864513397217, + "learning_rate": 0.00016748737191513927, + "loss": 0.1201, + "step": 11272 + }, + { + "epoch": 0.8133771059562033, + "grad_norm": 0.1111254170536995, + "learning_rate": 0.00016748448549574254, + "loss": 0.1212, + "step": 11273 + }, + { + "epoch": 0.8134492586312637, + "grad_norm": 0.11691894382238388, + "learning_rate": 0.0001674815990763458, + "loss": 0.129, + "step": 11274 + }, + { + "epoch": 0.8135214113063242, + "grad_norm": 0.1301519274711609, + "learning_rate": 0.00016747871265694906, + "loss": 0.1261, + "step": 11275 + }, + { + "epoch": 0.8135935639813846, + "grad_norm": 0.12759649753570557, + "learning_rate": 0.00016747582623755233, + "loss": 0.1058, + "step": 11276 + }, + { + "epoch": 0.813665716656445, + "grad_norm": 0.15787038207054138, + "learning_rate": 0.0001674729398181556, + "loss": 0.1682, + "step": 11277 + }, + { + "epoch": 0.8137378693315055, + "grad_norm": 0.1614495813846588, + "learning_rate": 0.00016747005339875885, + "loss": 0.1503, + "step": 11278 + }, + { + "epoch": 0.8138100220065659, + "grad_norm": 0.1081974133849144, + "learning_rate": 0.00016746716697936211, + "loss": 0.1154, + "step": 11279 + }, + { + "epoch": 0.8138821746816263, + "grad_norm": 0.1459926962852478, + "learning_rate": 0.00016746428055996538, + "loss": 0.1193, + "step": 11280 + }, + { + "epoch": 0.8139543273566867, + "grad_norm": 0.10666241496801376, + "learning_rate": 0.00016746139414056864, + "loss": 0.1682, + "step": 11281 + }, + { + "epoch": 0.8140264800317472, + "grad_norm": 0.11136288940906525, + "learning_rate": 0.00016745850772117188, + "loss": 0.1232, + "step": 11282 + }, + { + "epoch": 0.8140986327068076, + "grad_norm": 0.10956153273582458, + "learning_rate": 0.00016745562130177514, + "loss": 0.1283, + "step": 11283 + }, + { + "epoch": 0.814170785381868, + "grad_norm": 0.12416128814220428, + "learning_rate": 0.00016745273488237843, + "loss": 0.0944, + "step": 11284 + }, + { + "epoch": 0.8142429380569285, + "grad_norm": 0.12514308094978333, + "learning_rate": 0.0001674498484629817, + "loss": 0.1457, + "step": 11285 + }, + { + "epoch": 0.8143150907319889, + "grad_norm": 0.11458826810121536, + "learning_rate": 0.00016744696204358496, + "loss": 0.1193, + "step": 11286 + }, + { + "epoch": 0.8143872434070493, + "grad_norm": 0.1273362785577774, + "learning_rate": 0.0001674440756241882, + "loss": 0.1279, + "step": 11287 + }, + { + "epoch": 0.8144593960821097, + "grad_norm": 0.13258153200149536, + "learning_rate": 0.00016744118920479145, + "loss": 0.1405, + "step": 11288 + }, + { + "epoch": 0.8145315487571702, + "grad_norm": 0.15388639271259308, + "learning_rate": 0.00016743830278539472, + "loss": 0.175, + "step": 11289 + }, + { + "epoch": 0.8146037014322306, + "grad_norm": 0.1363607496023178, + "learning_rate": 0.00016743541636599798, + "loss": 0.1533, + "step": 11290 + }, + { + "epoch": 0.814675854107291, + "grad_norm": 0.09938452392816544, + "learning_rate": 0.00016743252994660127, + "loss": 0.1546, + "step": 11291 + }, + { + "epoch": 0.8147480067823515, + "grad_norm": 0.1165052130818367, + "learning_rate": 0.0001674296435272045, + "loss": 0.1516, + "step": 11292 + }, + { + "epoch": 0.8148201594574119, + "grad_norm": 0.12615390121936798, + "learning_rate": 0.00016742675710780777, + "loss": 0.1686, + "step": 11293 + }, + { + "epoch": 0.8148923121324723, + "grad_norm": 0.15578638017177582, + "learning_rate": 0.00016742387068841103, + "loss": 0.1546, + "step": 11294 + }, + { + "epoch": 0.8149644648075327, + "grad_norm": 0.12379761785268784, + "learning_rate": 0.0001674209842690143, + "loss": 0.1009, + "step": 11295 + }, + { + "epoch": 0.8150366174825932, + "grad_norm": 0.13307251036167145, + "learning_rate": 0.00016741809784961756, + "loss": 0.1382, + "step": 11296 + }, + { + "epoch": 0.8151087701576536, + "grad_norm": 0.12157807499170303, + "learning_rate": 0.00016741521143022082, + "loss": 0.1465, + "step": 11297 + }, + { + "epoch": 0.815180922832714, + "grad_norm": 0.1452769786119461, + "learning_rate": 0.00016741232501082408, + "loss": 0.206, + "step": 11298 + }, + { + "epoch": 0.8152530755077745, + "grad_norm": 0.11464543640613556, + "learning_rate": 0.00016740943859142735, + "loss": 0.1673, + "step": 11299 + }, + { + "epoch": 0.8153252281828349, + "grad_norm": 0.1436835080385208, + "learning_rate": 0.0001674065521720306, + "loss": 0.1452, + "step": 11300 + }, + { + "epoch": 0.8153973808578953, + "grad_norm": 0.12474244832992554, + "learning_rate": 0.00016740366575263387, + "loss": 0.1306, + "step": 11301 + }, + { + "epoch": 0.8154695335329557, + "grad_norm": 0.10895528644323349, + "learning_rate": 0.00016740077933323713, + "loss": 0.0865, + "step": 11302 + }, + { + "epoch": 0.8155416862080161, + "grad_norm": 0.11682059615850449, + "learning_rate": 0.00016739789291384037, + "loss": 0.1051, + "step": 11303 + }, + { + "epoch": 0.8156138388830766, + "grad_norm": 0.11779388785362244, + "learning_rate": 0.00016739500649444363, + "loss": 0.1642, + "step": 11304 + }, + { + "epoch": 0.815685991558137, + "grad_norm": 0.13912788033485413, + "learning_rate": 0.00016739212007504692, + "loss": 0.1451, + "step": 11305 + }, + { + "epoch": 0.8157581442331975, + "grad_norm": 0.14120379090309143, + "learning_rate": 0.00016738923365565019, + "loss": 0.1531, + "step": 11306 + }, + { + "epoch": 0.8158302969082579, + "grad_norm": 0.13223904371261597, + "learning_rate": 0.00016738634723625345, + "loss": 0.1145, + "step": 11307 + }, + { + "epoch": 0.8159024495833183, + "grad_norm": 0.10849042236804962, + "learning_rate": 0.00016738346081685669, + "loss": 0.1252, + "step": 11308 + }, + { + "epoch": 0.8159746022583787, + "grad_norm": 0.1092374175786972, + "learning_rate": 0.00016738057439745995, + "loss": 0.1861, + "step": 11309 + }, + { + "epoch": 0.8160467549334391, + "grad_norm": 0.15699417889118195, + "learning_rate": 0.0001673776879780632, + "loss": 0.1783, + "step": 11310 + }, + { + "epoch": 0.8161189076084996, + "grad_norm": 0.12624722719192505, + "learning_rate": 0.00016737480155866647, + "loss": 0.1394, + "step": 11311 + }, + { + "epoch": 0.81619106028356, + "grad_norm": 0.1225733831524849, + "learning_rate": 0.00016737191513926976, + "loss": 0.1113, + "step": 11312 + }, + { + "epoch": 0.8162632129586205, + "grad_norm": 0.13231535255908966, + "learning_rate": 0.000167369028719873, + "loss": 0.1254, + "step": 11313 + }, + { + "epoch": 0.8163353656336809, + "grad_norm": 0.10797318071126938, + "learning_rate": 0.00016736614230047626, + "loss": 0.1623, + "step": 11314 + }, + { + "epoch": 0.8164075183087413, + "grad_norm": 0.12934650480747223, + "learning_rate": 0.00016736325588107953, + "loss": 0.1713, + "step": 11315 + }, + { + "epoch": 0.8164796709838017, + "grad_norm": 0.14138473570346832, + "learning_rate": 0.0001673603694616828, + "loss": 0.1406, + "step": 11316 + }, + { + "epoch": 0.8165518236588621, + "grad_norm": 0.11887970566749573, + "learning_rate": 0.00016735748304228605, + "loss": 0.1198, + "step": 11317 + }, + { + "epoch": 0.8166239763339226, + "grad_norm": 0.15465225279331207, + "learning_rate": 0.00016735459662288931, + "loss": 0.1481, + "step": 11318 + }, + { + "epoch": 0.816696129008983, + "grad_norm": 0.11133931577205658, + "learning_rate": 0.00016735171020349258, + "loss": 0.1649, + "step": 11319 + }, + { + "epoch": 0.8167682816840435, + "grad_norm": 0.14071118831634521, + "learning_rate": 0.00016734882378409584, + "loss": 0.2198, + "step": 11320 + }, + { + "epoch": 0.8168404343591039, + "grad_norm": 0.1234261766076088, + "learning_rate": 0.0001673459373646991, + "loss": 0.1704, + "step": 11321 + }, + { + "epoch": 0.8169125870341643, + "grad_norm": 0.13197477161884308, + "learning_rate": 0.00016734305094530237, + "loss": 0.1667, + "step": 11322 + }, + { + "epoch": 0.8169847397092247, + "grad_norm": 0.15567070245742798, + "learning_rate": 0.00016734016452590563, + "loss": 0.1706, + "step": 11323 + }, + { + "epoch": 0.8170568923842851, + "grad_norm": 0.14438800513744354, + "learning_rate": 0.00016733727810650886, + "loss": 0.1418, + "step": 11324 + }, + { + "epoch": 0.8171290450593456, + "grad_norm": 0.12666374444961548, + "learning_rate": 0.00016733439168711213, + "loss": 0.1437, + "step": 11325 + }, + { + "epoch": 0.817201197734406, + "grad_norm": 0.10681997239589691, + "learning_rate": 0.00016733150526771542, + "loss": 0.1826, + "step": 11326 + }, + { + "epoch": 0.8172733504094665, + "grad_norm": 0.13707345724105835, + "learning_rate": 0.00016732861884831868, + "loss": 0.1495, + "step": 11327 + }, + { + "epoch": 0.8173455030845269, + "grad_norm": 0.10398998111486435, + "learning_rate": 0.00016732573242892194, + "loss": 0.1334, + "step": 11328 + }, + { + "epoch": 0.8174176557595872, + "grad_norm": 0.13627657294273376, + "learning_rate": 0.00016732284600952518, + "loss": 0.1568, + "step": 11329 + }, + { + "epoch": 0.8174898084346477, + "grad_norm": 0.11443864554166794, + "learning_rate": 0.00016731995959012844, + "loss": 0.1403, + "step": 11330 + }, + { + "epoch": 0.8175619611097081, + "grad_norm": 0.13510550558567047, + "learning_rate": 0.0001673170731707317, + "loss": 0.138, + "step": 11331 + }, + { + "epoch": 0.8176341137847686, + "grad_norm": 0.13009688258171082, + "learning_rate": 0.00016731418675133497, + "loss": 0.2187, + "step": 11332 + }, + { + "epoch": 0.817706266459829, + "grad_norm": 0.12749722599983215, + "learning_rate": 0.00016731130033193826, + "loss": 0.1697, + "step": 11333 + }, + { + "epoch": 0.8177784191348895, + "grad_norm": 0.13540560007095337, + "learning_rate": 0.0001673084139125415, + "loss": 0.1496, + "step": 11334 + }, + { + "epoch": 0.8178505718099498, + "grad_norm": 0.12042059749364853, + "learning_rate": 0.00016730552749314476, + "loss": 0.1515, + "step": 11335 + }, + { + "epoch": 0.8179227244850102, + "grad_norm": 0.11170211434364319, + "learning_rate": 0.00016730264107374802, + "loss": 0.1372, + "step": 11336 + }, + { + "epoch": 0.8179948771600707, + "grad_norm": 0.12397082149982452, + "learning_rate": 0.00016729975465435128, + "loss": 0.18, + "step": 11337 + }, + { + "epoch": 0.8180670298351311, + "grad_norm": 0.15444037318229675, + "learning_rate": 0.00016729686823495455, + "loss": 0.1394, + "step": 11338 + }, + { + "epoch": 0.8181391825101916, + "grad_norm": 0.16244472563266754, + "learning_rate": 0.0001672939818155578, + "loss": 0.2063, + "step": 11339 + }, + { + "epoch": 0.818211335185252, + "grad_norm": 0.14282941818237305, + "learning_rate": 0.00016729109539616107, + "loss": 0.1592, + "step": 11340 + }, + { + "epoch": 0.8182834878603125, + "grad_norm": 0.12612587213516235, + "learning_rate": 0.00016728820897676433, + "loss": 0.117, + "step": 11341 + }, + { + "epoch": 0.8183556405353728, + "grad_norm": 0.1341152936220169, + "learning_rate": 0.0001672853225573676, + "loss": 0.0801, + "step": 11342 + }, + { + "epoch": 0.8184277932104332, + "grad_norm": 0.12467809021472931, + "learning_rate": 0.00016728243613797086, + "loss": 0.1567, + "step": 11343 + }, + { + "epoch": 0.8184999458854937, + "grad_norm": 0.1068568080663681, + "learning_rate": 0.00016727954971857412, + "loss": 0.1679, + "step": 11344 + }, + { + "epoch": 0.8185720985605541, + "grad_norm": 0.1270045042037964, + "learning_rate": 0.00016727666329917736, + "loss": 0.1398, + "step": 11345 + }, + { + "epoch": 0.8186442512356146, + "grad_norm": 0.1394347995519638, + "learning_rate": 0.00016727377687978062, + "loss": 0.1676, + "step": 11346 + }, + { + "epoch": 0.818716403910675, + "grad_norm": 0.10702545195817947, + "learning_rate": 0.0001672708904603839, + "loss": 0.1303, + "step": 11347 + }, + { + "epoch": 0.8187885565857355, + "grad_norm": 0.10880977660417557, + "learning_rate": 0.00016726800404098717, + "loss": 0.1486, + "step": 11348 + }, + { + "epoch": 0.8188607092607958, + "grad_norm": 0.12881873548030853, + "learning_rate": 0.00016726511762159044, + "loss": 0.1323, + "step": 11349 + }, + { + "epoch": 0.8189328619358562, + "grad_norm": 0.13381995260715485, + "learning_rate": 0.00016726223120219367, + "loss": 0.1607, + "step": 11350 + }, + { + "epoch": 0.8190050146109167, + "grad_norm": 0.12776945531368256, + "learning_rate": 0.00016725934478279694, + "loss": 0.1351, + "step": 11351 + }, + { + "epoch": 0.8190771672859771, + "grad_norm": 0.10766884684562683, + "learning_rate": 0.0001672564583634002, + "loss": 0.1314, + "step": 11352 + }, + { + "epoch": 0.8191493199610376, + "grad_norm": 0.10411593317985535, + "learning_rate": 0.00016725357194400346, + "loss": 0.1241, + "step": 11353 + }, + { + "epoch": 0.819221472636098, + "grad_norm": 0.11986301839351654, + "learning_rate": 0.00016725068552460675, + "loss": 0.1641, + "step": 11354 + }, + { + "epoch": 0.8192936253111585, + "grad_norm": 0.11862561106681824, + "learning_rate": 0.00016724779910521, + "loss": 0.0899, + "step": 11355 + }, + { + "epoch": 0.8193657779862188, + "grad_norm": 0.09624575823545456, + "learning_rate": 0.00016724491268581325, + "loss": 0.1614, + "step": 11356 + }, + { + "epoch": 0.8194379306612792, + "grad_norm": 0.11138183623552322, + "learning_rate": 0.00016724202626641651, + "loss": 0.1067, + "step": 11357 + }, + { + "epoch": 0.8195100833363397, + "grad_norm": 0.14141838252544403, + "learning_rate": 0.00016723913984701978, + "loss": 0.2167, + "step": 11358 + }, + { + "epoch": 0.8195822360114001, + "grad_norm": 0.13359762728214264, + "learning_rate": 0.00016723625342762304, + "loss": 0.1533, + "step": 11359 + }, + { + "epoch": 0.8196543886864606, + "grad_norm": 0.14311687648296356, + "learning_rate": 0.0001672333670082263, + "loss": 0.0928, + "step": 11360 + }, + { + "epoch": 0.819726541361521, + "grad_norm": 0.12119587510824203, + "learning_rate": 0.00016723048058882957, + "loss": 0.1239, + "step": 11361 + }, + { + "epoch": 0.8197986940365815, + "grad_norm": 0.1338701993227005, + "learning_rate": 0.00016722759416943283, + "loss": 0.1138, + "step": 11362 + }, + { + "epoch": 0.8198708467116418, + "grad_norm": 0.14802776277065277, + "learning_rate": 0.0001672247077500361, + "loss": 0.1847, + "step": 11363 + }, + { + "epoch": 0.8199429993867022, + "grad_norm": 0.10998402535915375, + "learning_rate": 0.00016722182133063935, + "loss": 0.1656, + "step": 11364 + }, + { + "epoch": 0.8200151520617627, + "grad_norm": 0.0966625064611435, + "learning_rate": 0.00016721893491124262, + "loss": 0.1347, + "step": 11365 + }, + { + "epoch": 0.8200873047368231, + "grad_norm": 0.12330837547779083, + "learning_rate": 0.00016721604849184588, + "loss": 0.1189, + "step": 11366 + }, + { + "epoch": 0.8201594574118836, + "grad_norm": 0.14066612720489502, + "learning_rate": 0.00016721316207244912, + "loss": 0.1443, + "step": 11367 + }, + { + "epoch": 0.820231610086944, + "grad_norm": 0.12989702820777893, + "learning_rate": 0.0001672102756530524, + "loss": 0.1289, + "step": 11368 + }, + { + "epoch": 0.8203037627620045, + "grad_norm": 0.11941897124052048, + "learning_rate": 0.00016720738923365567, + "loss": 0.1422, + "step": 11369 + }, + { + "epoch": 0.8203759154370648, + "grad_norm": 0.14209255576133728, + "learning_rate": 0.00016720450281425893, + "loss": 0.133, + "step": 11370 + }, + { + "epoch": 0.8204480681121252, + "grad_norm": 0.11604241281747818, + "learning_rate": 0.0001672016163948622, + "loss": 0.1468, + "step": 11371 + }, + { + "epoch": 0.8205202207871857, + "grad_norm": 0.12015185505151749, + "learning_rate": 0.00016719872997546543, + "loss": 0.1473, + "step": 11372 + }, + { + "epoch": 0.8205923734622461, + "grad_norm": 0.11163683235645294, + "learning_rate": 0.0001671958435560687, + "loss": 0.1324, + "step": 11373 + }, + { + "epoch": 0.8206645261373066, + "grad_norm": 0.1218007430434227, + "learning_rate": 0.00016719295713667196, + "loss": 0.165, + "step": 11374 + }, + { + "epoch": 0.820736678812367, + "grad_norm": 0.12326497584581375, + "learning_rate": 0.00016719007071727525, + "loss": 0.1737, + "step": 11375 + }, + { + "epoch": 0.8208088314874274, + "grad_norm": 0.11788228154182434, + "learning_rate": 0.0001671871842978785, + "loss": 0.1724, + "step": 11376 + }, + { + "epoch": 0.8208809841624878, + "grad_norm": 0.13680443167686462, + "learning_rate": 0.00016718429787848175, + "loss": 0.1258, + "step": 11377 + }, + { + "epoch": 0.8209531368375482, + "grad_norm": 0.14765697717666626, + "learning_rate": 0.000167181411459085, + "loss": 0.1183, + "step": 11378 + }, + { + "epoch": 0.8210252895126087, + "grad_norm": 0.12750886380672455, + "learning_rate": 0.00016717852503968827, + "loss": 0.1449, + "step": 11379 + }, + { + "epoch": 0.8210974421876691, + "grad_norm": 0.1314268410205841, + "learning_rate": 0.00016717563862029153, + "loss": 0.1475, + "step": 11380 + }, + { + "epoch": 0.8211695948627296, + "grad_norm": 0.09203343838453293, + "learning_rate": 0.0001671727522008948, + "loss": 0.1635, + "step": 11381 + }, + { + "epoch": 0.82124174753779, + "grad_norm": 0.1281193345785141, + "learning_rate": 0.00016716986578149806, + "loss": 0.1168, + "step": 11382 + }, + { + "epoch": 0.8213139002128504, + "grad_norm": 0.12138880789279938, + "learning_rate": 0.00016716697936210132, + "loss": 0.1024, + "step": 11383 + }, + { + "epoch": 0.8213860528879108, + "grad_norm": 0.12816603481769562, + "learning_rate": 0.00016716409294270459, + "loss": 0.187, + "step": 11384 + }, + { + "epoch": 0.8214582055629712, + "grad_norm": 0.12044396996498108, + "learning_rate": 0.00016716120652330785, + "loss": 0.1667, + "step": 11385 + }, + { + "epoch": 0.8215303582380317, + "grad_norm": 0.12165230512619019, + "learning_rate": 0.0001671583201039111, + "loss": 0.1448, + "step": 11386 + }, + { + "epoch": 0.8216025109130921, + "grad_norm": 0.12377709150314331, + "learning_rate": 0.00016715543368451437, + "loss": 0.1234, + "step": 11387 + }, + { + "epoch": 0.8216746635881526, + "grad_norm": 0.1418270766735077, + "learning_rate": 0.0001671525472651176, + "loss": 0.1433, + "step": 11388 + }, + { + "epoch": 0.821746816263213, + "grad_norm": 0.14677253365516663, + "learning_rate": 0.0001671496608457209, + "loss": 0.1389, + "step": 11389 + }, + { + "epoch": 0.8218189689382734, + "grad_norm": 0.13841110467910767, + "learning_rate": 0.00016714677442632416, + "loss": 0.1414, + "step": 11390 + }, + { + "epoch": 0.8218911216133338, + "grad_norm": 0.12224303185939789, + "learning_rate": 0.00016714388800692743, + "loss": 0.1198, + "step": 11391 + }, + { + "epoch": 0.8219632742883942, + "grad_norm": 0.11889313906431198, + "learning_rate": 0.0001671410015875307, + "loss": 0.149, + "step": 11392 + }, + { + "epoch": 0.8220354269634547, + "grad_norm": 0.12684161961078644, + "learning_rate": 0.00016713811516813393, + "loss": 0.1097, + "step": 11393 + }, + { + "epoch": 0.8221075796385151, + "grad_norm": 0.14017455279827118, + "learning_rate": 0.0001671352287487372, + "loss": 0.1165, + "step": 11394 + }, + { + "epoch": 0.8221797323135756, + "grad_norm": 0.13586123287677765, + "learning_rate": 0.00016713234232934045, + "loss": 0.1928, + "step": 11395 + }, + { + "epoch": 0.822251884988636, + "grad_norm": 0.12471065670251846, + "learning_rate": 0.00016712945590994374, + "loss": 0.1518, + "step": 11396 + }, + { + "epoch": 0.8223240376636963, + "grad_norm": 0.1188042163848877, + "learning_rate": 0.000167126569490547, + "loss": 0.0972, + "step": 11397 + }, + { + "epoch": 0.8223961903387568, + "grad_norm": 0.1473436951637268, + "learning_rate": 0.00016712368307115024, + "loss": 0.2051, + "step": 11398 + }, + { + "epoch": 0.8224683430138172, + "grad_norm": 0.12210531532764435, + "learning_rate": 0.0001671207966517535, + "loss": 0.1327, + "step": 11399 + }, + { + "epoch": 0.8225404956888777, + "grad_norm": 0.13445596396923065, + "learning_rate": 0.00016711791023235677, + "loss": 0.1217, + "step": 11400 + }, + { + "epoch": 0.8226126483639381, + "grad_norm": 0.12706753611564636, + "learning_rate": 0.00016711502381296003, + "loss": 0.1666, + "step": 11401 + }, + { + "epoch": 0.8226848010389985, + "grad_norm": 0.1364869624376297, + "learning_rate": 0.0001671121373935633, + "loss": 0.1012, + "step": 11402 + }, + { + "epoch": 0.822756953714059, + "grad_norm": 0.1339399367570877, + "learning_rate": 0.00016710925097416655, + "loss": 0.1085, + "step": 11403 + }, + { + "epoch": 0.8228291063891193, + "grad_norm": 0.12054028362035751, + "learning_rate": 0.00016710636455476982, + "loss": 0.1091, + "step": 11404 + }, + { + "epoch": 0.8229012590641798, + "grad_norm": 0.12239770591259003, + "learning_rate": 0.00016710347813537308, + "loss": 0.112, + "step": 11405 + }, + { + "epoch": 0.8229734117392402, + "grad_norm": 0.12391000986099243, + "learning_rate": 0.00016710059171597634, + "loss": 0.1344, + "step": 11406 + }, + { + "epoch": 0.8230455644143007, + "grad_norm": 0.13942229747772217, + "learning_rate": 0.0001670977052965796, + "loss": 0.1902, + "step": 11407 + }, + { + "epoch": 0.8231177170893611, + "grad_norm": 0.12491654604673386, + "learning_rate": 0.00016709481887718287, + "loss": 0.1334, + "step": 11408 + }, + { + "epoch": 0.8231898697644215, + "grad_norm": 0.12403670698404312, + "learning_rate": 0.0001670919324577861, + "loss": 0.1935, + "step": 11409 + }, + { + "epoch": 0.823262022439482, + "grad_norm": 0.1195942834019661, + "learning_rate": 0.0001670890460383894, + "loss": 0.1291, + "step": 11410 + }, + { + "epoch": 0.8233341751145423, + "grad_norm": 0.12322898209095001, + "learning_rate": 0.00016708615961899266, + "loss": 0.1439, + "step": 11411 + }, + { + "epoch": 0.8234063277896028, + "grad_norm": 0.15479105710983276, + "learning_rate": 0.00016708327319959592, + "loss": 0.1983, + "step": 11412 + }, + { + "epoch": 0.8234784804646632, + "grad_norm": 0.14281509816646576, + "learning_rate": 0.00016708038678019918, + "loss": 0.1374, + "step": 11413 + }, + { + "epoch": 0.8235506331397237, + "grad_norm": 0.18304792046546936, + "learning_rate": 0.00016707750036080242, + "loss": 0.1163, + "step": 11414 + }, + { + "epoch": 0.8236227858147841, + "grad_norm": 0.1353597342967987, + "learning_rate": 0.00016707461394140568, + "loss": 0.1208, + "step": 11415 + }, + { + "epoch": 0.8236949384898445, + "grad_norm": 0.11527366191148758, + "learning_rate": 0.00016707172752200895, + "loss": 0.1905, + "step": 11416 + }, + { + "epoch": 0.823767091164905, + "grad_norm": 0.13008643686771393, + "learning_rate": 0.00016706884110261224, + "loss": 0.1522, + "step": 11417 + }, + { + "epoch": 0.8238392438399653, + "grad_norm": 0.1226491928100586, + "learning_rate": 0.0001670659546832155, + "loss": 0.1924, + "step": 11418 + }, + { + "epoch": 0.8239113965150258, + "grad_norm": 0.10746552050113678, + "learning_rate": 0.00016706306826381873, + "loss": 0.1586, + "step": 11419 + }, + { + "epoch": 0.8239835491900862, + "grad_norm": 0.11757387965917587, + "learning_rate": 0.000167060181844422, + "loss": 0.1338, + "step": 11420 + }, + { + "epoch": 0.8240557018651467, + "grad_norm": 0.12384447455406189, + "learning_rate": 0.00016705729542502526, + "loss": 0.0816, + "step": 11421 + }, + { + "epoch": 0.8241278545402071, + "grad_norm": 0.1322738528251648, + "learning_rate": 0.00016705440900562852, + "loss": 0.1374, + "step": 11422 + }, + { + "epoch": 0.8242000072152675, + "grad_norm": 0.12896503508090973, + "learning_rate": 0.00016705152258623179, + "loss": 0.1012, + "step": 11423 + }, + { + "epoch": 0.824272159890328, + "grad_norm": 0.10323493182659149, + "learning_rate": 0.00016704863616683505, + "loss": 0.1385, + "step": 11424 + }, + { + "epoch": 0.8243443125653883, + "grad_norm": 0.12220212072134018, + "learning_rate": 0.0001670457497474383, + "loss": 0.1403, + "step": 11425 + }, + { + "epoch": 0.8244164652404488, + "grad_norm": 0.10760347545146942, + "learning_rate": 0.00016704286332804157, + "loss": 0.1465, + "step": 11426 + }, + { + "epoch": 0.8244886179155092, + "grad_norm": 0.11672066152095795, + "learning_rate": 0.00016703997690864484, + "loss": 0.1387, + "step": 11427 + }, + { + "epoch": 0.8245607705905696, + "grad_norm": 0.11714489012956619, + "learning_rate": 0.0001670370904892481, + "loss": 0.1338, + "step": 11428 + }, + { + "epoch": 0.8246329232656301, + "grad_norm": 0.14161768555641174, + "learning_rate": 0.00016703420406985136, + "loss": 0.1463, + "step": 11429 + }, + { + "epoch": 0.8247050759406905, + "grad_norm": 0.17559358477592468, + "learning_rate": 0.0001670313176504546, + "loss": 0.1412, + "step": 11430 + }, + { + "epoch": 0.824777228615751, + "grad_norm": 0.10626249015331268, + "learning_rate": 0.0001670284312310579, + "loss": 0.1907, + "step": 11431 + }, + { + "epoch": 0.8248493812908113, + "grad_norm": 0.1279340535402298, + "learning_rate": 0.00016702554481166115, + "loss": 0.1475, + "step": 11432 + }, + { + "epoch": 0.8249215339658718, + "grad_norm": 0.10294477641582489, + "learning_rate": 0.00016702265839226441, + "loss": 0.1131, + "step": 11433 + }, + { + "epoch": 0.8249936866409322, + "grad_norm": 0.13072919845581055, + "learning_rate": 0.00016701977197286768, + "loss": 0.1605, + "step": 11434 + }, + { + "epoch": 0.8250658393159926, + "grad_norm": 0.15092572569847107, + "learning_rate": 0.0001670168855534709, + "loss": 0.1565, + "step": 11435 + }, + { + "epoch": 0.8251379919910531, + "grad_norm": 0.13632875680923462, + "learning_rate": 0.00016701399913407418, + "loss": 0.1183, + "step": 11436 + }, + { + "epoch": 0.8252101446661135, + "grad_norm": 0.10911676287651062, + "learning_rate": 0.00016701111271467744, + "loss": 0.1335, + "step": 11437 + }, + { + "epoch": 0.825282297341174, + "grad_norm": 0.15608882904052734, + "learning_rate": 0.00016700822629528073, + "loss": 0.134, + "step": 11438 + }, + { + "epoch": 0.8253544500162343, + "grad_norm": 0.11503724753856659, + "learning_rate": 0.000167005339875884, + "loss": 0.1257, + "step": 11439 + }, + { + "epoch": 0.8254266026912948, + "grad_norm": 0.1044754907488823, + "learning_rate": 0.00016700245345648723, + "loss": 0.1657, + "step": 11440 + }, + { + "epoch": 0.8254987553663552, + "grad_norm": 0.1386835128068924, + "learning_rate": 0.0001669995670370905, + "loss": 0.1441, + "step": 11441 + }, + { + "epoch": 0.8255709080414156, + "grad_norm": 0.11585705727338791, + "learning_rate": 0.00016699668061769375, + "loss": 0.1287, + "step": 11442 + }, + { + "epoch": 0.8256430607164761, + "grad_norm": 0.14055432379245758, + "learning_rate": 0.00016699379419829702, + "loss": 0.1557, + "step": 11443 + }, + { + "epoch": 0.8257152133915365, + "grad_norm": 0.14461514353752136, + "learning_rate": 0.00016699090777890028, + "loss": 0.2014, + "step": 11444 + }, + { + "epoch": 0.825787366066597, + "grad_norm": 0.11708839982748032, + "learning_rate": 0.00016698802135950354, + "loss": 0.136, + "step": 11445 + }, + { + "epoch": 0.8258595187416573, + "grad_norm": 0.14736014604568481, + "learning_rate": 0.0001669851349401068, + "loss": 0.2258, + "step": 11446 + }, + { + "epoch": 0.8259316714167177, + "grad_norm": 0.13729187846183777, + "learning_rate": 0.00016698224852071007, + "loss": 0.1322, + "step": 11447 + }, + { + "epoch": 0.8260038240917782, + "grad_norm": 0.17847642302513123, + "learning_rate": 0.00016697936210131333, + "loss": 0.158, + "step": 11448 + }, + { + "epoch": 0.8260759767668386, + "grad_norm": 0.12275891751050949, + "learning_rate": 0.0001669764756819166, + "loss": 0.1545, + "step": 11449 + }, + { + "epoch": 0.8261481294418991, + "grad_norm": 0.12103191018104553, + "learning_rate": 0.00016697358926251986, + "loss": 0.1496, + "step": 11450 + }, + { + "epoch": 0.8262202821169595, + "grad_norm": 0.12052446603775024, + "learning_rate": 0.0001669707028431231, + "loss": 0.1507, + "step": 11451 + }, + { + "epoch": 0.8262924347920199, + "grad_norm": 0.13198426365852356, + "learning_rate": 0.00016696781642372638, + "loss": 0.1461, + "step": 11452 + }, + { + "epoch": 0.8263645874670803, + "grad_norm": 0.11974554508924484, + "learning_rate": 0.00016696493000432965, + "loss": 0.1452, + "step": 11453 + }, + { + "epoch": 0.8264367401421407, + "grad_norm": 0.14511948823928833, + "learning_rate": 0.0001669620435849329, + "loss": 0.1544, + "step": 11454 + }, + { + "epoch": 0.8265088928172012, + "grad_norm": 0.12905672192573547, + "learning_rate": 0.00016695915716553617, + "loss": 0.2065, + "step": 11455 + }, + { + "epoch": 0.8265810454922616, + "grad_norm": 0.12299144268035889, + "learning_rate": 0.0001669562707461394, + "loss": 0.1579, + "step": 11456 + }, + { + "epoch": 0.8266531981673221, + "grad_norm": 0.11316408962011337, + "learning_rate": 0.00016695338432674267, + "loss": 0.1802, + "step": 11457 + }, + { + "epoch": 0.8267253508423825, + "grad_norm": 0.11111404746770859, + "learning_rate": 0.00016695049790734593, + "loss": 0.1292, + "step": 11458 + }, + { + "epoch": 0.8267975035174429, + "grad_norm": 0.11728944629430771, + "learning_rate": 0.0001669476114879492, + "loss": 0.1598, + "step": 11459 + }, + { + "epoch": 0.8268696561925033, + "grad_norm": 0.13308461010456085, + "learning_rate": 0.0001669447250685525, + "loss": 0.147, + "step": 11460 + }, + { + "epoch": 0.8269418088675637, + "grad_norm": 0.12469086796045303, + "learning_rate": 0.00016694183864915572, + "loss": 0.1544, + "step": 11461 + }, + { + "epoch": 0.8270139615426242, + "grad_norm": 0.1318177431821823, + "learning_rate": 0.00016693895222975899, + "loss": 0.1395, + "step": 11462 + }, + { + "epoch": 0.8270861142176846, + "grad_norm": 0.10883750021457672, + "learning_rate": 0.00016693606581036225, + "loss": 0.1476, + "step": 11463 + }, + { + "epoch": 0.8271582668927451, + "grad_norm": 0.11540371179580688, + "learning_rate": 0.0001669331793909655, + "loss": 0.1527, + "step": 11464 + }, + { + "epoch": 0.8272304195678055, + "grad_norm": 0.11190015822649002, + "learning_rate": 0.00016693029297156877, + "loss": 0.1358, + "step": 11465 + }, + { + "epoch": 0.8273025722428659, + "grad_norm": 0.14315339922904968, + "learning_rate": 0.00016692740655217204, + "loss": 0.1627, + "step": 11466 + }, + { + "epoch": 0.8273747249179263, + "grad_norm": 0.11390077322721481, + "learning_rate": 0.0001669245201327753, + "loss": 0.1291, + "step": 11467 + }, + { + "epoch": 0.8274468775929867, + "grad_norm": 0.11460059136152267, + "learning_rate": 0.00016692163371337856, + "loss": 0.163, + "step": 11468 + }, + { + "epoch": 0.8275190302680472, + "grad_norm": 0.12776412069797516, + "learning_rate": 0.00016691874729398183, + "loss": 0.1456, + "step": 11469 + }, + { + "epoch": 0.8275911829431076, + "grad_norm": 0.1419527381658554, + "learning_rate": 0.0001669158608745851, + "loss": 0.1697, + "step": 11470 + }, + { + "epoch": 0.8276633356181681, + "grad_norm": 0.1077413484454155, + "learning_rate": 0.00016691297445518835, + "loss": 0.0857, + "step": 11471 + }, + { + "epoch": 0.8277354882932285, + "grad_norm": 0.11748011410236359, + "learning_rate": 0.0001669100880357916, + "loss": 0.1341, + "step": 11472 + }, + { + "epoch": 0.8278076409682888, + "grad_norm": 0.12573198974132538, + "learning_rate": 0.00016690720161639485, + "loss": 0.1627, + "step": 11473 + }, + { + "epoch": 0.8278797936433493, + "grad_norm": 0.1180257648229599, + "learning_rate": 0.00016690431519699814, + "loss": 0.1353, + "step": 11474 + }, + { + "epoch": 0.8279519463184097, + "grad_norm": 0.11512475460767746, + "learning_rate": 0.0001669014287776014, + "loss": 0.1908, + "step": 11475 + }, + { + "epoch": 0.8280240989934702, + "grad_norm": 0.11620759218931198, + "learning_rate": 0.00016689854235820467, + "loss": 0.185, + "step": 11476 + }, + { + "epoch": 0.8280962516685306, + "grad_norm": 0.11351757496595383, + "learning_rate": 0.0001668956559388079, + "loss": 0.1665, + "step": 11477 + }, + { + "epoch": 0.8281684043435911, + "grad_norm": 0.11081854999065399, + "learning_rate": 0.00016689276951941116, + "loss": 0.1247, + "step": 11478 + }, + { + "epoch": 0.8282405570186515, + "grad_norm": 0.12272868305444717, + "learning_rate": 0.00016688988310001443, + "loss": 0.177, + "step": 11479 + }, + { + "epoch": 0.8283127096937118, + "grad_norm": 0.15450416505336761, + "learning_rate": 0.0001668869966806177, + "loss": 0.1583, + "step": 11480 + }, + { + "epoch": 0.8283848623687723, + "grad_norm": 0.12492774426937103, + "learning_rate": 0.00016688411026122098, + "loss": 0.1731, + "step": 11481 + }, + { + "epoch": 0.8284570150438327, + "grad_norm": 0.1554734855890274, + "learning_rate": 0.00016688122384182422, + "loss": 0.1951, + "step": 11482 + }, + { + "epoch": 0.8285291677188932, + "grad_norm": 0.12958884239196777, + "learning_rate": 0.00016687833742242748, + "loss": 0.1956, + "step": 11483 + }, + { + "epoch": 0.8286013203939536, + "grad_norm": 0.11311990767717361, + "learning_rate": 0.00016687545100303074, + "loss": 0.1696, + "step": 11484 + }, + { + "epoch": 0.8286734730690141, + "grad_norm": 0.14542226493358612, + "learning_rate": 0.000166872564583634, + "loss": 0.1311, + "step": 11485 + }, + { + "epoch": 0.8287456257440745, + "grad_norm": 0.13559548556804657, + "learning_rate": 0.00016686967816423727, + "loss": 0.1058, + "step": 11486 + }, + { + "epoch": 0.8288177784191348, + "grad_norm": 0.13337862491607666, + "learning_rate": 0.00016686679174484053, + "loss": 0.1734, + "step": 11487 + }, + { + "epoch": 0.8288899310941953, + "grad_norm": 0.10532380640506744, + "learning_rate": 0.0001668639053254438, + "loss": 0.1618, + "step": 11488 + }, + { + "epoch": 0.8289620837692557, + "grad_norm": 0.1238148957490921, + "learning_rate": 0.00016686101890604706, + "loss": 0.1336, + "step": 11489 + }, + { + "epoch": 0.8290342364443162, + "grad_norm": 0.11113140732049942, + "learning_rate": 0.00016685813248665032, + "loss": 0.1732, + "step": 11490 + }, + { + "epoch": 0.8291063891193766, + "grad_norm": 0.11272302269935608, + "learning_rate": 0.00016685524606725358, + "loss": 0.1699, + "step": 11491 + }, + { + "epoch": 0.8291785417944371, + "grad_norm": 0.14038799703121185, + "learning_rate": 0.00016685235964785685, + "loss": 0.1912, + "step": 11492 + }, + { + "epoch": 0.8292506944694975, + "grad_norm": 0.15837141871452332, + "learning_rate": 0.0001668494732284601, + "loss": 0.19, + "step": 11493 + }, + { + "epoch": 0.8293228471445578, + "grad_norm": 0.11943506449460983, + "learning_rate": 0.00016684658680906334, + "loss": 0.1677, + "step": 11494 + }, + { + "epoch": 0.8293949998196183, + "grad_norm": 0.11513236910104752, + "learning_rate": 0.00016684370038966663, + "loss": 0.2095, + "step": 11495 + }, + { + "epoch": 0.8294671524946787, + "grad_norm": 0.12636685371398926, + "learning_rate": 0.0001668408139702699, + "loss": 0.1642, + "step": 11496 + }, + { + "epoch": 0.8295393051697392, + "grad_norm": 0.13415803015232086, + "learning_rate": 0.00016683792755087316, + "loss": 0.1485, + "step": 11497 + }, + { + "epoch": 0.8296114578447996, + "grad_norm": 0.11816070973873138, + "learning_rate": 0.00016683504113147642, + "loss": 0.1304, + "step": 11498 + }, + { + "epoch": 0.8296836105198601, + "grad_norm": 0.13376086950302124, + "learning_rate": 0.00016683215471207966, + "loss": 0.14, + "step": 11499 + }, + { + "epoch": 0.8297557631949205, + "grad_norm": 0.12860022485256195, + "learning_rate": 0.00016682926829268292, + "loss": 0.0953, + "step": 11500 + }, + { + "epoch": 0.8298279158699808, + "grad_norm": 0.1409020721912384, + "learning_rate": 0.00016682638187328619, + "loss": 0.1394, + "step": 11501 + }, + { + "epoch": 0.8299000685450413, + "grad_norm": 0.14115962386131287, + "learning_rate": 0.00016682349545388948, + "loss": 0.1678, + "step": 11502 + }, + { + "epoch": 0.8299722212201017, + "grad_norm": 0.17172449827194214, + "learning_rate": 0.00016682060903449274, + "loss": 0.1547, + "step": 11503 + }, + { + "epoch": 0.8300443738951622, + "grad_norm": 0.12612241506576538, + "learning_rate": 0.00016681772261509597, + "loss": 0.0994, + "step": 11504 + }, + { + "epoch": 0.8301165265702226, + "grad_norm": 0.11162726581096649, + "learning_rate": 0.00016681483619569924, + "loss": 0.142, + "step": 11505 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.12488222122192383, + "learning_rate": 0.0001668119497763025, + "loss": 0.1436, + "step": 11506 + }, + { + "epoch": 0.8302608319203435, + "grad_norm": 0.12031426280736923, + "learning_rate": 0.00016680906335690576, + "loss": 0.1167, + "step": 11507 + }, + { + "epoch": 0.8303329845954038, + "grad_norm": 0.11629743129014969, + "learning_rate": 0.00016680617693750903, + "loss": 0.1639, + "step": 11508 + }, + { + "epoch": 0.8304051372704643, + "grad_norm": 0.13165660202503204, + "learning_rate": 0.0001668032905181123, + "loss": 0.194, + "step": 11509 + }, + { + "epoch": 0.8304772899455247, + "grad_norm": 0.12788671255111694, + "learning_rate": 0.00016680040409871555, + "loss": 0.1567, + "step": 11510 + }, + { + "epoch": 0.8305494426205852, + "grad_norm": 0.11516663432121277, + "learning_rate": 0.00016679751767931881, + "loss": 0.1759, + "step": 11511 + }, + { + "epoch": 0.8306215952956456, + "grad_norm": 0.133749857544899, + "learning_rate": 0.00016679463125992208, + "loss": 0.1222, + "step": 11512 + }, + { + "epoch": 0.830693747970706, + "grad_norm": 0.1398380994796753, + "learning_rate": 0.00016679174484052534, + "loss": 0.1513, + "step": 11513 + }, + { + "epoch": 0.8307659006457664, + "grad_norm": 0.12789134681224823, + "learning_rate": 0.0001667888584211286, + "loss": 0.1611, + "step": 11514 + }, + { + "epoch": 0.8308380533208268, + "grad_norm": 0.1294834166765213, + "learning_rate": 0.00016678597200173184, + "loss": 0.1273, + "step": 11515 + }, + { + "epoch": 0.8309102059958873, + "grad_norm": 0.11569526791572571, + "learning_rate": 0.00016678308558233513, + "loss": 0.1147, + "step": 11516 + }, + { + "epoch": 0.8309823586709477, + "grad_norm": 0.1409730315208435, + "learning_rate": 0.0001667801991629384, + "loss": 0.2149, + "step": 11517 + }, + { + "epoch": 0.8310545113460082, + "grad_norm": 0.10316772013902664, + "learning_rate": 0.00016677731274354165, + "loss": 0.1332, + "step": 11518 + }, + { + "epoch": 0.8311266640210686, + "grad_norm": 0.1244623139500618, + "learning_rate": 0.00016677442632414492, + "loss": 0.1979, + "step": 11519 + }, + { + "epoch": 0.831198816696129, + "grad_norm": 0.11210250854492188, + "learning_rate": 0.00016677153990474815, + "loss": 0.1677, + "step": 11520 + }, + { + "epoch": 0.8312709693711894, + "grad_norm": 0.14468421041965485, + "learning_rate": 0.00016676865348535142, + "loss": 0.1433, + "step": 11521 + }, + { + "epoch": 0.8313431220462498, + "grad_norm": 0.13782362639904022, + "learning_rate": 0.00016676576706595468, + "loss": 0.2326, + "step": 11522 + }, + { + "epoch": 0.8314152747213103, + "grad_norm": 0.12275179475545883, + "learning_rate": 0.00016676288064655797, + "loss": 0.1579, + "step": 11523 + }, + { + "epoch": 0.8314874273963707, + "grad_norm": 0.12560491263866425, + "learning_rate": 0.00016675999422716123, + "loss": 0.1762, + "step": 11524 + }, + { + "epoch": 0.8315595800714312, + "grad_norm": 0.11914512515068054, + "learning_rate": 0.00016675710780776447, + "loss": 0.176, + "step": 11525 + }, + { + "epoch": 0.8316317327464916, + "grad_norm": 0.12210630625486374, + "learning_rate": 0.00016675422138836773, + "loss": 0.1536, + "step": 11526 + }, + { + "epoch": 0.831703885421552, + "grad_norm": 0.15344761312007904, + "learning_rate": 0.000166751334968971, + "loss": 0.1431, + "step": 11527 + }, + { + "epoch": 0.8317760380966124, + "grad_norm": 0.12144190818071365, + "learning_rate": 0.00016674844854957426, + "loss": 0.1842, + "step": 11528 + }, + { + "epoch": 0.8318481907716728, + "grad_norm": 0.12379223108291626, + "learning_rate": 0.00016674556213017752, + "loss": 0.1751, + "step": 11529 + }, + { + "epoch": 0.8319203434467333, + "grad_norm": 0.1101490706205368, + "learning_rate": 0.00016674267571078078, + "loss": 0.1572, + "step": 11530 + }, + { + "epoch": 0.8319924961217937, + "grad_norm": 0.13809335231781006, + "learning_rate": 0.00016673978929138405, + "loss": 0.2072, + "step": 11531 + }, + { + "epoch": 0.8320646487968542, + "grad_norm": 0.11991512775421143, + "learning_rate": 0.0001667369028719873, + "loss": 0.1055, + "step": 11532 + }, + { + "epoch": 0.8321368014719146, + "grad_norm": 0.11620807647705078, + "learning_rate": 0.00016673401645259057, + "loss": 0.1341, + "step": 11533 + }, + { + "epoch": 0.832208954146975, + "grad_norm": 0.16210775077342987, + "learning_rate": 0.00016673113003319383, + "loss": 0.2117, + "step": 11534 + }, + { + "epoch": 0.8322811068220354, + "grad_norm": 0.1260756105184555, + "learning_rate": 0.0001667282436137971, + "loss": 0.1628, + "step": 11535 + }, + { + "epoch": 0.8323532594970958, + "grad_norm": 0.10844235867261887, + "learning_rate": 0.00016672535719440033, + "loss": 0.1438, + "step": 11536 + }, + { + "epoch": 0.8324254121721563, + "grad_norm": 0.11905061453580856, + "learning_rate": 0.00016672247077500362, + "loss": 0.1055, + "step": 11537 + }, + { + "epoch": 0.8324975648472167, + "grad_norm": 0.15732115507125854, + "learning_rate": 0.00016671958435560689, + "loss": 0.1399, + "step": 11538 + }, + { + "epoch": 0.8325697175222772, + "grad_norm": 0.11067745834589005, + "learning_rate": 0.00016671669793621015, + "loss": 0.1822, + "step": 11539 + }, + { + "epoch": 0.8326418701973376, + "grad_norm": 0.14369891583919525, + "learning_rate": 0.0001667138115168134, + "loss": 0.1583, + "step": 11540 + }, + { + "epoch": 0.832714022872398, + "grad_norm": 0.11254069209098816, + "learning_rate": 0.00016671092509741665, + "loss": 0.1024, + "step": 11541 + }, + { + "epoch": 0.8327861755474584, + "grad_norm": 0.13814480602741241, + "learning_rate": 0.0001667080386780199, + "loss": 0.141, + "step": 11542 + }, + { + "epoch": 0.8328583282225188, + "grad_norm": 0.1405809223651886, + "learning_rate": 0.00016670515225862317, + "loss": 0.2155, + "step": 11543 + }, + { + "epoch": 0.8329304808975793, + "grad_norm": 0.13315117359161377, + "learning_rate": 0.00016670226583922646, + "loss": 0.1662, + "step": 11544 + }, + { + "epoch": 0.8330026335726397, + "grad_norm": 0.14672383666038513, + "learning_rate": 0.00016669937941982973, + "loss": 0.1382, + "step": 11545 + }, + { + "epoch": 0.8330747862477001, + "grad_norm": 0.11861515045166016, + "learning_rate": 0.00016669649300043296, + "loss": 0.1208, + "step": 11546 + }, + { + "epoch": 0.8331469389227606, + "grad_norm": 0.14986343681812286, + "learning_rate": 0.00016669360658103623, + "loss": 0.097, + "step": 11547 + }, + { + "epoch": 0.833219091597821, + "grad_norm": 0.13064922392368317, + "learning_rate": 0.0001666907201616395, + "loss": 0.1685, + "step": 11548 + }, + { + "epoch": 0.8332912442728814, + "grad_norm": 0.12011728435754776, + "learning_rate": 0.00016668783374224275, + "loss": 0.1227, + "step": 11549 + }, + { + "epoch": 0.8333633969479418, + "grad_norm": 0.11312943696975708, + "learning_rate": 0.00016668494732284601, + "loss": 0.0844, + "step": 11550 + }, + { + "epoch": 0.8334355496230023, + "grad_norm": 0.12065355479717255, + "learning_rate": 0.00016668206090344928, + "loss": 0.112, + "step": 11551 + }, + { + "epoch": 0.8335077022980627, + "grad_norm": 0.13878248631954193, + "learning_rate": 0.00016667917448405254, + "loss": 0.1753, + "step": 11552 + }, + { + "epoch": 0.8335798549731231, + "grad_norm": 0.16975809633731842, + "learning_rate": 0.0001666762880646558, + "loss": 0.1151, + "step": 11553 + }, + { + "epoch": 0.8336520076481836, + "grad_norm": 0.13543464243412018, + "learning_rate": 0.00016667340164525907, + "loss": 0.1407, + "step": 11554 + }, + { + "epoch": 0.833724160323244, + "grad_norm": 0.12031517177820206, + "learning_rate": 0.00016667051522586233, + "loss": 0.1399, + "step": 11555 + }, + { + "epoch": 0.8337963129983044, + "grad_norm": 0.14035508036613464, + "learning_rate": 0.0001666676288064656, + "loss": 0.1424, + "step": 11556 + }, + { + "epoch": 0.8338684656733648, + "grad_norm": 0.14016559720039368, + "learning_rate": 0.00016666474238706883, + "loss": 0.125, + "step": 11557 + }, + { + "epoch": 0.8339406183484253, + "grad_norm": 0.1287499964237213, + "learning_rate": 0.00016666185596767212, + "loss": 0.1863, + "step": 11558 + }, + { + "epoch": 0.8340127710234857, + "grad_norm": 0.1210627481341362, + "learning_rate": 0.00016665896954827538, + "loss": 0.1541, + "step": 11559 + }, + { + "epoch": 0.8340849236985461, + "grad_norm": 0.14031276106834412, + "learning_rate": 0.00016665608312887864, + "loss": 0.1555, + "step": 11560 + }, + { + "epoch": 0.8341570763736066, + "grad_norm": 0.1310780644416809, + "learning_rate": 0.0001666531967094819, + "loss": 0.1365, + "step": 11561 + }, + { + "epoch": 0.834229229048667, + "grad_norm": 0.14506569504737854, + "learning_rate": 0.00016665031029008514, + "loss": 0.1649, + "step": 11562 + }, + { + "epoch": 0.8343013817237274, + "grad_norm": 0.1415039449930191, + "learning_rate": 0.0001666474238706884, + "loss": 0.1915, + "step": 11563 + }, + { + "epoch": 0.8343735343987878, + "grad_norm": 0.10610788315534592, + "learning_rate": 0.00016664453745129167, + "loss": 0.1244, + "step": 11564 + }, + { + "epoch": 0.8344456870738483, + "grad_norm": 0.10591522604227066, + "learning_rate": 0.00016664165103189496, + "loss": 0.1504, + "step": 11565 + }, + { + "epoch": 0.8345178397489087, + "grad_norm": 0.11307922005653381, + "learning_rate": 0.00016663876461249822, + "loss": 0.1631, + "step": 11566 + }, + { + "epoch": 0.8345899924239691, + "grad_norm": 0.11754350364208221, + "learning_rate": 0.00016663587819310146, + "loss": 0.1145, + "step": 11567 + }, + { + "epoch": 0.8346621450990296, + "grad_norm": 0.1252533346414566, + "learning_rate": 0.00016663299177370472, + "loss": 0.1688, + "step": 11568 + }, + { + "epoch": 0.83473429777409, + "grad_norm": 0.14636370539665222, + "learning_rate": 0.00016663010535430798, + "loss": 0.1494, + "step": 11569 + }, + { + "epoch": 0.8348064504491504, + "grad_norm": 0.1493082493543625, + "learning_rate": 0.00016662721893491125, + "loss": 0.1801, + "step": 11570 + }, + { + "epoch": 0.8348786031242108, + "grad_norm": 0.13885556161403656, + "learning_rate": 0.0001666243325155145, + "loss": 0.1435, + "step": 11571 + }, + { + "epoch": 0.8349507557992712, + "grad_norm": 0.11697115749120712, + "learning_rate": 0.00016662144609611777, + "loss": 0.1327, + "step": 11572 + }, + { + "epoch": 0.8350229084743317, + "grad_norm": 0.12583328783512115, + "learning_rate": 0.00016661855967672103, + "loss": 0.1206, + "step": 11573 + }, + { + "epoch": 0.8350950611493921, + "grad_norm": 0.11333484947681427, + "learning_rate": 0.0001666156732573243, + "loss": 0.1397, + "step": 11574 + }, + { + "epoch": 0.8351672138244526, + "grad_norm": 0.1309988647699356, + "learning_rate": 0.00016661278683792756, + "loss": 0.1506, + "step": 11575 + }, + { + "epoch": 0.8352393664995129, + "grad_norm": 0.13478970527648926, + "learning_rate": 0.00016660990041853082, + "loss": 0.1074, + "step": 11576 + }, + { + "epoch": 0.8353115191745734, + "grad_norm": 0.13597935438156128, + "learning_rate": 0.00016660701399913409, + "loss": 0.1678, + "step": 11577 + }, + { + "epoch": 0.8353836718496338, + "grad_norm": 0.11729282885789871, + "learning_rate": 0.00016660412757973732, + "loss": 0.1036, + "step": 11578 + }, + { + "epoch": 0.8354558245246942, + "grad_norm": 0.12911324203014374, + "learning_rate": 0.0001666012411603406, + "loss": 0.1369, + "step": 11579 + }, + { + "epoch": 0.8355279771997547, + "grad_norm": 0.12044049799442291, + "learning_rate": 0.00016659835474094387, + "loss": 0.1524, + "step": 11580 + }, + { + "epoch": 0.8356001298748151, + "grad_norm": 0.12708896398544312, + "learning_rate": 0.00016659546832154714, + "loss": 0.1234, + "step": 11581 + }, + { + "epoch": 0.8356722825498756, + "grad_norm": 0.15077067911624908, + "learning_rate": 0.0001665925819021504, + "loss": 0.1612, + "step": 11582 + }, + { + "epoch": 0.8357444352249359, + "grad_norm": 0.13292542099952698, + "learning_rate": 0.00016658969548275364, + "loss": 0.1269, + "step": 11583 + }, + { + "epoch": 0.8358165878999964, + "grad_norm": 0.10909565538167953, + "learning_rate": 0.0001665868090633569, + "loss": 0.1611, + "step": 11584 + }, + { + "epoch": 0.8358887405750568, + "grad_norm": 0.11976361274719238, + "learning_rate": 0.00016658392264396016, + "loss": 0.2101, + "step": 11585 + }, + { + "epoch": 0.8359608932501172, + "grad_norm": 0.13796712458133698, + "learning_rate": 0.00016658103622456345, + "loss": 0.1632, + "step": 11586 + }, + { + "epoch": 0.8360330459251777, + "grad_norm": 0.1254863739013672, + "learning_rate": 0.00016657814980516671, + "loss": 0.2033, + "step": 11587 + }, + { + "epoch": 0.8361051986002381, + "grad_norm": 0.11761124432086945, + "learning_rate": 0.00016657526338576995, + "loss": 0.1263, + "step": 11588 + }, + { + "epoch": 0.8361773512752986, + "grad_norm": 0.11324866116046906, + "learning_rate": 0.00016657237696637321, + "loss": 0.1696, + "step": 11589 + }, + { + "epoch": 0.8362495039503589, + "grad_norm": 0.11937769502401352, + "learning_rate": 0.00016656949054697648, + "loss": 0.1739, + "step": 11590 + }, + { + "epoch": 0.8363216566254194, + "grad_norm": 0.12692390382289886, + "learning_rate": 0.00016656660412757974, + "loss": 0.1929, + "step": 11591 + }, + { + "epoch": 0.8363938093004798, + "grad_norm": 0.11676450073719025, + "learning_rate": 0.000166563717708183, + "loss": 0.0951, + "step": 11592 + }, + { + "epoch": 0.8364659619755402, + "grad_norm": 0.1368885040283203, + "learning_rate": 0.00016656083128878627, + "loss": 0.0942, + "step": 11593 + }, + { + "epoch": 0.8365381146506007, + "grad_norm": 0.11106109619140625, + "learning_rate": 0.00016655794486938953, + "loss": 0.1062, + "step": 11594 + }, + { + "epoch": 0.8366102673256611, + "grad_norm": 0.13033699989318848, + "learning_rate": 0.0001665550584499928, + "loss": 0.1791, + "step": 11595 + }, + { + "epoch": 0.8366824200007216, + "grad_norm": 0.12796767055988312, + "learning_rate": 0.00016655217203059605, + "loss": 0.1027, + "step": 11596 + }, + { + "epoch": 0.8367545726757819, + "grad_norm": 0.13466742634773254, + "learning_rate": 0.00016654928561119932, + "loss": 0.1332, + "step": 11597 + }, + { + "epoch": 0.8368267253508423, + "grad_norm": 0.12785984575748444, + "learning_rate": 0.00016654639919180258, + "loss": 0.1346, + "step": 11598 + }, + { + "epoch": 0.8368988780259028, + "grad_norm": 0.12983115017414093, + "learning_rate": 0.00016654351277240584, + "loss": 0.1502, + "step": 11599 + }, + { + "epoch": 0.8369710307009632, + "grad_norm": 0.14171342551708221, + "learning_rate": 0.0001665406263530091, + "loss": 0.1731, + "step": 11600 + }, + { + "epoch": 0.8370431833760237, + "grad_norm": 0.10917267203330994, + "learning_rate": 0.00016653773993361237, + "loss": 0.1408, + "step": 11601 + }, + { + "epoch": 0.8371153360510841, + "grad_norm": 0.13165688514709473, + "learning_rate": 0.00016653485351421563, + "loss": 0.1488, + "step": 11602 + }, + { + "epoch": 0.8371874887261446, + "grad_norm": 0.12978290021419525, + "learning_rate": 0.0001665319670948189, + "loss": 0.1539, + "step": 11603 + }, + { + "epoch": 0.8372596414012049, + "grad_norm": 0.1484910249710083, + "learning_rate": 0.00016652908067542216, + "loss": 0.163, + "step": 11604 + }, + { + "epoch": 0.8373317940762653, + "grad_norm": 0.11736516654491425, + "learning_rate": 0.0001665261942560254, + "loss": 0.1658, + "step": 11605 + }, + { + "epoch": 0.8374039467513258, + "grad_norm": 0.14045096933841705, + "learning_rate": 0.00016652330783662866, + "loss": 0.1278, + "step": 11606 + }, + { + "epoch": 0.8374760994263862, + "grad_norm": 0.13117900490760803, + "learning_rate": 0.00016652042141723195, + "loss": 0.1536, + "step": 11607 + }, + { + "epoch": 0.8375482521014467, + "grad_norm": 0.1257036328315735, + "learning_rate": 0.0001665175349978352, + "loss": 0.1652, + "step": 11608 + }, + { + "epoch": 0.8376204047765071, + "grad_norm": 0.12943686544895172, + "learning_rate": 0.00016651464857843847, + "loss": 0.1552, + "step": 11609 + }, + { + "epoch": 0.8376925574515676, + "grad_norm": 0.13568373024463654, + "learning_rate": 0.0001665117621590417, + "loss": 0.1832, + "step": 11610 + }, + { + "epoch": 0.8377647101266279, + "grad_norm": 0.13962949812412262, + "learning_rate": 0.00016650887573964497, + "loss": 0.1481, + "step": 11611 + }, + { + "epoch": 0.8378368628016883, + "grad_norm": 0.14716729521751404, + "learning_rate": 0.00016650598932024823, + "loss": 0.1601, + "step": 11612 + }, + { + "epoch": 0.8379090154767488, + "grad_norm": 0.13211050629615784, + "learning_rate": 0.0001665031029008515, + "loss": 0.1902, + "step": 11613 + }, + { + "epoch": 0.8379811681518092, + "grad_norm": 0.15353256464004517, + "learning_rate": 0.0001665002164814548, + "loss": 0.1344, + "step": 11614 + }, + { + "epoch": 0.8380533208268697, + "grad_norm": 0.12700878083705902, + "learning_rate": 0.00016649733006205802, + "loss": 0.1139, + "step": 11615 + }, + { + "epoch": 0.8381254735019301, + "grad_norm": 0.11437607556581497, + "learning_rate": 0.00016649444364266129, + "loss": 0.1528, + "step": 11616 + }, + { + "epoch": 0.8381976261769906, + "grad_norm": 0.12418381124734879, + "learning_rate": 0.00016649155722326455, + "loss": 0.1254, + "step": 11617 + }, + { + "epoch": 0.8382697788520509, + "grad_norm": 0.1047118529677391, + "learning_rate": 0.0001664886708038678, + "loss": 0.0956, + "step": 11618 + }, + { + "epoch": 0.8383419315271113, + "grad_norm": 0.1184152364730835, + "learning_rate": 0.00016648578438447107, + "loss": 0.1315, + "step": 11619 + }, + { + "epoch": 0.8384140842021718, + "grad_norm": 0.10887156426906586, + "learning_rate": 0.00016648289796507434, + "loss": 0.1824, + "step": 11620 + }, + { + "epoch": 0.8384862368772322, + "grad_norm": 0.14598292112350464, + "learning_rate": 0.0001664800115456776, + "loss": 0.1581, + "step": 11621 + }, + { + "epoch": 0.8385583895522927, + "grad_norm": 0.1101701483130455, + "learning_rate": 0.00016647712512628086, + "loss": 0.1703, + "step": 11622 + }, + { + "epoch": 0.8386305422273531, + "grad_norm": 0.0960790365934372, + "learning_rate": 0.00016647423870688413, + "loss": 0.1462, + "step": 11623 + }, + { + "epoch": 0.8387026949024136, + "grad_norm": 0.12851935625076294, + "learning_rate": 0.0001664713522874874, + "loss": 0.1799, + "step": 11624 + }, + { + "epoch": 0.8387748475774739, + "grad_norm": 0.11664525419473648, + "learning_rate": 0.00016646846586809065, + "loss": 0.1433, + "step": 11625 + }, + { + "epoch": 0.8388470002525343, + "grad_norm": 0.13805916905403137, + "learning_rate": 0.0001664655794486939, + "loss": 0.1731, + "step": 11626 + }, + { + "epoch": 0.8389191529275948, + "grad_norm": 0.12190663814544678, + "learning_rate": 0.00016646269302929715, + "loss": 0.1282, + "step": 11627 + }, + { + "epoch": 0.8389913056026552, + "grad_norm": 0.13862162828445435, + "learning_rate": 0.00016645980660990044, + "loss": 0.1863, + "step": 11628 + }, + { + "epoch": 0.8390634582777157, + "grad_norm": 0.14046546816825867, + "learning_rate": 0.0001664569201905037, + "loss": 0.1539, + "step": 11629 + }, + { + "epoch": 0.8391356109527761, + "grad_norm": 0.13328999280929565, + "learning_rate": 0.00016645403377110697, + "loss": 0.1191, + "step": 11630 + }, + { + "epoch": 0.8392077636278366, + "grad_norm": 0.13714025914669037, + "learning_rate": 0.0001664511473517102, + "loss": 0.1423, + "step": 11631 + }, + { + "epoch": 0.8392799163028969, + "grad_norm": 0.16360893845558167, + "learning_rate": 0.00016644826093231347, + "loss": 0.2118, + "step": 11632 + }, + { + "epoch": 0.8393520689779573, + "grad_norm": 0.13132081925868988, + "learning_rate": 0.00016644537451291673, + "loss": 0.17, + "step": 11633 + }, + { + "epoch": 0.8394242216530178, + "grad_norm": 0.12430932372808456, + "learning_rate": 0.00016644248809352, + "loss": 0.2066, + "step": 11634 + }, + { + "epoch": 0.8394963743280782, + "grad_norm": 0.12707607448101044, + "learning_rate": 0.00016643960167412328, + "loss": 0.1372, + "step": 11635 + }, + { + "epoch": 0.8395685270031387, + "grad_norm": 0.14341352880001068, + "learning_rate": 0.00016643671525472652, + "loss": 0.1295, + "step": 11636 + }, + { + "epoch": 0.8396406796781991, + "grad_norm": 0.1361696720123291, + "learning_rate": 0.00016643382883532978, + "loss": 0.1591, + "step": 11637 + }, + { + "epoch": 0.8397128323532594, + "grad_norm": 0.1292620152235031, + "learning_rate": 0.00016643094241593304, + "loss": 0.159, + "step": 11638 + }, + { + "epoch": 0.8397849850283199, + "grad_norm": 0.11830408871173859, + "learning_rate": 0.0001664280559965363, + "loss": 0.1404, + "step": 11639 + }, + { + "epoch": 0.8398571377033803, + "grad_norm": 0.12927722930908203, + "learning_rate": 0.00016642516957713957, + "loss": 0.1789, + "step": 11640 + }, + { + "epoch": 0.8399292903784408, + "grad_norm": 0.14580398797988892, + "learning_rate": 0.00016642228315774283, + "loss": 0.1429, + "step": 11641 + }, + { + "epoch": 0.8400014430535012, + "grad_norm": 0.09831997752189636, + "learning_rate": 0.0001664193967383461, + "loss": 0.1347, + "step": 11642 + }, + { + "epoch": 0.8400735957285617, + "grad_norm": 0.11972676217556, + "learning_rate": 0.00016641651031894936, + "loss": 0.1382, + "step": 11643 + }, + { + "epoch": 0.8401457484036221, + "grad_norm": 0.11414449661970139, + "learning_rate": 0.00016641362389955262, + "loss": 0.163, + "step": 11644 + }, + { + "epoch": 0.8402179010786824, + "grad_norm": 0.1057526171207428, + "learning_rate": 0.00016641073748015588, + "loss": 0.0997, + "step": 11645 + }, + { + "epoch": 0.8402900537537429, + "grad_norm": 0.13895364105701447, + "learning_rate": 0.00016640785106075915, + "loss": 0.137, + "step": 11646 + }, + { + "epoch": 0.8403622064288033, + "grad_norm": 0.10537075996398926, + "learning_rate": 0.00016640496464136238, + "loss": 0.1089, + "step": 11647 + }, + { + "epoch": 0.8404343591038638, + "grad_norm": 0.12381959706544876, + "learning_rate": 0.00016640207822196564, + "loss": 0.1533, + "step": 11648 + }, + { + "epoch": 0.8405065117789242, + "grad_norm": 0.1299159824848175, + "learning_rate": 0.00016639919180256893, + "loss": 0.1418, + "step": 11649 + }, + { + "epoch": 0.8405786644539847, + "grad_norm": 0.14879825711250305, + "learning_rate": 0.0001663963053831722, + "loss": 0.1463, + "step": 11650 + }, + { + "epoch": 0.8406508171290451, + "grad_norm": 0.14904411137104034, + "learning_rate": 0.00016639341896377546, + "loss": 0.1493, + "step": 11651 + }, + { + "epoch": 0.8407229698041054, + "grad_norm": 0.14877448976039886, + "learning_rate": 0.0001663905325443787, + "loss": 0.1724, + "step": 11652 + }, + { + "epoch": 0.8407951224791659, + "grad_norm": 0.11669496446847916, + "learning_rate": 0.00016638764612498196, + "loss": 0.1217, + "step": 11653 + }, + { + "epoch": 0.8408672751542263, + "grad_norm": 0.13598738610744476, + "learning_rate": 0.00016638475970558522, + "loss": 0.1389, + "step": 11654 + }, + { + "epoch": 0.8409394278292868, + "grad_norm": 0.12701286375522614, + "learning_rate": 0.00016638187328618849, + "loss": 0.1995, + "step": 11655 + }, + { + "epoch": 0.8410115805043472, + "grad_norm": 0.13989871740341187, + "learning_rate": 0.00016637898686679175, + "loss": 0.119, + "step": 11656 + }, + { + "epoch": 0.8410837331794077, + "grad_norm": 0.12764325737953186, + "learning_rate": 0.000166376100447395, + "loss": 0.1129, + "step": 11657 + }, + { + "epoch": 0.8411558858544681, + "grad_norm": 0.1132327988743782, + "learning_rate": 0.00016637321402799827, + "loss": 0.187, + "step": 11658 + }, + { + "epoch": 0.8412280385295284, + "grad_norm": 0.13291585445404053, + "learning_rate": 0.00016637032760860154, + "loss": 0.1254, + "step": 11659 + }, + { + "epoch": 0.8413001912045889, + "grad_norm": 0.11555839329957962, + "learning_rate": 0.0001663674411892048, + "loss": 0.1212, + "step": 11660 + }, + { + "epoch": 0.8413723438796493, + "grad_norm": 0.15152710676193237, + "learning_rate": 0.00016636455476980806, + "loss": 0.1276, + "step": 11661 + }, + { + "epoch": 0.8414444965547098, + "grad_norm": 0.14820121228694916, + "learning_rate": 0.00016636166835041133, + "loss": 0.143, + "step": 11662 + }, + { + "epoch": 0.8415166492297702, + "grad_norm": 0.1499357670545578, + "learning_rate": 0.00016635878193101456, + "loss": 0.1566, + "step": 11663 + }, + { + "epoch": 0.8415888019048307, + "grad_norm": 0.16589221358299255, + "learning_rate": 0.00016635589551161785, + "loss": 0.1108, + "step": 11664 + }, + { + "epoch": 0.8416609545798911, + "grad_norm": 0.14025495946407318, + "learning_rate": 0.00016635300909222111, + "loss": 0.1645, + "step": 11665 + }, + { + "epoch": 0.8417331072549514, + "grad_norm": 0.10733349621295929, + "learning_rate": 0.00016635012267282438, + "loss": 0.1862, + "step": 11666 + }, + { + "epoch": 0.8418052599300119, + "grad_norm": 0.11976485699415207, + "learning_rate": 0.00016634723625342764, + "loss": 0.1642, + "step": 11667 + }, + { + "epoch": 0.8418774126050723, + "grad_norm": 0.1251676231622696, + "learning_rate": 0.00016634434983403088, + "loss": 0.1462, + "step": 11668 + }, + { + "epoch": 0.8419495652801328, + "grad_norm": 0.14736083149909973, + "learning_rate": 0.00016634146341463414, + "loss": 0.1838, + "step": 11669 + }, + { + "epoch": 0.8420217179551932, + "grad_norm": 0.11634893715381622, + "learning_rate": 0.0001663385769952374, + "loss": 0.1775, + "step": 11670 + }, + { + "epoch": 0.8420938706302536, + "grad_norm": 0.12968191504478455, + "learning_rate": 0.0001663356905758407, + "loss": 0.1387, + "step": 11671 + }, + { + "epoch": 0.8421660233053141, + "grad_norm": 0.12363044917583466, + "learning_rate": 0.00016633280415644395, + "loss": 0.1612, + "step": 11672 + }, + { + "epoch": 0.8422381759803744, + "grad_norm": 0.11333352327346802, + "learning_rate": 0.0001663299177370472, + "loss": 0.1801, + "step": 11673 + }, + { + "epoch": 0.8423103286554349, + "grad_norm": 0.11646721512079239, + "learning_rate": 0.00016632703131765045, + "loss": 0.1249, + "step": 11674 + }, + { + "epoch": 0.8423824813304953, + "grad_norm": 0.13021302223205566, + "learning_rate": 0.00016632414489825372, + "loss": 0.1557, + "step": 11675 + }, + { + "epoch": 0.8424546340055558, + "grad_norm": 0.13064372539520264, + "learning_rate": 0.00016632125847885698, + "loss": 0.1488, + "step": 11676 + }, + { + "epoch": 0.8425267866806162, + "grad_norm": 0.11225259304046631, + "learning_rate": 0.00016631837205946024, + "loss": 0.1922, + "step": 11677 + }, + { + "epoch": 0.8425989393556766, + "grad_norm": 0.1373172402381897, + "learning_rate": 0.0001663154856400635, + "loss": 0.1163, + "step": 11678 + }, + { + "epoch": 0.8426710920307371, + "grad_norm": 0.14084011316299438, + "learning_rate": 0.00016631259922066677, + "loss": 0.1572, + "step": 11679 + }, + { + "epoch": 0.8427432447057974, + "grad_norm": 0.12305203080177307, + "learning_rate": 0.00016630971280127003, + "loss": 0.159, + "step": 11680 + }, + { + "epoch": 0.8428153973808579, + "grad_norm": 0.11790595948696136, + "learning_rate": 0.0001663068263818733, + "loss": 0.2064, + "step": 11681 + }, + { + "epoch": 0.8428875500559183, + "grad_norm": 0.15226507186889648, + "learning_rate": 0.00016630393996247656, + "loss": 0.1463, + "step": 11682 + }, + { + "epoch": 0.8429597027309788, + "grad_norm": 0.12697234749794006, + "learning_rate": 0.00016630105354307982, + "loss": 0.1668, + "step": 11683 + }, + { + "epoch": 0.8430318554060392, + "grad_norm": 0.12504594027996063, + "learning_rate": 0.00016629816712368306, + "loss": 0.1444, + "step": 11684 + }, + { + "epoch": 0.8431040080810996, + "grad_norm": 0.0949978157877922, + "learning_rate": 0.00016629528070428635, + "loss": 0.1314, + "step": 11685 + }, + { + "epoch": 0.8431761607561601, + "grad_norm": 0.11104533821344376, + "learning_rate": 0.0001662923942848896, + "loss": 0.1454, + "step": 11686 + }, + { + "epoch": 0.8432483134312204, + "grad_norm": 0.1141478419303894, + "learning_rate": 0.00016628950786549287, + "loss": 0.0956, + "step": 11687 + }, + { + "epoch": 0.8433204661062809, + "grad_norm": 0.14451053738594055, + "learning_rate": 0.00016628662144609613, + "loss": 0.1338, + "step": 11688 + }, + { + "epoch": 0.8433926187813413, + "grad_norm": 0.1483115404844284, + "learning_rate": 0.00016628373502669937, + "loss": 0.1578, + "step": 11689 + }, + { + "epoch": 0.8434647714564018, + "grad_norm": 0.12317361682653427, + "learning_rate": 0.00016628084860730263, + "loss": 0.1108, + "step": 11690 + }, + { + "epoch": 0.8435369241314622, + "grad_norm": 0.15117180347442627, + "learning_rate": 0.0001662779621879059, + "loss": 0.1499, + "step": 11691 + }, + { + "epoch": 0.8436090768065226, + "grad_norm": 0.11195392906665802, + "learning_rate": 0.00016627507576850919, + "loss": 0.1197, + "step": 11692 + }, + { + "epoch": 0.8436812294815831, + "grad_norm": 0.10647310316562653, + "learning_rate": 0.00016627218934911245, + "loss": 0.097, + "step": 11693 + }, + { + "epoch": 0.8437533821566434, + "grad_norm": 0.11704418808221817, + "learning_rate": 0.00016626930292971568, + "loss": 0.161, + "step": 11694 + }, + { + "epoch": 0.8438255348317039, + "grad_norm": 0.12431292235851288, + "learning_rate": 0.00016626641651031895, + "loss": 0.1694, + "step": 11695 + }, + { + "epoch": 0.8438976875067643, + "grad_norm": 0.12180831283330917, + "learning_rate": 0.0001662635300909222, + "loss": 0.1201, + "step": 11696 + }, + { + "epoch": 0.8439698401818247, + "grad_norm": 0.1429813951253891, + "learning_rate": 0.00016626064367152547, + "loss": 0.1343, + "step": 11697 + }, + { + "epoch": 0.8440419928568852, + "grad_norm": 0.1192222610116005, + "learning_rate": 0.00016625775725212874, + "loss": 0.0985, + "step": 11698 + }, + { + "epoch": 0.8441141455319456, + "grad_norm": 0.10998225212097168, + "learning_rate": 0.000166254870832732, + "loss": 0.1364, + "step": 11699 + }, + { + "epoch": 0.844186298207006, + "grad_norm": 0.12994202971458435, + "learning_rate": 0.00016625198441333526, + "loss": 0.1249, + "step": 11700 + }, + { + "epoch": 0.8442584508820664, + "grad_norm": 0.11415591090917587, + "learning_rate": 0.00016624909799393853, + "loss": 0.1583, + "step": 11701 + }, + { + "epoch": 0.8443306035571269, + "grad_norm": 0.14631566405296326, + "learning_rate": 0.0001662462115745418, + "loss": 0.1399, + "step": 11702 + }, + { + "epoch": 0.8444027562321873, + "grad_norm": 0.1491636484861374, + "learning_rate": 0.00016624332515514505, + "loss": 0.1385, + "step": 11703 + }, + { + "epoch": 0.8444749089072477, + "grad_norm": 0.12752553820610046, + "learning_rate": 0.00016624043873574831, + "loss": 0.1757, + "step": 11704 + }, + { + "epoch": 0.8445470615823082, + "grad_norm": 0.12609128654003143, + "learning_rate": 0.00016623755231635158, + "loss": 0.1507, + "step": 11705 + }, + { + "epoch": 0.8446192142573686, + "grad_norm": 0.10666986554861069, + "learning_rate": 0.00016623466589695484, + "loss": 0.1555, + "step": 11706 + }, + { + "epoch": 0.844691366932429, + "grad_norm": 0.11576931178569794, + "learning_rate": 0.0001662317794775581, + "loss": 0.1805, + "step": 11707 + }, + { + "epoch": 0.8447635196074894, + "grad_norm": 0.12209343910217285, + "learning_rate": 0.00016622889305816137, + "loss": 0.1064, + "step": 11708 + }, + { + "epoch": 0.8448356722825499, + "grad_norm": 0.10738017410039902, + "learning_rate": 0.00016622600663876463, + "loss": 0.1554, + "step": 11709 + }, + { + "epoch": 0.8449078249576103, + "grad_norm": 0.14066986739635468, + "learning_rate": 0.0001662231202193679, + "loss": 0.1137, + "step": 11710 + }, + { + "epoch": 0.8449799776326707, + "grad_norm": 0.14349329471588135, + "learning_rate": 0.00016622023379997113, + "loss": 0.1877, + "step": 11711 + }, + { + "epoch": 0.8450521303077312, + "grad_norm": 0.12119587510824203, + "learning_rate": 0.0001662173473805744, + "loss": 0.185, + "step": 11712 + }, + { + "epoch": 0.8451242829827916, + "grad_norm": 0.12557916343212128, + "learning_rate": 0.00016621446096117768, + "loss": 0.1786, + "step": 11713 + }, + { + "epoch": 0.845196435657852, + "grad_norm": 0.12383130192756653, + "learning_rate": 0.00016621157454178094, + "loss": 0.1696, + "step": 11714 + }, + { + "epoch": 0.8452685883329124, + "grad_norm": 0.10766246169805527, + "learning_rate": 0.0001662086881223842, + "loss": 0.1514, + "step": 11715 + }, + { + "epoch": 0.8453407410079729, + "grad_norm": 0.14209994673728943, + "learning_rate": 0.00016620580170298744, + "loss": 0.1789, + "step": 11716 + }, + { + "epoch": 0.8454128936830333, + "grad_norm": 0.11148392409086227, + "learning_rate": 0.0001662029152835907, + "loss": 0.1402, + "step": 11717 + }, + { + "epoch": 0.8454850463580937, + "grad_norm": 0.1409093290567398, + "learning_rate": 0.00016620002886419397, + "loss": 0.1454, + "step": 11718 + }, + { + "epoch": 0.8455571990331542, + "grad_norm": 0.13453932106494904, + "learning_rate": 0.00016619714244479723, + "loss": 0.1292, + "step": 11719 + }, + { + "epoch": 0.8456293517082146, + "grad_norm": 0.13617649674415588, + "learning_rate": 0.00016619425602540052, + "loss": 0.1129, + "step": 11720 + }, + { + "epoch": 0.845701504383275, + "grad_norm": 0.12087401002645493, + "learning_rate": 0.00016619136960600376, + "loss": 0.1551, + "step": 11721 + }, + { + "epoch": 0.8457736570583354, + "grad_norm": 0.1239386722445488, + "learning_rate": 0.00016618848318660702, + "loss": 0.1647, + "step": 11722 + }, + { + "epoch": 0.8458458097333958, + "grad_norm": 0.17587853968143463, + "learning_rate": 0.00016618559676721028, + "loss": 0.2084, + "step": 11723 + }, + { + "epoch": 0.8459179624084563, + "grad_norm": 0.15436212718486786, + "learning_rate": 0.00016618271034781355, + "loss": 0.1165, + "step": 11724 + }, + { + "epoch": 0.8459901150835167, + "grad_norm": 0.13399550318717957, + "learning_rate": 0.0001661798239284168, + "loss": 0.1525, + "step": 11725 + }, + { + "epoch": 0.8460622677585772, + "grad_norm": 0.1814533770084381, + "learning_rate": 0.00016617693750902007, + "loss": 0.1584, + "step": 11726 + }, + { + "epoch": 0.8461344204336376, + "grad_norm": 0.11515140533447266, + "learning_rate": 0.00016617405108962333, + "loss": 0.1214, + "step": 11727 + }, + { + "epoch": 0.846206573108698, + "grad_norm": 0.11948894709348679, + "learning_rate": 0.0001661711646702266, + "loss": 0.2008, + "step": 11728 + }, + { + "epoch": 0.8462787257837584, + "grad_norm": 0.13980422914028168, + "learning_rate": 0.00016616827825082986, + "loss": 0.1524, + "step": 11729 + }, + { + "epoch": 0.8463508784588188, + "grad_norm": 0.10919107496738434, + "learning_rate": 0.00016616539183143312, + "loss": 0.1528, + "step": 11730 + }, + { + "epoch": 0.8464230311338793, + "grad_norm": 0.12131740152835846, + "learning_rate": 0.00016616250541203639, + "loss": 0.1795, + "step": 11731 + }, + { + "epoch": 0.8464951838089397, + "grad_norm": 0.1246170774102211, + "learning_rate": 0.00016615961899263962, + "loss": 0.1201, + "step": 11732 + }, + { + "epoch": 0.8465673364840002, + "grad_norm": 0.11502742022275925, + "learning_rate": 0.00016615673257324288, + "loss": 0.1408, + "step": 11733 + }, + { + "epoch": 0.8466394891590606, + "grad_norm": 0.14159716665744781, + "learning_rate": 0.00016615384615384617, + "loss": 0.1671, + "step": 11734 + }, + { + "epoch": 0.846711641834121, + "grad_norm": 0.12725681066513062, + "learning_rate": 0.00016615095973444944, + "loss": 0.1122, + "step": 11735 + }, + { + "epoch": 0.8467837945091814, + "grad_norm": 0.15508094429969788, + "learning_rate": 0.0001661480733150527, + "loss": 0.1269, + "step": 11736 + }, + { + "epoch": 0.8468559471842418, + "grad_norm": 0.15546204149723053, + "learning_rate": 0.00016614518689565594, + "loss": 0.1178, + "step": 11737 + }, + { + "epoch": 0.8469280998593023, + "grad_norm": 0.12115608155727386, + "learning_rate": 0.0001661423004762592, + "loss": 0.1976, + "step": 11738 + }, + { + "epoch": 0.8470002525343627, + "grad_norm": 0.14080692827701569, + "learning_rate": 0.00016613941405686246, + "loss": 0.1395, + "step": 11739 + }, + { + "epoch": 0.8470724052094232, + "grad_norm": 0.125188946723938, + "learning_rate": 0.00016613652763746573, + "loss": 0.0898, + "step": 11740 + }, + { + "epoch": 0.8471445578844836, + "grad_norm": 0.1310095638036728, + "learning_rate": 0.00016613364121806902, + "loss": 0.1495, + "step": 11741 + }, + { + "epoch": 0.847216710559544, + "grad_norm": 0.13303005695343018, + "learning_rate": 0.00016613075479867225, + "loss": 0.1362, + "step": 11742 + }, + { + "epoch": 0.8472888632346044, + "grad_norm": 0.12238302826881409, + "learning_rate": 0.00016612786837927551, + "loss": 0.1302, + "step": 11743 + }, + { + "epoch": 0.8473610159096648, + "grad_norm": 0.1152992770075798, + "learning_rate": 0.00016612498195987878, + "loss": 0.1432, + "step": 11744 + }, + { + "epoch": 0.8474331685847253, + "grad_norm": 0.12224925309419632, + "learning_rate": 0.00016612209554048204, + "loss": 0.1798, + "step": 11745 + }, + { + "epoch": 0.8475053212597857, + "grad_norm": 0.13379549980163574, + "learning_rate": 0.0001661192091210853, + "loss": 0.1379, + "step": 11746 + }, + { + "epoch": 0.8475774739348462, + "grad_norm": 0.12874086201190948, + "learning_rate": 0.00016611632270168857, + "loss": 0.1613, + "step": 11747 + }, + { + "epoch": 0.8476496266099066, + "grad_norm": 0.1376962661743164, + "learning_rate": 0.00016611343628229183, + "loss": 0.1632, + "step": 11748 + }, + { + "epoch": 0.847721779284967, + "grad_norm": 0.1415809541940689, + "learning_rate": 0.0001661105498628951, + "loss": 0.1397, + "step": 11749 + }, + { + "epoch": 0.8477939319600274, + "grad_norm": 0.12963339686393738, + "learning_rate": 0.00016610766344349835, + "loss": 0.1394, + "step": 11750 + }, + { + "epoch": 0.8478660846350878, + "grad_norm": 0.14303992688655853, + "learning_rate": 0.00016610477702410162, + "loss": 0.1652, + "step": 11751 + }, + { + "epoch": 0.8479382373101483, + "grad_norm": 0.14118750393390656, + "learning_rate": 0.00016610189060470488, + "loss": 0.1125, + "step": 11752 + }, + { + "epoch": 0.8480103899852087, + "grad_norm": 0.1844176948070526, + "learning_rate": 0.00016609900418530812, + "loss": 0.1899, + "step": 11753 + }, + { + "epoch": 0.8480825426602692, + "grad_norm": 0.15066631138324738, + "learning_rate": 0.00016609611776591138, + "loss": 0.2096, + "step": 11754 + }, + { + "epoch": 0.8481546953353296, + "grad_norm": 0.15434785187244415, + "learning_rate": 0.00016609323134651467, + "loss": 0.0953, + "step": 11755 + }, + { + "epoch": 0.8482268480103899, + "grad_norm": 0.1429002583026886, + "learning_rate": 0.00016609034492711793, + "loss": 0.1368, + "step": 11756 + }, + { + "epoch": 0.8482990006854504, + "grad_norm": 0.1369299292564392, + "learning_rate": 0.0001660874585077212, + "loss": 0.1339, + "step": 11757 + }, + { + "epoch": 0.8483711533605108, + "grad_norm": 0.11154896765947342, + "learning_rate": 0.00016608457208832443, + "loss": 0.1767, + "step": 11758 + }, + { + "epoch": 0.8484433060355713, + "grad_norm": 0.11356501281261444, + "learning_rate": 0.0001660816856689277, + "loss": 0.1901, + "step": 11759 + }, + { + "epoch": 0.8485154587106317, + "grad_norm": 0.12390024214982986, + "learning_rate": 0.00016607879924953096, + "loss": 0.175, + "step": 11760 + }, + { + "epoch": 0.8485876113856922, + "grad_norm": 0.1285458654165268, + "learning_rate": 0.00016607591283013422, + "loss": 0.1187, + "step": 11761 + }, + { + "epoch": 0.8486597640607525, + "grad_norm": 0.15921713411808014, + "learning_rate": 0.0001660730264107375, + "loss": 0.158, + "step": 11762 + }, + { + "epoch": 0.8487319167358129, + "grad_norm": 0.12651832401752472, + "learning_rate": 0.00016607013999134075, + "loss": 0.1487, + "step": 11763 + }, + { + "epoch": 0.8488040694108734, + "grad_norm": 0.2776840031147003, + "learning_rate": 0.000166067253571944, + "loss": 0.171, + "step": 11764 + }, + { + "epoch": 0.8488762220859338, + "grad_norm": 0.132423996925354, + "learning_rate": 0.00016606436715254727, + "loss": 0.1564, + "step": 11765 + }, + { + "epoch": 0.8489483747609943, + "grad_norm": 0.17471779882907867, + "learning_rate": 0.00016606148073315053, + "loss": 0.161, + "step": 11766 + }, + { + "epoch": 0.8490205274360547, + "grad_norm": 0.11781930923461914, + "learning_rate": 0.0001660585943137538, + "loss": 0.1571, + "step": 11767 + }, + { + "epoch": 0.8490926801111152, + "grad_norm": 0.15253686904907227, + "learning_rate": 0.00016605570789435706, + "loss": 0.1931, + "step": 11768 + }, + { + "epoch": 0.8491648327861755, + "grad_norm": 0.13133996725082397, + "learning_rate": 0.00016605282147496032, + "loss": 0.1382, + "step": 11769 + }, + { + "epoch": 0.8492369854612359, + "grad_norm": 0.11417736113071442, + "learning_rate": 0.00016604993505556359, + "loss": 0.1565, + "step": 11770 + }, + { + "epoch": 0.8493091381362964, + "grad_norm": 0.12417210638523102, + "learning_rate": 0.00016604704863616685, + "loss": 0.1294, + "step": 11771 + }, + { + "epoch": 0.8493812908113568, + "grad_norm": 0.1240205466747284, + "learning_rate": 0.0001660441622167701, + "loss": 0.1329, + "step": 11772 + }, + { + "epoch": 0.8494534434864173, + "grad_norm": 0.13930866122245789, + "learning_rate": 0.00016604127579737337, + "loss": 0.1656, + "step": 11773 + }, + { + "epoch": 0.8495255961614777, + "grad_norm": 0.10060317814350128, + "learning_rate": 0.0001660383893779766, + "loss": 0.1443, + "step": 11774 + }, + { + "epoch": 0.8495977488365382, + "grad_norm": 0.15756677091121674, + "learning_rate": 0.00016603550295857987, + "loss": 0.156, + "step": 11775 + }, + { + "epoch": 0.8496699015115985, + "grad_norm": 0.13482096791267395, + "learning_rate": 0.00016603261653918316, + "loss": 0.1377, + "step": 11776 + }, + { + "epoch": 0.8497420541866589, + "grad_norm": 0.16339333355426788, + "learning_rate": 0.00016602973011978643, + "loss": 0.1683, + "step": 11777 + }, + { + "epoch": 0.8498142068617194, + "grad_norm": 0.16918936371803284, + "learning_rate": 0.0001660268437003897, + "loss": 0.133, + "step": 11778 + }, + { + "epoch": 0.8498863595367798, + "grad_norm": 0.14417551457881927, + "learning_rate": 0.00016602395728099292, + "loss": 0.1369, + "step": 11779 + }, + { + "epoch": 0.8499585122118403, + "grad_norm": 0.1408488005399704, + "learning_rate": 0.0001660210708615962, + "loss": 0.196, + "step": 11780 + }, + { + "epoch": 0.8500306648869007, + "grad_norm": 0.12176976352930069, + "learning_rate": 0.00016601818444219945, + "loss": 0.1533, + "step": 11781 + }, + { + "epoch": 0.8501028175619612, + "grad_norm": 0.17833276093006134, + "learning_rate": 0.00016601529802280271, + "loss": 0.1841, + "step": 11782 + }, + { + "epoch": 0.8501749702370215, + "grad_norm": 0.11696712672710419, + "learning_rate": 0.000166012411603406, + "loss": 0.1609, + "step": 11783 + }, + { + "epoch": 0.8502471229120819, + "grad_norm": 0.12877492606639862, + "learning_rate": 0.00016600952518400924, + "loss": 0.1472, + "step": 11784 + }, + { + "epoch": 0.8503192755871424, + "grad_norm": 0.12504887580871582, + "learning_rate": 0.0001660066387646125, + "loss": 0.1627, + "step": 11785 + }, + { + "epoch": 0.8503914282622028, + "grad_norm": 0.15413124859333038, + "learning_rate": 0.00016600375234521577, + "loss": 0.1505, + "step": 11786 + }, + { + "epoch": 0.8504635809372633, + "grad_norm": 0.11818953603506088, + "learning_rate": 0.00016600086592581903, + "loss": 0.1606, + "step": 11787 + }, + { + "epoch": 0.8505357336123237, + "grad_norm": 0.14028437435626984, + "learning_rate": 0.0001659979795064223, + "loss": 0.1177, + "step": 11788 + }, + { + "epoch": 0.8506078862873842, + "grad_norm": 0.13598275184631348, + "learning_rate": 0.00016599509308702555, + "loss": 0.1538, + "step": 11789 + }, + { + "epoch": 0.8506800389624445, + "grad_norm": 0.12115483731031418, + "learning_rate": 0.00016599220666762882, + "loss": 0.119, + "step": 11790 + }, + { + "epoch": 0.8507521916375049, + "grad_norm": 0.13980644941329956, + "learning_rate": 0.00016598932024823208, + "loss": 0.1628, + "step": 11791 + }, + { + "epoch": 0.8508243443125654, + "grad_norm": 0.17200279235839844, + "learning_rate": 0.00016598643382883534, + "loss": 0.1413, + "step": 11792 + }, + { + "epoch": 0.8508964969876258, + "grad_norm": 0.12813691794872284, + "learning_rate": 0.0001659835474094386, + "loss": 0.1447, + "step": 11793 + }, + { + "epoch": 0.8509686496626863, + "grad_norm": 0.129331573843956, + "learning_rate": 0.00016598066099004187, + "loss": 0.1292, + "step": 11794 + }, + { + "epoch": 0.8510408023377467, + "grad_norm": 0.2023138403892517, + "learning_rate": 0.0001659777745706451, + "loss": 0.1565, + "step": 11795 + }, + { + "epoch": 0.8511129550128071, + "grad_norm": 0.13821351528167725, + "learning_rate": 0.00016597488815124837, + "loss": 0.1463, + "step": 11796 + }, + { + "epoch": 0.8511851076878675, + "grad_norm": 0.12791946530342102, + "learning_rate": 0.00016597200173185166, + "loss": 0.1642, + "step": 11797 + }, + { + "epoch": 0.8512572603629279, + "grad_norm": 0.10310852527618408, + "learning_rate": 0.00016596911531245492, + "loss": 0.1367, + "step": 11798 + }, + { + "epoch": 0.8513294130379884, + "grad_norm": 0.135204017162323, + "learning_rate": 0.00016596622889305818, + "loss": 0.142, + "step": 11799 + }, + { + "epoch": 0.8514015657130488, + "grad_norm": 0.11695842444896698, + "learning_rate": 0.00016596334247366142, + "loss": 0.1917, + "step": 11800 + }, + { + "epoch": 0.8514737183881093, + "grad_norm": 0.12734819948673248, + "learning_rate": 0.00016596045605426468, + "loss": 0.117, + "step": 11801 + }, + { + "epoch": 0.8515458710631697, + "grad_norm": 0.14392152428627014, + "learning_rate": 0.00016595756963486794, + "loss": 0.1608, + "step": 11802 + }, + { + "epoch": 0.8516180237382301, + "grad_norm": 0.10268928110599518, + "learning_rate": 0.0001659546832154712, + "loss": 0.2276, + "step": 11803 + }, + { + "epoch": 0.8516901764132905, + "grad_norm": 0.11644991487264633, + "learning_rate": 0.0001659517967960745, + "loss": 0.0831, + "step": 11804 + }, + { + "epoch": 0.8517623290883509, + "grad_norm": 0.1109037920832634, + "learning_rate": 0.00016594891037667773, + "loss": 0.1869, + "step": 11805 + }, + { + "epoch": 0.8518344817634114, + "grad_norm": 0.1327962577342987, + "learning_rate": 0.000165946023957281, + "loss": 0.1389, + "step": 11806 + }, + { + "epoch": 0.8519066344384718, + "grad_norm": 0.2042866200208664, + "learning_rate": 0.00016594313753788426, + "loss": 0.1413, + "step": 11807 + }, + { + "epoch": 0.8519787871135323, + "grad_norm": 0.14395737648010254, + "learning_rate": 0.00016594025111848752, + "loss": 0.1604, + "step": 11808 + }, + { + "epoch": 0.8520509397885927, + "grad_norm": 0.11978230625391006, + "learning_rate": 0.00016593736469909079, + "loss": 0.164, + "step": 11809 + }, + { + "epoch": 0.8521230924636531, + "grad_norm": 0.1249191090464592, + "learning_rate": 0.00016593447827969405, + "loss": 0.1563, + "step": 11810 + }, + { + "epoch": 0.8521952451387135, + "grad_norm": 0.13559097051620483, + "learning_rate": 0.0001659315918602973, + "loss": 0.1385, + "step": 11811 + }, + { + "epoch": 0.8522673978137739, + "grad_norm": 0.13980789482593536, + "learning_rate": 0.00016592870544090057, + "loss": 0.1819, + "step": 11812 + }, + { + "epoch": 0.8523395504888344, + "grad_norm": 0.1371464878320694, + "learning_rate": 0.00016592581902150384, + "loss": 0.125, + "step": 11813 + }, + { + "epoch": 0.8524117031638948, + "grad_norm": 0.1271425485610962, + "learning_rate": 0.0001659229326021071, + "loss": 0.1494, + "step": 11814 + }, + { + "epoch": 0.8524838558389553, + "grad_norm": 0.12910963594913483, + "learning_rate": 0.00016592004618271036, + "loss": 0.1224, + "step": 11815 + }, + { + "epoch": 0.8525560085140157, + "grad_norm": 0.177041158080101, + "learning_rate": 0.0001659171597633136, + "loss": 0.2076, + "step": 11816 + }, + { + "epoch": 0.8526281611890761, + "grad_norm": 0.10944493860006332, + "learning_rate": 0.00016591427334391686, + "loss": 0.1419, + "step": 11817 + }, + { + "epoch": 0.8527003138641365, + "grad_norm": 0.13446961343288422, + "learning_rate": 0.00016591138692452015, + "loss": 0.175, + "step": 11818 + }, + { + "epoch": 0.8527724665391969, + "grad_norm": 0.14175055921077728, + "learning_rate": 0.00016590850050512341, + "loss": 0.1884, + "step": 11819 + }, + { + "epoch": 0.8528446192142574, + "grad_norm": 0.1306513547897339, + "learning_rate": 0.00016590561408572668, + "loss": 0.1185, + "step": 11820 + }, + { + "epoch": 0.8529167718893178, + "grad_norm": 0.1797804832458496, + "learning_rate": 0.0001659027276663299, + "loss": 0.1726, + "step": 11821 + }, + { + "epoch": 0.8529889245643782, + "grad_norm": 0.14040648937225342, + "learning_rate": 0.00016589984124693318, + "loss": 0.139, + "step": 11822 + }, + { + "epoch": 0.8530610772394387, + "grad_norm": 0.12760290503501892, + "learning_rate": 0.00016589695482753644, + "loss": 0.171, + "step": 11823 + }, + { + "epoch": 0.853133229914499, + "grad_norm": 0.11597708612680435, + "learning_rate": 0.0001658940684081397, + "loss": 0.1488, + "step": 11824 + }, + { + "epoch": 0.8532053825895595, + "grad_norm": 0.1469823122024536, + "learning_rate": 0.000165891181988743, + "loss": 0.1322, + "step": 11825 + }, + { + "epoch": 0.8532775352646199, + "grad_norm": 0.13939408957958221, + "learning_rate": 0.00016588829556934623, + "loss": 0.1653, + "step": 11826 + }, + { + "epoch": 0.8533496879396804, + "grad_norm": 0.12158030271530151, + "learning_rate": 0.0001658854091499495, + "loss": 0.1278, + "step": 11827 + }, + { + "epoch": 0.8534218406147408, + "grad_norm": 0.13025440275669098, + "learning_rate": 0.00016588252273055275, + "loss": 0.1598, + "step": 11828 + }, + { + "epoch": 0.8534939932898012, + "grad_norm": 0.1255502849817276, + "learning_rate": 0.00016587963631115602, + "loss": 0.1613, + "step": 11829 + }, + { + "epoch": 0.8535661459648617, + "grad_norm": 0.11364217847585678, + "learning_rate": 0.00016587674989175928, + "loss": 0.1492, + "step": 11830 + }, + { + "epoch": 0.853638298639922, + "grad_norm": 0.14769354462623596, + "learning_rate": 0.00016587386347236254, + "loss": 0.1254, + "step": 11831 + }, + { + "epoch": 0.8537104513149825, + "grad_norm": 0.11613823473453522, + "learning_rate": 0.0001658709770529658, + "loss": 0.0857, + "step": 11832 + }, + { + "epoch": 0.8537826039900429, + "grad_norm": 0.12310709059238434, + "learning_rate": 0.00016586809063356907, + "loss": 0.1717, + "step": 11833 + }, + { + "epoch": 0.8538547566651034, + "grad_norm": 0.1325674206018448, + "learning_rate": 0.00016586520421417233, + "loss": 0.1637, + "step": 11834 + }, + { + "epoch": 0.8539269093401638, + "grad_norm": 0.15822456777095795, + "learning_rate": 0.0001658623177947756, + "loss": 0.152, + "step": 11835 + }, + { + "epoch": 0.8539990620152242, + "grad_norm": 0.12222934514284134, + "learning_rate": 0.00016585943137537886, + "loss": 0.1502, + "step": 11836 + }, + { + "epoch": 0.8540712146902847, + "grad_norm": 0.11390216648578644, + "learning_rate": 0.00016585654495598212, + "loss": 0.1064, + "step": 11837 + }, + { + "epoch": 0.854143367365345, + "grad_norm": 0.11588306725025177, + "learning_rate": 0.00016585365853658536, + "loss": 0.1002, + "step": 11838 + }, + { + "epoch": 0.8542155200404055, + "grad_norm": 0.11872265487909317, + "learning_rate": 0.00016585077211718865, + "loss": 0.1486, + "step": 11839 + }, + { + "epoch": 0.8542876727154659, + "grad_norm": 0.12992970645427704, + "learning_rate": 0.0001658478856977919, + "loss": 0.187, + "step": 11840 + }, + { + "epoch": 0.8543598253905264, + "grad_norm": 0.11380898952484131, + "learning_rate": 0.00016584499927839517, + "loss": 0.1943, + "step": 11841 + }, + { + "epoch": 0.8544319780655868, + "grad_norm": 0.1165139377117157, + "learning_rate": 0.00016584211285899843, + "loss": 0.1595, + "step": 11842 + }, + { + "epoch": 0.8545041307406472, + "grad_norm": 0.12911410629749298, + "learning_rate": 0.00016583922643960167, + "loss": 0.1654, + "step": 11843 + }, + { + "epoch": 0.8545762834157077, + "grad_norm": 0.12455809116363525, + "learning_rate": 0.00016583634002020493, + "loss": 0.1554, + "step": 11844 + }, + { + "epoch": 0.854648436090768, + "grad_norm": 0.14468957483768463, + "learning_rate": 0.0001658334536008082, + "loss": 0.1496, + "step": 11845 + }, + { + "epoch": 0.8547205887658285, + "grad_norm": 0.13239078223705292, + "learning_rate": 0.00016583056718141149, + "loss": 0.16, + "step": 11846 + }, + { + "epoch": 0.8547927414408889, + "grad_norm": 0.15964853763580322, + "learning_rate": 0.00016582768076201475, + "loss": 0.1798, + "step": 11847 + }, + { + "epoch": 0.8548648941159493, + "grad_norm": 0.125840961933136, + "learning_rate": 0.00016582479434261799, + "loss": 0.1847, + "step": 11848 + }, + { + "epoch": 0.8549370467910098, + "grad_norm": 0.12420251220464706, + "learning_rate": 0.00016582190792322125, + "loss": 0.1495, + "step": 11849 + }, + { + "epoch": 0.8550091994660702, + "grad_norm": 0.12960167229175568, + "learning_rate": 0.0001658190215038245, + "loss": 0.171, + "step": 11850 + }, + { + "epoch": 0.8550813521411307, + "grad_norm": 0.131041020154953, + "learning_rate": 0.00016581613508442777, + "loss": 0.1571, + "step": 11851 + }, + { + "epoch": 0.855153504816191, + "grad_norm": 0.14004012942314148, + "learning_rate": 0.00016581324866503104, + "loss": 0.1786, + "step": 11852 + }, + { + "epoch": 0.8552256574912515, + "grad_norm": 0.16974465548992157, + "learning_rate": 0.0001658103622456343, + "loss": 0.1707, + "step": 11853 + }, + { + "epoch": 0.8552978101663119, + "grad_norm": 0.12199197709560394, + "learning_rate": 0.00016580747582623756, + "loss": 0.1511, + "step": 11854 + }, + { + "epoch": 0.8553699628413723, + "grad_norm": 0.13741664588451385, + "learning_rate": 0.00016580458940684083, + "loss": 0.1303, + "step": 11855 + }, + { + "epoch": 0.8554421155164328, + "grad_norm": 0.15471656620502472, + "learning_rate": 0.0001658017029874441, + "loss": 0.1828, + "step": 11856 + }, + { + "epoch": 0.8555142681914932, + "grad_norm": 0.13550004363059998, + "learning_rate": 0.00016579881656804735, + "loss": 0.1784, + "step": 11857 + }, + { + "epoch": 0.8555864208665537, + "grad_norm": 0.13504238426685333, + "learning_rate": 0.00016579593014865061, + "loss": 0.1659, + "step": 11858 + }, + { + "epoch": 0.855658573541614, + "grad_norm": 0.11602792143821716, + "learning_rate": 0.00016579304372925385, + "loss": 0.1613, + "step": 11859 + }, + { + "epoch": 0.8557307262166745, + "grad_norm": 0.15487723052501678, + "learning_rate": 0.0001657901573098571, + "loss": 0.1583, + "step": 11860 + }, + { + "epoch": 0.8558028788917349, + "grad_norm": 0.12514373660087585, + "learning_rate": 0.0001657872708904604, + "loss": 0.1089, + "step": 11861 + }, + { + "epoch": 0.8558750315667953, + "grad_norm": 0.12423887848854065, + "learning_rate": 0.00016578438447106367, + "loss": 0.1917, + "step": 11862 + }, + { + "epoch": 0.8559471842418558, + "grad_norm": 0.11663633584976196, + "learning_rate": 0.00016578149805166693, + "loss": 0.1608, + "step": 11863 + }, + { + "epoch": 0.8560193369169162, + "grad_norm": 0.11846447736024857, + "learning_rate": 0.00016577861163227016, + "loss": 0.1215, + "step": 11864 + }, + { + "epoch": 0.8560914895919767, + "grad_norm": 0.10441578924655914, + "learning_rate": 0.00016577572521287343, + "loss": 0.1784, + "step": 11865 + }, + { + "epoch": 0.856163642267037, + "grad_norm": 0.0960303395986557, + "learning_rate": 0.0001657728387934767, + "loss": 0.1237, + "step": 11866 + }, + { + "epoch": 0.8562357949420975, + "grad_norm": 0.15245820581912994, + "learning_rate": 0.00016576995237407995, + "loss": 0.1516, + "step": 11867 + }, + { + "epoch": 0.8563079476171579, + "grad_norm": 0.13254792988300323, + "learning_rate": 0.00016576706595468324, + "loss": 0.1164, + "step": 11868 + }, + { + "epoch": 0.8563801002922183, + "grad_norm": 0.13312913477420807, + "learning_rate": 0.00016576417953528648, + "loss": 0.1717, + "step": 11869 + }, + { + "epoch": 0.8564522529672788, + "grad_norm": 0.1143551617860794, + "learning_rate": 0.00016576129311588974, + "loss": 0.1451, + "step": 11870 + }, + { + "epoch": 0.8565244056423392, + "grad_norm": 0.11746720969676971, + "learning_rate": 0.000165758406696493, + "loss": 0.1395, + "step": 11871 + }, + { + "epoch": 0.8565965583173997, + "grad_norm": 0.10211291909217834, + "learning_rate": 0.00016575552027709627, + "loss": 0.1794, + "step": 11872 + }, + { + "epoch": 0.85666871099246, + "grad_norm": 0.13879451155662537, + "learning_rate": 0.00016575263385769953, + "loss": 0.1596, + "step": 11873 + }, + { + "epoch": 0.8567408636675204, + "grad_norm": 0.10806053131818771, + "learning_rate": 0.0001657497474383028, + "loss": 0.186, + "step": 11874 + }, + { + "epoch": 0.8568130163425809, + "grad_norm": 0.11560015380382538, + "learning_rate": 0.00016574686101890606, + "loss": 0.1438, + "step": 11875 + }, + { + "epoch": 0.8568851690176413, + "grad_norm": 0.13607026636600494, + "learning_rate": 0.00016574397459950932, + "loss": 0.1432, + "step": 11876 + }, + { + "epoch": 0.8569573216927018, + "grad_norm": 0.11837439984083176, + "learning_rate": 0.00016574108818011258, + "loss": 0.0987, + "step": 11877 + }, + { + "epoch": 0.8570294743677622, + "grad_norm": 0.1011228933930397, + "learning_rate": 0.00016573820176071585, + "loss": 0.1205, + "step": 11878 + }, + { + "epoch": 0.8571016270428227, + "grad_norm": 0.12492066621780396, + "learning_rate": 0.0001657353153413191, + "loss": 0.1546, + "step": 11879 + }, + { + "epoch": 0.857173779717883, + "grad_norm": 0.1269485205411911, + "learning_rate": 0.00016573242892192234, + "loss": 0.1232, + "step": 11880 + }, + { + "epoch": 0.8572459323929434, + "grad_norm": 0.12102842330932617, + "learning_rate": 0.0001657295425025256, + "loss": 0.1611, + "step": 11881 + }, + { + "epoch": 0.8573180850680039, + "grad_norm": 0.1312066614627838, + "learning_rate": 0.0001657266560831289, + "loss": 0.1085, + "step": 11882 + }, + { + "epoch": 0.8573902377430643, + "grad_norm": 0.1548612266778946, + "learning_rate": 0.00016572376966373216, + "loss": 0.2026, + "step": 11883 + }, + { + "epoch": 0.8574623904181248, + "grad_norm": 0.12773779034614563, + "learning_rate": 0.00016572088324433542, + "loss": 0.1643, + "step": 11884 + }, + { + "epoch": 0.8575345430931852, + "grad_norm": 0.1022697240114212, + "learning_rate": 0.00016571799682493866, + "loss": 0.0982, + "step": 11885 + }, + { + "epoch": 0.8576066957682456, + "grad_norm": 0.11860981583595276, + "learning_rate": 0.00016571511040554192, + "loss": 0.1102, + "step": 11886 + }, + { + "epoch": 0.857678848443306, + "grad_norm": 0.11475259810686111, + "learning_rate": 0.00016571222398614518, + "loss": 0.153, + "step": 11887 + }, + { + "epoch": 0.8577510011183664, + "grad_norm": 0.12198229879140854, + "learning_rate": 0.00016570933756674845, + "loss": 0.1282, + "step": 11888 + }, + { + "epoch": 0.8578231537934269, + "grad_norm": 0.10108748078346252, + "learning_rate": 0.00016570645114735174, + "loss": 0.0983, + "step": 11889 + }, + { + "epoch": 0.8578953064684873, + "grad_norm": 0.20072031021118164, + "learning_rate": 0.00016570356472795497, + "loss": 0.1407, + "step": 11890 + }, + { + "epoch": 0.8579674591435478, + "grad_norm": 0.11227798461914062, + "learning_rate": 0.00016570067830855824, + "loss": 0.1458, + "step": 11891 + }, + { + "epoch": 0.8580396118186082, + "grad_norm": 0.13460202515125275, + "learning_rate": 0.0001656977918891615, + "loss": 0.1292, + "step": 11892 + }, + { + "epoch": 0.8581117644936685, + "grad_norm": 0.12463308125734329, + "learning_rate": 0.00016569490546976476, + "loss": 0.0953, + "step": 11893 + }, + { + "epoch": 0.858183917168729, + "grad_norm": 0.14697812497615814, + "learning_rate": 0.00016569201905036803, + "loss": 0.1126, + "step": 11894 + }, + { + "epoch": 0.8582560698437894, + "grad_norm": 0.14126932621002197, + "learning_rate": 0.0001656891326309713, + "loss": 0.1499, + "step": 11895 + }, + { + "epoch": 0.8583282225188499, + "grad_norm": 0.11501924693584442, + "learning_rate": 0.00016568624621157455, + "loss": 0.1156, + "step": 11896 + }, + { + "epoch": 0.8584003751939103, + "grad_norm": 0.10844486206769943, + "learning_rate": 0.00016568335979217781, + "loss": 0.1566, + "step": 11897 + }, + { + "epoch": 0.8584725278689708, + "grad_norm": 0.11743530631065369, + "learning_rate": 0.00016568047337278108, + "loss": 0.1794, + "step": 11898 + }, + { + "epoch": 0.8585446805440312, + "grad_norm": 0.1236339583992958, + "learning_rate": 0.00016567758695338434, + "loss": 0.1827, + "step": 11899 + }, + { + "epoch": 0.8586168332190915, + "grad_norm": 0.10101400315761566, + "learning_rate": 0.0001656747005339876, + "loss": 0.1801, + "step": 11900 + }, + { + "epoch": 0.858688985894152, + "grad_norm": 0.12601572275161743, + "learning_rate": 0.00016567181411459084, + "loss": 0.1398, + "step": 11901 + }, + { + "epoch": 0.8587611385692124, + "grad_norm": 0.10722434520721436, + "learning_rate": 0.0001656689276951941, + "loss": 0.132, + "step": 11902 + }, + { + "epoch": 0.8588332912442729, + "grad_norm": 0.12820284068584442, + "learning_rate": 0.0001656660412757974, + "loss": 0.1443, + "step": 11903 + }, + { + "epoch": 0.8589054439193333, + "grad_norm": 0.11345351487398148, + "learning_rate": 0.00016566315485640065, + "loss": 0.1394, + "step": 11904 + }, + { + "epoch": 0.8589775965943938, + "grad_norm": 0.13568373024463654, + "learning_rate": 0.00016566026843700392, + "loss": 0.1392, + "step": 11905 + }, + { + "epoch": 0.8590497492694542, + "grad_norm": 0.1328319013118744, + "learning_rate": 0.00016565738201760715, + "loss": 0.1457, + "step": 11906 + }, + { + "epoch": 0.8591219019445145, + "grad_norm": 0.10983971506357193, + "learning_rate": 0.00016565449559821042, + "loss": 0.1057, + "step": 11907 + }, + { + "epoch": 0.859194054619575, + "grad_norm": 0.12285325676202774, + "learning_rate": 0.00016565160917881368, + "loss": 0.2251, + "step": 11908 + }, + { + "epoch": 0.8592662072946354, + "grad_norm": 0.13944754004478455, + "learning_rate": 0.00016564872275941694, + "loss": 0.1366, + "step": 11909 + }, + { + "epoch": 0.8593383599696959, + "grad_norm": 0.11536892503499985, + "learning_rate": 0.00016564583634002023, + "loss": 0.1483, + "step": 11910 + }, + { + "epoch": 0.8594105126447563, + "grad_norm": 0.1251530647277832, + "learning_rate": 0.00016564294992062347, + "loss": 0.1474, + "step": 11911 + }, + { + "epoch": 0.8594826653198168, + "grad_norm": 0.11108506470918655, + "learning_rate": 0.00016564006350122673, + "loss": 0.1627, + "step": 11912 + }, + { + "epoch": 0.8595548179948772, + "grad_norm": 0.12315988540649414, + "learning_rate": 0.00016563717708183, + "loss": 0.1291, + "step": 11913 + }, + { + "epoch": 0.8596269706699375, + "grad_norm": 0.1117662563920021, + "learning_rate": 0.00016563429066243326, + "loss": 0.172, + "step": 11914 + }, + { + "epoch": 0.859699123344998, + "grad_norm": 0.10717906057834625, + "learning_rate": 0.00016563140424303652, + "loss": 0.1616, + "step": 11915 + }, + { + "epoch": 0.8597712760200584, + "grad_norm": 0.11858413368463516, + "learning_rate": 0.00016562851782363978, + "loss": 0.196, + "step": 11916 + }, + { + "epoch": 0.8598434286951189, + "grad_norm": 0.10697498917579651, + "learning_rate": 0.00016562563140424305, + "loss": 0.1471, + "step": 11917 + }, + { + "epoch": 0.8599155813701793, + "grad_norm": 0.11936324089765549, + "learning_rate": 0.0001656227449848463, + "loss": 0.1531, + "step": 11918 + }, + { + "epoch": 0.8599877340452398, + "grad_norm": 0.16321326792240143, + "learning_rate": 0.00016561985856544957, + "loss": 0.1923, + "step": 11919 + }, + { + "epoch": 0.8600598867203002, + "grad_norm": 0.1054329127073288, + "learning_rate": 0.00016561697214605283, + "loss": 0.1986, + "step": 11920 + }, + { + "epoch": 0.8601320393953605, + "grad_norm": 0.1419740617275238, + "learning_rate": 0.0001656140857266561, + "loss": 0.1601, + "step": 11921 + }, + { + "epoch": 0.860204192070421, + "grad_norm": 0.11228160560131073, + "learning_rate": 0.00016561119930725933, + "loss": 0.1515, + "step": 11922 + }, + { + "epoch": 0.8602763447454814, + "grad_norm": 0.10666772723197937, + "learning_rate": 0.0001656083128878626, + "loss": 0.1176, + "step": 11923 + }, + { + "epoch": 0.8603484974205419, + "grad_norm": 0.145542711019516, + "learning_rate": 0.00016560542646846589, + "loss": 0.1825, + "step": 11924 + }, + { + "epoch": 0.8604206500956023, + "grad_norm": 0.1109357476234436, + "learning_rate": 0.00016560254004906915, + "loss": 0.1566, + "step": 11925 + }, + { + "epoch": 0.8604928027706628, + "grad_norm": 0.13261841237545013, + "learning_rate": 0.0001655996536296724, + "loss": 0.1368, + "step": 11926 + }, + { + "epoch": 0.8605649554457232, + "grad_norm": 0.18109826743602753, + "learning_rate": 0.00016559676721027565, + "loss": 0.1794, + "step": 11927 + }, + { + "epoch": 0.8606371081207835, + "grad_norm": 0.12986978888511658, + "learning_rate": 0.0001655938807908789, + "loss": 0.1721, + "step": 11928 + }, + { + "epoch": 0.860709260795844, + "grad_norm": 0.14286606013774872, + "learning_rate": 0.00016559099437148217, + "loss": 0.1783, + "step": 11929 + }, + { + "epoch": 0.8607814134709044, + "grad_norm": 0.13024945557117462, + "learning_rate": 0.00016558810795208544, + "loss": 0.1656, + "step": 11930 + }, + { + "epoch": 0.8608535661459649, + "grad_norm": 0.1351431906223297, + "learning_rate": 0.00016558522153268873, + "loss": 0.1338, + "step": 11931 + }, + { + "epoch": 0.8609257188210253, + "grad_norm": 0.14224115014076233, + "learning_rate": 0.00016558233511329196, + "loss": 0.183, + "step": 11932 + }, + { + "epoch": 0.8609978714960858, + "grad_norm": 0.1591176688671112, + "learning_rate": 0.00016557944869389523, + "loss": 0.1574, + "step": 11933 + }, + { + "epoch": 0.8610700241711462, + "grad_norm": 0.13040365278720856, + "learning_rate": 0.0001655765622744985, + "loss": 0.1652, + "step": 11934 + }, + { + "epoch": 0.8611421768462065, + "grad_norm": 0.13435451686382294, + "learning_rate": 0.00016557367585510175, + "loss": 0.1408, + "step": 11935 + }, + { + "epoch": 0.861214329521267, + "grad_norm": 0.1458059549331665, + "learning_rate": 0.00016557078943570501, + "loss": 0.1505, + "step": 11936 + }, + { + "epoch": 0.8612864821963274, + "grad_norm": 0.11824122816324234, + "learning_rate": 0.00016556790301630828, + "loss": 0.1792, + "step": 11937 + }, + { + "epoch": 0.8613586348713879, + "grad_norm": 0.11797327548265457, + "learning_rate": 0.00016556501659691154, + "loss": 0.1492, + "step": 11938 + }, + { + "epoch": 0.8614307875464483, + "grad_norm": 0.164879709482193, + "learning_rate": 0.0001655621301775148, + "loss": 0.1701, + "step": 11939 + }, + { + "epoch": 0.8615029402215088, + "grad_norm": 0.13124053180217743, + "learning_rate": 0.00016555924375811807, + "loss": 0.1505, + "step": 11940 + }, + { + "epoch": 0.8615750928965692, + "grad_norm": 0.11650021374225616, + "learning_rate": 0.00016555635733872133, + "loss": 0.1222, + "step": 11941 + }, + { + "epoch": 0.8616472455716295, + "grad_norm": 0.11756842583417892, + "learning_rate": 0.0001655534709193246, + "loss": 0.1936, + "step": 11942 + }, + { + "epoch": 0.86171939824669, + "grad_norm": 0.12020300328731537, + "learning_rate": 0.00016555058449992785, + "loss": 0.1209, + "step": 11943 + }, + { + "epoch": 0.8617915509217504, + "grad_norm": 0.10335661470890045, + "learning_rate": 0.0001655476980805311, + "loss": 0.1269, + "step": 11944 + }, + { + "epoch": 0.8618637035968109, + "grad_norm": 0.10224736481904984, + "learning_rate": 0.00016554481166113438, + "loss": 0.1017, + "step": 11945 + }, + { + "epoch": 0.8619358562718713, + "grad_norm": 0.16791343688964844, + "learning_rate": 0.00016554192524173764, + "loss": 0.1442, + "step": 11946 + }, + { + "epoch": 0.8620080089469317, + "grad_norm": 0.12022384256124496, + "learning_rate": 0.0001655390388223409, + "loss": 0.1631, + "step": 11947 + }, + { + "epoch": 0.8620801616219921, + "grad_norm": 0.11751755326986313, + "learning_rate": 0.00016553615240294417, + "loss": 0.1516, + "step": 11948 + }, + { + "epoch": 0.8621523142970525, + "grad_norm": 0.11795671284198761, + "learning_rate": 0.0001655332659835474, + "loss": 0.1598, + "step": 11949 + }, + { + "epoch": 0.862224466972113, + "grad_norm": 0.11847096681594849, + "learning_rate": 0.00016553037956415067, + "loss": 0.1389, + "step": 11950 + }, + { + "epoch": 0.8622966196471734, + "grad_norm": 0.13842691481113434, + "learning_rate": 0.00016552749314475393, + "loss": 0.1496, + "step": 11951 + }, + { + "epoch": 0.8623687723222339, + "grad_norm": 0.1378527283668518, + "learning_rate": 0.00016552460672535722, + "loss": 0.1888, + "step": 11952 + }, + { + "epoch": 0.8624409249972943, + "grad_norm": 0.11756579577922821, + "learning_rate": 0.00016552172030596048, + "loss": 0.1009, + "step": 11953 + }, + { + "epoch": 0.8625130776723547, + "grad_norm": 0.12039361894130707, + "learning_rate": 0.00016551883388656372, + "loss": 0.2594, + "step": 11954 + }, + { + "epoch": 0.8625852303474151, + "grad_norm": 0.10809768736362457, + "learning_rate": 0.00016551594746716698, + "loss": 0.1223, + "step": 11955 + }, + { + "epoch": 0.8626573830224755, + "grad_norm": 0.11799143254756927, + "learning_rate": 0.00016551306104777025, + "loss": 0.1588, + "step": 11956 + }, + { + "epoch": 0.862729535697536, + "grad_norm": 0.11549495905637741, + "learning_rate": 0.0001655101746283735, + "loss": 0.1738, + "step": 11957 + }, + { + "epoch": 0.8628016883725964, + "grad_norm": 0.13709287345409393, + "learning_rate": 0.00016550728820897677, + "loss": 0.1597, + "step": 11958 + }, + { + "epoch": 0.8628738410476569, + "grad_norm": 0.11733334511518478, + "learning_rate": 0.00016550440178958003, + "loss": 0.1439, + "step": 11959 + }, + { + "epoch": 0.8629459937227173, + "grad_norm": 0.1170649528503418, + "learning_rate": 0.0001655015153701833, + "loss": 0.1483, + "step": 11960 + }, + { + "epoch": 0.8630181463977777, + "grad_norm": 0.11585357040166855, + "learning_rate": 0.00016549862895078656, + "loss": 0.133, + "step": 11961 + }, + { + "epoch": 0.8630902990728381, + "grad_norm": 0.12861251831054688, + "learning_rate": 0.00016549574253138982, + "loss": 0.1677, + "step": 11962 + }, + { + "epoch": 0.8631624517478985, + "grad_norm": 0.14745929837226868, + "learning_rate": 0.00016549285611199309, + "loss": 0.1699, + "step": 11963 + }, + { + "epoch": 0.863234604422959, + "grad_norm": 0.11474446207284927, + "learning_rate": 0.00016548996969259635, + "loss": 0.1778, + "step": 11964 + }, + { + "epoch": 0.8633067570980194, + "grad_norm": 0.1322767734527588, + "learning_rate": 0.00016548708327319958, + "loss": 0.1542, + "step": 11965 + }, + { + "epoch": 0.8633789097730799, + "grad_norm": 0.1421336829662323, + "learning_rate": 0.00016548419685380287, + "loss": 0.1424, + "step": 11966 + }, + { + "epoch": 0.8634510624481403, + "grad_norm": 0.13709428906440735, + "learning_rate": 0.00016548131043440614, + "loss": 0.1552, + "step": 11967 + }, + { + "epoch": 0.8635232151232007, + "grad_norm": 0.13968567550182343, + "learning_rate": 0.0001654784240150094, + "loss": 0.101, + "step": 11968 + }, + { + "epoch": 0.8635953677982611, + "grad_norm": 0.1329333335161209, + "learning_rate": 0.00016547553759561266, + "loss": 0.1844, + "step": 11969 + }, + { + "epoch": 0.8636675204733215, + "grad_norm": 0.12988804280757904, + "learning_rate": 0.0001654726511762159, + "loss": 0.1629, + "step": 11970 + }, + { + "epoch": 0.863739673148382, + "grad_norm": 0.18330612778663635, + "learning_rate": 0.00016546976475681916, + "loss": 0.144, + "step": 11971 + }, + { + "epoch": 0.8638118258234424, + "grad_norm": 0.13274584710597992, + "learning_rate": 0.00016546687833742242, + "loss": 0.1395, + "step": 11972 + }, + { + "epoch": 0.8638839784985028, + "grad_norm": 0.12589463591575623, + "learning_rate": 0.00016546399191802571, + "loss": 0.165, + "step": 11973 + }, + { + "epoch": 0.8639561311735633, + "grad_norm": 0.15424275398254395, + "learning_rate": 0.00016546110549862898, + "loss": 0.2213, + "step": 11974 + }, + { + "epoch": 0.8640282838486237, + "grad_norm": 0.14214026927947998, + "learning_rate": 0.00016545821907923221, + "loss": 0.1617, + "step": 11975 + }, + { + "epoch": 0.8641004365236841, + "grad_norm": 0.12770618498325348, + "learning_rate": 0.00016545533265983548, + "loss": 0.1558, + "step": 11976 + }, + { + "epoch": 0.8641725891987445, + "grad_norm": 0.1013932079076767, + "learning_rate": 0.00016545244624043874, + "loss": 0.1326, + "step": 11977 + }, + { + "epoch": 0.864244741873805, + "grad_norm": 0.1254311501979828, + "learning_rate": 0.000165449559821042, + "loss": 0.1413, + "step": 11978 + }, + { + "epoch": 0.8643168945488654, + "grad_norm": 0.11821615695953369, + "learning_rate": 0.00016544667340164527, + "loss": 0.1184, + "step": 11979 + }, + { + "epoch": 0.8643890472239258, + "grad_norm": 0.11322508752346039, + "learning_rate": 0.00016544378698224853, + "loss": 0.1394, + "step": 11980 + }, + { + "epoch": 0.8644611998989863, + "grad_norm": 0.1489751785993576, + "learning_rate": 0.0001654409005628518, + "loss": 0.1476, + "step": 11981 + }, + { + "epoch": 0.8645333525740467, + "grad_norm": 0.12493439763784409, + "learning_rate": 0.00016543801414345505, + "loss": 0.1311, + "step": 11982 + }, + { + "epoch": 0.8646055052491071, + "grad_norm": 0.12061279267072678, + "learning_rate": 0.00016543512772405832, + "loss": 0.1578, + "step": 11983 + }, + { + "epoch": 0.8646776579241675, + "grad_norm": 0.12649033963680267, + "learning_rate": 0.00016543224130466158, + "loss": 0.1538, + "step": 11984 + }, + { + "epoch": 0.864749810599228, + "grad_norm": 0.12275785952806473, + "learning_rate": 0.00016542935488526484, + "loss": 0.1375, + "step": 11985 + }, + { + "epoch": 0.8648219632742884, + "grad_norm": 0.12534111738204956, + "learning_rate": 0.00016542646846586808, + "loss": 0.1792, + "step": 11986 + }, + { + "epoch": 0.8648941159493488, + "grad_norm": 0.1655583679676056, + "learning_rate": 0.00016542358204647137, + "loss": 0.1193, + "step": 11987 + }, + { + "epoch": 0.8649662686244093, + "grad_norm": 0.13253222405910492, + "learning_rate": 0.00016542069562707463, + "loss": 0.1462, + "step": 11988 + }, + { + "epoch": 0.8650384212994697, + "grad_norm": 0.13312864303588867, + "learning_rate": 0.0001654178092076779, + "loss": 0.135, + "step": 11989 + }, + { + "epoch": 0.8651105739745301, + "grad_norm": 0.09539494663476944, + "learning_rate": 0.00016541492278828116, + "loss": 0.1916, + "step": 11990 + }, + { + "epoch": 0.8651827266495905, + "grad_norm": 0.11006974428892136, + "learning_rate": 0.0001654120363688844, + "loss": 0.1785, + "step": 11991 + }, + { + "epoch": 0.865254879324651, + "grad_norm": 0.11187838017940521, + "learning_rate": 0.00016540914994948766, + "loss": 0.1586, + "step": 11992 + }, + { + "epoch": 0.8653270319997114, + "grad_norm": 0.10977104306221008, + "learning_rate": 0.00016540626353009092, + "loss": 0.1669, + "step": 11993 + }, + { + "epoch": 0.8653991846747718, + "grad_norm": 0.10701952129602432, + "learning_rate": 0.0001654033771106942, + "loss": 0.1144, + "step": 11994 + }, + { + "epoch": 0.8654713373498323, + "grad_norm": 0.11350420862436295, + "learning_rate": 0.00016540049069129747, + "loss": 0.1286, + "step": 11995 + }, + { + "epoch": 0.8655434900248927, + "grad_norm": 0.11541275680065155, + "learning_rate": 0.0001653976042719007, + "loss": 0.1623, + "step": 11996 + }, + { + "epoch": 0.8656156426999531, + "grad_norm": 0.10224005579948425, + "learning_rate": 0.00016539471785250397, + "loss": 0.0958, + "step": 11997 + }, + { + "epoch": 0.8656877953750135, + "grad_norm": 0.1323241889476776, + "learning_rate": 0.00016539183143310723, + "loss": 0.1257, + "step": 11998 + }, + { + "epoch": 0.865759948050074, + "grad_norm": 0.14228209853172302, + "learning_rate": 0.0001653889450137105, + "loss": 0.145, + "step": 11999 + }, + { + "epoch": 0.8658321007251344, + "grad_norm": 0.14026106894016266, + "learning_rate": 0.00016538605859431376, + "loss": 0.1575, + "step": 12000 + }, + { + "epoch": 0.8659042534001948, + "grad_norm": 0.14158804714679718, + "learning_rate": 0.00016538317217491702, + "loss": 0.2045, + "step": 12001 + }, + { + "epoch": 0.8659764060752553, + "grad_norm": 0.1218271553516388, + "learning_rate": 0.00016538028575552029, + "loss": 0.1815, + "step": 12002 + }, + { + "epoch": 0.8660485587503157, + "grad_norm": 0.13607192039489746, + "learning_rate": 0.00016537739933612355, + "loss": 0.1223, + "step": 12003 + }, + { + "epoch": 0.866120711425376, + "grad_norm": 0.11378008872270584, + "learning_rate": 0.0001653745129167268, + "loss": 0.1253, + "step": 12004 + }, + { + "epoch": 0.8661928641004365, + "grad_norm": 0.12672148644924164, + "learning_rate": 0.00016537162649733007, + "loss": 0.1702, + "step": 12005 + }, + { + "epoch": 0.8662650167754969, + "grad_norm": 0.1406373679637909, + "learning_rate": 0.00016536874007793334, + "loss": 0.1351, + "step": 12006 + }, + { + "epoch": 0.8663371694505574, + "grad_norm": 0.16386227309703827, + "learning_rate": 0.00016536585365853657, + "loss": 0.1901, + "step": 12007 + }, + { + "epoch": 0.8664093221256178, + "grad_norm": 0.12637802958488464, + "learning_rate": 0.00016536296723913986, + "loss": 0.1616, + "step": 12008 + }, + { + "epoch": 0.8664814748006783, + "grad_norm": 0.11758668720722198, + "learning_rate": 0.00016536008081974313, + "loss": 0.151, + "step": 12009 + }, + { + "epoch": 0.8665536274757386, + "grad_norm": 0.13361458480358124, + "learning_rate": 0.0001653571944003464, + "loss": 0.1844, + "step": 12010 + }, + { + "epoch": 0.866625780150799, + "grad_norm": 0.14450719952583313, + "learning_rate": 0.00016535430798094965, + "loss": 0.161, + "step": 12011 + }, + { + "epoch": 0.8666979328258595, + "grad_norm": 0.12745462357997894, + "learning_rate": 0.0001653514215615529, + "loss": 0.166, + "step": 12012 + }, + { + "epoch": 0.8667700855009199, + "grad_norm": 0.13964685797691345, + "learning_rate": 0.00016534853514215615, + "loss": 0.1315, + "step": 12013 + }, + { + "epoch": 0.8668422381759804, + "grad_norm": 0.13584551215171814, + "learning_rate": 0.0001653456487227594, + "loss": 0.2064, + "step": 12014 + }, + { + "epoch": 0.8669143908510408, + "grad_norm": 0.14645154774188995, + "learning_rate": 0.0001653427623033627, + "loss": 0.1317, + "step": 12015 + }, + { + "epoch": 0.8669865435261013, + "grad_norm": 0.12439603358507156, + "learning_rate": 0.00016533987588396597, + "loss": 0.1519, + "step": 12016 + }, + { + "epoch": 0.8670586962011616, + "grad_norm": 0.1141846626996994, + "learning_rate": 0.0001653369894645692, + "loss": 0.1704, + "step": 12017 + }, + { + "epoch": 0.867130848876222, + "grad_norm": 0.1145968809723854, + "learning_rate": 0.00016533410304517247, + "loss": 0.1426, + "step": 12018 + }, + { + "epoch": 0.8672030015512825, + "grad_norm": 0.12812818586826324, + "learning_rate": 0.00016533121662577573, + "loss": 0.1639, + "step": 12019 + }, + { + "epoch": 0.8672751542263429, + "grad_norm": 0.12737447023391724, + "learning_rate": 0.000165328330206379, + "loss": 0.1526, + "step": 12020 + }, + { + "epoch": 0.8673473069014034, + "grad_norm": 0.11546266824007034, + "learning_rate": 0.00016532544378698225, + "loss": 0.1362, + "step": 12021 + }, + { + "epoch": 0.8674194595764638, + "grad_norm": 0.14184580743312836, + "learning_rate": 0.00016532255736758552, + "loss": 0.1934, + "step": 12022 + }, + { + "epoch": 0.8674916122515243, + "grad_norm": 0.10528424382209778, + "learning_rate": 0.00016531967094818878, + "loss": 0.1609, + "step": 12023 + }, + { + "epoch": 0.8675637649265846, + "grad_norm": 0.15018628537654877, + "learning_rate": 0.00016531678452879204, + "loss": 0.1719, + "step": 12024 + }, + { + "epoch": 0.867635917601645, + "grad_norm": 0.13803143799304962, + "learning_rate": 0.0001653138981093953, + "loss": 0.0904, + "step": 12025 + }, + { + "epoch": 0.8677080702767055, + "grad_norm": 0.20398986339569092, + "learning_rate": 0.00016531101168999857, + "loss": 0.1759, + "step": 12026 + }, + { + "epoch": 0.8677802229517659, + "grad_norm": 0.14385467767715454, + "learning_rate": 0.00016530812527060183, + "loss": 0.1553, + "step": 12027 + }, + { + "epoch": 0.8678523756268264, + "grad_norm": 0.10514909774065018, + "learning_rate": 0.00016530523885120507, + "loss": 0.1138, + "step": 12028 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.11758711189031601, + "learning_rate": 0.00016530235243180836, + "loss": 0.1221, + "step": 12029 + }, + { + "epoch": 0.8679966809769473, + "grad_norm": 0.10472291707992554, + "learning_rate": 0.00016529946601241162, + "loss": 0.1683, + "step": 12030 + }, + { + "epoch": 0.8680688336520076, + "grad_norm": 0.11128973215818405, + "learning_rate": 0.00016529657959301488, + "loss": 0.1502, + "step": 12031 + }, + { + "epoch": 0.868140986327068, + "grad_norm": 0.12483924627304077, + "learning_rate": 0.00016529369317361815, + "loss": 0.1499, + "step": 12032 + }, + { + "epoch": 0.8682131390021285, + "grad_norm": 0.12955377995967865, + "learning_rate": 0.00016529080675422138, + "loss": 0.1212, + "step": 12033 + }, + { + "epoch": 0.8682852916771889, + "grad_norm": 0.13617177307605743, + "learning_rate": 0.00016528792033482464, + "loss": 0.1881, + "step": 12034 + }, + { + "epoch": 0.8683574443522494, + "grad_norm": 0.136407732963562, + "learning_rate": 0.0001652850339154279, + "loss": 0.1583, + "step": 12035 + }, + { + "epoch": 0.8684295970273098, + "grad_norm": 0.12334705889225006, + "learning_rate": 0.0001652821474960312, + "loss": 0.1374, + "step": 12036 + }, + { + "epoch": 0.8685017497023703, + "grad_norm": 0.13375817239284515, + "learning_rate": 0.00016527926107663446, + "loss": 0.1291, + "step": 12037 + }, + { + "epoch": 0.8685739023774306, + "grad_norm": 0.1542121320962906, + "learning_rate": 0.0001652763746572377, + "loss": 0.1469, + "step": 12038 + }, + { + "epoch": 0.868646055052491, + "grad_norm": 0.12267142534255981, + "learning_rate": 0.00016527348823784096, + "loss": 0.1082, + "step": 12039 + }, + { + "epoch": 0.8687182077275515, + "grad_norm": 0.12496042251586914, + "learning_rate": 0.00016527060181844422, + "loss": 0.1775, + "step": 12040 + }, + { + "epoch": 0.8687903604026119, + "grad_norm": 0.12290967255830765, + "learning_rate": 0.00016526771539904749, + "loss": 0.1664, + "step": 12041 + }, + { + "epoch": 0.8688625130776724, + "grad_norm": 0.10611739009618759, + "learning_rate": 0.00016526482897965075, + "loss": 0.1673, + "step": 12042 + }, + { + "epoch": 0.8689346657527328, + "grad_norm": 0.13516947627067566, + "learning_rate": 0.000165261942560254, + "loss": 0.1949, + "step": 12043 + }, + { + "epoch": 0.8690068184277933, + "grad_norm": 0.11694905161857605, + "learning_rate": 0.00016525905614085727, + "loss": 0.1935, + "step": 12044 + }, + { + "epoch": 0.8690789711028536, + "grad_norm": 0.1327820122241974, + "learning_rate": 0.00016525616972146054, + "loss": 0.1673, + "step": 12045 + }, + { + "epoch": 0.869151123777914, + "grad_norm": 0.1115252748131752, + "learning_rate": 0.0001652532833020638, + "loss": 0.1141, + "step": 12046 + }, + { + "epoch": 0.8692232764529745, + "grad_norm": 0.11399739235639572, + "learning_rate": 0.00016525039688266706, + "loss": 0.1891, + "step": 12047 + }, + { + "epoch": 0.8692954291280349, + "grad_norm": 0.14533619582653046, + "learning_rate": 0.00016524751046327033, + "loss": 0.1543, + "step": 12048 + }, + { + "epoch": 0.8693675818030954, + "grad_norm": 0.13893792033195496, + "learning_rate": 0.00016524462404387356, + "loss": 0.123, + "step": 12049 + }, + { + "epoch": 0.8694397344781558, + "grad_norm": 0.11470438539981842, + "learning_rate": 0.00016524173762447682, + "loss": 0.2039, + "step": 12050 + }, + { + "epoch": 0.8695118871532163, + "grad_norm": 0.14197883009910583, + "learning_rate": 0.00016523885120508011, + "loss": 0.1669, + "step": 12051 + }, + { + "epoch": 0.8695840398282766, + "grad_norm": 0.11966146528720856, + "learning_rate": 0.00016523596478568338, + "loss": 0.1399, + "step": 12052 + }, + { + "epoch": 0.869656192503337, + "grad_norm": 0.15418173372745514, + "learning_rate": 0.00016523307836628664, + "loss": 0.1202, + "step": 12053 + }, + { + "epoch": 0.8697283451783975, + "grad_norm": 0.13532495498657227, + "learning_rate": 0.0001652301919468899, + "loss": 0.1673, + "step": 12054 + }, + { + "epoch": 0.8698004978534579, + "grad_norm": 0.13908350467681885, + "learning_rate": 0.00016522730552749314, + "loss": 0.131, + "step": 12055 + }, + { + "epoch": 0.8698726505285184, + "grad_norm": 0.150166854262352, + "learning_rate": 0.0001652244191080964, + "loss": 0.1332, + "step": 12056 + }, + { + "epoch": 0.8699448032035788, + "grad_norm": 0.13706979155540466, + "learning_rate": 0.00016522153268869966, + "loss": 0.1606, + "step": 12057 + }, + { + "epoch": 0.8700169558786393, + "grad_norm": 0.13957710564136505, + "learning_rate": 0.00016521864626930295, + "loss": 0.1066, + "step": 12058 + }, + { + "epoch": 0.8700891085536996, + "grad_norm": 0.12413793802261353, + "learning_rate": 0.00016521575984990622, + "loss": 0.1668, + "step": 12059 + }, + { + "epoch": 0.87016126122876, + "grad_norm": 0.13130804896354675, + "learning_rate": 0.00016521287343050945, + "loss": 0.1344, + "step": 12060 + }, + { + "epoch": 0.8702334139038205, + "grad_norm": 0.11016803979873657, + "learning_rate": 0.00016520998701111272, + "loss": 0.163, + "step": 12061 + }, + { + "epoch": 0.8703055665788809, + "grad_norm": 0.1339435875415802, + "learning_rate": 0.00016520710059171598, + "loss": 0.0847, + "step": 12062 + }, + { + "epoch": 0.8703777192539414, + "grad_norm": 0.13038188219070435, + "learning_rate": 0.00016520421417231924, + "loss": 0.1442, + "step": 12063 + }, + { + "epoch": 0.8704498719290018, + "grad_norm": 0.16010652482509613, + "learning_rate": 0.0001652013277529225, + "loss": 0.1278, + "step": 12064 + }, + { + "epoch": 0.8705220246040623, + "grad_norm": 0.11903893947601318, + "learning_rate": 0.00016519844133352577, + "loss": 0.1064, + "step": 12065 + }, + { + "epoch": 0.8705941772791226, + "grad_norm": 0.10911662876605988, + "learning_rate": 0.00016519555491412903, + "loss": 0.1323, + "step": 12066 + }, + { + "epoch": 0.870666329954183, + "grad_norm": 0.12734061479568481, + "learning_rate": 0.0001651926684947323, + "loss": 0.1139, + "step": 12067 + }, + { + "epoch": 0.8707384826292435, + "grad_norm": 0.1067054495215416, + "learning_rate": 0.00016518978207533556, + "loss": 0.1328, + "step": 12068 + }, + { + "epoch": 0.8708106353043039, + "grad_norm": 0.11850341409444809, + "learning_rate": 0.00016518689565593882, + "loss": 0.1604, + "step": 12069 + }, + { + "epoch": 0.8708827879793644, + "grad_norm": 0.13869835436344147, + "learning_rate": 0.00016518400923654208, + "loss": 0.1415, + "step": 12070 + }, + { + "epoch": 0.8709549406544248, + "grad_norm": 0.10934671759605408, + "learning_rate": 0.00016518112281714532, + "loss": 0.185, + "step": 12071 + }, + { + "epoch": 0.8710270933294851, + "grad_norm": 0.09486602246761322, + "learning_rate": 0.0001651782363977486, + "loss": 0.2053, + "step": 12072 + }, + { + "epoch": 0.8710992460045456, + "grad_norm": 0.1067877933382988, + "learning_rate": 0.00016517534997835187, + "loss": 0.1463, + "step": 12073 + }, + { + "epoch": 0.871171398679606, + "grad_norm": 0.13438858091831207, + "learning_rate": 0.00016517246355895513, + "loss": 0.1328, + "step": 12074 + }, + { + "epoch": 0.8712435513546665, + "grad_norm": 0.11246223002672195, + "learning_rate": 0.0001651695771395584, + "loss": 0.1913, + "step": 12075 + }, + { + "epoch": 0.8713157040297269, + "grad_norm": 0.15119744837284088, + "learning_rate": 0.00016516669072016163, + "loss": 0.1686, + "step": 12076 + }, + { + "epoch": 0.8713878567047874, + "grad_norm": 0.11665923148393631, + "learning_rate": 0.0001651638043007649, + "loss": 0.1706, + "step": 12077 + }, + { + "epoch": 0.8714600093798478, + "grad_norm": 0.11429120600223541, + "learning_rate": 0.00016516091788136816, + "loss": 0.1297, + "step": 12078 + }, + { + "epoch": 0.8715321620549081, + "grad_norm": 0.11058809608221054, + "learning_rate": 0.00016515803146197145, + "loss": 0.1065, + "step": 12079 + }, + { + "epoch": 0.8716043147299686, + "grad_norm": 0.12536856532096863, + "learning_rate": 0.0001651551450425747, + "loss": 0.1183, + "step": 12080 + }, + { + "epoch": 0.871676467405029, + "grad_norm": 0.11672108620405197, + "learning_rate": 0.00016515225862317795, + "loss": 0.1308, + "step": 12081 + }, + { + "epoch": 0.8717486200800895, + "grad_norm": 0.1285310685634613, + "learning_rate": 0.0001651493722037812, + "loss": 0.1425, + "step": 12082 + }, + { + "epoch": 0.8718207727551499, + "grad_norm": 0.12578968703746796, + "learning_rate": 0.00016514648578438447, + "loss": 0.1317, + "step": 12083 + }, + { + "epoch": 0.8718929254302104, + "grad_norm": 0.14941126108169556, + "learning_rate": 0.00016514359936498774, + "loss": 0.2002, + "step": 12084 + }, + { + "epoch": 0.8719650781052708, + "grad_norm": 0.1489430069923401, + "learning_rate": 0.000165140712945591, + "loss": 0.1426, + "step": 12085 + }, + { + "epoch": 0.8720372307803311, + "grad_norm": 0.12630683183670044, + "learning_rate": 0.00016513782652619426, + "loss": 0.1756, + "step": 12086 + }, + { + "epoch": 0.8721093834553916, + "grad_norm": 0.1323336958885193, + "learning_rate": 0.00016513494010679753, + "loss": 0.1421, + "step": 12087 + }, + { + "epoch": 0.872181536130452, + "grad_norm": 0.13433264195919037, + "learning_rate": 0.0001651320536874008, + "loss": 0.1435, + "step": 12088 + }, + { + "epoch": 0.8722536888055125, + "grad_norm": 0.1453068107366562, + "learning_rate": 0.00016512916726800405, + "loss": 0.151, + "step": 12089 + }, + { + "epoch": 0.8723258414805729, + "grad_norm": 0.14631931483745575, + "learning_rate": 0.00016512628084860731, + "loss": 0.1296, + "step": 12090 + }, + { + "epoch": 0.8723979941556333, + "grad_norm": 0.11547321081161499, + "learning_rate": 0.00016512339442921058, + "loss": 0.1698, + "step": 12091 + }, + { + "epoch": 0.8724701468306938, + "grad_norm": 0.13233087956905365, + "learning_rate": 0.0001651205080098138, + "loss": 0.1661, + "step": 12092 + }, + { + "epoch": 0.8725422995057541, + "grad_norm": 0.13904765248298645, + "learning_rate": 0.0001651176215904171, + "loss": 0.1652, + "step": 12093 + }, + { + "epoch": 0.8726144521808146, + "grad_norm": 0.11811047792434692, + "learning_rate": 0.00016511473517102037, + "loss": 0.1001, + "step": 12094 + }, + { + "epoch": 0.872686604855875, + "grad_norm": 0.1405722051858902, + "learning_rate": 0.00016511184875162363, + "loss": 0.1107, + "step": 12095 + }, + { + "epoch": 0.8727587575309355, + "grad_norm": 0.1355287730693817, + "learning_rate": 0.0001651089623322269, + "loss": 0.1465, + "step": 12096 + }, + { + "epoch": 0.8728309102059959, + "grad_norm": 0.13099505007266998, + "learning_rate": 0.00016510607591283013, + "loss": 0.1411, + "step": 12097 + }, + { + "epoch": 0.8729030628810563, + "grad_norm": 0.1373823583126068, + "learning_rate": 0.0001651031894934334, + "loss": 0.1878, + "step": 12098 + }, + { + "epoch": 0.8729752155561168, + "grad_norm": 0.1262364238500595, + "learning_rate": 0.00016510030307403665, + "loss": 0.1755, + "step": 12099 + }, + { + "epoch": 0.8730473682311771, + "grad_norm": 0.1368231475353241, + "learning_rate": 0.00016509741665463994, + "loss": 0.1496, + "step": 12100 + }, + { + "epoch": 0.8731195209062376, + "grad_norm": 0.1449502855539322, + "learning_rate": 0.0001650945302352432, + "loss": 0.1357, + "step": 12101 + }, + { + "epoch": 0.873191673581298, + "grad_norm": 0.14914466440677643, + "learning_rate": 0.00016509164381584644, + "loss": 0.1452, + "step": 12102 + }, + { + "epoch": 0.8732638262563585, + "grad_norm": 0.13216231763362885, + "learning_rate": 0.0001650887573964497, + "loss": 0.1819, + "step": 12103 + }, + { + "epoch": 0.8733359789314189, + "grad_norm": 0.14980573952198029, + "learning_rate": 0.00016508587097705297, + "loss": 0.1577, + "step": 12104 + }, + { + "epoch": 0.8734081316064793, + "grad_norm": 0.11215804517269135, + "learning_rate": 0.00016508298455765623, + "loss": 0.1322, + "step": 12105 + }, + { + "epoch": 0.8734802842815398, + "grad_norm": 0.11477739363908768, + "learning_rate": 0.0001650800981382595, + "loss": 0.1375, + "step": 12106 + }, + { + "epoch": 0.8735524369566001, + "grad_norm": 0.20511171221733093, + "learning_rate": 0.00016507721171886276, + "loss": 0.1463, + "step": 12107 + }, + { + "epoch": 0.8736245896316606, + "grad_norm": 0.13001227378845215, + "learning_rate": 0.00016507432529946602, + "loss": 0.1704, + "step": 12108 + }, + { + "epoch": 0.873696742306721, + "grad_norm": 0.12199117243289948, + "learning_rate": 0.00016507143888006928, + "loss": 0.1092, + "step": 12109 + }, + { + "epoch": 0.8737688949817815, + "grad_norm": 0.14155781269073486, + "learning_rate": 0.00016506855246067255, + "loss": 0.1333, + "step": 12110 + }, + { + "epoch": 0.8738410476568419, + "grad_norm": 0.13451431691646576, + "learning_rate": 0.0001650656660412758, + "loss": 0.1795, + "step": 12111 + }, + { + "epoch": 0.8739132003319023, + "grad_norm": 0.1325516253709793, + "learning_rate": 0.00016506277962187907, + "loss": 0.161, + "step": 12112 + }, + { + "epoch": 0.8739853530069628, + "grad_norm": 0.11864909529685974, + "learning_rate": 0.0001650598932024823, + "loss": 0.1461, + "step": 12113 + }, + { + "epoch": 0.8740575056820231, + "grad_norm": 0.12976987659931183, + "learning_rate": 0.0001650570067830856, + "loss": 0.1994, + "step": 12114 + }, + { + "epoch": 0.8741296583570836, + "grad_norm": 0.1296241134405136, + "learning_rate": 0.00016505412036368886, + "loss": 0.1489, + "step": 12115 + }, + { + "epoch": 0.874201811032144, + "grad_norm": 0.1189151182770729, + "learning_rate": 0.00016505123394429212, + "loss": 0.1615, + "step": 12116 + }, + { + "epoch": 0.8742739637072044, + "grad_norm": 0.13982558250427246, + "learning_rate": 0.00016504834752489539, + "loss": 0.1097, + "step": 12117 + }, + { + "epoch": 0.8743461163822649, + "grad_norm": 0.1390528380870819, + "learning_rate": 0.00016504546110549862, + "loss": 0.1315, + "step": 12118 + }, + { + "epoch": 0.8744182690573253, + "grad_norm": 0.12935444712638855, + "learning_rate": 0.00016504257468610188, + "loss": 0.1407, + "step": 12119 + }, + { + "epoch": 0.8744904217323858, + "grad_norm": 0.13345961272716522, + "learning_rate": 0.00016503968826670515, + "loss": 0.1685, + "step": 12120 + }, + { + "epoch": 0.8745625744074461, + "grad_norm": 0.1496542990207672, + "learning_rate": 0.00016503680184730844, + "loss": 0.1529, + "step": 12121 + }, + { + "epoch": 0.8746347270825066, + "grad_norm": 0.15997420251369476, + "learning_rate": 0.0001650339154279117, + "loss": 0.1783, + "step": 12122 + }, + { + "epoch": 0.874706879757567, + "grad_norm": 0.13938643038272858, + "learning_rate": 0.00016503102900851494, + "loss": 0.1333, + "step": 12123 + }, + { + "epoch": 0.8747790324326274, + "grad_norm": 0.15764518082141876, + "learning_rate": 0.0001650281425891182, + "loss": 0.1774, + "step": 12124 + }, + { + "epoch": 0.8748511851076879, + "grad_norm": 0.12833885848522186, + "learning_rate": 0.00016502525616972146, + "loss": 0.1454, + "step": 12125 + }, + { + "epoch": 0.8749233377827483, + "grad_norm": 0.1307978481054306, + "learning_rate": 0.00016502236975032473, + "loss": 0.1146, + "step": 12126 + }, + { + "epoch": 0.8749954904578088, + "grad_norm": 0.1131959781050682, + "learning_rate": 0.000165019483330928, + "loss": 0.1657, + "step": 12127 + }, + { + "epoch": 0.8750676431328691, + "grad_norm": 0.11155702918767929, + "learning_rate": 0.00016501659691153125, + "loss": 0.1831, + "step": 12128 + }, + { + "epoch": 0.8751397958079296, + "grad_norm": 0.1364208310842514, + "learning_rate": 0.00016501371049213451, + "loss": 0.1599, + "step": 12129 + }, + { + "epoch": 0.87521194848299, + "grad_norm": 0.11704082787036896, + "learning_rate": 0.00016501082407273778, + "loss": 0.1455, + "step": 12130 + }, + { + "epoch": 0.8752841011580504, + "grad_norm": 0.12343720346689224, + "learning_rate": 0.00016500793765334104, + "loss": 0.1096, + "step": 12131 + }, + { + "epoch": 0.8753562538331109, + "grad_norm": 0.119235560297966, + "learning_rate": 0.0001650050512339443, + "loss": 0.1918, + "step": 12132 + }, + { + "epoch": 0.8754284065081713, + "grad_norm": 0.11352014541625977, + "learning_rate": 0.00016500216481454757, + "loss": 0.1756, + "step": 12133 + }, + { + "epoch": 0.8755005591832317, + "grad_norm": 0.1395050287246704, + "learning_rate": 0.0001649992783951508, + "loss": 0.1255, + "step": 12134 + }, + { + "epoch": 0.8755727118582921, + "grad_norm": 0.11708272248506546, + "learning_rate": 0.0001649963919757541, + "loss": 0.1755, + "step": 12135 + }, + { + "epoch": 0.8756448645333526, + "grad_norm": 0.11024229973554611, + "learning_rate": 0.00016499350555635735, + "loss": 0.1124, + "step": 12136 + }, + { + "epoch": 0.875717017208413, + "grad_norm": 0.12274841964244843, + "learning_rate": 0.00016499061913696062, + "loss": 0.147, + "step": 12137 + }, + { + "epoch": 0.8757891698834734, + "grad_norm": 0.11719904094934464, + "learning_rate": 0.00016498773271756388, + "loss": 0.1524, + "step": 12138 + }, + { + "epoch": 0.8758613225585339, + "grad_norm": 0.13587170839309692, + "learning_rate": 0.00016498484629816712, + "loss": 0.1441, + "step": 12139 + }, + { + "epoch": 0.8759334752335943, + "grad_norm": 0.1277083307504654, + "learning_rate": 0.00016498195987877038, + "loss": 0.1409, + "step": 12140 + }, + { + "epoch": 0.8760056279086547, + "grad_norm": 0.18424269556999207, + "learning_rate": 0.00016497907345937364, + "loss": 0.1853, + "step": 12141 + }, + { + "epoch": 0.8760777805837151, + "grad_norm": 0.13032427430152893, + "learning_rate": 0.00016497618703997693, + "loss": 0.1719, + "step": 12142 + }, + { + "epoch": 0.8761499332587755, + "grad_norm": 0.11301422864198685, + "learning_rate": 0.0001649733006205802, + "loss": 0.1651, + "step": 12143 + }, + { + "epoch": 0.876222085933836, + "grad_norm": 0.1141270324587822, + "learning_rate": 0.00016497041420118343, + "loss": 0.1389, + "step": 12144 + }, + { + "epoch": 0.8762942386088964, + "grad_norm": 0.14169326424598694, + "learning_rate": 0.0001649675277817867, + "loss": 0.1953, + "step": 12145 + }, + { + "epoch": 0.8763663912839569, + "grad_norm": 0.1484052836894989, + "learning_rate": 0.00016496464136238996, + "loss": 0.1681, + "step": 12146 + }, + { + "epoch": 0.8764385439590173, + "grad_norm": 0.12363431602716446, + "learning_rate": 0.00016496175494299322, + "loss": 0.1425, + "step": 12147 + }, + { + "epoch": 0.8765106966340777, + "grad_norm": 0.18553423881530762, + "learning_rate": 0.00016495886852359648, + "loss": 0.1713, + "step": 12148 + }, + { + "epoch": 0.8765828493091381, + "grad_norm": 0.12179651856422424, + "learning_rate": 0.00016495598210419975, + "loss": 0.1991, + "step": 12149 + }, + { + "epoch": 0.8766550019841985, + "grad_norm": 0.11454205960035324, + "learning_rate": 0.000164953095684803, + "loss": 0.1546, + "step": 12150 + }, + { + "epoch": 0.876727154659259, + "grad_norm": 0.1588776707649231, + "learning_rate": 0.00016495020926540627, + "loss": 0.0914, + "step": 12151 + }, + { + "epoch": 0.8767993073343194, + "grad_norm": 0.10719560831785202, + "learning_rate": 0.00016494732284600953, + "loss": 0.1348, + "step": 12152 + }, + { + "epoch": 0.8768714600093799, + "grad_norm": 0.0964108407497406, + "learning_rate": 0.0001649444364266128, + "loss": 0.1281, + "step": 12153 + }, + { + "epoch": 0.8769436126844403, + "grad_norm": 0.13987363874912262, + "learning_rate": 0.00016494155000721606, + "loss": 0.1637, + "step": 12154 + }, + { + "epoch": 0.8770157653595007, + "grad_norm": 0.12305955588817596, + "learning_rate": 0.0001649386635878193, + "loss": 0.1668, + "step": 12155 + }, + { + "epoch": 0.8770879180345611, + "grad_norm": 0.1471739113330841, + "learning_rate": 0.00016493577716842259, + "loss": 0.1353, + "step": 12156 + }, + { + "epoch": 0.8771600707096215, + "grad_norm": 0.12235169112682343, + "learning_rate": 0.00016493289074902585, + "loss": 0.1549, + "step": 12157 + }, + { + "epoch": 0.877232223384682, + "grad_norm": 0.10822771489620209, + "learning_rate": 0.0001649300043296291, + "loss": 0.1443, + "step": 12158 + }, + { + "epoch": 0.8773043760597424, + "grad_norm": 0.15206243097782135, + "learning_rate": 0.00016492711791023237, + "loss": 0.1538, + "step": 12159 + }, + { + "epoch": 0.8773765287348029, + "grad_norm": 0.12351173907518387, + "learning_rate": 0.0001649242314908356, + "loss": 0.1008, + "step": 12160 + }, + { + "epoch": 0.8774486814098633, + "grad_norm": 0.14727401733398438, + "learning_rate": 0.00016492134507143887, + "loss": 0.1452, + "step": 12161 + }, + { + "epoch": 0.8775208340849237, + "grad_norm": 0.1338951289653778, + "learning_rate": 0.00016491845865204214, + "loss": 0.1469, + "step": 12162 + }, + { + "epoch": 0.8775929867599841, + "grad_norm": 0.13076646625995636, + "learning_rate": 0.00016491557223264543, + "loss": 0.1715, + "step": 12163 + }, + { + "epoch": 0.8776651394350445, + "grad_norm": 0.12688609957695007, + "learning_rate": 0.0001649126858132487, + "loss": 0.1392, + "step": 12164 + }, + { + "epoch": 0.877737292110105, + "grad_norm": 0.13240867853164673, + "learning_rate": 0.00016490979939385192, + "loss": 0.1551, + "step": 12165 + }, + { + "epoch": 0.8778094447851654, + "grad_norm": 0.11780638992786407, + "learning_rate": 0.0001649069129744552, + "loss": 0.131, + "step": 12166 + }, + { + "epoch": 0.8778815974602259, + "grad_norm": 0.11551574617624283, + "learning_rate": 0.00016490402655505845, + "loss": 0.1378, + "step": 12167 + }, + { + "epoch": 0.8779537501352863, + "grad_norm": 0.12620681524276733, + "learning_rate": 0.0001649011401356617, + "loss": 0.1612, + "step": 12168 + }, + { + "epoch": 0.8780259028103466, + "grad_norm": 0.1293182671070099, + "learning_rate": 0.00016489825371626498, + "loss": 0.153, + "step": 12169 + }, + { + "epoch": 0.8780980554854071, + "grad_norm": 0.12778621912002563, + "learning_rate": 0.00016489536729686824, + "loss": 0.1334, + "step": 12170 + }, + { + "epoch": 0.8781702081604675, + "grad_norm": 0.145514577627182, + "learning_rate": 0.0001648924808774715, + "loss": 0.1543, + "step": 12171 + }, + { + "epoch": 0.878242360835528, + "grad_norm": 0.132717102766037, + "learning_rate": 0.00016488959445807477, + "loss": 0.1442, + "step": 12172 + }, + { + "epoch": 0.8783145135105884, + "grad_norm": 0.12787772715091705, + "learning_rate": 0.00016488670803867803, + "loss": 0.1409, + "step": 12173 + }, + { + "epoch": 0.8783866661856489, + "grad_norm": 0.13414645195007324, + "learning_rate": 0.0001648838216192813, + "loss": 0.1746, + "step": 12174 + }, + { + "epoch": 0.8784588188607093, + "grad_norm": 0.1296178102493286, + "learning_rate": 0.00016488093519988455, + "loss": 0.146, + "step": 12175 + }, + { + "epoch": 0.8785309715357696, + "grad_norm": 0.16228878498077393, + "learning_rate": 0.00016487804878048782, + "loss": 0.1699, + "step": 12176 + }, + { + "epoch": 0.8786031242108301, + "grad_norm": 0.12227675318717957, + "learning_rate": 0.00016487516236109108, + "loss": 0.1425, + "step": 12177 + }, + { + "epoch": 0.8786752768858905, + "grad_norm": 0.20164965093135834, + "learning_rate": 0.00016487227594169434, + "loss": 0.1851, + "step": 12178 + }, + { + "epoch": 0.878747429560951, + "grad_norm": 0.1277187317609787, + "learning_rate": 0.0001648693895222976, + "loss": 0.1085, + "step": 12179 + }, + { + "epoch": 0.8788195822360114, + "grad_norm": 0.131092831492424, + "learning_rate": 0.00016486650310290087, + "loss": 0.1552, + "step": 12180 + }, + { + "epoch": 0.8788917349110719, + "grad_norm": 0.119987852871418, + "learning_rate": 0.00016486361668350413, + "loss": 0.1273, + "step": 12181 + }, + { + "epoch": 0.8789638875861323, + "grad_norm": 0.13632720708847046, + "learning_rate": 0.00016486073026410737, + "loss": 0.1183, + "step": 12182 + }, + { + "epoch": 0.8790360402611926, + "grad_norm": 0.14824025332927704, + "learning_rate": 0.00016485784384471063, + "loss": 0.1434, + "step": 12183 + }, + { + "epoch": 0.8791081929362531, + "grad_norm": 0.12320008873939514, + "learning_rate": 0.00016485495742531392, + "loss": 0.128, + "step": 12184 + }, + { + "epoch": 0.8791803456113135, + "grad_norm": 0.10985496640205383, + "learning_rate": 0.00016485207100591718, + "loss": 0.1667, + "step": 12185 + }, + { + "epoch": 0.879252498286374, + "grad_norm": 0.12912528216838837, + "learning_rate": 0.00016484918458652045, + "loss": 0.1368, + "step": 12186 + }, + { + "epoch": 0.8793246509614344, + "grad_norm": 0.136422798037529, + "learning_rate": 0.00016484629816712368, + "loss": 0.114, + "step": 12187 + }, + { + "epoch": 0.8793968036364949, + "grad_norm": 0.11968207359313965, + "learning_rate": 0.00016484341174772694, + "loss": 0.098, + "step": 12188 + }, + { + "epoch": 0.8794689563115553, + "grad_norm": 0.11828417330980301, + "learning_rate": 0.0001648405253283302, + "loss": 0.1117, + "step": 12189 + }, + { + "epoch": 0.8795411089866156, + "grad_norm": 0.11578892916440964, + "learning_rate": 0.00016483763890893347, + "loss": 0.1282, + "step": 12190 + }, + { + "epoch": 0.8796132616616761, + "grad_norm": 0.10919444262981415, + "learning_rate": 0.00016483475248953676, + "loss": 0.175, + "step": 12191 + }, + { + "epoch": 0.8796854143367365, + "grad_norm": 0.1338009089231491, + "learning_rate": 0.00016483186607014, + "loss": 0.163, + "step": 12192 + }, + { + "epoch": 0.879757567011797, + "grad_norm": 0.12004167586565018, + "learning_rate": 0.00016482897965074326, + "loss": 0.1751, + "step": 12193 + }, + { + "epoch": 0.8798297196868574, + "grad_norm": 0.14760249853134155, + "learning_rate": 0.00016482609323134652, + "loss": 0.1675, + "step": 12194 + }, + { + "epoch": 0.8799018723619179, + "grad_norm": 0.16406826674938202, + "learning_rate": 0.00016482320681194979, + "loss": 0.1977, + "step": 12195 + }, + { + "epoch": 0.8799740250369782, + "grad_norm": 0.11777998507022858, + "learning_rate": 0.00016482032039255305, + "loss": 0.1638, + "step": 12196 + }, + { + "epoch": 0.8800461777120386, + "grad_norm": 0.12022560834884644, + "learning_rate": 0.0001648174339731563, + "loss": 0.1686, + "step": 12197 + }, + { + "epoch": 0.8801183303870991, + "grad_norm": 0.12894538044929504, + "learning_rate": 0.00016481454755375957, + "loss": 0.1485, + "step": 12198 + }, + { + "epoch": 0.8801904830621595, + "grad_norm": 0.12284479290246964, + "learning_rate": 0.00016481166113436284, + "loss": 0.1721, + "step": 12199 + }, + { + "epoch": 0.88026263573722, + "grad_norm": 0.11004292219877243, + "learning_rate": 0.0001648087747149661, + "loss": 0.1955, + "step": 12200 + }, + { + "epoch": 0.8803347884122804, + "grad_norm": 0.12605997920036316, + "learning_rate": 0.00016480588829556936, + "loss": 0.1145, + "step": 12201 + }, + { + "epoch": 0.8804069410873409, + "grad_norm": 0.125811368227005, + "learning_rate": 0.00016480300187617263, + "loss": 0.1377, + "step": 12202 + }, + { + "epoch": 0.8804790937624012, + "grad_norm": 0.1498311460018158, + "learning_rate": 0.00016480011545677586, + "loss": 0.1361, + "step": 12203 + }, + { + "epoch": 0.8805512464374616, + "grad_norm": 0.12281493097543716, + "learning_rate": 0.00016479722903737912, + "loss": 0.1218, + "step": 12204 + }, + { + "epoch": 0.8806233991125221, + "grad_norm": 0.11560608446598053, + "learning_rate": 0.00016479434261798241, + "loss": 0.129, + "step": 12205 + }, + { + "epoch": 0.8806955517875825, + "grad_norm": 0.13548126816749573, + "learning_rate": 0.00016479145619858568, + "loss": 0.1461, + "step": 12206 + }, + { + "epoch": 0.880767704462643, + "grad_norm": 0.13236747682094574, + "learning_rate": 0.00016478856977918894, + "loss": 0.1244, + "step": 12207 + }, + { + "epoch": 0.8808398571377034, + "grad_norm": 0.13994522392749786, + "learning_rate": 0.00016478568335979218, + "loss": 0.1875, + "step": 12208 + }, + { + "epoch": 0.8809120098127639, + "grad_norm": 0.126194030046463, + "learning_rate": 0.00016478279694039544, + "loss": 0.1366, + "step": 12209 + }, + { + "epoch": 0.8809841624878242, + "grad_norm": 0.11344815045595169, + "learning_rate": 0.0001647799105209987, + "loss": 0.1843, + "step": 12210 + }, + { + "epoch": 0.8810563151628846, + "grad_norm": 0.1344424933195114, + "learning_rate": 0.00016477702410160196, + "loss": 0.1549, + "step": 12211 + }, + { + "epoch": 0.8811284678379451, + "grad_norm": 0.12091745436191559, + "learning_rate": 0.00016477413768220525, + "loss": 0.1731, + "step": 12212 + }, + { + "epoch": 0.8812006205130055, + "grad_norm": 0.1225610002875328, + "learning_rate": 0.0001647712512628085, + "loss": 0.1256, + "step": 12213 + }, + { + "epoch": 0.881272773188066, + "grad_norm": 0.13267618417739868, + "learning_rate": 0.00016476836484341175, + "loss": 0.122, + "step": 12214 + }, + { + "epoch": 0.8813449258631264, + "grad_norm": 0.15878993272781372, + "learning_rate": 0.00016476547842401502, + "loss": 0.1221, + "step": 12215 + }, + { + "epoch": 0.8814170785381868, + "grad_norm": 0.12710832059383392, + "learning_rate": 0.00016476259200461828, + "loss": 0.1372, + "step": 12216 + }, + { + "epoch": 0.8814892312132472, + "grad_norm": 0.12072007358074188, + "learning_rate": 0.00016475970558522154, + "loss": 0.1506, + "step": 12217 + }, + { + "epoch": 0.8815613838883076, + "grad_norm": 0.1832415908575058, + "learning_rate": 0.0001647568191658248, + "loss": 0.1452, + "step": 12218 + }, + { + "epoch": 0.8816335365633681, + "grad_norm": 0.11516207456588745, + "learning_rate": 0.00016475393274642807, + "loss": 0.1508, + "step": 12219 + }, + { + "epoch": 0.8817056892384285, + "grad_norm": 0.12112973630428314, + "learning_rate": 0.00016475104632703133, + "loss": 0.2082, + "step": 12220 + }, + { + "epoch": 0.881777841913489, + "grad_norm": 0.12404236942529678, + "learning_rate": 0.0001647481599076346, + "loss": 0.1129, + "step": 12221 + }, + { + "epoch": 0.8818499945885494, + "grad_norm": 0.12267255783081055, + "learning_rate": 0.00016474527348823786, + "loss": 0.1512, + "step": 12222 + }, + { + "epoch": 0.8819221472636098, + "grad_norm": 0.10327456891536713, + "learning_rate": 0.00016474238706884112, + "loss": 0.1504, + "step": 12223 + }, + { + "epoch": 0.8819942999386702, + "grad_norm": 0.12243549525737762, + "learning_rate": 0.00016473950064944436, + "loss": 0.1422, + "step": 12224 + }, + { + "epoch": 0.8820664526137306, + "grad_norm": 0.10996757447719574, + "learning_rate": 0.00016473661423004762, + "loss": 0.1667, + "step": 12225 + }, + { + "epoch": 0.8821386052887911, + "grad_norm": 0.130668506026268, + "learning_rate": 0.0001647337278106509, + "loss": 0.1026, + "step": 12226 + }, + { + "epoch": 0.8822107579638515, + "grad_norm": 0.12705399096012115, + "learning_rate": 0.00016473084139125417, + "loss": 0.1611, + "step": 12227 + }, + { + "epoch": 0.882282910638912, + "grad_norm": 0.1277860701084137, + "learning_rate": 0.00016472795497185743, + "loss": 0.1216, + "step": 12228 + }, + { + "epoch": 0.8823550633139724, + "grad_norm": 0.11852487176656723, + "learning_rate": 0.00016472506855246067, + "loss": 0.1557, + "step": 12229 + }, + { + "epoch": 0.8824272159890328, + "grad_norm": 0.12020008265972137, + "learning_rate": 0.00016472218213306393, + "loss": 0.1284, + "step": 12230 + }, + { + "epoch": 0.8824993686640932, + "grad_norm": 0.13153263926506042, + "learning_rate": 0.0001647192957136672, + "loss": 0.1085, + "step": 12231 + }, + { + "epoch": 0.8825715213391536, + "grad_norm": 0.15343275666236877, + "learning_rate": 0.00016471640929427046, + "loss": 0.1608, + "step": 12232 + }, + { + "epoch": 0.8826436740142141, + "grad_norm": 0.1277267336845398, + "learning_rate": 0.00016471352287487375, + "loss": 0.1511, + "step": 12233 + }, + { + "epoch": 0.8827158266892745, + "grad_norm": 0.1161343902349472, + "learning_rate": 0.00016471063645547699, + "loss": 0.1492, + "step": 12234 + }, + { + "epoch": 0.882787979364335, + "grad_norm": 0.1242692694067955, + "learning_rate": 0.00016470775003608025, + "loss": 0.1598, + "step": 12235 + }, + { + "epoch": 0.8828601320393954, + "grad_norm": 0.14076781272888184, + "learning_rate": 0.0001647048636166835, + "loss": 0.1475, + "step": 12236 + }, + { + "epoch": 0.8829322847144558, + "grad_norm": 0.1138407289981842, + "learning_rate": 0.00016470197719728677, + "loss": 0.1583, + "step": 12237 + }, + { + "epoch": 0.8830044373895162, + "grad_norm": 0.12430385500192642, + "learning_rate": 0.00016469909077789004, + "loss": 0.1189, + "step": 12238 + }, + { + "epoch": 0.8830765900645766, + "grad_norm": 0.13104628026485443, + "learning_rate": 0.0001646962043584933, + "loss": 0.1494, + "step": 12239 + }, + { + "epoch": 0.8831487427396371, + "grad_norm": 0.12466448545455933, + "learning_rate": 0.00016469331793909656, + "loss": 0.1471, + "step": 12240 + }, + { + "epoch": 0.8832208954146975, + "grad_norm": 0.13318614661693573, + "learning_rate": 0.00016469043151969983, + "loss": 0.1135, + "step": 12241 + }, + { + "epoch": 0.883293048089758, + "grad_norm": 0.1286313533782959, + "learning_rate": 0.0001646875451003031, + "loss": 0.1148, + "step": 12242 + }, + { + "epoch": 0.8833652007648184, + "grad_norm": 0.11111229658126831, + "learning_rate": 0.00016468465868090635, + "loss": 0.1358, + "step": 12243 + }, + { + "epoch": 0.8834373534398788, + "grad_norm": 0.1381540298461914, + "learning_rate": 0.00016468177226150961, + "loss": 0.1428, + "step": 12244 + }, + { + "epoch": 0.8835095061149392, + "grad_norm": 0.11672315746545792, + "learning_rate": 0.00016467888584211285, + "loss": 0.127, + "step": 12245 + }, + { + "epoch": 0.8835816587899996, + "grad_norm": 0.17194171249866486, + "learning_rate": 0.0001646759994227161, + "loss": 0.1502, + "step": 12246 + }, + { + "epoch": 0.8836538114650601, + "grad_norm": 0.1031903326511383, + "learning_rate": 0.00016467311300331938, + "loss": 0.1147, + "step": 12247 + }, + { + "epoch": 0.8837259641401205, + "grad_norm": 0.12334106117486954, + "learning_rate": 0.00016467022658392267, + "loss": 0.121, + "step": 12248 + }, + { + "epoch": 0.883798116815181, + "grad_norm": 0.13263913989067078, + "learning_rate": 0.00016466734016452593, + "loss": 0.1376, + "step": 12249 + }, + { + "epoch": 0.8838702694902414, + "grad_norm": 0.12783348560333252, + "learning_rate": 0.00016466445374512916, + "loss": 0.133, + "step": 12250 + }, + { + "epoch": 0.8839424221653018, + "grad_norm": 0.1204017698764801, + "learning_rate": 0.00016466156732573243, + "loss": 0.1071, + "step": 12251 + }, + { + "epoch": 0.8840145748403622, + "grad_norm": 0.12959232926368713, + "learning_rate": 0.0001646586809063357, + "loss": 0.1584, + "step": 12252 + }, + { + "epoch": 0.8840867275154226, + "grad_norm": 0.1044325977563858, + "learning_rate": 0.00016465579448693895, + "loss": 0.1868, + "step": 12253 + }, + { + "epoch": 0.884158880190483, + "grad_norm": 0.13037028908729553, + "learning_rate": 0.00016465290806754222, + "loss": 0.1786, + "step": 12254 + }, + { + "epoch": 0.8842310328655435, + "grad_norm": 0.10903565585613251, + "learning_rate": 0.00016465002164814548, + "loss": 0.1822, + "step": 12255 + }, + { + "epoch": 0.8843031855406039, + "grad_norm": 0.10615143924951553, + "learning_rate": 0.00016464713522874874, + "loss": 0.1216, + "step": 12256 + }, + { + "epoch": 0.8843753382156644, + "grad_norm": 0.10600007325410843, + "learning_rate": 0.000164644248809352, + "loss": 0.2071, + "step": 12257 + }, + { + "epoch": 0.8844474908907247, + "grad_norm": 0.19849993288516998, + "learning_rate": 0.00016464136238995527, + "loss": 0.1893, + "step": 12258 + }, + { + "epoch": 0.8845196435657852, + "grad_norm": 0.13202063739299774, + "learning_rate": 0.00016463847597055853, + "loss": 0.198, + "step": 12259 + }, + { + "epoch": 0.8845917962408456, + "grad_norm": 0.11806654185056686, + "learning_rate": 0.0001646355895511618, + "loss": 0.1756, + "step": 12260 + }, + { + "epoch": 0.884663948915906, + "grad_norm": 0.11329864710569382, + "learning_rate": 0.00016463270313176503, + "loss": 0.1644, + "step": 12261 + }, + { + "epoch": 0.8847361015909665, + "grad_norm": 0.11153256148099899, + "learning_rate": 0.00016462981671236832, + "loss": 0.1735, + "step": 12262 + }, + { + "epoch": 0.8848082542660269, + "grad_norm": 0.11486808955669403, + "learning_rate": 0.00016462693029297158, + "loss": 0.1716, + "step": 12263 + }, + { + "epoch": 0.8848804069410874, + "grad_norm": 0.13077178597450256, + "learning_rate": 0.00016462404387357485, + "loss": 0.1743, + "step": 12264 + }, + { + "epoch": 0.8849525596161477, + "grad_norm": 0.11764027178287506, + "learning_rate": 0.0001646211574541781, + "loss": 0.1531, + "step": 12265 + }, + { + "epoch": 0.8850247122912082, + "grad_norm": 0.19978061318397522, + "learning_rate": 0.00016461827103478134, + "loss": 0.1957, + "step": 12266 + }, + { + "epoch": 0.8850968649662686, + "grad_norm": 0.13150623440742493, + "learning_rate": 0.0001646153846153846, + "loss": 0.1832, + "step": 12267 + }, + { + "epoch": 0.885169017641329, + "grad_norm": 0.13365094363689423, + "learning_rate": 0.00016461249819598787, + "loss": 0.1367, + "step": 12268 + }, + { + "epoch": 0.8852411703163895, + "grad_norm": 0.143122598528862, + "learning_rate": 0.00016460961177659116, + "loss": 0.1552, + "step": 12269 + }, + { + "epoch": 0.8853133229914499, + "grad_norm": 0.13805808126926422, + "learning_rate": 0.00016460672535719442, + "loss": 0.1188, + "step": 12270 + }, + { + "epoch": 0.8853854756665104, + "grad_norm": 0.12481489777565002, + "learning_rate": 0.00016460383893779766, + "loss": 0.1625, + "step": 12271 + }, + { + "epoch": 0.8854576283415707, + "grad_norm": 0.12569913268089294, + "learning_rate": 0.00016460095251840092, + "loss": 0.1273, + "step": 12272 + }, + { + "epoch": 0.8855297810166312, + "grad_norm": 0.12131577730178833, + "learning_rate": 0.00016459806609900418, + "loss": 0.1331, + "step": 12273 + }, + { + "epoch": 0.8856019336916916, + "grad_norm": 0.14365747570991516, + "learning_rate": 0.00016459517967960745, + "loss": 0.1433, + "step": 12274 + }, + { + "epoch": 0.885674086366752, + "grad_norm": 0.11679968982934952, + "learning_rate": 0.0001645922932602107, + "loss": 0.1306, + "step": 12275 + }, + { + "epoch": 0.8857462390418125, + "grad_norm": 0.1197090595960617, + "learning_rate": 0.00016458940684081397, + "loss": 0.0986, + "step": 12276 + }, + { + "epoch": 0.8858183917168729, + "grad_norm": 0.1241610199213028, + "learning_rate": 0.00016458652042141724, + "loss": 0.1339, + "step": 12277 + }, + { + "epoch": 0.8858905443919334, + "grad_norm": 0.1254902333021164, + "learning_rate": 0.0001645836340020205, + "loss": 0.1433, + "step": 12278 + }, + { + "epoch": 0.8859626970669937, + "grad_norm": 0.14213827252388, + "learning_rate": 0.00016458074758262376, + "loss": 0.1072, + "step": 12279 + }, + { + "epoch": 0.8860348497420542, + "grad_norm": 0.1069590151309967, + "learning_rate": 0.00016457786116322703, + "loss": 0.1672, + "step": 12280 + }, + { + "epoch": 0.8861070024171146, + "grad_norm": 0.12080402672290802, + "learning_rate": 0.0001645749747438303, + "loss": 0.1717, + "step": 12281 + }, + { + "epoch": 0.886179155092175, + "grad_norm": 0.11748300492763519, + "learning_rate": 0.00016457208832443355, + "loss": 0.0995, + "step": 12282 + }, + { + "epoch": 0.8862513077672355, + "grad_norm": 0.11120393127202988, + "learning_rate": 0.00016456920190503681, + "loss": 0.1702, + "step": 12283 + }, + { + "epoch": 0.8863234604422959, + "grad_norm": 0.11398053914308548, + "learning_rate": 0.00016456631548564008, + "loss": 0.1158, + "step": 12284 + }, + { + "epoch": 0.8863956131173564, + "grad_norm": 0.10123290121555328, + "learning_rate": 0.00016456342906624334, + "loss": 0.1739, + "step": 12285 + }, + { + "epoch": 0.8864677657924167, + "grad_norm": 0.11475709825754166, + "learning_rate": 0.0001645605426468466, + "loss": 0.1122, + "step": 12286 + }, + { + "epoch": 0.8865399184674772, + "grad_norm": 0.13295650482177734, + "learning_rate": 0.00016455765622744987, + "loss": 0.1475, + "step": 12287 + }, + { + "epoch": 0.8866120711425376, + "grad_norm": 0.1309804469347, + "learning_rate": 0.0001645547698080531, + "loss": 0.1352, + "step": 12288 + }, + { + "epoch": 0.886684223817598, + "grad_norm": 0.17298905551433563, + "learning_rate": 0.00016455188338865636, + "loss": 0.1796, + "step": 12289 + }, + { + "epoch": 0.8867563764926585, + "grad_norm": 0.11581968516111374, + "learning_rate": 0.00016454899696925965, + "loss": 0.1089, + "step": 12290 + }, + { + "epoch": 0.8868285291677189, + "grad_norm": 0.13596712052822113, + "learning_rate": 0.00016454611054986292, + "loss": 0.1785, + "step": 12291 + }, + { + "epoch": 0.8869006818427794, + "grad_norm": 0.12236089259386063, + "learning_rate": 0.00016454322413046618, + "loss": 0.108, + "step": 12292 + }, + { + "epoch": 0.8869728345178397, + "grad_norm": 0.1264708787202835, + "learning_rate": 0.00016454033771106942, + "loss": 0.1284, + "step": 12293 + }, + { + "epoch": 0.8870449871929001, + "grad_norm": 0.12240929901599884, + "learning_rate": 0.00016453745129167268, + "loss": 0.1002, + "step": 12294 + }, + { + "epoch": 0.8871171398679606, + "grad_norm": 0.12874439358711243, + "learning_rate": 0.00016453456487227594, + "loss": 0.1123, + "step": 12295 + }, + { + "epoch": 0.887189292543021, + "grad_norm": 0.1249275654554367, + "learning_rate": 0.0001645316784528792, + "loss": 0.1654, + "step": 12296 + }, + { + "epoch": 0.8872614452180815, + "grad_norm": 0.1129474937915802, + "learning_rate": 0.0001645287920334825, + "loss": 0.1105, + "step": 12297 + }, + { + "epoch": 0.8873335978931419, + "grad_norm": 0.1413096785545349, + "learning_rate": 0.00016452590561408573, + "loss": 0.1364, + "step": 12298 + }, + { + "epoch": 0.8874057505682024, + "grad_norm": 0.19914685189723969, + "learning_rate": 0.000164523019194689, + "loss": 0.1652, + "step": 12299 + }, + { + "epoch": 0.8874779032432627, + "grad_norm": 0.1189395859837532, + "learning_rate": 0.00016452013277529226, + "loss": 0.1557, + "step": 12300 + }, + { + "epoch": 0.8875500559183231, + "grad_norm": 0.12614993751049042, + "learning_rate": 0.00016451724635589552, + "loss": 0.1148, + "step": 12301 + }, + { + "epoch": 0.8876222085933836, + "grad_norm": 0.12520967423915863, + "learning_rate": 0.00016451435993649878, + "loss": 0.1256, + "step": 12302 + }, + { + "epoch": 0.887694361268444, + "grad_norm": 0.117799773812294, + "learning_rate": 0.00016451147351710205, + "loss": 0.164, + "step": 12303 + }, + { + "epoch": 0.8877665139435045, + "grad_norm": 0.1549709290266037, + "learning_rate": 0.0001645085870977053, + "loss": 0.1556, + "step": 12304 + }, + { + "epoch": 0.8878386666185649, + "grad_norm": 0.1547599583864212, + "learning_rate": 0.00016450570067830857, + "loss": 0.1336, + "step": 12305 + }, + { + "epoch": 0.8879108192936254, + "grad_norm": 0.1339336782693863, + "learning_rate": 0.00016450281425891183, + "loss": 0.1222, + "step": 12306 + }, + { + "epoch": 0.8879829719686857, + "grad_norm": 0.12404187768697739, + "learning_rate": 0.0001644999278395151, + "loss": 0.1527, + "step": 12307 + }, + { + "epoch": 0.8880551246437461, + "grad_norm": 0.12418852001428604, + "learning_rate": 0.00016449704142011836, + "loss": 0.176, + "step": 12308 + }, + { + "epoch": 0.8881272773188066, + "grad_norm": 0.18659363687038422, + "learning_rate": 0.0001644941550007216, + "loss": 0.1825, + "step": 12309 + }, + { + "epoch": 0.888199429993867, + "grad_norm": 0.15691769123077393, + "learning_rate": 0.00016449126858132486, + "loss": 0.1564, + "step": 12310 + }, + { + "epoch": 0.8882715826689275, + "grad_norm": 0.11697795987129211, + "learning_rate": 0.00016448838216192815, + "loss": 0.1476, + "step": 12311 + }, + { + "epoch": 0.8883437353439879, + "grad_norm": 0.12733706831932068, + "learning_rate": 0.0001644854957425314, + "loss": 0.1746, + "step": 12312 + }, + { + "epoch": 0.8884158880190484, + "grad_norm": 0.13768993318080902, + "learning_rate": 0.00016448260932313467, + "loss": 0.1183, + "step": 12313 + }, + { + "epoch": 0.8884880406941087, + "grad_norm": 0.15787924826145172, + "learning_rate": 0.0001644797229037379, + "loss": 0.1567, + "step": 12314 + }, + { + "epoch": 0.8885601933691691, + "grad_norm": 0.15521618723869324, + "learning_rate": 0.00016447683648434117, + "loss": 0.0995, + "step": 12315 + }, + { + "epoch": 0.8886323460442296, + "grad_norm": 0.1418941766023636, + "learning_rate": 0.00016447395006494444, + "loss": 0.1839, + "step": 12316 + }, + { + "epoch": 0.88870449871929, + "grad_norm": 0.13130566477775574, + "learning_rate": 0.0001644710636455477, + "loss": 0.1611, + "step": 12317 + }, + { + "epoch": 0.8887766513943505, + "grad_norm": 0.11680661141872406, + "learning_rate": 0.000164468177226151, + "loss": 0.1155, + "step": 12318 + }, + { + "epoch": 0.8888488040694109, + "grad_norm": 0.12170635908842087, + "learning_rate": 0.00016446529080675422, + "loss": 0.1103, + "step": 12319 + }, + { + "epoch": 0.8889209567444712, + "grad_norm": 0.1398392617702484, + "learning_rate": 0.0001644624043873575, + "loss": 0.107, + "step": 12320 + }, + { + "epoch": 0.8889931094195317, + "grad_norm": 0.12478537112474442, + "learning_rate": 0.00016445951796796075, + "loss": 0.1589, + "step": 12321 + }, + { + "epoch": 0.8890652620945921, + "grad_norm": 0.11158666014671326, + "learning_rate": 0.00016445663154856401, + "loss": 0.1273, + "step": 12322 + }, + { + "epoch": 0.8891374147696526, + "grad_norm": 0.11000625789165497, + "learning_rate": 0.00016445374512916728, + "loss": 0.1377, + "step": 12323 + }, + { + "epoch": 0.889209567444713, + "grad_norm": 0.1148582473397255, + "learning_rate": 0.00016445085870977054, + "loss": 0.1342, + "step": 12324 + }, + { + "epoch": 0.8892817201197735, + "grad_norm": 0.12248571217060089, + "learning_rate": 0.0001644479722903738, + "loss": 0.136, + "step": 12325 + }, + { + "epoch": 0.8893538727948339, + "grad_norm": 0.13426116108894348, + "learning_rate": 0.00016444508587097707, + "loss": 0.1739, + "step": 12326 + }, + { + "epoch": 0.8894260254698942, + "grad_norm": 0.1333639919757843, + "learning_rate": 0.00016444219945158033, + "loss": 0.1213, + "step": 12327 + }, + { + "epoch": 0.8894981781449547, + "grad_norm": 0.12810485064983368, + "learning_rate": 0.0001644393130321836, + "loss": 0.1312, + "step": 12328 + }, + { + "epoch": 0.8895703308200151, + "grad_norm": 0.13391032814979553, + "learning_rate": 0.00016443642661278685, + "loss": 0.1716, + "step": 12329 + }, + { + "epoch": 0.8896424834950756, + "grad_norm": 0.1410336047410965, + "learning_rate": 0.0001644335401933901, + "loss": 0.15, + "step": 12330 + }, + { + "epoch": 0.889714636170136, + "grad_norm": 0.15608654916286469, + "learning_rate": 0.00016443065377399335, + "loss": 0.1752, + "step": 12331 + }, + { + "epoch": 0.8897867888451965, + "grad_norm": 0.11146390438079834, + "learning_rate": 0.00016442776735459664, + "loss": 0.1621, + "step": 12332 + }, + { + "epoch": 0.8898589415202569, + "grad_norm": 0.13638371229171753, + "learning_rate": 0.0001644248809351999, + "loss": 0.1706, + "step": 12333 + }, + { + "epoch": 0.8899310941953172, + "grad_norm": 0.1298755407333374, + "learning_rate": 0.00016442199451580317, + "loss": 0.147, + "step": 12334 + }, + { + "epoch": 0.8900032468703777, + "grad_norm": 0.13941478729248047, + "learning_rate": 0.0001644191080964064, + "loss": 0.166, + "step": 12335 + }, + { + "epoch": 0.8900753995454381, + "grad_norm": 0.13158883154392242, + "learning_rate": 0.00016441622167700967, + "loss": 0.1479, + "step": 12336 + }, + { + "epoch": 0.8901475522204986, + "grad_norm": 0.12248693406581879, + "learning_rate": 0.00016441333525761293, + "loss": 0.125, + "step": 12337 + }, + { + "epoch": 0.890219704895559, + "grad_norm": 0.12258122116327286, + "learning_rate": 0.0001644104488382162, + "loss": 0.1977, + "step": 12338 + }, + { + "epoch": 0.8902918575706195, + "grad_norm": 0.12818323075771332, + "learning_rate": 0.00016440756241881948, + "loss": 0.1544, + "step": 12339 + }, + { + "epoch": 0.8903640102456799, + "grad_norm": 0.12762263417243958, + "learning_rate": 0.00016440467599942272, + "loss": 0.1938, + "step": 12340 + }, + { + "epoch": 0.8904361629207402, + "grad_norm": 0.12197435647249222, + "learning_rate": 0.00016440178958002598, + "loss": 0.136, + "step": 12341 + }, + { + "epoch": 0.8905083155958007, + "grad_norm": 0.12227319926023483, + "learning_rate": 0.00016439890316062925, + "loss": 0.1331, + "step": 12342 + }, + { + "epoch": 0.8905804682708611, + "grad_norm": 0.12339022010564804, + "learning_rate": 0.0001643960167412325, + "loss": 0.1192, + "step": 12343 + }, + { + "epoch": 0.8906526209459216, + "grad_norm": 0.12474383413791656, + "learning_rate": 0.00016439313032183577, + "loss": 0.1259, + "step": 12344 + }, + { + "epoch": 0.890724773620982, + "grad_norm": 0.14406003057956696, + "learning_rate": 0.00016439024390243903, + "loss": 0.138, + "step": 12345 + }, + { + "epoch": 0.8907969262960425, + "grad_norm": 0.13146288692951202, + "learning_rate": 0.0001643873574830423, + "loss": 0.1627, + "step": 12346 + }, + { + "epoch": 0.8908690789711029, + "grad_norm": 0.1251259446144104, + "learning_rate": 0.00016438447106364556, + "loss": 0.1442, + "step": 12347 + }, + { + "epoch": 0.8909412316461632, + "grad_norm": 0.1310272067785263, + "learning_rate": 0.00016438158464424882, + "loss": 0.1309, + "step": 12348 + }, + { + "epoch": 0.8910133843212237, + "grad_norm": 0.13596667349338531, + "learning_rate": 0.00016437869822485209, + "loss": 0.1625, + "step": 12349 + }, + { + "epoch": 0.8910855369962841, + "grad_norm": 0.12340883165597916, + "learning_rate": 0.00016437581180545535, + "loss": 0.1453, + "step": 12350 + }, + { + "epoch": 0.8911576896713446, + "grad_norm": 0.14122344553470612, + "learning_rate": 0.00016437292538605858, + "loss": 0.1544, + "step": 12351 + }, + { + "epoch": 0.891229842346405, + "grad_norm": 0.12983869016170502, + "learning_rate": 0.00016437003896666185, + "loss": 0.17, + "step": 12352 + }, + { + "epoch": 0.8913019950214655, + "grad_norm": 0.12063828855752945, + "learning_rate": 0.00016436715254726514, + "loss": 0.1771, + "step": 12353 + }, + { + "epoch": 0.8913741476965259, + "grad_norm": 0.1242651641368866, + "learning_rate": 0.0001643642661278684, + "loss": 0.1481, + "step": 12354 + }, + { + "epoch": 0.8914463003715862, + "grad_norm": 0.1284290850162506, + "learning_rate": 0.00016436137970847166, + "loss": 0.1272, + "step": 12355 + }, + { + "epoch": 0.8915184530466467, + "grad_norm": 0.12441599369049072, + "learning_rate": 0.0001643584932890749, + "loss": 0.0934, + "step": 12356 + }, + { + "epoch": 0.8915906057217071, + "grad_norm": 0.10706061869859695, + "learning_rate": 0.00016435560686967816, + "loss": 0.1213, + "step": 12357 + }, + { + "epoch": 0.8916627583967676, + "grad_norm": 0.12179552763700485, + "learning_rate": 0.00016435272045028142, + "loss": 0.1591, + "step": 12358 + }, + { + "epoch": 0.891734911071828, + "grad_norm": 0.16151952743530273, + "learning_rate": 0.0001643498340308847, + "loss": 0.1726, + "step": 12359 + }, + { + "epoch": 0.8918070637468885, + "grad_norm": 0.1302734762430191, + "learning_rate": 0.00016434694761148798, + "loss": 0.1007, + "step": 12360 + }, + { + "epoch": 0.8918792164219489, + "grad_norm": 0.14528721570968628, + "learning_rate": 0.0001643440611920912, + "loss": 0.1522, + "step": 12361 + }, + { + "epoch": 0.8919513690970092, + "grad_norm": 0.15876981616020203, + "learning_rate": 0.00016434117477269448, + "loss": 0.1378, + "step": 12362 + }, + { + "epoch": 0.8920235217720697, + "grad_norm": 0.13293051719665527, + "learning_rate": 0.00016433828835329774, + "loss": 0.1772, + "step": 12363 + }, + { + "epoch": 0.8920956744471301, + "grad_norm": 0.10721206665039062, + "learning_rate": 0.000164335401933901, + "loss": 0.1711, + "step": 12364 + }, + { + "epoch": 0.8921678271221906, + "grad_norm": 0.14066430926322937, + "learning_rate": 0.00016433251551450427, + "loss": 0.1305, + "step": 12365 + }, + { + "epoch": 0.892239979797251, + "grad_norm": 0.13327549397945404, + "learning_rate": 0.00016432962909510753, + "loss": 0.1487, + "step": 12366 + }, + { + "epoch": 0.8923121324723114, + "grad_norm": 0.12224187701940536, + "learning_rate": 0.0001643267426757108, + "loss": 0.1402, + "step": 12367 + }, + { + "epoch": 0.8923842851473719, + "grad_norm": 0.11235873401165009, + "learning_rate": 0.00016432385625631405, + "loss": 0.1545, + "step": 12368 + }, + { + "epoch": 0.8924564378224322, + "grad_norm": 0.13327015936374664, + "learning_rate": 0.00016432096983691732, + "loss": 0.1212, + "step": 12369 + }, + { + "epoch": 0.8925285904974927, + "grad_norm": 0.0986461266875267, + "learning_rate": 0.00016431808341752058, + "loss": 0.1105, + "step": 12370 + }, + { + "epoch": 0.8926007431725531, + "grad_norm": 0.17483176290988922, + "learning_rate": 0.00016431519699812384, + "loss": 0.1472, + "step": 12371 + }, + { + "epoch": 0.8926728958476136, + "grad_norm": 0.11941957473754883, + "learning_rate": 0.00016431231057872708, + "loss": 0.1612, + "step": 12372 + }, + { + "epoch": 0.892745048522674, + "grad_norm": 0.1409369558095932, + "learning_rate": 0.00016430942415933034, + "loss": 0.1794, + "step": 12373 + }, + { + "epoch": 0.8928172011977344, + "grad_norm": 0.14732030034065247, + "learning_rate": 0.00016430653773993363, + "loss": 0.2351, + "step": 12374 + }, + { + "epoch": 0.8928893538727949, + "grad_norm": 0.11152256280183792, + "learning_rate": 0.0001643036513205369, + "loss": 0.1866, + "step": 12375 + }, + { + "epoch": 0.8929615065478552, + "grad_norm": 0.13826146721839905, + "learning_rate": 0.00016430076490114016, + "loss": 0.1083, + "step": 12376 + }, + { + "epoch": 0.8930336592229157, + "grad_norm": 0.12121112644672394, + "learning_rate": 0.0001642978784817434, + "loss": 0.1578, + "step": 12377 + }, + { + "epoch": 0.8931058118979761, + "grad_norm": 0.12537214159965515, + "learning_rate": 0.00016429499206234666, + "loss": 0.1067, + "step": 12378 + }, + { + "epoch": 0.8931779645730366, + "grad_norm": 0.12527628242969513, + "learning_rate": 0.00016429210564294992, + "loss": 0.1585, + "step": 12379 + }, + { + "epoch": 0.893250117248097, + "grad_norm": 0.1377083957195282, + "learning_rate": 0.00016428921922355318, + "loss": 0.1674, + "step": 12380 + }, + { + "epoch": 0.8933222699231574, + "grad_norm": 0.11523400247097015, + "learning_rate": 0.00016428633280415647, + "loss": 0.1275, + "step": 12381 + }, + { + "epoch": 0.8933944225982178, + "grad_norm": 0.14673997461795807, + "learning_rate": 0.0001642834463847597, + "loss": 0.2389, + "step": 12382 + }, + { + "epoch": 0.8934665752732782, + "grad_norm": 0.13048113882541656, + "learning_rate": 0.00016428055996536297, + "loss": 0.2019, + "step": 12383 + }, + { + "epoch": 0.8935387279483387, + "grad_norm": 0.13305433094501495, + "learning_rate": 0.00016427767354596623, + "loss": 0.1961, + "step": 12384 + }, + { + "epoch": 0.8936108806233991, + "grad_norm": 0.15378667414188385, + "learning_rate": 0.0001642747871265695, + "loss": 0.1607, + "step": 12385 + }, + { + "epoch": 0.8936830332984596, + "grad_norm": 0.16681982576847076, + "learning_rate": 0.00016427190070717276, + "loss": 0.219, + "step": 12386 + }, + { + "epoch": 0.89375518597352, + "grad_norm": 0.11870287358760834, + "learning_rate": 0.00016426901428777602, + "loss": 0.1189, + "step": 12387 + }, + { + "epoch": 0.8938273386485804, + "grad_norm": 0.1373557448387146, + "learning_rate": 0.00016426612786837929, + "loss": 0.1105, + "step": 12388 + }, + { + "epoch": 0.8938994913236408, + "grad_norm": 0.13363225758075714, + "learning_rate": 0.00016426324144898255, + "loss": 0.1998, + "step": 12389 + }, + { + "epoch": 0.8939716439987012, + "grad_norm": 0.12109125405550003, + "learning_rate": 0.0001642603550295858, + "loss": 0.1629, + "step": 12390 + }, + { + "epoch": 0.8940437966737617, + "grad_norm": 0.1376250982284546, + "learning_rate": 0.00016425746861018907, + "loss": 0.1867, + "step": 12391 + }, + { + "epoch": 0.8941159493488221, + "grad_norm": 0.11796516180038452, + "learning_rate": 0.00016425458219079234, + "loss": 0.1319, + "step": 12392 + }, + { + "epoch": 0.8941881020238825, + "grad_norm": 0.1224747747182846, + "learning_rate": 0.00016425169577139557, + "loss": 0.1562, + "step": 12393 + }, + { + "epoch": 0.894260254698943, + "grad_norm": 0.11128373444080353, + "learning_rate": 0.00016424880935199884, + "loss": 0.1912, + "step": 12394 + }, + { + "epoch": 0.8943324073740034, + "grad_norm": 0.1349061131477356, + "learning_rate": 0.00016424592293260213, + "loss": 0.1219, + "step": 12395 + }, + { + "epoch": 0.8944045600490638, + "grad_norm": 0.12552239000797272, + "learning_rate": 0.0001642430365132054, + "loss": 0.118, + "step": 12396 + }, + { + "epoch": 0.8944767127241242, + "grad_norm": 0.10382308065891266, + "learning_rate": 0.00016424015009380865, + "loss": 0.1273, + "step": 12397 + }, + { + "epoch": 0.8945488653991847, + "grad_norm": 0.1467970311641693, + "learning_rate": 0.0001642372636744119, + "loss": 0.1296, + "step": 12398 + }, + { + "epoch": 0.8946210180742451, + "grad_norm": 0.11753301322460175, + "learning_rate": 0.00016423437725501515, + "loss": 0.1716, + "step": 12399 + }, + { + "epoch": 0.8946931707493055, + "grad_norm": 0.1308908462524414, + "learning_rate": 0.0001642314908356184, + "loss": 0.1593, + "step": 12400 + }, + { + "epoch": 0.894765323424366, + "grad_norm": 0.14492283761501312, + "learning_rate": 0.00016422860441622168, + "loss": 0.1411, + "step": 12401 + }, + { + "epoch": 0.8948374760994264, + "grad_norm": 0.13398638367652893, + "learning_rate": 0.00016422571799682497, + "loss": 0.1848, + "step": 12402 + }, + { + "epoch": 0.8949096287744868, + "grad_norm": 0.15173323452472687, + "learning_rate": 0.00016422283157742823, + "loss": 0.1408, + "step": 12403 + }, + { + "epoch": 0.8949817814495472, + "grad_norm": 0.14614036679267883, + "learning_rate": 0.00016421994515803146, + "loss": 0.1572, + "step": 12404 + }, + { + "epoch": 0.8950539341246077, + "grad_norm": 0.13788336515426636, + "learning_rate": 0.00016421705873863473, + "loss": 0.1051, + "step": 12405 + }, + { + "epoch": 0.8951260867996681, + "grad_norm": 0.1386488974094391, + "learning_rate": 0.000164214172319238, + "loss": 0.1812, + "step": 12406 + }, + { + "epoch": 0.8951982394747285, + "grad_norm": 0.1316649317741394, + "learning_rate": 0.00016421128589984125, + "loss": 0.1131, + "step": 12407 + }, + { + "epoch": 0.895270392149789, + "grad_norm": 0.12185010313987732, + "learning_rate": 0.00016420839948044452, + "loss": 0.1823, + "step": 12408 + }, + { + "epoch": 0.8953425448248494, + "grad_norm": 0.12921541929244995, + "learning_rate": 0.00016420551306104778, + "loss": 0.1266, + "step": 12409 + }, + { + "epoch": 0.8954146974999098, + "grad_norm": 0.12820370495319366, + "learning_rate": 0.00016420262664165104, + "loss": 0.1816, + "step": 12410 + }, + { + "epoch": 0.8954868501749702, + "grad_norm": 0.13184615969657898, + "learning_rate": 0.0001641997402222543, + "loss": 0.1916, + "step": 12411 + }, + { + "epoch": 0.8955590028500306, + "grad_norm": 0.1054464802145958, + "learning_rate": 0.00016419685380285757, + "loss": 0.0965, + "step": 12412 + }, + { + "epoch": 0.8956311555250911, + "grad_norm": 0.12233717739582062, + "learning_rate": 0.00016419396738346083, + "loss": 0.1234, + "step": 12413 + }, + { + "epoch": 0.8957033082001515, + "grad_norm": 0.10888828337192535, + "learning_rate": 0.0001641910809640641, + "loss": 0.1317, + "step": 12414 + }, + { + "epoch": 0.895775460875212, + "grad_norm": 0.13491316139698029, + "learning_rate": 0.00016418819454466733, + "loss": 0.1156, + "step": 12415 + }, + { + "epoch": 0.8958476135502724, + "grad_norm": 0.13821394741535187, + "learning_rate": 0.00016418530812527062, + "loss": 0.136, + "step": 12416 + }, + { + "epoch": 0.8959197662253328, + "grad_norm": 0.12744927406311035, + "learning_rate": 0.00016418242170587388, + "loss": 0.1185, + "step": 12417 + }, + { + "epoch": 0.8959919189003932, + "grad_norm": 0.14690862596035004, + "learning_rate": 0.00016417953528647715, + "loss": 0.1198, + "step": 12418 + }, + { + "epoch": 0.8960640715754536, + "grad_norm": 0.1134641095995903, + "learning_rate": 0.0001641766488670804, + "loss": 0.1544, + "step": 12419 + }, + { + "epoch": 0.8961362242505141, + "grad_norm": 0.13892638683319092, + "learning_rate": 0.00016417376244768364, + "loss": 0.1552, + "step": 12420 + }, + { + "epoch": 0.8962083769255745, + "grad_norm": 0.1294938623905182, + "learning_rate": 0.0001641708760282869, + "loss": 0.1247, + "step": 12421 + }, + { + "epoch": 0.896280529600635, + "grad_norm": 0.12001454830169678, + "learning_rate": 0.00016416798960889017, + "loss": 0.1152, + "step": 12422 + }, + { + "epoch": 0.8963526822756954, + "grad_norm": 0.14264273643493652, + "learning_rate": 0.00016416510318949346, + "loss": 0.1921, + "step": 12423 + }, + { + "epoch": 0.8964248349507558, + "grad_norm": 0.11626090109348297, + "learning_rate": 0.00016416221677009672, + "loss": 0.1392, + "step": 12424 + }, + { + "epoch": 0.8964969876258162, + "grad_norm": 0.13233236968517303, + "learning_rate": 0.00016415933035069996, + "loss": 0.1176, + "step": 12425 + }, + { + "epoch": 0.8965691403008766, + "grad_norm": 0.147277370095253, + "learning_rate": 0.00016415644393130322, + "loss": 0.1517, + "step": 12426 + }, + { + "epoch": 0.8966412929759371, + "grad_norm": 0.12489044666290283, + "learning_rate": 0.00016415355751190648, + "loss": 0.1451, + "step": 12427 + }, + { + "epoch": 0.8967134456509975, + "grad_norm": 0.12134548276662827, + "learning_rate": 0.00016415067109250975, + "loss": 0.1487, + "step": 12428 + }, + { + "epoch": 0.896785598326058, + "grad_norm": 0.15623758733272552, + "learning_rate": 0.000164147784673113, + "loss": 0.1654, + "step": 12429 + }, + { + "epoch": 0.8968577510011184, + "grad_norm": 0.14344453811645508, + "learning_rate": 0.00016414489825371627, + "loss": 0.1764, + "step": 12430 + }, + { + "epoch": 0.8969299036761788, + "grad_norm": 0.13360056281089783, + "learning_rate": 0.00016414201183431954, + "loss": 0.1335, + "step": 12431 + }, + { + "epoch": 0.8970020563512392, + "grad_norm": 0.16808222234249115, + "learning_rate": 0.0001641391254149228, + "loss": 0.1346, + "step": 12432 + }, + { + "epoch": 0.8970742090262996, + "grad_norm": 0.20998232066631317, + "learning_rate": 0.00016413623899552606, + "loss": 0.2015, + "step": 12433 + }, + { + "epoch": 0.8971463617013601, + "grad_norm": 0.12847408652305603, + "learning_rate": 0.00016413335257612933, + "loss": 0.1336, + "step": 12434 + }, + { + "epoch": 0.8972185143764205, + "grad_norm": 0.14319449663162231, + "learning_rate": 0.0001641304661567326, + "loss": 0.1304, + "step": 12435 + }, + { + "epoch": 0.897290667051481, + "grad_norm": 0.1424330621957779, + "learning_rate": 0.00016412757973733582, + "loss": 0.1457, + "step": 12436 + }, + { + "epoch": 0.8973628197265414, + "grad_norm": 0.1303742378950119, + "learning_rate": 0.00016412469331793911, + "loss": 0.1281, + "step": 12437 + }, + { + "epoch": 0.8974349724016017, + "grad_norm": 0.11268772184848785, + "learning_rate": 0.00016412180689854238, + "loss": 0.1346, + "step": 12438 + }, + { + "epoch": 0.8975071250766622, + "grad_norm": 0.13568419218063354, + "learning_rate": 0.00016411892047914564, + "loss": 0.1778, + "step": 12439 + }, + { + "epoch": 0.8975792777517226, + "grad_norm": 0.14853152632713318, + "learning_rate": 0.0001641160340597489, + "loss": 0.1839, + "step": 12440 + }, + { + "epoch": 0.8976514304267831, + "grad_norm": 0.11737915128469467, + "learning_rate": 0.00016411314764035214, + "loss": 0.1111, + "step": 12441 + }, + { + "epoch": 0.8977235831018435, + "grad_norm": 0.17313599586486816, + "learning_rate": 0.0001641102612209554, + "loss": 0.183, + "step": 12442 + }, + { + "epoch": 0.897795735776904, + "grad_norm": 0.13710598647594452, + "learning_rate": 0.00016410737480155866, + "loss": 0.1558, + "step": 12443 + }, + { + "epoch": 0.8978678884519643, + "grad_norm": 0.12724435329437256, + "learning_rate": 0.00016410448838216193, + "loss": 0.1142, + "step": 12444 + }, + { + "epoch": 0.8979400411270247, + "grad_norm": 0.15760374069213867, + "learning_rate": 0.00016410160196276522, + "loss": 0.1551, + "step": 12445 + }, + { + "epoch": 0.8980121938020852, + "grad_norm": 0.16165392100811005, + "learning_rate": 0.00016409871554336845, + "loss": 0.1822, + "step": 12446 + }, + { + "epoch": 0.8980843464771456, + "grad_norm": 0.13458803296089172, + "learning_rate": 0.00016409582912397172, + "loss": 0.1332, + "step": 12447 + }, + { + "epoch": 0.8981564991522061, + "grad_norm": 0.10760502517223358, + "learning_rate": 0.00016409294270457498, + "loss": 0.1418, + "step": 12448 + }, + { + "epoch": 0.8982286518272665, + "grad_norm": 0.1373852640390396, + "learning_rate": 0.00016409005628517824, + "loss": 0.1644, + "step": 12449 + }, + { + "epoch": 0.898300804502327, + "grad_norm": 0.1437990516424179, + "learning_rate": 0.0001640871698657815, + "loss": 0.1508, + "step": 12450 + }, + { + "epoch": 0.8983729571773873, + "grad_norm": 0.11738962680101395, + "learning_rate": 0.00016408428344638477, + "loss": 0.1662, + "step": 12451 + }, + { + "epoch": 0.8984451098524477, + "grad_norm": 0.15426859259605408, + "learning_rate": 0.00016408139702698803, + "loss": 0.1088, + "step": 12452 + }, + { + "epoch": 0.8985172625275082, + "grad_norm": 0.1113508865237236, + "learning_rate": 0.0001640785106075913, + "loss": 0.1271, + "step": 12453 + }, + { + "epoch": 0.8985894152025686, + "grad_norm": 0.150124654173851, + "learning_rate": 0.00016407562418819456, + "loss": 0.194, + "step": 12454 + }, + { + "epoch": 0.8986615678776291, + "grad_norm": 0.13325640559196472, + "learning_rate": 0.00016407273776879782, + "loss": 0.1425, + "step": 12455 + }, + { + "epoch": 0.8987337205526895, + "grad_norm": 0.11264251172542572, + "learning_rate": 0.00016406985134940108, + "loss": 0.191, + "step": 12456 + }, + { + "epoch": 0.89880587322775, + "grad_norm": 0.13700659573078156, + "learning_rate": 0.00016406696493000432, + "loss": 0.1505, + "step": 12457 + }, + { + "epoch": 0.8988780259028103, + "grad_norm": 0.16137053072452545, + "learning_rate": 0.00016406407851060758, + "loss": 0.1943, + "step": 12458 + }, + { + "epoch": 0.8989501785778707, + "grad_norm": 0.12548089027404785, + "learning_rate": 0.00016406119209121087, + "loss": 0.1418, + "step": 12459 + }, + { + "epoch": 0.8990223312529312, + "grad_norm": 0.12392608076334, + "learning_rate": 0.00016405830567181413, + "loss": 0.0933, + "step": 12460 + }, + { + "epoch": 0.8990944839279916, + "grad_norm": 0.12155414372682571, + "learning_rate": 0.0001640554192524174, + "loss": 0.1841, + "step": 12461 + }, + { + "epoch": 0.8991666366030521, + "grad_norm": 0.13689649105072021, + "learning_rate": 0.00016405253283302063, + "loss": 0.1192, + "step": 12462 + }, + { + "epoch": 0.8992387892781125, + "grad_norm": 0.13133807480335236, + "learning_rate": 0.0001640496464136239, + "loss": 0.1471, + "step": 12463 + }, + { + "epoch": 0.899310941953173, + "grad_norm": 0.12057525664567947, + "learning_rate": 0.00016404675999422716, + "loss": 0.1704, + "step": 12464 + }, + { + "epoch": 0.8993830946282333, + "grad_norm": 0.11189797520637512, + "learning_rate": 0.00016404387357483042, + "loss": 0.1724, + "step": 12465 + }, + { + "epoch": 0.8994552473032937, + "grad_norm": 0.11240506917238235, + "learning_rate": 0.0001640409871554337, + "loss": 0.1414, + "step": 12466 + }, + { + "epoch": 0.8995273999783542, + "grad_norm": 0.13948418200016022, + "learning_rate": 0.00016403810073603695, + "loss": 0.1724, + "step": 12467 + }, + { + "epoch": 0.8995995526534146, + "grad_norm": 0.1354711651802063, + "learning_rate": 0.0001640352143166402, + "loss": 0.1177, + "step": 12468 + }, + { + "epoch": 0.8996717053284751, + "grad_norm": 0.1620495766401291, + "learning_rate": 0.00016403232789724347, + "loss": 0.1898, + "step": 12469 + }, + { + "epoch": 0.8997438580035355, + "grad_norm": 0.13859111070632935, + "learning_rate": 0.00016402944147784674, + "loss": 0.1376, + "step": 12470 + }, + { + "epoch": 0.899816010678596, + "grad_norm": 0.1199670135974884, + "learning_rate": 0.00016402655505845, + "loss": 0.1747, + "step": 12471 + }, + { + "epoch": 0.8998881633536563, + "grad_norm": 0.1164129227399826, + "learning_rate": 0.00016402366863905326, + "loss": 0.1454, + "step": 12472 + }, + { + "epoch": 0.8999603160287167, + "grad_norm": 0.11841829866170883, + "learning_rate": 0.00016402078221965653, + "loss": 0.1219, + "step": 12473 + }, + { + "epoch": 0.9000324687037772, + "grad_norm": 0.12008563429117203, + "learning_rate": 0.0001640178958002598, + "loss": 0.1216, + "step": 12474 + }, + { + "epoch": 0.9001046213788376, + "grad_norm": 0.11511898040771484, + "learning_rate": 0.00016401500938086305, + "loss": 0.1556, + "step": 12475 + }, + { + "epoch": 0.9001767740538981, + "grad_norm": 0.11898694932460785, + "learning_rate": 0.00016401212296146631, + "loss": 0.1861, + "step": 12476 + }, + { + "epoch": 0.9002489267289585, + "grad_norm": 0.16332779824733734, + "learning_rate": 0.00016400923654206958, + "loss": 0.2241, + "step": 12477 + }, + { + "epoch": 0.900321079404019, + "grad_norm": 0.13668668270111084, + "learning_rate": 0.0001640063501226728, + "loss": 0.1629, + "step": 12478 + }, + { + "epoch": 0.9003932320790793, + "grad_norm": 0.0982336550951004, + "learning_rate": 0.00016400346370327608, + "loss": 0.0953, + "step": 12479 + }, + { + "epoch": 0.9004653847541397, + "grad_norm": 0.12939897179603577, + "learning_rate": 0.00016400057728387937, + "loss": 0.1281, + "step": 12480 + }, + { + "epoch": 0.9005375374292002, + "grad_norm": 0.16161668300628662, + "learning_rate": 0.00016399769086448263, + "loss": 0.1457, + "step": 12481 + }, + { + "epoch": 0.9006096901042606, + "grad_norm": 0.13126331567764282, + "learning_rate": 0.0001639948044450859, + "loss": 0.1349, + "step": 12482 + }, + { + "epoch": 0.9006818427793211, + "grad_norm": 0.12295763939619064, + "learning_rate": 0.00016399191802568913, + "loss": 0.1537, + "step": 12483 + }, + { + "epoch": 0.9007539954543815, + "grad_norm": 0.13977226614952087, + "learning_rate": 0.0001639890316062924, + "loss": 0.1223, + "step": 12484 + }, + { + "epoch": 0.900826148129442, + "grad_norm": 0.1546591967344284, + "learning_rate": 0.00016398614518689565, + "loss": 0.1674, + "step": 12485 + }, + { + "epoch": 0.9008983008045023, + "grad_norm": 0.09397142380475998, + "learning_rate": 0.00016398325876749892, + "loss": 0.0949, + "step": 12486 + }, + { + "epoch": 0.9009704534795627, + "grad_norm": 0.138092502951622, + "learning_rate": 0.0001639803723481022, + "loss": 0.2451, + "step": 12487 + }, + { + "epoch": 0.9010426061546232, + "grad_norm": 0.12386883795261383, + "learning_rate": 0.00016397748592870544, + "loss": 0.1326, + "step": 12488 + }, + { + "epoch": 0.9011147588296836, + "grad_norm": 0.11555249989032745, + "learning_rate": 0.0001639745995093087, + "loss": 0.1195, + "step": 12489 + }, + { + "epoch": 0.9011869115047441, + "grad_norm": 0.172087162733078, + "learning_rate": 0.00016397171308991197, + "loss": 0.1241, + "step": 12490 + }, + { + "epoch": 0.9012590641798045, + "grad_norm": 0.12430737912654877, + "learning_rate": 0.00016396882667051523, + "loss": 0.162, + "step": 12491 + }, + { + "epoch": 0.901331216854865, + "grad_norm": 0.12657372653484344, + "learning_rate": 0.0001639659402511185, + "loss": 0.1487, + "step": 12492 + }, + { + "epoch": 0.9014033695299253, + "grad_norm": 0.12591242790222168, + "learning_rate": 0.00016396305383172176, + "loss": 0.1452, + "step": 12493 + }, + { + "epoch": 0.9014755222049857, + "grad_norm": 0.11260319501161575, + "learning_rate": 0.00016396016741232502, + "loss": 0.163, + "step": 12494 + }, + { + "epoch": 0.9015476748800462, + "grad_norm": 0.138995960354805, + "learning_rate": 0.00016395728099292828, + "loss": 0.1362, + "step": 12495 + }, + { + "epoch": 0.9016198275551066, + "grad_norm": 0.14916054904460907, + "learning_rate": 0.00016395439457353155, + "loss": 0.1919, + "step": 12496 + }, + { + "epoch": 0.9016919802301671, + "grad_norm": 0.10642482340335846, + "learning_rate": 0.0001639515081541348, + "loss": 0.1997, + "step": 12497 + }, + { + "epoch": 0.9017641329052275, + "grad_norm": 0.1367085725069046, + "learning_rate": 0.00016394862173473807, + "loss": 0.1338, + "step": 12498 + }, + { + "epoch": 0.9018362855802879, + "grad_norm": 0.16044127941131592, + "learning_rate": 0.0001639457353153413, + "loss": 0.1513, + "step": 12499 + }, + { + "epoch": 0.9019084382553483, + "grad_norm": 0.13745981454849243, + "learning_rate": 0.00016394284889594457, + "loss": 0.1736, + "step": 12500 + } + ], + "logging_steps": 1, + "max_steps": 69295, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.686622915010806e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}